From 9d00965910d5a7818ffe55b18d3e9dd4c7210e1b Mon Sep 17 00:00:00 2001
From: Prasad Pandit
Date: Thu, 25 Apr 2024 12:34:12 +0530
Subject: [PATCH 4/4] linux-aio: add IO_CMD_FDSYNC command support

RH-Author: Prasad Pandit
RH-MergeRequest: 378: linux-aio: add IO_CMD_FDSYNC command support
RH-Jira: RHEL-43261
RH-Acked-by: Miroslav Rezanina
RH-Commit: [1/1] 3b80d4a162aad1a87322078a7d7b060a9496035b

Libaio defines IO_CMD_FDSYNC command to sync all outstanding
asynchronous I/O operations, by flushing out file data to the
disk storage. Enable linux-aio to submit such aio request.

When using aio=native without fdsync() support, QEMU creates
pthreads, and destroying these pthreads results in TLB flushes.
In a real-time guest environment, TLB flushes cause a latency
spike. This patch helps to avoid such spikes.

Jira: https://issues.redhat.com/browse/RHEL-43261
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Prasad Pandit
Message-ID: <20240425070412.37248-1-ppandit@redhat.com>
Reviewed-by: Kevin Wolf
Signed-off-by: Kevin Wolf
(cherry picked from commit 24687abf237e3c15816d689a8e4b08d7c3190dcb)
Signed-off-by: Prasad Pandit
---
 block/file-posix.c      |  9 +++++++++
 block/linux-aio.c       | 21 ++++++++++++++++++++-
 include/block/raw-aio.h |  1 +
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 35684f7e21..9831b08fb6 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -159,6 +159,7 @@ typedef struct BDRVRawState {
     bool has_discard:1;
     bool has_write_zeroes:1;
     bool use_linux_aio:1;
+    bool has_laio_fdsync:1;
     bool use_linux_io_uring:1;
     int page_cache_inconsistent; /* errno from fdatasync failure */
     bool has_fallocate;
@@ -718,6 +719,9 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
         ret = -EINVAL;
         goto fail;
     }
+    if (s->use_linux_aio) {
+        s->has_laio_fdsync = laio_has_fdsync(s->fd);
+    }
 #else
     if (s->use_linux_aio) {
         error_setg(errp, "aio=native was specified, but is not supported "
@@ -2599,6 +2603,11 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
     if (raw_check_linux_io_uring(s)) {
         return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
     }
+#endif
+#ifdef CONFIG_LINUX_AIO
+    if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
+        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
+    }
 #endif
     return raw_thread_pool_submit(handle_aiocb_flush, &acb);
 }
diff --git a/block/linux-aio.c b/block/linux-aio.c
index ec05d946f3..e3b5ec9aba 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -384,6 +384,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
     case QEMU_AIO_READ:
         io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
         break;
+    case QEMU_AIO_FLUSH:
+        io_prep_fdsync(iocbs, fd);
+        break;
     /* Currently Linux kernel does not support other operations */
     default:
         fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
@@ -412,7 +415,7 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
     AioContext *ctx = qemu_get_current_aio_context();
     struct qemu_laiocb laiocb = {
         .co = qemu_coroutine_self(),
-        .nbytes = qiov->size,
+        .nbytes = qiov ? qiov->size : 0,
         .ctx = aio_get_linux_aio(ctx),
         .ret = -EINPROGRESS,
         .is_read = (type == QEMU_AIO_READ),
@@ -486,3 +489,19 @@ void laio_cleanup(LinuxAioState *s)
     }
     g_free(s);
 }
+
+bool laio_has_fdsync(int fd)
+{
+    struct iocb cb;
+    struct iocb *cbs[] = {&cb, NULL};
+
+    io_context_t ctx = 0;
+    io_setup(1, &ctx);
+
+    /* check if host kernel supports IO_CMD_FDSYNC */
+    io_prep_fdsync(&cb, fd);
+    int ret = io_submit(ctx, 1, cbs);
+
+    io_destroy(ctx);
+    return (ret == -EINVAL) ? false : true;
+}
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 0f63c2800c..3166903a56 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -60,6 +60,7 @@ void laio_cleanup(LinuxAioState *s);
 int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                 int type, uint64_t dev_max_batch);
 
+bool laio_has_fdsync(int);
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
 #endif
-- 
2.39.3