On Wed, Sep 01, 2021 at 05:09:23PM +0100, Stefan Hajnoczi wrote:
> It was reported that enabling SafeStack reduces IOPS significantly
> (>25%) with the following fio benchmark on virtio-blk using a NVMe host
> block device:
>
>   # fio --rw=randrw --bs=4k --iodepth=64 --runtime=1m --direct=1 \
>         --filename=/dev/vdb --name=job1 --ioengine=libaio --thread \
>         --group_reporting --numjobs=16 --time_based \
>         --output=/tmp/fio_result
>
> Serge Guelton and I found that SafeStack is not really at fault, it just
> increases the cost of coroutine creation. This fio workload exhausts the
> coroutine pool and coroutine creation becomes a bottleneck. Previous
> work by Honghao Wang also pointed to excessive coroutine creation.
>
> Creating new coroutines is expensive due to allocating new stacks with
> mmap(2) and mprotect(2). Currently there are thread-local and global
> pools that recycle old Coroutine objects and their stacks, but the
> hardcoded size limit of 64 for thread-local pools and 128 for the global
> pool is insufficient for the fio benchmark shown above.
>
> This patch changes the coroutine pool algorithm to a simple thread-local
> pool without a size limit. Threads periodically shrink the pool down to
> a size sufficient for the maximum observed number of coroutines.
>
> This is a very simple algorithm. Fancier things could be done like
> keeping a minimum number of coroutines around to avoid latency when a
> new coroutine is created after a long period of inactivity. Another
> thought is to stop the timer when the pool size is zero for power saving
> on threads that aren't using coroutines. However, I'd rather not add
> bells and whistles unless they are really necessary.
>
> The global pool is removed by this patch. It can help to hide the fact
> that local pools are easily exhausted, but it doesn't fix the root
> cause. I don't think there is a need for a global pool because QEMU's
> threads are long-lived, so let's keep things simple.
>
> Performance of the above fio benchmark is as follows:
>
>          Before   After
>    IOPS     60k     97k
>
> Memory usage varies over time as needed by the workload:
>
>               VSZ (KB)             RSS (KB)
>    Before fio 4705248              843128
>    During fio 5747668 (+ ~100 MB)  849280
>    After fio  4694996 (- ~100 MB)  845184
>
> This confirms that coroutines are indeed being freed when no longer
> needed.
>
> Thanks to Serge Guelton for working on identifying the bottleneck with
> me!
>
> Reported-by: Tingting Mao
> Cc: Serge Guelton
> Cc: Honghao Wang
> Cc: Paolo Bonzini
> Cc: Daniele Buono
> Signed-off-by: Stefan Hajnoczi
> ---
>  include/qemu/coroutine-pool-timer.h | 36 +++++++++++++++++
>  include/qemu/coroutine.h            |  7 ++++
>  iothread.c                          |  6 +++
>  util/coroutine-pool-timer.c         | 35 ++++++++++++++++
>  util/main-loop.c                    |  5 +++
>  util/qemu-coroutine.c               | 62 ++++++++++++++---------------
>  util/meson.build                    |  1 +
>  7 files changed, 119 insertions(+), 33 deletions(-)
>  create mode 100644 include/qemu/coroutine-pool-timer.h
>  create mode 100644 util/coroutine-pool-timer.c

Adding Andrew and Jenifer in case they have thoughts on improving QEMU's
coroutine pool algorithm.
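
To save you reverse engineering the diff, the sizing rule is roughly
this: each thread keeps an unlimited free list and tracks the peak
number of coroutines in use during the current 15-second slice. When
the timer fires, qemu_coroutine_pool_periodic_resize() trims the free
list down to (peak - currently in use) and resets the peak. As a
made-up example, if a slice peaked at 1,000 coroutines and 200 are
still in flight when the timer fires, up to 800 free coroutines are
kept and the rest are freed; a thread that goes completely idle shrinks
its pool to zero within two timer periods.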

>
> diff --git a/include/qemu/coroutine-pool-timer.h b/include/qemu/coroutine-pool-timer.h
> new file mode 100644
> index 0000000000..c0b520ce99
> --- /dev/null
> +++ b/include/qemu/coroutine-pool-timer.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU coroutine pool timer
> + *
> + * Copyright (c) 2021 Red Hat, Inc.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +#ifndef COROUTINE_POOL_TIMER_H
> +#define COROUTINE_POOL_TIMER_H
> +
> +#include "qemu/osdep.h"
> +#include "block/aio.h"
> +
> +/**
> + * A timer that periodically resizes this thread's coroutine pool, freeing
> + * memory if there are too many unused coroutines.
> + *
> + * Threads that make heavy use of coroutines should use this. Failure to resize
> + * the coroutine pool can lead to large amounts of memory sitting idle and
> + * never being used after the first time.
> + */
> +typedef struct {
> +    QEMUTimer *timer;
> +} CoroutinePoolTimer;
> +
> +/* Call this before the thread runs the AioContext */
> +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx);
> +
> +/* Call this before the AioContext from the init function is destroyed */
> +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt);
> +
> +#endif /* COROUTINE_POOL_TIMER_H */
> diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
> index 4829ff373d..fdb2955ff9 100644
> --- a/include/qemu/coroutine.h
> +++ b/include/qemu/coroutine.h
> @@ -122,6 +122,13 @@ bool qemu_in_coroutine(void);
>   */
>  bool qemu_coroutine_entered(Coroutine *co);
>
> +/**
> + * Optionally call this function periodically to shrink the thread-local pool
> + * down. Spiky workloads can create many coroutines and then never reach that
> + * level again. Shrinking the pool reclaims memory in this case.
> + */
> +void qemu_coroutine_pool_periodic_resize(void);
> +
>  /**
>   * Provides a mutex that can be used to synchronise coroutines
>   */
> diff --git a/iothread.c b/iothread.c
> index ddbbde61f7..39a24f1a55 100644
> --- a/iothread.c
> +++ b/iothread.c
> @@ -23,6 +23,7 @@
>  #include "qemu/error-report.h"
>  #include "qemu/rcu.h"
>  #include "qemu/main-loop.h"
> +#include "qemu/coroutine-pool-timer.h"
>
>  typedef ObjectClass IOThreadClass;
>
> @@ -42,6 +43,7 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
>  static void *iothread_run(void *opaque)
>  {
>      IOThread *iothread = opaque;
> +    CoroutinePoolTimer co_pool_timer;
>
>      rcu_register_thread();
>      /*
> @@ -53,6 +55,8 @@ static void *iothread_run(void *opaque)
>      iothread->thread_id = qemu_get_thread_id();
>      qemu_sem_post(&iothread->init_done_sem);
>
> +    coroutine_pool_timer_init(&co_pool_timer, iothread->ctx);
> +
>      while (iothread->running) {
>          /*
>           * Note: from functional-wise the g_main_loop_run() below can
> @@ -74,6 +78,8 @@ static void *iothread_run(void *opaque)
>          }
>      }
>
> +    coroutine_pool_timer_cleanup(&co_pool_timer);
> +
>      g_main_context_pop_thread_default(iothread->worker_context);
>      rcu_unregister_thread();
>      return NULL;
> diff --git a/util/coroutine-pool-timer.c b/util/coroutine-pool-timer.c
> new file mode 100644
> index 0000000000..36d3216718
> --- /dev/null
> +++ b/util/coroutine-pool-timer.c
> @@ -0,0 +1,35 @@
> +/*
> + * QEMU coroutine pool timer
> + *
> + * Copyright (c) 2021 Red Hat, Inc.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +#include "qemu/coroutine-pool-timer.h"
> +
> +static void coroutine_pool_timer_cb(void *opaque)
> +{
> +    CoroutinePoolTimer *pt = opaque;
> +    int64_t expiry_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
> +                             15 * NANOSECONDS_PER_SECOND;
> +
> +    qemu_coroutine_pool_periodic_resize();
> +    timer_mod(pt->timer, expiry_time_ns);
> +}
> +
> +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx)
> +{
> +    pt->timer = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_NS,
> +                              coroutine_pool_timer_cb, pt);
> +    coroutine_pool_timer_cb(pt);
> +}
> +
> +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt)
> +{
> +    timer_free(pt->timer);
> +    pt->timer = NULL;
> +}
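
The idea mentioned in the commit message about stopping the timer for
power saving when a thread has no coroutines could probably live
entirely in this callback. A rough, untested sketch; note that
qemu_coroutine_pool_size() is hypothetical and does not exist in this
patch:

  static void coroutine_pool_timer_cb(void *opaque)
  {
      CoroutinePoolTimer *pt = opaque;

      qemu_coroutine_pool_periodic_resize();

      /* Hypothetical: only re-arm while there is something left to shrink */
      if (qemu_coroutine_pool_size() > 0) {
          timer_mod(pt->timer, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
                               15 * NANOSECONDS_PER_SECOND);
      }
  }

The catch is that something would then have to re-arm the timer when
the pool becomes non-empty again, which is why I left it out for now.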
> diff --git a/util/main-loop.c b/util/main-loop.c
> index 06b18b195c..23342e2215 100644
> --- a/util/main-loop.c
> +++ b/util/main-loop.c
> @@ -33,6 +33,7 @@
>  #include "qemu/error-report.h"
>  #include "qemu/queue.h"
>  #include "qemu/compiler.h"
> +#include "qemu/coroutine-pool-timer.h"
>
>  #ifndef _WIN32
>  #include <sys/wait.h>
> @@ -131,6 +132,7 @@ static int qemu_signal_init(Error **errp)
>
>  static AioContext *qemu_aio_context;
>  static QEMUBH *qemu_notify_bh;
> +static CoroutinePoolTimer main_loop_co_pool_timer;
>
>  static void notify_event_cb(void *opaque)
>  {
> @@ -181,6 +183,9 @@ int qemu_init_main_loop(Error **errp)
>      g_source_set_name(src, "io-handler");
>      g_source_attach(src, NULL);
>      g_source_unref(src);
> +
> +    coroutine_pool_timer_init(&main_loop_co_pool_timer, qemu_aio_context);
> +
>      return 0;
>  }
>
> diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
> index 38fb6d3084..105dbfa89e 100644
> --- a/util/qemu-coroutine.c
> +++ b/util/qemu-coroutine.c
> @@ -20,15 +20,11 @@
>  #include "qemu/coroutine_int.h"
>  #include "block/aio.h"
>
> -enum {
> -    POOL_BATCH_SIZE = 64,
> -};
> -
>  /** Free list to speed up creation */
> -static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
> -static unsigned int release_pool_size;
>  static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
>  static __thread unsigned int alloc_pool_size;
> +static __thread unsigned int num_coroutines;
> +static __thread unsigned int max_coroutines_this_slice;
>  static __thread Notifier coroutine_pool_cleanup_notifier;
>
>  static void coroutine_pool_cleanup(Notifier *n, void *value)
> @@ -48,26 +44,19 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
>
>      if (CONFIG_COROUTINE_POOL) {
>          co = QSLIST_FIRST(&alloc_pool);
> -        if (!co) {
> -            if (release_pool_size > POOL_BATCH_SIZE) {
> -                /* Slow path; a good place to register the destructor, too. */
> -                if (!coroutine_pool_cleanup_notifier.notify) {
> -                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
> -                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
> -                }
> -
> -                /* This is not exact; there could be a little skew between
> -                 * release_pool_size and the actual size of release_pool. But
> -                 * it is just a heuristic, it does not need to be perfect.
> -                 */
> -                alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
> -                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
> -                co = QSLIST_FIRST(&alloc_pool);
> -            }
> -        }
>          if (co) {
>              QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
>              alloc_pool_size--;
> +        } else {
> +            if (!coroutine_pool_cleanup_notifier.notify) {
> +                coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
> +                qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
> +            }
> +        }
> +
> +        num_coroutines++;
> +        if (num_coroutines > max_coroutines_this_slice) {
> +            max_coroutines_this_slice = num_coroutines;
>          }
>      }
>
> @@ -86,21 +75,28 @@ static void coroutine_delete(Coroutine *co)
>      co->caller = NULL;
>
>      if (CONFIG_COROUTINE_POOL) {
> -        if (release_pool_size < POOL_BATCH_SIZE * 2) {
> -            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
> -            qatomic_inc(&release_pool_size);
> -            return;
> -        }
> -        if (alloc_pool_size < POOL_BATCH_SIZE) {
> -            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> -            alloc_pool_size++;
> -            return;
> -        }
> +        num_coroutines--;
> +        QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> +        alloc_pool_size++;
> +        return;
>      }
>
>      qemu_coroutine_delete(co);
>  }
>
> +void qemu_coroutine_pool_periodic_resize(void)
> +{
> +    unsigned pool_size_target = max_coroutines_this_slice - num_coroutines;
> +    max_coroutines_this_slice = num_coroutines;
> +
> +    while (alloc_pool_size > pool_size_target) {
> +        Coroutine *co = QSLIST_FIRST(&alloc_pool);
> +        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
> +        qemu_coroutine_delete(co);
> +        alloc_pool_size--;
> +    }
> +}
> +
>  void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
>  {
>      QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
> diff --git a/util/meson.build b/util/meson.build
> index 779f413c86..06241097d2 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -63,6 +63,7 @@ if have_block
>    util_ss.add(files('buffer.c'))
>    util_ss.add(files('bufferiszero.c'))
>    util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND'])))
> +  util_ss.add(files('coroutine-pool-timer.c'))
>    util_ss.add(files('hbitmap.c'))
>    util_ss.add(files('hexdump.c'))
>    util_ss.add(files('iova-tree.c'))
> --
> 2.31.1
>
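
Andrew, Jenifer: the other possible refinement from the commit message,
keeping a minimum number of free coroutines around to avoid latency
after long idle periods, would probably be a one-line tweak to the
resize function rather than a new mechanism. Something like this
(untested sketch; the 64 is an arbitrary placeholder, not a measured
value):

  void qemu_coroutine_pool_periodic_resize(void)
  {
      unsigned pool_size_target = max_coroutines_this_slice - num_coroutines;

      /* Hypothetical floor so an idle thread keeps a few coroutines ready */
      pool_size_target = MAX(pool_size_target, 64);

      max_coroutines_this_slice = num_coroutines;

      while (alloc_pool_size > pool_size_target) {
          Coroutine *co = QSLIST_FIRST(&alloc_pool);
          QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
          qemu_coroutine_delete(co);
          alloc_pool_size--;
      }
  }

I'd rather not add that until a workload shows it matters, but I'm happy
to hear other ideas for the pool sizing policy.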