On Wed, Sep 01, 2021 at 05:09:23PM +0100, Stefan Hajnoczi wrote:
> It was reported that enabling SafeStack reduces IOPS significantly
> (>25%) with the following fio benchmark on virtio-blk using a NVMe host
> block device:
>
>   # fio --rw=randrw --bs=4k --iodepth=64 --runtime=1m --direct=1 \
>         --filename=/dev/vdb --name=job1 --ioengine=libaio --thread \
>         --group_reporting --numjobs=16 --time_based \
>         --output=/tmp/fio_result
>
> Serge Guelton and I found that SafeStack is not really at fault, it just
> increases the cost of coroutine creation. This fio workload exhausts the
> coroutine pool and coroutine creation becomes a bottleneck. Previous
> work by Honghao Wang also pointed to excessive coroutine creation.
>
> Creating new coroutines is expensive due to allocating new stacks with
> mmap(2) and mprotect(2). Currently there are thread-local and global
> pools that recycle old Coroutine objects and their stacks, but the
> hardcoded size limit of 64 for thread-local pools and 128 for the global
> pool is insufficient for the fio benchmark shown above.
>
> This patch changes the coroutine pool algorithm to a simple thread-local
> pool without a size limit. Threads periodically shrink the pool down to
> a size sufficient for the maximum observed number of coroutines.
>
> This is a very simple algorithm. Fancier things could be done like
> keeping a minimum number of coroutines around to avoid latency when a
> new coroutine is created after a long period of inactivity. Another
> thought is to stop the timer when the pool size is zero for power saving
> on threads that aren't using coroutines. However, I'd rather not add
> bells and whistles unless they are really necessary.
>
> The global pool is removed by this patch. It can help to hide the fact
> that local pools are easily exhausted, but it doesn't fix the root
> cause. I don't think there is a need for a global pool because QEMU's
> threads are long-lived, so let's keep things simple.
>
> Performance of the above fio benchmark is as follows:
>
>          Before   After
>    IOPS     60k     97k
>
> Memory usage varies over time as needed by the workload:
>
>               VSZ (KB)             RSS (KB)
>    Before fio 4705248              843128
>    During fio 5747668 (+ ~100 MB)  849280
>    After fio  4694996 (- ~100 MB)  845184
>
> This confirms that coroutines are indeed being freed when no longer
> needed.
>
> Thanks to Serge Guelton for working on identifying the bottleneck with
> me!
>
> Reported-by: Tingting Mao
> Cc: Serge Guelton
> Cc: Honghao Wang
> Cc: Paolo Bonzini
> Cc: Daniele Buono
> Signed-off-by: Stefan Hajnoczi
> ---
>  include/qemu/coroutine-pool-timer.h | 36 +++++++++++++++++
>  include/qemu/coroutine.h            |  7 ++++
>  iothread.c                          |  6 +++
>  util/coroutine-pool-timer.c         | 35 ++++++++++++++++
>  util/main-loop.c                    |  5 +++
>  util/qemu-coroutine.c               | 62 ++++++++++++++---------------
>  util/meson.build                    |  1 +
>  7 files changed, 119 insertions(+), 33 deletions(-)
>  create mode 100644 include/qemu/coroutine-pool-timer.h
>  create mode 100644 util/coroutine-pool-timer.c

Adding Andrew and Jenifer in case they have thoughts on improving QEMU's
coroutine pool algorithm.
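
To save you reverse engineering the diff, the sizing rule is roughly
this: each thread keeps an unlimited free list and tracks the peak
number of coroutines in use during the current 15-second slice. When
the timer fires, qemu_coroutine_pool_periodic_resize() trims the free
list down to (peak - currently in use) and resets the peak. As a
made-up example, if a slice peaked at 1,000 coroutines and 200 are
still in flight when the timer fires, up to 800 free coroutines are
kept and the rest are freed; a thread that goes completely idle shrinks
its pool to zero within two timer periods.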

>
> diff --git a/include/qemu/coroutine-pool-timer.h b/include/qemu/coroutine-pool-timer.h
> new file mode 100644
> index 0000000000..c0b520ce99
> --- /dev/null
> +++ b/include/qemu/coroutine-pool-timer.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU coroutine pool timer
> + *
> + * Copyright (c) 2021 Red Hat, Inc.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +#ifndef COROUTINE_POOL_TIMER_H
> +#define COROUTINE_POOL_TIMER_H
> +
> +#include "qemu/osdep.h"
> +#include "block/aio.h"
> +
> +/**
> + * A timer that periodically resizes this thread's coroutine pool, freeing
> + * memory if there are too many unused coroutines.
> + *
> + * Threads that make heavy use of coroutines should use this. Failure to resize
> + * the coroutine pool can lead to large amounts of memory sitting idle and
> + * never being used after the first time.
> + */
> +typedef struct {
> +    QEMUTimer *timer;
> +} CoroutinePoolTimer;
> +
> +/* Call this before the thread runs the AioContext */
> +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx);
> +
> +/* Call this before the AioContext from the init function is destroyed */
> +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt);
> +
> +#endif /* COROUTINE_POOL_TIMER_H */
> diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
> index 4829ff373d..fdb2955ff9 100644
> --- a/include/qemu/coroutine.h
> +++ b/include/qemu/coroutine.h
> @@ -122,6 +122,13 @@ bool qemu_in_coroutine(void);
>   */
>  bool qemu_coroutine_entered(Coroutine *co);
>
> +/**
> + * Optionally call this function periodically to shrink the thread-local pool
> + * down. Spiky workloads can create many coroutines and then never reach that
> + * level again. Shrinking the pool reclaims memory in this case.
> + */
> +void qemu_coroutine_pool_periodic_resize(void);
> +
>  /**
>   * Provides a mutex that can be used to synchronise coroutines
>   */
> diff --git a/iothread.c b/iothread.c
> index ddbbde61f7..39a24f1a55 100644
> --- a/iothread.c
> +++ b/iothread.c
> @@ -23,6 +23,7 @@
>  #include "qemu/error-report.h"
>  #include "qemu/rcu.h"
>  #include "qemu/main-loop.h"
> +#include "qemu/coroutine-pool-timer.h"
>
>  typedef ObjectClass IOThreadClass;
>
> @@ -42,6 +43,7 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
>  static void *iothread_run(void *opaque)
>  {
>      IOThread *iothread = opaque;
> +    CoroutinePoolTimer co_pool_timer;
>
>      rcu_register_thread();
>      /*
> @@ -53,6 +55,8 @@ static void *iothread_run(void *opaque)
>      iothread->thread_id = qemu_get_thread_id();
>      qemu_sem_post(&iothread->init_done_sem);
>
> +    coroutine_pool_timer_init(&co_pool_timer, iothread->ctx);
> +
>      while (iothread->running) {
>          /*
>           * Note: from functional-wise the g_main_loop_run() below can
> @@ -74,6 +78,8 @@ static void *iothread_run(void *opaque)
>          }
>      }
>
> +    coroutine_pool_timer_cleanup(&co_pool_timer);
> +
>      g_main_context_pop_thread_default(iothread->worker_context);
>      rcu_unregister_thread();
>      return NULL;
> diff --git a/util/coroutine-pool-timer.c b/util/coroutine-pool-timer.c
> new file mode 100644
> index 0000000000..36d3216718
> --- /dev/null
> +++ b/util/coroutine-pool-timer.c
> @@ -0,0 +1,35 @@
> +/*
> + * QEMU coroutine pool timer
> + *
> + * Copyright (c) 2021 Red Hat, Inc.
> + *
> + * SPDX-License-Identifier: LGPL-2.1-or-later
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +#include "qemu/coroutine-pool-timer.h"
> +
> +static void coroutine_pool_timer_cb(void *opaque)
> +{
> +    CoroutinePoolTimer *pt = opaque;
> +    int64_t expiry_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
> +                             15 * NANOSECONDS_PER_SECOND;
> +
> +    qemu_coroutine_pool_periodic_resize();
> +    timer_mod(pt->timer, expiry_time_ns);
> +}
> +
> +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx)
> +{
> +    pt->timer = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_NS,
> +                              coroutine_pool_timer_cb, pt);
> +    coroutine_pool_timer_cb(pt);
> +}
> +
> +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt)
> +{
> +    timer_free(pt->timer);
> +    pt->timer = NULL;
> +}
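
The idea mentioned in the commit message about stopping the timer for
power saving when a thread has no coroutines could probably live
entirely in this callback. A rough, untested sketch; note that
qemu_coroutine_pool_size() is hypothetical and does not exist in this
patch:

  static void coroutine_pool_timer_cb(void *opaque)
  {
      CoroutinePoolTimer *pt = opaque;

      qemu_coroutine_pool_periodic_resize();

      /* Hypothetical: only re-arm while there is something left to shrink */
      if (qemu_coroutine_pool_size() > 0) {
          timer_mod(pt->timer, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
                               15 * NANOSECONDS_PER_SECOND);
      }
  }

The catch is that something would then have to re-arm the timer when
the pool becomes non-empty again, which is why I left it out for now.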
> diff --git a/util/main-loop.c b/util/main-loop.c
> index 06b18b195c..23342e2215 100644
> --- a/util/main-loop.c
> +++ b/util/main-loop.c
> @@ -33,6 +33,7 @@
>  #include "qemu/error-report.h"
>  #include "qemu/queue.h"
>  #include "qemu/compiler.h"
> +#include "qemu/coroutine-pool-timer.h"
>
>  #ifndef _WIN32
>  #include <sys/wait.h>
> @@ -131,6 +132,7 @@ static int qemu_signal_init(Error **errp)
>
>  static AioContext *qemu_aio_context;
>  static QEMUBH *qemu_notify_bh;
> +static CoroutinePoolTimer main_loop_co_pool_timer;
>
>  static void notify_event_cb(void *opaque)
>  {
> @@ -181,6 +183,9 @@ int qemu_init_main_loop(Error **errp)
>      g_source_set_name(src, "io-handler");
>      g_source_attach(src, NULL);
>      g_source_unref(src);
> +
> +    coroutine_pool_timer_init(&main_loop_co_pool_timer, qemu_aio_context);
> +
>      return 0;
>  }
>
> diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
> index 38fb6d3084..105dbfa89e 100644
> --- a/util/qemu-coroutine.c
> +++ b/util/qemu-coroutine.c
> @@ -20,15 +20,11 @@
>  #include "qemu/coroutine_int.h"
>  #include "block/aio.h"
>
> -enum {
> -    POOL_BATCH_SIZE = 64,
> -};
> -
>  /** Free list to speed up creation */
> -static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
> -static unsigned int release_pool_size;
>  static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
>  static __thread unsigned int alloc_pool_size;
> +static __thread unsigned int num_coroutines;
> +static __thread unsigned int max_coroutines_this_slice;
>  static __thread Notifier coroutine_pool_cleanup_notifier;
>
>  static void coroutine_pool_cleanup(Notifier *n, void *value)
> @@ -48,26 +44,19 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
>
>      if (CONFIG_COROUTINE_POOL) {
>          co = QSLIST_FIRST(&alloc_pool);
> -        if (!co) {
> -            if (release_pool_size > POOL_BATCH_SIZE) {
> -                /* Slow path; a good place to register the destructor, too. */
> -                if (!coroutine_pool_cleanup_notifier.notify) {
> -                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
> -                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
> -                }
> -
> -                /* This is not exact; there could be a little skew between
> -                 * release_pool_size and the actual size of release_pool. But
> -                 * it is just a heuristic, it does not need to be perfect.
> -                 */
> -                alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
> -                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
> -                co = QSLIST_FIRST(&alloc_pool);
> -            }
> -        }
>          if (co) {
>              QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
>              alloc_pool_size--;
> +        } else {
> +            if (!coroutine_pool_cleanup_notifier.notify) {
> +                coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
> +                qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
> +            }
> +        }
> +
> +        num_coroutines++;
> +        if (num_coroutines > max_coroutines_this_slice) {
> +            max_coroutines_this_slice = num_coroutines;
>          }
>      }
>
> @@ -86,21 +75,28 @@ static void coroutine_delete(Coroutine *co)
>      co->caller = NULL;
>
>      if (CONFIG_COROUTINE_POOL) {
> -        if (release_pool_size < POOL_BATCH_SIZE * 2) {
> -            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
> -            qatomic_inc(&release_pool_size);
> -            return;
> -        }
> -        if (alloc_pool_size < POOL_BATCH_SIZE) {
> -            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> -            alloc_pool_size++;
> -            return;
> -        }
> +        num_coroutines--;
> +        QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> +        alloc_pool_size++;
> +        return;
>      }
>
>      qemu_coroutine_delete(co);
>  }
>
> +void qemu_coroutine_pool_periodic_resize(void)
> +{
> +    unsigned pool_size_target = max_coroutines_this_slice - num_coroutines;
> +    max_coroutines_this_slice = num_coroutines;
> +
> +    while (alloc_pool_size > pool_size_target) {
> +        Coroutine *co = QSLIST_FIRST(&alloc_pool);
> +        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
> +        qemu_coroutine_delete(co);
> +        alloc_pool_size--;
> +    }
> +}
> +
>  void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
>  {
>      QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
> diff --git a/util/meson.build b/util/meson.build
> index 779f413c86..06241097d2 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -63,6 +63,7 @@ if have_block
>    util_ss.add(files('buffer.c'))
>    util_ss.add(files('bufferiszero.c'))
>    util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND'])))
> +  util_ss.add(files('coroutine-pool-timer.c'))
>    util_ss.add(files('hbitmap.c'))
>    util_ss.add(files('hexdump.c'))
>    util_ss.add(files('iova-tree.c'))
> --
> 2.31.1
>
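
Andrew, Jenifer: the other possible refinement from the commit message,
keeping a minimum number of free coroutines around to avoid latency
after long idle periods, would probably be a one-line tweak to the
resize function rather than a new mechanism. Something like this
(untested sketch; the 64 is an arbitrary placeholder, not a measured
value):

  void qemu_coroutine_pool_periodic_resize(void)
  {
      unsigned pool_size_target = max_coroutines_this_slice - num_coroutines;

      /* Hypothetical floor so an idle thread keeps a few coroutines ready */
      pool_size_target = MAX(pool_size_target, 64);

      max_coroutines_this_slice = num_coroutines;

      while (alloc_pool_size > pool_size_target) {
          Coroutine *co = QSLIST_FIRST(&alloc_pool);
          QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
          qemu_coroutine_delete(co);
          alloc_pool_size--;
      }
  }

I'd rather not add that until a workload shows it matters, but I'm happy
to hear other ideas for the pool sizing policy.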