From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
To: Michal Privoznik <mprivozn@redhat.com>, jmario@redhat.com
Cc: qemu-devel@nongnu.org, david@redhat.com, berrange@redhat.com
Subject: Re: [PATCH] util: NUMA aware memory preallocation
Date: Wed, 11 May 2022 09:34:07 +0100
Message-ID: <Ynt0/9jfeUPg4JxN@work-vm>
In-Reply-To: <ffdcd118d59b379ede2b64745144165a40f6a813.1652165704.git.mprivozn@redhat.com>

* Michal Privoznik (mprivozn@redhat.com) wrote:
> When allocating large amounts of memory the task is offloaded
> onto threads. These threads then use various techniques to
> allocate the memory fully (madvise(), writing into the memory).
> However, these threads are free to run on any CPU, which becomes
> problematic on NUMA machines because it may happen that a thread
> is running on a distant node.
> 
> Ideally, this is something that a management application would
> resolve, but we are not anywhere close to that. Firstly, memory
> allocation happens before the monitor socket is even available. But
> okay, that's what -preconfig is for. But then the problem is that
> 'object-add' would not return until all memory is preallocated.
> 
> Long story short, the management application has no way of learning
> the TIDs of the allocator threads, so it can't make them run NUMA
> aware.
> 
> But what we can do is to propagate the 'host-nodes' attribute of
> MemoryBackend object down to where preallocation threads are
> created and set their affinity according to the attribute.

Joe (cc'd) sent me some numbers for this which emphasise how useful it
is:
 | On systems with 4 physical NUMA nodes and 2-6 TB of memory, this NUMA-aware
 | preallocation provided about a 25% speedup in touching the pages.
 | The speedup gets larger as the NUMA node count and memory sizes grow.
....
 | In a simple parallel 1 GB page-zeroing test on a very large system (32 NUMA
 | nodes and 47 TB of memory), the NUMA-aware preallocation was 2.3X faster
 | than letting the threads float wherever.
 | We're working with someone whose large guest normally takes 4.5 hours to
 | boot.  With Michal P's initial patch to parallelize the preallocation, that
 | time dropped to about 1 hour.  Including this NUMA-aware preallocation
 | would reduce the guest boot time to less than 1/2 hour.

so chopping *half an hour* off the startup time seems a worthy
optimisation (even if most of us aren't fortunate enough to have 47 TB
of RAM).

Dave

> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2074000
> Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
> ---
>  backends/hostmem.c     |  6 ++--
>  hw/virtio/virtio-mem.c |  2 +-
>  include/qemu/osdep.h   |  2 ++
>  util/meson.build       |  2 +-
>  util/oslib-posix.c     | 74 ++++++++++++++++++++++++++++++++++++++++--
>  util/oslib-win32.c     |  2 ++
>  6 files changed, 82 insertions(+), 6 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index a7bae3d713..7373472c7e 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -232,7 +232,8 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
>          void *ptr = memory_region_get_ram_ptr(&backend->mr);
>          uint64_t sz = memory_region_size(&backend->mr);
>  
> -        os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err);
> +        os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads,
> +                        backend->host_nodes, MAX_NODES, &local_err);
>          if (local_err) {
>              error_propagate(errp, local_err);
>              return;
> @@ -394,7 +395,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
>           */
>          if (backend->prealloc) {
>              os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
> -                            backend->prealloc_threads, &local_err);
> +                            backend->prealloc_threads, backend->host_nodes,
> +                            MAX_NODES, &local_err);
>              if (local_err) {
>                  goto out;
>              }
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 5aca408726..48b104cdf6 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -467,7 +467,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
>              int fd = memory_region_get_fd(&vmem->memdev->mr);
>              Error *local_err = NULL;
>  
> -            os_mem_prealloc(fd, area, size, 1, &local_err);
> +            os_mem_prealloc(fd, area, size, 1, NULL, 0, &local_err);
>              if (local_err) {
>                  static bool warned;
>  
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index 1c1e7eca98..474cbf3b86 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -577,6 +577,8 @@ unsigned long qemu_getauxval(unsigned long type);
>  void qemu_set_tty_echo(int fd, bool echo);
>  
>  void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
> +                     const unsigned long *host_nodes,
> +                     unsigned long max_node,
>                       Error **errp);
>  
>  /**
> diff --git a/util/meson.build b/util/meson.build
> index 8f16018cd4..393ff74570 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -15,7 +15,7 @@ freebsd_dep = []
>  if targetos == 'freebsd'
>    freebsd_dep = util
>  endif
> -util_ss.add(when: 'CONFIG_POSIX', if_true: [files('oslib-posix.c'), freebsd_dep])
> +util_ss.add(when: 'CONFIG_POSIX', if_true: [files('oslib-posix.c'), freebsd_dep, numa])
>  util_ss.add(when: 'CONFIG_POSIX', if_true: files('qemu-thread-posix.c'))
>  util_ss.add(when: 'CONFIG_POSIX', if_true: files('memfd.c'))
>  util_ss.add(when: 'CONFIG_WIN32', if_true: files('aio-win32.c'))
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 477990f39b..1572b9b178 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -73,6 +73,10 @@
>  #include "qemu/error-report.h"
>  #endif
>  
> +#ifdef CONFIG_NUMA
> +#include <numa.h>
> +#endif
> +
>  #define MAX_MEM_PREALLOC_THREAD_COUNT 16
>  
>  struct MemsetThread;
> @@ -82,6 +86,9 @@ typedef struct MemsetContext {
>      bool any_thread_failed;
>      struct MemsetThread *threads;
>      int num_threads;
> +#ifdef CONFIG_NUMA
> +    struct bitmask *nodemask;
> +#endif
>  } MemsetContext;
>  
>  struct MemsetThread {
> @@ -420,6 +427,12 @@ static void *do_touch_pages(void *arg)
>      }
>      qemu_mutex_unlock(&page_mutex);
>  
> +#ifdef CONFIG_NUMA
> +    if (memset_args->context->nodemask) {
> +        numa_run_on_node_mask(memset_args->context->nodemask);
> +    }
> +#endif
> +
>      /* unblock SIGBUS */
>      sigemptyset(&set);
>      sigaddset(&set, SIGBUS);
> @@ -463,6 +476,12 @@ static void *do_madv_populate_write_pages(void *arg)
>      }
>      qemu_mutex_unlock(&page_mutex);
>  
> +#ifdef CONFIG_NUMA
> +    if (memset_args->context->nodemask) {
> +        numa_run_on_node_mask(memset_args->context->nodemask);
> +    }
> +#endif
> +
>      if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
>          ret = -errno;
>      }
> @@ -489,7 +508,9 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
>  }
>  
>  static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> -                           int smp_cpus, bool use_madv_populate_write)
> +                           int smp_cpus, const unsigned long *host_nodes,
> +                           unsigned long max_node,
> +                           bool use_madv_populate_write)
>  {
>      static gsize initialized = 0;
>      MemsetContext context = {
> @@ -499,6 +520,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>      void *(*touch_fn)(void *);
>      int ret = 0, i = 0;
>      char *addr = area;
> +    unsigned long value = max_node;
>  
>      if (g_once_init_enter(&initialized)) {
>          qemu_mutex_init(&page_mutex);
> @@ -520,6 +542,48 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>          touch_fn = do_touch_pages;
>      }
>  
> +    if (host_nodes) {
> +        value = find_first_bit(host_nodes, max_node);
> +    }
> +    if (value != max_node) {
> +#ifdef CONFIG_NUMA
> +        struct bitmask *cpus = numa_allocate_cpumask();
> +        g_autofree unsigned long *zerocpumask;
> +        size_t zerocpumasklen;
> +        g_autofree unsigned long *zeronodemask;
> +        size_t zeronodemasklen;
> +
> +        context.nodemask = numa_bitmask_alloc(max_node);
> +
> +        zerocpumasklen = cpus->size / sizeof(unsigned long);
> +        zerocpumask = g_new0(unsigned long, zerocpumasklen);
> +
> +        for (; value != max_node;
> +             value = find_next_bit(host_nodes, max_node, value + 1)) {
> +            if (numa_node_to_cpus(value, cpus) ||
> +                memcmp(cpus->maskp, zerocpumask, zerocpumasklen) == 0)
> +                continue;
> +
> +            /* If given NUMA node has CPUs run threads on them. */
> +            numa_bitmask_setbit(context.nodemask, value);
> +        }
> +
> +        numa_bitmask_free(cpus);
> +
> +        zeronodemasklen = max_node / sizeof(unsigned long);
> +        zeronodemask = g_new0(unsigned long, zeronodemasklen);
> +
> +        if (memcmp(context.nodemask->maskp,
> +                   zeronodemask, zeronodemasklen) == 0) {
> +            /* If no NUMA has a CPU available, then don't pin threads. */
> +            g_clear_pointer(&context.nodemask, numa_bitmask_free);
> +        }
> +#else
> +        errno = -EINVAL;
> +        return -1;
> +#endif
> +    }
> +
>      context.threads = g_new0(MemsetThread, context.num_threads);
>      numpages_per_thread = numpages / context.num_threads;
>      leftover = numpages % context.num_threads;
> @@ -554,6 +618,10 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>      if (!use_madv_populate_write) {
>          sigbus_memset_context = NULL;
>      }
> +
> +#ifdef CONFIG_NUMA
> +    g_clear_pointer(&context.nodemask, numa_bitmask_free);
> +#endif
>      g_free(context.threads);
>  
>      return ret;
> @@ -566,6 +634,8 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
>  }
>  
>  void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> +                     const unsigned long *host_nodes,
> +                     unsigned long max_node,
>                       Error **errp)
>  {
>      static gsize initialized;
> @@ -608,7 +678,7 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
>  
>      /* touch pages simultaneously */
>      ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
> -                          use_madv_populate_write);
> +                          host_nodes, max_node, use_madv_populate_write);
>      if (ret) {
>          error_setg_errno(errp, -ret,
>                           "os_mem_prealloc: preallocating memory failed");
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index dafef4f157..6efd912355 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -314,6 +314,8 @@ int getpagesize(void)
>  }
>  
>  void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> +                     const unsigned long *host_nodes,
> +                     unsigned long max_node,
>                       Error **errp)
>  {
>      int i;
> -- 
> 2.35.1
> 
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



Thread overview: 19+ messages
2022-05-10  6:55 [PATCH] util: NUMA aware memory preallocation Michal Privoznik
2022-05-10  9:12 ` Daniel P. Berrangé
2022-05-10 10:27   ` Dr. David Alan Gilbert
2022-05-11 13:16   ` Michal Prívozník
2022-05-11 14:50     ` David Hildenbrand
2022-05-11 15:08     ` Daniel P. Berrangé
2022-05-11 16:41       ` David Hildenbrand
2022-05-11  8:34 ` Dr. David Alan Gilbert [this message]
2022-05-11  9:20   ` Daniel P. Berrangé
2022-05-11  9:19 ` Daniel P. Berrangé
2022-05-11  9:31   ` David Hildenbrand
2022-05-11  9:34     ` Daniel P. Berrangé
2022-05-11 10:03       ` David Hildenbrand
2022-05-11 10:10         ` Daniel P. Berrangé
2022-05-11 11:07           ` Paolo Bonzini
2022-05-11 16:54             ` Daniel P. Berrangé
2022-05-12  7:41               ` Paolo Bonzini
2022-05-12  8:15                 ` Daniel P. Berrangé
2022-06-08 10:34       ` David Hildenbrand
