From: "Daniel P. Berrangé" <berrange@redhat.com>
To: David Hildenbrand <david@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	qemu-devel@nongnu.org,
	"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Igor Mammedov <imammedo@redhat.com>,
	Marek Kedzierski <mkedzier@redhat.com>
Subject: Re: [PATCH v1 1/3] util/oslib-posix: Support MADV_POPULATE_WRITE for os_mem_prealloc()
Date: Tue, 20 Jul 2021 15:08:35 +0100
Message-ID: <YPbY45FmTYNVPKCs@redhat.com>
In-Reply-To: <20210714112306.67793-2-david@redhat.com>

On Wed, Jul 14, 2021 at 01:23:04PM +0200, David Hildenbrand wrote:
> Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
> does not require a SIGBUS handler, doesn't actually touch page content,
> and avoids context switches; it is, therefore, faster and easier to handle
> than our current approach.
> 
> This resolves the TODO in do_touch_pages().
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/qemu/osdep.h |  7 ++++
>  util/oslib-posix.c   | 84 +++++++++++++++++++++++++++++++++-----------
>  2 files changed, 71 insertions(+), 20 deletions(-)
> 

> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index e8bdb02e1d..679796ac1f 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
>               *
>               * 'volatile' to stop compiler optimizing this away
>               * to a no-op
> -             *
> -             * TODO: get a better solution from kernel so we
> -             * don't need to write at all so we don't cause
> -             * wear on the storage backing the region...
>               */
>              *(volatile char *)addr = *addr;
>              addr += hpagesize;
> @@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg)
>      return NULL;
>  }
>  
> +static void *do_madv_populate_write_pages(void *arg)
> +{
> +    MemsetThread *memset_args = (MemsetThread *)arg;
> +    const size_t size = memset_args->numpages * memset_args->hpagesize;
> +    char * const addr = memset_args->addr;
> +    int ret;
> +
> +    /* See do_touch_pages(). */
> +    qemu_mutex_lock(&page_mutex);
> +    while (!threads_created_flag) {
> +        qemu_cond_wait(&page_cond, &page_mutex);
> +    }
> +    qemu_mutex_unlock(&page_mutex);
> +
> +    ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE);
> +    if (ret) {
> +        memset_thread_failed = true;
> +    }
> +    return NULL;
> +}
> +
>  static inline int get_memset_num_threads(int smp_cpus)
>  {
>      long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
> @@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus)
>  }
>  
>  static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> -                            int smp_cpus)
> +                            int smp_cpus, bool use_madv_populate_write)
>  {
>      static gsize initialized = 0;
>      size_t numpages_per_thread, leftover;
> +    void *(*touch_fn)(void *);
>      char *addr = area;
>      int i = 0;
>  
> @@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>          g_once_init_leave(&initialized, 1);
>      }
>  
> +    if (use_madv_populate_write) {
> +        touch_fn = do_madv_populate_write_pages;
> +    } else {
> +        touch_fn = do_touch_pages;
> +    }
> +
>      memset_thread_failed = false;
>      threads_created_flag = false;
>      memset_num_threads = get_memset_num_threads(smp_cpus);
> @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>          memset_thread[i].numpages = numpages_per_thread + (i < leftover);
>          memset_thread[i].hpagesize = hpagesize;
>          qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
> -                           do_touch_pages, &memset_thread[i],
> +                           touch_fn, &memset_thread[i],
>                             QEMU_THREAD_JOINABLE);
>          addr += memset_thread[i].numpages * hpagesize;
>      }

Do you have an indication of what the speed differential is for the
old read/write dance vs the kernel madvise? We needed to use threads
previously because the read/write dance is terribly slow.

Is that still a problem with the madvise approach? I would (perhaps
naively) expect the kernel to be able to do this efficiently for
arbitrarily large memory regions, such that QEMU would not need to
play games with threads.
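
To make the question concrete, I would have naively expected something
as simple as this to suffice (only a sketch, untested, reusing the
QEMU_MADV_POPULATE_WRITE constant this patch adds to osdep.h):

    /* Ask the kernel to populate the whole region in a single call,
     * with no per-thread chunking at all. */
    if (qemu_madvise(area, memory, QEMU_MADV_POPULATE_WRITE)) {
        error_setg_errno(errp, errno,
            "os_mem_prealloc: preallocation failed");
    }

That would avoid the thread setup/teardown cost entirely, if the
kernel side scales well enough.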


> @@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>      return memset_thread_failed;
>  }
>  
> +static bool madv_populate_write_possible(char *area, size_t pagesize)
> +{
> +    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
> +           errno != EINVAL;
> +}
> +
>  void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
>                       Error **errp)
>  {
> @@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
>      struct sigaction act, oldact;
>      size_t hpagesize = qemu_fd_getpagesize(fd);
>      size_t numpages = DIV_ROUND_UP(memory, hpagesize);
> +    bool use_madv_populate_write;

This is left uninitialized, so it starts out as random garbage from the
stack...

> +
> +    /*
> +     * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
> +     * some special mappings, such as mapping /dev/mem.
> +     */
> +    if (madv_populate_write_possible(area, hpagesize)) {
> +        use_madv_populate_write = true;
> +    }

but this implicitly assumes it was initialized to false.
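
Something along these lines would avoid depending on the implicit
initialization (just a sketch):

    bool use_madv_populate_write =
        madv_populate_write_possible(area, hpagesize);

i.e. either assign the probe result unconditionally at the declaration,
or add an explicit "= false" initializer.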

>  
> -    memset(&act, 0, sizeof(act));
> -    act.sa_handler = &sigbus_handler;
> -    act.sa_flags = 0;
> +    if (!use_madv_populate_write) {
> +        memset(&act, 0, sizeof(act));
> +        act.sa_handler = &sigbus_handler;
> +        act.sa_flags = 0;
>  
> -    ret = sigaction(SIGBUS, &act, &oldact);
> -    if (ret) {
> -        error_setg_errno(errp, errno,
> -            "os_mem_prealloc: failed to install signal handler");
> -        return;
> +        ret = sigaction(SIGBUS, &act, &oldact);
> +        if (ret) {
> +            error_setg_errno(errp, errno,
> +                "os_mem_prealloc: failed to install signal handler");
> +            return;
> +        }
>      }
>  
>      /* touch pages simultaneously */
> -    if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
> +    if (touch_all_pages(area, hpagesize, numpages, smp_cpus,
> +                        use_madv_populate_write)) {
>          error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
>              "pages available to allocate guest RAM");
>      }
>  
> -    ret = sigaction(SIGBUS, &oldact, NULL);
> -    if (ret) {
> -        /* Terminate QEMU since it can't recover from error */
> -        perror("os_mem_prealloc: failed to reinstall signal handler");
> -        exit(1);
> +    if (!use_madv_populate_write) {
> +        ret = sigaction(SIGBUS, &oldact, NULL);
> +        if (ret) {
> +            /* Terminate QEMU since it can't recover from error */
> +            perror("os_mem_prealloc: failed to reinstall signal handler");
> +            exit(1);
> +        }
>      }
>  }
>  
> -- 
> 2.31.1
> 
> 

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|


