Re: [PATCH RFC 14/21] migration: Map hugetlbfs ramblocks twice, and pre-allocate

From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
To: Peter Xu <peterx@redhat.com>
Cc: qemu-devel@nongnu.org,
	Leonardo Bras Soares Passos <lsoaresp@redhat.com>,
	James Houghton <jthoughton@google.com>,
	Juan Quintela <quintela@redhat.com>
Subject: Re: [PATCH RFC 14/21] migration: Map hugetlbfs ramblocks twice, and pre-allocate
Date: Wed, 25 Jan 2023 14:25:47 +0000	[thread overview]
Message-ID: <Y9E769Iag8YL5iQy@work-vm> (raw)
In-Reply-To: <20230117220914.2062125-15-peterx@redhat.com>

* Peter Xu (peterx@redhat.com) wrote:
> Add a RAMBlock.host_mirror for all the hugetlbfs backed guest memories.
> It'll be used to remap the same region twice and it'll be used to service
> page faults using UFFDIO_CONTINUE.
> 
> To make sure all accesses to these ranges will generate minor page faults
> not missing page faults, we need to pre-allocate the files to make sure
> page cache exist start from the beginning.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

> ---
>  include/exec/ramblock.h |  7 +++++
>  migration/ram.c         | 59 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 66 insertions(+)
> 
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 3f31ce1591..c76683c3c8 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -28,6 +28,13 @@ struct RAMBlock {
>      struct rcu_head rcu;
>      struct MemoryRegion *mr;
>      uint8_t *host;
> +    /*
> +     * This is only used for hugetlbfs ramblocks where doublemap is
> +     * enabled.  The pointer is managed by dest host migration code, and
> +     * should be NULL when migration is finished.  On src host, it should
> +     * always be NULL.
> +     */
> +    uint8_t *host_mirror;
>      uint8_t *colo_cache; /* For colo, VM's ram cache */
>      ram_addr_t offset;
>      ram_addr_t used_length;
> diff --git a/migration/ram.c b/migration/ram.c
> index 2ebf414f5f..37d7b3553a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -3879,6 +3879,57 @@ void colo_release_ram_cache(void)
>      ram_state_cleanup(&ram_state);
>  }
>  
> +static int migrate_hugetlb_doublemap_init(void)
> +{
> +    RAMBlock *rb;
> +    void *addr;
> +    int ret;
> +
> +    if (!migrate_hugetlb_doublemap()) {
> +        return 0;
> +    }
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
> +        if (qemu_ram_is_hugetlb(rb)) {
> +            /*
> +             * Firstly, we remap the same ramblock into another range of
> +             * virtual address, so that we can write to the pages without
> +             * touching the page tables that directly mapped for the guest.
> +             */
> +            addr = ramblock_file_map(rb);
> +            if (addr == MAP_FAILED) {
> +                ret = -errno;
> +                error_report("%s: Duplicate mapping for hugetlb ramblock '%s'"
> +                             "failed: %s", __func__, qemu_ram_get_idstr(rb),
> +                             strerror(errno));
> +                return ret;
> +            }
> +            rb->host_mirror = addr;
> +
> +            /*
> +             * We need to make sure we pre-allocate the range with
> +             * hugetlbfs pages before hand, so that all the page fault will
> +             * be trapped as MINOR faults always, rather than MISSING
> +             * faults in userfaultfd.
> +             */
> +            ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);
> +            if (ret) {
> +                error_report("Failed to populate hugetlb ramblock '%s': "
> +                             "%s", qemu_ram_get_idstr(rb), strerror(-ret));
> +                return ret;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * When reach here, it means we've setup the mirror mapping for all the
> +     * hugetlbfs pages.  Hence when page fault happens, we'll be able to
> +     * resolve page faults using UFFDIO_CONTINUE for hugetlbfs pages, but
> +     * we'll keep using UFFDIO_COPY for anonymous pages.
> +     */
> +    return 0;
> +}
> +
>  /**
>   * ram_load_setup: Setup RAM for migration incoming side
>   *
> @@ -3893,6 +3944,10 @@ static int ram_load_setup(QEMUFile *f, void *opaque)
>          return -1;
>      }
>  
> +    if (migrate_hugetlb_doublemap_init()) {
> +        return -1;
> +    }
> +
>      xbzrle_load_setup();
>      ramblock_recv_map_init();
>  
> @@ -3913,6 +3968,10 @@ static int ram_load_cleanup(void *opaque)
>      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
>          g_free(rb->receivedmap);
>          rb->receivedmap = NULL;
> +        if (rb->host_mirror) {
> +            munmap(rb->host_mirror, rb->mmap_length);
> +            rb->host_mirror = NULL;
> +        }
>      }
>  
>      return 0;
> -- 
> 2.37.3
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK