From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Wei Liu <wei.liu2@citrix.com>, xen-devel@lists.xen.org
Cc: dario.faggioli@citrix.com, JBeulich@suse.com,
	ian.jackson@eu.citrix.com, ian.campbell@citrix.com,
	ufimtseva@gmail.com
Subject: Re: [PATCH v5 16/24] libxc: allocate memory with vNUMA information for HVM guest
Date: Fri, 13 Feb 2015 16:22:44 +0000	[thread overview]
Message-ID: <54DE24D4.5010708@citrix.com> (raw)
In-Reply-To: <1423770294-9779-17-git-send-email-wei.liu2@citrix.com>

On 12/02/15 19:44, Wei Liu wrote:
> The algorithm is more or less the same as the one used for PV guests.
> Libxc gets hold of the mapping of vnode to pnode and the size of each
> vnode, then allocates memory accordingly.
>
> The function then returns the low memory end, high memory end and MMIO
> start to the caller. Libxl needs those values to construct vmemranges
> for the guest.
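
(Purely as an illustration of the new interface, not part of the patch: a
minimal sketch of how a caller such as libxl might fill in the vNUMA fields
and consume the out parameters.  The two-node layout, the 2GB mem_size and
the error handling are made up for the example; the xc_hvm_build() prototype
and the struct fields are taken from the hunks quoted below.)

#include <stdio.h>
#include <inttypes.h>
#include <xenctrl.h>
#include <xenguest.h>

static int build_with_vnuma(xc_interface *xch, uint32_t domid,
                            struct xc_hvm_build_args *args)
{
    /*
     * Two 1GB vnodes covering guest memory contiguously.  args->mem_size
     * is assumed to have been set to 2GB by the caller, so the sum of the
     * vmemranges matches it (as checked in setup_guest()).
     */
    xen_vmemrange_t vmemranges[] = {
        { .start = 0,          .end = 1ULL << 30, .flags = 0, .nid = 0 },
        { .start = 1ULL << 30, .end = 2ULL << 30, .flags = 0, .nid = 1 },
    };
    unsigned int vnode_to_pnode[] = { 0, 1 }; /* vnode n placed on pnode n */
    int rc;

    args->vmemranges     = vmemranges;
    args->nr_vmemranges  = 2;
    args->vnode_to_pnode = vnode_to_pnode;
    args->nr_vnodes      = 2;

    rc = xc_hvm_build(xch, domid, args);
    if ( rc != 0 )
        return rc;

    /*
     * Out parameters filled in by build_hvm_info(); libxl uses them to
     * construct the guest-visible vmemranges around the MMIO hole.
     */
    printf("lowmem end 0x%"PRIx64", highmem end 0x%"PRIx64
           ", mmio start 0x%"PRIx64"\n",
           args->lowmem_end, args->highmem_end, args->mmio_start);

    return 0;
}
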
>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> Cc: Ian Campbell <ian.campbell@citrix.com>
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Dario Faggioli <dario.faggioli@citrix.com>
> Cc: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
> Changes in v5:
> 1. Use a better loop variable name vnid.
>
> Changes in v4:
> 1. Adapt to new interface.
> 2. Shorten error message.
> 3. This patch includes only functional changes.
>
> Changes in v3:
> 1. Rewrite commit log.
> 2. Add a few code comments.
> ---
>  tools/libxc/include/xenguest.h |  11 +++++
>  tools/libxc/xc_hvm_build_x86.c | 105 ++++++++++++++++++++++++++++++++++-------
>  2 files changed, 100 insertions(+), 16 deletions(-)
>
> diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
> index 40bbac8..ff66cb1 100644
> --- a/tools/libxc/include/xenguest.h
> +++ b/tools/libxc/include/xenguest.h
> @@ -230,6 +230,17 @@ struct xc_hvm_build_args {
>      struct xc_hvm_firmware_module smbios_module;
>      /* Whether to use claim hypercall (1 - enable, 0 - disable). */
>      int claim_enabled;
> +
> +    /* vNUMA information */
> +    xen_vmemrange_t *vmemranges;
> +    unsigned int nr_vmemranges;
> +    unsigned int *vnode_to_pnode;
> +    unsigned int nr_vnodes;
> +
> +    /* Out parameters  */
> +    uint64_t lowmem_end;
> +    uint64_t highmem_end;
> +    uint64_t mmio_start;
>  };
>  
>  /**
> diff --git a/tools/libxc/xc_hvm_build_x86.c b/tools/libxc/xc_hvm_build_x86.c
> index ecc3224..a2a3777 100644
> --- a/tools/libxc/xc_hvm_build_x86.c
> +++ b/tools/libxc/xc_hvm_build_x86.c
> @@ -89,7 +89,8 @@ static int modules_init(struct xc_hvm_build_args *args,
>  }
>  
>  static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
> -                           uint64_t mmio_start, uint64_t mmio_size)
> +                           uint64_t mmio_start, uint64_t mmio_size,
> +                           struct xc_hvm_build_args *args)
>  {
>      struct hvm_info_table *hvm_info = (struct hvm_info_table *)
>          (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
> @@ -119,6 +120,10 @@ static void build_hvm_info(void *hvm_info_page, uint64_t mem_size,
>      hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
>      hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);
>  
> +    args->lowmem_end = lowmem_end;
> +    args->highmem_end = highmem_end;
> +    args->mmio_start = mmio_start;
> +
>      /* Finish with the checksum. */
>      for ( i = 0, sum = 0; i < hvm_info->length; i++ )
>          sum += ((uint8_t *)hvm_info)[i];
> @@ -244,7 +249,7 @@ static int setup_guest(xc_interface *xch,
>                         char *image, unsigned long image_size)
>  {
>      xen_pfn_t *page_array = NULL;
> -    unsigned long i, nr_pages = args->mem_size >> PAGE_SHIFT;
> +    unsigned long i, vmemid, nr_pages = args->mem_size >> PAGE_SHIFT;
>      unsigned long target_pages = args->mem_target >> PAGE_SHIFT;
>      uint64_t mmio_start = (1ull << 32) - args->mmio_size;
>      uint64_t mmio_size = args->mmio_size;
> @@ -258,13 +263,13 @@ static int setup_guest(xc_interface *xch,
>      xen_capabilities_info_t caps;
>      unsigned long stat_normal_pages = 0, stat_2mb_pages = 0, 
>          stat_1gb_pages = 0;
> -    int pod_mode = 0;
> +    unsigned int memflags = 0;
>      int claim_enabled = args->claim_enabled;
>      xen_pfn_t special_array[NR_SPECIAL_PAGES];
>      xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
> -
> -    if ( nr_pages > target_pages )
> -        pod_mode = XENMEMF_populate_on_demand;
> +    uint64_t total_pages;
> +    xen_vmemrange_t dummy_vmemrange;
> +    unsigned int dummy_vnode_to_pnode;
>  
>      memset(&elf, 0, sizeof(elf));
>      if ( elf_init(&elf, image, image_size) != 0 )
> @@ -276,6 +281,43 @@ static int setup_guest(xc_interface *xch,
>      v_start = 0;
>      v_end = args->mem_size;
>  
> +    if ( nr_pages > target_pages )
> +        memflags |= XENMEMF_populate_on_demand;
> +
> +    if ( args->nr_vmemranges == 0 )
> +    {
> +        /* Build dummy vnode information */
> +        dummy_vmemrange.start = 0;
> +        dummy_vmemrange.end   = args->mem_size;
> +        dummy_vmemrange.flags = 0;
> +        dummy_vmemrange.nid   = 0;
> +        args->nr_vmemranges = 1;
> +        args->vmemranges = &dummy_vmemrange;
> +
> +        dummy_vnode_to_pnode = XC_VNUMA_NO_NODE;
> +        args->nr_vnodes = 1;
> +        args->vnode_to_pnode = &dummy_vnode_to_pnode;
> +    }
> +    else
> +    {
> +        if ( nr_pages > target_pages )
> +        {
> +            PERROR("Cannot enable vNUMA and PoD at the same time");

We would solve a large number of interaction issues like this if someone
had the time to reimplement PoD using the paging system to page in a
page of zeroes.

It would be functionally identical from the guest's point of view,
wouldn't need any toolstack interaction, and would reduce the number of
moving parts involved in setting up memory for a domain.

(I don't suggest this being a prerequisite to this patch series.)

~Andrew

> +            goto error_out;
> +        }
> +    }
> +
> +    total_pages = 0;
> +    for ( i = 0; i < args->nr_vmemranges; i++ )
> +        total_pages += ((args->vmemranges[i].end - args->vmemranges[i].start)
> +                        >> PAGE_SHIFT);
> +    if ( total_pages != (args->mem_size >> PAGE_SHIFT) )
> +    {
> +        PERROR("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%"PRIx64")",
> +               total_pages, args->mem_size >> PAGE_SHIFT);
> +        goto error_out;
> +    }
> +
>      if ( xc_version(xch, XENVER_capabilities, &caps) != 0 )
>      {
>          PERROR("Could not get Xen capabilities");
> @@ -320,7 +362,7 @@ static int setup_guest(xc_interface *xch,
>          }
>      }
>  
> -    if ( pod_mode )
> +    if ( memflags & XENMEMF_populate_on_demand )
>      {
>          /*
>           * Subtract VGA_HOLE_SIZE from target_pages for the VGA
> @@ -349,15 +391,40 @@ static int setup_guest(xc_interface *xch,
>       * ensure that we can be preempted and hence dom0 remains responsive.
>       */
>      rc = xc_domain_populate_physmap_exact(
> -        xch, dom, 0xa0, 0, pod_mode, &page_array[0x00]);
> -    cur_pages = 0xc0;
> -    stat_normal_pages = 0xc0;
> +        xch, dom, 0xa0, 0, memflags, &page_array[0x00]);
>  
> +    stat_normal_pages = 0;
> +    for ( vmemid = 0; vmemid < args->nr_vmemranges; vmemid++ )
>      {
> -        while ( (rc == 0) && (nr_pages > cur_pages) )
> +        unsigned int new_memflags = memflags;
> +        uint64_t end_pages;
> +        unsigned int vnode = args->vmemranges[vmemid].nid;
> +        unsigned int pnode = args->vnode_to_pnode[vnode];
> +
> +        if ( pnode != XC_VNUMA_NO_NODE )
> +        {
> +            new_memflags |= XENMEMF_exact_node(pnode);
> +            new_memflags |= XENMEMF_exact_node_request;
> +        }
> +
> +        end_pages = args->vmemranges[vmemid].end >> PAGE_SHIFT;
> +        /*
> +         * Consider the VGA hole to belong to the vmemrange that
> +         * covers 0xA0000-0xC0000. Note that 0x00000-0xA0000 is
> +         * populated just before this loop.
> +         */
> +        if ( args->vmemranges[vmemid].start == 0 )
> +        {
> +            cur_pages = 0xc0;
> +            stat_normal_pages += 0xc0;
> +        }
> +        else
> +            cur_pages = args->vmemranges[vmemid].start >> PAGE_SHIFT;
> +
> +        while ( (rc == 0) && (end_pages > cur_pages) )
>          {
>              /* Clip count to maximum 1GB extent. */
> -            unsigned long count = nr_pages - cur_pages;
> +            unsigned long count = end_pages - cur_pages;
>              unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
>  
>              if ( count > max_pages )
> @@ -394,7 +461,7 @@ static int setup_guest(xc_interface *xch,
>  
>                  done = xc_domain_populate_physmap(xch, dom, nr_extents,
>                                                    SUPERPAGE_1GB_SHIFT,
> -                                                  pod_mode, sp_extents);
> +                                                  memflags, sp_extents);
>  
>                  if ( done > 0 )
>                  {
> @@ -434,7 +501,7 @@ static int setup_guest(xc_interface *xch,
>  
>                      done = xc_domain_populate_physmap(xch, dom, nr_extents,
>                                                        SUPERPAGE_2MB_SHIFT,
> -                                                      pod_mode, sp_extents);
> +                                                      memflags, sp_extents);
>  
>                      if ( done > 0 )
>                      {
> @@ -450,11 +517,14 @@ static int setup_guest(xc_interface *xch,
>              if ( count != 0 )
>              {
>                  rc = xc_domain_populate_physmap_exact(
> -                    xch, dom, count, 0, pod_mode, &page_array[cur_pages]);
> +                    xch, dom, count, 0, new_memflags, &page_array[cur_pages]);
>                  cur_pages += count;
>                  stat_normal_pages += count;
>              }
>          }
> +
> +        if ( rc != 0 )
> +            break;
>      }
>  
>      if ( rc != 0 )
> @@ -478,7 +548,7 @@ static int setup_guest(xc_interface *xch,
>                xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
>                HVM_INFO_PFN)) == NULL )
>          goto error_out;
> -    build_hvm_info(hvm_info_page, v_end, mmio_start, mmio_size);
> +    build_hvm_info(hvm_info_page, v_end, mmio_start, mmio_size, args);
>      munmap(hvm_info_page, PAGE_SIZE);
>  
>      /* Allocate and clear special pages. */
> @@ -617,6 +687,9 @@ int xc_hvm_build(xc_interface *xch, uint32_t domid,
>              args.acpi_module.guest_addr_out;
>          hvm_args->smbios_module.guest_addr_out = 
>              args.smbios_module.guest_addr_out;
> +        hvm_args->lowmem_end = args.lowmem_end;
> +        hvm_args->highmem_end = args.highmem_end;
> +        hvm_args->mmio_start = args.mmio_start;
>      }
>  
>      free(image);
