All of lore.kernel.org
 help / color / mirror / Atom feed
From: Elena Ufimtseva <ufimtseva@gmail.com>
To: xen-devel@lists.xen.org
Cc: keir@xen.org, Ian.Campbell@citrix.com,
	stefano.stabellini@eu.citrix.com, george.dunlap@eu.citrix.com,
	msw@linux.com, dario.faggioli@citrix.com, lccycc123@gmail.com,
	ian.jackson@eu.citrix.com, JBeulich@suse.com,
	Elena Ufimtseva <ufimtseva@gmail.com>
Subject: [PATCH v10 6/9] libxl: build numa nodes memory blocks
Date: Wed,  3 Sep 2014 00:24:15 -0400	[thread overview]
Message-ID: <1409718258-3276-4-git-send-email-ufimtseva@gmail.com> (raw)
In-Reply-To: <1409718258-3276-1-git-send-email-ufimtseva@gmail.com>

Create the vmemrange structure based on the
PV guest's E820 map. Values are in Megabytes.
Also export the E820 filter code e820_sanitize
out to be available internally.
As Xen can support multi-range vNUMA nodes, for a
PV guest it is one range per one node. The other
domain types should have their building routines
implemented.

Changes since v8:
    - Added setting of the vnuma memory ranges node ids;

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxl/libxl_internal.h |    9 ++
 tools/libxl/libxl_numa.c     |  201 ++++++++++++++++++++++++++++++++++++++++++
 tools/libxl/libxl_x86.c      |    3 +-
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index beb052e..63ccb5e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3088,6 +3088,15 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+bool libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info);
+
+int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], uint32_t *nr_entries,
+                  unsigned long map_limitkb, unsigned long balloon_kb);
+
+int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
+                           struct libxl_domain_build_info *b_info,
+                           vmemrange_t *memblks);
+
 _hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
                                    const libxl_ms_vm_genid *id);
 
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
index 94ca4fe..c416faf 100644
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -19,6 +19,10 @@
 
 #include "libxl_internal.h"
 
+#include "libxl_vnuma.h"
+
+#include "xc_private.h"
+
 /*
  * What follows are helpers for generating all the k-combinations
  * without repetitions of a set S with n elements in it. Formally
@@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
 }
 
 /*
+ * Check if we can fit vnuma nodes to numa pnodes
+ * from vnode_to_pnode array.
+ */
+bool libxl__vnodemap_is_usable(libxl__gc *gc,
+                            libxl_domain_build_info *info)
+{
+    unsigned int i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *sz_array;
+    int nr_nodes = 0;
+
+    /* Cannot use specified mapping if not NUMA machine. */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return false;
+
+    sz_array = info->vnuma_mem;
+    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
+    /* Get total memory required on each physical node. */
+    for (i = 0; i < info->vnodes; i++)
+    {
+        node = info->vnuma_vnodemap[i];
+
+        if (node < nr_nodes)
+            claim[node] += (sz_array[i] << 20);
+        else
+            goto vnodemapout;
+   }
+   for (i = 0; i < nr_nodes; i++) {
+       if (claim[i] > ninfo[i].free)
+          /* Cannot complete user request, falling to default. */
+          goto vnodemapout;
+   }
+
+ vnodemapout:
+   return true;
+}
+
+/*
+ * Returns the number of absent (non-RAM) bytes within the e820 map
+ * between the start and end addresses passed. Needed to correctly
+ * set NUMA memory ranges for the domain.
+ */
+static unsigned long e820_memory_hole_size(unsigned long start,
+                                            unsigned long end,
+                                            struct e820entry e820[],
+                                            unsigned int nr)
+{
+    unsigned int i;
+    unsigned long absent, start_blk, end_blk;
+
+    /* Start with the whole range absent; subtract every RAM overlap. */
+    absent = end - start;
+    for (i = 0; i < nr; i++) {
+        /* if not E820_RAM region, skip it. */
+        if (e820[i].type != E820_RAM)
+            continue;
+
+        /*
+         * Clamp the RAM region to [start, end) and subtract the
+         * intersection. This handles every overlap case uniformly,
+         * including a RAM region fully contained inside the range
+         * (which a start-inside/end-inside two-case check misses).
+         */
+        start_blk = e820[i].addr;
+        end_blk = e820[i].addr + e820[i].size;
+        if (start_blk < start)
+            start_blk = start;
+        if (end_blk > end)
+            end_blk = end;
+        if (start_blk < end_blk)
+            absent -= end_blk - start_blk;
+    }
+    return absent;
+}
+
+/*
+ * For each vnode, build the memory block start and end addresses,
+ * subtracting any memory hole from the range found in the e820 map.
+ * vnode memory sizes are passed here in megabytes, the result is
+ * in memory block addresses.
+ * The Linux kernel will adjust NUMA memory block sizes on its own,
+ * but we want to provide the kernel with NUMA block addresses that
+ * will be the same in the kernel and the hypervisor.
+ * Returns 0 on success, -EINVAL on failure.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            /* IN: mem sizes in megabytes */
+                            libxl_domain_build_info *b_info,
+                            /* OUT: linux NUMA blocks addresses */
+                            vmemrange_t *memblks)
+{
+    unsigned int i, j;
+    int rc;
+    uint64_t next_start_blk, end_max = 0, size;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+
+    /*
+     * Note: errno must not be clobbered with libxl/libxc error
+     * codes; just report the failure through the return value.
+     */
+    if (b_info->vnodes == 0 || !memblks || !b_info->vnuma_mem)
+        return -EINVAL;
+
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+
+    /* Retrieve e820 map for this host. */
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0)
+        return -EINVAL;
+
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+        return -EINVAL;
+
+    /* find max memory address for this host. */
+    for (j = 0; j < nr; j++) {
+        if (map[j].type == E820_RAM) {
+            end_max = max(end_max, map[j].addr + map[j].size);
+        }
+    }
+
+    memset(memblks, 0, sizeof(*memblks) * b_info->vnodes);
+    next_start_blk = 0;
+
+    /* NOTE(review): assumes map[0] is the lowest RAM region - confirm
+     * e820_sanitize guarantees a sorted map starting with RAM. */
+    memblks[0].start = map[0].addr;
+
+    for (i = 0; i < b_info->vnodes; i++) {
+
+        memblks[i].start += next_start_blk;
+        memblks[i].end = memblks[i].start + (b_info->vnuma_mem[i] << 20);
+
+        if (memblks[i].end > end_max) {
+            /* Log before shrinking so the old value is visible. */
+            LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                       "Shrunk vNUMA memory block %u address to max e820 "
+                       "address: %#010"PRIx64" -> %#010"PRIx64,
+                       i, memblks[i].end, end_max);
+            memblks[i].end = end_max;
+            break;
+        }
+
+        size = memblks[i].end - memblks[i].start;
+        /*
+         * For pv host with e820_host option turned on we need
+         * to take into account memory holes. For pv host with
+         * e820_host disabled or unset, the map is a contiguous
+         * RAM region.
+         */
+        if (libxl_defbool_val(b_info->u.pv.e820_host)) {
+            /* Grow the block until its usable (non-hole) size reaches
+             * the requested vnode size. */
+            while ((memblks[i].end - memblks[i].start -
+                    e820_memory_hole_size(memblks[i].start,
+                    memblks[i].end, map, nr)) < size) {
+
+                /* NOTE(review): MIN_VNODE_SIZE << 10 looks like KB, not
+                 * bytes (cf. the << 20 MB-to-bytes shifts above) -
+                 * confirm the intended units of MIN_VNODE_SIZE. */
+                memblks[i].end += MIN_VNODE_SIZE << 10;
+                if (memblks[i].end > end_max) {
+                    LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                               "Shrunk vNUMA memory block %u address to max "
+                               "e820 address: %#010"PRIx64" -> %#010"PRIx64,
+                               i, memblks[i].end, end_max);
+                    memblks[i].end = end_max;
+                    break;
+                }
+            }
+        }
+        next_start_blk = memblks[i].end;
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                   "i %u, start = %#010"PRIx64", end = %#010"PRIx64,
+                   i, memblks[i].start, memblks[i].end);
+    }
+
+    /* Did not form memory addresses for every node? */
+    if (i != b_info->vnodes) {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
+                   "Not all nodes were populated with block addresses, "
+                   "only %u out of %u", i, b_info->vnodes);
+        return -EINVAL;
+    }
+
+    for (i = 0; i < b_info->vnodes; i++) {
+        memblks[i].nid = i;
+        memblks[i].flags = 0;
+    }
+
+    return 0;
+}
+
+/*
  * Local variables:
  * mode: C
  * c-basic-offset: 4
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 7589060..46e84e4 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -1,5 +1,6 @@
 #include "libxl_internal.h"
 #include "libxl_arch.h"
+#include "libxl_vnuma.h"
 
 static const char *e820_names(int type)
 {
@@ -14,7 +15,7 @@ static const char *e820_names(int type)
     return "Unknown";
 }
 
-static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
+int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
                          uint32_t *nr_entries,
                          unsigned long map_limitkb,
                          unsigned long balloon_kb)
-- 
1.7.10.4

  parent reply	other threads:[~2014-09-03  4:24 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
2014-09-03 14:53   ` Ian Campbell
2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declararion Elena Ufimtseva
2014-09-03 15:03   ` Ian Campbell
2014-09-03 16:04   ` Ian Campbell
2014-09-04  3:48     ` Elena Ufimtseva
     [not found]       ` <1409826927.15057.22.camel@kazak.uk.xensource.com>
2014-09-08 16:13         ` Dario Faggioli
2014-09-08 19:47           ` Elena Ufimtseva
2014-09-16  6:17         ` Elena Ufimtseva
2014-09-16  7:16           ` Dario Faggioli
2014-09-16 12:57             ` Elena Ufimtseva
2014-09-03  4:24 ` Elena Ufimtseva [this message]
2014-09-03 15:21   ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Ian Campbell
2014-09-04  4:47     ` Elena Ufimtseva
2014-09-05  3:50     ` Elena Ufimtseva
2014-09-12 10:18   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
2014-09-03 15:26   ` Ian Campbell
2014-09-12 11:06   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
2014-09-03 15:50   ` Konrad Rzeszutek Wilk
2014-09-03 15:52   ` Ian Campbell
2014-09-12 16:51   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
2014-09-03 15:42   ` Konrad Rzeszutek Wilk
2014-09-11 17:13   ` Dario Faggioli
2014-09-12  2:04     ` Elena Ufimtseva
2014-09-12  9:02       ` Dario Faggioli
2014-09-12  9:31         ` Wei Liu
2014-09-12  9:59           ` Dario Faggioli
2014-09-03 15:37 ` [PATCH v10 3/9] vnuma hook to debug-keys u Konrad Rzeszutek Wilk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1409718258-3276-4-git-send-email-ufimtseva@gmail.com \
    --to=ufimtseva@gmail.com \
    --cc=Ian.Campbell@citrix.com \
    --cc=JBeulich@suse.com \
    --cc=dario.faggioli@citrix.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=ian.jackson@eu.citrix.com \
    --cc=keir@xen.org \
    --cc=lccycc123@gmail.com \
    --cc=msw@linux.com \
    --cc=stefano.stabellini@eu.citrix.com \
    --cc=xen-devel@lists.xen.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.