[PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.

* [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
@ 2013-09-13  8:50 Elena Ufimtseva
  2013-09-17 16:36 ` George Dunlap
  0 siblings, 1 reply; 6+ messages in thread
From: Elena Ufimtseva @ 2013-09-13  8:50 UTC (permalink / raw)
  To: xen-devel
  Cc: Ian.Campbell, stefano.stabellini, george.dunlap, dario.faggioli,
	lccycc123, ian.jackson, Elena Ufimtseva, sw

vNUMA libxl supporting functionality.
libxl supporting functionality for vNUMA includes:
* having vNUMA memory areas sizes, transforms it to
start and end pfns based on domain e820 map;
* contructs vnode_to_pnode map for vNUMA nodes memory
allocation and pass it to Xen; the mechanism considers
automatic NUMA placement in case of presence of hardware
NUMA; In best case scenario all vnodes will be allocated
within one pnode. If the domain spans different pnodes,
the vnodes will be one-by-one placed to pnodes. If such
allocation is impossible due to the memory constraints,
the allocation will use default mechanism; this is worst
case scenario.
* passes to Xen vNUMA topology of domain;

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>

---
Changes since v1:
* chnged libxl__init_vnodemap to take into account automatic
NUMA placement; vnode_to_pnode map is formed based on nodemap
mask instead of all set of NUMA nodes;
* defined LIBXL_BUILD_HAVE_VNUMA for external applications;
* fixed int to unsigned int for non-negative structure members;

TODO:
* implement proposed by Dario mechanism where vnode_to_pnode
mapping will be turned off in worst case when some of the vnodes
cannot be bound to pnode instead of falling back to default (sparse)
allocation for that node (if other ideas will not emerge);
* define xc_dominfo vnuma related fields and add vnuma initiliaztion to
xcinfo2xlinfo;
---
 tools/libxl/libxl.c          |   19 ++++++++
 tools/libxl/libxl.h          |   20 +++++++-
 tools/libxl/libxl_arch.h     |    5 ++
 tools/libxl/libxl_dom.c      |  105 +++++++++++++++++++++++++++++++++++++++++-
 tools/libxl/libxl_internal.h |    3 ++
 tools/libxl/libxl_types.idl  |    5 +-
 tools/libxl/libxl_x86.c      |   86 ++++++++++++++++++++++++++++++++++
 7 files changed, 240 insertions(+), 3 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 81785df..de809db 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4293,6 +4293,25 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
     }
     return 0;
 }
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                            uint32_t domid,
+                            uint16_t nr_vnodes,
+                            uint16_t nr_vcpus,
+                            vnuma_memblk_t *vnuma_memblks,
+                            unsigned int *vdistance,
+                            unsigned int *vcpu_to_vnode,
+                            unsigned int *vnode_to_pnode)
+{
+    GC_INIT(ctx);
+    int ret;
+    ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes,
+                                nr_vcpus, vnuma_memblks,
+                                vdistance, 
+                                vcpu_to_vnode,
+                                vnode_to_pnode);
+    GC_FREE;
+    return ret;
+}
 
 int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
 {
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index be19bf5..722b655 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -90,6 +90,14 @@
 #define LIBXL_HAVE_BUILDINFO_HVM_VENDOR_DEVICE 1
 
 /*
+ * LIBXL_HAVE_BUILDINFO_VNUMA indicates that vnuma topology will be
+ * build for the guest upon request and with VM configuration.
+ * It will try to define best allocation for vNUMA
+ * nodes on real NUMA nodes.
+ */
+#define LIBXL_HAVE_BUILDINFO_VNUMA 1
+
+/*
  * libxl ABI compatibility
  *
  * The only guarantee which libxl makes regarding ABI compatibility
@@ -275,7 +283,7 @@
 #include <errno.h>
 #include <netinet/in.h>
 #include <sys/wait.h> /* for pid_t */
-
+#include <xen/memory.h>
 #include <xentoollog.h>
 
 #include <libxl_uuid.h>
@@ -706,6 +714,16 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
 void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
 void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
 
+/* vNUMA topology */
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                            uint32_t domid,
+                            uint16_t nr_vnodes,
+                            uint16_t nr_vcpus,
+                            vnuma_memblk_t *vnuma_memblks,
+                            unsigned int *vdistance,
+                            unsigned int *vcpu_to_vnode,
+                            unsigned int *vnode_to_pnode);
+
 /*
  * Devices
  * =======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index abe6685..8710270 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -18,5 +18,10 @@
 /* arch specific internal domain creation function */
 int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                uint32_t domid);
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            struct libxl_domain_build_info *b_info,
+                            vnuma_memblk_t *memblks); 
 
+unsigned long e820_memory_hole_size(unsigned long start, unsigned long end, struct e820entry e820[], int nr);
 #endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 6e2252a..0a21c44 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -201,6 +201,74 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
     return rc;
 }
 
+#define set_all_vnodes(n)    for(i=0; i< info->nr_vnodes; i++) \
+                                info->vnode_to_pnode[i] = n
+
+/* prepares vnode to pnode map for domain vNUMA memory allocation */
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                        libxl_domain_build_info *info)
+{
+    int i, n, start, nr_nodes, rc;
+    uint64_t *mems;
+    unsigned long long *claim;
+    libxl_numainfo *ninfo = NULL;
+
+    rc = -EINVAL;
+    if (info->vnode_to_pnode == NULL)
+        info->vnode_to_pnode = calloc(info->nr_vnodes, sizeof(*info->vnode_to_pnode));
+        set_all_vnodes(NUMA_NO_NODE);
+    /* 
+     * check if we have any hardware NUMA nodes selected,
+     * otherwise NUMA_NO_NODE set and used default allocation
+     */ 
+    if (libxl_bitmap_is_empty(&info->nodemap))
+        return 0;
+    claim = calloc(info->nr_vnodes, sizeof(*claim));
+    if (claim == NULL)
+        return rc;
+    mems = info->vnuma_memszs;
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL) {
+        LOG(DEBUG, "No HW NUMA found\n");
+        goto vnmapout;
+    }
+    /* check if all vnodes will fit in one node */
+    libxl_for_each_set_bit(n, info->nodemap) {
+        if(ninfo[n].free/1024 >= info->max_memkb  && 
+           libxl_bitmap_test(&info->nodemap, n))
+           {
+               /* 
+                * all domain v-nodes will fit one p-node, 
+                * p-node is a best candidate selected by automatic 
+                * NUMA placement.
+                */
+               set_all_vnodes(n);
+               return 0;
+           }
+    }
+    /* TODO: change algorithm. The current just fits the nodes
+     * Will be nice to have them also sorted by size  */
+    /* If no p-node found, will be set to NUMA_NO_NODE and allocation will fail */
+    start =  0;
+    libxl_for_each_set_bit(n, info->nodemap)
+    {
+        for ( i = start; i < info->nr_vnodes; i++ )
+        {
+            if ( ((claim[n] + mems[i]) <= ninfo[n].free)    && 
+                 /*vnode was not set yet */
+                 (info->vnode_to_pnode[i] == NUMA_NO_NODE ) )
+            {
+                info->vnode_to_pnode[i] = n;
+                claim[n] += mems[i];
+            }
+        }
+    }
+    rc = 0;
+vnmapout:
+    if (claim) free(claim);
+    return rc;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
@@ -232,6 +300,29 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         if (rc)
             return rc;
     }
+
+    if (info->nr_vnodes != 0) {
+        vnuma_memblk_t *memblks = libxl__calloc(gc, info->nr_vnodes,
+                                                    sizeof(*memblks));
+        libxl__vnuma_align_mem(gc, domid, info, memblks);
+        if (libxl__init_vnodemap(gc, domid, info)) {
+            LOG(DEBUG, "Failed to call init_vnodemap\n");
+            rc = libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
+                                    info->max_vcpus, memblks,
+                                    info->vdistance, info->vcpu_to_vnode,
+                                    NULL);
+        }
+        else
+            rc = libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
+                                    info->max_vcpus, memblks,
+                                    info->vdistance, info->vcpu_to_vnode,
+                                    info->vnode_to_pnode);
+        if (rc < 0 ) LOG(DEBUG, "Failed to call xc_domain_setvnodes\n");
+    }
+    else {
+        LOG(DEBUG, "Will not construct vNUMA topology with nodes %d\n", info->nr_vnodes);
+    }
+    
     libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
 
@@ -368,7 +459,19 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
             }
         }
     }
-
+    if (info->nr_vnodes != 0) { 
+        dom->nr_vnodes = info->nr_vnodes;
+        dom->vnode_to_pnode = malloc(info->nr_vnodes * sizeof(*info->vnode_to_pnode));
+        dom->vmemsizes = malloc(info->nr_vnodes * sizeof(*info->vnuma_memszs));
+        if (dom->vmemsizes == NULL || dom->vnode_to_pnode == NULL) {
+            LOGE(ERROR, "Failed to allocate memory for vNUMA domain image.\n");
+            goto out;
+        }
+        memcpy(dom->vmemsizes, info->vnuma_memszs,
+               sizeof(*info->vnuma_memszs) * info->nr_vnodes);
+        memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+               sizeof(*info->vnode_to_pnode) * info->nr_vnodes);
+    }
     dom->flags = flags;
     dom->console_evtchn = state->console_port;
     dom->console_domid = state->console_domid;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index f051d91..7f94de7 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2709,6 +2709,7 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) {
 #define CTX_LOCK (libxl__ctx_lock(CTX))
 #define CTX_UNLOCK (libxl__ctx_unlock(CTX))
 
+#define NUMA_NO_NODE ~((uint32_t)0) 
 /*
  * Automatic NUMA placement
  *
@@ -2832,6 +2833,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                                libxl_domain_build_info *info);
 /*
  * Inserts "elm_new" into the sorted list "head".
  *
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 85341a0..5173c09 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -279,7 +279,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
     ("disable_migrate", libxl_defbool),
     ("cpuid",           libxl_cpuid_policy_list),
     ("blkdev_start",    string),
-    
+    ("vnuma_memszs",    Array(uint64, "nr_vnodes")),
+    ("vcpu_to_vnode",   Array(uint32, "nr_vnodemap")),
+    ("vdistance",       Array(uint32, "nr_vdist")),
+    ("vnode_to_pnode",  Array(uint32, "nr_vnode_to_pnode")),
     ("device_model_version", libxl_device_model_version),
     ("device_model_stubdomain", libxl_defbool),
     # if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index a78c91d..f7b1aeb 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,3 +308,89 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
 
     return ret;
 }
+
+unsigned long e820_memory_hole_size(unsigned long start, unsigned long end, struct e820entry e820[], int nr)
+{
+#define clamp(val, min, max) ({             \
+    typeof(val) __val = (val);              \
+    typeof(min) __min = (min);              \
+    typeof(max) __max = (max);              \
+    (void) (&__val == &__min);              \
+    (void) (&__val == &__max);              \
+    __val = __val < __min ? __min: __val;   \
+    __val > __max ? __max: __val; })
+    int i;
+    unsigned long absent, start_pfn, end_pfn;
+    absent = start - end;
+    for(i = 0; i < nr; i++) {
+        if(e820[i].type == E820_RAM) {
+            start_pfn = clamp(e820[i].addr, start, end);
+            end_pfn =   clamp(e820[i].addr + e820[i].size, start, end);
+            absent -= end_pfn - start_pfn;
+        }
+    }
+    return absent;
+}
+
+/* 
+ * Aligns memory blocks in domain info for linux NUMA build image. 
+ * Takes libxl_domain_build_info memory sizes and returns aligned to
+ * domain e820 map linux numa blocks in guest pfns.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            libxl_domain_build_info *b_info, /* IN: mem sizes */
+                            vnuma_memblk_t *memblks)        /* OUT: linux numa blocks in pfn */
+{
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif 
+    int i, rc;
+    unsigned long shift = 0, size, node_min_size = 1, limit;
+    unsigned long end_max;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+    
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+        return ERROR_FAIL;
+    end_max = map[nr-1].addr + map[nr-1].size;
+    shift = 0;
+    memset(memblks, 0, sizeof(*memblks)*b_info->nr_vnodes);
+    memblks[0].start = map[0].addr;
+    for(i = 0; i < b_info->nr_vnodes; i++) {
+        memblks[i].start += shift;
+        memblks[i].end += shift + b_info->vnuma_memszs[i];
+        limit = size = memblks[i].end - memblks[i].start;
+        while (memblks[i].end - memblks[i].start - 
+                e820_memory_hole_size(memblks[i].start, memblks[i].end, map, nr) 
+                < size) {
+            memblks[i].end += node_min_size;
+            shift += node_min_size;
+            if (memblks[i].end - memblks[i].start >= limit) {
+                memblks[i].end = memblks[i].start + limit;
+                break;
+            }
+            if (memblks[i].end == end_max) {
+                memblks[i].end = end_max;
+                break;
+            }
+        }
+        shift = memblks[i].end;
+        memblks[i].start = roundup(memblks[i].start, 4*1024);
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,"start = %#010lx, end = %#010lx\n",
+                                        memblks[i].start, memblks[i].end);
+    }
+    if(memblks[i-1].end > end_max)
+        memblks[i-1].end = end_max;
+    return 0;
+}
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread