* [PATCH v10 3/9] vnuma hook to debug-keys u
@ 2014-09-03  4:24 Elena Ufimtseva
  2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
                   ` (6 more replies)
  0 siblings, 7 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

Add a debug-keys hook to display the vNUMA topology of domains.
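
For illustration, the lines this hook adds to the 'u' debug-key output
would look roughly like the following (a hedged sketch based on the
printk format strings in the hunk below; the node, memory and vcpu
values are made up):

   2 vnodes, 4 vcpus
        vnode   0 - pnode 0, 1024 MB, vcpu nrs: 0 1
        vnode   1 - pnode 1, 1024 MB, vcpu nrs: 2 3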

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 xen/arch/x86/numa.c |   32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 555276e..346725f 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -347,10 +347,11 @@ EXPORT_SYMBOL(node_data);
 static void dump_numa(unsigned char key)
 {
     s_time_t now = NOW();
-    int i;
+    int i, j, err, n;
     struct domain *d;
     struct page_info *page;
     unsigned int page_num_node[MAX_NUMNODES];
+    uint64_t mem;
 
     printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
           (u32)(now>>32), (u32)now);
@@ -389,6 +390,35 @@ static void dump_numa(unsigned char key)
 
         for_each_online_node(i)
             printk("    Node %u: %u\n", i, page_num_node[i]);
+
+        if (!d->vnuma)
+            continue;
+
+        printk("   %u vnodes, %u vcpus\n", d->vnuma->nr_vnodes, d->max_vcpus);
+        for (i = 0; i < d->vnuma->nr_vnodes; i++) {
+            err = snprintf(keyhandler_scratch, 12, "%u",
+                    d->vnuma->vnode_to_pnode[i]);
+            if (err < 0 || d->vnuma->vnode_to_pnode[i] == NUMA_NO_NODE)
+                snprintf(keyhandler_scratch, 3, "???");
+
+            printk("        vnode %3u - pnode %s,", i, keyhandler_scratch);
+            mem = d->vnuma->vmemrange[i].end - d->vnuma->vmemrange[i].start;
+            printk(" %"PRIu64" MB, ", mem >> 20);
+
+            printk("vcpu nrs: ");
+            for (j = 0, n = 0; j < d->max_vcpus; j++) {
+                if (d->vnuma->vcpu_to_vnode[j] == i) {
+                    if ( ((n + 1) % 8) == 0 )
+                        printk("%d\n", j);
+                    else if ( !(n % 8) && n != 0 )
+                        printk("%s%d ", "             ", j);
+                    else
+                        printk("%d ", j);
+                    n++;
+                }
+            }
+            printk("\n");
+        }
     }
 
     rcu_read_unlock(&domlist_read_lock);
-- 
1.7.10.4


* [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 14:53   ` Ian Campbell
  2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declaration Elena Ufimtseva
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

With the introduction of XEN_DOMCTL_setvnumainfo in the patch titled
"xen: vnuma topology and subop hypercalls", add the plumbing here so
that the toolstack can use it. The caller is allowed to invoke it
multiple times if it wishes. It will error out if nr_vnodes or
nr_vcpus is zero.
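
For reference, a hedged sketch (not part of this patch) of how a
toolstack caller might use this wrapper for a two-vnode, four-vcpu
guest. The xch/domid variables and all values are illustrative; array
sizes follow the bounce-buffer declarations below: vdistance holds
nr_vnodes * nr_vnodes entries, vcpu_to_vnode holds nr_vcpus entries,
vnode_to_pnode holds nr_vnodes entries, and vmemrange holds
nr_vmemranges ranges.

    vmemrange_t ranges[2] = {
        { .start = 0,          .end = 1ULL << 30, .flags = 0, .nid = 0 },
        { .start = 1ULL << 30, .end = 2ULL << 30, .flags = 0, .nid = 1 },
    };
    unsigned int vdistance[2 * 2]  = { 10, 20, 20, 10 }; /* row-major matrix */
    unsigned int vcpu_to_vnode[4]  = { 0, 0, 1, 1 };     /* vcpu -> vnode    */
    unsigned int vnode_to_pnode[2] = { 0, 1 };           /* vnode -> pnode   */

    if ( xc_domain_setvnuma(xch, domid, 2 /* vnodes */, 2 /* vmemranges */,
                            4 /* vcpus */, ranges, vdistance,
                            vcpu_to_vnode, vnode_to_pnode) < 0 )
        return -1;  /* errno holds the reason, e.g. EINVAL for bad arguments */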

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>

Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 tools/libxc/xc_domain.c |   68 +++++++++++++++++++++++++++++++++++++++++++++++
 tools/libxc/xenctrl.h   |   10 +++++++
 2 files changed, 78 insertions(+)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index c67ac9a..624a207 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -2124,6 +2124,74 @@ int xc_domain_set_max_evtchn(xc_interface *xch, uint32_t domid,
     return do_domctl(xch, &domctl);
 }
 
+/* Plumbing Xen with vNUMA topology */
+int xc_domain_setvnuma(xc_interface *xch,
+                       uint32_t domid,
+                       uint32_t nr_vnodes,
+                       uint32_t nr_vmemranges,
+                       uint32_t nr_vcpus,
+                       vmemrange_t *vmemrange,
+                       unsigned int *vdistance,
+                       unsigned int *vcpu_to_vnode,
+                       unsigned int *vnode_to_pnode)
+{
+    int rc;
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BOUNCE(vmemrange, sizeof(*vmemrange) * nr_vmemranges,
+                             XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    DECLARE_HYPERCALL_BOUNCE(vdistance, sizeof(*vdistance) *
+                             nr_vnodes * nr_vnodes,
+                             XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    DECLARE_HYPERCALL_BOUNCE(vcpu_to_vnode, sizeof(*vcpu_to_vnode) * nr_vcpus,
+                             XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    DECLARE_HYPERCALL_BOUNCE(vnode_to_pnode, sizeof(*vnode_to_pnode) *
+                             nr_vnodes,
+                             XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
+    errno = EINVAL;
+
+    if ( nr_vnodes == 0 || nr_vmemranges == 0 ||
+         nr_vmemranges < nr_vnodes || nr_vcpus == 0 )
+        return -1;
+
+    if ( !vdistance || !vcpu_to_vnode || !vmemrange || !vnode_to_pnode )
+    {
+        PERROR("%s: Can't set vnuma without initializing topology", __func__);
+        return -1;
+    }
+
+    if ( xc_hypercall_bounce_pre(xch, vmemrange)      ||
+         xc_hypercall_bounce_pre(xch, vdistance)      ||
+         xc_hypercall_bounce_pre(xch, vcpu_to_vnode)  ||
+         xc_hypercall_bounce_pre(xch, vnode_to_pnode) )
+    {
+        rc = -1;
+        goto vnumaset_fail;
+
+    }
+
+    set_xen_guest_handle(domctl.u.vnuma.vmemrange, vmemrange);
+    set_xen_guest_handle(domctl.u.vnuma.vdistance, vdistance);
+    set_xen_guest_handle(domctl.u.vnuma.vcpu_to_vnode, vcpu_to_vnode);
+    set_xen_guest_handle(domctl.u.vnuma.vnode_to_pnode, vnode_to_pnode);
+
+    domctl.cmd = XEN_DOMCTL_setvnumainfo;
+    domctl.domain = (domid_t)domid;
+    domctl.u.vnuma.nr_vnodes = nr_vnodes;
+    domctl.u.vnuma.nr_vmemranges = nr_vmemranges;
+    domctl.u.vnuma.nr_vcpus = nr_vcpus;
+    domctl.u.vnuma.pad = 0;
+
+    rc = do_domctl(xch, &domctl);
+
+ vnumaset_fail:
+    xc_hypercall_bounce_post(xch, vmemrange);
+    xc_hypercall_bounce_post(xch, vdistance);
+    xc_hypercall_bounce_post(xch, vcpu_to_vnode);
+    xc_hypercall_bounce_post(xch, vnode_to_pnode);
+
+    return rc;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 1c5d0db..1c8aa42 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1245,6 +1245,16 @@ int xc_domain_set_memmap_limit(xc_interface *xch,
                                uint32_t domid,
                                unsigned long map_limitkb);
 
+int xc_domain_setvnuma(xc_interface *xch,
+                        uint32_t domid,
+                        uint32_t nr_vnodes,
+                        uint32_t nr_regions,
+                        uint32_t nr_vcpus,
+                        vmemrange_t *vmemrange,
+                        unsigned int *vdistance,
+                        unsigned int *vcpu_to_vnode,
+                        unsigned int *vnode_to_pnode);
+
 #if defined(__i386__) || defined(__x86_64__)
 /*
  * PC BIOS standard E820 types and structure.
-- 
1.7.10.4


* [PATCH v10 5/9] libxl: vnuma types declaration
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
  2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 15:03   ` Ian Campbell
  2014-09-03 16:04   ` Ian Campbell
  2014-09-03  4:24 ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Elena Ufimtseva
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

Add vNUMA topology type declarations to the libxl_domain_build_info
structure.
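
Roughly speaking (a hedged sketch, not generated output), each
Array(...) entry in the IDL below produces a pointer member plus a
num_* count in the generated libxl_domain_build_info, along these
lines:

    uint32_t vnodes;
    uint32_t vmemranges;
    uint64_t *vnuma_mem;       int num_vnuma_mem;        /* MB per vnode   */
    uint32_t *vnuma_vcpumap;   int num_vnuma_vcpumap;    /* vcpu -> vnode  */
    uint32_t *vdistance;       int num_vdistance;        /* vnodes^2       */
    uint32_t *vnuma_vnodemap;  int num_vnuma_vnodemap;   /* vnode -> pnode */
    libxl_defbool vnuma_autoplacement;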

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxl/libxl_types.idl |    8 +++++++-
 tools/libxl/libxl_vnuma.h   |   16 ++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 tools/libxl/libxl_vnuma.h

diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 08a7927..ea8bac0 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -333,7 +333,13 @@ libxl_domain_build_info = Struct("domain_build_info",[
     ("disable_migrate", libxl_defbool),
     ("cpuid",           libxl_cpuid_policy_list),
     ("blkdev_start",    string),
-    
+    ("vnodes",          uint32),
+    ("vmemranges",      uint32),
+    ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
+    ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
+    ("vdistance",       Array(uint32, "num_vdistance")),
+    ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
+    ("vnuma_autoplacement",  libxl_defbool),
     ("device_model_version", libxl_device_model_version),
     ("device_model_stubdomain", libxl_defbool),
     # if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
new file mode 100644
index 0000000..4312070
--- /dev/null
+++ b/tools/libxl/libxl_vnuma.h
@@ -0,0 +1,16 @@
+#ifndef LIBXL_VNUMA_H
+#define LIBXL_VNUMA_H
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#define VNUMA_NO_NODE ~((unsigned int)0)
+
+/*
+ * Min vNUMA node size in MBytes from Linux for x86 architecture.
+ * See linux source code arch/x86/include/asm/numa.h
+ */
+#define MIN_VNODE_SIZE  (4)
+
+#define MAX_VNUMA_NODES ((unsigned int)1 << 10)
+
+#endif
-- 
1.7.10.4


* [PATCH v10 6/9] libxl: build numa nodes memory blocks
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
  2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
  2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declaration Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 15:21   ` Ian Campbell
  2014-09-12 10:18   ` Dario Faggioli
  2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

Create the vmemrange structure based on the PV guest's E820 map.
Values are in megabytes.
Also export the E820 filter code e820_sanitize so that it is
available internally.
While Xen can support multi-range vNUMA nodes, for PV guests it is
one range per node. The other domain types should have their own
building routines implemented.
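
As a worked example of the intent (illustrative numbers, not taken
from the patch): suppose the sanitized e820 has RAM at [0, 2048M) and
[3072M, 4096M), i.e. a 1024M hole, and the guest asks for two 1536M
vnodes. vnode 0 becomes [0, 1536M). vnode 1 starts at 1536M, but only
512M of RAM lies below the hole, so with e820_host enabled its end is
pushed out past the hole until 1536M of actual RAM falls inside the
range, giving roughly [1536M, 4096M). Without e820_host the map is a
contiguous RAM region and vnode 1 is simply [1536M, 3072M).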

Changes since v8:
    - Added setting of the vnuma memory ranges node ids;

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxl/libxl_internal.h |    9 ++
 tools/libxl/libxl_numa.c     |  201 ++++++++++++++++++++++++++++++++++++++++++
 tools/libxl/libxl_x86.c      |    3 +-
 3 files changed, 212 insertions(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index beb052e..63ccb5e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3088,6 +3088,15 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+bool libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info);
+
+int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], uint32_t *nr_entries,
+                  unsigned long map_limitkb, unsigned long balloon_kb);
+
+int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
+                           struct libxl_domain_build_info *b_info,
+                           vmemrange_t *memblks);
+
 _hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
                                    const libxl_ms_vm_genid *id);
 
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
index 94ca4fe..c416faf 100644
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -19,6 +19,10 @@
 
 #include "libxl_internal.h"
 
+#include "libxl_vnuma.h"
+
+#include "xc_private.h"
+
 /*
  * What follows are helpers for generating all the k-combinations
  * without repetitions of a set S with n elements in it. Formally
@@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
 }
 
 /*
+ * Check if we can fit vnuma nodes to numa pnodes
+ * from vnode_to_pnode array.
+ */
+bool libxl__vnodemap_is_usable(libxl__gc *gc,
+                            libxl_domain_build_info *info)
+{
+    unsigned int i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *sz_array;
+    int nr_nodes = 0;
+
+    /* Cannot use specified mapping if not NUMA machine. */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return false;
+
+    sz_array = info->vnuma_mem;
+    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
+    /* Get total memory required on each physical node. */
+    for (i = 0; i < info->vnodes; i++)
+    {
+        node = info->vnuma_vnodemap[i];
+
+        if (node < nr_nodes)
+            claim[node] += (sz_array[i] << 20);
+        else
+            goto vnodemapout;
+   }
+   for (i = 0; i < nr_nodes; i++) {
+       if (claim[i] > ninfo[i].free)
+          /* Cannot complete user request, falling to default. */
+          goto vnodemapout;
+   }
+
+ vnodemapout:
+   return true;
+}
+
+/*
+ * Returns number of absent pages within e820 map
+ * between start and end addresses passed. Needed
+ * to correctly set numa memory ranges for domain.
+ */
+static unsigned long e820_memory_hole_size(unsigned long start,
+                                            unsigned long end,
+                                            struct e820entry e820[],
+                                            unsigned int nr)
+{
+    unsigned int i;
+    unsigned long absent, start_blk, end_blk;
+
+    /* init absent number of pages with all memmap size. */
+    absent = end - start;
+    for (i = 0; i < nr; i++) {
+        /* if not E820_RAM region, skip it. */
+        if (e820[i].type != E820_RAM)
+            continue;
+
+        start_blk = e820[i].addr;
+        end_blk = e820[i].addr + e820[i].size;
+        /* beginning address is within this region? */
+        if (start >= start_blk && start <= end_blk) {
+            if (end > end_blk)
+                absent -= end_blk - start;
+            else
+                /* fit the region? then no absent pages. */
+                absent -= end - start;
+            continue;
+        }
+        /* found the end of range in this region? */
+        if (end <= end_blk && end >= start_blk) {
+            absent -= end - start_blk;
+            /* no need to look for more ranges. */
+            break;
+        }
+    }
+    return absent;
+}
+
+/*
+ * For each node, build memory block start and end addresses.
+ * Subtract any memory hole from the range found in the e820 map.
+ * vnode memory sizes are passed here in megabytes; the result is
+ * in memory block addresses.
+ * Linux kernel will adjust numa memory block sizes on its own.
+ * But we want to provide to the kernel numa block addresses that
+ * will be the same in kernel and hypervisor.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                            uint32_t domid,
+                            /* IN: mem sizes in megabytes */
+                            libxl_domain_build_info *b_info,
+                            /* OUT: linux NUMA blocks addresses */
+                            vmemrange_t *memblks)
+{
+    unsigned int i;
+    int j, rc;
+    uint64_t next_start_blk, end_max = 0, size;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+
+    errno = ERROR_INVAL;
+    if (b_info->vnodes == 0)
+        return -EINVAL;
+
+    if (!memblks || !b_info->vnuma_mem)
+        return -EINVAL;
+
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+
+    /* Retrieve e820 map for this host. */
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc) {
+        errno = rc;
+        return -EINVAL;
+    }
+
+    /* find max memory address for this host. */
+    for (j = 0; j < nr; j++) {
+        if (map[j].type == E820_RAM) {
+            end_max = max(end_max, map[j].addr + map[j].size);
+        }
+    }
+
+    memset(memblks, 0, sizeof(*memblks) * b_info->vnodes);
+    next_start_blk = 0;
+
+    memblks[0].start = map[0].addr;
+
+    for (i = 0; i < b_info->vnodes; i++) {
+
+        memblks[i].start += next_start_blk;
+        memblks[i].end = memblks[i].start + (b_info->vnuma_mem[i] << 20);
+
+        if (memblks[i].end > end_max) {
+            LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                    "Shrunk vNUMA memory block %d address to max e820 address: \
+                    %#010lx -> %#010lx\n", i, memblks[i].end, end_max);
+            memblks[i].end = end_max;
+            break;
+        }
+
+        size = memblks[i].end - memblks[i].start;
+        /*
+         * For pv host with e820_host option turned on we need
+         * to take into account memory holes. For pv host with
+         * e820_host disabled or unset, the map is a contiguous
+         * RAM region.
+         */
+        if (libxl_defbool_val(b_info->u.pv.e820_host)) {
+            while((memblks[i].end - memblks[i].start -
+                   e820_memory_hole_size(memblks[i].start,
+                   memblks[i].end, map, nr)) < size ) {
+
+                memblks[i].end += MIN_VNODE_SIZE << 10;
+                if (memblks[i].end > end_max) {
+                    memblks[i].end = end_max;
+                    LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                            "Shrunk vNUMA memory block %d address to max e820 \
+                            address: %#010lx -> %#010lx\n", i, memblks[i].end,
+                            end_max);
+                    break;
+                }
+            }
+        }
+        next_start_blk = memblks[i].end;
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,"i %d, start  = %#010lx, \
+                    end = %#010lx\n", i, memblks[i].start, memblks[i].end);
+    }
+
+    /* Did not form memory addresses for every node? */
+    if (i != b_info->vnodes)  {
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR, "Not all nodes were populated with \
+                block addresses, only %d out of %d", i, b_info->vnodes);
+        return -EINVAL;
+    }
+
+    for (i = 0; i < b_info->vnodes; i++) {
+        memblks[i].nid = i;
+        memblks[i].flags = 0;
+    }
+
+    return 0;
+}
+
+/*
  * Local variables:
  * mode: C
  * c-basic-offset: 4
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 7589060..46e84e4 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -1,5 +1,6 @@
 #include "libxl_internal.h"
 #include "libxl_arch.h"
+#include "libxl_vnuma.h"
 
 static const char *e820_names(int type)
 {
@@ -14,7 +15,7 @@ static const char *e820_names(int type)
     return "Unknown";
 }
 
-static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
+int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
                          uint32_t *nr_entries,
                          unsigned long map_limitkb,
                          unsigned long balloon_kb)
-- 
1.7.10.4


* [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
                   ` (2 preceding siblings ...)
  2014-09-03  4:24 ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 15:26   ` Ian Campbell
  2014-09-12 11:06   ` Dario Faggioli
  2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

vNUMA-aware domain memory allocation based on the provided
vnode-to-pnode map. If this map is not defined, use the default
allocation; the default allocation does not specify any physical
node when allocating memory.
Domain creation will fail if at least one node is left undefined.
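
For illustration, a hedged sketch (not in this patch) of the minimal
single-vnode setup described above, as a caller of xc_dom might set it
up before arch_setup_meminit() runs. Field names come from the
xc_dom.h hunk below; memsize_mb is an assumed variable holding the
guest memory size in MB, and error handling is omitted.

    struct xc_dom_image *dom = xc_dom_allocate(xch, cmdline, features);

    dom->vnodes = 1;
    dom->numa_memszs = calloc(1, sizeof(*dom->numa_memszs));
    dom->numa_memszs[0] = memsize_mb;              /* all guest memory, in MB */
    dom->vnode_to_pnode = calloc(1, sizeof(*dom->vnode_to_pnode));
    dom->vnode_to_pnode[0] = LIBXC_VNUMA_NO_NODE;  /* no exact pnode request  */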

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxc/xc_dom.h     |   13 ++++++++
 tools/libxc/xc_dom_x86.c |   76 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/tools/libxc/xc_dom.h b/tools/libxc/xc_dom.h
index 6ae6a9f..61c2a06 100644
--- a/tools/libxc/xc_dom.h
+++ b/tools/libxc/xc_dom.h
@@ -164,6 +164,16 @@ struct xc_dom_image {
 
     /* kernel loader */
     struct xc_dom_arch *arch_hooks;
+
+   /*
+    * vNUMA topology and memory allocation structure.
+    * Defines the way to allocate memory on per NUMA
+    * physical defined by vnode_to_pnode.
+    */
+    uint32_t vnodes;
+    uint64_t *numa_memszs;
+    unsigned int *vnode_to_pnode;
+
     /* allocate up to virt_alloc_end */
     int (*allocate) (struct xc_dom_image * dom, xen_vaddr_t up_to);
 };
@@ -385,6 +395,9 @@ static inline xen_pfn_t xc_dom_p2m_guest(struct xc_dom_image *dom,
 int arch_setup_meminit(struct xc_dom_image *dom);
 int arch_setup_bootearly(struct xc_dom_image *dom);
 int arch_setup_bootlate(struct xc_dom_image *dom);
+int arch_boot_alloc(struct xc_dom_image *dom);
+
+#define LIBXC_VNUMA_NO_NODE ~((unsigned int)0)
 
 /*
  * Local variables:
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index bf06fe4..f2b4c98 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -759,7 +759,7 @@ static int x86_shadow(xc_interface *xch, domid_t domid)
 int arch_setup_meminit(struct xc_dom_image *dom)
 {
     int rc;
-    xen_pfn_t pfn, allocsz, i, j, mfn;
+    xen_pfn_t pfn, i, j, mfn;
 
     rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
     if ( rc )
@@ -811,25 +811,77 @@ int arch_setup_meminit(struct xc_dom_image *dom)
         /* setup initial p2m */
         for ( pfn = 0; pfn < dom->total_pages; pfn++ )
             dom->p2m_host[pfn] = pfn;
+
+        /*
+         * Any PV domain should have at least one vNUMA node.
+         * If no config was defined, one default vNUMA node
+         * will be set.
+         */
+        if ( dom->vnodes == 0 ) {
+            xc_dom_printf(dom->xch,
+                         "%s: Cannot construct vNUMA topology with 0 vnodes\n",
+                         __FUNCTION__);
+            return -EINVAL;
+        }
         
         /* allocate guest memory */
-        for ( i = rc = allocsz = 0;
-              (i < dom->total_pages) && !rc;
-              i += allocsz )
-        {
-            allocsz = dom->total_pages - i;
-            if ( allocsz > 1024*1024 )
-                allocsz = 1024*1024;
-            rc = xc_domain_populate_physmap_exact(
-                dom->xch, dom->guest_domid, allocsz,
-                0, 0, &dom->p2m_host[i]);
-        }
+        rc = arch_boot_alloc(dom);
+        if ( rc )
+            return rc;
 
         /* Ensure no unclaimed pages are left unused.
          * OK to call if hadn't done the earlier claim call. */
         (void)xc_domain_claim_pages(dom->xch, dom->guest_domid,
                                     0 /* cancels the claim */);
     }
+    return rc;
+}
+
+/*
+ * Allocates domain memory taking into account
+ * defined vnuma topology and vnode_to_pnode map.
+ * Any pv guest will have at least one vnuma node
+ * with vnuma_memszs[0] = domain memory and the rest
+ * topology initialized with default values.
+ */
+int arch_boot_alloc(struct xc_dom_image *dom)
+{
+    int rc;
+    unsigned int n, memflags;
+    unsigned long long vnode_pages;
+    unsigned long long allocsz = 0, node_pfn_base, i;
+
+    rc = allocsz = node_pfn_base = n = 0;
+
+    for ( n = 0; n < dom->vnodes; n++ )
+    {
+        memflags = 0;
+        if ( dom->vnode_to_pnode[n] != LIBXC_VNUMA_NO_NODE )
+        {
+            memflags |= XENMEMF_exact_node(dom->vnode_to_pnode[n]);
+            memflags |= XENMEMF_exact_node_request;
+        }
+        /* numa_memszs are in megabytes; calculate pages for this node. */
+        vnode_pages = (dom->numa_memszs[n] << 20) >> PAGE_SHIFT_X86;
+        for ( i = 0; i < vnode_pages; i += allocsz )
+        {
+            allocsz = vnode_pages - i;
+            if ( allocsz > 1024*1024 )
+                allocsz = 1024*1024;
+
+            rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
+                                            allocsz, 0, memflags,
+                                            &dom->p2m_host[node_pfn_base + i]);
+            if ( rc )
+            {
+                xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
+                        "%s: Failed allocation of %Lu pages for vnode %d on pnode %d out of %lu\n",
+                        __FUNCTION__, vnode_pages, n, dom->vnode_to_pnode[n], dom->total_pages);
+                return rc;
+            }
+        }
+        node_pfn_base += i;
+    }
 
     return rc;
 }
-- 
1.7.10.4


* [PATCH v10 8/9] libxl: vnuma nodes placement bits
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
                   ` (3 preceding siblings ...)
  2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 15:50   ` Konrad Rzeszutek Wilk
                     ` (2 more replies)
  2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
  2014-09-03 15:37 ` [PATCH v10 3/9] vnuma hook to debug-keys u Konrad Rzeszutek Wilk
  6 siblings, 3 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

Automatic NUMA placement cancels the manual vnode placement
mechanism. If NUMA placement is explicitly specified, try
to fit the vnodes to the physical nodes.
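
For illustration, the two modes in xl config terms (a hedged sketch;
the option names come from the parser and documentation patch later in
this series, and the values are made up):

    # Manual vnode placement: pinning vcpus disables automatic NUMA
    # placement, and the explicit vnode -> pnode map is honoured if the
    # target pnodes have enough free memory.
    cpus = "0-7"
    vnuma_autoplacement = 0
    vnuma_vnodemap = [2, 3]

    # Automatic placement: leave vcpus unpinned (or set
    # vnuma_autoplacement = 1); any vnuma_vnodemap given is ignored and
    # the vnodes are fitted to physical nodes by free memory.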

Changes since v8:
    - Added the number of ranges (vmemranges) for future support of
      multi-range nodes; for PV guests it is set to the same value
      as the number of vNUMA nodes.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxl/libxl_create.c |    1 +
 tools/libxl/libxl_dom.c    |  204 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index fc332ef..a4977d2 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -203,6 +203,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
     }
 
     libxl_defbool_setdefault(&b_info->numa_placement, true);
+    libxl_defbool_setdefault(&b_info->vnuma_autoplacement, true);
 
     if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT)
         b_info->max_memkb = 32 * 1024;
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index c944804..b1376c4 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -23,6 +23,7 @@
 #include <xc_dom.h>
 #include <xen/hvm/hvm_info_table.h>
 #include <xen/hvm/hvm_xs_strings.h>
+#include <libxl_vnuma.h>
 
 libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
 {
@@ -227,12 +228,114 @@ static void hvm_set_conf_params(xc_interface *handle, uint32_t domid,
                     libxl_defbool_val(info->u.hvm.nested_hvm));
 }
 
+/* sets vnode_to_pnode map. */
+static int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
+                        libxl_domain_build_info *info)
+{
+    unsigned int i, n;
+    int nr_nodes = 0;
+    uint64_t *vnodes_mem;
+    unsigned long long *nodes_claim = NULL;
+    libxl_numainfo *ninfo = NULL;
+
+    if (info->vnuma_vnodemap == NULL) {
+        info->vnuma_vnodemap = libxl__calloc(gc, info->vnodes,
+                                      sizeof(*info->vnuma_vnodemap));
+    }
+
+    /* default setting. */
+    for (i = 0; i < info->vnodes; i++)
+        info->vnuma_vnodemap[i] = LIBXC_VNUMA_NO_NODE;
+
+    /* Get NUMA info. */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return ERROR_FAIL;
+    /* Nothing to see if only one NUMA node. */
+    if (nr_nodes <= 1)
+        return 0;
+
+    vnodes_mem = info->vnuma_mem;
+    /*
+     * TODO: change algorithm. The current just fits the nodes
+     * by its memory sizes. If no p-node found, will be used default
+     * value of LIBXC_VNUMA_NO_NODE.
+     */
+    nodes_claim = libxl__calloc(gc, info->vnodes, sizeof(*nodes_claim));
+    if ( !nodes_claim )
+        return ERROR_FAIL;
+
+    libxl_for_each_set_bit(n, info->nodemap)
+    {
+        for (i = 0; i < info->vnodes; i++)
+        {
+            unsigned long mem_sz = vnodes_mem[i] << 20;
+            if ((nodes_claim[n] + mem_sz <= ninfo[n].free) &&
+                 /* vnode was not set yet. */
+                 (info->vnuma_vnodemap[i] == LIBXC_VNUMA_NO_NODE ) )
+            {
+                info->vnuma_vnodemap[i] = n;
+                nodes_claim[n] += mem_sz;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Builds vnode memory regions from configuration info
+ * for vnuma nodes with multiple regions.
+ */
+static int libxl__build_vnuma_ranges(libxl__gc *gc,
+                              uint32_t domid,
+                              /* IN: mem sizes in megabytes */
+                              libxl_domain_build_info *b_info,
+                              /* OUT: linux NUMA blocks addresses */
+                              vmemrange_t **memrange)
+{
+    /*
+     * For non-PV domains, construction of the regions will
+     * need to have its own implementation.
+     */
+    if (b_info->type != LIBXL_DOMAIN_TYPE_PV) {
+        LOG(DETAIL, "vNUMA is only supported for PV guests now.\n");
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (b_info->vnodes == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    b_info->vmemranges = b_info->vnodes;
+
+    *memrange = libxl__calloc(gc, b_info->vnodes,
+                              sizeof(vmemrange_t));
+
+    /*
+     * For PV domain along with alignment, regions nid will
+     * be set to corresponding vnuma node number and ignored
+     * later during allocation.
+     */
+
+    if (libxl__vnuma_align_mem(gc, domid, b_info, *memrange) < 0) {
+        LOG(DETAIL, "Failed to align memory map.\n");
+        errno = ERROR_INVAL;
+        return ERROR_FAIL;
+    }
+
+    return 0;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
     libxl_domain_build_info *const info = &d_config->b_info;
     libxl_ctx *ctx = libxl__gc_owner(gc);
     char *xs_domid, *con_domid;
+    struct vmemrange *memrange;
     int rc;
 
     if (xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus) != 0) {
@@ -240,6 +343,20 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         return ERROR_FAIL;
     }
 
+    if (libxl__build_vnuma_ranges(gc, domid, info, &memrange) != 0) {
+        LOG(DETAIL, "Failed to build vnuma nodes memory ranges.\n");
+        return ERROR_FAIL;
+
+    }
+
+    /*
+     * NUMA placement and vNUMA autoplacement handling:
+     * If numa_placement is set to default, do not use vnode to pnode
+     * mapping as automatic placement algorithm will find best numa nodes.
+     * If numa_placement is not used, we can try and use domain vnode
+     * to pnode mask.
+     */
+
     /*
      * Check if the domain has any CPU or node affinity already. If not, try
      * to build up the latter via automatic NUMA placement. In fact, in case
@@ -298,7 +415,33 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
                                    NULL, &cpumap_soft);
 
         libxl_bitmap_dispose(&cpumap_soft);
+
+        /*
+         * If a vnode_to_pnode mask was defined, don't use it if we automatically
+         * place the domain on NUMA nodes; just give a warning.
+         */
+        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
+            LOG(INFO, "Automatic NUMA placement for domain is turned on. \
+                vnode to physical nodes mapping will not be used.");
+        }
+        if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
+            LOG(ERROR, "Failed to build vnode to pnode map\n");
+            return ERROR_FAIL;
+        }
+    } else {
+        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
+                if (!libxl__vnodemap_is_usable(gc, info)) {
+                    LOG(ERROR, "Defined vnode to pnode domain map cannot be used.\n");
+                    return ERROR_FAIL;
+                }
+        } else {
+            if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
+                LOG(ERROR, "Failed to build vnode to pnode map.\n");
+                return ERROR_FAIL;
+            }
+        }
     }
+
     if (info->nodemap.size)
         libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     /* As mentioned in libxl.h, vcpu_hard_array takes precedence */
@@ -339,6 +482,22 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         return ERROR_FAIL;
     }
 
+    /*
+     * XEN_DOMCTL_setvnuma subop hypercall needs to know max mem
+     * for domain set by xc_domain_setmaxmem. So set vNUMA after
+     * maxmem is being set.
+     * memrange should contain regions if multi-region nodes are
+     * supported. For PV domains the regions are ignored.
+     */
+    if (xc_domain_setvnuma(ctx->xch, domid, info->vnodes,
+        info->vmemranges,
+        info->max_vcpus, memrange,
+        info->vdistance, info->vnuma_vcpumap,
+        info->vnuma_vnodemap) < 0) {
+        LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Couldn't set vnuma topology");
+        return ERROR_FAIL;
+    }
+
     xs_domid = xs_read(ctx->xsh, XBT_NULL, "/tool/xenstored/domid", NULL);
     state->store_domid = xs_domid ? atoi(xs_domid) : 0;
     free(xs_domid);
@@ -434,6 +593,46 @@ retry_transaction:
     return 0;
 }
 
+/*
+ * Function fills the xc_dom_image with memory sizes for later
+ * use in domain memory allocator. No regions here are being
+ * filled and used in allocator as the regions do belong to one node.
+ */
+static int libxl__dom_vnuma_init(struct libxl_domain_build_info *info,
+                                 struct xc_dom_image *dom)
+{
+    errno = ERROR_INVAL;
+
+    if (info->vnodes == 0)
+        return -1;
+
+    info->vmemranges = info->vnodes;
+
+    dom->vnode_to_pnode = (unsigned int *)malloc(
+                            info->vnodes * sizeof(*info->vnuma_vnodemap));
+    dom->numa_memszs = (uint64_t *)malloc(
+                          info->vnodes * sizeof(*info->vnuma_mem));
+
+    errno = ERROR_FAIL;
+    if ( dom->numa_memszs == NULL || dom->vnode_to_pnode == NULL ) {
+        info->vnodes = 0;
+        if (dom->vnode_to_pnode)
+            free(dom->vnode_to_pnode);
+        if (dom->numa_memszs)
+            free(dom->numa_memszs);
+        return -1;
+    }
+
+    memcpy(dom->numa_memszs, info->vnuma_mem,
+            sizeof(*info->vnuma_mem) * info->vnodes);
+    memcpy(dom->vnode_to_pnode, info->vnuma_vnodemap,
+            sizeof(*info->vnuma_vnodemap) * info->vnodes);
+
+    dom->vnodes = info->vnodes;
+
+    return 0;
+}
+
 int libxl__build_pv(libxl__gc *gc, uint32_t domid,
              libxl_domain_build_info *info, libxl__domain_build_state *state)
 {
@@ -491,6 +690,11 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
     dom->xenstore_domid = state->store_domid;
     dom->claim_enabled = libxl_defbool_val(info->claim_mode);
 
+    if ( (ret = libxl__dom_vnuma_init(info, dom)) != 0 ) {
+        LOGE(ERROR, "Failed to set domain vnuma");
+        goto out;
+    }
+
     if ( (ret = xc_dom_boot_xen_init(dom, ctx->xch, domid)) != 0 ) {
         LOGE(ERROR, "xc_dom_boot_xen_init failed");
         goto out;
-- 
1.7.10.4


* [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
                   ` (4 preceding siblings ...)
  2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
@ 2014-09-03  4:24 ` Elena Ufimtseva
  2014-09-03 15:42   ` Konrad Rzeszutek Wilk
  2014-09-11 17:13   ` Dario Faggioli
  2014-09-03 15:37 ` [PATCH v10 3/9] vnuma hook to debug-keys u Konrad Rzeszutek Wilk
  6 siblings, 2 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-03  4:24 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, JBeulich,
	Elena Ufimtseva

Parse the vNUMA topology: the number of nodes, the memory
ranges, and related options. If not defined, initialize vNUMA with
only one node and a default topology. This single node covers
all domain memory and has all vcpus assigned to it.
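
For illustration, a hedged example of a guest config fragment using
the options documented below (the parser in this patch reads the node
count as "vnodes"; memory and vcpus are standard xl options, and the
vnode memory sizes must sum to the domain memory):

    memory = 4096
    vcpus  = 4

    vnodes         = 2
    vnuma_mem      = [2048, 2048]
    vdistance      = [10, 20]
    vnuma_vcpumap  = [0, 0, 1, 1]
    vnuma_vnodemap = [0, 1]
    vnuma_autoplacement = 0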

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 docs/man/xl.cfg.pod.5    |   77 +++++++++
 tools/libxl/xl_cmdimpl.c |  433 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 510 insertions(+)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index f1fc906..2ee2cbc 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
 A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
 it will crash.
 
+=item B<vnuma_nodes=N>
+
+Number of vNUMA nodes the guest will be initialized with on boot.
+A PV guest will have one vNUMA node by default.
+
+=item B<vnuma_mem=[vmem1, vmem2, ...]>
+
+List of memory sizes for each node, defined in MBytes. The number of items
+listed must match the number of vNUMA nodes. If the sum of all vnode memories
+does not match the domain memory, or there are missing nodes, it will fail.
+If not specified, memory will be split equally between vnodes. The current
+minimum memory size for one node is limited to 32MB.
+
+Example: vnuma_mem=[1024, 1024, 2048, 2048]
+Total amount of memory in guest: 6GB
+
+=item B<vdistance=[d1, d2]>
+
+Defines the distance table for vNUMA nodes. NUMA topology distances are
+represented by a two-dimensional square matrix whose element [i,j] is the
+distance between nodes i and j. The trivial case is where all diagonal
+elements are equal and the matrix is symmetrical. The vdistance option
+allows two values, d1 and d2, to be defined: d1 is used for all diagonal
+elements of the distance matrix, and all other elements are set to d2.
+Distances are usually multiples of 10 in Linux and the same rule is used here.
+If not specified, the default constant values will be used for the distances,
+e.g. [10, 20]. For one node the default distance is [10].
+
+Examples:
+vnodes = 3
+vdistance=[10, 20]
+will create this distance table (this is default setting as well):
+[10, 20, 20]
+[20, 10, 20]
+[20, 20, 10]
+
+=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
+
+Defines the vcpu to vnode mapping as a list of integers. The position in the
+list is a vcpu number, and the value is the vnode number to which that vcpu
+will be assigned.
+Current limitations:
+- Each vNUMA node must have at least one vcpu, otherwise the default vcpu_to_vnode map will be used.
+- The total number of vnodes cannot be bigger than the number of vcpus.
+
+Example:
+Map of 4 vcpus to 2 vnodes:
+0,1 vcpu -> vnode0
+2,3 vcpu -> vnode1:
+
+vnuma_vcpumap = [0, 0, 1, 1]
+ 4 vcpus here -  0  1  2  3
+
+=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
+
+List of physical node numbers; the position in the list represents the vnode number.
+Used for manual placement of vNUMA nodes on physical NUMA nodes.
+It will not be used if automatic NUMA placement is active.
+
+Example:
+assume a NUMA machine with 4 physical nodes. To place vnuma node 0 on pnode 2
+and vnuma node 1 on pnode 3:
+vnode0 -> pnode2
+vnode1 -> pnode3
+
+vnuma_vnodemap=[2, 3]
+The first vnode will be placed on pnode 2, the second on pnode 3.
+
+=item B<vnuma_autoplacement=[0|1]>
+
+If set to 1 and automatic NUMA placement is enabled, the best physical nodes to
+place the vnuma nodes on will be found automatically and vnuma_vnodemap will be
+ignored. Automatic NUMA placement is enabled if the domain has no pinned cpus.
+If vnuma_autoplacement is set to 0, the vnodes will be placed on the NUMA nodes
+set in vnuma_vnodemap if there is enough memory on those physical nodes. If not,
+the allocation will be made on any available node and may span multiple physical NUMA nodes.
+
 =back
 
 =head3 Event Actions
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 409a795..1af2250 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
 #include "libxl_json.h"
 #include "libxlutil.h"
 #include "xl.h"
+#include "libxl_vnuma.h"
 
 /* For calls which return an errno on failure */
 #define CHK_ERRNOVAL( call ) ({                                         \
@@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
     }
 }
 
+static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+    const char *buf;
+    char *ep;
+    unsigned long ul;
+    int rc = -EINVAL;
+
+    buf = xlu_cfg_get_listitem(list, i);
+    if (!buf)
+        return rc;
+    ul = strtoul(buf, &ep, 10);
+    if (ep == buf)
+        return rc;
+    if (ul >= UINT16_MAX)
+        return rc;
+    return (unsigned int)ul;
+}
+
+static void vdistance_set(unsigned int *vdistance,
+                                unsigned int nr_vnodes,
+                                unsigned int samenode,
+                                unsigned int othernode)
+{
+    unsigned int idx, slot;
+    for (idx = 0; idx < nr_vnodes; idx++)
+        for (slot = 0; slot < nr_vnodes; slot++)
+            *(vdistance + slot * nr_vnodes + idx) =
+                idx == slot ? samenode : othernode;
+}
+
+static void vcputovnode_default(unsigned int *cpu_to_node,
+                                unsigned int nr_vnodes,
+                                unsigned int max_vcpus)
+{
+    unsigned int cpu;
+    for (cpu = 0; cpu < max_vcpus; cpu++)
+        cpu_to_node[cpu] = cpu % nr_vnodes;
+}
+
+/* Split domain memory between vNUMA nodes equally. */
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+    unsigned long long vnodemem = 0;
+    unsigned long n;
+    unsigned int i;
+
+    if (b_info->vnodes == 0)
+        return -1;
+
+    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
+    if (vnodemem < MIN_VNODE_SIZE)
+        return -1;
+    /* remainder in MBytes. */
+    n = (b_info->max_memkb >> 10) % b_info->vnodes;
+    /* get final sizes in MBytes. */
+    for (i = 0; i < (b_info->vnodes - 1); i++)
+        b_info->vnuma_mem[i] = vnodemem;
+    /* add the remainder to the last node. */
+    b_info->vnuma_mem[i] = vnodemem + n;
+    return 0;
+}
+
+static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
+                                   unsigned int nr_vnodes)
+{
+    unsigned int i;
+    for (i = 0; i < nr_vnodes; i++)
+        vnuma_vnodemap[i] = VNUMA_NO_NODE;
+}
+
+/*
+ * init vNUMA to "zero config" with one node and all other
+ * topology parameters set to default.
+ */
+static int vnuma_default_config(libxl_domain_build_info *b_info)
+{
+    b_info->vnodes = 1;
+    /* all memory goes to this one vnode, as well as vcpus. */
+    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
+                                sizeof(*b_info->vnuma_mem))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
+                                sizeof(*b_info->vnuma_vcpumap))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
+                                b_info->vnodes, sizeof(*b_info->vdistance))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
+                                sizeof(*b_info->vnuma_vnodemap))))
+        goto bad_vnumazerocfg;
+
+    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
+
+    /* all vcpus assigned to this vnode. */
+    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
+                        b_info->max_vcpus);
+
+    /* default vdistance is 10. */
+    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
+
+    /* VNUMA_NO_NODE for vnode_to_pnode. */
+    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
+
+    /*
+     * will be placed to some physical nodes defined by automatic
+     * numa placement or VNUMA_NO_NODE will not request exact node.
+     */
+    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
+    return 0;
+
+ bad_vnumazerocfg:
+    return -1;
+}
+
+static void free_vnuma_info(libxl_domain_build_info *b_info)
+{
+    free(b_info->vnuma_mem);
+    free(b_info->vdistance);
+    free(b_info->vnuma_vcpumap);
+    free(b_info->vnuma_vnodemap);
+
+    b_info->vnuma_mem = NULL;
+    b_info->vdistance = NULL;
+    b_info->vnuma_vcpumap = NULL;
+    b_info->vnuma_vnodemap = NULL;
+
+    b_info->vnodes = 0;
+    b_info->vmemranges = 0;
+}
+
+static int parse_vnuma_mem(XLU_Config *config,
+                            libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnumamemcfg;
+    int nr_vnuma_regions, i;
+    unsigned long long vnuma_memparsed = 0;
+    unsigned long ul;
+    const char *buf;
+    char *ep;
+
+    dst = *b_info;
+    if (!xlu_cfg_get_list(config, "vnuma_mem",
+                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
+
+        if (nr_vnuma_regions != dst->vnodes) {
+            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
+                    incorrect (should be %d).\n", nr_vnuma_regions,
+                    dst->vnodes);
+            goto bad_vnuma_mem;
+        }
+
+        dst->vnuma_mem = calloc(dst->vnodes,
+                                 sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        /*
+         * Will parse only nr_vnodes times, even if we have more/less regions.
+         * Take care of it later if less or discard if too many regions.
+         */
+        for (i = 0; i < dst->vnodes; i++) {
+            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+            if (!buf) {
+                fprintf(stderr,
+                        "xl: Unable to get element %d in vnuma memory list.\n", i);
+                goto bad_vnuma_mem;
+            }
+
+            ul = strtoul(buf, &ep, 10);
+            if (ep == buf) {
+                fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf);
+                goto bad_vnuma_mem;
+            }
+
+            /* 32Mb is a min size for a node, taken from Linux */
+            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
+                        ul, MIN_VNODE_SIZE, UINT32_MAX);
+                goto bad_vnuma_mem;
+            }
+
+            /* memory in MBytes */
+            dst->vnuma_mem[i] = ul;
+        }
+
+        /* Total memory for vNUMA parsed to verify */
+        for (i = 0; i < nr_vnuma_regions; i++)
+            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
+
+        /* Amount of memory for vnodes same as total? */
+        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
+            fprintf(stderr, "xl: vnuma memory is not the same as domain \
+                    memory size.\n");
+            goto bad_vnuma_mem;
+        }
+    } else {
+        dst->vnuma_mem = calloc(dst->vnodes,
+                                      sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
+        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
+                to cover %lu Kbytes.\n",
+                dst->max_memkb / dst->vnodes, dst->max_memkb);
+
+        if (split_vnumamem(dst) < 0) {
+            fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
+            goto bad_vnuma_mem;
+        }
+    }
+    return 0;
+
+ bad_vnuma_mem:
+    return -1;
+}
+
+static int parse_vnuma_distance(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vdistancecfg;
+    int nr_vdist;
+
+    dst = *b_info;
+    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
+                               sizeof(*dst->vdistance));
+    if (dst->vdistance == NULL)
+        goto bad_distance;
+
+    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
+        int d1, d2, i;
+        /*
+         * First value is the same node distance, the second as the
+         * rest of distances. The following is required right now to
+         * avoid non-symmetrical distance table as it may break latest kernel.
+         * TODO: Better way to analyze extended distance table, possibly
+         * OS specific.
+         */
+
+        for (i = 0; i < nr_vdist; i++) {
+            d1 = get_list_item_uint(vdistancecfg, i);
+        }
+
+        d1 = get_list_item_uint(vdistancecfg, 0);
+        if (dst->vnodes > 1)
+           d2 = get_list_item_uint(vdistancecfg, 1);
+        else
+           d2 = d1;
+
+        if (d1 >= 0 && d2 >= 0) {
+            if (d1 < d2)
+                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2);
+            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
+        } else {
+            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
+            goto bad_distance;
+        }
+    } else {
+        fprintf(stderr, "Could not parse vnuma distances.\n");
+        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
+    }
+    return 0;
+
+ bad_distance:
+    return -1;
+}
+
+static int parse_vnuma_vcpumap(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vcpumap;
+    int nr_vcpumap, i;
+
+    dst = *b_info;
+    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
+                                     sizeof(*dst->vnuma_vcpumap));
+    if (dst->vnuma_vcpumap == NULL)
+        goto bad_vcpumap;
+
+    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
+                          &vcpumap, &nr_vcpumap, 0)) {
+        if (nr_vcpumap == dst->max_vcpus) {
+            unsigned int  vnode, vcpumask = 0, vmask;
+
+            vmask = ~(~0 << nr_vcpumap);
+            for (i = 0; i < nr_vcpumap; i++) {
+                vnode = get_list_item_uint(vcpumap, i);
+                if (vnode >= 0 && vnode < dst->vnodes) {
+                    vcpumask |= (1 << i);
+                    dst->vnuma_vcpumap[i] = vnode;
+                }
+            }
+
+            /* Did it covered all vnodes in the vcpu mask? */
+            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
+                fprintf(stderr, "WARNING: Not all vnodes were covered \
+                        in numa_cpumask.\n");
+                goto bad_vcpumap;
+            }
+        } else {
+            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
+            goto bad_vcpumap;
+        }
+    }
+    else
+        vcputovnode_default(dst->vnuma_vcpumap,
+                            dst->vnodes,
+                            dst->max_vcpus);
+    return 0;
+
+ bad_vcpumap:
+    return -1;
+}
+
+static int parse_vnuma_vnodemap(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnodemap;
+    int nr_vnodemap, i;
+
+    dst = *b_info;
+
+    /* There is mapping to NUMA physical nodes? */
+    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
+                           sizeof(*dst->vnuma_vnodemap));
+    if (dst->vnuma_vnodemap == NULL)
+        goto bad_vnodemap;
+
+    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",
+                          &vnodemap, &nr_vnodemap, 0)) {
+        /*
+         * If not specified or incorrect, will be defined
+         * later based on the machine architecture, configuration
+         * and memory available when creating the domain.
+         */
+        libxl_defbool_set(&dst->vnuma_autoplacement, false);
+        if (nr_vnodemap == dst->vnodes) {
+            unsigned int vnodemask = 0, pnode, smask;
+            smask = ~(~0 << dst->vnodes);
+            for (i = 0; i < dst->vnodes; i++) {
+                pnode = get_list_item_uint(vnodemap, i);
+                if (pnode >= 0) {
+                    vnodemask |= (1 << i);
+                    dst->vnuma_vnodemap[i] = pnode;
+                }
+            }
+
+            /* Did it covered all vnodes in the mask? */
+            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
+                fprintf(stderr, "WARNING: Not all vnodes were covered \
+                        vnuma_vnodemap.\n");
+                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+                libxl_defbool_set(&dst->vnuma_autoplacement, true);
+                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+            }
+        }
+        else {
+            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
+            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+            libxl_defbool_set(&dst->vnuma_autoplacement, true);
+            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+        }
+    }
+    else {
+        fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
+        fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+        libxl_defbool_set(&dst->vnuma_autoplacement, true);
+        vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
+    }
+    return 0;
+
+ bad_vnodemap:
+    return -1;
+
+}
+
+static void parse_vnuma_config(XLU_Config *config,
+                               libxl_domain_build_info *b_info)
+{
+    long l;
+
+    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+        if (l > MAX_VNUMA_NODES) {
+            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
+                    MAX_VNUMA_NODES);
+            goto bad_vnuma_config;
+        }
+        b_info->vnodes = l;
+
+        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
+                    &b_info->vnuma_autoplacement, 0))
+            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
+
+        /* Only construct nodes with at least one vcpu. */
+        if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) {
+            if (parse_vnuma_mem(config, &b_info) ||
+                parse_vnuma_distance(config, &b_info) ||
+                parse_vnuma_vcpumap(config, &b_info) ||
+                parse_vnuma_vnodemap(config, &b_info))
+                goto bad_vnuma_config;
+        }
+        else if (vnuma_default_config(b_info))
+            goto bad_vnuma_config;
+    }
+    /* If vnuma topology is not defined for domain, init one node */
+    else if (vnuma_default_config(b_info))
+            goto bad_vnuma_config;
+    return;
+
+ bad_vnuma_config:
+    fprintf(stderr, "Failed to parse vnuma config or set default vnuma config.\n");
+    free_vnuma_info(b_info);
+    exit(1);
+}
+
 static void parse_config_data(const char *config_source,
                               const char *config_data,
                               int config_len,
@@ -924,6 +1351,12 @@ static void parse_config_data(const char *config_source,
 
     libxl_defbool_set(&b_info->claim_mode, claim_mode);
 
+    /*
+     * If there is no vnuma in config, "zero" vnuma config
+     * will be initialized with one node and other defaults.
+     */
+    parse_vnuma_config(config, b_info);
+
     if (xlu_cfg_get_string (config, "on_poweroff", &buf, 0))
         buf = "destroy";
     if (!parse_action_on_shutdown(buf, &d_config->on_poweroff)) {
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA
  2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
@ 2014-09-03 14:53   ` Ian Campbell
  0 siblings, 0 replies; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 14:53 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> With the introduction of the XEN_DOMCTL_setvnumainfo
> in patch titled: "xen: vnuma topology and subop hypercalls"
> we put in the plumbing here to use from the toolstack. The user
> is allowed to call this multiple times if they wish so.
> It will error out if the nr_vnodes or nr_vcpus is zero.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> 
> Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

This looks like a valid wrapping of a hypercall to me:
Acked-by: Ian Campbell <ian.campbell@citrix.com>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declararion Elena Ufimtseva
@ 2014-09-03 15:03   ` Ian Campbell
  2014-09-03 16:04   ` Ian Campbell
  1 sibling, 0 replies; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 15:03 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> Adds vnuma topology types declarations to libxl_domain_build_info
> structure.

Normally we would add the data types along with the implementation
rather than splitting it out like this.

Anyway, at some point in this series once everything is in place you
will need to introduce a LIBXL_HAVE_ define to indicate the availability
of this feature (maybe this comes later).
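
For example, something along these lines in libxl.h (the exact name,
LIBXL_HAVE_VNUMA here, is only a placeholder):

    /*
     * LIBXL_HAVE_VNUMA
     *
     * If this is defined the libxl_domain_build_info structure contains
     * the vNUMA fields (vnodes, vnuma_mem, vdistance, ...).
     */
    #define LIBXL_HAVE_VNUMA 1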

> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  tools/libxl/libxl_types.idl |    8 +++++++-
>  tools/libxl/libxl_vnuma.h   |   16 ++++++++++++++++
>  2 files changed, 23 insertions(+), 1 deletion(-)
>  create mode 100644 tools/libxl/libxl_vnuma.h
> 
> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
> index 08a7927..ea8bac0 100644
> --- a/tools/libxl/libxl_types.idl
> +++ b/tools/libxl/libxl_types.idl
> @@ -333,7 +333,13 @@ libxl_domain_build_info = Struct("domain_build_info",[
>      ("disable_migrate", libxl_defbool),
>      ("cpuid",           libxl_cpuid_policy_list),
>      ("blkdev_start",    string),
> -    
> +    ("vnodes",          uint32),
> +    ("vmemranges",      uint32),
> +    ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
> +    ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
> +    ("vdistance",       Array(uint32, "num_vdistance")),
> +    ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
> +    ("vnuma_autoplacement",  libxl_defbool),
>      ("device_model_version", libxl_device_model_version),
>      ("device_model_stubdomain", libxl_defbool),
>      # if you set device_model you must set device_model_version too
> diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
> new file mode 100644
> index 0000000..4312070
> --- /dev/null
> +++ b/tools/libxl/libxl_vnuma.h
> @@ -0,0 +1,16 @@
> +#ifndef LIBXL_VNUMA_H
> +#define LIBXL_VNUMA_H
> +
> +#include "libxl_osdeps.h" /* must come before any other headers */
> +
> +#define VNUMA_NO_NODE ~((unsigned int)0)

Is this intended to be assigned to e.g. vnuma_vnodemap? If so it should
probably have the same type (e.g. uint32_t).

> +
> +/*
> + * Min vNUMA node size in MBytes from Linux for x86 architecture.
> + * See linux source code arch/x86/include/asm/numa.h
> + */
> +#define MIN_VNODE_SIZE  (4)
> +
> +#define MAX_VNUMA_NODES ((unsigned int)1 << 10)

(unsigned int)1 is "1U" I think.
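
i.e. just:

    #define MAX_VNUMA_NODES (1U << 10)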

> +
> +#endif

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 6/9] libxl: build numa nodes memory blocks
  2014-09-03  4:24 ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Elena Ufimtseva
@ 2014-09-03 15:21   ` Ian Campbell
  2014-09-04  4:47     ` Elena Ufimtseva
  2014-09-05  3:50     ` Elena Ufimtseva
  2014-09-12 10:18   ` Dario Faggioli
  1 sibling, 2 replies; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 15:21 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> Create the vmemrange structure based on the
> PV guest's E820 map. Values are in Megabytes.
> Also export the E820 filter code e820_sanitize
> so that it is available internally.
> While Xen can support multi-range vNUMA nodes, for
> a PV guest it is one range per node. The other
> domain types should have their building routines
> implemented.
> 
> Changes since v8:
>     - Added setting of the vnuma memory ranges node ids;
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  tools/libxl/libxl_internal.h |    9 ++
>  tools/libxl/libxl_numa.c     |  201 ++++++++++++++++++++++++++++++++++++++++++
>  tools/libxl/libxl_x86.c      |    3 +-
>  3 files changed, 212 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
> index beb052e..63ccb5e 100644
> --- a/tools/libxl/libxl_internal.h
> +++ b/tools/libxl/libxl_internal.h
> @@ -3088,6 +3088,15 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
>      libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
>  }
>  
> +bool libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info);
> +
> +int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], uint32_t *nr_entries,
> +                  unsigned long map_limitkb, unsigned long balloon_kb);

This is x86 specific, so simply exposing it to common libxl code as
you've done will break on arm.

I'm not sure if this means moving the whole thing to libxl_x86 and
adding an ARM stub or if there is some common stuff from which the x86
stuff needs to be abstracted.
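
If it is the former then I'd expect the arm side to only need a trivial
stub, something like this (sketch only, reusing the libxl__vnuma_align_mem
prototype below; ERROR_NI or whatever error is most appropriate):

    int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
                               struct libxl_domain_build_info *b_info,
                               vmemrange_t *memblks)
    {
        /* No vNUMA support for arm guests yet. */
        return ERROR_NI;
    }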

> +
> +int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
> +                           struct libxl_domain_build_info *b_info,
> +                           vmemrange_t *memblks);
> +
>  _hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
>                                     const libxl_ms_vm_genid *id);
>  
> diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
> index 94ca4fe..c416faf 100644
> --- a/tools/libxl/libxl_numa.c
> +++ b/tools/libxl/libxl_numa.c
> @@ -19,6 +19,10 @@
>  
>  #include "libxl_internal.h"
>  
> +#include "libxl_vnuma.h"
> +
> +#include "xc_private.h"
> +
>  /*
>   * What follows are helpers for generating all the k-combinations
>   * without repetitions of a set S with n elements in it. Formally
> @@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
>  }
>  
>  /*
> + * Check if we can fit vnuma nodes to numa pnodes
> + * from vnode_to_pnode array.
> + */
> +bool libxl__vnodemap_is_usable(libxl__gc *gc,
> +                            libxl_domain_build_info *info)
> +{
> +    unsigned int i;
> +    libxl_numainfo *ninfo = NULL;
> +    unsigned long long *claim;
> +    unsigned int node;
> +    uint64_t *sz_array;

I don't think you set this, so please make it const so as to make sure.

> +    int nr_nodes = 0;
> +
> +    /* Cannot use specified mapping if not NUMA machine. */
> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> +    if (ninfo == NULL)
> +        return false;
> +
> +    sz_array = info->vnuma_mem;
> +    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
> +    /* Get total memory required on each physical node. */
> +    for (i = 0; i < info->vnodes; i++)
> +    {
> +        node = info->vnuma_vnodemap[i];
> +
> +        if (node < nr_nodes)
> +            claim[node] += (sz_array[i] << 20);
> +        else
> +            goto vnodemapout;
> +   }
> +   for (i = 0; i < nr_nodes; i++) {
> +       if (claim[i] > ninfo[i].free)
> +          /* Cannot complete user request, falling to default. */
> +          goto vnodemapout;
> +   }
> +
> + vnodemapout:
> +   return true;

Apart from non-NUMA systems, you always return true. Is that really
correct? I'd expect one or the other "goto vnodemapout" to want to
return false.

"out" is OK as a label name.


> +
> +/*
> + * For each node, build memory block start and end addresses.
> + * Substract any memory hole from the range found in e820 map.
> + * vnode memory size are passed here in megabytes, the result is
> + * in memory block addresses.
> + * Linux kernel will adjust numa memory block sizes on its own.
> + * But we want to provide to the kernel numa block addresses that
> + * will be the same in kernel and hypervisor.

You shouldn't be making these sorts of guest OS assumptions. This is not
only Linux specific but also (presumably) specific to the version of
Linux you happened to be looking at.

Please define the semantics wrt the interface and guarantees which Xen
wants to provide to all PV guests, not just one particular kernel.

> + */
> +int libxl__vnuma_align_mem(libxl__gc *gc,

Is "align" in the name here accurate? The doc comment says "build"
instead, which suggests it is creating things (perhaps aligned as it
goes).

> +                            uint32_t domid,
> +                            /* IN: mem sizes in megabytes */
> +                            libxl_domain_build_info *b_info,

if it is input only then please make it const.

> +                            /* OUT: linux NUMA blocks addresses */

Again -- Linux specific.

> +    libxl_ctx *ctx = libxl__gc_owner(gc);

You can use CTX instead of creating this local thing.
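
(CTX already expands to libxl__gc_owner(gc) in libxl_internal.h, so e.g.:

    ninfo = libxl_get_numainfo(CTX, &nr_nodes);

works directly, assuming that is the sort of call the local is for.)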

Ian.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled
  2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
@ 2014-09-03 15:26   ` Ian Campbell
  2014-09-12 11:06   ` Dario Faggioli
  1 sibling, 0 replies; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 15:26 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> vNUMA-aware domain memory allocation based on provided
> vnode to pnode map. If this map is not defined, use
> default allocation. Default allocation will not specify
> any physical node when allocating memory.
> Domain creation will fail if any node is left undefined.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  tools/libxc/xc_dom.h     |   13 ++++++++
>  tools/libxc/xc_dom_x86.c |   76 ++++++++++++++++++++++++++++++++++++++--------
>  2 files changed, 77 insertions(+), 12 deletions(-)
> 
> diff --git a/tools/libxc/xc_dom.h b/tools/libxc/xc_dom.h
> index 6ae6a9f..61c2a06 100644
> --- a/tools/libxc/xc_dom.h
> +++ b/tools/libxc/xc_dom.h
> @@ -164,6 +164,16 @@ struct xc_dom_image {
>  
>      /* kernel loader */
>      struct xc_dom_arch *arch_hooks;
> +
> +   /*
> +    * vNUMA topology and memory allocation structure.
> +    * Defines the way memory is allocated per physical
> +    * NUMA node, as defined by vnode_to_pnode.
> +    */
> +    uint32_t vnodes;
> +    uint64_t *numa_memszs;
> +    unsigned int *vnode_to_pnode;
> +
>      /* allocate up to virt_alloc_end */
>      int (*allocate) (struct xc_dom_image * dom, xen_vaddr_t up_to);
>  };
> @@ -385,6 +395,9 @@ static inline xen_pfn_t xc_dom_p2m_guest(struct xc_dom_image *dom,
>  int arch_setup_meminit(struct xc_dom_image *dom);
>  int arch_setup_bootearly(struct xc_dom_image *dom);
>  int arch_setup_bootlate(struct xc_dom_image *dom);
> +int arch_boot_alloc(struct xc_dom_image *dom);

I don't think this should be public. A static helper within xc_dom_x86.c
would be fine.

Otherwise the question becomes: what should the arm version of this
function be?
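
i.e. drop the declaration from xc_dom.h and just make it:

    static int arch_boot_alloc(struct xc_dom_image *dom);

in xc_dom_x86.c.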

> +
> +#define LIBXC_VNUMA_NO_NODE ~((unsigned int)0)
>  
>  /*
>   * Local variables:
> diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
> index bf06fe4..f2b4c98 100644
> --- a/tools/libxc/xc_dom_x86.c
> +++ b/tools/libxc/xc_dom_x86.c
> @@ -759,7 +759,7 @@ static int x86_shadow(xc_interface *xch, domid_t domid)
>  int arch_setup_meminit(struct xc_dom_image *dom)
>  {
>      int rc;
> -    xen_pfn_t pfn, allocsz, i, j, mfn;
> +    xen_pfn_t pfn, i, j, mfn;
>  
>      rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
>      if ( rc )
> @@ -811,25 +811,77 @@ int arch_setup_meminit(struct xc_dom_image *dom)
>          /* setup initial p2m */
>          for ( pfn = 0; pfn < dom->total_pages; pfn++ )
>              dom->p2m_host[pfn] = pfn;
> +
> +        /*
> +         * Any PV domain should have at least one vNUMA node.
> +         * If no config was defined, one default vNUMA node
> +         * will be set.
> +         */
> +        if ( dom->vnodes == 0 ) {
> +            xc_dom_printf(dom->xch,
> +                         "%s: Cannot construct vNUMA topology with 0 vnodes\n",
> +                         __FUNCTION__);
> +            return -EINVAL;
> +        }
>          
>          /* allocate guest memory */
> -        for ( i = rc = allocsz = 0;
> -              (i < dom->total_pages) && !rc;
> -              i += allocsz )
> -        {
> -            allocsz = dom->total_pages - i;
> -            if ( allocsz > 1024*1024 )
> -                allocsz = 1024*1024;
> -            rc = xc_domain_populate_physmap_exact(
> -                dom->xch, dom->guest_domid, allocsz,
> -                0, 0, &dom->p2m_host[i]);
> -        }
> +        rc = arch_boot_alloc(dom);
> +        if ( rc )
> +            return rc;
>  
>          /* Ensure no unclaimed pages are left unused.
>           * OK to call if hadn't done the earlier claim call. */
>          (void)xc_domain_claim_pages(dom->xch, dom->guest_domid,
>                                      0 /* cancels the claim */);
>      }
> +    return rc;
> +}
> +
> +/*
> + * Allocates domain memory taking into account
> + * defined vnuma topology and vnode_to_pnode map.
> + * Any pv guest will have at least one vnuma node
> + * with vnuma_memszs[0] = domain memory and the rest
> + * topology initialized with default values.
> + */
> +int arch_boot_alloc(struct xc_dom_image *dom)

This seems to make no reference to dom->total_pages, which the old code
used. What guarantees that the numa node sizes don't sum to too much?
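
A check along these lines at the top of the function would at least catch
a mismatch (untested sketch, reusing the names from this patch):

    unsigned long long total = 0;
    unsigned int n;

    for ( n = 0; n < dom->vnodes; n++ )
        total += (dom->numa_memszs[n] << 20) >> PAGE_SHIFT_X86;
    if ( total != dom->total_pages )
        return -EINVAL;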

> +{
> +    int rc;
> +    unsigned int n, memflags;
> +    unsigned long long vnode_pages;
> +    unsigned long long allocsz = 0, node_pfn_base, i;
> +
> +    rc = allocsz = node_pfn_base = n = 0;
> +
> +    for ( n = 0; n < dom->vnodes; n++ )
> +    {
> +        memflags = 0;
> +        if ( dom->vnode_to_pnode[n] != LIBXC_VNUMA_NO_NODE )
> +        {
> +            memflags |= XENMEMF_exact_node(dom->vnode_to_pnode[n]);
> +            memflags |= XENMEMF_exact_node_request;
> +        }
> +        /* numa_memszs are in megabytes; calculate pages for this node. */
> +        vnode_pages = (dom->numa_memszs[n] << 20) >> PAGE_SHIFT_X86;
> +        for ( i = 0; i < vnode_pages; i += allocsz )
> +        {
> +            allocsz = vnode_pages - i;
> +            if ( allocsz > 1024*1024 )
> +                allocsz = 1024*1024;
> +
> +            rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> +                                            allocsz, 0, memflags,
> +                                            &dom->p2m_host[node_pfn_base + i]);
> +            if ( rc )
> +            {
> +                xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
> +                        "%s: Failed allocation of %Lu pages for vnode %d on pnode %d out of %lu\n",
> +                        __FUNCTION__, vnode_pages, n, dom->vnode_to_pnode[n], dom->total_pages);
> +                return rc;
> +            }
> +        }
> +        node_pfn_base += i;
> +    }
>  
>      return rc;
>  }

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 3/9] vnuma hook to debug-keys u
  2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
                   ` (5 preceding siblings ...)
  2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
@ 2014-09-03 15:37 ` Konrad Rzeszutek Wilk
  6 siblings, 0 replies; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-09-03 15:37 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, Sep 03, 2014 at 12:24:12AM -0400, Elena Ufimtseva wrote:
> Add debug-keys hook to display vnuma topology.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>

Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

> ---
>  xen/arch/x86/numa.c |   32 +++++++++++++++++++++++++++++++-
>  1 file changed, 31 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
> index 555276e..346725f 100644
> --- a/xen/arch/x86/numa.c
> +++ b/xen/arch/x86/numa.c
> @@ -347,10 +347,11 @@ EXPORT_SYMBOL(node_data);
>  static void dump_numa(unsigned char key)
>  {
>      s_time_t now = NOW();
> -    int i;
> +    int i, j, err, n;
>      struct domain *d;
>      struct page_info *page;
>      unsigned int page_num_node[MAX_NUMNODES];
> +    uint64_t mem;
>  
>      printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
>            (u32)(now>>32), (u32)now);
> @@ -389,6 +390,35 @@ static void dump_numa(unsigned char key)
>  
>          for_each_online_node(i)
>              printk("    Node %u: %u\n", i, page_num_node[i]);
> +
> +        if (!d->vnuma)
> +            continue;
> +
> +        printk("   %u vnodes, %u vcpus\n", d->vnuma->nr_vnodes, d->max_vcpus);
> +        for (i = 0; i < d->vnuma->nr_vnodes; i++) {
> +            err = snprintf(keyhandler_scratch, 12, "%u",
> +                    d->vnuma->vnode_to_pnode[i]);
> +            if (err < 0 || d->vnuma->vnode_to_pnode[i] == NUMA_NO_NODE)
> +                snprintf(keyhandler_scratch, 3, "???");
> +
> +            printk("        vnode %3u - pnode %s,", i, keyhandler_scratch);
> +            mem = d->vnuma->vmemrange[i].end - d->vnuma->vmemrange[i].start;
> +            printk(" %"PRIu64" MB, ", mem >> 20);
> +
> +            printk("vcpu nrs: ");
> +            for (j = 0, n = 0; j < d->max_vcpus; j++) {
> +                if (d->vnuma->vcpu_to_vnode[j] == i) {
> +                    if ( ((n + 1) % 8) == 0 )
> +                        printk("%d\n", j);
> +                    else if ( !(n % 8) && n != 0 )
> +                        printk("%s%d ", "             ", j);
> +                    else
> +                        printk("%d ", j);
> +                    n++;
> +                }
> +            }
> +            printk("\n");
> +        }
>      }
>  
>      rcu_read_unlock(&domlist_read_lock);
> -- 
> 1.7.10.4
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
@ 2014-09-03 15:42   ` Konrad Rzeszutek Wilk
  2014-09-11 17:13   ` Dario Faggioli
  1 sibling, 0 replies; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-09-03 15:42 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, Sep 03, 2014 at 12:24:18AM -0400, Elena Ufimtseva wrote:
> Parses the vnuma topology: number of nodes and memory
> ranges. If not defined, initializes vnuma with
> only one node and the default topology. This one node covers
> all domain memory and has all vcpus assigned to it.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>


Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

but I am not a native English speaker, so it might be a good thing
for such a person to look over the docs part.


> ---
>  docs/man/xl.cfg.pod.5    |   77 +++++++++
>  tools/libxl/xl_cmdimpl.c |  433 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 510 insertions(+)
> 
> diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
> index f1fc906..2ee2cbc 100644
> --- a/docs/man/xl.cfg.pod.5
> +++ b/docs/man/xl.cfg.pod.5
> @@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
>  A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
>  it will crash.
>  
> +=item B<vnuma_nodes=N>
> +
> +Number of vNUMA nodes the guest will be initialized with on boot.
> +A PV guest will have one vnuma node by default.
> +
> +=item B<vnuma_mem=[vmem1, vmem2, ...]>
> +
> +List of memory sizes for each node, defined in MBytes. The number of items listed
> +must match nr_vnodes. If the sum of all vnode memories does not match the domain
> +memory, or there are missing nodes, it will fail.
> +If not specified, memory will be split equally between vnodes. The current minimum
> +memory size for one node is 32MB.
> +
> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
> +Total amount of memory in guest: 6GB
> +
> +=item B<vdistance=[d1, d2]>
> +
> +Defines the distance table for vNUMA nodes. NUMA topology distances are
> +represented by a two-dimensional square matrix. Its element [i,j] is
> +the distance between nodes i and j. The trivial case is where all diagonal
> +elements are equal and the matrix is symmetrical. The vdistance configuration
> +option allows two values, d1 and d2, to be defined. d1 will be used for all
> +diagonal elements of the distance matrix; all other elements will be set to d2.
> +Distances are usually multiples of 10 in Linux and the same rule is used here.
> +If not specified, the default values [10, 20] will be used for the distances.
> +For a single node the default distance is [10].
> +
> +Examples:
> +vnodes = 3
> +vdistance=[10, 20]
> +will create this distance table (this is default setting as well):
> +[10, 20, 20]
> +[20, 10, 20]
> +[20, 20, 10]
> +
> +=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
> +
> +Defines the vcpu to vnode mapping as a list of integers. The position in the list
> +is the vcpu number, and the value is the vnode number to which the vcpu will be
> +assigned.
> +Current limitations:
> +- Each vNUMA node must have at least one vcpu, otherwise the default vcpu_to_vnode map will be used.
> +- The total number of vnodes cannot be bigger than the number of vcpus.
> +
> +Example:
> +Map of 4 vcpus to 2 vnodes:
> +0,1 vcpu -> vnode0
> +2,3 vcpu -> vnode1:
> +
> +vnuma_vcpumap = [0, 0, 1, 1]
> + 4 vcpus here -  0  1  2  3
> +
> +=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
> +
> +List of physical node numbers; the position in the list represents the vnode number.
> +Used for manual placement of vnuma nodes on physical NUMA nodes.
> +Will not be used if automatic numa placement is active.
> +
> +Example:
> +assume a NUMA machine with 4 physical nodes. Placing vnuma node 0 on pnode 2,
> +vnuma node 1 on pnode 3:
> +vnode0 -> pnode2
> +vnode1 -> pnode3
> +
> +vnuma_vnodemap=[2, 3]
> +first vnode will be placed on node 2, second on node 3.
> +
> +=item B<vnuma_autoplacement=[0|1]>
> +
> +If set to 1 and automatic NUMA placement is enabled, the best physical nodes to
> +place the vnuma nodes on will be chosen automatically and vnuma_vnodemap will be
> +ignored. Automatic NUMA placement is enabled if the domain has no pinned cpus.
> +If vnuma_autoplacement is set to 0, then the vnodes will be placed on the NUMA nodes
> +set in vnuma_vnodemap if there is enough memory on those physical nodes. If not, the
> +allocation will be made on any of the available nodes and may span multiple physical
> +NUMA nodes.
> +
>  =back
>  
>  =head3 Event Actions
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index 409a795..1af2250 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -40,6 +40,7 @@
>  #include "libxl_json.h"
>  #include "libxlutil.h"
>  #include "xl.h"
> +#include "libxl_vnuma.h"
>  
>  /* For calls which return an errno on failure */
>  #define CHK_ERRNOVAL( call ) ({                                         \
> @@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
>      }
>  }
>  
> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
> +{
> +    const char *buf;
> +    char *ep;
> +    unsigned long ul;
> +    int rc = -EINVAL;
> +
> +    buf = xlu_cfg_get_listitem(list, i);
> +    if (!buf)
> +        return rc;
> +    ul = strtoul(buf, &ep, 10);
> +    if (ep == buf)
> +        return rc;
> +    if (ul >= UINT16_MAX)
> +        return rc;
> +    return (unsigned int)ul;
> +}
> +
> +static void vdistance_set(unsigned int *vdistance,
> +                                unsigned int nr_vnodes,
> +                                unsigned int samenode,
> +                                unsigned int othernode)
> +{
> +    unsigned int idx, slot;
> +    for (idx = 0; idx < nr_vnodes; idx++)
> +        for (slot = 0; slot < nr_vnodes; slot++)
> +            *(vdistance + slot * nr_vnodes + idx) =
> +                idx == slot ? samenode : othernode;
> +}
> +
> +static void vcputovnode_default(unsigned int *cpu_to_node,
> +                                unsigned int nr_vnodes,
> +                                unsigned int max_vcpus)
> +{
> +    unsigned int cpu;
> +    for (cpu = 0; cpu < max_vcpus; cpu++)
> +        cpu_to_node[cpu] = cpu % nr_vnodes;
> +}
> +
> +/* Split domain memory between vNUMA nodes equally. */
> +static int split_vnumamem(libxl_domain_build_info *b_info)
> +{
> +    unsigned long long vnodemem = 0;
> +    unsigned long n;
> +    unsigned int i;
> +
> +    if (b_info->vnodes == 0)
> +        return -1;
> +
> +    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
> +    if (vnodemem < MIN_VNODE_SIZE)
> +        return -1;
> +    /* remainder in MBytes. */
> +    n = (b_info->max_memkb >> 10) % b_info->vnodes;
> +    /* get final sizes in MBytes. */
> +    for (i = 0; i < (b_info->vnodes - 1); i++)
> +        b_info->vnuma_mem[i] = vnodemem;
> +    /* add the remainder to the last node. */
> +    b_info->vnuma_mem[i] = vnodemem + n;
> +    return 0;
> +}
> +
> +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
> +                                   unsigned int nr_vnodes)
> +{
> +    unsigned int i;
> +    for (i = 0; i < nr_vnodes; i++)
> +        vnuma_vnodemap[i] = VNUMA_NO_NODE;
> +}
> +
> +/*
> + * init vNUMA to "zero config" with one node and all other
> + * topology parameters set to default.
> + */
> +static int vnuma_default_config(libxl_domain_build_info *b_info)
> +{
> +    b_info->vnodes = 1;
> +    /* all memory goes to this one vnode, as well as vcpus. */
> +    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_mem))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
> +                                sizeof(*b_info->vnuma_vcpumap))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
> +                                b_info->vnodes, sizeof(*b_info->vdistance))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_vnodemap))))
> +        goto bad_vnumazerocfg;
> +
> +    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
> +
> +    /* all vcpus assigned to this vnode. */
> +    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
> +                        b_info->max_vcpus);
> +
> +    /* default vdistance is 10. */
> +    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
> +
> +    /* VNUMA_NO_NODE for vnode_to_pnode. */
> +    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
> +
> +    /*
> +     * vnodes will be placed on physical nodes chosen by automatic
> +     * numa placement; VNUMA_NO_NODE means no exact node is requested.
> +     */
> +    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
> +    return 0;
> +
> + bad_vnumazerocfg:
> +    return -1;
> +}
> +
> +static void free_vnuma_info(libxl_domain_build_info *b_info)
> +{
> +    free(b_info->vnuma_mem);
> +    free(b_info->vdistance);
> +    free(b_info->vnuma_vcpumap);
> +    free(b_info->vnuma_vnodemap);
> +
> +    b_info->vnuma_mem = NULL;
> +    b_info->vdistance = NULL;
> +    b_info->vnuma_vcpumap = NULL;
> +    b_info->vnuma_vnodemap = NULL;
> +
> +    b_info->vnodes = 0;
> +    b_info->vmemranges = 0;
> +}
> +
> +static int parse_vnuma_mem(XLU_Config *config,
> +                            libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vnumamemcfg;
> +    int nr_vnuma_regions, i;
> +    unsigned long long vnuma_memparsed = 0;
> +    unsigned long ul;
> +    const char *buf;
> +    char *ep;
> +
> +    dst = *b_info;
> +    if (!xlu_cfg_get_list(config, "vnuma_mem",
> +                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
> +
> +        if (nr_vnuma_regions != dst->vnodes) {
> +            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
> +                    incorrect (should be %d).\n", nr_vnuma_regions,
> +                    dst->vnodes);
> +            goto bad_vnuma_mem;
> +        }
> +
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                 sizeof(*dst->vnuma_mem));
> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> +            goto bad_vnuma_mem;
> +        }
> +
> +        /*
> +         * Will parse only nr_vnodes times, even if we have more/less regions.
> +         * Take care of it later if less or discard if too many regions.
> +         */
> +        for (i = 0; i < dst->vnodes; i++) {
> +            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
> +            if (!buf) {
> +                fprintf(stderr,
> +                        "xl: Unable to get element %d in vnuma memory list.\n", i);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            ul = strtoul(buf, &ep, 10);
> +            if (ep == buf) {
> +                fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            /* 32Mb is a min size for a node, taken from Linux */
> +            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
> +                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
> +                        ul, MIN_VNODE_SIZE, UINT32_MAX);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            /* memory in MBytes */
> +            dst->vnuma_mem[i] = ul;
> +        }
> +
> +        /* Total memory for vNUMA parsed to verify */
> +        for (i = 0; i < nr_vnuma_regions; i++)
> +            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
> +
> +        /* Amount of memory for vnodes same as total? */
> +        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
> +            fprintf(stderr, "xl: vnuma memory is not the same as domain \
> +                    memory size.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    } else {
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                      sizeof(*dst->vnuma_mem));
> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> +            goto bad_vnuma_mem;
> +        }
> +
> +        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
> +        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
> +                to cover %lu Kbytes.\n",
> +                dst->max_memkb / dst->vnodes, dst->max_memkb);
> +
> +        if (split_vnumamem(dst) < 0) {
> +            fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    }
> +    return 0;
> +
> + bad_vnuma_mem:
> +    return -1;
> +}
> +
> +static int parse_vnuma_distance(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vdistancecfg;
> +    int nr_vdist;
> +
> +    dst = *b_info;
> +    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
> +                               sizeof(*dst->vdistance));
> +    if (dst->vdistance == NULL)
> +        goto bad_distance;
> +
> +    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
> +        int d1, d2, i;
> +        /*
> +         * The first value is the same-node distance, the second is used
> +         * for the rest of the distances. This is required right now to avoid
> +         * a non-symmetrical distance table, as that may break recent kernels.
> +         * TODO: Better way to analyze extended distance table, possibly
> +         * OS specific.
> +         */
> +
> +        for (i = 0; i < nr_vdist; i++) {
> +            d1 = get_list_item_uint(vdistancecfg, i);
> +        }
> +
> +        d1 = get_list_item_uint(vdistancecfg, 0);
> +        if (dst->vnodes > 1)
> +           d2 = get_list_item_uint(vdistancecfg, 1);
> +        else
> +           d2 = d1;
> +
> +        if (d1 >= 0 && d2 >= 0) {
> +            if (d1 < d2)
> +                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2);
> +            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
> +        } else {
> +            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
> +            goto bad_distance;
> +        }
> +    } else {
> +        fprintf(stderr, "Could not parse vnuma distances.\n");
> +        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
> +    }
> +    return 0;
> +
> + bad_distance:
> +    return -1;
> +}
> +
> +static int parse_vnuma_vcpumap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vcpumap;
> +    int nr_vcpumap, i;
> +
> +    dst = *b_info;
> +    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
> +                                     sizeof(*dst->vnuma_vcpumap));
> +    if (dst->vnuma_vcpumap == NULL)
> +        goto bad_vcpumap;
> +
> +    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
> +                          &vcpumap, &nr_vcpumap, 0)) {
> +        if (nr_vcpumap == dst->max_vcpus) {
> +            unsigned int  vnode, vcpumask = 0, vmask;
> +
> +            vmask = ~(~0 << nr_vcpumap);
> +            for (i = 0; i < nr_vcpumap; i++) {
> +                vnode = get_list_item_uint(vcpumap, i);
> +                if (vnode >= 0 && vnode < dst->vnodes) {
> +                    vcpumask |= (1 << i);
> +                    dst->vnuma_vcpumap[i] = vnode;
> +                }
> +            }
> +
> +            /* Did it cover all vnodes in the vcpu mask? */
> +            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        in numa_cpumask.\n");
> +                goto bad_vcpumap;
> +            }
> +        } else {
> +            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
> +            goto bad_vcpumap;
> +        }
> +    }
> +    else
> +        vcputovnode_default(dst->vnuma_vcpumap,
> +                            dst->vnodes,
> +                            dst->max_vcpus);
> +    return 0;
> +
> + bad_vcpumap:
> +    return -1;
> +}
> +
> +static int parse_vnuma_vnodemap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vnodemap;
> +    int nr_vnodemap, i;
> +
> +    dst = *b_info;
> +
> +    /* Is there a mapping to physical NUMA nodes? */
> +    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
> +                           sizeof(*dst->vnuma_vnodemap));
> +    if (dst->vnuma_vnodemap == NULL)
> +        goto bad_vnodemap;
> +
> +    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",
> +                          &vnodemap, &nr_vnodemap, 0)) {
> +        /*
> +         * If not specified or incorrect, it will be defined
> +         * later based on the machine architecture, configuration
> +         * and memory available when creating the domain.
> +         */
> +        libxl_defbool_set(&dst->vnuma_autoplacement, false);
> +        if (nr_vnodemap == dst->vnodes) {
> +            unsigned int vnodemask = 0, pnode, smask;
> +            smask = ~(~0 << dst->vnodes);
> +            for (i = 0; i < dst->vnodes; i++) {
> +                pnode = get_list_item_uint(vnodemap, i);
> +                if (pnode >= 0) {
> +                    vnodemask |= (1 << i);
> +                    dst->vnuma_vnodemap[i] = pnode;
> +                }
> +            }
> +
> +            /* Did it cover all vnodes in the mask? */
> +            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        vnuma_vnodemap.\n");
> +                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +                libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +            }
> +        }
> +        else {
> +            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
> +            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +            libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +        }
> +    }
> +    else {
> +        fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
> +        fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +        libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +        vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +    }
> +    return 0;
> +
> + bad_vnodemap:
> +    return -1;
> +
> +}
> +
> +static void parse_vnuma_config(XLU_Config *config,
> +                               libxl_domain_build_info *b_info)
> +{
> +    long l;
> +
> +    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
> +        if (l > MAX_VNUMA_NODES) {
> +            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
> +                    MAX_VNUMA_NODES);
> +            goto bad_vnuma_config;
> +        }
> +        b_info->vnodes = l;
> +
> +        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
> +                    &b_info->vnuma_autoplacement, 0))
> +            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
> +
> +        /* Only construct nodes with at least one vcpu. */
> +        if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) {
> +            if (parse_vnuma_mem(config, &b_info) ||
> +                parse_vnuma_distance(config, &b_info) ||
> +                parse_vnuma_vcpumap(config, &b_info) ||
> +                parse_vnuma_vnodemap(config, &b_info))
> +                goto bad_vnuma_config;
> +        }
> +        else if (vnuma_default_config(b_info))
> +            goto bad_vnuma_config;
> +    }
> +    /* If vnuma topology is not defined for domain, init one node */
> +    else if (vnuma_default_config(b_info))
> +            goto bad_vnuma_config;
> +    return;
> +
> + bad_vnuma_config:
> +    fprintf(stderr, "Failed to parse vnuma config or set default vnuma config.\n");
> +    free_vnuma_info(b_info);
> +    exit(1);
> +}
> +
>  static void parse_config_data(const char *config_source,
>                                const char *config_data,
>                                int config_len,
> @@ -924,6 +1351,12 @@ static void parse_config_data(const char *config_source,
>  
>      libxl_defbool_set(&b_info->claim_mode, claim_mode);
>  
> +    /*
> +     * If there is no vnuma in config, "zero" vnuma config
> +     * will be initialized with one node and other defaults.
> +     */
> +    parse_vnuma_config(config, b_info);
> +
>      if (xlu_cfg_get_string (config, "on_poweroff", &buf, 0))
>          buf = "destroy";
>      if (!parse_action_on_shutdown(buf, &d_config->on_poweroff)) {
> -- 
> 1.7.10.4
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 8/9] libxl: vnuma nodes placement bits
  2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
@ 2014-09-03 15:50   ` Konrad Rzeszutek Wilk
  2014-09-03 15:52   ` Ian Campbell
  2014-09-12 16:51   ` Dario Faggioli
  2 siblings, 0 replies; 32+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-09-03 15:50 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, Sep 03, 2014 at 12:24:17AM -0400, Elena Ufimtseva wrote:
> Automatic numa placement cancels the manual vnode placement
> mechanism. If numa placement is explicitly specified, try
> to fit vnodes to the physical nodes.
> 
> Changes since v8:
>     - Added the number of ranges, vmemranges, for future support of
>       multi-range nodes; for a PV guest it is set to the same value
>       as the number of vNUMA nodes.
> 

Three comments below. The errno values should use the standard Exxx
constants (EINVAL, EAGAIN, etc), while the return value should be ERROR_INVAL,
ERROR_BADFAIL, ERROR_FAIL, etc (see libxl_types.idl).
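
e.g. the usual pattern would be (sketch, using one of the checks from
this patch):

    if (b_info->vnodes == 0) {
        errno = EINVAL;
        return ERROR_INVAL;
    }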


> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  tools/libxl/libxl_create.c |    1 +
>  tools/libxl/libxl_dom.c    |  204 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 205 insertions(+)
> 
> diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
> index fc332ef..a4977d2 100644
> --- a/tools/libxl/libxl_create.c
> +++ b/tools/libxl/libxl_create.c
> @@ -203,6 +203,7 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
>      }
>  
>      libxl_defbool_setdefault(&b_info->numa_placement, true);
> +    libxl_defbool_setdefault(&b_info->vnuma_autoplacement, true);
>  
>      if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT)
>          b_info->max_memkb = 32 * 1024;
> diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
> index c944804..b1376c4 100644
> --- a/tools/libxl/libxl_dom.c
> +++ b/tools/libxl/libxl_dom.c
> @@ -23,6 +23,7 @@
>  #include <xc_dom.h>
>  #include <xen/hvm/hvm_info_table.h>
>  #include <xen/hvm/hvm_xs_strings.h>
> +#include <libxl_vnuma.h>
>  
>  libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
>  {
> @@ -227,12 +228,114 @@ static void hvm_set_conf_params(xc_interface *handle, uint32_t domid,
>                      libxl_defbool_val(info->u.hvm.nested_hvm));
>  }
>  
> +/* sets vnode_to_pnode map. */
> +static int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
> +                        libxl_domain_build_info *info)
> +{
> +    unsigned int i, n;
> +    int nr_nodes = 0;
> +    uint64_t *vnodes_mem;
> +    unsigned long long *nodes_claim = NULL;
> +    libxl_numainfo *ninfo = NULL;
> +
> +    if (info->vnuma_vnodemap == NULL) {
> +        info->vnuma_vnodemap = libxl__calloc(gc, info->vnodes,
> +                                      sizeof(*info->vnuma_vnodemap));
> +    }
> +
> +    /* default setting. */
> +    for (i = 0; i < info->vnodes; i++)
> +        info->vnuma_vnodemap[i] = LIBXC_VNUMA_NO_NODE;
> +
> +    /* Get NUMA info. */
> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> +    if (ninfo == NULL)
> +        return ERROR_FAIL;
> +    /* Nothing to see if only one NUMA node. */
> +    if (nr_nodes <= 1)
> +        return 0;
> +
> +    vnodes_mem = info->vnuma_mem;
> +    /*
> +     * TODO: change algorithm. The current one just fits the nodes
> +     * by their memory sizes. If no pnode is found, the default value
> +     * LIBXC_VNUMA_NO_NODE is used.
> +     */
> +    nodes_claim = libxl__calloc(gc, info->vnodes, sizeof(*nodes_claim));
> +    if ( !nodes_claim )

A bit odd. The rest of the patch has a different style.

> +        return ERROR_FAIL;
> +
> +    libxl_for_each_set_bit(n, info->nodemap)
> +    {
> +        for (i = 0; i < info->vnodes; i++)
> +        {
> +            unsigned long mem_sz = vnodes_mem[i] << 20;
> +            if ((nodes_claim[n] + mem_sz <= ninfo[n].free) &&
> +                 /* vnode was not set yet. */
> +                 (info->vnuma_vnodemap[i] == LIBXC_VNUMA_NO_NODE ) )
> +            {
> +                info->vnuma_vnodemap[i] = n;
> +                nodes_claim[n] += mem_sz;
> +            }
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Builds vnode memory regions from configuration info
> + * for vnuma nodes with multiple regions.
> + */
> +static int libxl__build_vnuma_ranges(libxl__gc *gc,
> +                              uint32_t domid,
> +                              /* IN: mem sizes in megabytes */
> +                              libxl_domain_build_info *b_info,
> +                              /* OUT: linux NUMA blocks addresses */
> +                              vmemrange_t **memrange)
> +{
> +    /*
> +     * For non-PV domains, contruction of the regions will
> +     * need to have its own implementation.
> +     */
> +    if (b_info->type != LIBXL_DOMAIN_TYPE_PV) {
> +        LOG(DETAIL, "vNUMA is only supported for PV guests now.\n");
> +        errno = EINVAL;
> +        return -1;
> +    }
> +
> +    if (b_info->vnodes == 0) {
> +        errno = EINVAL;
> +        return -1;

How come you return -1 here..
> +    }
> +
> +    b_info->vmemranges = b_info->vnodes;
> +
> +    *memrange = libxl__calloc(gc, b_info->vnodes,
> +                              sizeof(vmemrange_t));
> +
> +    /*
> +     * For a PV domain, along with alignment, each region's nid is
> +     * set to the corresponding vnuma node number and ignored
> +     * later during allocation.
> +     */
> +
> +    if (libxl__vnuma_align_mem(gc, domid, b_info, *memrange) < 0) {
> +        LOG(DETAIL, "Failed to align memory map.\n");
> +        errno = ERROR_INVAL;

errno = EINVAL?

> +        return ERROR_FAIL;

.. but here you return ERROR_FAIL ?
> +    }
> +
> +    return 0;
> +}
> +
>  int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>                libxl_domain_config *d_config, libxl__domain_build_state *state)
>  {
>      libxl_domain_build_info *const info = &d_config->b_info;
>      libxl_ctx *ctx = libxl__gc_owner(gc);
>      char *xs_domid, *con_domid;
> +    struct vmemrange *memrange;
>      int rc;
>  
>      if (xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus) != 0) {
> @@ -240,6 +343,20 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>          return ERROR_FAIL;
>      }
>  
> +    if (libxl__build_vnuma_ranges(gc, domid, info, &memrange) != 0) {
> +        LOG(DETAIL, "Failed to build vnuma nodes memory ranges.\n");
> +        return ERROR_FAIL;
> +
> +    }
> +
> +    /*
> +     * NUMA placement and vNUMA autoplacement handling:
> +     * If numa_placement is set to default, do not use vnode to pnode
> +     * mapping, as the automatic placement algorithm will find the best numa
> +     * nodes. If numa_placement is not used, we can try to use the domain's
> +     * vnode to pnode mask.
> +     */
> +
>      /*
>       * Check if the domain has any CPU or node affinity already. If not, try
>       * to build up the latter via automatic NUMA placement. In fact, in case
> @@ -298,7 +415,33 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>                                     NULL, &cpumap_soft);
>  
>          libxl_bitmap_dispose(&cpumap_soft);
> +
> +        /*
> +         * If vnode_to_pnode mask was defined, dont use it if we automatically
> +         * place domain on NUMA nodes, just give warning.
> +         */
> +        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
> +            LOG(INFO, "Automatic NUMA placement for domain is turned on. \
> +                vnode to physical nodes mapping will not be used.");
> +        }
> +        if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
> +            LOG(ERROR, "Failed to build vnode to pnode map\n");
> +            return ERROR_FAIL;
> +        }
> +    } else {
> +        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
> +                if (!libxl__vnodemap_is_usable(gc, info)) {
> +                    LOG(ERROR, "Defined vnode to pnode domain map cannot be used.\n");
> +                    return ERROR_FAIL;
> +                }
> +        } else {
> +            if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
> +                LOG(ERROR, "Failed to build vnode to pnode map.\n");
> +                return ERROR_FAIL;
> +            }
> +        }
>      }
> +

Spurious.
>      if (info->nodemap.size)
>          libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
>      /* As mentioned in libxl.h, vcpu_hard_array takes precedence */
> @@ -339,6 +482,22 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>          return ERROR_FAIL;
>      }
>  
> +    /*
> +     * The XEN_DOMCTL_setvnuma subop hypercall needs to know the max mem
> +     * for the domain as set by xc_domain_setmaxmem, so set vNUMA after
> +     * maxmem has been set.
> +     * memrange should contain regions if multi-region nodes are
> +     * supported. For a PV domain the regions are ignored.
> +     */
> +    if (xc_domain_setvnuma(ctx->xch, domid, info->vnodes,
> +        info->vmemranges,
> +        info->max_vcpus, memrange,
> +        info->vdistance, info->vnuma_vcpumap,
> +        info->vnuma_vnodemap) < 0) {
> +        LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Couldn't set vnuma topology");
> +        return ERROR_FAIL;
> +    }
> +
>      xs_domid = xs_read(ctx->xsh, XBT_NULL, "/tool/xenstored/domid", NULL);
>      state->store_domid = xs_domid ? atoi(xs_domid) : 0;
>      free(xs_domid);
> @@ -434,6 +593,46 @@ retry_transaction:
>      return 0;
>  }
>  
> +/*
> + * This function fills the xc_dom_image with memory sizes for later
> + * use in the domain memory allocator. No regions are filled or used
> + * in the allocator here, as each region belongs to a single node.
> + */
> +static int libxl__dom_vnuma_init(struct libxl_domain_build_info *info,
> +                                 struct xc_dom_image *dom)
> +{
> +    errno = ERROR_INVAL;


Shouldn't that be EINVAL?

> +
> +    if (info->vnodes == 0)
> +        return -1;
> +
> +    info->vmemranges = info->vnodes;
> +
> +    dom->vnode_to_pnode = (unsigned int *)malloc(
> +                            info->vnodes * sizeof(*info->vnuma_vnodemap));
> +    dom->numa_memszs = (uint64_t *)malloc(
> +                          info->vnodes * sizeof(*info->vnuma_mem));
> +
> +    errno = ERROR_FAIL;
> +    if ( dom->numa_memszs == NULL || dom->vnode_to_pnode == NULL ) {
> +        info->vnodes = 0;
> +        if (dom->vnode_to_pnode)
> +            free(dom->vnode_to_pnode);
> +        if (dom->numa_memszs)
> +            free(dom->numa_memszs);
> +        return -1;

And should this return ERROR_FAIL?

> +    }
> +
> +    memcpy(dom->numa_memszs, info->vnuma_mem,
> +            sizeof(*info->vnuma_mem) * info->vnodes);
> +    memcpy(dom->vnode_to_pnode, info->vnuma_vnodemap,
> +            sizeof(*info->vnuma_vnodemap) * info->vnodes);
> +
> +    dom->vnodes = info->vnodes;
> +
> +    return 0;
> +}
> +
>  int libxl__build_pv(libxl__gc *gc, uint32_t domid,
>               libxl_domain_build_info *info, libxl__domain_build_state *state)
>  {
> @@ -491,6 +690,11 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
>      dom->xenstore_domid = state->store_domid;
>      dom->claim_enabled = libxl_defbool_val(info->claim_mode);
>  
> +    if ( (ret = libxl__dom_vnuma_init(info, dom)) != 0 ) {
> +        LOGE(ERROR, "Failed to set domain vnuma");
> +        goto out;
> +    }
> +
>      if ( (ret = xc_dom_boot_xen_init(dom, ctx->xch, domid)) != 0 ) {
>          LOGE(ERROR, "xc_dom_boot_xen_init failed");
>          goto out;
> -- 
> 1.7.10.4
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 8/9] libxl: vnuma nodes placement bits
  2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
  2014-09-03 15:50   ` Konrad Rzeszutek Wilk
@ 2014-09-03 15:52   ` Ian Campbell
  2014-09-12 16:51   ` Dario Faggioli
  2 siblings, 0 replies; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 15:52 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:

> +    /* default setting. */
> +    for (i = 0; i < info->vnodes; i++)
> +        info->vnuma_vnodemap[i] = LIBXC_VNUMA_NO_NODE;

What if the user had specified this?

> +static int libxl__build_vnuma_ranges(libxl__gc *gc,
> +                              uint32_t domid,
> +                              /* IN: mem sizes in megabytes */
> +                              libxl_domain_build_info *b_info,

const please.

> +                              /* OUT: linux NUMA blocks addresses */

More Linux specifics.

> +                              vmemrange_t **memrange)
> +{
> +    /*
> +     * For non-PV domains, contruction of the regions will

"construction"

> +    b_info->vmemranges = b_info->vnodes;

What if the user had specified vmemranges themselves?

Actually, I'm becoming increasingly confused about the fields in the
libxl interface, so I think I'm going to go back to patch #5 to clarify
that before I try to understand any more of this code.

Ian.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declararion Elena Ufimtseva
  2014-09-03 15:03   ` Ian Campbell
@ 2014-09-03 16:04   ` Ian Campbell
  2014-09-04  3:48     ` Elena Ufimtseva
  1 sibling, 1 reply; 32+ messages in thread
From: Ian Campbell @ 2014-09-03 16:04 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, stefano.stabellini, george.dunlap, msw, dario.faggioli,
	lccycc123, ian.jackson, xen-devel, JBeulich

On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> Adds vnuma topology types declarations to libxl_domain_build_info
> structure.

So, as mentioned I was becoming increasingly confused about what all
these fields represent. So let's stop and take stock.

> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  tools/libxl/libxl_types.idl |    8 +++++++-
>  tools/libxl/libxl_vnuma.h   |   16 ++++++++++++++++
>  2 files changed, 23 insertions(+), 1 deletion(-)
>  create mode 100644 tools/libxl/libxl_vnuma.h
> 
> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
> index 08a7927..ea8bac0 100644
> --- a/tools/libxl/libxl_types.idl
> +++ b/tools/libxl/libxl_types.idl
> @@ -333,7 +333,13 @@ libxl_domain_build_info = Struct("domain_build_info",[
>      ("disable_migrate", libxl_defbool),
>      ("cpuid",           libxl_cpuid_policy_list),
>      ("blkdev_start",    string),
> -    
> +    ("vnodes",          uint32),

I suppose this is the number of NUMA nodes the guest should have?

> +    ("vmemranges",      uint32),

and I suppose this is the number of memory ranges? Does this ever differ
from vnodes?

> +    ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),

The size of each memory node?

When/how can num_vnuma_mem differ from vmemranges?

> +    ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),

Is this supposed to map numa node to a bitmap of vcpus? If yes:

When/how can num_vnuma_vcpumap differ from vnodes?

What happens if a guest has 33 vcpus?

Or maybe the output of this is a single vcpu number?

> +    ("vdistance",       Array(uint32, "num_vdistance")),

nodes to distances? Can num_vdistance ever differ from vnodes?

I thought distances were between two nodes, in which case doesn't this
need to be multidimensional?

> +    ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),

I've run out of guesses for what this might be. vnode to pnode perhaps?
You appear to unconditionally overwrite whatever the users might have
put here.

> +    ("vnuma_autoplacement",  libxl_defbool),

Why/how does this differ from the existing numa placement option?

Ian.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-03 16:04   ` Ian Campbell
@ 2014-09-04  3:48     ` Elena Ufimtseva
       [not found]       ` <1409826927.15057.22.camel@kazak.uk.xensource.com>
  0 siblings, 1 reply; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-04  3:48 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Keir Fraser, Stefano Stabellini, George Dunlap, Matt Wilson,
	Dario Faggioli, Li Yechen, Ian Jackson, xen-devel, Jan Beulich

On Wed, Sep 3, 2014 at 12:04 PM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
>> Adds vnuma topology types declarations to libxl_domain_build_info
>> structure.
>
> So, as mentioned I was becoming increasingly confused about what all
> these fields represent. So lets stop and take stock.

That is not good )
I guess I did not correctly understand the new convention you
mentioned in your previous review.
But I know it needs to be re-worked, so let me try to answer.

>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
>> ---
>>  tools/libxl/libxl_types.idl |    8 +++++++-
>>  tools/libxl/libxl_vnuma.h   |   16 ++++++++++++++++
>>  2 files changed, 23 insertions(+), 1 deletion(-)
>>  create mode 100644 tools/libxl/libxl_vnuma.h
>>
>> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
>> index 08a7927..ea8bac0 100644
>> --- a/tools/libxl/libxl_types.idl
>> +++ b/tools/libxl/libxl_types.idl
>> @@ -333,7 +333,13 @@ libxl_domain_build_info = Struct("domain_build_info",[
>>      ("disable_migrate", libxl_defbool),
>>      ("cpuid",           libxl_cpuid_policy_list),
>>      ("blkdev_start",    string),
>> -
>> +    ("vnodes",          uint32),
>
> I suppose this is the number of NUMA nodes the guest should have?

Yes.

>
>> +    ("vmemranges",      uint32),
>
> and I suppose this is a number of memory ranges? Does this differ ever
> from vnodes?

Yes, it can be different, as one node can have more than one memory range.
Parsing does not take this into account, and further down the tool stack
there is no pretty way to work around this (see at the end).

>
>> +    ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
>
> The size of each memory node?

Yes.
>
> When/how can num_vnuma_mem differ from vmemranges?

Well, I have not worked on it yet, as vmemranges are not supported yet,
but they will be needed for multi-range memory nodes.
But I agree, it does not look right to me either.

>
>> +    ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
>
> Is this supposed to map numa node to a bitmap of vcpus? If yes:

It is a map, but not a bitmap of vcpus. It is an array of nr_vcpus elements;
each element is the node number.
>
> When/how can num_vnuma_vcpumap differ from vnodes?

num_vnuma_vcpumap is the number of vcpus in the map from the config, or
by default the number of vcpus of the domain.

>
> What happens if a guest has 33 vcpus?
>
> Or maybe the output of this is a single vcpu number?
>
>> +    ("vdistance",       Array(uint32, "num_vdistance")),
>
> nodes to distances? Can num_vdistance ever differ from vnodes?

num_vdistance is the square of vnodes (vnodes * vnodes).
>
> I thought distances were between two nodes, in which case doesn't this
> need to be multidimensional?

Yes, but the parser just fills it in row-major order, treating it as if it
were a multidimensional array.
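
For example, with 2 vnodes and distances 10 (local) / 20 (remote), the flat
array ends up laid out like this (just an illustrative sketch of the layout,
not code from the patch):

    /* Sketch only: vnodes = 2, local distance 10, remote distance 20. */
    unsigned int vdistance[2 * 2] = {
        10, 20,    /* row 0: vnode 0 -> vnode 0, vnode 0 -> vnode 1 */
        20, 10,    /* row 1: vnode 1 -> vnode 0, vnode 1 -> vnode 1 */
    };
    /* distance from vnode i to vnode j is vdistance[i * vnodes + j] */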

>
>> +    ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
>
> I've run out of guesses for what this might be. vnode to pnode perhaps?

Yes, it is the vnode to pnode map. I guess my changing the field names
brought a lot of confusion.


> You appear to unconditionally overwrite whatever the users might have
> put here.
>
>> +    ("vnuma_autoplacement",  libxl_defbool),
>
> Why/how does this differ from the existing numa placement option?


Basically, it means: try to build the vnode to pnode mapping if automatic
vnuma placement is enabled, or try to use the user-defined one.
But I am thinking maybe you are right, and there is no need for that
vnuma_autoplacement at all.
Just remove it, and if numa placement is in automatic mode, just
use it. If not, try to use the vnode to pnode mapping.

Now I think that is what Dario was talking about for some time )


Ok, now going back to these types.

  ("vnodes",          uint32),
  ("vmemranges",      uint32),
  ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
  ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
  ("vdistance",       Array(uint32, "num_vdistance")),
  ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),


vnodes -  number of vnuma nodes;
vmemranges - number of vmemranges (not used, but added support in
Xen); Not sure how to deal with this. Should I just remove it as its
not used by tool stack yet?

vnuma_mem - vnode memory sizes (conflicts now with vmemranges);
vnuma_vcpumap - map of vnuma nodes and vcpus (length is number of
vcpus, each element is vnuma node number);
vdistance - distances, num_vdistance = vnodes * vnodes;
vnuma_vnodemap - mapping of vnuma nodes to physical NUMA nodes;

And vnuma_autoplacement probably not needed here at all.

About vmemranges: they will be needed for vnuma nodes that have
multiple memory ranges.
I added it to the type definition to specify support when calling the
hypercall. As of now, a PV domain has one memory range per node.

There is a libxl__build_vnuma_ranges function which, for PV domains,
basically converts vnuma memory node sizes into ranges.
But for a PV domain there is only one range per vnode.

Maybe there is a better way of doing this, maybe even removing
vmemranges, which are confusing at this point?
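
Roughly, the idea for PV (one range per vnode) can be sketched like this;
the names are simplified and made up for illustration, it is not the actual
libxl code, and it ignores the e820 holes the real function has to subtract:

    #include <stdint.h>

    struct sketch_vmemrange {
        uint64_t start, end;    /* byte addresses of the range */
        unsigned int nid;       /* vnode the range belongs to */
    };

    /* Turn per-vnode sizes (in MB) into contiguous [start, end) ranges. */
    static void sketch_build_ranges(const uint64_t *vnuma_mem_mb,
                                    unsigned int vnodes,
                                    struct sketch_vmemrange *out)
    {
        uint64_t next = 0;
        unsigned int i;

        for (i = 0; i < vnodes; i++) {
            out[i].start = next;
            out[i].end = next + (vnuma_mem_mb[i] << 20);
            out[i].nid = i;
            next = out[i].end;
        }
    }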


Elena
>
> Ian.
>



-- 
Elena

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 6/9] libxl: build numa nodes memory blocks
  2014-09-03 15:21   ` Ian Campbell
@ 2014-09-04  4:47     ` Elena Ufimtseva
  2014-09-05  3:50     ` Elena Ufimtseva
  1 sibling, 0 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-04  4:47 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Keir Fraser, Stefano Stabellini, George Dunlap, Matt Wilson,
	Dario Faggioli, Li Yechen, Ian Jackson, xen-devel, Jan Beulich

On Wed, Sep 3, 2014 at 11:21 AM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
>> Create the vmemrange structure based on the
>> PV guests E820 map. Values are in in Megabytes.
>> Also export the E820 filter code e820_sanitize
>> out to be available internally.
>> As Xen can support mutiranges vNUMA nodes, for
>> PV guest it is one range per one node. The other
>> domain types should have their building routines
>> implemented.
>>
>> Changes since v8:
>>     - Added setting of the vnuma memory ranges node ids;
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
>> ---
>>  tools/libxl/libxl_internal.h |    9 ++
>>  tools/libxl/libxl_numa.c     |  201 ++++++++++++++++++++++++++++++++++++++++++
>>  tools/libxl/libxl_x86.c      |    3 +-
>>  3 files changed, 212 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
>> index beb052e..63ccb5e 100644
>> --- a/tools/libxl/libxl_internal.h
>> +++ b/tools/libxl/libxl_internal.h
>> @@ -3088,6 +3088,15 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
>>      libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
>>  }
>>
>> +bool libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info);
>> +
>> +int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], uint32_t *nr_entries,
>> +                  unsigned long map_limitkb, unsigned long balloon_kb);
>
> This is x86 specific, so simply exposing it to common libxl code as
> you've done will break on arm.
>
> I'm not sure if this means moving the whole thing to libxl_x86 and
> adding an ARM stub or if there is some common stuff from which the x86
> stuff needs to be abstracted.
>
>> +
>> +int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
>> +                           struct libxl_domain_build_info *b_info,
>> +                           vmemrange_t *memblks);
>> +
>>  _hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
>>                                     const libxl_ms_vm_genid *id);
>>
>> diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
>> index 94ca4fe..c416faf 100644
>> --- a/tools/libxl/libxl_numa.c
>> +++ b/tools/libxl/libxl_numa.c
>> @@ -19,6 +19,10 @@
>>
>>  #include "libxl_internal.h"
>>
>> +#include "libxl_vnuma.h"
>> +
>> +#include "xc_private.h"
>> +
>>  /*
>>   * What follows are helpers for generating all the k-combinations
>>   * without repetitions of a set S with n elements in it. Formally
>> @@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
>>  }
>>
>>  /*
>> + * Check if we can fit vnuma nodes to numa pnodes
>> + * from vnode_to_pnode array.
>> + */
>> +bool libxl__vnodemap_is_usable(libxl__gc *gc,
>> +                            libxl_domain_build_info *info)
>> +{
>> +    unsigned int i;
>> +    libxl_numainfo *ninfo = NULL;
>> +    unsigned long long *claim;
>> +    unsigned int node;
>> +    uint64_t *sz_array;
>
> I don't think you set this, so please make it const so as to make sure.
>
>> +    int nr_nodes = 0;
>> +
>> +    /* Cannot use specified mapping if not NUMA machine. */
>> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
>> +    if (ninfo == NULL)
>> +        return false;
>> +
>> +    sz_array = info->vnuma_mem;
>> +    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
>> +    /* Get total memory required on each physical node. */
>> +    for (i = 0; i < info->vnodes; i++)
>> +    {
>> +        node = info->vnuma_vnodemap[i];
>> +
>> +        if (node < nr_nodes)
>> +            claim[node] += (sz_array[i] << 20);
>> +        else
>> +            goto vnodemapout;
>> +   }
>> +   for (i = 0; i < nr_nodes; i++) {
>> +       if (claim[i] > ninfo[i].free)
>> +          /* Cannot complete user request, falling to default. */
>> +          goto vnodemapout;
>> +   }
>> +
>> + vnodemapout:
>> +   return true;
>
> Apart from non-NUMA systems, you always return true. Is that really
> correct right? I'd expect one or the other "goto vnodemapout" to want to
> return false.
>
> "out" is OK as a label name.
>
>
>> +
>> +/*
>> + * For each node, build memory block start and end addresses.
>> + * Substract any memory hole from the range found in e820 map.
>> + * vnode memory size are passed here in megabytes, the result is
>> + * in memory block addresses.
>> + * Linux kernel will adjust numa memory block sizes on its own.
>> + * But we want to provide to the kernel numa block addresses that
>> + * will be the same in kernel and hypervisor.
>
> You shouldn't be making these sorts of guest OS assumptions. This is not
> only Linux specific but also (presumably) specific to the version of
> Linux you happened to be looking at.
>
> Please define the semantics wrt the interface and guarantees which Xen
> wants to provide to all PV guests, not just one particular kernel/
>
>> + */
>> +int libxl__vnuma_align_mem(libxl__gc *gc,
>
> Is "align" in the name here accurate? The doc comment says "build"
> instead, which suggests it is creating things (perhaps aligned as it
> goes).
>
>> +                            uint32_t domid,
>> +                            /* IN: mem sizes in megabytes */
>> +                            libxl_domain_build_info *b_info,
>
> if it is input only then please make it const.
>
>> +                            /* OUT: linux NUMA blocks addresses */
>
> Again -- Linux specific.
>
>> +    libxl_ctx *ctx = libxl__gc_owner(gc);
>
> You can use CTX instead of creating this local thing.
>
> Ian.
>

Thanks Ian for reviewing.
I will take a closer look at your comments tomorrow and reply.

-- 
Elena

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 6/9] libxl: build numa nodes memory blocks
  2014-09-03 15:21   ` Ian Campbell
  2014-09-04  4:47     ` Elena Ufimtseva
@ 2014-09-05  3:50     ` Elena Ufimtseva
  1 sibling, 0 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-05  3:50 UTC (permalink / raw)
  To: Ian Campbell, Wei Liu
  Cc: Keir Fraser, Stefano Stabellini, George Dunlap, Matt Wilson,
	Dario Faggioli, Li Yechen, Ian Jackson, xen-devel, Jan Beulich

On Wed, Sep 3, 2014 at 11:21 AM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
>> Create the vmemrange structure based on the
>> PV guests E820 map. Values are in in Megabytes.
>> Also export the E820 filter code e820_sanitize
>> out to be available internally.
>> As Xen can support mutiranges vNUMA nodes, for
>> PV guest it is one range per one node. The other
>> domain types should have their building routines
>> implemented.
>>
>> Changes since v8:
>>     - Added setting of the vnuma memory ranges node ids;
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
>> ---
>>  tools/libxl/libxl_internal.h |    9 ++
>>  tools/libxl/libxl_numa.c     |  201 ++++++++++++++++++++++++++++++++++++++++++
>>  tools/libxl/libxl_x86.c      |    3 +-
>>  3 files changed, 212 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
>> index beb052e..63ccb5e 100644
>> --- a/tools/libxl/libxl_internal.h
>> +++ b/tools/libxl/libxl_internal.h
>> @@ -3088,6 +3088,15 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
>>      libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
>>  }
>>
>> +bool libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info);
>> +
>> +int e820_sanitize(libxl_ctx *ctx, struct e820entry src[], uint32_t *nr_entries,
>> +                  unsigned long map_limitkb, unsigned long balloon_kb);
>
> This is x86 specific, so simply exposing it to common libxl code as
> you've done will break on arm.
>
> I'm not sure if this means moving the whole thing to libxl_x86 and
> adding an ARM stub or if there is some common stuff from which the x86
> stuff needs to be abstracted.
>
>> +
>> +int libxl__vnuma_align_mem(libxl__gc *gc, uint32_t domid,
>> +                           struct libxl_domain_build_info *b_info,
>> +                           vmemrange_t *memblks);
>> +
>>  _hidden int libxl__ms_vm_genid_set(libxl__gc *gc, uint32_t domid,
>>                                     const libxl_ms_vm_genid *id);
>>
>> diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
>> index 94ca4fe..c416faf 100644
>> --- a/tools/libxl/libxl_numa.c
>> +++ b/tools/libxl/libxl_numa.c
>> @@ -19,6 +19,10 @@
>>
>>  #include "libxl_internal.h"
>>
>> +#include "libxl_vnuma.h"
>> +
>> +#include "xc_private.h"
>> +
>>  /*
>>   * What follows are helpers for generating all the k-combinations
>>   * without repetitions of a set S with n elements in it. Formally
>> @@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
>>  }
>>
>>  /*
>> + * Check if we can fit vnuma nodes to numa pnodes
>> + * from vnode_to_pnode array.
>> + */
>> +bool libxl__vnodemap_is_usable(libxl__gc *gc,
>> +                            libxl_domain_build_info *info)
>> +{
>> +    unsigned int i;
>> +    libxl_numainfo *ninfo = NULL;
>> +    unsigned long long *claim;
>> +    unsigned int node;
>> +    uint64_t *sz_array;
>
> I don't think you set this, so please make it const so as to make sure.
>
>> +    int nr_nodes = 0;
>> +
>> +    /* Cannot use specified mapping if not NUMA machine. */
>> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
>> +    if (ninfo == NULL)
>> +        return false;
>> +
>> +    sz_array = info->vnuma_mem;
>> +    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
>> +    /* Get total memory required on each physical node. */
>> +    for (i = 0; i < info->vnodes; i++)
>> +    {
>> +        node = info->vnuma_vnodemap[i];
>> +
>> +        if (node < nr_nodes)
>> +            claim[node] += (sz_array[i] << 20);
>> +        else
>> +            goto vnodemapout;
>> +   }
>> +   for (i = 0; i < nr_nodes; i++) {
>> +       if (claim[i] > ninfo[i].free)
>> +          /* Cannot complete user request, falling to default. */
>> +          goto vnodemapout;
>> +   }
>> +
>> + vnodemapout:
>> +   return true;
>
> Apart from non-NUMA systems, you always return true. Is that really
> correct right? I'd expect one or the other "goto vnodemapout" to want to
> return false.
>
> "out" is OK as a label name.
>
>
>> +
>> +/*
>> + * For each node, build memory block start and end addresses.
>> + * Substract any memory hole from the range found in e820 map.
>> + * vnode memory size are passed here in megabytes, the result is
>> + * in memory block addresses.
>> + * Linux kernel will adjust numa memory block sizes on its own.
>> + * But we want to provide to the kernel numa block addresses that
>> + * will be the same in kernel and hypervisor.
>
> You shouldn't be making these sorts of guest OS assumptions. This is not
> only Linux specific but also (presumably) specific to the version of
> Linux you happened to be looking at.
>
> Please define the semantics wrt the interface and guarantees which Xen
> wants to provide to all PV guests, not just one particular kernel/
>
>> + */
>> +int libxl__vnuma_align_mem(libxl__gc *gc,
>
> Is "align" in the name here accurate? The doc comment says "build"
> instead, which suggests it is creating things (perhaps aligned as it
> goes).
>
>> +                            uint32_t domid,
>> +                            /* IN: mem sizes in megabytes */
>> +                            libxl_domain_build_info *b_info,
>
> if it is input only then please make it const.
>
>> +                            /* OUT: linux NUMA blocks addresses */
>
> Again -- Linux specific.
>
>> +    libxl_ctx *ctx = libxl__gc_owner(gc);
>
> You can use CTX instead of creating this local thing.
>
> Ian.
>

+ Wei

Looks like it will take a bit more time to make sure this part is in
good shape.
I will poke around to see how it can be done.

Wei, maybe you have some ideas?

-- 
Elena

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 5/9] libxl: vnuma types declararion
       [not found]       ` <1409826927.15057.22.camel@kazak.uk.xensource.com>
@ 2014-09-08 16:13         ` Dario Faggioli
  2014-09-08 19:47           ` Elena Ufimtseva
  2014-09-16  6:17         ` Elena Ufimtseva
  1 sibling, 1 reply; 32+ messages in thread
From: Dario Faggioli @ 2014-09-08 16:13 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Keir Fraser, Elena Ufimtseva, Stefano Stabellini, George Dunlap,
	Matt Wilson, Li Yechen, Ian Jackson, xen-devel, Jan Beulich


On gio, 2014-09-04 at 11:35 +0100, Ian Campbell wrote:
> On Wed, 2014-09-03 at 23:48 -0400, Elena Ufimtseva wrote:
> >
> > Ok, now going back to these types.
> > 
> >   ("vnodes",          uint32),
> >   ("vmemranges",      uint32),
> >   ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
> >   ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
> >   ("vdistance",       Array(uint32, "num_vdistance")),
> >   ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
> > 
> > 
> > vnodes -  number of vnuma nodes;
> > vmemranges - number of vmemranges (not used, but added support in
> > Xen); Not sure how to deal with this. Should I just remove it as its
> > not used by tool stack yet?
> 
> I think that would be best, otherwise we risk tying ourselves to an API
> which might not actually work when we come to try and use it.
> 
I agree. FTR, this is mostly mine and other h/v folks noticing that it
would have been nice to have multirange support very late. Elena adapted
the hypervisor patches quite quickly, so, thanks Elena! :-)

For the tools part, I also think it'd be best to keep this out for now.

> > vnuma_mem - vnode memory sizes (conflicts now with vmemranges);
> > vnuma_vcpumap - map of vnuma nodes and vcpus (length is number of
> > vcpus, each element is vnuma node number);
> > vdistance - distances, num_vdistance = vnodes * vnodes;
> > vnuma_vnodemap - mapping of vnuma nodes to physical NUMA nodes;
> > 
> > And vnuma_autoplacement probably not needed here at all.
> > 
I *REALLY* don't think this is necessary.

> Then I would take the rest and wrap them in a libxl_vnode_info struct:
> 
> libxl_vnode_info = Struct("libxl_vnode_info", [
>     ("mem", MemKB), # Size of this node's memory
>     ("distances", Array(...)), # Distances from this node to the others (single dimensional array)
>     ("pnode", TYPE), # which pnode this vnode is associated with
> ])
> 
> Then in the (b|c)_config
>      ("vnuma_nodes", Array(libxl_vnode_info...))
> 
> This avoids the issue of having to have various num_* which are expected
> to always be identical. Except for vnuma_nodes[N].num_distances, but I
> think that is unavoidable.
> 
My +1 to this way of arranging the interface.

> The only thing I wasn't sure of was how to fit the vcpumap into this.
> AIUI this is a map from vcpu number to vnode number (i.e. for each vcpu
> it tells you which vnuma node it is part of). 
>
Correct.

> I presume it isn't
> possible/desirable to have a vcpu be part of multiple vnuma nodes.
>
> One possibility would be to have a libxl_bitmap/cpumap as part of the
> libxl_vnode_info (containing the set of vcpus which are part of this
> node) but that would be prone to allowing a vcpu to be in multiple
> nodes.
> 
Not ideal, but not too bad from my point of view.

> Another possibility would be an array in (b|c)_config as you have now,
> which would be annoying because it's num_foo really ought to be the
> ->vcpus count and the IDL doesn't really allow that.
> 
> I'm not sure what is the lesser of the two evils here.
> 
Sure, both are not ideal. I think I like the former, the one using
libxl_cpumap-s, better. It looks more consistent to me, and it fit
nicely with the array of struct way of restructuring the API Ian
described above.

It allows the user to ask for a vcpu to be part of more than one node,
yes, but identifying and at least warning about this, or even sanity
checking things, should not be impossible (a bit expensive, perhaps, but
probably not to much for a toolstack level check).
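
Something along these lines would do for the check; this is only a rough
sketch using plain bit masks (so up to 64 vcpus) and made-up names, the
real code would of course use libxl_bitmap:

    #include <stdint.h>
    #include <stdbool.h>

    /*
     * node_vcpus[i] is the mask of vcpus assigned to vnode i.
     * Return false if some vcpu is in two nodes, or in no node at all.
     */
    static bool vcpu_masks_are_sane(const uint64_t *node_vcpus,
                                    unsigned int vnodes,
                                    unsigned int max_vcpus)
    {
        uint64_t seen = 0, all;
        unsigned int i;

        for (i = 0; i < vnodes; i++) {
            if (node_vcpus[i] & seen)
                return false;          /* vcpu in more than one node */
            seen |= node_vcpus[i];
        }
        all = max_vcpus == 64 ? ~UINT64_C(0)
                              : (UINT64_C(1) << max_vcpus) - 1;
        return seen == all;            /* every vcpu covered exactly once */
    }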

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-08 16:13         ` Dario Faggioli
@ 2014-09-08 19:47           ` Elena Ufimtseva
  0 siblings, 0 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-08 19:47 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Li Yechen, Ian Jackson, xen-devel, Jan Beulich

On Mon, Sep 8, 2014 at 12:13 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On gio, 2014-09-04 at 11:35 +0100, Ian Campbell wrote:
>> On Wed, 2014-09-03 at 23:48 -0400, Elena Ufimtseva wrote:
>> >
>> > Ok, now going back to these types.
>> >
>> >   ("vnodes",          uint32),
>> >   ("vmemranges",      uint32),
>> >   ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
>> >   ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
>> >   ("vdistance",       Array(uint32, "num_vdistance")),
>> >   ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
>> >
>> >
>> > vnodes -  number of vnuma nodes;
>> > vmemranges - number of vmemranges (not used, but added support in
>> > Xen); Not sure how to deal with this. Should I just remove it as its
>> > not used by tool stack yet?
>>
>> I think that would be best, otherwise we risk tying ourselves to an API
>> which might not actually work when we come to try and use it.
>>
> I agree. FTR, this is mostly mine and other h/v folks noticing that it
> would have been nice to have multirange support very late. Elena adapted
> the hypervisor patches quite quickly, so, thanks Elena! :-)

Thanks Dario!

>
> For the tools part, I also think it'd be best to keep this out for now.
>
>> > vnuma_mem - vnode memory sizes (conflicts now with vmemranges);
>> > vnuma_vcpumap - map of vnuma nodes and vcpus (length is number of
>> > vcpus, each element is vnuma node number);
>> > vdistance - distances, num_vdistance = vnodes * vnodes;
>> > vnuma_vnodemap - mapping of vnuma nodes to physical NUMA nodes;
>> >
>> > And vnuma_autoplacement probably not needed here at all.
>> >
> I *REALLY* don't think this is necessary.

Yes, I already removed that.

>
>> Then I would take the rest and wrap them in a libxl_vnode_info struct:
>>
>> libxl_vnode_info = Struct("libxl_vnode_info", [
>>     ("mem", MemKB), # Size of this node's memory
>>     ("distances", Array(...)), # Distances from this node to the others (single dimensional array)
>>     ("pnode", TYPE), # which pnode this vnode is associated with
>> ])
>>
>> Then in the (b|c)_config
>>      ("vnuma_nodes", Array(libxl_vnode_info...))
>>
>> This avoids the issue of having to have various num_* which are expected
>> to always be identical. Except for vnuma_nodes[N].num_distances, but I
>> think that is unavoidable.
>>
> My +1 to this way of arranging the interface.
>
>> The only thing I wasn't sure of was how to fit the vcpumap into this.
>> AIUI this is a map from vcpu number to vnode number (i.e. for each vcpu
>> it tells you which vnuma node it is part of).
>>
> Correct.
>
>> I presume it isn't
>> possible/desirable to have a vcpu be part of multiple vnuma nodes.
>>
>> One possibility would be to have a libxl_bitmap/cpumap as part of the
>> libxl_vnode_info (containing the set of vcpus which are part of this
>> node) but that would be prone to allowing a vcpu to be in multiple
>> nodes.
>>
> Not ideal, but not too bad from my point of view.
>
>> Another possibility would be an array in (b|c)_config as you have now,
>> which would be annoying because it's num_foo really ought to be the
>> ->vcpus count and the IDL doesn't really allow that.
>>
>> I'm not sure what is the lesser of the two evils here.
>>
> Sure, both are not ideal. I think I like the former, the one using
> libxl_cpumap-s, better. It looks more consistent to me, and it fit
> nicely with the array of struct way of restructuring the API Ian
> described above.
>
> It allows the user to ask for a vcpu to be part of more than one node,
> yes, but identifying and at least warning about this, or even sanity
> checking things, should not be impossible (a bit expensive, perhaps, but
> probably not to much for a toolstack level check).

Thank you Dario. I will work on this and on re-arranging libxl vnuma structure.

>
> Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>



-- 
Elena

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
  2014-09-03 15:42   ` Konrad Rzeszutek Wilk
@ 2014-09-11 17:13   ` Dario Faggioli
  2014-09-12  2:04     ` Elena Ufimtseva
  1 sibling, 1 reply; 32+ messages in thread
From: Dario Faggioli @ 2014-09-11 17:13 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	lccycc123, ian.jackson, xen-devel, JBeulich


So, as far as I understood it, v11 did not include a toolstack part, so
this version is what I should be looking at, is this correct?

I'm also looking at the commits in the github repo (for v11), and the
code seems to be the same as here.

Please, do correct me if I'm wrong. Hopefully, the comments I'm leaving
about this version, will be useful anyway.

Also, as we agreed upon changing the libxl data structures for this
feature (i.e., the vNUMA related field names and types), the review will
focus on the behavior of the xl parser (and on the structure of the
parsing code, of course), and neglect that part.

So, first of all, about the subject: 'libxl: vnuma...'. This patch looks
more about xl than libxl (so I'd expect something like 'xl: vnuma...').

On mer, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> --- a/docs/man/xl.cfg.pod.5
> +++ b/docs/man/xl.cfg.pod.5
> @@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
>  A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
>  it will crash.
>
In general, about the config file interface, what I think we want
a user to be able to specify is:

 1. number of virtual NUMA nodes
    (default: 1)
 2. amount of memory on each virtual NUMA node
    (default: total tot_memory / nr_nodes)
 3. the 'distances' from each node to each other one
    (default: something sane :-) )
 4. what vcpus on each virtual NUMA node
    (default: distribute evenly)
 5. virtual node to physical node mapping
    (default: driven by the existing automatic placement logic, or
    by pinning)

Which is pretty much what this patch allows to do, so, good! :-D

The syntax used for each of these parameters is also mostly fine, IMO,
with the only exception of the mapping between vCPUs and vNODEs (more on
that below).

About the name of the parameters, in this patch, we have:

 1. "vnuma_nodes"
 2. "vnuma_mem"
 3. "vdistance"
 4. "vnuma_cpumap"
 5. "vnuma_nodemap"

I have to admit, I don't especially like these, so here's my proposal.
I'm of course open to other views here, I'm actually just tossing names
there (with a brief rationale and explanation)... :-)

 1. "vnodes": for vCPUs, we have "vcpus", I think something along the 
              same line would make sense;
 2. "vnodes_memory" : same reasoning as above (BTW, we may want to go
                      for something like "vnodes_maxmem", as that is
                      probably what we're actually specifying, isn't it?
                      I'm not sure how important it is to stress that,
                      though);
 3. "vnodes_distance[s]": or just "vnodes_dist[s]"
 4. "vcpus_to_vnodes"
 5. "vnodes_to_pnodes": not particularly proud of this, but can't think
                        of anything better right now.
 
Thoughts?

> +=item B<vnuma_nodes=N>
> +
> +Number of vNUMA nodes the guest will be initialized with on boot.
> +PV guest by default will have one vnuma node.
> +
It's not only PV guests that will have, by default, just one node...
This is true for every guest (the reason being vNUMA is not even
implemented for such other guest types!! :-) ).

As it stands, it looks to me like other kind of guest than PV can have a
different number of NUMA nodes by default.

> +=item B<vnuma_mem=[vmem1, vmem2, ...]>
> +
> +List of memory sizes for each node, defined in MBytes. Number of items listed must
> +match nr_vnodes. 
>
"must match the number of virtual NUMA nodes specified by
B<vnuma_nodes=>"

> If the sum of all vnode memories does not match the domain memory
>
", specified by B<memory=> and/or B<maxmem=>,"

> +or there are missing nodes, it will fail.
> +If not specified, memory will be equally split between vnodes. Current minimum
> +memory size for one node is limited by 32MB.
> +
> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
> +Total amount of memory in guest: 6GB
> +

> +=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
> +
> +Defines vcpu to vnode mapping as a list of integers. The position in the list
> +is a vcpu number, and the value is the vnode number to which the vcpu will be
> +assigned to.
> +Current limitations:
> +- vNUMA node must have at least one vcpu, otherwise default vcpu_to_vnode will be used.
> +- Total number of vnodes cannot be bigger then number of vcpus.
> +
> +Example:
> +Map of 4 vcpus to 2 vnodes:
> +0,1 vcpu -> vnode0
> +2,3 vcpu -> vnode1:
> +
> +vnuma_vcpumap = [0, 0, 1, 1]
> + 4 vcpus here -  0  1  2  3
> +
>
I wonder whether it would be better to turn this into something like
the below:

["0,1,2,3", "4-7", "8,10,12,14", "9,11,13,15"]

With the following meaning: there are 4 nodes, and vCPUs 0,1,2,3 belongs
to node 0, vCPUs 4,5,6,7 belongs to node 1, etc.

It looks a lot easier to use to me. With the current syntax, the same
configuration would be expressed like this:

[0,0,0,0,1,1,1,1,2,3,2,3,2,3,2,3]

Quite hard to get right when both writing and reading it, isn't it?

Elena's example looks like this:

["0,1", "2,3"] (with my notation)
[0, 0, 1, 1]   (with this patch notation)

From an implementation point of view, the code for parsing strings like
these (and also for parsing lists of strings like this) is already there
in xl, so it shouldn't even be too big of a piece of work...
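
Just to illustrate the kind of parsing I mean, here is a standalone sketch
(libc only, made-up function name; the real xl code would reuse its existing
range-parsing helpers and do proper error reporting) that turns one of those
strings into entries of a vcpu-to-vnode array:

    #include <stdlib.h>
    #include <string.h>

    /* Assign the vcpus listed in 'spec' (e.g. "0,1,4-7") to 'vnode'. */
    static int assign_vcpus_to_vnode(const char *spec, unsigned int vnode,
                                     unsigned int *vcpu_to_vnode,
                                     unsigned int max_vcpus)
    {
        char *buf = strdup(spec), *tok, *saveptr = NULL;

        if (!buf)
            return -1;

        for (tok = strtok_r(buf, ",", &saveptr); tok;
             tok = strtok_r(NULL, ",", &saveptr)) {
            unsigned long a, b;
            char *ep;

            a = b = strtoul(tok, &ep, 10);
            if (*ep == '-')
                b = strtoul(ep + 1, &ep, 10);
            if (ep == tok || *ep != '\0' || a > b || b >= max_vcpus) {
                free(buf);
                return -1;             /* malformed entry */
            }
            for (; a <= b; a++)
                vcpu_to_vnode[a] = vnode;
        }
        free(buf);
        return 0;
    }

Called once per list item, with the item's position in the list as the vnode
number, that covers both the comma and the dash forms above.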

> +=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
> +
> +List of physical node numbers, position in the list represents vnode number.
> +Used for manual placement of vnuma nodes to physical NUMA nodes.
> +Will not be used if automatic numa placement is active.
> +
> +Example:
> +assume NUMA machine with 4 physical nodes. Placing vnuma node 0 to pnode 2,
> +vnuma node 1 to pnode 3:
> +vnode0 -> pnode2
> +vnode1 -> pnode3
> +
> +vnuma_vnodemap=[2, 3]
> +first vnode will be placed on node 2, second on node 3.
> +
>
I'm fine with this.

> +=item B<vnuma_autoplacement=[0|1]>
> +
To be killed. :-)

>  =head3 Event Actions
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index 409a795..1af2250 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -40,6 +40,7 @@
>  #include "libxl_json.h"
>  #include "libxlutil.h"
>  #include "xl.h"
> +#include "libxl_vnuma.h"
>  
>  /* For calls which return an errno on failure */
>  #define CHK_ERRNOVAL( call ) ({                                         \
> @@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
>      }
>  }
>  
> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
> +{
> +    const char *buf;
> +    char *ep;
> +    unsigned long ul;
> +    int rc = -EINVAL;
> +
> +    buf = xlu_cfg_get_listitem(list, i);
> +    if (!buf)
> +        return rc;
> +    ul = strtoul(buf, &ep, 10);
> +    if (ep == buf)
> +        return rc;
> +    if (ul >= UINT16_MAX)
> +        return rc;
> +    return (unsigned int)ul;
> +}
> +
I see what you want to achieve here, but I don't particularly like
having this as a helper here. If we want something this generic,
perhaps put this in xlu? Although, I'm not sure about this either...

It's probably not such a big deal, and it's a call for the tools
maintainers, I think.

> +static void vdistance_set(unsigned int *vdistance,
> +                                unsigned int nr_vnodes,
> +                                unsigned int samenode,
> +                                unsigned int othernode)
> +{
> +    unsigned int idx, slot;
> +    for (idx = 0; idx < nr_vnodes; idx++)
> +        for (slot = 0; slot < nr_vnodes; slot++)
> +            *(vdistance + slot * nr_vnodes + idx) =
> +                idx == slot ? samenode : othernode;
> +}
> +
> +static void vcputovnode_default(unsigned int *cpu_to_node,
> +                                unsigned int nr_vnodes,
> +                                unsigned int max_vcpus)
> +{
> +    unsigned int cpu;
> +    for (cpu = 0; cpu < max_vcpus; cpu++)
> +        cpu_to_node[cpu] = cpu % nr_vnodes;
> +}
> +
> +/* Split domain memory between vNUMA nodes equally. */
> +static int split_vnumamem(libxl_domain_build_info *b_info)
> +{
> +    unsigned long long vnodemem = 0;
> +    unsigned long n;
> +    unsigned int i;
> +
> +    if (b_info->vnodes == 0)
> +        return -1;
> +
This is never called (and that's ok!) when b_info->vnodes==0, so I
find the check more confusing than helpful.

An assert(), maybe?

> +    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
> +    if (vnodemem < MIN_VNODE_SIZE)
> +        return -1;
> +    /* reminder in MBytes. */
> +    n = (b_info->max_memkb >> 10) % b_info->vnodes;
> +    /* get final sizes in MBytes. */
> +    for (i = 0; i < (b_info->vnodes - 1); i++)
> +        b_info->vnuma_mem[i] = vnodemem;
> +    /* add the reminder to the last node. */
> +    b_info->vnuma_mem[i] = vnodemem + n;
> +    return 0;
> +}
> +
> +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
> +                                   unsigned int nr_vnodes)
> +{
> +    unsigned int i;
> +    for (i = 0; i < nr_vnodes; i++)
> +        vnuma_vnodemap[i] = VNUMA_NO_NODE;
> +}
> +
> +/*
> + * init vNUMA to "zero config" with one node and all other
> + * topology parameters set to default.
> + */
> +static int vnuma_default_config(libxl_domain_build_info *b_info)
> +{
> +    b_info->vnodes = 1;
> +    /* all memory goes to this one vnode, as well as vcpus. */
> +    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_mem))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
> +                                sizeof(*b_info->vnuma_vcpumap))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
> +                                b_info->vnodes, sizeof(*b_info->vdistance))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_vnodemap))))
> +        goto bad_vnumazerocfg;
> +
> +    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
> +
> +    /* all vcpus assigned to this vnode. */
> +    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
> +                        b_info->max_vcpus);
> +
> +    /* default vdistance is 10. */
> +    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
> +
> +    /* VNUMA_NO_NODE for vnode_to_pnode. */
> +    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
> +
> +    /*
> +     * will be placed to some physical nodes defined by automatic
> +     * numa placement or VNUMA_NO_NODE will not request exact node.
> +     */
> +    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
> +    return 0;
> +
> + bad_vnumazerocfg:
> +    return -1;
> +}
> +
Can't this, and with it most of the above helpers, be implemented in
such a way that it fits inside libxl__domain_build_info_setdefault()
(and, if yes, moved to a libxl-related patch)?

I'm not 100% sure, but it really looks similar to the kind of things we
usually do there rather than in xl.

> +static void free_vnuma_info(libxl_domain_build_info *b_info)
> +{
> +    free(b_info->vnuma_mem);
> +    free(b_info->vdistance);
> +    free(b_info->vnuma_vcpumap);
> +    free(b_info->vnuma_vnodemap);
> +
> +    b_info->vnuma_mem = NULL;
> +    b_info->vdistance = NULL;
> +    b_info->vnuma_vcpumap = NULL;
> +    b_info->vnuma_vnodemap = NULL;
> +
> +    b_info->vnodes = 0;
> +    b_info->vmemranges = 0;
> +}
> +
This appears to be called only once, right before an "exit(1)", in xl.
Do we need it? For sure we don't need the =NULL and =0 thing, do we?

> +static int parse_vnuma_mem(XLU_Config *config,
> +                            libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
>
Why the **b_info and the *dst thing?

> +    XLU_ConfigList *vnumamemcfg;
> +    int nr_vnuma_regions, i;
> +    unsigned long long vnuma_memparsed = 0;
> +    unsigned long ul;
> +    const char *buf;
> +    char *ep;
> +
> +    dst = *b_info;
> +    if (!xlu_cfg_get_list(config, "vnuma_mem",
> +                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
> +
> +        if (nr_vnuma_regions != dst->vnodes) {
> +            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
> +                    incorrect (should be %d).\n", nr_vnuma_regions,
> +                    dst->vnodes);
>
I find the term 'regions' rather confusing, both in the variable name
and in the error message. I'd go for something like nr_vnuma_mem, for
the variable. For the message, maybe something like this:

 "Specified the amount of memory for %d virtual nodes,\n"
 "while number of virtual node is %d!\n"

But maybe it's just me. :-)

Oh, the "(vnumamem = %d)" part, at least, makes very few sense to me, as
I don't thing the word 'vnumamem', written like that and associated with
an integer, makes much sense to an user. At least that should change, I
think.

> +            goto bad_vnuma_mem;
> +        }
> +
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                 sizeof(*dst->vnuma_mem));
>
If you go for the libxl__build_info_setdefault() route I suggested
above, this may need to change a bit, as you'll probably find
b_info->vnuma_mem allocated already.

Anyway, I don't think it would be a big deal to, in that case, either
use realloc() here, or use NULL as the default value (and hence set it to
that in libxl__build_info_setdefault() ).

> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
>
Ditto, about 'ranges'.

> +            goto bad_vnuma_mem;
> +        }
> +
> +        /*
> +         * Will parse only nr_vnodes times, even if we have more/less regions.
>
How can it be that you have fewer or more entries than b_info->vnodes? 

> +         * Take care of it later if less or discard if too many regions.
>
You took care of that above already...

> +         */
> +        for (i = 0; i < dst->vnodes; i++) {
> +            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
> +            if (!buf) {
> +                fprintf(stderr,
> +                        "xl: Unable to get element %d in vnuma memory list.\n", i);
> +                goto bad_vnuma_mem;
>
The common pattern in xl seems to be to exit() right away, if this
happens (whether to exit with 1 or -1, that seems a bit inconsistent to
me, at a quick glance).

> +            }
> +
> +            ul = strtoul(buf, &ep, 10);
> +            if (ep == buf) {
> +                fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf);
> +                goto bad_vnuma_mem;
>
And the same (i.e., just exit() ) applies here... And pretty much
everywhere, actually! :-)

> +            }
> +
> +            /* 32Mb is a min size for a node, taken from Linux */
> +            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
>
What's the idea behind UINT32_MAX here?

I guess what I mean is, if there is, for any reason, a maximum virtual
node size, then it may be useful to #define it, e.g., MAX_VNODE_SIZE
(just as you're doing for MIN_VNODE_SIZE). If there is not, then we can
just live with what the type can handle, and it will be lower layers
that will return appropriate errors if values are wrong/unrealistic.

Like this, you're saying that such a maximum virtual node size is 4096 TB
(if I'm not doing the math wrong), which I don't think makes much
sense... :-P
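
I.e., something like this (sketch only; the MAX_VNODE_SIZE value is a
made-up placeholder for whatever limit we agree on):

    #define MIN_VNODE_SIZE  32             /* MB, as in the patch */
    #define MAX_VNODE_SIZE  (1024 * 1024)  /* MB; illustrative value only */

    if (ul < MIN_VNODE_SIZE || ul > MAX_VNODE_SIZE) {
        fprintf(stderr, "xl: vnode memory %lu MB is outside [%d, %d] MB.\n",
                ul, MIN_VNODE_SIZE, MAX_VNODE_SIZE);
        exit(1);
    }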

> +                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
> +                        ul, MIN_VNODE_SIZE, UINT32_MAX);
>
"xl: memory for virtual node %lu is outside of [%u, %u] MB.\n"

> +                goto bad_vnuma_mem;
> +            }
> +
> +            /* memory in MBytes */
> +            dst->vnuma_mem[i] = ul;
> +        }
> +
> +        /* Total memory for vNUMA parsed to verify */
> +        for (i = 0; i < nr_vnuma_regions; i++)
> +            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
> +
Not a big deal, but can't you accumulate while doing the parsing?

> +        /* Amount of memory for vnodes same as total? */
> +        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
> +            fprintf(stderr, "xl: vnuma memory is not the same as domain \
> +                    memory size.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    } else {
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                      sizeof(*dst->vnuma_mem));
> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> +            goto bad_vnuma_mem;
> +        }
> +
The allocation is the same as above. It can live outside of the if,
can't it? Especially considering that, as said before, you can just
exit() on failure.

> +        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
> +        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
> +                to cover %lu Kbytes.\n",
> +                dst->max_memkb / dst->vnodes, dst->max_memkb);
> +
Does this deserve a warning? In the man page, we say it's legal not to
specify this, so I may well have read that, and may be doing this on
purpose...

> +        if (split_vnumamem(dst) < 0) {
> +            fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    }
> +    return 0;
> +
> + bad_vnuma_mem:
> +    return -1;
> +}
> +
> +static int parse_vnuma_distance(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
>
Same as above.

> +    XLU_ConfigList *vdistancecfg;
> +    int nr_vdist;
> +
> +    dst = *b_info;
> +    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
> +                               sizeof(*dst->vdistance));
> +    if (dst->vdistance == NULL)
> +        goto bad_distance;
> +
> +    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
> +        int d1, d2, i;
> +        /*
> +         * First value is the same node distance, the second as the
> +         * rest of distances. The following is required right now to
>
I'm not a native English speaker, but 'The latter' sounds more correct
than 'The following' (if that is what you mean; if it's not, then I
didn't understand what you mean! :-) ).

> +         * avoid non-symmetrical distance table as it may break latest kernel.
> +         * TODO: Better way to analyze extended distance table, possibly
> +         * OS specific.
> +         */
> +
Ok. So, AFAIUI, for now, you want the list to be two elements long, is
that right? If yes, should we check that nr_vdist==2 ?

What happens if I have b_info->vnodes > 1, and I pass a list only one
element long ?

> +        for (i = 0; i < nr_vdist; i++) {
> +            d1 = get_list_item_uint(vdistancecfg, i);
> +        }
> +
Mmm... What's this?

> +        d1 = get_list_item_uint(vdistancecfg, 0);
> +        if (dst->vnodes > 1)
> +           d2 = get_list_item_uint(vdistancecfg, 1);
> +        else
> +           d2 = d1;
> +
> +        if (d1 >= 0 && d2 >= 0) {
> +            if (d1 < d2)
> +                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2);
>
I agree this looks weird, and deserves a warning.

I'd mention why that is the case, in the warning message, i.e., because
the distance from a node to _itself_ is being set to a bigger value than
the distance between two different nodes.

> +            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
> +        } else {
> +            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
> +            goto bad_distance;
>
After printing this "WARNING", you goto bad_distance, which means this
returns -1, which means we exit(1) below.

So, this really looks like an error, rather than a warning to me.

> +        }
> +    } else {
> +        fprintf(stderr, "Could not parse vnuma distances.\n");
> +        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
> +    }
> +    return 0;
> +
> + bad_distance:
> +    return -1;
>
As said before, xl often just exits on failures like these.

> +}
> +
> +static int parse_vnuma_vcpumap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
>
...

> +    XLU_ConfigList *vcpumap;
> +    int nr_vcpumap, i;
> +
> +    dst = *b_info;
> +    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
> +                                     sizeof(*dst->vnuma_vcpumap));
>
You don't need the cast.

> +    if (dst->vnuma_vcpumap == NULL)
> +        goto bad_vcpumap;
> +
exit(1);

> +    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
> +                          &vcpumap, &nr_vcpumap, 0)) {
> +        if (nr_vcpumap == dst->max_vcpus) {
>
    if (nr_vcpumap != b_info->max_vcpus) {
        fprintf(stderr, "xl: mapping of vCPUs to vNODEs specified for "
                        "%d vCPUs, while the domain has %d\n",
                nr_vcpumap, b_info->max_vcpus);
        exit(1);
    }

and then continue with one less level of indentation. :-)

> +            unsigned int  vnode, vcpumask = 0, vmask;
> +
> +            vmask = ~(~0 << nr_vcpumap);
> +            for (i = 0; i < nr_vcpumap; i++) {
> +                vnode = get_list_item_uint(vcpumap, i);
> +                if (vnode >= 0 && vnode < dst->vnodes) {
> +                    vcpumask |= (1 << i);
> +                    dst->vnuma_vcpumap[i] = vnode;
> +                }
> +            }
> +
> +            /* Did it covered all vnodes in the vcpu mask? */
> +            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        in numa_cpumask.\n");
> +                goto bad_vcpumap;
> +            }
>
As we settled on using libxl_bitmap for this, I expect this to change
quite a bit (e.g., we'd have to build a bitmap for each node, the
libxl_bitmap manipulating functions should be used, etc.).

> +        } else {
> +            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
> +            goto bad_vcpumap;
> +        }
> +    }
> +    else
> +        vcputovnode_default(dst->vnuma_vcpumap,
> +                            dst->vnodes,
> +                            dst->max_vcpus);
> +    return 0;
> +
> + bad_vcpumap:
> +    return -1;
> +}
> +
> +static int parse_vnuma_vnodemap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
No need?

> +    XLU_ConfigList *vnodemap;
> +    int nr_vnodemap, i;
> +
> +    dst = *b_info;
> +
> +    /* There is mapping to NUMA physical nodes? */
> +    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
> +                           sizeof(*dst->vnuma_vnodemap));
>
no cast.

> +    if (dst->vnuma_vnodemap == NULL)
> +        goto bad_vnodemap;
> +
> +    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",
> +                          &vnodemap, &nr_vnodemap, 0)) {
> +        /*
> +         * If not specified or incorrect, will be defined
> +         * later based on the machine architecture, configuration
> +         * and memory availble when creating domain.
> +         */
> +        libxl_defbool_set(&dst->vnuma_autoplacement, false);
>
As agreed in the other part of this thread, there is already an
automatic placement switch to act on, in this case.

Note that the way that switch is being used right now in libxl code may
need to change a bit, but that is, nevertheless, the right thing to do.
So have a look at that, please. :-)

> +        if (nr_vnodemap == dst->vnodes) {
> +            unsigned int vnodemask = 0, pnode, smask;
> +            smask = ~(~0 << dst->vnodes);
> +            for (i = 0; i < dst->vnodes; i++) {
> +                pnode = get_list_item_uint(vnodemap, i);
> +                if (pnode >= 0) {
> +                    vnodemask |= (1 << i);
> +                    dst->vnuma_vnodemap[i] = pnode;
> +                }
> +            }
> +
> +            /* Did it covered all vnodes in the mask? */
> +            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        vnuma_vnodemap.\n");
> +                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +                libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +            }
> +        }
> +        else {
> +            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
> +            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +            libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +        }
> +    }
>
This is also about to change quite substantially, so I'm avoiding
commenting on it...

> +static void parse_vnuma_config(XLU_Config *config,
> +                               libxl_domain_build_info *b_info)
> +{
> +    long l;
> +
> +    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
> +        if (l > MAX_VNUMA_NODES) {
> +            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
> +                    MAX_VNUMA_NODES);
> +            goto bad_vnuma_config;
> +        }
> +        b_info->vnodes = l;
> +
> +        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
> +                    &b_info->vnuma_autoplacement, 0))
> +            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
> +
"vnuma_autoplacement" is, AFAIUI, going away, so no comments. ;-P


Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-11 17:13   ` Dario Faggioli
@ 2014-09-12  2:04     ` Elena Ufimtseva
  2014-09-12  9:02       ` Dario Faggioli
  0 siblings, 1 reply; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-12  2:04 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Li Yechen, Ian Jackson, xen-devel, Jan Beulich

Hello Dario

Thank you for such a thorough review.
And you are right, it will be changed slightly as we agreed on the
different approach to declare vnuma in libxl :)


On Thu, Sep 11, 2014 at 1:13 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> So, as far as I understood it, v11 did not include a toolstack part, so
> this version is what I should be looking at, is this correct?
>
> I'm also looking at the commits in the github repo (for v11), and the
> code seems to be the same as here.
>
> Please, do correct me if I'm wrong. Hopefully, the comments I'm leaving
> about this version, will be useful anyway.
>
> Also, as we agreed upon changing the libxl data structures for this
> feature (i.e., the vNUMA related field names and types), the review will
> focus on the behavior of the xl parser (and on the structure of the
> parsing code, of course), and neglect that part.
>
> So, first of all, about the subject: 'libxl: vnuma...'. This patch looks
> more about xl than libxl (so I'd expect something like 'xl: vnuma...').
>
> On mer, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
>> --- a/docs/man/xl.cfg.pod.5
>> +++ b/docs/man/xl.cfg.pod.5
>> @@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
>>  A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
>>  it will crash.
>>
> In general, about the config file interface, what I think we want
> a user to be able to specify is:
>
>  1. number of virtual NUMA nodes
>     (default: 1)
>  2. amount of memory on each virtual NUMA node
>     (default: total tot_memory / nr_nodes)
>  3. the 'distances' from each node to each other one
>     (default: something sane :-) )
>  4. what vcpus on each virtual NUMA node
>     (default: distribute evenly)
>  5. virtual node to physical node mapping
>     (default: driven by the existing automatic placement logic, or
>     by pinning)
>
> Which is pretty much what this patch allows to do, so, good! :-D
>
> The syntax used for each of these parameters is also mostly fine, IMO,
> with the only exception of the mapping between vCPUs and vNODEs (more on
> that below).
>
> About the name of the parameters, in this patch, we have:
>
>  1. "vnuma_nodes"
>  2. "vnuma_mem"
>  3. "vdistance"
>  4. "vnuma_cpumap"
>  5. "vnuma_nodemap"
>
> I have to admit, I don't especially like these, so here's my proposal.
> I'm of course open to other views here, I'm actually just tossing names
> there (with a brief rationale and explanation)... :-)
>
>  1. "vnodes": for vCPUs, we have "vcpus", I think something along the
>               same line would make sense;
>  2. "vnodes_memory" : same reasoning as above (BTW, we may want to go
>                       for something like "vnodes_maxmem", as that is
>                       probably what we're actually specifying, isn't it?
>                       I'm not sure how important it is to stress that,
>                       though);
>  3. "vnodes_distance[s]": or just "vnodes_dist[s]"
>  4. "vcpus_to_vnodes"
>  5. "vnodes_to_pnodes": not particularly proud of this, but can't think
>                         of anything better right now.
>
> Thoughts?

Yep, totally fine with these names.

>
>> +=item B<vnuma_nodes=N>
>> +
>> +Number of vNUMA nodes the guest will be initialized with on boot.
>> +PV guest by default will have one vnuma node.
>> +
> It's not only PV guests that will have, by default, just one node...
> This is true for every guest (the reason being vNUMA is not even
> implemented for such other guest types!! :-) ).
>
> As it stands, it looks to me like other kinds of guests than PV can have a
> different number of NUMA nodes by default.
>
>> +=item B<vnuma_mem=[vmem1, vmem2, ...]>
>> +
>> +List of memory sizes for each node, defined in MBytes. Number of items listed must
>> +match nr_vnodes.
>>
> "must match the number of virtual NUMA nodes specified by
> B<vnuma_nodes=>"
>
>> If the sum of all vnode memories does not match the domain memory
>>
> ", specified by B<memory=> and/or B<maxmem=>,"
>
>> +or there are missing nodes, it will fail.
>> +If not specified, memory will be equally split between vnodes. Current minimum
>> +memory size for one node is limited by 32MB.
>> +
>> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
>> +Total amount of memory in guest: 6GB
>> +
>
>> +=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
>> +
>> +Defines vcpu to vnode mapping as a list of integers. The position in the list
>> +is a vcpu number, and the value is the vnode number to which the vcpu will be
>> +assigned to.
>> +Current limitations:
>> +- vNUMA node must have at least one vcpu, otherwise default vcpu_to_vnode will be used.
>> +- Total number of vnodes cannot be bigger then number of vcpus.
>> +
>> +Example:
>> +Map of 4 vcpus to 2 vnodes:
>> +0,1 vcpu -> vnode0
>> +2,3 vcpu -> vnode1:
>> +
>> +vnuma_vcpumap = [0, 0, 1, 1]
>> + 4 vcpus here -  0  1  2  3
>> +
>>
> I wonder whether it would be better to turn this into something like
> the below:
>
> ["0,1,2,3", "4-7", "8,10,12,14", "9,11,13,15"]

Yep, I think it's good to have a more unified approach.
>
> With the following meaning: there are 4 nodes, and vCPUs 0,1,2,3 belongs
> to node 0, vCPUs 4,5,6,7 belongs to node 1, etc.
>
> It looks a lot easier to use to me. With the current syntax, the same
> configuration would be expressed like this:
>
> [0,0,0,0,1,1,1,1,2,3,2,3,2,3,2,3]
>
> Quite hard to get right when both writing and reading it, isn't it?
>
> Elena's example looks like this:
>
> ["0,1", "2,3"] (with my notation)
> [0, 0, 1, 1]   (with this patch notation)
>
> From an implementation point of view, the code for parsing strings like
> these (and also for parsing lists of strings like this) is already there
> in xl, so it shouldn't even be too big a piece of work...
>
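> Just to sketch what I mean (rough and totally untested, and assuming a
> hypothetical helper -- let's call it parse_range_string() -- that turns
> a "0,1,2,3" / "4-7" style string into a libxl_bitmap; xl already has
> very similar parsing code for vcpu pinning):
>
>     XLU_ConfigList *vcpu_to_vnode_list;
>     libxl_bitmap vcpus_of_vnode;
>     int nr_vnodes_cfg, i, j;
>
>     if (!xlu_cfg_get_list(config, "vcpus_to_vnodes",
>                           &vcpu_to_vnode_list, &nr_vnodes_cfg, 0)) {
>         for (i = 0; i < nr_vnodes_cfg; i++) {
>             const char *buf = xlu_cfg_get_listitem(vcpu_to_vnode_list, i);
>
>             libxl_bitmap_init(&vcpus_of_vnode);
>             libxl_cpu_bitmap_alloc(ctx, &vcpus_of_vnode, b_info->max_vcpus);
>             /* hypothetical helper: fills the bitmap from "0,1,2,3", "4-7", ... */
>             parse_range_string(buf, &vcpus_of_vnode);
>             /* all vcpus listed in the i-th string belong to vnode i
>              * (vnuma_vcpumap assumed to be allocated already) */
>             libxl_for_each_set_bit(j, vcpus_of_vnode)
>                 b_info->vnuma_vcpumap[j] = i;
>             libxl_bitmap_dispose(&vcpus_of_vnode);
>         }
>     }
>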
>> +=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
>> +
>> +List of physical node numbers, position in the list represents vnode number.
>> +Used for manual placement of vnuma nodes to physical NUMA nodes.
>> +Will not be used if automatic numa placement is active.
>> +
>> +Example:
>> +assume NUMA machine with 4 physical nodes. Placing vnuma node 0 to pnode 2,
>> +vnuma node 1 to pnode 3:
>> +vnode0 -> pnode2
>> +vnode1 -> pnode3
>> +
>> +vnuma_vnodemap=[2, 3]
>> +first vnode will be placed on node 2, second on node 3.
>> +
>>
> I'm fine with this.
>
>> +=item B<vnuma_autoplacement=[0|1]>
>> +
> To be killed. :-)
>
>>  =head3 Event Actions
>> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
>> index 409a795..1af2250 100644
>> --- a/tools/libxl/xl_cmdimpl.c
>> +++ b/tools/libxl/xl_cmdimpl.c
>> @@ -40,6 +40,7 @@
>>  #include "libxl_json.h"
>>  #include "libxlutil.h"
>>  #include "xl.h"
>> +#include "libxl_vnuma.h"
>>
>>  /* For calls which return an errno on failure */
>>  #define CHK_ERRNOVAL( call ) ({                                         \
>> @@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
>>      }
>>  }
>>
>> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
>> +{
>> +    const char *buf;
>> +    char *ep;
>> +    unsigned long ul;
>> +    int rc = -EINVAL;
>> +
>> +    buf = xlu_cfg_get_listitem(list, i);
>> +    if (!buf)
>> +        return rc;
>> +    ul = strtoul(buf, &ep, 10);
>> +    if (ep == buf)
>> +        return rc;
>> +    if (ul >= UINT16_MAX)
>> +        return rc;
>> +    return (unsigned int)ul;
>> +}
>> +
> I see what you want to achieve here, but I don't particularly like
> having this as a helper here. If we want something this generic,
> perhaps put this in xlu? Although, I'm not sure about this either...
>
> It's probably not such a great deal, and it's a tool's maintainer call,
> I think.
>
>> +static void vdistance_set(unsigned int *vdistance,
>> +                                unsigned int nr_vnodes,
>> +                                unsigned int samenode,
>> +                                unsigned int othernode)
>> +{
>> +    unsigned int idx, slot;
>> +    for (idx = 0; idx < nr_vnodes; idx++)
>> +        for (slot = 0; slot < nr_vnodes; slot++)
>> +            *(vdistance + slot * nr_vnodes + idx) =
>> +                idx == slot ? samenode : othernode;
>> +}
>> +
>> +static void vcputovnode_default(unsigned int *cpu_to_node,
>> +                                unsigned int nr_vnodes,
>> +                                unsigned int max_vcpus)
>> +{
>> +    unsigned int cpu;
>> +    for (cpu = 0; cpu < max_vcpus; cpu++)
>> +        cpu_to_node[cpu] = cpu % nr_vnodes;
>> +}
>> +
>> +/* Split domain memory between vNUMA nodes equally. */
>> +static int split_vnumamem(libxl_domain_build_info *b_info)
>> +{
>> +    unsigned long long vnodemem = 0;
>> +    unsigned long n;
>> +    unsigned int i;
>> +
>> +    if (b_info->vnodes == 0)
>> +        return -1;
>> +
> This is never called (and that's ok!) when b_info->vnodes==0, so I
> find the check more confusing than helpful.
>
> An assert(), maybe?
>
>> +    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
>> +    if (vnodemem < MIN_VNODE_SIZE)
>> +        return -1;
>> +    /* reminder in MBytes. */
>> +    n = (b_info->max_memkb >> 10) % b_info->vnodes;
>> +    /* get final sizes in MBytes. */
>> +    for (i = 0; i < (b_info->vnodes - 1); i++)
>> +        b_info->vnuma_mem[i] = vnodemem;
>> +    /* add the reminder to the last node. */
>> +    b_info->vnuma_mem[i] = vnodemem + n;
>> +    return 0;
>> +}
>> +
>> +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
>> +                                   unsigned int nr_vnodes)
>> +{
>> +    unsigned int i;
>> +    for (i = 0; i < nr_vnodes; i++)
>> +        vnuma_vnodemap[i] = VNUMA_NO_NODE;
>> +}
>> +
>> +/*
>> + * init vNUMA to "zero config" with one node and all other
>> + * topology parameters set to default.
>> + */
>> +static int vnuma_default_config(libxl_domain_build_info *b_info)
>> +{
>> +    b_info->vnodes = 1;
>> +    /* all memory goes to this one vnode, as well as vcpus. */
>> +    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
>> +                                sizeof(*b_info->vnuma_mem))))
>> +        goto bad_vnumazerocfg;
>> +
>> +    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
>> +                                sizeof(*b_info->vnuma_vcpumap))))
>> +        goto bad_vnumazerocfg;
>> +
>> +    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
>> +                                b_info->vnodes, sizeof(*b_info->vdistance))))
>> +        goto bad_vnumazerocfg;
>> +
>> +    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
>> +                                sizeof(*b_info->vnuma_vnodemap))))
>> +        goto bad_vnumazerocfg;
>> +
>> +    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
>> +
>> +    /* all vcpus assigned to this vnode. */
>> +    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
>> +                        b_info->max_vcpus);
>> +
>> +    /* default vdistance is 10. */
>> +    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
>> +
>> +    /* VNUMA_NO_NODE for vnode_to_pnode. */
>> +    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
>> +
>> +    /*
>> +     * will be placed to some physical nodes defined by automatic
>> +     * numa placement or VNUMA_NO_NODE will not request exact node.
>> +     */
>> +    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
>> +    return 0;
>> +
>> + bad_vnumazerocfg:
>> +    return -1;
>> +}
>> +
> Can't this, and with it most of the above helpers, be implemented in
> such a way that it fits inside libxl__domain_build_info_setdefault()
> (and, if yes, moved in a libxl related patch)?
>
> I'm not 100% sure, but it really looks similar to the kind of things we
> usually do there rather than in xl.
>
>> +static void free_vnuma_info(libxl_domain_build_info *b_info)
>> +{
>> +    free(b_info->vnuma_mem);
>> +    free(b_info->vdistance);
>> +    free(b_info->vnuma_vcpumap);
>> +    free(b_info->vnuma_vnodemap);
>> +
>> +    b_info->vnuma_mem = NULL;
>> +    b_info->vdistance = NULL;
>> +    b_info->vnuma_vcpumap = NULL;
>> +    b_info->vnuma_vnodemap = NULL;
>> +
>> +    b_info->vnodes = 0;
>> +    b_info->vmemranges = 0;
>> +}
>> +
> This appears to be called only once, right before an "exit(1)", in xl.
> Do we need it? For sure we don't need the =NULL and =0 thing, do we?
>
>> +static int parse_vnuma_mem(XLU_Config *config,
>> +                            libxl_domain_build_info **b_info)
>> +{
>> +    libxl_domain_build_info *dst;
>>
> Why the **b_info and the *dst thing?
>
>> +    XLU_ConfigList *vnumamemcfg;
>> +    int nr_vnuma_regions, i;
>> +    unsigned long long vnuma_memparsed = 0;
>> +    unsigned long ul;
>> +    const char *buf;
>> +    char *ep;
>> +
>> +    dst = *b_info;
>> +    if (!xlu_cfg_get_list(config, "vnuma_mem",
>> +                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
>> +
>> +        if (nr_vnuma_regions != dst->vnodes) {
>> +            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
>> +                    incorrect (should be %d).\n", nr_vnuma_regions,
>> +                    dst->vnodes);
>>
> I find the term 'regions' rather confusing, both in the variable name
> and in the error message. I'd go for something like nr_vnuma_mem, for
> the variable. For the message, maybe something like this:
>
>  "Specified the amount of memory for %d virtual nodes,\n"
>  "while number of virtual node is %d!\n"
>
> But maybe it's just me. :-)
>
> Oh, the "(vnumamem = %d)" part, at least, makes very little sense to me, as
> I don't think the word 'vnumamem', written like that and associated with
> an integer, makes much sense to a user. At least that should change, I
> think.
>
>> +            goto bad_vnuma_mem;
>> +        }
>> +
>> +        dst->vnuma_mem = calloc(dst->vnodes,
>> +                                 sizeof(*dst->vnuma_mem));
>>
> If you go for the libxl__build_info_setdefault() route I suggested
> above, this may need to change a bit, as you'll probably find
> b_info->vnuma_mem allocated already.
>
> Anyway, I don't think it would be a big deal to, in that case, either
> use realloc() here, or use NULL as default value (and hence set it to
> that in libxl__build_info_sedefault() ).
>
>> +        if (dst->vnuma_mem == NULL) {
>> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
>>
> Ditto, about 'ranges'.
>
>> +            goto bad_vnuma_mem;
>> +        }
>> +
>> +        /*
>> +         * Will parse only nr_vnodes times, even if we have more/less regions.
>>
> How can it be that you have fewer or more entries than b_info->vnodes?
>
>> +         * Take care of it later if less or discard if too many regions.
>>
> You took care of that above already...
>
>> +         */
>> +        for (i = 0; i < dst->vnodes; i++) {
>> +            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
>> +            if (!buf) {
>> +                fprintf(stderr,
>> +                        "xl: Unable to get element %d in vnuma memory list.\n", i);
>> +                goto bad_vnuma_mem;
>>
> The common pattern in xl seems to be to exit() right away, if this
> happens (whether to exit with 1 or -1, that seems a bit inconsistent to
> me, at a quick glance).
>
>> +            }
>> +
>> +            ul = strtoul(buf, &ep, 10);
>> +            if (ep == buf) {
>> +                fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf);
>> +                goto bad_vnuma_mem;
>>
> And the same (i.e., just exit() ) applies here... And pretty much
> everywhere, actually! :-)
>
>> +            }
>> +
>> +            /* 32Mb is a min size for a node, taken from Linux */
>> +            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
>>
> What's the idea behind UINT32_MAX here?
>
> I guess what I mean is, if there is, for any reason, a maximum virtual
> node size, then it may be useful to #define it, e.g., MAX_VNODE_SIZE
> (just as you're doing for MIN_VNODE_SIZE). If there is not, then we can
> just live with what the type can handle, and it will be lower layers
> that will return appropriate errors if values are wrong/unrealistic.
>
> Like this, you're saying that such maximum virtual node size is 4096 TB
> (if I'm not doing the math wrong), which I don't think makes much
> sense... :-P
>
>> +                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
>> +                        ul, MIN_VNODE_SIZE, UINT32_MAX);
>>
> "xl: memory for virtual node %lu is outside of [%u, %u] MB.\n"
>
>> +                goto bad_vnuma_mem;
>> +            }
>> +
>> +            /* memory in MBytes */
>> +            dst->vnuma_mem[i] = ul;
>> +        }
>> +
>> +        /* Total memory for vNUMA parsed to verify */
>> +        for (i = 0; i < nr_vnuma_regions; i++)
>> +            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
>> +
> Not a big deal, but can't you accumulate while doing the parsing?
>
>> +        /* Amount of memory for vnodes same as total? */
>> +        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
>> +            fprintf(stderr, "xl: vnuma memory is not the same as domain \
>> +                    memory size.\n");
>> +            goto bad_vnuma_mem;
>> +        }
>> +    } else {
>> +        dst->vnuma_mem = calloc(dst->vnodes,
>> +                                      sizeof(*dst->vnuma_mem));
>> +        if (dst->vnuma_mem == NULL) {
>> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
>> +            goto bad_vnuma_mem;
>> +        }
>> +
> The allocation is the same as above. It can live outside of the if,
> can't it? Especially considering that, as said before, you can just
> exit() on failure.
>
>> +        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
>> +        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
>> +                to cover %lu Kbytes.\n",
>> +                dst->max_memkb / dst->vnodes, dst->max_memkb);
>> +
> Does this deserve a warning? In the man page, we say it's legal not to
> specify this, so I may well have read that, and may be doing this on
> purpose...
>
>> +        if (split_vnumamem(dst) < 0) {
>> +            fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
>> +            goto bad_vnuma_mem;
>> +        }
>> +    }
>> +    return 0;
>> +
>> + bad_vnuma_mem:
>> +    return -1;
>> +}
>> +
>> +static int parse_vnuma_distance(XLU_Config *config,
>> +                                libxl_domain_build_info **b_info)
>> +{
>> +    libxl_domain_build_info *dst;
>>
> Same as above.
>
>> +    XLU_ConfigList *vdistancecfg;
>> +    int nr_vdist;
>> +
>> +    dst = *b_info;
>> +    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
>> +                               sizeof(*dst->vdistance));
>> +    if (dst->vdistance == NULL)
>> +        goto bad_distance;
>> +
>> +    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
>> +        int d1, d2, i;
>> +        /*
>> +         * First value is the same node distance, the second as the
>> +         * rest of distances. The following is required right now to
>>
> I'm not a native English speaker, but 'The latter' sounds more correct
> than 'The following' (if that is what you mean; if it's not, then I
> didn't understand what you mean! :-) ).
>
>> +         * avoid non-symmetrical distance table as it may break latest kernel.
>> +         * TODO: Better way to analyze extended distance table, possibly
>> +         * OS specific.
>> +         */
>> +
> Ok. So, AFAIUI, for now, you want the list to be two elements long, is
> that right? If yes, should we check that nr_vdist==2 ?
>
> What happens if I have b_info->vnodes > 1, and I pass a list only one
> element long ?
>
>> +        for (i = 0; i < nr_vdist; i++) {
>> +            d1 = get_list_item_uint(vdistancecfg, i);
>> +        }
>> +
> Mmm... What's this?
>
>> +        d1 = get_list_item_uint(vdistancecfg, 0);
>> +        if (dst->vnodes > 1)
>> +           d2 = get_list_item_uint(vdistancecfg, 1);
>> +        else
>> +           d2 = d1;
>> +
>> +        if (d1 >= 0 && d2 >= 0) {
>> +            if (d1 < d2)
>> +                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2);
>>
> I agree this looks weird, and deserves a warning.
>
> I'd mention why that is the case, in the warning message, i.e., because
> the distance from a node to _itself_ is being set to a bigger value than
> the distance between two different nodes.
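>
> (Just for reference: with, say, vnodes=3 and vdistance=[10, 20],
> vdistance_set() ends up building the table
>
>     10 20 20
>     20 10 20
>     20 20 10
>
> i.e., d1 on the diagonal and d2 everywhere else.)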
>
>> +            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
>> +        } else {
>> +            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
>> +            goto bad_distance;
>>
> After printing this "WARNING", you goto bad_distance, which means this
> returns -1, which means we exit(1) below.
>
> So, this really looks like an error, rather than a warning to me.
>
>> +        }
>> +    } else {
>> +        fprintf(stderr, "Could not parse vnuma distances.\n");
>> +        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
>> +    }
>> +    return 0;
>> +
>> + bad_distance:
>> +    return -1;
>>
> As said before, xl often just exit on failures like these.
>
>> +}
>> +
>> +static int parse_vnuma_vcpumap(XLU_Config *config,
>> +                                libxl_domain_build_info **b_info)
>> +{
>> +    libxl_domain_build_info *dst;
>>
> ...
>
>> +    XLU_ConfigList *vcpumap;
>> +    int nr_vcpumap, i;
>> +
>> +    dst = *b_info;
>> +    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
>> +                                     sizeof(*dst->vnuma_vcpumap));
>>
> You don't need the cast.
>
>> +    if (dst->vnuma_vcpumap == NULL)
>> +        goto bad_vcpumap;
>> +
> exit(1);
>
>> +    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
>> +                          &vcpumap, &nr_vcpumap, 0)) {
>> +        if (nr_vcpumap == dst->max_vcpus) {
>>
>     if (nr_vcpumap != b_info->max_vcpus) {
>         fprintf(stderr, "xl: mapping of vCPUs to vNODEs specified for "
>                         "%d vCPUs, while the domain has %d",
>                 nr_vcpumap, b_info->max_vcpus);
>         exit(1);
>     }
>
> and then continue with one less level of indentation. :-)
>
>> +            unsigned int  vnode, vcpumask = 0, vmask;
>> +
>> +            vmask = ~(~0 << nr_vcpumap);
>> +            for (i = 0; i < nr_vcpumap; i++) {
>> +                vnode = get_list_item_uint(vcpumap, i);
>> +                if (vnode >= 0 && vnode < dst->vnodes) {
>> +                    vcpumask |= (1 << i);
>> +                    dst->vnuma_vcpumap[i] = vnode;
>> +                }
>> +            }
>> +
>> +            /* Did it covered all vnodes in the vcpu mask? */
>> +            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
>> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
>> +                        in numa_cpumask.\n");
>> +                goto bad_vcpumap;
>> +            }
>>
> As we settled on using libxl_bitmap for this, I expect this to change
> quite a bit (e.g., we'd have to build a bitmap per each node, the
> libxl_bitmap manipulating functions should be used, etc.).
>
>> +        } else {
>> +            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
>> +            goto bad_vcpumap;
>> +        }
>> +    }
>> +    else
>> +        vcputovnode_default(dst->vnuma_vcpumap,
>> +                            dst->vnodes,
>> +                            dst->max_vcpus);
>> +    return 0;
>> +
>> + bad_vcpumap:
>> +    return -1;
>> +}
>> +
>> +static int parse_vnuma_vnodemap(XLU_Config *config,
>> +                                libxl_domain_build_info **b_info)
>> +{
>> +    libxl_domain_build_info *dst;
> No need?
>
>> +    XLU_ConfigList *vnodemap;
>> +    int nr_vnodemap, i;
>> +
>> +    dst = *b_info;
>> +
>> +    /* There is mapping to NUMA physical nodes? */
>> +    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
>> +                           sizeof(*dst->vnuma_vnodemap));
>>
> no cast.
>
>> +    if (dst->vnuma_vnodemap == NULL)
>> +        goto bad_vnodemap;
>> +
>> +    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",
>> +                          &vnodemap, &nr_vnodemap, 0)) {
>> +        /*
>> +         * If not specified or incorrect, will be defined
>> +         * later based on the machine architecture, configuration
>> +         * and memory availble when creating domain.
>> +         */
>> +        libxl_defbool_set(&dst->vnuma_autoplacement, false);
>>
> As agreed in the other part of this thread, there is already an
> automatic placement switch to act on, in this case.
>
> Note that the way that switch is being used right now in libxl code, may
> need to change a bit, but that is, nevertheless, the right thing to do.
> So have a look at that, please. :-)
>
>> +        if (nr_vnodemap == dst->vnodes) {
>> +            unsigned int vnodemask = 0, pnode, smask;
>> +            smask = ~(~0 << dst->vnodes);
>> +            for (i = 0; i < dst->vnodes; i++) {
>> +                pnode = get_list_item_uint(vnodemap, i);
>> +                if (pnode >= 0) {
>> +                    vnodemask |= (1 << i);
>> +                    dst->vnuma_vnodemap[i] = pnode;
>> +                }
>> +            }
>> +
>> +            /* Did it covered all vnodes in the mask? */
>> +            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
>> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
>> +                        vnuma_vnodemap.\n");
>> +                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
>> +                libxl_defbool_set(&dst->vnuma_autoplacement, true);
>> +                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
>> +            }
>> +        }
>> +        else {
>> +            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
>> +            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
>> +            libxl_defbool_set(&dst->vnuma_autoplacement, true);
>> +            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
>> +        }
>> +    }
>>
> This is also about to change quite substantially, so I'm avoiding
> commenting on it...
>
>> +static void parse_vnuma_config(XLU_Config *config,
>> +                               libxl_domain_build_info *b_info)
>> +{
>> +    long l;
>> +
>> +    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
>> +        if (l > MAX_VNUMA_NODES) {
>> +            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
>> +                    MAX_VNUMA_NODES);
>> +            goto bad_vnuma_config;
>> +        }
>> +        b_info->vnodes = l;
>> +
>> +        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
>> +                    &b_info->vnuma_autoplacement, 0))
>> +            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
>> +
> "vnuma_autoplacement" is, AFAIUI, going away, so no comments. ;-P
>
>
> Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>

Majority of comments make sense to me.
Now I wanted to ask about the memory ranges :)
I am not sure at this point I should have them parsed in config file
(for the case where vNUMA node has multiple ranges) and as Ian has
mentioned I will be moving
these ranges away from libxl build info.
I guess for now It will be better to leave these ranges out of scope
of parsing code.
What do you think?

Thank you!

-- 
Elena

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-12  2:04     ` Elena Ufimtseva
@ 2014-09-12  9:02       ` Dario Faggioli
  2014-09-12  9:31         ` Wei Liu
  0 siblings, 1 reply; 32+ messages in thread
From: Dario Faggioli @ 2014-09-12  9:02 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Ian Jackson, xen-devel, Jan Beulich, Wei Liu



[Adding Wei]

On gio, 2014-09-11 at 22:04 -0400, Elena Ufimtseva wrote:
> Hello Dario
> 
Hi,

> Thank you for such a thorough review.
>
Yeah, well, sorry it took a bit! :-/

> Majority of comments make sense to me.
>
Cool. :-D

> Now I wanted to ask about the memory ranges :)
> I am not sure at this point I should have them parsed in config file
> (for the case where vNUMA node has multiple ranges) and as Ian has
> mentioned I will be moving
> these ranges away from libxl build info.
> I guess for now It will be better to leave these ranges out of scope
> of parsing code.
> What do you think?
> 
If, as I'm quite sure, with 'memory ranges' you're now talking about the
possibility of specifying more than one memory region for each node, yes,
given that we're keeping that out of the libxl interface, I'd leave them out of
xl too.

If, with 'memory ranges', you mean the possibility of specifying
different memory size for each node, I think I'd keep this.

In fact, about memory, I'm fine with how you right now deal with
vnuma_mem (which I suggested to rename to "vnuma_memory" or
"vnuma_maxmem"), i.e.:

vnuma_memory = [1024, 1024, 2048, 2048]

When, in future, we will introduce toolstack support for defining nodes
with multiple memory regions, we can extend this syntax quite easily, to
something like

vnuma_memory = ["128,896", "256,256,512"]

with the following meaning: we have two nodes; node 0 has two regions,
one 128MB big, the other 896MB big. Node 1 has 3 regions, the first two
256MB big, the third 512MB big.

Just to give an example.

If we want, that can be even more specific, i.e.:

vnuma_memory = ["128@0xffffaaa,896@0xccccdddd", ...]

meaning: node 0 has one region 128MB big, starting at address 0xffffaaa,
and another 896MB big, starting at address 0xccccdddd (ISTR we agreed on
something very similar to this for Arianna's iomem series).

This is just an example, of course, the point being that the current
interface is good for now, and can be easily extended, in a backward
compatible way.
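
And, just to show that parsing entries like these would not be a big
deal either, each "SIZE[@ADDRESS]" token could be handled with something
along these lines (a rough and untested sketch, of course, not
necessarily how it will actually be done):

    #include <stdint.h>
    #include <stdlib.h>

    /*
     * Parse one "SIZE[@ADDRESS]" token, e.g. "896@0xccccdddd" or "128".
     * SIZE is in MB; *addr is left untouched if no address is given.
     * Returns 0 on success, -1 on a malformed token.
     */
    static int parse_vnuma_region(const char *tok, uint64_t *size_mb,
                                  uint64_t *addr)
    {
        char *ep;

        *size_mb = strtoull(tok, &ep, 0);
        if (ep == tok)
            return -1;        /* no size at all */
        if (*ep == '@')
            *addr = strtoull(ep + 1, &ep, 0);
        return *ep == '\0' ? 0 : -1;
    }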

This is my take on this, hoping I understood your question. Ian's
opinion is probably the one you want, though. :-)

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-12  9:02       ` Dario Faggioli
@ 2014-09-12  9:31         ` Wei Liu
  2014-09-12  9:59           ` Dario Faggioli
  0 siblings, 1 reply; 32+ messages in thread
From: Wei Liu @ 2014-09-12  9:31 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Ian Jackson, xen-devel, Jan Beulich, Wei Liu,
	Elena Ufimtseva

On Fri, Sep 12, 2014 at 11:02:30AM +0200, Dario Faggioli wrote:
> [Adding Wei]
> 
> On gio, 2014-09-11 at 22:04 -0400, Elena Ufimtseva wrote:
> > Hello Dario
> > 
> Hi,
> 
> > Thank you for such a thorough review.
> >
> Yeah, well, sorry it took a bit! :-/
> 

Sorry from me as well. I've been tracking this discussion for some time
but never weighed in because I needed to get my other 8 patches ready
for the freeze. No pressure. ;-)

> > Majority of comments make sense to me.
> >
> Cool. :-D
> 
> > Now I wanted to ask about the memory ranges :)
> > I am not sure at this point I should have them parsed in config file
> > (for the case where vNUMA node has multiple ranges) and as Ian has
> > mentioned I will be moving
> > these ranges away from libxl build info.
> > I guess for now It will be better to leave these ranges out of scope
> > of parsing code.
> > What do you think?
> > 
> If, as I'm quite sure, with 'memory ranges' you're now talking about the
> possibility of specifying more than one memory region for each node, yes,
> given that we're keeping that out of the libxl interface, I'd leave them out of
> xl too.
> 

Agreed. We just need to make the syntax backward compatible.

> If, with 'memory ranges', you mean the possibility of specifying
> different memory size for each node, I think I'd keep this.
> 
> In fact, about memory, I'm fine with how you right now deal with
> vnuma_mem (which I suggested to rename to "vnuma_memory" or
> "vnuma_maxmem"), i.e.:
> 

I don't think vnuma_maxmem reflects what this parameter means. The list
controls the distribution of memory, not the maximum amount. The sum of this
list should be equal to maxmem= in config file.

> vnuma_memory = [1024, 1024, 2048, 2048]
> 
> When, in future, we will introduce toolstack support for defining nodes
> with multiple memory regions, we can extend this syntax quite easily, to
> something like
> 
> vnuma_memory = ["128,896", "256,256,512"]
> 
> with the following meaning: we have two nodes; node 0 has two regions,
> one 128MB big, the other 896MB big. Node 1 has 3 regions, the first two
> 256MB big, the third 512MB big.
> 
> Just to give an example.
> 
> If we want, that can be even more specific, i.e.:
> 
> vnuma_memory = ["128@0xffffaaa,896@0xccccdddd", ...]
> 
> meaning: node 0 has one region 128MB big, starting at address 0xffffaaa,
> and another 896MB big, starting at address 0xccccdddd (ISTR we agreed on
> something very similar to this for Arianna's iomem series).
> 

I think this syntax is good.

Wei.

> This is just an example, of course, the point being that the current
> interface is good for now, and can be easily extended, in a backward
> compatible way.
> 
> This is my take on this, hoping I understood your question. Ian's
> opinion is probably the one you want, though. :-)
> 
> Regards,
> Dario
> 
> -- 
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
  2014-09-12  9:31         ` Wei Liu
@ 2014-09-12  9:59           ` Dario Faggioli
  0 siblings, 0 replies; 32+ messages in thread
From: Dario Faggioli @ 2014-09-12  9:59 UTC (permalink / raw)
  To: Wei Liu
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Ian Jackson, xen-devel, Jan Beulich,
	Elena Ufimtseva



On ven, 2014-09-12 at 10:31 +0100, Wei Liu wrote:
> On Fri, Sep 12, 2014 at 11:02:30AM +0200, Dario Faggioli wrote:

> > If, with 'memory ranges', you mean the possibility of specifying
> > different memory size for each node, I think I'd keep this.
> > 
> > In fact, about memory, I'm fine with how you right now deal with
> > vnuma_mem (which I suggested to rename to "vnuma_memory" or
> > "vnuma_maxmem"), i.e.:
> > 
> 
> I don't think vnuma_maxmem reflects what this parameter means. The list
> controls the distribution of memory, not the maximum amount. The sum of this
> list should be equal to maxmem= in config file.
> 
I think you're right. And until we have NUMA-aware ballooning,
what the maximum amount is will be a bit fuzzy! :-P

So, "vnuma_memory" it is, then. :-)

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 6/9] libxl: build numa nodes memory blocks
  2014-09-03  4:24 ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Elena Ufimtseva
  2014-09-03 15:21   ` Ian Campbell
@ 2014-09-12 10:18   ` Dario Faggioli
  1 sibling, 0 replies; 32+ messages in thread
From: Dario Faggioli @ 2014-09-12 10:18 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	lccycc123, ian.jackson, xen-devel, JBeulich




On mer, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> Create the vmemrange structure based on the
> PV guests E820 map. Values are in in Megabytes.
> Also export the E820 filter code e820_sanitize
> out to be available internally.
>
Wait: how come we export something, so it will be available
internally? :-O

> As Xen can support mutiranges vNUMA nodes, for
> PV guest it is one range per one node. The other
> domain types should have their building routines
> implemented.
> 
> Changes since v8:
>     - Added setting of the vnuma memory ranges node ids;
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---

> --- a/tools/libxl/libxl_numa.c
> +++ b/tools/libxl/libxl_numa.c
> @@ -19,6 +19,10 @@
>  
>  #include "libxl_internal.h"
>  
> +#include "libxl_vnuma.h"
> +
> +#include "xc_private.h"
> +
>  /*
>   * What follows are helpers for generating all the k-combinations
>   * without repetitions of a set S with n elements in it. Formally
> @@ -508,6 +512,203 @@ int libxl__get_numa_candidate(libxl__gc *gc,
>  }
>  
>  /*
> + * Check if we can fit vnuma nodes to numa pnodes
> + * from vnode_to_pnode array.
> + */
> +bool libxl__vnodemap_is_usable(libxl__gc *gc,
> +                            libxl_domain_build_info *info)
> +{
I see why bool is being used, but I honestly think int, and then
returning proper LIBXL error codes, would be better, here.

Also, the name. You're checking/verifying that there is enough space, so
I'd make the function name reflect that (although, that is not something
super important).

Oh, and why is this function being defined in this patch? It's not used
anywhere in here and, although related, it seems to deal with something
quite different from what the rest of the code is about.

Perhaps you can move it somewhere else, and make this an x86 specific
patch, dealing with e820 and all the other stuff.

Finally, indentation (of the second parameters, in the new line) looks
wrong.

> +    unsigned int i;
> +    libxl_numainfo *ninfo = NULL;
> +    unsigned long long *claim;
>
In libxl_numainfo, free is an array of uint64_t. I think it'd be best to
make claim and free types match.

> +    unsigned int node;
>
This 'node' variable can live inside the first for.

> +    uint64_t *sz_array;
> +    int nr_nodes = 0;
> +
> +    /* Cannot use specified mapping if not NUMA machine. */
> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> +    if (ninfo == NULL)
> +        return false;
> +
For example, here you should return ERROR_FAIL, rather than just false.

> +    sz_array = info->vnuma_mem;
>
Do you really need this indirection? Why can't you use info->vnuma_mem
directly?

> +    claim = libxl__calloc(gc, info->vnodes, sizeof(*claim));
> +    /* Get total memory required on each physical node. */
> +    for (i = 0; i < info->vnodes; i++)
> +    {
> +        node = info->vnuma_vnodemap[i];
> +
> +        if (node < nr_nodes)
> +            claim[node] += (sz_array[i] << 20);
> +        else
> +            goto vnodemapout;
> +   }
> +   for (i = 0; i < nr_nodes; i++) {
> +       if (claim[i] > ninfo[i].free)
> +          /* Cannot complete user request, falling to default. */
> +          goto vnodemapout;
> +   }
>
Both of these gotos could well have been just 'return true'. If you're
making this returning int, they both can be 'return 0'.

> +
> + vnodemapout:
> +   return true;
>
BTW, how can you distinguish the cases where there is enough space on
the pnodes, from the one where there is not? I guess that will be more
clear when looking at the callsites of this function, but I'd put things
in such a way that makes it evident from just reading the code of this
very function...

> +}
> +
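FWIW, putting the points above together, the overall structure I have in
mind would be something like this (just a quick, untested sketch to show
what I mean, certainly not something to be taken verbatim):

    /*
     * Check whether the requested vnode-to-pnode mapping fits in the
     * free memory of the physical nodes: 0 if it does, ERROR_* if it
     * does not (or if something goes wrong). Rough sketch only.
     */
    int libxl__vnodemap_is_usable(libxl__gc *gc,
                                  libxl_domain_build_info *info)
    {
        libxl_numainfo *ninfo;
        uint64_t *claim;
        unsigned int i;
        int nr_nodes = 0, rc = 0;

        ninfo = libxl_get_numainfo(CTX, &nr_nodes);
        if (ninfo == NULL)
            return ERROR_FAIL;

        /* how much memory each pnode is being asked to provide */
        claim = libxl__calloc(gc, nr_nodes, sizeof(*claim));
        for (i = 0; i < info->vnodes; i++) {
            unsigned int node = info->vnuma_vnodemap[i];

            if (node >= nr_nodes) {
                rc = ERROR_INVAL;
                goto out;
            }
            claim[node] += info->vnuma_mem[i] << 20;
        }
        for (i = 0; i < nr_nodes; i++)
            if (claim[i] > ninfo[i].free) {
                /* not enough free memory on pnode i */
                rc = ERROR_INVAL;
                goto out;
            }

     out:
        libxl_numainfo_list_free(ninfo, nr_nodes);
        return rc;
    }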

> +/*
> + * Returns number of absent pages within e820 map
> + * between start and end addresses passed. Needed
> + * to correctly set numa memory ranges for domain.
> + */
> +static unsigned long e820_memory_hole_size(unsigned long start,
> +                                            unsigned long end,
> +                                            struct e820entry e820[],
> +                                            unsigned int nr)
>
Indentation looks wrong here too.

Also, how about a more descriptive name for nr?

Apart from these things, my e820-fu is very limited, so I'm not sure I
can say much about the function. FWIW, I agree with Ian that this should
live somewhere more architecture specific.

> +/*
> + * For each node, build memory block start and end addresses.
> + * Substract any memory hole from the range found in e820 map.
> + * vnode memory size are passed here in megabytes, the result is
> + * in memory block addresses.
> + * Linux kernel will adjust numa memory block sizes on its own.
> + * But we want to provide to the kernel numa block addresses that
> + * will be the same in kernel and hypervisor.
> + */
> +int libxl__vnuma_align_mem(libxl__gc *gc,
> +                            uint32_t domid,
> +                            /* IN: mem sizes in megabytes */
> +                            libxl_domain_build_info *b_info,
> +                            /* OUT: linux NUMA blocks addresses */
> +                            vmemrange_t *memblks)
> +{
> +    unsigned int i;
> +    int j, rc;
> +    uint64_t next_start_blk, end_max = 0, size;
> +    uint32_t nr;
> +    struct e820entry map[E820MAX];
> +
> +    errno = ERROR_INVAL;
> +    if (b_info->vnodes == 0)
> +        return -EINVAL;
>
Aren't we sure that the number of virtual nodes is at least 1 (or
perhaps even more!), at this point?

In any case, I think this should be ERROR_INVAL.

> +
> +    if (!memblks || !b_info->vnuma_mem)
> +        return -EINVAL;
> +
Same here.

> +    libxl_ctx *ctx = libxl__gc_owner(gc);
> +
> +    /* Retrieve e820 map for this host. */
> +    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
> +
> +    if (rc < 0) {
> +        errno = rc;
> +        return -EINVAL;
> +    }
>
I'm sure you should be returning a libxl error code. About errno, I
don't think that, as opposed to libxc, we ever set errno in libxl.

Actually, it should be xc_get_memory_map that, on error, sets errno and
returns -1. Whether that is what's happening or not, it's a different
story! :-)

> +    nr = rc;
>
I'd use nr directly, instead of fiddling with rc:

    nr = xc_get_machine_memory_map(...);
    if (nr < 0)
        return ERROR_FAIL;

Consider using a better name for nr here too (not super important, but
still...)

> +    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
> +                       (b_info->max_memkb - b_info->target_memkb) +
> +                       b_info->u.pv.slack_memkb);
> +    if (rc) {
> +        errno = rc;
> +        return -EINVAL;
> +    }
> +
Same here. Return a meaningful libxl error code (probably ERROR_FAIL),
and forget about errno.

And this is true throughout the rest of the patch.

> --- a/tools/libxl/libxl_x86.c
> +++ b/tools/libxl/libxl_x86.c
> @@ -1,5 +1,6 @@
>  #include "libxl_internal.h"
>  #include "libxl_arch.h"
> +#include "libxl_vnuma.h"
>  
>  static const char *e820_names(int type)
>  {
> @@ -14,7 +15,7 @@ static const char *e820_names(int type)
>      return "Unknown";
>  }
>  
> -static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
> +int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
>                           uint32_t *nr_entries,
>                           unsigned long map_limitkb,
>                           unsigned long balloon_kb)
>
Doing it the other way around, i.e., not exporting this one, moving your
stuff into this file and figuring out a way to call it from common code,
is probably the way to go.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)





^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled
  2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
  2014-09-03 15:26   ` Ian Campbell
@ 2014-09-12 11:06   ` Dario Faggioli
  1 sibling, 0 replies; 32+ messages in thread
From: Dario Faggioli @ 2014-09-12 11:06 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	lccycc123, ian.jackson, xen-devel, JBeulich, Wei Liu



On mer, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:
> vNUMA-aware domain memory allocation based on provided
> vnode to pnode map. If this map is not defined, use
> default allocation. Default allocation will not specify
> any physical node when allocating memory.
> Domain creation will fail if at least one node was not defined.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---

> @@ -385,6 +395,9 @@ static inline xen_pfn_t xc_dom_p2m_guest(struct xc_dom_image *dom,
>  int arch_setup_meminit(struct xc_dom_image *dom);
>  int arch_setup_bootearly(struct xc_dom_image *dom);
>  int arch_setup_bootlate(struct xc_dom_image *dom);
> +int arch_boot_alloc(struct xc_dom_image *dom);
> +
> +#define LIBXC_VNUMA_NO_NODE ~((unsigned int)0)
>  
(~0U) should be fine

> --- a/tools/libxc/xc_dom_x86.c
> +++ b/tools/libxc/xc_dom_x86.c
> @@ -759,7 +759,7 @@ static int x86_shadow(xc_interface *xch, domid_t domid)
>  int arch_setup_meminit(struct xc_dom_image *dom)
>  {
>      int rc;
> -    xen_pfn_t pfn, allocsz, i, j, mfn;
> +    xen_pfn_t pfn, i, j, mfn;
>  
>      rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
>      if ( rc )
> @@ -811,25 +811,77 @@ int arch_setup_meminit(struct xc_dom_image *dom)
>          /* setup initial p2m */
>          for ( pfn = 0; pfn < dom->total_pages; pfn++ )
>              dom->p2m_host[pfn] = pfn;
> +
> +        /*
> +         * Any PV domain should have at least one vNUMA node.
> +         * If no config was defined, one default vNUMA node
> +         * will be set.
> +         */
> +        if ( dom->vnodes == 0 ) {
> +            xc_dom_printf(dom->xch,
> +                         "%s: Cannot construct vNUMA topology with 0 vnodes\n",
> +                         __FUNCTION__);
> +            return -EINVAL;
>
And here, at least in theory, the opposite of what we've been saying for
libxl holds, so you should be setting errno, and returning -1.

However, that is not consistently applied throughout all of libxc, so
have a look at what the convention is for this file/function, as well as
for the functions called by this function, and follow it.
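
If errno plus -1 turns out to be the convention in this file, that would
be something like (sketch):

    if ( dom->vnodes == 0 )
    {
        xc_dom_printf(dom->xch,
                      "%s: cannot construct vNUMA topology with 0 vnodes\n",
                      __FUNCTION__);
        errno = EINVAL;
        return -1;
    }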

> +/*
> + * Allocates domain memory taking into account
> + * defined vnuma topology and vnode_to_pnode map.
> + * Any pv guest will have at least one vnuma node
> + * with vnuma_memszs[0] = domain memory and the rest
> + * topology initialized with default values.
>
Maybe mention here where/who takes care of that, at the libxc level.

> + */
> +int arch_boot_alloc(struct xc_dom_image *dom)
> +{
> +    int rc;
> +    unsigned int n, memflags;
> +    unsigned long long vnode_pages;
> +    unsigned long long allocsz = 0, node_pfn_base, i;
> +
> +    rc = allocsz = node_pfn_base = n = 0;
> +
> +    for ( n = 0; n < dom->vnodes; n++ )
> +    {
> +        memflags = 0;
> +        if ( dom->vnode_to_pnode[n] != LIBXC_VNUMA_NO_NODE )
> +        {
> +            memflags |= XENMEMF_exact_node(dom->vnode_to_pnode[n]);
> +            memflags |= XENMEMF_exact_node_request;
> +        }
> +        /* memeszs are in megabytes, calc pages from it for this node. */
> +        vnode_pages = (dom->numa_memszs[n] << 20) >> PAGE_SHIFT_X86;
> +        for ( i = 0; i < vnode_pages; i += allocsz )
> +        {
> +            allocsz = vnode_pages - i;
> +            if ( allocsz > 1024*1024 )
> +                allocsz = 1024*1024;
> +
> +            rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> +                                            allocsz, 0, memflags,
> +                                            &dom->p2m_host[node_pfn_base + i]);
> +            if ( rc )
> +            {
> +                xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
> +                        "%s: Failed allocation of %Lu pages for vnode %d on pnode %d out of %lu\n",
> +                        __FUNCTION__, vnode_pages, n, dom->vnode_to_pnode[n], dom->total_pages);
> +                return rc;
> +            }
> +        }
> +        node_pfn_base += i;
>
This looks fine, but I also think we'd better sanity-check things against
total_pages, as the old code was doing.
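
E.g., something like this (rough sketch), before populating the pages of
each vnode:

    if ( node_pfn_base + vnode_pages > dom->total_pages )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: vnuma node sizes exceed total_pages (%lu)\n",
                     __FUNCTION__, dom->total_pages);
        return -EINVAL;
    }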

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v10 8/9] libxl: vnuma nodes placement bits
  2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
  2014-09-03 15:50   ` Konrad Rzeszutek Wilk
  2014-09-03 15:52   ` Ian Campbell
@ 2014-09-12 16:51   ` Dario Faggioli
  2 siblings, 0 replies; 32+ messages in thread
From: Dario Faggioli @ 2014-09-12 16:51 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: keir, Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	lccycc123, ian.jackson, xen-devel, JBeulich, Wei Liu



On mer, 2014-09-03 at 00:24 -0400, Elena Ufimtseva wrote:

> --- a/tools/libxl/libxl_dom.c
> +++ b/tools/libxl/libxl_dom.c
> @@ -23,6 +23,7 @@
>  #include <xc_dom.h>
>  #include <xen/hvm/hvm_info_table.h>
>  #include <xen/hvm/hvm_xs_strings.h>
> +#include <libxl_vnuma.h>
>  
>  libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
>  {
> @@ -227,12 +228,114 @@ static void hvm_set_conf_params(xc_interface *handle, uint32_t domid,
>                      libxl_defbool_val(info->u.hvm.nested_hvm));
>  }
>  
> +/* sets vnode_to_pnode map. */
> +static int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
> +                        libxl_domain_build_info *info)
>
Indentation.

> +{
> +    unsigned int i, n;
> +    int nr_nodes = 0;
> +    uint64_t *vnodes_mem;
> +    unsigned long long *nodes_claim = NULL;
> +    libxl_numainfo *ninfo = NULL;
> +
> +    if (info->vnuma_vnodemap == NULL) {
> +        info->vnuma_vnodemap = libxl__calloc(gc, info->vnodes,
> +                                      sizeof(*info->vnuma_vnodemap));
> +    }
> +
So, help me a bit, do you mind? :-D

If I'm not wrong, here, vnuma_vnodemap is NULL (only?) if no vnode to
pnode mapping has been specified in the config file, and so you have to
allocate it for the first time. In this case, info->vnodes will be 1, so
you're allocating a 1 element array (which is not wrong, I'm just
asking). Is this correct?

If yes, I repeat what I said while reviewing one of the other patches: I
think this can be done, and is better done, in
libxl__domain_build_info_setdefault().

Or am I missing something?

> +    /* default setting. */
> +    for (i = 0; i < info->vnodes; i++)
> +        info->vnuma_vnodemap[i] = LIBXC_VNUMA_NO_NODE;
> +
This too, of course. BTW, LIBXC_ ?

> +    /* Get NUMA info. */
> +    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> +    if (ninfo == NULL)
> +        return ERROR_FAIL;
>
The error reporting is ok, this time. However, I think you should hold
the error code in an rc variable, have a 'goto out;' here and, at the
'out' label, issue the proper cleanups.

There may not be much to clean up at this very stage, but it's a pattern
that is usually pretty much always followed in libxl.

> +    /* Nothing to see if only one NUMA node. */
> +    if (nr_nodes <= 1)
> +        return 0;
> +
And in fact, once you're here, you've got at least
libxl_numainfo_list_free() disposal function to call. That's why I was
saying above, have an rc variable and return only at the bottom, using a
goto label.

> +    vnodes_mem = info->vnuma_mem;
> +    /*
> +     * TODO: change algorithm. The current just fits the nodes
> +     * by its memory sizes. If no p-node found, will be used default
> +     * value of LIBXC_VNUMA_NO_NODE.
> +     */
> +    nodes_claim = libxl__calloc(gc, info->vnodes, sizeof(*nodes_claim));
> +    if ( !nodes_claim )
> +        return ERROR_FAIL;
> +
Same as above, about the 'goto out'.

However, rather than changing the algorithm, I think this last part of
the function (the one below this very comment) should just go away or,
at most, be moved somewhere else.

I'll try as hard as I can to explain how I think this code and the
existing NUMA automatic placement should interact. To achieve that, I
may also consider sending some code, in the form of a draft patch (but
that will likely happen on Monday).

> +    libxl_for_each_set_bit(n, info->nodemap)
> +    {
> +        for (i = 0; i < info->vnodes; i++)
> +        {
> +            unsigned long mem_sz = vnodes_mem[i] << 20;
> +            if ((nodes_claim[n] + mem_sz <= ninfo[n].free) &&
> +                 /* vnode was not set yet. */
> +                 (info->vnuma_vnodemap[i] == LIBXC_VNUMA_NO_NODE ) )
> +            {
> +                info->vnuma_vnodemap[i] = n;
> +                nodes_claim[n] += mem_sz;
> +            }
> +        }
> +    }
> +
So, AFAIUI, in the absence of any indication from the user, you are trying
to 'compact' things as much as possible, i.e., putting the vnodes on the
smallest possible number of pnodes.

This is fine, but does not belong here (more details below).

> +    return 0;
> +}
> +
> +/*
> + * Builds vnode memory regions from configuration info
> + * for vnuma nodes with multiple regions.
> + */
> +static int libxl__build_vnuma_ranges(libxl__gc *gc,
> +                              uint32_t domid,
> +                              /* IN: mem sizes in megabytes */
> +                              libxl_domain_build_info *b_info,
> +                              /* OUT: linux NUMA blocks addresses */
> +                              vmemrange_t **memrange)
>
This is going away, right?

>  int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>                libxl_domain_config *d_config, libxl__domain_build_state *state)
>  {
>      libxl_domain_build_info *const info = &d_config->b_info;
>      libxl_ctx *ctx = libxl__gc_owner(gc);
>      char *xs_domid, *con_domid;
> +    struct vmemrange *memrange;
>      int rc;
>  
>      if (xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus) != 0) {
> @@ -240,6 +343,20 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>          return ERROR_FAIL;
>      }
>  
> +    if (libxl__build_vnuma_ranges(gc, domid, info, &memrange) != 0) {
> +        LOG(DETAIL, "Failed to build vnuma nodes memory ranges.\n");
> +        return ERROR_FAIL;
> +
> +    }
> +
> +    /*
> +     * NUMA placement and vNUMA autoplacement handling:
> +     * If numa_placement is set to default, do not use vnode to pnode
> +     * mapping as automatic placement algorithm will find best numa nodes.
> +     * If numa_placement is not used, we can try and use domain vnode
> +     * to pnode mask.
> +     */
> +
>      /*
>       * Check if the domain has any CPU or node affinity already. If not, try
>       * to build up the latter via automatic NUMA placement. In fact, in case
> @@ -298,7 +415,33 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>                                     NULL, &cpumap_soft);
>  
>          libxl_bitmap_dispose(&cpumap_soft);
> +
> +        /*
> +         * If vnode_to_pnode mask was defined, dont use it if we automatically
> +         * place domain on NUMA nodes, just give warning.
> +         */
> +        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
> +            LOG(INFO, "Automatic NUMA placement for domain is turned on. \
> +                vnode to physical nodes mapping will not be used.");
> +        }
> +        if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
> +            LOG(ERROR, "Failed to build vnode to pnode map\n");
> +            return ERROR_FAIL;
> +        }
> +    } else {
> +        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
> +                if (!libxl__vnodemap_is_usable(gc, info)) {
> +                    LOG(ERROR, "Defined vnode to pnode domain map cannot be used.\n");
> +                    return ERROR_FAIL;
> +                }
> +        } else {
> +            if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
> +                LOG(ERROR, "Failed to build vnode to pnode map.\n");
> +                return ERROR_FAIL;
> +            }
> +        }
>      }
> +
The additional vnuma_autoplacement bool switch is going away, so this is
changing a lot in the next version, and I'll comment directly on that.

About the interactions between automatic placement and vNUMA, here's
what I think.

There should exist the following situations:
 1) the user does not specify *any* vNUMA topology (or explicitly ask
    for 1 node, which is pointless, but possible), in the config file;
 2) the user specifies a vNUMA topology, and also a vnode to pnode map,
    in the config file;
 3) the user specifies a vNUMA topology, but _not_ a vnode to pnode
    map, in the config file;

Am I missing anything?

In all of the above cases, I think libxc should always receive a well
filled and consistent dom->vnode_to_pnode structure, and build the
domain image according to that. What changes is where and how we
populate such a vnode_to_pnode structure.

Here's how I'd proceed:

1) nothing to do, wrt vNUMA. In my opinion,
   libxl__domain_build_info_setdefault() should already set all the
   vNUMA libxl fields to match this situation, and we should not touch
   them in this case. libxl__dom_vnuma_init() (see below) will do the
   libxl-->libxc 'translation', and we're done.
   Note that this does not have any relationship or dependency with
   automatic NUMA placement, or with the existing numa_placement
   boolean build_info switch. That may or may not be enabled, for
   reasons vNUMA does not have to care about or deal with!

2) quite similar to 1: we have to do something, but nothing too fancy or
   complicated. In fact, the values of the vNUMA libxl fields are again
   already set for us to what we want to just pass down to libxc... The
   only difference is that they've been set by xl, rather than by
   libxl__domain_build_info_setdefault(). :-)
   The only *practical* difference is that, as soon as we recognize that
   we're being explicitly passed a vnode to pnode map, we should disable
   automatic placement. That can easily happen by means of the existing
   boolean numa_placement switch, in build_info. What I'd do is to set
   it to false from within xl, in the same code branch where the
   vnode_to_pnode config entry is parsed (look at what happens when, in
   xl, we find a hard or soft affinity being explicitly set); there is a
   small sketch of this right after this list.

3) in this case, since the user has not been explicit about vnode to
   pnode placement, we need to figure that out ourselves, and that is
   the job of NUMA automatic placement. What we hence do is leave the
   automatic placement enabled, and modify the algorithm in order for it
   to:
    - take into account the vNUMA topology, when looking for an optimal
      placement solution,
    - provide a valid vnode_to_pnode map, as an output.
   At that point, the resulting vnode_to_pnode map will be passed down
   to libxc, as in all the previous cases.
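
Just to make the xl side of case 2 above a bit more concrete, here is a
minimal sketch (illustrative only, not meant as the actual code:
parse_vnuma_pnode_map() is a made-up helper, while libxl_defbool_set()
and the numa_placement field in build_info are the existing bits I am
referring to):

/* Hypothetical helper: fills b_info->vnuma_vnodemap from the config
 * file and returns 0 iff an explicit vnode to pnode map was found. */
static int parse_vnuma_pnode_map(const XLU_Config *config,
                                 libxl_domain_build_info *b_info);

static void xl_parse_vnuma(const XLU_Config *config,
                           libxl_domain_build_info *b_info)
{
    /* ... vnodes, memory sizes, distances, vcpu map parsed here ... */

    if (parse_vnuma_pnode_map(config, b_info) == 0) {
        /*
         * The user gave an explicit vnode to pnode map, so automatic
         * NUMA placement must not override it (same pattern as for an
         * explicitly set hard or soft vcpu affinity).
         */
        libxl_defbool_set(&b_info->numa_placement, false);
    }
}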

I understand that this may look complex, but this is probably one of the
cases where the explanation is more complex than the code itself! :-P

In any case, I felt like it was important for me to provide my view on
how this should be implemented. I also appreciate that, especially
modifying the automatic placement logic, may be a bit hard for someone
who is not me. So, if Elena wants to have a go at this, please, be my
guest. :-)

In case I don't see anything on xen-devel, as I said, I'll hack up a
prototype on Monday, and send it on the list.
 
> +/*
> + * Function fills the xc_dom_image with memory sizes for later
> + * use in domain memory allocator. No regions here are being
> + * filled and used in allocator as the regions do belong to one node.
> + */
> +static int libxl__dom_vnuma_init(struct libxl_domain_build_info *info,
> +                                 struct xc_dom_image *dom)
> +{
> +    errno = ERROR_INVAL;
> +
> +    if (info->vnodes == 0)
> +        return -1;
> +
> +    info->vmemranges = info->vnodes;
> +
> +    dom->vnode_to_pnode = (unsigned int *)malloc(
> +                            info->vnodes * sizeof(*info->vnuma_vnodemap));
> +    dom->numa_memszs = (uint64_t *)malloc(
> +                          info->vnodes * sizeof(*info->vnuma_mem));
> +
This is still libxl, so libxl__malloc() and friends are still the right
calls to use, I think. If you don't want the memory you're allocating to
be garbage collected automatically, just use NOGC.

> +    errno = ERROR_FAIL;
> +    if ( dom->numa_memszs == NULL || dom->vnode_to_pnode == NULL ) {
> +        info->vnodes = 0;
> +        if (dom->vnode_to_pnode)
> +            free(dom->vnode_to_pnode);
> +        if (dom->numa_memszs)
> +            free(dom->numa_memszs);
> +        return -1;
> +    }
>
Don't set errno, and return only libxl error codes in libxl. However, if
you use the proper libxl__*alloc() variant, the error handling will just
go away! :-)

> +    memcpy(dom->numa_memszs, info->vnuma_mem,
> +            sizeof(*info->vnuma_mem) * info->vnodes);
> +    memcpy(dom->vnode_to_pnode, info->vnuma_vnodemap,
> +            sizeof(*info->vnuma_vnodemap) * info->vnodes);
> +
> +    dom->vnodes = info->vnodes;
> +
> +    return 0;
> +}
> +
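
Just to illustrate (a rough sketch, not the actual code; libxl__calloc()
and NOGC are the existing libxl internals I mean, and they never return
NULL), the whole thing could then shrink to something like:

static int libxl__dom_vnuma_init(const libxl_domain_build_info *info,
                                 struct xc_dom_image *dom)
{
    if (info->vnodes == 0)
        return ERROR_INVAL;

    /* NOGC: not tied to a gc, freed explicitly by the caller later. */
    dom->vnode_to_pnode = libxl__calloc(NOGC, info->vnodes,
                                        sizeof(*dom->vnode_to_pnode));
    dom->numa_memszs = libxl__calloc(NOGC, info->vnodes,
                                     sizeof(*dom->numa_memszs));

    memcpy(dom->numa_memszs, info->vnuma_mem,
           info->vnodes * sizeof(*info->vnuma_mem));
    memcpy(dom->vnode_to_pnode, info->vnuma_vnodemap,
           info->vnodes * sizeof(*info->vnuma_vnodemap));

    dom->vnodes = info->vnodes;

    return 0;
}
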
Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



* Re: [PATCH v10 5/9] libxl: vnuma types declararion
       [not found]       ` <1409826927.15057.22.camel@kazak.uk.xensource.com>
  2014-09-08 16:13         ` Dario Faggioli
@ 2014-09-16  6:17         ` Elena Ufimtseva
  2014-09-16  7:16           ` Dario Faggioli
  1 sibling, 1 reply; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-16  6:17 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Keir Fraser, Stefano Stabellini, George Dunlap, Matt Wilson,
	Dario Faggioli, Li Yechen, Ian Jackson, xen-devel, Jan Beulich


On Thu, Sep 4, 2014 at 6:35 AM, Ian Campbell <Ian.Campbell@citrix.com>
wrote:

> On Wed, 2014-09-03 at 23:48 -0400, Elena Ufimtseva wrote:
> > > What happens if a guest has 33 vcpus?
> > >
> > > Or maybe the output of this is a single vcpu number?
> > >
> > >> +    ("vdistance",       Array(uint32, "num_vdistance")),
> > >
> > > nodes to distances? Can num_vdistance ever differ from vnodes?
> >
> > num_vdistance is a square of vnodes.
> > >
> > > I thought distances were between two nodes, in which case doesn't this
> > > need to be multidimensional?
> >
> > yes, but parser just fills it on per-row order, treating it as it is a
> > multidimensional array.
>
> I suppose this is a workaround for the lack of multidimensional arrays
> in the IDL?
>
> If this is to be done then it needs a comment I think. In particular the
> stride length needs to be clearly defined.
>
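
For reference, the intended layout is row-major with a stride of vnodes,
i.e. something like (illustrative snippet only, vnuma_distance() is not
in the patch):

/* Distance from vnode i to vnode j, assuming the flat vdistance array
 * is filled row by row and num_vdistance == vnodes * vnodes. */
static inline uint32_t vnuma_distance(const uint32_t *vdistance,
                                      unsigned int vnodes,
                                      unsigned int i, unsigned int j)
{
    return vdistance[i * vnodes + j];
}
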
> > > You appear to unconditionally overwrite whatever the users might have
> > > put here.
> > >
> > >> +    ("vnuma_autoplacement",  libxl_defbool),
> > >
> > > Why/how does this differ from the existing numa placement option?
> >
> >
> > Basically, its meaning can be briefly described as to try to build
> > vnode to pnode mapping if automatic vnuma placement
> > enabled, or try to use the user-defined one.
> > But I am thinking maybe you are right, and there is no need in that
> > vnuma_autoplacement at all.
> > Just to remove it, and if numa placement is in automatic mode, just
> > use it. If not, try to use vnode to pnode mapping.
> >
> > Now I think that is what Dario was talking about for some time )
> >
> >
> > Ok, now going back to these types.
> >
> >   ("vnodes",          uint32),
> >   ("vmemranges",      uint32),
> >   ("vnuma_mem",       Array(uint64, "num_vnuma_mem")),
> >   ("vnuma_vcpumap",   Array(uint32, "num_vnuma_vcpumap")),
> >   ("vdistance",       Array(uint32, "num_vdistance")),
> >   ("vnuma_vnodemap",  Array(uint32, "num_vnuma_vnondemap")),
> >
> >
> > vnodes -  number of vnuma nodes;
> > vmemranges - number of vmemranges (not used, but added support in
> > Xen); Not sure how to deal with this. Should I just remove it as its
> > not used by tool stack yet?
>
> I think that would be best, otherwise we risk tying ourselves to an API
> which might not actually work when we come to try and use it.
>
> If you need the concept internally while building you could add such
> things to libxl__domain_build_state instead.
>
> > vnuma_mem - vnode memory sizes (conflicts now with vmemranges);
> > vnuma_vcpumap - map of vnuma nodes and vcpus (length is number of
> > vcpus, each element is vnuma node number);
> > vdistance - distances, num_vdistance = vnodes * vnodes;
> > vnuma_vnodemap - mapping of vnuma nodes to physical NUMA nodes;
> >
> > And vnuma_autoplacement probably not needed here at all.
> >
> > About vmemranges. They will be needed for vnuma nodes what have
> > multiple ranges of memories.
> > I added it in type definition to specify support when calling
> > hypercall. As of now, PV domain have one memory range per node.
> >
> > There is libxl__build_vnuma_ranges function which for PV domain
> > basically converts vnuma memory node sizes into ranges.
> > But for PV domain there is only one range per vnode.
> >
> > Maybe there is a better way of doing this, maybe even remove confusing
> > at this point vmemranges?
>
> That would help, I think.
>
> Then I would take the rest and wrap them in a libxl_vnode_info struct:
>
> libxl_vnode_info = Struct("libxl_vnode_info", [
>     ("mem", MemKB), # Size of this node's memory
>     ("distances", Array(...)), # Distances from this node to the others
> (single dimensional array)
>     ("pnode", TYPE), # which pnode this vnode is associated with
> ])
>
> Then in the (b|c)_config
>      ("vnuma_nodes", Array(libxl_vnode_info...))
>

Hi Ian

Is there a better mechanism to first parse vnuma_nodes, and within it to
parse the distances array?
So let's say the config would look like this:

vnuma_nodes = [ a b c, d e f, g e k,... ]
but c, f, k cannot be arrays. The parser does not expect to find another
string within each element of the topmost string.

So this does not work:

vnuma_nodes  =  [ a b "distance array1", d e "distance array2", ...]
also this does not work:

vnuma_node = [ [], [], [], ..]
It looks like the problem I was trying to avoid by emulating a
multidimensional array for distances still persists here.

Maybe you know some tricky way to get around this?

Thank you!



>
> This avoids the issue of having to have various num_* which are expected
> to always be identical. Except for vnuma_nodes[N].num_distances, but I
> think that is unavoidable.
>
> The only thing I wasn't sure of was how to fit the vcpumap into this.
> AIUI this is a map from vcpu number to vnode number (i.e. for each vcpu
> it tells you which vnuma node it is part of). I presume it isn't
> possible/desirable to have a vcpu be part of multiple vnuma nodes.
>
> One possibility would be to have a libxl_bitmap/cpumap as part of the
> libxl_vnode_info (containing the set of vcpus which are part of this
> node) but that would be prone to allowing a vcpu to be in multiple
> nodes.
>
> Another possibility would be an array in (b|c)_config as you have now,
> which would be annoying because it's num_foo really ought to be the
> ->vcpus count and the IDL doesn't really allow that.
>
> I'm not sure what is the lesser of the two evils here.
>
> Ian.
>
>


-- 
Elena


* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-16  6:17         ` Elena Ufimtseva
@ 2014-09-16  7:16           ` Dario Faggioli
  2014-09-16 12:57             ` Elena Ufimtseva
  0 siblings, 1 reply; 32+ messages in thread
From: Dario Faggioli @ 2014-09-16  7:16 UTC (permalink / raw)
  To: Elena Ufimtseva
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Li Yechen, Ian Jackson, xen-devel, Jan Beulich


On Tue, 2014-09-16 at 02:17 -0400, Elena Ufimtseva wrote:

>         
>         Then I would take the rest and wrap them in a libxl_vnode_info
>         struct:
>         
>         libxl_vnode_info = Struct("libxl_vnode_info", [
>             ("mem", MemKB), # Size of this node's memory
>             ("distances", Array(...)), # Distances from this node to
>         the others (single dimensional array)
>             ("pnode", TYPE), # which pnode this vnode is associated
>         with
>         ])
>         
>         Then in the (b|c)_config
>              ("vnuma_nodes", Array(libxl_vnode_info...))
> 
> 
> Hi Ian
> 
> 
> Is there a better mechanism to parse first vnuma_nodes, and within it
> to parse distances array?
>
Parse? Why parse? The above is the suggested look of the _libxl_
interface, not of the xl config file.

For the xl config file, IMO, what you have in place is already fine,
with the modification of the names of the various options, as I mentioned
in one of my review messages.

> So lets say config will be looking like this:
> 
> 
> vnuma_nodes = [ a b c, d e f, g e k,... ]
> but c, f, k cannot be arrays. Parser does not expect to have one more
> string withing each element of a topmost string.
> 
No need for the config file to look like this!

In the config file, you can have:

vnodes=4
vnodes_memory=[1024, 1024, 2048, 2048]
vnodes_distances=[20, 40]

etc.

Then, while parsing, you fill the libxl data structures defined in the
IDL file as Ian suggested.
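
For instance (purely illustrative sketch: it assumes the libxl_vnode_info
/ vnuma_nodes fields Ian proposed, while xlu_cfg_get_long(),
xlu_cfg_get_list() and xlu_cfg_get_listitem() are the real libxlutil
helpers; error handling omitted), the xl side could do something like:

/* assumes <stdlib.h>, <libxl.h> and <libxlutil.h> are included */
static void parse_vnuma_config(const XLU_Config *config,
                               libxl_domain_build_info *b_info)
{
    XLU_ConfigList *mems;
    long vnodes = 0;
    int nr = 0, i;

    if (xlu_cfg_get_long(config, "vnodes", &vnodes, 0) || vnodes <= 0)
        return; /* no vNUMA topology requested */

    b_info->num_vnuma_nodes = vnodes;
    b_info->vnuma_nodes = calloc(vnodes, sizeof(*b_info->vnuma_nodes));

    if (!xlu_cfg_get_list(config, "vnodes_memory", &mems, &nr, 0) &&
        nr == vnodes) {
        for (i = 0; i < vnodes; i++)
            /* MB in the config file, KB in the libxl field */
            b_info->vnuma_nodes[i].mem =
                atoll(xlu_cfg_get_listitem(mems, i)) * 1024;
    }
    /* distances, vnode to pnode map and vcpu map filled similarly */
}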

> So this does not work:
> 
> 
> vnuma_nodes  =  [ a b "distance array1", d e "distance array2", ...]
> also this does not work:
> 
> 
> vnuma_node = [ [], [], [], ..]
> Looks like that I was trying to avoid with emulating multi dimensional
> array for distance still persists here.
> 
> 
> Maybe you know some tricky way to get around this?
> 
The trick is, just don't do that!!  :-P

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



* Re: [PATCH v10 5/9] libxl: vnuma types declararion
  2014-09-16  7:16           ` Dario Faggioli
@ 2014-09-16 12:57             ` Elena Ufimtseva
  0 siblings, 0 replies; 32+ messages in thread
From: Elena Ufimtseva @ 2014-09-16 12:57 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Keir Fraser, Ian Campbell, Stefano Stabellini, George Dunlap,
	Matt Wilson, Li Yechen, Ian Jackson, xen-devel, Jan Beulich


On Tue, Sep 16, 2014 at 3:16 AM, Dario Faggioli <dario.faggioli@citrix.com>
wrote:

> On Tue, 2014-09-16 at 02:17 -0400, Elena Ufimtseva wrote:
>
> >
> >         Then I would take the rest and wrap them in a libxl_vnode_info
> >         struct:
> >
> >         libxl_vnode_info = Struct("libxl_vnode_info", [
> >             ("mem", MemKB), # Size of this node's memory
> >             ("distances", Array(...)), # Distances from this node to
> >         the others (single dimensional array)
> >             ("pnode", TYPE), # which pnode this vnode is associated
> >         with
> >         ])
> >
> >         Then in the (b|c)_config
> >              ("vnuma_nodes", Array(libxl_vnode_info...))
> >
> >
> > Hi Ian
> >
> >
> > Is there a better mechanism to parse first vnuma_nodes, and within it
> > to parse distances array?
> >
> Parse? Why parse? The above is the suggested look of the _libxl_
> interface, not of the xl config file.
>
> For the xl config file, IMO, what you have in place is already fine,
> with the modification of the name of the various options, as I mentioned
> in one of my review message.
>
> > So lets say config will be looking like this:
> >
> >
> > vnuma_nodes = [ a b c, d e f, g e k,... ]
> > but c, f, k cannot be arrays. Parser does not expect to have one more
> > string withing each element of a topmost string.
> >
> No need for the config file to look like this!
>
> In the config file, you can have:
>
> vnodes=4
> vnodes_memory=[1024, 1024, 2048, 2048]
> vnodes_distances=[20, 40]
>
> etc.
>
> Then, while parsing, you fill the libxl data structures defined in the
> IDL file as Ian suggested.
>
> > So this does not work:
> >
> >
> > vnuma_nodes  =  [ a b "distance array1", d e "distance array2", ...]
> > also this does not work:
> >
> >
> > vnuma_node = [ [], [], [], ..]
> > Looks like that I was trying to avoid with emulating multi dimensional
> > array for distance still persists here.
> >
> >
> > Maybe you know some tricky way to get around this?
> >
> The trick is, just don't do that!!  :-P
>

Thank you Dario, that makes sense; I was somehow thinking about parsing instead )

>
> Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>
>


-- 
Elena


end of thread, other threads:[~2014-09-16 12:57 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-03  4:24 [PATCH v10 3/9] vnuma hook to debug-keys u Elena Ufimtseva
2014-09-03  4:24 ` [PATCH v10 4/9] libxc: Introduce xc_domain_setvnuma to set vNUMA Elena Ufimtseva
2014-09-03 14:53   ` Ian Campbell
2014-09-03  4:24 ` [PATCH v10 5/9] libxl: vnuma types declararion Elena Ufimtseva
2014-09-03 15:03   ` Ian Campbell
2014-09-03 16:04   ` Ian Campbell
2014-09-04  3:48     ` Elena Ufimtseva
     [not found]       ` <1409826927.15057.22.camel@kazak.uk.xensource.com>
2014-09-08 16:13         ` Dario Faggioli
2014-09-08 19:47           ` Elena Ufimtseva
2014-09-16  6:17         ` Elena Ufimtseva
2014-09-16  7:16           ` Dario Faggioli
2014-09-16 12:57             ` Elena Ufimtseva
2014-09-03  4:24 ` [PATCH v10 6/9] libxl: build numa nodes memory blocks Elena Ufimtseva
2014-09-03 15:21   ` Ian Campbell
2014-09-04  4:47     ` Elena Ufimtseva
2014-09-05  3:50     ` Elena Ufimtseva
2014-09-12 10:18   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 7/9] libxc: allocate domain memory for vnuma enabled Elena Ufimtseva
2014-09-03 15:26   ` Ian Campbell
2014-09-12 11:06   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 8/9] libxl: vnuma nodes placement bits Elena Ufimtseva
2014-09-03 15:50   ` Konrad Rzeszutek Wilk
2014-09-03 15:52   ` Ian Campbell
2014-09-12 16:51   ` Dario Faggioli
2014-09-03  4:24 ` [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc Elena Ufimtseva
2014-09-03 15:42   ` Konrad Rzeszutek Wilk
2014-09-11 17:13   ` Dario Faggioli
2014-09-12  2:04     ` Elena Ufimtseva
2014-09-12  9:02       ` Dario Faggioli
2014-09-12  9:31         ` Wei Liu
2014-09-12  9:59           ` Dario Faggioli
2014-09-03 15:37 ` [PATCH v10 3/9] vnuma hook to debug-keys u Konrad Rzeszutek Wilk
