* [PATCH v2 5/7] libxl: vNUMA configuration parser
From: Elena Ufimtseva @ 2013-11-14  3:27 UTC
  To: xen-devel
  Cc: Ian.Campbell, stefano.stabellini, george.dunlap, msw,
	dario.faggioli, lccycc123, ian.jackson, Elena Ufimtseva

Parses guest vNUMA related options, verifies them and
sets default values for erroneous parameters.
To enable vNUMA, vnodes must be present in the config;
all other options can be omitted.

All possible config options are:

vnodes: number of vnuma nodes:
vnodes = 2

vnumamem: vnodes memory in MBytes:
vnumamem = [2048, 2048]

vdistance: distance table or part of it.
A minimal form may be used, with the same-node distance
first and the distance to all other nodes second:
vdistance = [10, 20]
This will expand to a symmetric n x n matrix, e.g. for n = 3:
[10, 20, 20]
[20, 10, 20]
[20, 20, 10]

If not defined or the format is incorrect, the default [10, 20] will be used.
The latest kernel (3.13-rc1) oopses on a non-symmetric distance table;
the next version will introduce an extended distance table in the config.

vnuma_vcpumap: vcpu to vnode mapping. The index is a vcpu
number starting from 0, the value is the node number, also
starting from 0:
vnuma_vcpumap = [1, 0, 1, 0]
node 1 - vcpus 0 and 2
node 0 - vcpus 1 and 3

If verification fails, the default map, with vcpus interleaved
over the vnodes, will be used.

vnuma_vnodemap: vnode to physical node mapping.
The suggested map will be verified, and if the requested mapping
cannot be completed (no NUMA hardware, incorrect mapping,
not enough memory), xl will try to set it with the help of
automatic NUMA placement. If that also fails, non-node-specific
allocation will be used.

vnuma_vnodemap = [1, 0]
vnode0 -> pnode1
vnode1 -> pnode0
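
Putting it all together, a complete guest config fragment using
the values from the examples above:

vnodes = 2
vnumamem = [2048, 2048]
vdistance = [10, 20]
vnuma_vcpumap = [1, 0, 1, 0]
vnuma_vnodemap = [1, 0]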

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 tools/libxl/xl_cmdimpl.c |  268 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 266 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 5bd9b15..3963b75 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
 #include "libxl_json.h"
 #include "libxlutil.h"
 #include "xl.h"
+#include "libxl_vnuma.h"
 
 #define CHK_ERRNO( call ) ({                                            \
         int chk_errno = (call);                                         \
@@ -572,6 +573,74 @@ vcpp_out:
     return rc;
 }
 
+static void vnuma_info_release(libxl_domain_build_info *info)
+{
+    /* free() is a no-op on NULL, no need to check first */
+    info->nr_vnodes = 0;
+    free(info->vnuma_memszs);
+    free(info->vdistance);
+    free(info->vcpu_to_vnode);
+    free(info->vnode_numamap);
+}
+
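+/*
+ * Parse list item i as an unsigned integer.
+ * Returns the parsed value on success, or -EINVAL if the item is
+ * missing, not a number, or not below UINT16_MAX.
+ */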
+static int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+    const char *buf;
+    char *ep;
+    unsigned long ul;
+    int rc = -EINVAL;
+
+    buf = xlu_cfg_get_listitem(list, i);
+    if (!buf)
+        return rc;
+    ul = strtoul(buf, &ep, 10);
+    if (ep == buf)
+        return rc;
+    if (ul >= UINT16_MAX)
+        return rc;
+    return (int)ul;
+}
+
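+/*
+ * Fill the nr_vnodes x nr_vnodes distance matrix with samenode on
+ * the diagonal and othernode everywhere else.
+ */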
+static void vdistance_default(unsigned int *vdistance,
+                              unsigned int nr_vnodes,
+                              unsigned int samenode,
+                              unsigned int othernode)
+{
+    unsigned int i, j;
+    for (i = 0; i < nr_vnodes; i++)
+        for (j = 0; j < nr_vnodes; j++)
+            vdistance[j * nr_vnodes + i] = i == j ? samenode : othernode;
+}
+
+static void vcputovnode_default(unsigned int *vcpu_to_vnode,
+                                unsigned int nr_vnodes,
+                                unsigned int max_vcpus)
+{
+    unsigned int i;
+    if (vcpu_to_vnode == NULL)
+        return;
+    for (i = 0; i < max_vcpus; i++)
+        vcpu_to_vnode[i] = i % nr_vnodes;
+}
+
+/* Split domain memory equally between vNUMA nodes */
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+    unsigned long long vnodemem = 0;
+    unsigned long n;
+    unsigned int i;
+
+    /* In MBytes */
+    vnodemem = (b_info->max_memkb >> 10) / b_info->nr_vnodes;
+    if (vnodemem < MIN_VNODE_SIZE)
+        return -1;
+    /* remainder in MBytes */
+    n = (b_info->max_memkb >> 10) % b_info->nr_vnodes;
+    /* set final sizes in MBytes */
+    for (i = 0; i < (b_info->nr_vnodes - 1); i++)
+        b_info->vnuma_memszs[i] = vnodemem;
+    /* add the remainder to the last node */
+    b_info->vnuma_memszs[i] = vnodemem + n;
+    return 0;
+}
+
 static void parse_config_data(const char *config_source,
                               const char *config_data,
                               int config_len,
@@ -906,7 +975,11 @@ static void parse_config_data(const char *config_source,
     {
         char *cmdline = NULL;
         const char *root = NULL, *extra = "";
-
+        XLU_ConfigList *vnumamemcfg, *vdistancecfg, *vnodemap, *vcpumap;
+        int nr_vnuma_regions, nr_vdist, nr_vnodemap;
+        unsigned long long vnuma_memparsed = 0;
+        unsigned long ul;
+
         xlu_cfg_replace_string (config, "kernel", &b_info->u.pv.kernel, 0);
 
         xlu_cfg_get_string (config, "root", &root, 0);
@@ -923,7 +996,198 @@ static void parse_config_data(const char *config_source,
             fprintf(stderr, "Failed to allocate memory for cmdline\n");
             exit(1);
         }
-
+
+        if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+            if (l > MAX_VNUMA_NODES) {
+                fprintf(stderr, "Too many vnuma nodes, max %d allowed.\n", MAX_VNUMA_NODES);
+                exit(1);
+            }
+            b_info->nr_vnodes = l;
+            /* vNUMA needs at least one vnode and at least one vcpu per vnode */
+            if (b_info->nr_vnodes != 0 && b_info->max_vcpus >= b_info->nr_vnodes) {
+                if (!xlu_cfg_get_list(config, "vnumamem",
+                                      &vnumamemcfg, &nr_vnuma_regions, 0)) {
+                    /*
+                     * If the number of regions parsed != number of nodes, check
+                     * the memory configuration anyway and, if it is ok, adjust
+                     * the total number of nodes. The memory parsed is in MBytes.
+                     */
+                    if (nr_vnuma_regions != b_info->nr_vnodes)
+                        fprintf(stderr, "Number of vnuma memory regions is incorrect. Will attempt to adjust.\n");
+                    b_info->vnuma_memszs = calloc(b_info->nr_vnodes,
+                                                  sizeof(*b_info->vnuma_memszs));
+                    if (b_info->vnuma_memszs == NULL) {
+                        fprintf(stderr, "unable to allocate memory for vnuma ranges.\n");
+                        exit(1);
+                    }
+                    char *ep;
+                    /*
+                     * Parse only nr_vnodes items, even if more or fewer regions
+                     * were given; extras are discarded and shortfalls are
+                     * handled later.
+                     */
+                    for (i = 0; i < b_info->nr_vnodes; i++) {
+                        buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+                        if (!buf) {
+                            fprintf(stderr,
+                                    "xl: Unable to get element %d in vnuma memory list.\n", i);
+                            break;
+                        }
+                        ul = strtoul(buf, &ep, 10);
+                        if (ep == buf) {
+                            fprintf(stderr,
+                                    "xl: Invalid argument parsing vnumamem: %s.\n", buf);
+                            break;
+                        }
+                        /* 32MB is the minimum size for a node, taken from Linux */
+                        if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+                            fprintf(stderr, "xl: vnuma memory %lu is not within the %u - %u range.\n",
+                                    ul, MIN_VNODE_SIZE, UINT32_MAX);
+                            break;
+                        }
+                        /* memory in MBytes */
+                        b_info->vnuma_memszs[i] = ul;
+                    }
+                    /* Sum the parsed vNUMA memory for verification; only
+                     * nr_vnodes entries were stored above. */
+                    for (i = 0; i < b_info->nr_vnodes; i++)
+                        vnuma_memparsed = vnuma_memparsed + (b_info->vnuma_memszs[i]);
+                    /* Now we have all inputs. Check for misconfiguration and adjust if needed. */
+                    /* Is the amount of memory for vnodes the same as the total? */
+                    if ((vnuma_memparsed << 10) == b_info->max_memkb) {
+                        if (b_info->nr_vnodes != nr_vnuma_regions) {
+                            fprintf(stderr, "xl: vnuma memory regions look incorrect, will use the first %d.\n",
+                                    b_info->nr_vnodes);
+                        }
+                    } else {
+                        fprintf(stderr, "WARNING: vNUMA memory %llu Kbytes is %s than total memory.\n",
+                                vnuma_memparsed << 10,
+                                (vnuma_memparsed << 10) > b_info->max_memkb ? "more" : "less");
+                        fprintf(stderr, "Each vnode will get equal memory size of %lu Kbytes.\n",
+                                b_info->max_memkb / b_info->nr_vnodes);
+                        if (split_vnumamem(b_info) < 0) {
+                            vnuma_info_release(b_info);
+                            exit(1);
+                        }
+                    }
+                } else {
+                    b_info->vnuma_memszs = calloc(b_info->nr_vnodes,
+                                                  sizeof(*b_info->vnuma_memszs));
+                    if (b_info->vnuma_memszs == NULL) {
+                        fprintf(stderr, "unable to allocate memory for vnuma ranges.\n");
+                        exit(1);
+                    }
+                    fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
+                    fprintf(stderr, "Using default equal vnode memory size %lu Kbytes to cover %lu Kbytes.\n",
+                            b_info->max_memkb / b_info->nr_vnodes,
+                            b_info->max_memkb);
+                    if (split_vnumamem(b_info) < 0) {
+                        vnuma_info_release(b_info);
+                        exit(1);
+                    }
+                }
+                b_info->vdistance = calloc(b_info->nr_vnodes * b_info->nr_vnodes,
+                                           sizeof(*b_info->vdistance));
+                if (b_info->vdistance == NULL) {
+                    vnuma_info_release(b_info);
+                    exit(1);
+                }
+                if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0) &&
+                    nr_vdist == 2) {
+                    /*
+                     * If only two elements are in the vdistance list, consider
+                     * first as value for same node distance, the second as the
+                     * rest of distances.
+                     * The following is required right now to avoid non-symmetrical
+                     * distance table as it may break latest kernel.
+                     * TODO: Better way to analyze extended distance table, possibly
+                     * OS specific.
+                     */
+                    int d1, d2;
+                    d1 = get_list_item_uint(vdistancecfg, 0);
+                    d2 = get_list_item_uint(vdistancecfg, 1);
+                    if (d1 >= 0 && d2 >= 0)
+                        vdistance_default(b_info->vdistance, b_info->nr_vnodes, d1, d2);
+                    else
+                        /* invalid list items, fall back to the default table */
+                        vdistance_default(b_info->vdistance, b_info->nr_vnodes, 10, 20);
+                } else
+                    vdistance_default(b_info->vdistance, b_info->nr_vnodes, 10, 20);
+
+                b_info->vcpu_to_vnode = (unsigned int *)calloc(b_info->max_vcpus,
+                                                        sizeof(*b_info->vcpu_to_vnode));
+                if (b_info->vcpu_to_vnode == NULL) {
+                    vnuma_info_release(b_info);
+                    exit(1);
+                }
+                if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
+                                      &vcpumap, &nr_vnodemap, 0)) {
+                    if (nr_vnodemap == b_info->max_vcpus) {
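+                        /*
+                         * Track successfully parsed vcpu entries in a bitmask;
+                         * this assumes nr_vnodemap fits within the mask width.
+                         */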
+                        unsigned int vcpumask = 0, smask;
+                        int vnode;
+                        smask = ~(~0U << nr_vnodemap);
+                        for (i = 0; i < nr_vnodemap; i++) {
+                            vnode = get_list_item_uint(vcpumap, i);
+                            if (vnode >= 0) {
+                                vcpumask |= (1U << i);
+                                b_info->vcpu_to_vnode[i] = vnode;
+                            }
+                        }
+                        /* Did the map cover all vcpus? */
+                        if (((vcpumask & smask) + 1) != (1U << nr_vnodemap)) {
+                            /* no, using default */
+                            vcputovnode_default(b_info->vcpu_to_vnode,
+                                                b_info->nr_vnodes,
+                                                b_info->max_vcpus);
+                        }
+                    } else
+                        vcputovnode_default(b_info->vcpu_to_vnode,
+                                            b_info->nr_vnodes,
+                                            b_info->max_vcpus);
+                }
+                else
+                    vcputovnode_default(b_info->vcpu_to_vnode,
+                                        b_info->nr_vnodes,
+                                        b_info->max_vcpus);
+                /* Is there a mapping to physical NUMA nodes? */
+                if (!xlu_cfg_get_list(config, "vnuma_vnodemap", &vnodemap,
+                                      &nr_vnodemap, 0)) {
+                    /*
+                     * If not specified or incorrect, the mapping will be defined
+                     * later, based on the machine architecture, configuration
+                     * and memory available when creating the domain.
+                     */
+                    if (nr_vnodemap == b_info->nr_vnodes) {
+                        b_info->vnode_numamap = (unsigned int *)calloc(b_info->nr_vnodes,
+                                                sizeof(*b_info->vnode_numamap));
+                        if (b_info->vnode_numamap == NULL) {
+                            vnuma_info_release(b_info);
+                            exit(1);
+                        }
+
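+                        /*
+                         * Track successfully parsed pnode entries in a bitmask,
+                         * as above; assumes nr_vnodemap fits within the mask width.
+                         */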
+                        unsigned int vnodemask = 0, smask;
+                        int pnode;
+                        smask = ~(~0U << nr_vnodemap);
+                        for (i = 0; i < nr_vnodemap; i++) {
+                            pnode = get_list_item_uint(vnodemap, i);
+                            if (pnode >= 0) {
+                                vnodemask |= (1U << i);
+                                b_info->vnode_numamap[i] = pnode;
+                            }
+                        }
+                        /* Did the map cover all vnodes? */
+                        if (((vnodemask & smask) + 1) != (1U << nr_vnodemap)) {
+                            fprintf(stderr, "WARNING: Not all vnodes were covered by vnuma_vnodemap.\n");
+                            free(b_info->vnode_numamap);
+                            b_info->vnode_numamap = NULL;
+                        }
+                    }
+                }
+            }
+            else
+                b_info->nr_vnodes = 0;
+        }
+        else
+            b_info->nr_vnodes = 0;
+
         xlu_cfg_replace_string (config, "bootloader", &b_info->u.pv.bootloader, 0);
         switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
                                       &b_info->u.pv.bootloader_args, 1))
-- 
1.7.10.4
