From mboxrd@z Thu Jan 1 00:00:00 1970
From: Konrad Rzeszutek Wilk
Subject: Re: [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc
Date: Wed, 3 Sep 2014 11:42:15 -0400
Message-ID: <20140903154215.GB4262@laptop.dumpdata.com>
References: <1409718258-3276-1-git-send-email-ufimtseva@gmail.com>
 <1409718258-3276-7-git-send-email-ufimtseva@gmail.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline
In-Reply-To: <1409718258-3276-7-git-send-email-ufimtseva@gmail.com>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: Elena Ufimtseva
Cc: keir@xen.org, Ian.Campbell@citrix.com, stefano.stabellini@eu.citrix.com,
 george.dunlap@eu.citrix.com, msw@linux.com, dario.faggioli@citrix.com,
 lccycc123@gmail.com, ian.jackson@eu.citrix.com, xen-devel@lists.xen.org,
 JBeulich@suse.com
List-Id: xen-devel@lists.xenproject.org

On Wed, Sep 03, 2014 at 12:24:18AM -0400, Elena Ufimtseva wrote:
> Parses the vnuma topology: number of nodes and memory
> ranges. If not defined, initializes vnuma with
> only one node and a default topology. This one node covers
> all domain memory and has all vcpus assigned to it.
>
> Signed-off-by: Elena Ufimtseva

Reviewed-by: Konrad Rzeszutek Wilk
but I am not a native English speaker, so on the docs part it might be a
good thing for such a person to look it over.

> ---
>  docs/man/xl.cfg.pod.5    |   77 +++++++++
>  tools/libxl/xl_cmdimpl.c |  433 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 510 insertions(+)
>
> diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
> index f1fc906..2ee2cbc 100644
> --- a/docs/man/xl.cfg.pod.5
> +++ b/docs/man/xl.cfg.pod.5
> @@ -264,6 +264,83 @@ if the values of B<memory> and B<maxmem> differ.
>  A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
>  it will crash.
>
> +=item B<vnodes>
> +
> +Number of vNUMA nodes the guest will be initialized with on boot.
> +A PV guest will have one vnuma node by default.
> +
> +=item B<vnuma_mem>
> +
> +List of memory sizes for each node, in MBytes. The number of items listed
> +must match the number of vnodes. Domain creation will fail if the sum of
> +all vnode memory sizes does not match the domain memory, or if there are
> +missing nodes. If not specified, memory will be split equally between the
> +vnodes. The current minimum memory size for one node is 32MB.
> +
> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
> +Total amount of guest memory: 6GB
> +
> +=item B<vdistance>
> +
> +Defines the distance table for vNUMA nodes. NUMA topology distances are
> +represented by a two-dimensional square matrix whose element [i,j] is the
> +distance between nodes i and j. The trivial case is where all diagonal
> +elements are equal and the matrix is symmetrical. The vdistance option
> +allows two values, d1 and d2, to be defined: d1 is used for all diagonal
> +elements of the distance matrix and all other elements are set to d2.
> +Distances are usually multiples of 10 in Linux and the same convention is
> +used here.
> +If not specified, the default values [10, 20] are used. For a single node
> +the default distance is [10].
> +
> +Example:
> +vnodes = 3
> +vdistance=[10, 20]
> +will create this distance table (this is also the default setting):
> +[10, 20, 20]
> +[20, 10, 20]
> +[20, 20, 10]
> +
> +=item B<vnuma_vcpumap>
> +
> +Defines the vcpu to vnode mapping as a list of integers.
> +The position in the list is the vcpu number, and the value is the vnode
> +number to which the vcpu will be assigned.
> +Current limitations:
> +- A vNUMA node must have at least one vcpu, otherwise the default
> +  vcpu_to_vnode map will be used.
> +- The total number of vnodes cannot be bigger than the number of vcpus.
> +
> +Example:
> +Map of 4 vcpus to 2 vnodes:
> +0,1 vcpu -> vnode0
> +2,3 vcpu -> vnode1
> +
> +vnuma_vcpumap = [0, 0, 1, 1]
> + 4 vcpus here -  0  1  2  3
> +
> +=item B<vnuma_vnodemap>
> +
> +List of physical node numbers; the position in the list represents the
> +vnode number. Used for manual placement of vnuma nodes on physical NUMA
> +nodes. Will not be used if automatic NUMA placement is active.
> +
> +Example:
> +Assume a NUMA machine with 4 physical nodes. Place vnuma node 0 on pnode 2
> +and vnuma node 1 on pnode 3:
> +vnode0 -> pnode2
> +vnode1 -> pnode3
> +
> +vnuma_vnodemap=[2, 3]
> +The first vnode will be placed on physical node 2, the second on node 3.
> +
> +=item B<vnuma_autoplacement>
> +
> +If set to 1 and automatic NUMA placement is enabled, the best physical
> +nodes to place the vnuma nodes on will be chosen automatically, and
> +vnuma_vnodemap will be ignored. Automatic NUMA placement is enabled if the
> +domain has no pinned cpus.
> +If vnuma_autoplacement is set to 0, the vnodes will be placed on the NUMA
> +nodes set in vnuma_vnodemap, provided there is enough memory on those
> +physical nodes. If not, the allocation will be made on any of the available
> +nodes and may span multiple physical NUMA nodes.
> +
>  =back
>
>  =head3 Event Actions
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index 409a795..1af2250 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -40,6 +40,7 @@
>  #include "libxl_json.h"
>  #include "libxlutil.h"
>  #include "xl.h"
> +#include "libxl_vnuma.h"
>
>  /* For calls which return an errno on failure */
>  #define CHK_ERRNOVAL( call ) ({ \
> @@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
>      }
>  }
>
> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
> +{
> +    const char *buf;
> +    char *ep;
> +    unsigned long ul;
> +    int rc = -EINVAL;
> +
> +    buf = xlu_cfg_get_listitem(list, i);
> +    if (!buf)
> +        return rc;
> +    ul = strtoul(buf, &ep, 10);
> +    if (ep == buf)
> +        return rc;
> +    if (ul >= UINT16_MAX)
> +        return rc;
> +    return (unsigned int)ul;
> +}
> +
> +static void vdistance_set(unsigned int *vdistance,
> +                          unsigned int nr_vnodes,
> +                          unsigned int samenode,
> +                          unsigned int othernode)
> +{
> +    unsigned int idx, slot;
> +    for (idx = 0; idx < nr_vnodes; idx++)
> +        for (slot = 0; slot < nr_vnodes; slot++)
> +            *(vdistance + slot * nr_vnodes + idx) =
> +                idx == slot ? samenode : othernode;
> +}
> +
> +static void vcputovnode_default(unsigned int *cpu_to_node,
> +                                unsigned int nr_vnodes,
> +                                unsigned int max_vcpus)
> +{
> +    unsigned int cpu;
> +    for (cpu = 0; cpu < max_vcpus; cpu++)
> +        cpu_to_node[cpu] = cpu % nr_vnodes;
> +}
> +
> +/* Split domain memory between vNUMA nodes equally. */
> +static int split_vnumamem(libxl_domain_build_info *b_info)
> +{
> +    unsigned long long vnodemem = 0;
> +    unsigned long n;
> +    unsigned int i;
> +
> +    if (b_info->vnodes == 0)
> +        return -1;
> +
> +    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
> +    if (vnodemem < MIN_VNODE_SIZE)
> +        return -1;
> +    /* remainder in MBytes. */
> +    n = (b_info->max_memkb >> 10) % b_info->vnodes;
> +    /* get final sizes in MBytes. */
> +    for (i = 0; i < (b_info->vnodes - 1); i++)
> +        b_info->vnuma_mem[i] = vnodemem;
> +    /* add the remainder to the last node.
*/ > + b_info->vnuma_mem[i] = vnodemem + n; > + return 0; > +} > + > +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap, > + unsigned int nr_vnodes) > +{ > + unsigned int i; > + for (i = 0; i < nr_vnodes; i++) > + vnuma_vnodemap[i] = VNUMA_NO_NODE; > +} > + > +/* > + * init vNUMA to "zero config" with one node and all other > + * topology parameters set to default. > + */ > +static int vnuma_default_config(libxl_domain_build_info *b_info) > +{ > + b_info->vnodes = 1; > + /* all memory goes to this one vnode, as well as vcpus. */ > + if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes, > + sizeof(*b_info->vnuma_mem)))) > + goto bad_vnumazerocfg; > + > + if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus, > + sizeof(*b_info->vnuma_vcpumap)))) > + goto bad_vnumazerocfg; > + > + if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes * > + b_info->vnodes, sizeof(*b_info->vdistance)))) > + goto bad_vnumazerocfg; > + > + if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes, > + sizeof(*b_info->vnuma_vnodemap)))) > + goto bad_vnumazerocfg; > + > + b_info->vnuma_mem[0] = b_info->max_memkb >> 10; > + > + /* all vcpus assigned to this vnode. */ > + vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes, > + b_info->max_vcpus); > + > + /* default vdistance is 10. */ > + vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10); > + > + /* VNUMA_NO_NODE for vnode_to_pnode. */ > + vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes); > + > + /* > + * will be placed to some physical nodes defined by automatic > + * numa placement or VNUMA_NO_NODE will not request exact node. > + */ > + libxl_defbool_set(&b_info->vnuma_autoplacement, true); > + return 0; > + > + bad_vnumazerocfg: > + return -1; > +} > + > +static void free_vnuma_info(libxl_domain_build_info *b_info) > +{ > + free(b_info->vnuma_mem); > + free(b_info->vdistance); > + free(b_info->vnuma_vcpumap); > + free(b_info->vnuma_vnodemap); > + > + b_info->vnuma_mem = NULL; > + b_info->vdistance = NULL; > + b_info->vnuma_vcpumap = NULL; > + b_info->vnuma_vnodemap = NULL; > + > + b_info->vnodes = 0; > + b_info->vmemranges = 0; > +} > + > +static int parse_vnuma_mem(XLU_Config *config, > + libxl_domain_build_info **b_info) > +{ > + libxl_domain_build_info *dst; > + XLU_ConfigList *vnumamemcfg; > + int nr_vnuma_regions, i; > + unsigned long long vnuma_memparsed = 0; > + unsigned long ul; > + const char *buf; > + char *ep; > + > + dst = *b_info; > + if (!xlu_cfg_get_list(config, "vnuma_mem", > + &vnumamemcfg, &nr_vnuma_regions, 0)) { > + > + if (nr_vnuma_regions != dst->vnodes) { > + fprintf(stderr, "Number of numa regions (vnumamem = %d) is \ > + incorrect (should be %d).\n", nr_vnuma_regions, > + dst->vnodes); > + goto bad_vnuma_mem; > + } > + > + dst->vnuma_mem = calloc(dst->vnodes, > + sizeof(*dst->vnuma_mem)); > + if (dst->vnuma_mem == NULL) { > + fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n"); > + goto bad_vnuma_mem; > + } > + > + /* > + * Will parse only nr_vnodes times, even if we have more/less regions. > + * Take care of it later if less or discard if too many regions. 
> + */ > + for (i = 0; i < dst->vnodes; i++) { > + buf = xlu_cfg_get_listitem(vnumamemcfg, i); > + if (!buf) { > + fprintf(stderr, > + "xl: Unable to get element %d in vnuma memory list.\n", i); > + goto bad_vnuma_mem; > + } > + > + ul = strtoul(buf, &ep, 10); > + if (ep == buf) { > + fprintf(stderr, "xl: Invalid argument parsing vnumamem: %s.\n", buf); > + goto bad_vnuma_mem; > + } > + > + /* 32Mb is a min size for a node, taken from Linux */ > + if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) { > + fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n", > + ul, MIN_VNODE_SIZE, UINT32_MAX); > + goto bad_vnuma_mem; > + } > + > + /* memory in MBytes */ > + dst->vnuma_mem[i] = ul; > + } > + > + /* Total memory for vNUMA parsed to verify */ > + for (i = 0; i < nr_vnuma_regions; i++) > + vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]); > + > + /* Amount of memory for vnodes same as total? */ > + if ((vnuma_memparsed << 10) != (dst->max_memkb)) { > + fprintf(stderr, "xl: vnuma memory is not the same as domain \ > + memory size.\n"); > + goto bad_vnuma_mem; > + } > + } else { > + dst->vnuma_mem = calloc(dst->vnodes, > + sizeof(*dst->vnuma_mem)); > + if (dst->vnuma_mem == NULL) { > + fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n"); > + goto bad_vnuma_mem; > + } > + > + fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n"); > + fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \ > + to cover %lu Kbytes.\n", > + dst->max_memkb / dst->vnodes, dst->max_memkb); > + > + if (split_vnumamem(dst) < 0) { > + fprintf(stderr, "Could not split vnuma memory into equal chunks.\n"); > + goto bad_vnuma_mem; > + } > + } > + return 0; > + > + bad_vnuma_mem: > + return -1; > +} > + > +static int parse_vnuma_distance(XLU_Config *config, > + libxl_domain_build_info **b_info) > +{ > + libxl_domain_build_info *dst; > + XLU_ConfigList *vdistancecfg; > + int nr_vdist; > + > + dst = *b_info; > + dst->vdistance = calloc(dst->vnodes * dst->vnodes, > + sizeof(*dst->vdistance)); > + if (dst->vdistance == NULL) > + goto bad_distance; > + > + if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) { > + int d1, d2, i; > + /* > + * First value is the same node distance, the second as the > + * rest of distances. The following is required right now to > + * avoid non-symmetrical distance table as it may break latest kernel. > + * TODO: Better way to analyze extended distance table, possibly > + * OS specific. 
> + */ > + > + for (i = 0; i < nr_vdist; i++) { > + d1 = get_list_item_uint(vdistancecfg, i); > + } > + > + d1 = get_list_item_uint(vdistancecfg, 0); > + if (dst->vnodes > 1) > + d2 = get_list_item_uint(vdistancecfg, 1); > + else > + d2 = d1; > + > + if (d1 >= 0 && d2 >= 0) { > + if (d1 < d2) > + fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < %u\n", d1, d2); > + vdistance_set(dst->vdistance, dst->vnodes, d1, d2); > + } else { > + fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n"); > + goto bad_distance; > + } > + } else { > + fprintf(stderr, "Could not parse vnuma distances.\n"); > + vdistance_set(dst->vdistance, dst->vnodes, 10, 20); > + } > + return 0; > + > + bad_distance: > + return -1; > +} > + > +static int parse_vnuma_vcpumap(XLU_Config *config, > + libxl_domain_build_info **b_info) > +{ > + libxl_domain_build_info *dst; > + XLU_ConfigList *vcpumap; > + int nr_vcpumap, i; > + > + dst = *b_info; > + dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus, > + sizeof(*dst->vnuma_vcpumap)); > + if (dst->vnuma_vcpumap == NULL) > + goto bad_vcpumap; > + > + if (!xlu_cfg_get_list(config, "vnuma_vcpumap", > + &vcpumap, &nr_vcpumap, 0)) { > + if (nr_vcpumap == dst->max_vcpus) { > + unsigned int vnode, vcpumask = 0, vmask; > + > + vmask = ~(~0 << nr_vcpumap); > + for (i = 0; i < nr_vcpumap; i++) { > + vnode = get_list_item_uint(vcpumap, i); > + if (vnode >= 0 && vnode < dst->vnodes) { > + vcpumask |= (1 << i); > + dst->vnuma_vcpumap[i] = vnode; > + } > + } > + > + /* Did it covered all vnodes in the vcpu mask? */ > + if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) { > + fprintf(stderr, "WARNING: Not all vnodes were covered \ > + in numa_cpumask.\n"); > + goto bad_vcpumap; > + } > + } else { > + fprintf(stderr, "WARNING: Bad vnuma_vcpumap.\n"); > + goto bad_vcpumap; > + } > + } > + else > + vcputovnode_default(dst->vnuma_vcpumap, > + dst->vnodes, > + dst->max_vcpus); > + return 0; > + > + bad_vcpumap: > + return -1; > +} > + > +static int parse_vnuma_vnodemap(XLU_Config *config, > + libxl_domain_build_info **b_info) > +{ > + libxl_domain_build_info *dst; > + XLU_ConfigList *vnodemap; > + int nr_vnodemap, i; > + > + dst = *b_info; > + > + /* There is mapping to NUMA physical nodes? */ > + dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes, > + sizeof(*dst->vnuma_vnodemap)); > + if (dst->vnuma_vnodemap == NULL) > + goto bad_vnodemap; > + > + if (!xlu_cfg_get_list(config, "vnuma_vnodemap", > + &vnodemap, &nr_vnodemap, 0)) { > + /* > + * If not specified or incorrect, will be defined > + * later based on the machine architecture, configuration > + * and memory availble when creating domain. > + */ > + libxl_defbool_set(&dst->vnuma_autoplacement, false); > + if (nr_vnodemap == dst->vnodes) { > + unsigned int vnodemask = 0, pnode, smask; > + smask = ~(~0 << dst->vnodes); > + for (i = 0; i < dst->vnodes; i++) { > + pnode = get_list_item_uint(vnodemap, i); > + if (pnode >= 0) { > + vnodemask |= (1 << i); > + dst->vnuma_vnodemap[i] = pnode; > + } > + } > + > + /* Did it covered all vnodes in the mask? 
*/ > + if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) { > + fprintf(stderr, "WARNING: Not all vnodes were covered \ > + vnuma_vnodemap.\n"); > + fprintf(stderr, "Automatic placement will be used for vnodes.\n"); > + libxl_defbool_set(&dst->vnuma_autoplacement, true); > + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes); > + } > + } > + else { > + fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n"); > + fprintf(stderr, "Automatic placement will be used for vnodes.\n"); > + libxl_defbool_set(&dst->vnuma_autoplacement, true); > + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes); > + } > + } > + else { > + fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n"); > + fprintf(stderr, "Automatic placement will be used for vnodes.\n"); > + libxl_defbool_set(&dst->vnuma_autoplacement, true); > + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes); > + } > + return 0; > + > + bad_vnodemap: > + return -1; > + > +} > + > +static void parse_vnuma_config(XLU_Config *config, > + libxl_domain_build_info *b_info) > +{ > + long l; > + > + if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) { > + if (l > MAX_VNUMA_NODES) { > + fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n", > + MAX_VNUMA_NODES); > + goto bad_vnuma_config; > + } > + b_info->vnodes = l; > + > + if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement", > + &b_info->vnuma_autoplacement, 0)) > + libxl_defbool_set(&b_info->vnuma_autoplacement, false); > + > + /* Only construct nodes with at least one vcpu. */ > + if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) { > + if (parse_vnuma_mem(config, &b_info) || > + parse_vnuma_distance(config, &b_info) || > + parse_vnuma_vcpumap(config, &b_info) || > + parse_vnuma_vnodemap(config, &b_info)) > + goto bad_vnuma_config; > + } > + else if (vnuma_default_config(b_info)) > + goto bad_vnuma_config; > + } > + /* If vnuma topology is not defined for domain, init one node */ > + else if (vnuma_default_config(b_info)) > + goto bad_vnuma_config; > + return; > + > + bad_vnuma_config: > + fprintf(stderr, "Failed to parse vnuma config or set default vnuma config.\n"); > + free_vnuma_info(b_info); > + exit(1); > +} > + > static void parse_config_data(const char *config_source, > const char *config_data, > int config_len, > @@ -924,6 +1351,12 @@ static void parse_config_data(const char *config_source, > > libxl_defbool_set(&b_info->claim_mode, claim_mode); > > + /* > + * If there is no vnuma in config, "zero" vnuma config > + * will be initialized with one node and other defaults. > + */ > + parse_vnuma_config(config, b_info); > + > if (xlu_cfg_get_string (config, "on_poweroff", &buf, 0)) > buf = "destroy"; > if (!parse_action_on_shutdown(buf, &d_config->on_poweroff)) { > -- > 1.7.10.4 >
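
One small suggestion on the docs (not a blocker): it might help readers to
see all of the options together in one place. Something along these lines,
where the values are made up for illustration and assume a 4 GB, 4-vcpu PV
guest on a host with at least two physical NUMA nodes:

    vcpus               = 4
    memory              = 4096
    vnodes              = 2
    vnuma_mem           = [2048, 2048]
    vdistance           = [10, 20]
    vnuma_vcpumap       = [0, 0, 1, 1]
    vnuma_vnodemap      = [0, 1]
    vnuma_autoplacement = 0

That is, two vnodes splitting the 4 GB evenly, vcpus 0-1 on vnode 0 and
vcpus 2-3 on vnode 1, the default distance table, and manual placement of
the two vnodes on physical nodes 0 and 1.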