From: George Dunlap
Subject: Re: [PATCH 4 of 8] xen: allow for explicitly specifying node-affinity
Date: Tue, 9 Oct 2012 17:47:22 +0100
References: <12134421b216e9c8eef6.1349446102@Solace>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
In-Reply-To: <12134421b216e9c8eef6.1349446102@Solace>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: Dario Faggioli
Cc: Marcus Granado, Andre Przywara, Ian Campbell, Anil Madhavapeddy, Andrew Cooper, Juergen Gross, Ian Jackson, xen-devel@lists.xen.org, Jan Beulich, Daniel De Graaf, Matt Wilson
List-Id: xen-devel@lists.xenproject.org

On Fri, Oct 5, 2012 at 3:08 PM, Dario Faggioli wrote:
> Make it possible to pass the node-affinity of a domain to the hypervisor
> from the upper layers, instead of always being computed automatically.
>
> Note that this also required generalizing the Flask hooks for setting
> and getting the affinity, so that they now deal with both vcpu and
> node affinity.
>
> Signed-off-by: Dario Faggioli
>
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -222,6 +222,7 @@ struct domain *domain_create(
>
>      spin_lock_init(&d->node_affinity_lock);
>      d->node_affinity = NODE_MASK_ALL;
> +    d->auto_node_affinity = 1;
>
>      spin_lock_init(&d->shutdown_lock);
>      d->shutdown_code = -1;
> @@ -362,11 +363,26 @@ void domain_update_node_affinity(struct
>          cpumask_or(cpumask, cpumask, online_affinity);
>      }
>
> -    for_each_online_node ( node )
> -        if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
> -            node_set(node, nodemask);
> +    if ( d->auto_node_affinity )
> +    {
> +        /* Node-affinity is automatically computed from all vcpu-affinities */
> +        for_each_online_node ( node )
> +            if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
> +                node_set(node, nodemask);
>
> -    d->node_affinity = nodemask;
> +        d->node_affinity = nodemask;
> +    }
> +    else
> +    {
> +        /* Node-affinity is provided by someone else, just filter out cpus
> +         * that are either offline or not in the affinity of any vcpus. */
> +        for_each_node_mask ( node, d->node_affinity )
> +            if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) )
> +                node_clear(node, d->node_affinity);
> +    }
> +
> +    sched_set_node_affinity(d, &d->node_affinity);
> +
>      spin_unlock(&d->node_affinity_lock);
>
>      free_cpumask_var(online_affinity);
> @@ -374,6 +390,36 @@ void domain_update_node_affinity(struct
>  }
>
>
> +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
> +{
> +    /* Being affine with no nodes is just wrong */
> +    if ( nodes_empty(*affinity) )
> +        return -EINVAL;
> +
> +    spin_lock(&d->node_affinity_lock);
> +
> +    /*
> +     * Being/becoming explicitly affine to all nodes is not particularly
> +     * useful. Let's take it as the `reset node affinity` command.
> +     */
> +    if ( nodes_full(*affinity) )
> +    {
> +        d->auto_node_affinity = 1;
> +        goto out;
> +    }
> +
> +    d->auto_node_affinity = 0;
> +    d->node_affinity = *affinity;
> +
> +out:
> +    spin_unlock(&d->node_affinity_lock);
> +
> +    domain_update_node_affinity(d);
> +
> +    return 0;
> +}
> +
> +
>  struct domain *get_domain_by_id(domid_t dom)
>  {
>      struct domain *d;
> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -642,6 +642,40 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
>      }
>      break;
>
> +    case XEN_DOMCTL_setnodeaffinity:
> +    case XEN_DOMCTL_getnodeaffinity:
> +    {
> +        domid_t dom = op->domain;
> +        struct domain *d = rcu_lock_domain_by_id(dom);
> +
> +        ret = -ESRCH;
> +        if ( d == NULL )
> +            break;
> +
> +        ret = xsm_nodeaffinity(op->cmd, d);
> +        if ( ret )
> +            goto nodeaffinity_out;
> +
> +        if ( op->cmd == XEN_DOMCTL_setnodeaffinity )
> +        {
> +            nodemask_t new_affinity;
> +
> +            ret = xenctl_bitmap_to_nodemask(&new_affinity,
> +                                            &op->u.nodeaffinity.nodemap);
> +            if ( !ret )
> +                ret = domain_set_node_affinity(d, &new_affinity);
> +        }
> +        else
> +        {
> +            ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
> +                                            &d->node_affinity);
> +        }
> +
> +    nodeaffinity_out:
> +        rcu_unlock_domain(d);
> +    }
> +    break;
> +
>      case XEN_DOMCTL_setvcpuaffinity:
>      case XEN_DOMCTL_getvcpuaffinity:
>      {
> diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -217,6 +217,14 @@ static void cpuset_print(char *set, int
>      *set++ = '\0';
>  }
>
> +static void nodeset_print(char *set, int size, const nodemask_t *mask)
> +{
> +    *set++ = '[';
> +    set += nodelist_scnprintf(set, size-2, mask);
> +    *set++ = ']';
> +    *set++ = '\0';
> +}
> +
>  static void periodic_timer_print(char *str, int size, uint64_t period)
>  {
>      if ( period == 0 )
> @@ -272,6 +280,9 @@ static void dump_domains(unsigned char k
>
>          dump_pageframe_info(d);
>
> +        nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity);
> +        printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr);
> +
>          printk("VCPU information and callbacks for domain %u:\n",
>                 d->domain_id);
>          for_each_vcpu ( d, v )
> diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
> @@ -238,6 +238,33 @@ static inline void
>      list_del_init(&svc->runq_elem);
>  }
>
> +/*
> + * Translates node-affinity mask into a cpumask, so that we can use it during
> + * actual scheduling. That of course will contain all the cpus from all the
> + * set nodes in the original node-affinity mask.
> + *
> + * Note that any serialization needed to access mask safely is complete
> + * responsibility of the caller of this function/hook.
> + */
> +static void csched_set_node_affinity(
> +    const struct scheduler *ops,
> +    struct domain *d,
> +    nodemask_t *mask)
> +{
> +    struct csched_dom *sdom;
> +    int node;
> +
> +    /* Skip idle domain since it doesn't even have a node_affinity_cpumask */
> +    if ( unlikely(is_idle_domain(d)) )
> +        return;
> +
> +    sdom = CSCHED_DOM(d);
> +    cpumask_clear(sdom->node_affinity_cpumask);
> +    for_each_node_mask( node, *mask )
> +        cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask,
> +                   &node_to_cpumask(node));
> +}
> +
>  #define for_each_csched_balance_step(__step) \
>      for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
>
> @@ -260,7 +287,8 @@ csched_balance_cpumask(const struct vcpu
>      struct domain *d = vc->domain;
>      struct csched_dom *sdom = CSCHED_DOM(d);
>
> -    if ( cpumask_full(sdom->node_affinity_cpumask) )
> +    if ( cpumask_full(sdom->node_affinity_cpumask) ||
> +         d->auto_node_affinity == 1 )
>          return -1;
>
>      cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
> @@ -1786,6 +1814,8 @@ const struct scheduler sched_credit_def
>      .adjust         = csched_dom_cntl,
>      .adjust_global  = csched_sys_cntl,
>
> +    .set_node_affinity  = csched_set_node_affinity,
> +
>      .pick_cpu       = csched_cpu_pick,
>      .do_schedule    = csched_schedule,
>
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -588,6 +588,11 @@ int cpu_disable_scheduler(unsigned int c
>      return ret;
>  }
>
> +void sched_set_node_affinity(struct domain *d, nodemask_t *mask)
> +{
> +    SCHED_OP(DOM2OP(d), set_node_affinity, d, mask);
> +}
> +
>  int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity)
>  {
>      cpumask_t online_affinity;
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -279,6 +279,16 @@ typedef struct xen_domctl_getvcpuinfo xe
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
>
>
> +/* Get/set the NUMA node(s) with which the guest has affinity with. */
> +/* XEN_DOMCTL_setnodeaffinity */
> +/* XEN_DOMCTL_getnodeaffinity */
> +struct xen_domctl_nodeaffinity {
> +    struct xenctl_bitmap nodemap;/* IN */
> +};
> +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t);
> +
> +
>  /* Get/set which physical cpus a vcpu can execute on. */
>  /* XEN_DOMCTL_setvcpuaffinity */
>  /* XEN_DOMCTL_getvcpuaffinity */
> @@ -900,6 +910,8 @@ struct xen_domctl {
>  #define XEN_DOMCTL_set_access_required           64
>  #define XEN_DOMCTL_audit_p2m                     65
>  #define XEN_DOMCTL_set_virq_handler              66
> +#define XEN_DOMCTL_setnodeaffinity               67
> +#define XEN_DOMCTL_getnodeaffinity               68
>  #define XEN_DOMCTL_gdbsx_guestmemio            1000
>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
> @@ -913,6 +925,7 @@ struct xen_domctl {
>      struct xen_domctl_getpageframeinfo getpageframeinfo;
>      struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
>      struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
> +    struct xen_domctl_nodeaffinity    nodeaffinity;
>      struct xen_domctl_vcpuaffinity    vcpuaffinity;
>      struct xen_domctl_shadow_op       shadow_op;
>      struct xen_domctl_max_mem         max_mem;
> diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h
> --- a/xen/include/xen/nodemask.h
> +++ b/xen/include/xen/nodemask.h
> @@ -8,8 +8,9 @@
>   * See detailed comments in the file linux/bitmap.h describing the
>   * data type on which these nodemasks are based.
>   *
> - * For details of nodemask_scnprintf() and nodemask_parse(),
> - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
> + * For details of nodemask_scnprintf(), nodelist_scnprintf() and
> + * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse()
> + * in lib/bitmap.c.
>   *
>   * The available nodemask operations are:
>   *
> @@ -48,6 +49,7 @@
>   * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
>   *
>   * int nodemask_scnprintf(buf, len, mask)  Format nodemask for printing
> + * int nodelist_scnprintf(buf, len, mask)  Format nodemask as a list for printing
>   * int nodemask_parse(ubuf, ulen, mask)    Parse ascii string as nodemask
>   *
>   * for_each_node_mask(node, mask)          for-loop node over mask
> @@ -280,6 +282,14 @@ static inline int __first_unset_node(con
>
>  #define nodes_addr(src) ((src).bits)
>
> +#define nodelist_scnprintf(buf, len, src) \
> +    __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES)
> +static inline int __nodelist_scnprintf(char *buf, int len,
> +                                       const nodemask_t *srcp, int nbits)
> +{
> +    return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
> +}
> +
>  #if 0
>  #define nodemask_scnprintf(buf, len, src) \
>      __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
> diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -182,6 +182,8 @@ struct scheduler {
>                                      struct xen_domctl_scheduler_op *);
>      int          (*adjust_global)  (const struct scheduler *,
>                                      struct xen_sysctl_scheduler_op *);
> +    void         (*set_node_affinity) (const struct scheduler *,
> +                                       struct domain *, nodemask_t *);
>      void         (*dump_settings)  (const struct scheduler *);
>      void         (*dump_cpu_state) (const struct scheduler *, int);
>
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -346,8 +346,12 @@ struct domain
>      /* Various mem_events */
>      struct mem_event_per_domain *mem_event;
>
> -    /* Currently computed from union of all vcpu cpu-affinity masks. */
> +    /*
> +     * Can be specified by the user. If that is not the case, it is
> +     * computed from the union of all the vcpu cpu-affinity masks.
> +     */
>      nodemask_t node_affinity;
> +    int auto_node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
>  };
> @@ -416,6 +420,7 @@ static inline void get_knownalive_domain
>      ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
>  }
>
> +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
>  void domain_update_node_affinity(struct domain *d);
>
>  struct domain *domain_create(
> @@ -519,6 +524,7 @@ void sched_destroy_domain(struct domain
>  int sched_move_domain(struct domain *d, struct cpupool *c);
>  long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
>  long sched_adjust_global(struct xen_sysctl_scheduler_op *);
> +void sched_set_node_affinity(struct domain *, nodemask_t *);
>  int sched_id(void);
>  void sched_tick_suspend(void);
>  void sched_tick_resume(void);
> diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
> --- a/xen/include/xsm/xsm.h
> +++ b/xen/include/xsm/xsm.h
> @@ -56,6 +56,7 @@ struct xsm_operations {
>      int (*domain_create) (struct domain *d, u32 ssidref);
>      int (*max_vcpus) (struct domain *d);
>      int (*destroydomain) (struct domain *d);
> +    int (*nodeaffinity) (int cmd, struct domain *d);
>      int (*vcpuaffinity) (int cmd, struct domain *d);
>      int (*scheduler) (struct domain *d);
>      int (*getdomaininfo) (struct domain *d);
> @@ -229,6 +230,11 @@ static inline int xsm_destroydomain (str
>      return xsm_call(destroydomain(d));
>  }
>
> +static inline int xsm_nodeaffinity (int cmd, struct domain *d)
> +{
> +    return xsm_call(nodeaffinity(cmd, d));
> +}
> +
>  static inline int xsm_vcpuaffinity (int cmd, struct domain *d)
>  {
>      return xsm_call(vcpuaffinity(cmd, d));
> diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
> --- a/xen/xsm/dummy.c
> +++ b/xen/xsm/dummy.c
> @@ -634,6 +634,7 @@ void xsm_fixup_ops (struct xsm_operation
>      set_to_dummy_if_null(ops, domain_create);
>      set_to_dummy_if_null(ops, max_vcpus);
>      set_to_dummy_if_null(ops, destroydomain);
> +    set_to_dummy_if_null(ops, nodeaffinity);
>      set_to_dummy_if_null(ops, vcpuaffinity);
>      set_to_dummy_if_null(ops, scheduler);
>      set_to_dummy_if_null(ops, getdomaininfo);
> diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
> --- a/xen/xsm/flask/hooks.c
> +++ b/xen/xsm/flask/hooks.c
> @@ -521,17 +521,19 @@ static int flask_destroydomain(struct do
>                              DOMAIN__DESTROY);
>  }
>
> -static int flask_vcpuaffinity(int cmd, struct domain *d)
> +static int flask_affinity(int cmd, struct domain *d)
>  {
>      u32 perm;
>
>      switch ( cmd )
>      {
>      case XEN_DOMCTL_setvcpuaffinity:
> -        perm = DOMAIN__SETVCPUAFFINITY;
> +    case XEN_DOMCTL_setnodeaffinity:
> +        perm = DOMAIN__SETAFFINITY;
>          break;
>      case XEN_DOMCTL_getvcpuaffinity:
> -        perm = DOMAIN__GETVCPUAFFINITY;
> +    case XEN_DOMCTL_getnodeaffinity:
> +        perm = DOMAIN__GETAFFINITY;
>          break;
>      default:
>          return -EPERM;
> @@ -1473,7 +1475,8 @@ static struct xsm_operations flask_ops =
>      .domain_create = flask_domain_create,
>      .max_vcpus = flask_max_vcpus,
>      .destroydomain = flask_destroydomain,
> -    .vcpuaffinity = flask_vcpuaffinity,
> +    .nodeaffinity = flask_affinity,
> +    .vcpuaffinity = flask_affinity,
>      .scheduler = flask_scheduler,
>      .getdomaininfo = flask_getdomaininfo,
>      .getvcpucontext = flask_getvcpucontext,
> diff --git a/xen/xsm/flask/include/av_perm_to_string.h b/xen/xsm/flask/include/av_perm_to_string.h
> --- a/xen/xsm/flask/include/av_perm_to_string.h
> +++ b/xen/xsm/flask/include/av_perm_to_string.h
> @@ -37,8 +37,8 @@
>     S_(SECCLASS_DOMAIN, DOMAIN__TRANSITION, "transition")
>     S_(SECCLASS_DOMAIN, DOMAIN__MAX_VCPUS, "max_vcpus")
>     S_(SECCLASS_DOMAIN, DOMAIN__DESTROY, "destroy")
> -   S_(SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY, "setvcpuaffinity")
> -   S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY, "getvcpuaffinity")
> +   S_(SECCLASS_DOMAIN, DOMAIN__SETAFFINITY, "setaffinity")
> +   S_(SECCLASS_DOMAIN, DOMAIN__GETAFFINITY, "getaffinity")

The top of this file says, "This file is automatically generated. Do not
edit." I didn't see any files that might have been modified to effect these
changes -- did I miss them? Or is the comment a lie? Or should you find that
file and edit it instead? :-)

>     S_(SECCLASS_DOMAIN, DOMAIN__SCHEDULER, "scheduler")
>     S_(SECCLASS_DOMAIN, DOMAIN__GETDOMAININFO, "getdomaininfo")
>     S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUINFO, "getvcpuinfo")
> diff --git a/xen/xsm/flask/include/av_permissions.h b/xen/xsm/flask/include/av_permissions.h
> --- a/xen/xsm/flask/include/av_permissions.h
> +++ b/xen/xsm/flask/include/av_permissions.h
> @@ -38,8 +38,8 @@
>  #define DOMAIN__TRANSITION              0x00000020UL
>  #define DOMAIN__MAX_VCPUS               0x00000040UL
>  #define DOMAIN__DESTROY                 0x00000080UL
> -#define DOMAIN__SETVCPUAFFINITY         0x00000100UL
> -#define DOMAIN__GETVCPUAFFINITY         0x00000200UL
> +#define DOMAIN__SETAFFINITY             0x00000100UL
> +#define DOMAIN__GETAFFINITY             0x00000200UL

Same thing here.

Other than that, looks good!

 -George

>  #define DOMAIN__SCHEDULER               0x00000400UL
>  #define DOMAIN__GETDOMAININFO           0x00000800UL
>  #define DOMAIN__GETVCPUINFO             0x00001000UL
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
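
P.S. For anyone who wants to get a feel for the new auto vs. explicit
node-affinity semantics without building the hypervisor: below is a rough,
self-contained user-space sketch (not part of this patch) of what
domain_update_node_affinity() now does. Plain 64-bit masks stand in for
nodemask_t/cpumask_t, and the node_to_cpumask[] table, the fake_domain
struct and the update_node_affinity() helper are all made up purely for
illustration.

/* Illustration only: mimics the auto vs. explicit node-affinity logic
 * of domain_update_node_affinity() in this patch, using plain 64-bit
 * masks instead of nodemask_t/cpumask_t. */
#include <stdint.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical topology: which cpus belong to each node. */
static const uint64_t node_to_cpumask[NR_NODES] = {
    0x000fULL,   /* node 0: cpus 0-3   */
    0x00f0ULL,   /* node 1: cpus 4-7   */
    0x0f00ULL,   /* node 2: cpus 8-11  */
    0xf000ULL,   /* node 3: cpus 12-15 */
};

struct fake_domain {
    uint64_t node_affinity;     /* one bit per node */
    int auto_node_affinity;
};

/* cpumask is the union of the online vcpu affinities of the domain. */
static void update_node_affinity(struct fake_domain *d, uint64_t cpumask)
{
    unsigned int node;

    if ( d->auto_node_affinity )
    {
        /* Recompute node-affinity from scratch, from the vcpu affinities. */
        uint64_t nodemask = 0;

        for ( node = 0; node < NR_NODES; node++ )
            if ( node_to_cpumask[node] & cpumask )
                nodemask |= 1ULL << node;

        d->node_affinity = nodemask;
    }
    else
    {
        /* Keep the explicitly provided mask, but drop any node none of
         * whose cpus appears in the vcpu affinities. */
        for ( node = 0; node < NR_NODES; node++ )
            if ( !(node_to_cpumask[node] & cpumask) )
                d->node_affinity &= ~(1ULL << node);
    }
}

int main(void)
{
    struct fake_domain d = { .node_affinity = 0xf, .auto_node_affinity = 1 };
    uint64_t vcpu_cpus = 0x00f0;   /* vcpus can only run on cpus 4-7 */

    update_node_affinity(&d, vcpu_cpus);
    printf("auto:     node_affinity = %#lx\n", (unsigned long)d.node_affinity);

    /* Now pretend the toolstack asked for nodes {0,1} explicitly. */
    d.auto_node_affinity = 0;
    d.node_affinity = 0x3;
    update_node_affinity(&d, vcpu_cpus);
    printf("explicit: node_affinity = %#lx\n", (unsigned long)d.node_affinity);

    return 0;
}

With the masks above it prints 0x2 in both cases: the automatic path derives
node 1 from the vcpu affinities, while the explicit path keeps only node 1
out of the requested {0,1}, which is the filtering the patch adds.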