From: Dario Faggioli
Subject: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
Date: Fri, 05 Oct 2012 16:08:21 +0200
To: xen-devel@lists.xen.org
Cc: Andre Przywara, Ian Campbell, Anil Madhavapeddy, George Dunlap,
 Andrew Cooper, Juergen Gross, Ian Jackson, Jan Beulich, Marcus Granado,
 Daniel De Graaf, Matt Wilson
List-Id: xen-devel@lists.xenproject.org

As vcpu-affinity tells where vcpus can run, node-affinity tells
where a domain's vcpus prefer to run. Respecting vcpu-affinity is
the primary concern, but honouring node-affinity will likely
result in some performance benefit.

This change modifies the vcpu load balancing algorithm (for the
credit scheduler only), introducing a two-step logic. During the
first step, we use the node-affinity mask. The aim is to give
precedence to the CPUs where it is known to be preferable for the
domain to run. If that fails to find a valid CPU, node-affinity is
just ignored and, in the second step, we fall back to using
cpu-affinity only.

Signed-off-by: Dario Faggioli
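---
A minimal, standalone sketch of the two-step fallback described above
(not the scheduler code itself): it models cpumask_t with plain 64-bit
bitmasks, and all toy_* names and the example masks in main() are made
up for illustration only.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_mask;              /* one bit per CPU, 64 CPUs max */

struct toy_dom {
    toy_mask node_affinity;             /* CPUs the domain prefers      */
    toy_mask cpu_affinity;              /* CPUs its vcpus may run on    */
};

#define TOY_BALANCE_CPU_AFFINITY    0
#define TOY_BALANCE_NODE_AFFINITY   1
#define TOY_BALANCE_LAST            TOY_BALANCE_NODE_AFFINITY

/* Same shape as csched_balance_cpumask(): the node step filters the
 * node-affinity by the cpu-affinity, the cpu step just copies it.
 * A full node-affinity means "nothing specific", so skip that step. */
static int toy_balance_mask(const struct toy_dom *d, int step, toy_mask *mask)
{
    if ( step == TOY_BALANCE_NODE_AFFINITY )
    {
        if ( d->node_affinity == ~0ULL )
            return -1;
        *mask = d->node_affinity & d->cpu_affinity;
    }
    else
        *mask = d->cpu_affinity;
    return 0;
}

/* Two-step pick: prefer an idler inside the node-affinity, fall back
 * to any idler in the cpu-affinity, return -1 if there is none. */
static int toy_pick_idler(const struct toy_dom *d, toy_mask idlers)
{
    int step;

    for ( step = TOY_BALANCE_LAST; step >= 0; step-- )
    {
        toy_mask m;

        if ( toy_balance_mask(d, step, &m) )
            continue;
        m &= idlers;
        if ( m )
            return __builtin_ctzll(m);  /* lowest idle CPU in the mask */
    }
    return -1;
}

int main(void)
{
    struct toy_dom d = { .node_affinity = 0x0f, .cpu_affinity = 0xff };

    /* Only CPUs 4 and 5 are idle: the node step finds nothing, the
     * cpu-affinity step picks CPU 4. */
    printf("picked CPU %d\n", toy_pick_idler(&d, 0x30));
    return 0;
}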
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -101,6 +101,13 @@
 
 
 /*
+ * Node Balancing
+ */
+#define CSCHED_BALANCE_CPU_AFFINITY     0
+#define CSCHED_BALANCE_NODE_AFFINITY    1
+#define CSCHED_BALANCE_LAST CSCHED_BALANCE_NODE_AFFINITY
+
+/*
  * Boot parameters
  */
 static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
@@ -148,6 +155,9 @@ struct csched_dom {
     struct list_head active_vcpu;
     struct list_head active_sdom_elem;
     struct domain *dom;
+    /* cpumask translated from the domain's node-affinity
+     * mask. Basically, the CPUs we prefer to be scheduled on. */
+    cpumask_var_t node_affinity_cpumask;
     uint16_t active_vcpu_count;
     uint16_t weight;
     uint16_t cap;
@@ -228,6 +238,39 @@ static inline void
     list_del_init(&svc->runq_elem);
 }
 
+#define for_each_csched_balance_step(__step) \
+    for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
+
+/*
+ * Each csched-balance step should use its own cpumask. This function
+ * determines which one, given the step, and copies it in mask. Notice
+ * that, in the case of a node balancing step, it also filters out from
+ * the node-affinity mask the cpus that are not part of vc's cpu-affinity,
+ * as we do not want to end up running a vcpu where it is not allowed to!
+ *
+ * As an optimization, if a domain does not have any specific node-affinity
+ * (namely, its node-affinity is automatically computed), we inform the
+ * caller that it can skip the first step by returning -1.
+ */
+static int
+csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask)
+{
+    if ( step == CSCHED_BALANCE_NODE_AFFINITY )
+    {
+        struct domain *d = vc->domain;
+        struct csched_dom *sdom = CSCHED_DOM(d);
+
+        if ( cpumask_full(sdom->node_affinity_cpumask) )
+            return -1;
+
+        cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
+    }
+    else /* step == CSCHED_BALANCE_CPU_AFFINITY */
+        cpumask_copy(mask, vc->cpu_affinity);
+
+    return 0;
+}
+
 static void burn_credits(struct csched_vcpu *svc, s_time_t now)
 {
     s_time_t delta;
@@ -250,6 +293,20 @@ boolean_param("tickle_one_idle_cpu", opt
 DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
 
 static inline void
+__cpumask_tickle(cpumask_t *mask, const cpumask_t *idle_mask)
+{
+    CSCHED_STAT_CRANK(tickle_idlers_some);
+    if ( opt_tickle_one_idle )
+    {
+        this_cpu(last_tickle_cpu) =
+            cpumask_cycle(this_cpu(last_tickle_cpu), idle_mask);
+        cpumask_set_cpu(this_cpu(last_tickle_cpu), mask);
+    }
+    else
+        cpumask_or(mask, mask, idle_mask);
+}
+
+static inline void
 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
 {
     struct csched_vcpu * const cur =
@@ -287,22 +344,26 @@ static inline void
     }
     else
     {
-        cpumask_t idle_mask;
+        cpumask_t idle_mask, balance_mask;
+        int balance_step;
 
-        cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity);
-        if ( !cpumask_empty(&idle_mask) )
+        for_each_csched_balance_step(balance_step)
         {
-            CSCHED_STAT_CRANK(tickle_idlers_some);
-            if ( opt_tickle_one_idle )
-            {
-                this_cpu(last_tickle_cpu) =
-                    cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask);
-                cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
-            }
-            else
-                cpumask_or(&mask, &mask, &idle_mask);
+            if ( csched_balance_cpumask(new->vcpu, balance_step,
+                                        &balance_mask) )
+                continue;
+
+            /* Look for idlers in the step's cpumask */
+            cpumask_and(&idle_mask, prv->idlers, &balance_mask);
+            if ( !cpumask_empty(&idle_mask) )
+                __cpumask_tickle(&mask, &idle_mask);
+
+            cpumask_and(&mask, &mask, &balance_mask);
+
+            /* We can quit balancing if we found someone to tickle */
+            if ( !cpumask_empty(&mask) )
+                break;
         }
-        cpumask_and(&mask, &mask, new->vcpu->cpu_affinity);
     }
 }
 
@@ -443,35 +504,42 @@ static inline int
 }
 
 static inline int
-__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask)
 {
     /*
      * Don't pick up work that's in the peer's scheduling tail or hot on
-     * peer PCPU. Only pick up work that's allowed to run on our CPU.
+     * peer PCPU. Only pick up work that prefers and/or is allowed to run
+     * on our CPU.
      */
     return !vc->is_running &&
            !__csched_vcpu_is_cache_hot(vc) &&
-           cpumask_test_cpu(dest_cpu, vc->cpu_affinity);
+           cpumask_test_cpu(dest_cpu, mask);
 }
 
 static int
 _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
 {
-    cpumask_t cpus;
+    cpumask_t cpus, start_cpus;
     cpumask_t idlers;
     cpumask_t *online;
+    struct csched_dom *sdom = CSCHED_DOM(vc->domain);
     struct csched_pcpu *spc = NULL;
     int cpu;
 
     /*
-     * Pick from online CPUs in VCPU's affinity mask, giving a
-     * preference to its current processor if it's in there.
+     * Pick an online CPU from the && of vcpu-affinity and node-affinity
+     * masks (if that intersection is not empty; otherwise only the
+     * vcpu-affinity mask is used). Also, try to give a preference to
+     * its current processor if it's in there.
      */
     online = cpupool_scheduler_cpumask(vc->domain->cpupool);
     cpumask_and(&cpus, online, vc->cpu_affinity);
-    cpu = cpumask_test_cpu(vc->processor, &cpus)
+    cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
+    if ( unlikely(cpumask_empty(&start_cpus)) )
+        cpumask_copy(&start_cpus, &cpus);
+    cpu = cpumask_test_cpu(vc->processor, &start_cpus)
             ? vc->processor
-            : cpumask_cycle(vc->processor, &cpus);
+            : cpumask_cycle(vc->processor, &start_cpus);
     ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
 
     /*
@@ -867,6 +935,13 @@ csched_alloc_domdata(const struct schedu
     if ( sdom == NULL )
         return NULL;
 
+    if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) )
+    {
+        xfree(sdom);
+        return NULL;
+    }
+    cpumask_setall(sdom->node_affinity_cpumask);
+
     /* Initialize credit and weight */
     INIT_LIST_HEAD(&sdom->active_vcpu);
     sdom->active_vcpu_count = 0;
@@ -900,6 +975,9 @@ csched_dom_init(const struct scheduler *
 static void
 csched_free_domdata(const struct scheduler *ops, void *data)
 {
+    struct csched_dom *sdom = data;
+
+    free_cpumask_var(sdom->node_affinity_cpumask);
     xfree(data);
 }
 
@@ -1211,30 +1289,48 @@ csched_runq_steal(int peer_cpu, int cpu,
      */
     if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
     {
-        list_for_each( iter, &peer_pcpu->runq )
+        int balance_step;
+
+        /*
+         * Take node-affinity into account. That means, for all the vcpus
+         * in peer_pcpu's runq, check _first_ if their node-affinity allows
+         * them to run on cpu. If not, retry the loop considering plain
+         * vcpu-affinity. Also, notice that as soon as one vcpu is found,
+         * balancing is considered done, and the vcpu is returned to the
+         * caller.
+         */
+        for_each_csched_balance_step(balance_step)
         {
-            speer = __runq_elem(iter);
+            list_for_each( iter, &peer_pcpu->runq )
+            {
+                cpumask_t balance_mask;
 
-            /*
-             * If next available VCPU here is not of strictly higher
-             * priority than ours, this PCPU is useless to us.
-             */
-            if ( speer->pri <= pri )
-                break;
+                speer = __runq_elem(iter);
 
-            /* Is this VCPU is runnable on our PCPU? */
-            vc = speer->vcpu;
-            BUG_ON( is_idle_vcpu(vc) );
+                /*
+                 * If next available VCPU here is not of strictly higher
+                 * priority than ours, this PCPU is useless to us.
+                 */
+                if ( speer->pri <= pri )
+                    break;
 
-            if (__csched_vcpu_is_migrateable(vc, cpu))
-            {
-                /* We got a candidate. Grab it! */
-                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
-                CSCHED_STAT_CRANK(migrate_queued);
-                WARN_ON(vc->is_urgent);
-                __runq_remove(speer);
-                vc->processor = cpu;
-                return speer;
+                /* Is this VCPU runnable on our PCPU? */
+                vc = speer->vcpu;
+                BUG_ON( is_idle_vcpu(vc) );
+
+                if ( csched_balance_cpumask(vc, balance_step, &balance_mask) )
+                    continue;
+
+                if (__csched_vcpu_is_migrateable(vc, cpu, &balance_mask))
+                {
+                    /* We got a candidate. Grab it! */
+                    CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                    CSCHED_STAT_CRANK(migrate_queued);
+                    WARN_ON(vc->is_urgent);
+                    __runq_remove(speer);
+                    vc->processor = cpu;
+                    return speer;
+                }
             }
         }
     }
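
A similar (again purely illustrative, non-Xen) sketch of the new
work-stealing order in csched_runq_steal(), reusing the toy_mask,
toy_dom and toy_balance_mask() stand-ins from the snippet above and
leaving out the priority check for brevity. The point is the loop
nesting: the peer runqueue is scanned once per balancing step, so a
node-affine candidate is always preferred over one that merely has
the stealing CPU in its cpu-affinity.

struct toy_vcpu {
    const struct toy_dom *dom;
    int is_running;                     /* peer's scheduling tail: skip */
};

/* Return the index of the vcpu to steal for 'cpu', or -1 if none. */
static int toy_runq_steal(const struct toy_vcpu *runq, int nr, int cpu)
{
    int step, i;

    for ( step = TOY_BALANCE_LAST; step >= 0; step-- )
        for ( i = 0; i < nr; i++ )
        {
            toy_mask m;

            if ( runq[i].is_running )
                continue;
            /* No specific node-affinity: nothing to prefer this step. */
            if ( toy_balance_mask(runq[i].dom, step, &m) )
                continue;
            if ( m & (1ULL << cpu) )
                return i;
        }
    return -1;
}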