From: Dario Faggioli
Subject: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
Date: Fri, 05 Oct 2012 16:08:21 +0200
To: xen-devel@lists.xen.org
Cc: Andre Przywara, Ian Campbell, Anil Madhavapeddy, George Dunlap,
 Andrew Cooper, Juergen Gross, Ian Jackson, Jan Beulich, Marcus Granado,
 Daniel De Graaf, Matt Wilson
List-Id: xen-devel@lists.xenproject.org

As vcpu-affinity tells where vcpus can run, node-affinity tells
where a domain's vcpus prefer to run. Respecting vcpu-affinity is
the primary concern, but honouring node-affinity will likely
result in some performance benefit.

This change modifies the vcpu load balancing algorithm (for the
credit scheduler only), introducing a two-step logic. During the
first step, we use the node-affinity mask. The aim is to give
precedence to the CPUs where it is known to be preferable for the
domain to run. If that fails to find a valid CPU, node-affinity is
just ignored and, in the second step, we fall back to using
cpu-affinity only.

Signed-off-by: Dario Faggioli
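---
A minimal, standalone sketch of the two-step fallback described above
(not the scheduler code itself): it models cpumask_t with plain 64-bit
bitmasks, and all toy_* names and the example masks in main() are made
up for illustration only.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_mask;              /* one bit per CPU, 64 CPUs max */

struct toy_dom {
    toy_mask node_affinity;             /* CPUs the domain prefers      */
    toy_mask cpu_affinity;              /* CPUs its vcpus may run on    */
};

#define TOY_BALANCE_CPU_AFFINITY    0
#define TOY_BALANCE_NODE_AFFINITY   1
#define TOY_BALANCE_LAST            TOY_BALANCE_NODE_AFFINITY

/* Same shape as csched_balance_cpumask(): the node step filters the
 * node-affinity by the cpu-affinity, the cpu step just copies it.
 * A full node-affinity means "nothing specific", so skip that step. */
static int toy_balance_mask(const struct toy_dom *d, int step, toy_mask *mask)
{
    if ( step == TOY_BALANCE_NODE_AFFINITY )
    {
        if ( d->node_affinity == ~0ULL )
            return -1;
        *mask = d->node_affinity & d->cpu_affinity;
    }
    else
        *mask = d->cpu_affinity;
    return 0;
}

/* Two-step pick: prefer an idler inside the node-affinity, fall back
 * to any idler in the cpu-affinity, return -1 if there is none. */
static int toy_pick_idler(const struct toy_dom *d, toy_mask idlers)
{
    int step;

    for ( step = TOY_BALANCE_LAST; step >= 0; step-- )
    {
        toy_mask m;

        if ( toy_balance_mask(d, step, &m) )
            continue;
        m &= idlers;
        if ( m )
            return __builtin_ctzll(m);  /* lowest idle CPU in the mask */
    }
    return -1;
}

int main(void)
{
    struct toy_dom d = { .node_affinity = 0x0f, .cpu_affinity = 0xff };

    /* Only CPUs 4 and 5 are idle: the node step finds nothing, the
     * cpu-affinity step picks CPU 4. */
    printf("picked CPU %d\n", toy_pick_idler(&d, 0x30));
    return 0;
}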
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -101,6 +101,13 @@
 
 
 /*
+ * Node Balancing
+ */
+#define CSCHED_BALANCE_CPU_AFFINITY     0
+#define CSCHED_BALANCE_NODE_AFFINITY    1
+#define CSCHED_BALANCE_LAST CSCHED_BALANCE_NODE_AFFINITY
+
+/*
  * Boot parameters
  */
 static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
@@ -148,6 +155,9 @@ struct csched_dom {
     struct list_head active_vcpu;
     struct list_head active_sdom_elem;
     struct domain *dom;
+    /* cpumask translated from the domain's node-affinity
+     * mask. Basically, the CPUs we prefer to be scheduled on. */
+    cpumask_var_t node_affinity_cpumask;
     uint16_t active_vcpu_count;
     uint16_t weight;
     uint16_t cap;
@@ -228,6 +238,39 @@ static inline void
     list_del_init(&svc->runq_elem);
 }
 
+#define for_each_csched_balance_step(__step) \
+    for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
+
+/*
+ * Each csched-balance step should use its own cpumask. This function
+ * determines which one, given the step, and copies it in mask. Notice
+ * that, in the case of a node balancing step, it also filters out from
+ * the node-affinity mask the cpus that are not part of vc's cpu-affinity,
+ * as we do not want to end up running a vcpu where it is not allowed to!
+ *
+ * As an optimization, if a domain does not have any specific node-affinity
+ * (namely, its node-affinity is automatically computed), we inform the
+ * caller that it can skip the first step by returning -1.
+ */
+static int
+csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask)
+{
+    if ( step == CSCHED_BALANCE_NODE_AFFINITY )
+    {
+        struct domain *d = vc->domain;
+        struct csched_dom *sdom = CSCHED_DOM(d);
+
+        if ( cpumask_full(sdom->node_affinity_cpumask) )
+            return -1;
+
+        cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
+    }
+    else /* step == CSCHED_BALANCE_CPU_AFFINITY */
+        cpumask_copy(mask, vc->cpu_affinity);
+
+    return 0;
+}
+
 static void burn_credits(struct csched_vcpu *svc, s_time_t now)
 {
     s_time_t delta;
@@ -250,6 +293,20 @@ boolean_param("tickle_one_idle_cpu", opt
 DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
 
 static inline void
+__cpumask_tickle(cpumask_t *mask, const cpumask_t *idle_mask)
+{
+    CSCHED_STAT_CRANK(tickle_idlers_some);
+    if ( opt_tickle_one_idle )
+    {
+        this_cpu(last_tickle_cpu) =
+            cpumask_cycle(this_cpu(last_tickle_cpu), idle_mask);
+        cpumask_set_cpu(this_cpu(last_tickle_cpu), mask);
+    }
+    else
+        cpumask_or(mask, mask, idle_mask);
+}
+
+static inline void
 __runq_tickle(unsigned int cpu, struct csched_vcpu *new)
 {
     struct csched_vcpu * const cur =
@@ -287,22 +344,26 @@ static inline void
     }
     else
     {
-        cpumask_t idle_mask;
+        cpumask_t idle_mask, balance_mask;
+        int balance_step;
 
-        cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity);
-        if ( !cpumask_empty(&idle_mask) )
+        for_each_csched_balance_step(balance_step)
         {
-            CSCHED_STAT_CRANK(tickle_idlers_some);
-            if ( opt_tickle_one_idle )
-            {
-                this_cpu(last_tickle_cpu) =
-                    cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask);
-                cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
-            }
-            else
-                cpumask_or(&mask, &mask, &idle_mask);
+            if ( csched_balance_cpumask(new->vcpu, balance_step,
+                                        &balance_mask) )
+                continue;
+
+            /* Look for idlers in the step's cpumask */
+            cpumask_and(&idle_mask, prv->idlers, &balance_mask);
+            if ( !cpumask_empty(&idle_mask) )
+                __cpumask_tickle(&mask, &idle_mask);
+
+            cpumask_and(&mask, &mask, &balance_mask);
+
+            /* We can quit balancing if we found someone to tickle */
+            if ( !cpumask_empty(&mask) )
+                break;
         }
-        cpumask_and(&mask, &mask, new->vcpu->cpu_affinity);
     }
 }
 
@@ -443,35 +504,42 @@ static inline int
 }
 
 static inline int
-__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask)
 {
     /*
      * Don't pick up work that's in the peer's scheduling tail or hot on
-     * peer PCPU. Only pick up work that's allowed to run on our CPU.
+     * peer PCPU. Only pick up work that prefers and/or is allowed to run
+     * on our CPU.
      */
     return !vc->is_running &&
            !__csched_vcpu_is_cache_hot(vc) &&
-           cpumask_test_cpu(dest_cpu, vc->cpu_affinity);
+           cpumask_test_cpu(dest_cpu, mask);
 }
 
 static int
 _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
 {
-    cpumask_t cpus;
+    cpumask_t cpus, start_cpus;
     cpumask_t idlers;
     cpumask_t *online;
+    struct csched_dom *sdom = CSCHED_DOM(vc->domain);
     struct csched_pcpu *spc = NULL;
     int cpu;
 
     /*
-     * Pick from online CPUs in VCPU's affinity mask, giving a
-     * preference to its current processor if it's in there.
+     * Pick an online CPU from the && of vcpu-affinity and node-affinity
+     * masks (if that intersection is not empty; otherwise only the
+     * vcpu-affinity mask is used). Also, try to give a preference to
+     * its current processor if it's in there.
      */
     online = cpupool_scheduler_cpumask(vc->domain->cpupool);
     cpumask_and(&cpus, online, vc->cpu_affinity);
-    cpu = cpumask_test_cpu(vc->processor, &cpus)
+    cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
+    if ( unlikely(cpumask_empty(&start_cpus)) )
+        cpumask_copy(&start_cpus, &cpus);
+    cpu = cpumask_test_cpu(vc->processor, &start_cpus)
             ? vc->processor
-            : cpumask_cycle(vc->processor, &cpus);
+            : cpumask_cycle(vc->processor, &start_cpus);
     ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
 
     /*
@@ -867,6 +935,13 @@ csched_alloc_domdata(const struct schedu
     if ( sdom == NULL )
         return NULL;
 
+    if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) )
+    {
+        xfree(sdom);
+        return NULL;
+    }
+    cpumask_setall(sdom->node_affinity_cpumask);
+
     /* Initialize credit and weight */
     INIT_LIST_HEAD(&sdom->active_vcpu);
     sdom->active_vcpu_count = 0;
@@ -900,6 +975,9 @@ csched_dom_init(const struct scheduler *
 static void
 csched_free_domdata(const struct scheduler *ops, void *data)
 {
+    struct csched_dom *sdom = data;
+
+    free_cpumask_var(sdom->node_affinity_cpumask);
     xfree(data);
 }
 
@@ -1211,30 +1289,48 @@ csched_runq_steal(int peer_cpu, int cpu,
      */
     if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
     {
-        list_for_each( iter, &peer_pcpu->runq )
+        int balance_step;
+
+        /*
+         * Take node-affinity into account. That means, for all the vcpus
+         * in peer_pcpu's runq, check _first_ if their node-affinity allows
+         * them to run on cpu. If not, retry the loop considering plain
+         * vcpu-affinity. Also, notice that as soon as one vcpu is found,
+         * balancing is considered done, and the vcpu is returned to the
+         * caller.
+         */
+        for_each_csched_balance_step(balance_step)
         {
-            speer = __runq_elem(iter);
+            list_for_each( iter, &peer_pcpu->runq )
+            {
+                cpumask_t balance_mask;
 
-            /*
-             * If next available VCPU here is not of strictly higher
-             * priority than ours, this PCPU is useless to us.
-             */
-            if ( speer->pri <= pri )
-                break;
+                speer = __runq_elem(iter);
 
-            /* Is this VCPU is runnable on our PCPU? */
-            vc = speer->vcpu;
-            BUG_ON( is_idle_vcpu(vc) );
+                /*
+                 * If next available VCPU here is not of strictly higher
+                 * priority than ours, this PCPU is useless to us.
+                 */
+                if ( speer->pri <= pri )
+                    break;
 
-            if (__csched_vcpu_is_migrateable(vc, cpu))
-            {
-                /* We got a candidate. Grab it! */
-                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
-                CSCHED_STAT_CRANK(migrate_queued);
-                WARN_ON(vc->is_urgent);
-                __runq_remove(speer);
-                vc->processor = cpu;
-                return speer;
+                /* Is this VCPU runnable on our PCPU? */
+                vc = speer->vcpu;
+                BUG_ON( is_idle_vcpu(vc) );
+
+                if ( csched_balance_cpumask(vc, balance_step, &balance_mask) )
+                    continue;
+
+                if (__csched_vcpu_is_migrateable(vc, cpu, &balance_mask))
+                {
+                    /* We got a candidate. Grab it! */
+                    CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                    CSCHED_STAT_CRANK(migrate_queued);
+                    WARN_ON(vc->is_urgent);
+                    __runq_remove(speer);
+                    vc->processor = cpu;
+                    return speer;
+                }
             }
         }
     }
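
A similar (again purely illustrative, non-Xen) sketch of the new
work-stealing order in csched_runq_steal(), reusing the toy_mask,
toy_dom and toy_balance_mask() stand-ins from the snippet above and
leaving out the priority check for brevity. The point is the loop
nesting: the peer runqueue is scanned once per balancing step, so a
node-affine candidate is always preferred over one that merely has
the stealing CPU in its cpu-affinity.

struct toy_vcpu {
    const struct toy_dom *dom;
    int is_running;                     /* peer's scheduling tail: skip */
};

/* Return the index of the vcpu to steal for 'cpu', or -1 if none. */
static int toy_runq_steal(const struct toy_vcpu *runq, int nr, int cpu)
{
    int step, i;

    for ( step = TOY_BALANCE_LAST; step >= 0; step-- )
        for ( i = 0; i < nr; i++ )
        {
            toy_mask m;

            if ( runq[i].is_running )
                continue;
            /* No specific node-affinity: nothing to prefer this step. */
            if ( toy_balance_mask(runq[i].dom, step, &m) )
                continue;
            if ( m & (1ULL << cpu) )
                return i;
        }
    return -1;
}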