From: "Justin T. Weaver" <jtweaver@hawaii.edu>
To: xen-devel@lists.xen.org
Cc: george.dunlap@eu.citrix.com, dario.faggioli@citrix.com,
	"Justin T. Weaver" <jtweaver@hawaii.edu>,
	henric@hawaii.edu
Subject: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
Date: Sun,  8 Feb 2015 17:45:50 -1000
Message-ID: <1423453550-3526-1-git-send-email-jtweaver@hawaii.edu>

From: "Justin T. Weaver" <jtweaver@hawaii.edu>

by making sure that vcpus only run on the pcpu(s) they are allowed to
run on, as specified by their hard affinity cpu masks.

Signed-off-by: Justin T. Weaver <jtweaver@hawaii.edu>
---
Changes in v2:
 * Added dynamically allocated per-pcpu cpu masks to avoid putting cpumask_t
   structs on the stack; replaced the temp masks from v1 throughout (a short
   sketch of the scheme follows this list)
 * Added a helper function (get_safe_pcpu) for the code suggested in the v1
   review and called it in two locations in function choose_cpu (an annotated
   copy appears just before the diff)
 * Removed v1 change to comment in the beginning of choose_cpu
 * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
 * Removed the v1 re-work of the code in function migrate; the only change
   to migrate in v2 is the assignment of a valid pcpu from the destination
   run queue to vc->processor
 * In function csched2_vcpu_migrate: removed the v1 change that called
   function migrate even when the current and destination run queues were
   the same in order to get a runq_tickle call; instead, added an assignment
   of new_cpu to vc->processor, which fixes the real underlying issue: the
   vcpu was not getting a call to sched_move_irqs
 * Removed the looping added in v1 in function balance_load; it may be added
   back later because it would help balance_load be more aware of hard
   affinity, but it is not needed for credit2 to respect hard affinity
 * Removed coding style fix in function balance_load
 * Improved comment in function runq_candidate
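
As a quick illustration of the first item above, here is a condensed sketch
(not the full patch; error paths and surrounding code omitted) of the
per-pcpu scratch mask scheme. All names are taken from the patch itself:
one cpumask_t is allocated per pcpu at init time, and each pcpu only ever
uses its own mask, so no cpumask_t needs to live on the stack:

    /* One scratch mask per pcpu; each pcpu only ever touches its own. */
    static cpumask_t **cpumask = NULL;
    #define csched2_cpumask cpumask[smp_processor_id()]

    /* In csched2_init(): allocate the array of per-pcpu mask pointers. */
    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
    if ( cpumask == NULL )
        return -ENOMEM;

    /* In csched2_alloc_pdata(), as each pcpu is brought up: */
    if ( !zalloc_cpumask_var(&cpumask[cpu]) )
        return NULL;

    /* Typical use, e.g. in choose_cpu() or migrate(): */
    cpumask_and(csched2_cpumask, vc->cpu_hard_affinity, &rqd->active);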
---
 xen/common/sched_credit2.c |  122 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 108 insertions(+), 14 deletions(-)
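
One more orientation note before the diff: the helper added in response to
the v1 review is get_safe_pcpu(). The following is the same code as in the
hunk below, with comments added to spell out the fallback order it
implements:

    static int get_safe_pcpu(struct csched2_vcpu *svc)
    {
        /* Prefer pcpus that are in both the vcpu's hard affinity mask
         * and its current runqueue... */
        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                    &svc->rqd->active);

        /* ...falling back to the online pcpus of its cpupool that the
         * hard affinity mask allows. */
        if ( unlikely(cpumask_empty(csched2_cpumask)) )
            cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                        cpupool_online_cpumask(svc->vcpu->domain->cpupool));

        /* Stay on the current processor if it is still a legal choice;
         * otherwise pick any pcpu from the mask built above. */
        if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
            return svc->vcpu->processor;
        else
            return cpumask_any(csched2_cpumask);
    }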

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index cf53770..de8fb5a 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
 
 /*
+ * Use this to avoid having too many cpumask_t structs on the stack
+ */
+static cpumask_t **cpumask = NULL;
+#define csched2_cpumask cpumask[smp_processor_id()]
+
+/*
  * Per-runqueue data
  */
 struct csched2_runqueue_data {
@@ -268,6 +274,23 @@ struct csched2_dom {
     uint16_t nr_vcpus;
 };
 
+/*
+ * When a hard affinity change occurs, we may not be able to check some or
+ * all of the other run queues for a valid new processor for the given vcpu.
+ * Return svc's current pcpu if valid, otherwise return a safe pcpu.
+ */
+static int get_safe_pcpu(struct csched2_vcpu *svc)
+{
+    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
+    if ( unlikely(cpumask_empty(csched2_cpumask)) )
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
+
+    if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
+        return svc->vcpu->processor;
+    else
+        return cpumask_any(csched2_cpumask);
+}
 
 /*
  * Time-to-credit, credit-to-time.
@@ -501,8 +524,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
         goto tickle;
     }
     
-    /* Get a mask of idle, but not tickled */
+    /* Get a mask of idle, but not tickled, that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
     
     /* If it's not empty, choose one */
     i = cpumask_cycle(cpu, &mask);
@@ -513,9 +537,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
     }
 
     /* Otherwise, look for the non-idle cpu with the lowest credit,
-     * skipping cpus which have been tickled but not scheduled yet */
+     * skipping cpus which have been tickled but not scheduled yet,
+     * that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->active, &rqd->idle);
     cpumask_andnot(&mask, &mask, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
 
     for_each_cpu(i, &mask)
     {
@@ -1063,9 +1089,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
             d2printk("%pv -\n", svc->vcpu);
             clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
         }
-        /* Leave it where it is for now.  When we actually pay attention
-         * to affinity we'll have to figure something out... */
-        return vc->processor;
+
+        return get_safe_pcpu(svc);
     }
 
     /* First check to see if we're here because someone else suggested a place
@@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         else
         {
             d2printk("%pv +\n", svc->vcpu);
-            new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
-            goto out_up;
+            cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+                &svc->migrate_rqd->active);
+            if ( !cpumask_empty(csched2_cpumask) )
+            {
+                new_cpu = cpumask_any(csched2_cpumask);
+                goto out_up;
+            }
+            /* Fall-through to normal cpu pick */
         }
     }
 
-    /* FIXME: Pay attention to cpu affinity */
-
     min_avgload = MAX_LOAD;
 
     /* Find the runqueue with the lowest instantaneous load */
@@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         rqd = prv->rqd + i;
 
         /* If checking a different runqueue, grab the lock,
-         * read the avg, and then release the lock.
+         * check hard affinity, read the avg, and then release the lock.
          *
          * If on our own runqueue, don't grab or release the lock;
          * but subtract our own load from the runqueue load to simulate
          * impartiality */
         if ( rqd == svc->rqd )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                continue;
             rqd_avgload = rqd->b_avgload - svc->avgload;
         }
         else if ( spin_trylock(&rqd->lock) )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+            {
+                spin_unlock(&rqd->lock);
+                continue;
+            }
             rqd_avgload = rqd->b_avgload;
             spin_unlock(&rqd->lock);
         }
@@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         }
     }
 
-    /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
     if ( min_rqi == -1 )
-        new_cpu = vc->processor;
+    {
+        /* No runqs found (most likely because of spinlock contention). */
+        new_cpu = get_safe_pcpu(svc);
+    }
     else
     {
-        new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active);
+        cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+            &prv->rqd[min_rqi].active);
+        new_cpu = cpumask_any(csched2_cpumask);
         BUG_ON(new_cpu >= nr_cpu_ids);
     }
 
@@ -1207,7 +1247,12 @@ static void migrate(const struct scheduler *ops,
             on_runq=1;
         }
         __runq_deassign(svc);
-        svc->vcpu->processor = cpumask_any(&trqd->active);
+
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            &trqd->active);
+        svc->vcpu->processor = cpumask_any(csched2_cpumask);
+        BUG_ON(svc->vcpu->processor >= nr_cpu_ids);
+
         __runq_assign(svc, trqd);
         if ( on_runq )
         {
@@ -1330,6 +1375,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity,
+            &st.orqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         list_for_each( pull_iter, &st.orqd->svc )
         {
             struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
@@ -1343,6 +1394,12 @@ retry:
             if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
                 continue;
 
+            /* Skip if it can't run on the destination runq. */
+            cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+                &st.lrqd->active);
+            if ( cpumask_empty(csched2_cpumask) )
+                continue;
+
             consider(&st, push_svc, pull_svc);
         }
 
@@ -1360,6 +1417,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+            &st.lrqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         /* Consider pull only */
         consider(&st, NULL, pull_svc);
     }
@@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
 
     /* Check if new_cpu is valid */
     BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
+    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
+
+    /*
+     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
+     * in schedule.c in case there was a hard affinity change within the same
+     * run queue. vc will not be able to run in certain situations without
+     * this call.
+     */
+    vc->processor = new_cpu;
 
     trqd = RQD(ops, new_cpu);
 
@@ -1610,6 +1682,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
     {
         struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
 
+        /* Only consider vcpus that are allowed to run on this processor. */
+        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+            continue;
+
         /* If this is on a different processor, don't pull it unless
          * its credit is at least CSCHED2_MIGRATE_RESIST higher. */
         if ( svc->vcpu->processor != cpu
@@ -1992,6 +2068,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
         printk("%s: cpu %d not online yet, deferring initializatgion\n",
                __func__, cpu);
 
+    /*
+     * For each new pcpu, allocate a cpumask_t for use throughout the
+     * scheduler to avoid putting any cpumask_t structs on the stack.
+     */
+    if ( !zalloc_cpumask_var(&cpumask[cpu]) )
+        return NULL;
+
     return (void *)1;
 }
 
@@ -2040,6 +2123,8 @@ csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
 
     spin_unlock_irqrestore(&prv->lock, flags);
 
+    free_cpumask_var(cpumask[cpu]);
+
     return;
 }
 
@@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
 
     prv->load_window_shift = opt_load_window_shift;
 
+    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
+    if ( cpumask == NULL )
+        return -ENOMEM;
+
     return 0;
 }
 
 static void
 csched2_deinit(const struct scheduler *ops)
 {
+    int i;
     struct csched2_private *prv;
 
     prv = CSCHED2_PRIV(ops);
     xfree(prv);
+
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        free_cpumask_var(cpumask[i]);
+    xfree(cpumask);
 }
 
 
-- 
1.7.10.4

Thread overview: 15+ messages
2015-02-09  3:45 Justin T. Weaver [this message]
2015-03-03  3:15 ` [PATCH v2] sched: credit2: respect per-vcpu hard affinity Dario Faggioli
2015-03-03  9:12   ` Jan Beulich
2015-03-04 11:03     ` Dario Faggioli
2015-03-04 12:50       ` Jan Beulich
2015-03-04 13:08         ` Dario Faggioli
2015-03-04 13:24           ` Jan Beulich
2015-03-06 15:18   ` George Dunlap
2015-03-06 17:02     ` Dario Faggioli
2015-03-09  7:11       ` Justin Weaver
2015-03-09 11:45         ` George Dunlap
2015-03-09 15:07           ` Dario Faggioli
2015-03-10 13:23         ` Dario Faggioli
2015-03-13 17:11 ` Dario Faggioli
2015-03-14  3:48   ` Justin Weaver
