From: "Justin T. Weaver" <jtweaver@hawaii.edu>
To: xen-devel@lists.xen.org
Cc: george.dunlap@eu.citrix.com, dario.faggioli@citrix.com,
	"Justin T. Weaver" <jtweaver@hawaii.edu>,
	henric@hawaii.edu
Subject: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
Date: Sun,  8 Feb 2015 17:45:50 -1000
Message-ID: <1423453550-3526-1-git-send-email-jtweaver@hawaii.edu>

From: "Justin T. Weaver" <jtweaver@hawaii.edu>

by making sure that vcpus only run on the pcpu(s) they are allowed to
run on, as specified by their hard affinity cpu masks.

Signed-off-by: Justin T. Weaver <jtweaver@hawaii.edu>
---
Changes in v2:
 * Added dynamically allocated per-pcpu cpu masks to avoid putting cpumask_t
   structs on the stack; replaced the temp masks from v1 throughout (a short
   sketch of the scheme follows this list)
 * Added a helper function (get_safe_pcpu) for the code suggested in the v1
   review and called it in two locations in function choose_cpu (an annotated
   copy appears just before the diff)
 * Removed v1 change to comment in the beginning of choose_cpu
 * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
 * Removed the v1 re-work of the code in function migrate; the only change
   to migrate in v2 is the assignment of a valid pcpu from the destination
   run queue to vc->processor
 * In function csched2_vcpu_migrate: removed the v1 change that called
   function migrate even when the current and destination run queues were
   the same in order to get a runq_tickle call; instead, added an assignment
   of new_cpu to vc->processor, which fixes the real underlying issue: the
   vcpu was not getting a call to sched_move_irqs
 * Removed the looping added in v1 in function balance_load; it may be added
   back later because it would help balance_load be more aware of hard
   affinity, but it is not needed for credit2 to respect hard affinity
 * Removed coding style fix in function balance_load
 * Improved comment in function runq_candidate
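
As a quick illustration of the first item above, here is a condensed sketch
(not the full patch; error paths and surrounding code omitted) of the
per-pcpu scratch mask scheme. All names are taken from the patch itself:
one cpumask_t is allocated per pcpu at init time, and each pcpu only ever
uses its own mask, so no cpumask_t needs to live on the stack:

    /* One scratch mask per pcpu; each pcpu only ever touches its own. */
    static cpumask_t **cpumask = NULL;
    #define csched2_cpumask cpumask[smp_processor_id()]

    /* In csched2_init(): allocate the array of per-pcpu mask pointers. */
    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
    if ( cpumask == NULL )
        return -ENOMEM;

    /* In csched2_alloc_pdata(), as each pcpu is brought up: */
    if ( !zalloc_cpumask_var(&cpumask[cpu]) )
        return NULL;

    /* Typical use, e.g. in choose_cpu() or migrate(): */
    cpumask_and(csched2_cpumask, vc->cpu_hard_affinity, &rqd->active);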
---
 xen/common/sched_credit2.c |  122 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 108 insertions(+), 14 deletions(-)
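
One more orientation note before the diff: the helper added in response to
the v1 review is get_safe_pcpu(). The following is the same code as in the
hunk below, with comments added to spell out the fallback order it
implements:

    static int get_safe_pcpu(struct csched2_vcpu *svc)
    {
        /* Prefer pcpus that are in both the vcpu's hard affinity mask
         * and its current runqueue... */
        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                    &svc->rqd->active);

        /* ...falling back to the online pcpus of its cpupool that the
         * hard affinity mask allows. */
        if ( unlikely(cpumask_empty(csched2_cpumask)) )
            cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                        cpupool_online_cpumask(svc->vcpu->domain->cpupool));

        /* Stay on the current processor if it is still a legal choice;
         * otherwise pick any pcpu from the mask built above. */
        if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
            return svc->vcpu->processor;
        else
            return cpumask_any(csched2_cpumask);
    }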

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index cf53770..de8fb5a 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
 
 /*
+ * Use this to avoid having too many cpumask_t structs on the stack
+ */
+static cpumask_t **cpumask = NULL;
+#define csched2_cpumask cpumask[smp_processor_id()]
+
+/*
  * Per-runqueue data
  */
 struct csched2_runqueue_data {
@@ -268,6 +274,23 @@ struct csched2_dom {
     uint16_t nr_vcpus;
 };
 
+/*
+ * When a hard affinity change occurs, we may not be able to check some or
+ * all of the other run queues for a valid new processor for the given vcpu.
+ * Return svc's current pcpu if valid, otherwise return a safe pcpu.
+ */
+static int get_safe_pcpu(struct csched2_vcpu *svc)
+{
+    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
+    if ( unlikely(cpumask_empty(csched2_cpumask)) )
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
+
+    if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
+        return svc->vcpu->processor;
+    else
+        return cpumask_any(csched2_cpumask);
+}
 
 /*
  * Time-to-credit, credit-to-time.
@@ -501,8 +524,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
         goto tickle;
     }
     
-    /* Get a mask of idle, but not tickled */
+    /* Get a mask of idle, but not tickled, that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
     
     /* If it's not empty, choose one */
     i = cpumask_cycle(cpu, &mask);
@@ -513,9 +537,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
     }
 
     /* Otherwise, look for the non-idle cpu with the lowest credit,
-     * skipping cpus which have been tickled but not scheduled yet */
+     * skipping cpus which have been tickled but not scheduled yet,
+     * that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->active, &rqd->idle);
     cpumask_andnot(&mask, &mask, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
 
     for_each_cpu(i, &mask)
     {
@@ -1063,9 +1089,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
             d2printk("%pv -\n", svc->vcpu);
             clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
         }
-        /* Leave it where it is for now.  When we actually pay attention
-         * to affinity we'll have to figure something out... */
-        return vc->processor;
+
+        return get_safe_pcpu(svc);
     }
 
     /* First check to see if we're here because someone else suggested a place
@@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         else
         {
             d2printk("%pv +\n", svc->vcpu);
-            new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
-            goto out_up;
+            cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+                &svc->migrate_rqd->active);
+            if ( !cpumask_empty(csched2_cpumask) )
+            {
+                new_cpu = cpumask_any(csched2_cpumask);
+                goto out_up;
+            }
+            /* Fall-through to normal cpu pick */
         }
     }
 
-    /* FIXME: Pay attention to cpu affinity */
-
     min_avgload = MAX_LOAD;
 
     /* Find the runqueue with the lowest instantaneous load */
@@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         rqd = prv->rqd + i;
 
         /* If checking a different runqueue, grab the lock,
-         * read the avg, and then release the lock.
+         * check hard affinity, read the avg, and then release the lock.
          *
          * If on our own runqueue, don't grab or release the lock;
          * but subtract our own load from the runqueue load to simulate
          * impartiality */
         if ( rqd == svc->rqd )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                continue;
             rqd_avgload = rqd->b_avgload - svc->avgload;
         }
         else if ( spin_trylock(&rqd->lock) )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+            {
+                spin_unlock(&rqd->lock);
+                continue;
+            }
             rqd_avgload = rqd->b_avgload;
             spin_unlock(&rqd->lock);
         }
@@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         }
     }
 
-    /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
     if ( min_rqi == -1 )
-        new_cpu = vc->processor;
+    {
+        /* No runqs found (most likely because of spinlock contention). */
+        new_cpu = get_safe_pcpu(svc);
+    }
     else
     {
-        new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active);
+        cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+            &prv->rqd[min_rqi].active);
+        new_cpu = cpumask_any(csched2_cpumask);
         BUG_ON(new_cpu >= nr_cpu_ids);
     }
 
@@ -1207,7 +1247,12 @@ static void migrate(const struct scheduler *ops,
             on_runq=1;
         }
         __runq_deassign(svc);
-        svc->vcpu->processor = cpumask_any(&trqd->active);
+
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            &trqd->active);
+        svc->vcpu->processor = cpumask_any(csched2_cpumask);
+        BUG_ON(svc->vcpu->processor >= nr_cpu_ids);
+
         __runq_assign(svc, trqd);
         if ( on_runq )
         {
@@ -1330,6 +1375,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity,
+            &st.orqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         list_for_each( pull_iter, &st.orqd->svc )
         {
             struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
@@ -1343,6 +1394,12 @@ retry:
             if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
                 continue;
 
+            /* Skip if it can't run on the destination runq. */
+            cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+                &st.lrqd->active);
+            if ( cpumask_empty(csched2_cpumask) )
+                continue;
+
             consider(&st, push_svc, pull_svc);
         }
 
@@ -1360,6 +1417,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+            &st.lrqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         /* Consider pull only */
         consider(&st, NULL, pull_svc);
     }
@@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
 
     /* Check if new_cpu is valid */
     BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
+    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
+
+    /*
+     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
+     * in schedule.c in case there was a hard affinity change within the same
+     * run queue. vc will not be able to run in certain situations without
+     * this call.
+     */
+    vc->processor = new_cpu;
 
     trqd = RQD(ops, new_cpu);
 
@@ -1610,6 +1682,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
     {
         struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
 
+        /* Only consider vcpus that are allowed to run on this processor. */
+        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+            continue;
+
         /* If this is on a different processor, don't pull it unless
          * its credit is at least CSCHED2_MIGRATE_RESIST higher. */
         if ( svc->vcpu->processor != cpu
@@ -1992,6 +2068,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
         printk("%s: cpu %d not online yet, deferring initializatgion\n",
                __func__, cpu);
 
+    /*
+     * For each new pcpu, allocate a cpumask_t for use throughout the
+     * scheduler to avoid putting any cpumask_t structs on the stack.
+     */
+    if ( !zalloc_cpumask_var(&cpumask[cpu]) )
+        return NULL;
+
     return (void *)1;
 }
 
@@ -2040,6 +2123,8 @@ csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
 
     spin_unlock_irqrestore(&prv->lock, flags);
 
+    free_cpumask_var(cpumask[cpu]);
+
     return;
 }
 
@@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
 
     prv->load_window_shift = opt_load_window_shift;
 
+    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
+    if ( cpumask == NULL )
+        return -ENOMEM;
+
     return 0;
 }
 
 static void
 csched2_deinit(const struct scheduler *ops)
 {
+    int i;
     struct csched2_private *prv;
 
     prv = CSCHED2_PRIV(ops);
     xfree(prv);
+
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        free_cpumask_var(cpumask[i]);
+    xfree(cpumask);
 }
 
 
-- 
1.7.10.4

Thread overview: 15+ messages
2015-02-09  3:45 Justin T. Weaver [this message]
2015-03-03  3:15 ` [PATCH v2] sched: credit2: respect per-vcpu hard affinity Dario Faggioli
2015-03-03  9:12   ` Jan Beulich
2015-03-04 11:03     ` Dario Faggioli
2015-03-04 12:50       ` Jan Beulich
2015-03-04 13:08         ` Dario Faggioli
2015-03-04 13:24           ` Jan Beulich
2015-03-06 15:18   ` George Dunlap
2015-03-06 17:02     ` Dario Faggioli
2015-03-09  7:11       ` Justin Weaver
2015-03-09 11:45         ` George Dunlap
2015-03-09 15:07           ` Dario Faggioli
2015-03-10 13:23         ` Dario Faggioli
2015-03-13 17:11 ` Dario Faggioli
2015-03-14  3:48   ` Justin Weaver
