* [PATCH v2] sched: credit2: respect per-vcpu hard affinity
@ 2015-02-09  3:45 Justin T. Weaver
  2015-03-03  3:15 ` Dario Faggioli
  2015-03-13 17:11 ` Dario Faggioli
  0 siblings, 2 replies; 15+ messages in thread
From: Justin T. Weaver @ 2015-02-09  3:45 UTC (permalink / raw)
  To: xen-devel; +Cc: george.dunlap, dario.faggioli, Justin T. Weaver, henric

From: "Justin T. Weaver" <jtweaver@hawaii.edu>

by making sure that vcpus only run on the pcpu(s) they are allowed to
run on based on their hard affinity cpu masks.

Signed-off-by: Justin T. Weaver <jtweaver@hawaii.edu>
---
Changes in v2:
 * Added dynamically allocated cpu masks to avoid putting them on the stack;
   replaced temp masks from v1 throughout
 * Added helper function for code suggested in v1 review and called it in two
   locations in function choose_cpu
 * Removed v1 change to comment in the beginning of choose_cpu
 * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
 * Removed v1 re-work of code in function migrate; only change in migrate in
   v2 is the assignment of a valid pcpu from the destination run queue to
   vc->processor
 * In function csched2_vcpu_migrate: removed change from v1 that called
   function migrate even if cur and dest run queues were the same in order
   to get a runq_tickle call; added processor assignment to new_cpu to fix
   the real underlying issue which was the vcpu not getting a call to
   sched_move_irqs
 * Removed the looping added in v1 in function balance_load; may be added back
   later because it would help to have balance_load be more aware of hard
   affinity, but adding it does not affect credit2's current inability to
   respect hard affinity.
 * Removed coding style fix in function balance_load
 * Improved comment in function runq_candidate
---
 xen/common/sched_credit2.c |  122 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 108 insertions(+), 14 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index cf53770..de8fb5a 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
 
 /*
+ * Use this to avoid having too many cpumask_t structs on the stack
+ */
+static cpumask_t **cpumask = NULL;
+#define csched2_cpumask cpumask[smp_processor_id()]
+
+/*
  * Per-runqueue data
  */
 struct csched2_runqueue_data {
@@ -268,6 +274,23 @@ struct csched2_dom {
     uint16_t nr_vcpus;
 };
 
+/*
+ * When a hard affinity change occurs, we may not be able to check some or
+ * all of the other run queues for a valid new processor for the given vcpu.
+ * Return svc's current pcpu if valid, otherwise return a safe pcpu.
+ */
+static int get_safe_pcpu(struct csched2_vcpu *svc)
+{
+    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
+    if ( unlikely(cpumask_empty(csched2_cpumask)) )
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
+
+    if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
+        return svc->vcpu->processor;
+    else
+        return cpumask_any(csched2_cpumask);
+}
 
 /*
  * Time-to-credit, credit-to-time.
@@ -501,8 +524,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
         goto tickle;
     }
     
-    /* Get a mask of idle, but not tickled */
+    /* Get a mask of idle, but not tickled, that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
     
     /* If it's not empty, choose one */
     i = cpumask_cycle(cpu, &mask);
@@ -513,9 +537,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
     }
 
     /* Otherwise, look for the non-idle cpu with the lowest credit,
-     * skipping cpus which have been tickled but not scheduled yet */
+     * skipping cpus which have been tickled but not scheduled yet,
+     * that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->active, &rqd->idle);
     cpumask_andnot(&mask, &mask, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
 
     for_each_cpu(i, &mask)
     {
@@ -1063,9 +1089,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
             d2printk("%pv -\n", svc->vcpu);
             clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
         }
-        /* Leave it where it is for now.  When we actually pay attention
-         * to affinity we'll have to figure something out... */
-        return vc->processor;
+
+        return get_safe_pcpu(svc);
     }
 
     /* First check to see if we're here because someone else suggested a place
@@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         else
         {
             d2printk("%pv +\n", svc->vcpu);
-            new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
-            goto out_up;
+            cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+                &svc->migrate_rqd->active);
+            if ( !cpumask_empty(csched2_cpumask) )
+            {
+                new_cpu = cpumask_any(csched2_cpumask);
+                goto out_up;
+            }
+            /* Fall-through to normal cpu pick */
         }
     }
 
-    /* FIXME: Pay attention to cpu affinity */                                                                                      
-
     min_avgload = MAX_LOAD;
 
     /* Find the runqueue with the lowest instantaneous load */
@@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         rqd = prv->rqd + i;
 
         /* If checking a different runqueue, grab the lock,
-         * read the avg, and then release the lock.
+         * check hard affinity, read the avg, and then release the lock.
          *
          * If on our own runqueue, don't grab or release the lock;
          * but subtract our own load from the runqueue load to simulate
          * impartiality */
         if ( rqd == svc->rqd )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                continue;
             rqd_avgload = rqd->b_avgload - svc->avgload;
         }
         else if ( spin_trylock(&rqd->lock) )
         {
+            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+            {
+                spin_unlock(&rqd->lock);
+                continue;
+            }
             rqd_avgload = rqd->b_avgload;
             spin_unlock(&rqd->lock);
         }
@@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         }
     }
 
-    /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
     if ( min_rqi == -1 )
-        new_cpu = vc->processor;
+    {
+        /* No runqs found (most likely because of spinlock contention). */
+        new_cpu = get_safe_pcpu(svc);
+    }
     else
     {
-        new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active);
+        cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+            &prv->rqd[min_rqi].active);
+        new_cpu = cpumask_any(csched2_cpumask);
         BUG_ON(new_cpu >= nr_cpu_ids);
     }
 
@@ -1207,7 +1247,12 @@ static void migrate(const struct scheduler *ops,
             on_runq=1;
         }
         __runq_deassign(svc);
-        svc->vcpu->processor = cpumask_any(&trqd->active);
+
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            &trqd->active);
+        svc->vcpu->processor = cpumask_any(csched2_cpumask);
+        BUG_ON(svc->vcpu->processor >= nr_cpu_ids);
+
         __runq_assign(svc, trqd);
         if ( on_runq )
         {
@@ -1330,6 +1375,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity,
+            &st.orqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         list_for_each( pull_iter, &st.orqd->svc )
         {
             struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
@@ -1343,6 +1394,12 @@ retry:
             if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
                 continue;
 
+            /* Skip if it can't run on the destination runq. */
+            cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+                &st.lrqd->active);
+            if ( cpumask_empty(csched2_cpumask) )
+                continue;
+
             consider(&st, push_svc, pull_svc);
         }
 
@@ -1360,6 +1417,12 @@ retry:
         if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
             continue;
 
+        /* Skip if it can't run on the destination runq. */
+        cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+            &st.lrqd->active);
+        if ( cpumask_empty(csched2_cpumask) )
+            continue;
+
         /* Consider pull only */
         consider(&st, NULL, pull_svc);
     }
@@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
 
     /* Check if new_cpu is valid */
     BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
+    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
+
+    /*
+     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
+     * in schedule.c in case there was a hard affinity change within the same
+     * run queue. vc will not be able to run in certain situations without
+     * this call.
+     */
+    vc->processor = new_cpu;
 
     trqd = RQD(ops, new_cpu);
 
@@ -1610,6 +1682,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
     {
         struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
 
+        /* Only consider vcpus that are allowed to run on this processor. */
+        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+            continue;
+
         /* If this is on a different processor, don't pull it unless
          * its credit is at least CSCHED2_MIGRATE_RESIST higher. */
         if ( svc->vcpu->processor != cpu
@@ -1992,6 +2068,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
         printk("%s: cpu %d not online yet, deferring initializatgion\n",
                __func__, cpu);
 
+    /*
+     * For each new pcpu, allocate a cpumask_t for use throughout the
+     * scheduler to avoid putting any cpumask_t structs on the stack.
+     */
+    if ( !zalloc_cpumask_var(&cpumask[cpu]) )
+        return NULL;
+
     return (void *)1;
 }
 
@@ -2040,6 +2123,8 @@ csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
 
     spin_unlock_irqrestore(&prv->lock, flags);
 
+    free_cpumask_var(cpumask[cpu]);
+
     return;
 }
 
@@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
 
     prv->load_window_shift = opt_load_window_shift;
 
+    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
+    if ( cpumask == NULL )
+        return -ENOMEM;
+
     return 0;
 }
 
 static void
 csched2_deinit(const struct scheduler *ops)
 {
+    int i;
     struct csched2_private *prv;
 
     prv = CSCHED2_PRIV(ops);
     xfree(prv);
+
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        free_cpumask_var(cpumask[i]);
+    xfree(cpumask);
 }
 
 
-- 
1.7.10.4


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-02-09  3:45 [PATCH v2] sched: credit2: respect per-vcpu hard affinity Justin T. Weaver
@ 2015-03-03  3:15 ` Dario Faggioli
  2015-03-03  9:12   ` Jan Beulich
  2015-03-06 15:18   ` George Dunlap
  2015-03-13 17:11 ` Dario Faggioli
  1 sibling, 2 replies; 15+ messages in thread
From: Dario Faggioli @ 2015-03-03  3:15 UTC (permalink / raw)
  To: jtweaver; +Cc: xen-devel, George Dunlap, henric



On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:
> From: "Justin T. Weaver" <jtweaver@hawaii.edu>
> 
> by making sure that vcpus only run on the pcpu(s) they are allowed to
> run on based on their hard affinity cpu masks.
> 
Ok, here I am reviewing this, at last... sorry for the delay! :-(

> Signed-off-by: Justin T. Weaver <jtweaver@hawaii.edu>
> ---
> Changes in v2:
>  * Added dynamically allocated cpu masks to avoid putting them on the stack;
>    replaced temp masks from v1 throughout
>  * Added helper function for code suggested in v1 review and called it in two
>    locations in function choose_cpu
>  * Removed v1 change to comment in the beginning of choose_cpu
>  * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
>  * Removed v1 re-work of code in function migrate; only change in migrate in
>    v2 is the assignment of a valid pcpu from the destination run queue to
>    vc->processor
>  * In function csched2_vcpu_migrate: removed change from v1 that called
>    function migrate even if cur and dest run queues were the same in order
>    to get a runq_tickle call; added processor assignment to new_cpu to fix
>    the real underlying issue which was the vcpu not getting a call to
>    sched_move_irqs
>
Aha, so that was the issue! Glad you figured this out. :-)

>  * Removed the looping added in v1 in function balance_load; may be added back
>    later because it would help to have balance_load be more aware of hard
>    affinity, but adding it does not affect credit2's current inability to
>    respect hard affinity.
>  * Removed coding style fix in function balance_load
>  * Improved comment in function runq_candidate
>
Thanks for putting this recap of the changes here. :-)

> ---
>  xen/common/sched_credit2.c |  122 +++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index cf53770..de8fb5a 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
>  
>  /*
> + * Use this to avoid having too many cpumask_t structs on the stack
> + */
> +static cpumask_t **cpumask = NULL;
>
Not just 'cpumask', please... It's too generic a name. Let's pick up
something that makes it easier to understand what's the purpose of this.

I'm not really sure right now what something like that could be...
Credit has balance_mask, but we're not going as far as introducing
multiple step load balancing here (not with this patch, at least).

Maybe affinity_cpumask, or hard_affinity_cpumask (and we'll rename to
something else when introducing soft affinity, if needed).

> +#define csched2_cpumask cpumask[smp_processor_id()]
> +
I like the idea, but put the right side between parentheses. Of course,
what I just said about the name applies here too. :-)
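
I.e., just (keeping the current name purely for illustration):

    #define csched2_cpumask (cpumask[smp_processor_id()])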

> @@ -268,6 +274,23 @@ struct csched2_dom {
>      uint16_t nr_vcpus;
>  };
>  
> +/*
> + * When a hard affinity change occurs, we may not be able to check some or
> + * all of the other run queues for a valid new processor for the given vcpu.
> + * Return svc's current pcpu if valid, otherwise return a safe pcpu.
> + */
>
Add at least a very quick mention of why this is (or could be) necessary.

> +static int get_safe_pcpu(struct csched2_vcpu *svc)
> +{
>
I also don't like the name... __choose_cpu() maybe ?

> +    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
> +    if ( unlikely(cpumask_empty(csched2_cpumask)) )
> +        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
> +            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
>
VCPU2ONLINE(svc->vcpu) would make the line shorter.

Also, I know I'm the one that suggested this form for the code, but
thinking again about it, I'm not sure the first if is worth:

    cpumask_and(csched2_cpumask, &svc->rqd->active, VCPU2ONLINE(svc->vcpu));
    cpumask_and(csched2_cpumask, csched2_cpumask, svc->vcpu->cpu_hard_affinity);

Oh, and, with either yours or my variant... can csched2_cpumask end up
empty after the two &&-s? That's important or, below, cpumask_any will
return garbage! :-/

From a quick inspection, it does not seem it can, in which case, I'd put
down an ASSERT() about this somewhere. OTOH, if it can, I'd look for
ways of preventing that to happen before getting here... It seems easier
and more correct to do that, rather than trying to figure out what to do
here if the mask is empty. :-O
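
Concretely, a sketch of what I mean, assuming the result indeed can't
end up empty:

    cpumask_and(csched2_cpumask, &svc->rqd->active, VCPU2ONLINE(svc->vcpu));
    cpumask_and(csched2_cpumask, csched2_cpumask, svc->vcpu->cpu_hard_affinity);
    ASSERT(!cpumask_empty(csched2_cpumask));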

> +
> +    if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
> +        return svc->vcpu->processor;
> +    else
> +        return cpumask_any(csched2_cpumask);
>
And, perhaps, we could put back the likely/unlikely hint here (provided
we think the then branch is actually likely):

    if ( likely(cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask)) )
        return svc->vcpu->processor;
    else
        return cpumask_any(csched2_cpumask);

> @@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
>          else
>          {
>              d2printk("%pv +\n", svc->vcpu);
> -            new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
> -            goto out_up;
> +            cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
> +                &svc->migrate_rqd->active);
> +            if ( !cpumask_empty(csched2_cpumask) )
> +            {
> +                new_cpu = cpumask_any(csched2_cpumask);
> +                goto out_up;
> +            }
>
               cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
                           &svc->migrate_rqd->active);
               new_cpu = cpumask_any(csched2_cpumask);
               if ( new_cpu < nr_cpu_ids )
               {
                   d2printk("%pv +\n", svc->vcpu);
                   goto out_up;
               }

As I told you last round, checking the result of most of the
cpumask_foo() operations against nr_cpu_ids can often avoid the need
for more cpumask fiddling, and hence should be done whenever possible.

> +            /* Fall-through to normal cpu pick */
>
Don't add this here. Instead, kill the instance of the exact same
comment in the if(){}, and only have it once ...

>          }
>
           ^ ... here, i.e., outside of the if (){}else{} block.

>      }
>  
> -    /* FIXME: Pay attention to cpu affinity */                                                                                      
> -
>      min_avgload = MAX_LOAD;
>  
>      /* Find the runqueue with the lowest instantaneous load */
> @@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
>          rqd = prv->rqd + i;
>  
>          /* If checking a different runqueue, grab the lock,
> -         * read the avg, and then release the lock.
> +         * check hard affinity, read the avg, and then release the lock.
>           *
>           * If on our own runqueue, don't grab or release the lock;
>           * but subtract our own load from the runqueue load to simulate
>           * impartiality */
>          if ( rqd == svc->rqd )
>          {
> +            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
> +                continue;
>
I'd add a short comment on how it is possible that we are here and don't
have hard affinity with any pCPU in our runqueue (exactly as you
explained it during the v1 review, i.e., in case affinity is being changed).

>              rqd_avgload = rqd->b_avgload - svc->avgload;
>          }
>          else if ( spin_trylock(&rqd->lock) )
>          {
> +            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
> +            {
> +                spin_unlock(&rqd->lock);
> +                continue;
> +            }
>              rqd_avgload = rqd->b_avgload;
>              spin_unlock(&rqd->lock);
>          }
>
Something like this would be easier to read, IMO:

           if ( rqd == svc->rqd )
           {
               if ( cpumask_intersects(vc->cpu_hard_affinity,
                                       &rqd->active) )
                   rqd_avgload = rqd->b_avgload - svc->avgload;
           }
           else if ( spin_trylock(&rqd->lock) )
           {
               if ( cpumask_intersects(vc->cpu_hard_affinity,
                                       &rqd->active) )
                   rqd_avgload = rqd->b_avgload;

               spin_unlock(&rqd->lock);
           }
           else
               continue;

The semantics should be the same, provided we initialize rqd_avgload to
MAX_LOAD, I would say.

In fact, without the two 'continue;' statements you were introducing,
we'll execute the if() that follows this block even if there was no
intersection with the hard affinity mask but, in that case, no chance we
have updated rqd_avgload, so it really should behave the same, what do
you think?

> @@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
>          }
>      }
>  
> -    /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
>      if ( min_rqi == -1 )
> -        new_cpu = vc->processor;
> +    {
> +        /* No runqs found (most likely because of spinlock contention). */
> +        new_cpu = get_safe_pcpu(svc);
> +    }
>
No need to move the comment inside the if(), just kill the 'leave it
where it is' part.

> @@ -1330,6 +1375,12 @@ retry:
>          if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
>              continue;
>  
> +        /* Skip if it can't run on the destination runq. */
> +        cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity,
> +            &st.orqd->active);
> +        if ( cpumask_empty(csched2_cpumask) )
> +            continue;
> +
>          list_for_each( pull_iter, &st.orqd->svc )
>          {
>              struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct csched2_vcpu, rqd_elem);
> @@ -1343,6 +1394,12 @@ retry:
>              if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
>                  continue;
>  
> +            /* Skip if it can't run on the destination runq. */
> +            cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
> +                &st.lrqd->active);
> +            if ( cpumask_empty(csched2_cpumask) )
> +                continue;
> +
>              consider(&st, push_svc, pull_svc);
>          }
>  
> @@ -1360,6 +1417,12 @@ retry:
>          if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
>              continue;
>  
> +        /* Skip if it can't run on the destination runq. */
> +        cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
> +            &st.lrqd->active);
> +        if ( cpumask_empty(csched2_cpumask) )
> +            continue;
> +
>
Same 4 lines of code (5, if we count the comment), repeated 3 times:
make a helper. :-)
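
E.g., something like this (only a sketch; name and exact form up for
grabs):

    /* Can svc run anywhere on rqd, hard affinity wise? */
    static bool_t vcpu_is_migrateable(const struct csched2_vcpu *svc,
                                      const struct csched2_runqueue_data *rqd)
    {
        return cpumask_intersects(svc->vcpu->cpu_hard_affinity, &rqd->active);
    }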

> @@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
>  
>      /* Check if new_cpu is valid */
>      BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
> +    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
>
Not sure.. An ASSERT() seems more suitable than a BUG_ON here... What's
the reasoning behind this?

> +    /*
> +     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
> +     * in schedule.c in case there was a hard affinity change within the same
> +     * run queue. vc will not be able to run in certain situations without
> +     * this call.
> +     */
> +    vc->processor = new_cpu;
>
Oh, and this is what was causing you troubles, in case source and
destination runqueue were the same... Help me understand, which call to
sched_move_irqs() in schedule.c were we missing? I'd say it is the one
in vcpu_migrate(), but that does not seem to care about vc->processor
(much rater about new_cpu)... what am I missing?

However, if they are not the same, the call to migrate() will override
this right away, won't it? What I mean to say is, if this is required
only in case trqd and svc->rqd are equal, can we tweak this part of
csched2_vcpu_migrate() as follows?

    if ( trqd != svc->rqd )
        migrate(ops, svc, trqd, NOW());
    else
        vc->processor = new_cpu;


Thanks and Regards,
Dario


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-03  3:15 ` Dario Faggioli
@ 2015-03-03  9:12   ` Jan Beulich
  2015-03-04 11:03     ` Dario Faggioli
  2015-03-06 15:18   ` George Dunlap
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2015-03-03  9:12 UTC (permalink / raw)
  To: Dario Faggioli, jtweaver; +Cc: xen-devel, George Dunlap, henric

>>> On 03.03.15 at 04:15, <dario.faggioli@citrix.com> wrote:
> On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:
>> --- a/xen/common/sched_credit2.c
>> +++ b/xen/common/sched_credit2.c
>> @@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
>>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
>>  
>>  /*
>> + * Use this to avoid having too many cpumask_t structs on the stack
>> + */
>> +static cpumask_t **cpumask = NULL;
>>
> Not just 'cpumask', please... It's too generic a name. Let's pick up
> something that makes it easier to understand what's the purpose of this.
> 
> I'm not really sure right now what something like that could be...
> Credit has balance_mask, but we're not going as far as introducing
> multiple step load balancing here (not with this patch, at least).
> 
> Maybe affinity_cpumask, or hard_affinity_cpumask (and we'll rename to
> something else when introducing soft affinity, if needed).
> 
>> +#define csched2_cpumask cpumask[smp_processor_id()]
>> +
> I like the idea, but put the right side between parentheses.

Parentheses? Why? There's no operator with higher precedence
than postfix ones.
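
To be concrete (illustration mine, not from the thread):

    /*
     * Even with a prefix operator applied, '*csched2_cpumask' expands to
     * '*cpumask[smp_processor_id()]', which already parses as
     * '*(cpumask[smp_processor_id()])': postfix [] binds tighter than
     * any prefix operator.
     */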

> Of course,
> what I just said about the name applies here too. :-)

Right. Suitably done, variable and macro can actually share names.

>> +static int get_safe_pcpu(struct csched2_vcpu *svc)
>> +{
>>
> I also don't like the name... __choose_cpu() maybe ?

Why is everyone liking these double underscore prefixed names so
much? They're in conflict with the library name space and hence
should be avoided. Single underscore prefixed names (with the
underscore not followed by an upper case letter) are what the
standard sets aside for file scope (i.e. static) identifiers.
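
To illustrate the rule (sketch, not from the thread):

    static int _choose_cpu(void);   /* OK for file scope (static) names    */
    static int __choose_cpu(void);  /* reserved: leading double underscore */
    static int _Choose_cpu(void);   /* reserved: underscore + capital      */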

Jan


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-03  9:12   ` Jan Beulich
@ 2015-03-04 11:03     ` Dario Faggioli
  2015-03-04 12:50       ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Dario Faggioli @ 2015-03-04 11:03 UTC (permalink / raw)
  To: JBeulich; +Cc: xen-devel, jtweaver, George Dunlap, henric



On Tue, 2015-03-03 at 09:12 +0000, Jan Beulich wrote:
> >>> On 03.03.15 at 04:15, <dario.faggioli@citrix.com> wrote:
> > On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:

> >> +#define csched2_cpumask cpumask[smp_processor_id()]
> >> +
> > I like the idea, but put the right side between parentheses.
> 
> Parentheses? Why? There's no operator with higher precedence
> than postfix ones.
> 
There certainly isn't. The why is my personal taste, mostly, which does
not count much, I know, so I grep-ed around the sources and found other
similar examples which have parentheses, and so I went ahead and asked.

However, I can certainly live without them. :-)


> >> +static int get_safe_pcpu(struct csched2_vcpu *svc)
> >> +{
> >>
> > I also don't like the name... __choose_cpu() maybe ?
> 
> Why is everyone liking these double underscore prefixed names so
> much? They're in conflict with the library name space and hence
> should be avoided. Single underscore prefixed names (with the
> underscore not followed by an upper case letter) are what the
> standard sets aside for file scope (i.e. static) identifiers.
> 
Well, I'm not sure I know why, but --from a purely aesthetic point of
view-- I actually like __foo more than _foo. However, the main reason
why I'm suggesting it here, is to follow suit, since __foo is what is
always used (in sched_credit2.c, but also in most of other files AFAICT)
for similar functions.

I see the point you're making, and I can live with _choose_cpu(), but
the result would look a bit inconsistent, IMO.

Regards,
Dario


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-04 11:03     ` Dario Faggioli
@ 2015-03-04 12:50       ` Jan Beulich
  2015-03-04 13:08         ` Dario Faggioli
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2015-03-04 12:50 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: xen-devel, jtweaver, George Dunlap, henric

>>> On 04.03.15 at 12:03, <dario.faggioli@citrix.com> wrote:
> I see the point you're making, and I can live with _choose_cpu(), but
> the result would look a bit inconsistent, IMO.

I'm tempted to ask for a cleanup patch then.

Jan


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-04 12:50       ` Jan Beulich
@ 2015-03-04 13:08         ` Dario Faggioli
  2015-03-04 13:24           ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Dario Faggioli @ 2015-03-04 13:08 UTC (permalink / raw)
  To: JBeulich; +Cc: xen-devel, jtweaver, George Dunlap, henric



On Wed, 2015-03-04 at 12:50 +0000, Jan Beulich wrote:
> >>> On 04.03.15 at 12:03, <dario.faggioli@citrix.com> wrote:
> > I see the point you're making, and I can live with _choose_cpu(), but
> > the result would look a bit inconsistent, IMO.
> 
> I'm tempted to ask for a cleanup patch then.
> 
Yep, and I'd be fine with that. If Justin is up for this, as a pre-patch
(not part of the affinity series), fine, if not, I can do it.

I've checked and sched_credit.c does both (it mostly does __foo, the
only exception being _csched_cpu_pick). sched_rt.c does __foo too,
should we take the chance and (in separate patches) clean up all of
these?

Regards,
Dario


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-04 13:08         ` Dario Faggioli
@ 2015-03-04 13:24           ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-04 13:24 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: xen-devel, jtweaver, George Dunlap, henric

>>> On 04.03.15 at 14:08, <dario.faggioli@citrix.com> wrote:
> On Wed, 2015-03-04 at 12:50 +0000, Jan Beulich wrote:
>> >>> On 04.03.15 at 12:03, <dario.faggioli@citrix.com> wrote:
>> > I see the point you're making, and I can live with _choose_cpu(), but
>> > the result would look a bit inconsistent, IMO.
>> 
>> I'm tempted to ask for a cleanup patch then.
>> 
> Yep, and I'd be fine with that. If Justin is up for this, as a pre-patch
> (not part of the affinity series), fine, if not, I can do it.
> 
> I've checked and sched_credit.c does both (it mostly does __foo, the
> only exception being _csched_cpu_pick). sched_rt.c does __foo too,
> should we take the chance and (in separate patches) cleanup all of
> these?

I'd say yes, but in the end it's all George's call.

Jan


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-03  3:15 ` Dario Faggioli
  2015-03-03  9:12   ` Jan Beulich
@ 2015-03-06 15:18   ` George Dunlap
  2015-03-06 17:02     ` Dario Faggioli
  1 sibling, 1 reply; 15+ messages in thread
From: George Dunlap @ 2015-03-06 15:18 UTC (permalink / raw)
  To: Dario Faggioli, jtweaver; +Cc: xen-devel, George Dunlap, henric

On 03/03/2015 03:15 AM, Dario Faggioli wrote:
> On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:
>> From: "Justin T. Weaver" <jtweaver@hawaii.edu>
>>
>> by making sure that vcpus only run on the pcpu(s) they are allowed to
>> run on based on their hard affinity cpu masks.
>>
> Ok, here I am reviewing this, at last... sorry for the delay! :-(
> 
>> Signed-off-by: Justin T. Weaver <jtweaver@hawaii.edu>
>> ---
>> Changes in v2:
>>  * Added dynamically allocated cpu masks to avoid putting them on the stack;
>>    replaced temp masks from v1 throughout
>>  * Added helper function for code suggested in v1 review and called it in two
>>    locations in function choose_cpu
>>  * Removed v1 change to comment in the beginning of choose_cpu
>>  * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
>>  * Removed v1 re-work of code in function migrate; only change in migrate in
>>    v2 is the assignment of a valid pcpu from the destination run queue to
>>    vc->processor
>>  * In function csched2_vcpu_migrate: removed change from v1 that called
>>    function migrate even if cur and dest run queues were the same in order
>>    to get a runq_tickle call; added processor assignment to new_cpu to fix
>>    the real underlying issue which was the vcpu not getting a call to
>>    sched_move_irqs
>>
> Aha, so that was the issue! Glad you figured this out. :-)
> 
>>  * Removed the looping added in v1 in function balance_load; may be added back
>>    later because it would help to have balance_load be more aware of hard
>>    affinity, but adding it does not affect credit2's current inability to
>>    respect hard affinity.
>>  * Removed coding style fix in function balance_load
>>  * Improved comment in function runq_candidate
>>
> Thanks for putting this recap of the changes here. :-)
> 
>> ---
>>  xen/common/sched_credit2.c |  122 +++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 108 insertions(+), 14 deletions(-)
>>
>> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
>> index cf53770..de8fb5a 100644
>> --- a/xen/common/sched_credit2.c
>> +++ b/xen/common/sched_credit2.c
>> @@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
>>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
>>  
>>  /*
>> + * Use this to avoid having too many cpumask_t structs on the stack
>> + */
>> +static cpumask_t **cpumask = NULL;
>>
> Not just 'cpumask', please... It's too generic a name. Let's pick up
> something that makes it easier to understand what's the purpose of this.
> 
> I'm not really sure right now what something like that could be...
> Credit has balance_mask, but we're not going as far as introducing
> multiple step load balancing here (not with this patch, at least).
> 
> Maybe affinity_cpumask, or hard_affinity_cpumask (and we'll rename to
> something else when introducing soft affinity, if needed).

Well I think it's just being used as scratch space, right?  Why not
scratch_mask or something like that?

>> +#define csched2_cpumask cpumask[smp_processor_id()]
>> +
> I like the idea, but put the right side between parentheses. Of course,
> what I just said about the name applies here too. :-)
> 
>> @@ -268,6 +274,23 @@ struct csched2_dom {
>>      uint16_t nr_vcpus;
>>  };
>>  
>> +/*
>> + * When a hard affinity change occurs, we may not be able to check some or
>> + * all of the other run queues for a valid new processor for the given vcpu.
>> + * Return svc's current pcpu if valid, otherwise return a safe pcpu.
>> + */
>>
> Add at least a very quick mention of why this is (or could be) necessary.
> 
>> +static int get_safe_pcpu(struct csched2_vcpu *svc)
>> +{
>>
> I also don't like the name... __choose_cpu() maybe ?

I'm not 100% happy with the name, but I think "get_safe_pcpu" is more
descriptive than just "__choose_cpu".

> 
>> +    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
>> +    if ( unlikely(cpumask_empty(csched2_cpumask)) )
>> +        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
>> +            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
>>
> VCPU2ONLINE(svc->vcpu) would make the line shorter.
> 
> Also, I know I'm the one that suggested this form for the code, but
> thinking again about it, I'm not sure the first if is worth:
> 
>     cpumask_and(csched2_cpumask, &svc->rqd->active, VCPU2ONLINE(svc->vcpu));
>     cpumask_and(csched2_cpumask, csched2_cpumask, svc->vcpu->cpu_hard_affinity);

Actually, I was going to say is there any reason not to start with:

if ( likely(cpumask_test_cpu(svc->vcpu->processor,
            svc->vcpu->cpu_hard_affinity)) )
    return svc->vcpu->processor;

And then go through doing the unions and what-not once we've determined
that the current processor isn't suitable?

> Oh, and, with either yours or my variant... can csched2_cpumask end up
> empty after the two &&-s? That's important or, below, cpumask_any will
> return garbage! :-/
> 
> From a quick inspection, it does not seem it can, in which case, I'd put
> down an ASSERT() about this somewhere. OTOH, if it can, I'd look for
> ways of preventing that to happen before getting here... It seems easier
> and more correct to do that, rather than trying to figure out what to do
> here if the mask is empty. :-O

Yes, we need to go through the code and make sure that we understand
what our assumptions are, so that people can't crash Xen by doing
irrational things like making the hard affinity not intersect with the
cpupool cpus.

If we make this an "unusual slow path", I don't see any reason not to
make the code a bit more resilient by handling cases we don't expect to
happen.  It would be good to try to make sure we don't allow a situation
where union(hard_affinity, domain->cpupool) is empty, but I'd rather the
result be something not too bizarre (e.g., picking a random cpu in the
cpupool) than having the host crash or something.

>>      }
>>  
>> -    /* FIXME: Pay attention to cpu affinity */                                                                                      
>> -

Nice to see those disappear. :-)

>> @@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
>>  
>>      /* Check if new_cpu is valid */
>>      BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
>> +    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
>>
> Not sure.. An ASSERT() seems more suitable than a BUG_ON here... What's
> the reasoning behind this?

Yes, BUG_ON() should be used where something is almost certainly going
to cause a crash anyway; so you want to cause it somewhere you'll get
the most information about what caused the crash.

ASSERT() should be used for "I believe this should always be the case,
and if it's not, I want to find out in testing".
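
Applied to the two checks above, that would give something like (a
sketch):

    /* Expected never to fire; catch violations in debug builds/testing. */
    ASSERT(cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));

    /* An uninitialized new_cpu would crash us soon anyway, so better to
     * die right here, where the cause is still evident. */
    BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));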

>> +    /*
>> +     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
>> +     * in schedule.c in case there was a hard affinity change within the same
>> +     * run queue. vc will not be able to run in certain situations without
>> +     * this call.
>> +     */
>> +    vc->processor = new_cpu;
>>
> Oh, and this is what was causing you troubles, in case source and
> destination runqueue were the same... Help me understand, which call to
> sched_move_irqs() in schedule.c were we missing? I'd say it is the one
> in vcpu_migrate(), but that does not seem to care about vc->processor
> (much rather about new_cpu)... what am I missing?
> 
> However, if they are not the same, the call to migrate() will override
> this right away, won't it? What I mean to say is, if this is required
> only in case trqd and svc->rqd are equal, can we tweak this part of
> csched2_vcpu_migrate() as follows?
> 
>     if ( trqd != svc->rqd )
>         migrate(ops, svc, trqd, NOW());
>     else
>         vc->processor = new_cpu;

It does seem a bit weird to me looking at it now that when the generic
scheduler does vcpu_migrate(), we go through all the hassle of calling
pick_cpu, and then (if the runqueue happens to change) we end up picking
*yet another* random cpu within that runqueue.

But that's a fix for another time I think.  I agree with Dario's
suggestion here.

 -George


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-06 15:18   ` George Dunlap
@ 2015-03-06 17:02     ` Dario Faggioli
  2015-03-09  7:11       ` Justin Weaver
  0 siblings, 1 reply; 15+ messages in thread
From: Dario Faggioli @ 2015-03-06 17:02 UTC (permalink / raw)
  Cc: xen-devel, jtweaver, George Dunlap, henric



On Fri, 2015-03-06 at 15:18 +0000, George Dunlap wrote:
> On 03/03/2015 03:15 AM, Dario Faggioli wrote:
> > On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:

> >>  /*
> >> + * Use this to avoid having too many cpumask_t structs on the stack
> >> + */
> >> +static cpumask_t **cpumask = NULL;
> >>
> > Not just 'cpumask', please... It's too generic a name. Let's pick up
> > something that makes it easier to understand what's the purpose of this.
> > 
> > I'm not really sure right now what something like that could be...
> > Credit has balance_mask, but we're not going as far as introducing
> > multiple step load balancing here (not with this patch, at least).
> > 
> > Maybe affinity_cpumask, or hard_affinity_cpumask (and we'll rename to
> > something else when introducing soft affinity, if needed).
> 
> Well I think it's just being used as scratch space, right?  Why not
> scratch_mask or something like that?
> 
Fine with me.

> >> +static int get_safe_pcpu(struct csched2_vcpu *svc)
> >> +{
> >>
> > I also don't like the name... __choose_cpu() maybe ?
> 
> I'm not 100% happy with the name, but I think "get_safe_pcpu" is more
> descriptive than just "__choose_cpu".
>
I don't like the "_safe_" part, but that is not a big deal, I certainly
can live with it.

> >> +    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
> >> +    if ( unlikely(cpumask_empty(csched2_cpumask)) )
> >> +        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
> >> +            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
> >>
> > VCPU2ONLINE(svc->vcpu) would make the line shorter.
> > 
> > Also, I know I'm the one that suggested this form for the code, but
> > thinking again about it, I'm not sure the first if is worth:
> > 
> >     cpumask_and(csched2_cpumask, &svc->rqd->active, VCPU2ONLINE(svc->vcpu));
> >     cpumask_and(csched2_cpumask, csched2_cpumask, svc->vcpu->cpu_hard_affinity);
> 
> Actually, I was going to say is there any reason not to start with:
> 
> if ( likely(cpumask_test_cpu(svc->vcpu->processor,
>             svc->vcpu->cpu_hard_affinity)) )
>     return svc->vcpu->processor;
> 
> And then go through doing the unions and what-not once we've determined
> that the current processor isn't suitable?
> 
I like the idea, and I think it actually makes sense. I'm trying to
think of a scenario where this can be buggy, and where we actually need
to do the filtering against rqd->active and online up front, but I can't
imagine one.

I think the possible source of trouble is affinity being changed, but
even then, if we were on vcpu->processor, and that still is in our hard
affinity mask, it ought to be right to stay there (and hence George's
suggestion should be fine)... Justin, what do you think?

> > Oh, and, with either yours or my variant... can csched2_cpumask end up
> > empty after the two &&-s? That's important or, below, cpumask_any will
> > return garbage! :-/
> > 
> > From a quick inspection, it does not seem it can, in which case, I'd put
> > down an ASSERT() about this somewhere. OTOH, if it can, I'd look for
> > ways of preventing that to happen before getting here... It seems easier
> > and more correct to do that, rather than trying to figure out what to do
> > here if the mask is empty. :-O
> 
> Yes, we need to go through the code and make sure that we understand
> what our assumptions are, so that people can't crash Xen by doing
> irrational things like making the hard affinity not intersect with the
> cpupool cpus.
> 
True.

Something like that can be figured out and either forbidden or, in
general, addressed in other places, rather than requiring us to care
here. In fact, this seems to me to be what's happening already (see
below).

> If we make this an "unusual slow path", I don't see any reason not to
> make the code a bit more resilient by handling cases we don't expect to
> happen.
>
Well, again, true, in a general fashion. However --perhaps because I'm
not sure I'm getting what "unusual slow path" means in this context-- if
we say that we support hard affinity, I see the fact that vCPUs can be
found on pCPUs outside of their hard affinity as a very bad thing.

For instance, people may rely on this for various reasons, e.g., to
achieve a high level of isolation between domains, by partitioning the
hard affinity of their vCPUs accordingly, and this isolation not being
enforced can screw things arbitrarily bad for them, I think.

Whether this is worth exploding over is probably debatable (and workload and
use case dependent), but it definitely falls in the "I want to catch
this during testing" (so ==> ASSERT) category, IMO.

> It would be good to try to make sure we don't allow a situation
> where union(hard_affinity, domain->cpupool) is empty, but I'd rather the
> result be something not too bizarre (e.g., picking a random cpu in the
> cpupool) than having the host crash or something.
> 
Yeah, I like robustness... but only up to a certain extent, I think, or
you end up having to deal with all sort of nonsensical situations pretty
much everywhere! :-)

AFAICR, and having another quick look around, I think
intersection(hard_affinity,domain->cpupool) (because with union(), you
really meant intersection(), right? ;-P) can't be empty, and that's The
Right Thing (TM).

This is because, when a domain is moved to a cpupool, its hard and soft
affinity are just reset to "all" (see sched_move_domain() in schedule.c).
Also, when the set of pCPUs of a cpupool is altered, or on CPU hotplug,
if that alteration is making the intersection empty, again the hard
affinity is reset to "all" (see cpu_disable_scheduler()).

So, dealing with this here with anything other than ASSERTs would at a
minimum look confusing to me. It would make me think that it is indeed
something possible and legitimate, and hence question why the code cited
above does what it does, etc. etc.

So, to summarize: provided the analysis above is verified, I wouldn't
BUG_ON, nor I would try to handle it, and I'd go for an ASSERT.

> >> -    /* FIXME: Pay attention to cpu affinity */                                                                                      
> >> -
> 
> Nice to see those disappear. :-)
> 
:-)

> >> +    /*
> >> +     * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
> >> +     * in schedule.c in case there was a hard affinity change within the same
> >> +     * run queue. vc will not be able to run in certain situations without
> >> +     * this call.
> >> +     */
> >> +    vc->processor = new_cpu;
> >>
> > Oh, and this is what was causing you troubles, in case source and
> > destination runqueue were the same... Help me understand, which call to
> > sched_move_irqs() in schedule.c were we missing? I'd say it is the one
> > in vcpu_migrate(), but that does not seem to care about vc->processor
> > (much rather about new_cpu)... what am I missing?
> > 
> > However, if they are not the same, the call to migrate() will override
> > this right away, won't it? What I mean to say is, if this is required
> > only in case trqd and svc->rqd are equal, can we tweak this part of
> > csched2_vcpu_migrate() as follows?
> > 
> >     if ( trqd != svc->rqd )
> >         migrate(ops, svc, trqd, NOW());
> >     else
> >         vc->processor = new_cpu;
> 
> It does seem a bit weird to me looking at it now that when the generic
> scheduler does vcpu_migrate(), we go through all the hassle of calling
> pick_cpu, and then (if the runqueue happens to change) we end up picking
> *yet another* random cpu within that runqueue.
> 
Not sure I'm getting 100% of what you mean here... Are you complaining
about the fact that, independently of Justin's patch, inside migrate() we
call cpumask_any() instead than using new_cpu from here? If yes, that
makes sense, and it should not be too hard to fix...

> But that's a fix for another time I think.  
>
Agreed (again, if I got what you meant :-) ).

Regards,
Dario


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-06 17:02     ` Dario Faggioli
@ 2015-03-09  7:11       ` Justin Weaver
  2015-03-09 11:45         ` George Dunlap
  2015-03-10 13:23         ` Dario Faggioli
  0 siblings, 2 replies; 15+ messages in thread
From: Justin Weaver @ 2015-03-09  7:11 UTC (permalink / raw)
  To: Dario Faggioli, George Dunlap, Jan Beulich; +Cc: Henri Casanova, xen-devel

Thanks to all for the comments! I've implemented most of the changes
recommended here in the v2 review. I should have a new patch set ready
this week (with an updated soft affinity patch as well). I'll just
address the comments where you asked for my feedback...

>>              rqd_avgload = rqd->b_avgload - svc->avgload;
>>          }
>>          else if ( spin_trylock(&rqd->lock) )
>>          {
>> +            if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
>> +            {
>> +                spin_unlock(&rqd->lock);
>> +                continue;
>> +            }
>>              rqd_avgload = rqd->b_avgload;
>>              spin_unlock(&rqd->lock);
>>          }
>>
> Something like this would be easier to read, IMO:
>
>            if ( rqd == svc->rqd )
>            {
>                if ( cpumask_intersects(vc->cpu_hard_affinity,
>                                        &rqd->active) )
>                    rqd_avgload = rqd->b_avgload - svc->avgload;
>            }
>            else if ( spin_trylock(&rqd->lock) )
>            {
>                if ( cpumask_intersects(vc->cpu_hard_affinity,
>                                        &rqd->active) )
>                    rqd_avgload = rqd->b_avgload;
>
>                spin_unlock(&rqd->lock);
>            }
>            else
>                continue;
>
> The semantics should be the same, provided we initialize rqd_avgload to
> MAX_LOAD, I would say.
>
> In fact, without the two 'continue;' statements you were introducing,
> we'll execute the if() that follows this block even if there was no
> intersection with the hard affinity mask but, in that case, no chance we
> have updated rqd_avgload, so it really should behave the same, what do
> you think?

I like it. I implemented your changes along with initializing
rqd_avgload to MAX_LOAD. I think it's definitely easier to read
without the continue statements and without multiple spin_unlock
statements.

>> @@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
>>
>>      /* Check if new_cpu is valid */
>>      BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
>> +    BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
>>
> Not sure.. An ASSERT() seems more suitable than a BUG_ON here... What's
> the reasoning behind this?

I was just trying to identify a state that this function should never
be in. After reading through the discussion in the review I understand
that ASSERT is more appropriate. It's changed for v3.

> Oh, and this is what was causing you troubles, in case source and
> destination runqueue were the same... Help me understand, which call to
> sched_move_irqs() in schedule.c were we missing? I'd say it is the one
> in vcpu_migrate(), but that does not seem to care about vc->processor
> (much rather about new_cpu)... what am I missing?
>
> However, if they are not the same, the call to migrate() will override
> this right away, won't it? What I mean to say is, if this is required
> only in case trqd and svc->rqd are equal, can we tweak this part of
> csched2_vcpu_migrate() as follows?
>
>     if ( trqd != svc->rqd )
>         migrate(ops, svc, trqd, NOW());
>     else
>         vc->processor = new_cpu;

You are right; it does not have anything to do with sched_move_irqs()
not being called (like you said it doesn't care about vc->processor).
You are never going to believe any of my explanations now! :) Well
third time's a charm hopefully... Without the processor assignment
here the vcpu might go on being assigned to a processor it no longer
is allowed to run on. In that case, function runq_candidate may only
get called for the vcpu's old processor, and runq_candidate will no
longer let a vcpu run on a processor that it's not allowed to run on
(because of the hard affinity check first introduced in v1 of this
patch). So in that condition the vcpu never gets to run. That's still
somewhat of a vague explanation, but I have observed that that is what
happens. Anyway I think everyone agrees that the processor assignment
needs to be here, and I did move it to an else block for v3 like you
recommended above.
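
So the tail of csched2_vcpu_migrate() in v3 ends up looking like this
(sketch):

    if ( trqd != svc->rqd )
        migrate(ops, svc, trqd, NOW());
    else
        vc->processor = new_cpu;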

>> >> +static int get_safe_pcpu(struct csched2_vcpu *svc)
>> >> +{
>> >>
>> > I also don't like the name... __choose_cpu() maybe ?
>>
>> I'm not 100% happy with the name, but I think "get_safe_pcpu" is more
>> descriptive than just "__choose_cpu".
>>
> I don't like the "_safe_" part, but that is not a big deal, I certainly
> can live with it.

I changed it to _choose_valid_pcpu ... discuss! (Also, I can send out
a pre-patch to change the double underscores in the whole file)

>> >> +    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity, &svc->rqd->active);
>> >> +    if ( unlikely(cpumask_empty(csched2_cpumask)) )
>> >> +        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
>> >> +            cpupool_online_cpumask(svc->vcpu->domain->cpupool));
>> >>
>> > VCPU2ONLINE(svc->vcpu) would make the line shorter.

I agree. VCPU2ONLINE is defined in schedule.c ... do you want me to
move it to a common header along with the other parts we discussed
(__vcpu_has_soft_affinity, etc.)? Okay to move them to sched-if.h, or
should I put them in a new header file?
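
For reference, the macro in question is just a thin wrapper (quoting
schedule.c from memory, so worth double-checking before moving it):

    #define VCPU2ONLINE(_v) cpupool_online_cpumask((_v)->domain->cpupool)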

>> > Also, I know I'm the one that suggested this form for the code, but
>> > thinking again about it, I'm not sure the first if is worth:
>> >
>> >     cpumask_and(csched2_cpumask, &svc->rqd->active, VCPU2ONLINE(svc->vcpu));
>> >     cpumask_and(csched2_cpumask, csched2_cpumask, svc->vcpu->cpu_hard_affinity);
>>
>> Actually, I was going to say is there any reason not to start with:
>>
>> if ( likely(cpumask_test_cpu(svc->vcpu->processor,
>>             svc->vcpu->cpu_hard_affinity)) )
>>     return svc->vcpu->processor;
>>
>> And then go through doing the unions and what-not once we've determined
>> that the current processor isn't suitable?
>>
> I like the idea, and I think it actually makes sense. I'm trying to
> think to a scenario where this can be bugous, and where we actually need
> to do the filtering against rqd->active and online up front, but I can't
> imagine one.
>
> I think the possible source of trouble is affinity being changed, but
> even then, if we were on vcpu->processor, and that still is in our hard
> affinity mask, it ought to be right to stay there (and hence George's
> suggestion should be fine)... Justin, what do you think?

I think doing "if ( likely(cpumask_test_cpu(svc->vcpu->processor,
svc->vcpu->cpu_hard_affinity)) )" first is the way to go.

>> > Oh, and, with either yours or my variant... can csched2_cpumask end up
>> > empty after the two &&-s? That's important or, below, cpumask_any will
>> > return garbage! :-/
>> >
>> > From a quick inspection, it does not seem it can, in which case, I'd put
>> > down an ASSERT() about this somewhere. OTOH, if it can, I'd look for
>> > ways of preventing that to happen before getting here... It seems easier
>> > and more correct to do that, rather than trying to figure out what to do
>> > here if the mask is empty. :-O
>>
>> Yes, we need to go through the code and make sure that we understand
>> what our assumptions are, so that people can't crash Xen by doing
>> irrational things like making the hard affinity not intersect with the
>> cpupool cpus.
>>
> True.
>
> Something like that can be figured out and either forbidden or, in
> general, addressed in other places, rather than requiring us to care
> here. In fact, this seems to me to be what's happening already (see
> below).

I don't think there's any way the mask can be empty after the
cpumask_and with the cpu_hard_affinity and the VCPU2ONLINE. In
schedule.c:vcpu_set_hard_affinity, if the intersection of the new hard
mask and VCPU2ONLINE is empty, the function returns -EINVAL. I took
into account all the discussion here and modified the function for v3.
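
The check in question is essentially this (paraphrased from
schedule.c:vcpu_set_hard_affinity()):

    /* Reject a hard affinity mask with no online cpu in the vcpu's pool. */
    cpumask_and(&online_affinity, affinity, VCPU2ONLINE(v));
    if ( cpumask_empty(&online_affinity) )
        return -EINVAL;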

Thank you!
Justin Weaver
University of Hawaii at Manoa


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-09  7:11       ` Justin Weaver
@ 2015-03-09 11:45         ` George Dunlap
  2015-03-09 15:07           ` Dario Faggioli
  2015-03-10 13:23         ` Dario Faggioli
  1 sibling, 1 reply; 15+ messages in thread
From: George Dunlap @ 2015-03-09 11:45 UTC (permalink / raw)
  To: Justin Weaver, Dario Faggioli, Jan Beulich; +Cc: Henri Casanova, xen-devel

On 03/09/2015 07:11 AM, Justin Weaver wrote:
>>>>> +static int get_safe_pcpu(struct csched2_vcpu *svc)
>>>>> +{
>>>>>
>>>> I also don't like the name... __choose_cpu() maybe ?
>>>
>>> I'm not 100% happy with the name, but I think "get_safe_pcpu" is more
>>> descriptive than just "__choose_cpu".
>>>
>> I don't like the "_safe_" part, but that is not a big deal, I certainly
>> can live with it.
> 
> I changed it to _choose_valid_pcpu ... discuss! (Also, I can send out
> a pre-patch to change the double underscores in the whole file)

What about "get_fallback_cpu()"?  That's basically what we want when
this function is called -- the runqueue we wanted wasn't available, so
we just want somewhere else reasonable to put it.
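
Concretely, I'm thinking of something like this (just a sketch, reusing
the mask handling from the v2 patch):

    static int get_fallback_cpu(struct csched2_vcpu *svc)
    {
        /* Common case: the current pcpu is still in the hard affinity. */
        if ( likely(cpumask_test_cpu(svc->vcpu->processor,
                                     svc->vcpu->cpu_hard_affinity)) )
            return svc->vcpu->processor;

        /* Otherwise, prefer cpus that are both in the hard affinity
         * and in this runqueue... */
        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                    &svc->rqd->active);

        /* ...and, failing that, fall back to the pool's online cpus. */
        if ( unlikely(cpumask_empty(csched2_cpumask)) )
            cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
                        cpupool_online_cpumask(svc->vcpu->domain->cpupool));

        return cpumask_any(csched2_cpumask);
    }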

>>>> Oh, and, with either yours or my variant... can csched2_cpumask end up
>>>> empty after the two &&-s? That's important or, below, cpumask_any will
>>>> return garbage! :-/
>>>>
>>>> From a quick inspection, it does not seem it can, in which case, I'd put
>>>> down an ASSERT() about this somewhere. OTOH, if it can, I'd look for
>>>> ways of preventing that to happen before getting here... It seems easier
>>>> and more correct to do that, rather than trying to figure out what to do
>>>> here if the mask is empty. :-O
>>>
>>> Yes, we need to go through the code and make sure that we understand
>>> what our assumptions are, so that people can't crash Xen by doing
>>> irrational things like making the hard affinity not intersect with the
>>> cpupool cpus.
>>>
>> True.
>>
>> Something like that can be figured out and either forbidden or, in
>> general, addressed in other places, rather than requiring us to care
>> here. In fact, this seems to me to be what's happening already (see
>> below).
> 
> I don't think there's any way the mask can be empty after the
> cpumask_and with the cpu_hard_affinity and the VCPU2ONLINE. In
> schedule.c:vcpu_set_hard_affinity, if the intersection of the new hard
> mask and VCPU2ONLINE is empty, the function returns -EINVAL. I took
> into account all the discussion here and modified the function for v3.

What about this:

* Create a cpu pool with cpus 0 and 1.  online_cpus is now [0,1].
* Set a hard affinity of [1].  This succeeds.
* Move cpu 1 to a different cpupool.

After a quick look I don't see anything that updates the hard affinity
when cpus are removed from pools.

And, even if it does *now*, it's possible that something might be
changed in the future that would forget it; this ASSERT() isn't exactly
next to that code.

It seems like handling the case where hard_affinity and cpus_online
don't overlap would be fairly simple; and if there were ever a bug such
that this was possible, handling the case would change that bug from
"hit an ASSERT" (or "have undefined behavior") to "have well-defined
behavior".  So it seems to me like handling that case makes the software
more robust for little cost.
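
In code terms, that could be as small as this at the end of the helper
(purely illustrative):

    /*
     * If hard affinity and the pool's online cpus no longer intersect
     * (a state other code should prevent), degrade to the online mask
     * so that cpumask_any() below stays well defined.
     */
    if ( unlikely(cpumask_empty(csched2_cpumask)) )
        cpumask_copy(csched2_cpumask,
                     cpupool_online_cpumask(svc->vcpu->domain->cpupool));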

 -George


* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-09 11:45         ` George Dunlap
@ 2015-03-09 15:07           ` Dario Faggioli
  0 siblings, 0 replies; 15+ messages in thread
From: Dario Faggioli @ 2015-03-09 15:07 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel, jtweaver, JBeulich, henric



On Mon, 2015-03-09 at 11:45 +0000, George Dunlap wrote:
> On 03/09/2015 07:11 AM, Justin Weaver wrote:

> > I don't think there's any way the mask can be empty after the
> > cpumask_and with the cpu_hard_affinity and the VCPU2ONLINE. In
> > schedule.c:vcpu_set_hard_affinity, if the intersection of the new hard
> > mask and VCPU2ONLINE is empty, the function returns -EINVAL. I took
> > into account all the discussion here and modified the function for v3.
> 
> What about this:
> 
> * Create a cpu pool with cpus 0 and 1.  online_cpus is now [0,1].
> * Set a hard affinity of [1].  This succeeds.
>
So, I decided to try this:

# xl cpupool-list -c
Name               CPU list
Pool-0              0,1

# xl list -c
Name                                        ID   Mem VCPUs	State	Time(s)         Cpupool
Domain-0                                     0   507     4     r-----      19.9           Pool-0
test-pv                                      1  2000     8     -b----      19.2           Pool-0

# xl vcpu-list test-pv
Name                                ID  VCPU   CPU State   Time(s) Affinity (Hard / Soft)
test-pv                              1     0    1   -b-       5.5  1 / 1
test-pv                              1     1    1   -b-       2.4  1 / 1
test-pv                              1     2    1   -b-       2.9  1 / 1
test-pv                              1     3    1   -b-       1.9  1 / 1
test-pv                              1     4    1   -b-       1.0  1 / 1
test-pv                              1     5    1   -b-       2.0  1 / 1
test-pv                              1     6    1   -b-       1.2  1 / 1
test-pv                              1     7    1   -b-       2.4  1 / 1

# xl cpupool-cpu-remove Pool-0 1
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) Assertion '!cpumask_empty(dom_cpumask)' failed at domain.c:460
(XEN) ****************************************

i.e., surprise surprise, there's already an assert guarding this... and
it triggers!! :-(

It seems to have been added in v4 of the per-vcpu soft affinity work:
http://xenbits.xen.org/gitweb/?p=people/dariof/xen.git;a=commit;h=4d4e999637f38e0bbd4318ad8e0c92fd1e580430

So, we must have had this discussion before! :-)

I've done a bit of archeology and the ASSERT() in
domain_update_node_affinity() was introduced by me (upon request) while
working on implementing per-vcpu soft affinity. Then, cs 93be8285 is
what causes the assert to trigger. The timing seems not to match, but that's
because soft affinity was put on hold when it was decided not to include
it in 4.4, and I probably forgot to retest with a use case similar to
the above when resubmitting it! :-(
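
For context, the assertion sits in domain_update_node_affinity(); the
relevant part looks roughly like this (paraphrased, not the exact code):

    /* Paraphrased from xen/common/domain.c:domain_update_node_affinity() */
    for_each_vcpu ( d, v )
        cpumask_or(dom_cpumask, dom_cpumask, v->cpu_hard_affinity);
    cpumask_and(dom_cpumask, dom_cpumask, online);
    ASSERT(!cpumask_empty(dom_cpumask));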

A patch to fix things is attached to this email, for convenience (I'll
submit it properly, with changelog and everything, right away).

After seeing this, I'm even more convinced that
!empty(online && hard_affinity) is really something we want and, as we
ASSERT() it in domain.c, the same should be done in sched_credit2.c.
OTOH, if we don't want to ASSERT() in the specific scheduler code, then
I'd remove the one in domain.c too (and, perhaps, just use online as a
fallback), or things would look inconsistent.

Of course, this can also be seen as a proof of George's point, that this
is something not obvious and not easy to catch. Still, I think that if
we say that we support hard vcpu affinity (a.k.a. pinning), we should
not allow vcpus outside their hard affinity, and the code must reflect
this.

> * Move cpu 1 to a different cpupool.
>
> After a quick look I don't see anything that updates the hard affinity
> when cpus are removed from pools.
> 
cpupool_unassign_cpu() calls cpupool_unassign_cpu_helper(), which calls
cpu_disable_scheduler(); if the latter finds that empty(online &&
hard_affinity) == true, it resets hard_affinity to "all".
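
Roughly this, paraphrasing cpu_disable_scheduler():

    /* If removing this cpu leaves a vcpu with no online cpu in its
     * hard affinity, reset the affinity to "all" and log it. */
    cpumask_and(&online_affinity, v->cpu_hard_affinity, c->cpu_valid);
    if ( cpumask_empty(&online_affinity) &&
         cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
    {
        printk("Breaking affinity for %pv\n", v);
        cpumask_setall(v->cpu_hard_affinity);
    }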

Note that this is reported in the log, as a confirmation that this is
really something exceptional:

  (XEN) Breaking affinity for d1v0
  (XEN) Breaking affinity for d1v1
  (XEN) Breaking affinity for d1v2
  (XEN) Breaking affinity for d1v3
  (XEN) Breaking affinity for d1v4
  (XEN) Breaking affinity for d1v5
  (XEN) Breaking affinity for d1v6
  (XEN) Breaking affinity for d1v7

And also note that, as a consequence of fiddling with cpupools, the
affinity has been altered by Xen (i.e., vcpus still always run within
their hard affinity masks):


# xl vcpu-pin 1 all 1 1
# xl cpupool-cpu-remove Pool-0 1
# xl cpupool-list -c
Name               CPU list
Pool-0             0,2,3,4,5,6,7,8,9,10,11,12,13,14,15

# xl vcpu-list test-pv
Name                                ID  VCPU   CPU State   Time(s) Affinity (Hard / Soft)
test-pv                              1     0    2   -b-       6.1  all / 1
test-pv                              1     1    6   -b-       1.6  all / 1
test-pv                              1     2    4   -b-       1.6  all / 1
test-pv                              1     3    5   -b-       3.1  all / 1
test-pv                              1     4    4   -b-       0.8  all / 1
test-pv                              1     5    7   -b-       3.0  all / 1
test-pv                              1     6    3   -b-       1.1  all / 1
test-pv                              1     7    3   -b-       0.7  all / 1

That's what drove all the reasoning when changing
domain_update_node_affinity(), and led to that ASSERT(), during the soft
affinity work. The reason why the ASSERT() triggers, as can be seen in
the patch, is that, because of the cited changeset,
domain_update_node_affinity() is called too early.

> And, even if it does *now*, it's possible that something might be
> changed in the future that would forget it; this ASSERT() isn't exactly
> next to that code.
> 
But it would help us catch the bug at some point... As proved above! :-D

BTW, I've already written and submitted an OSSTest test involving
cpupools (we don't test them at all at the moment). I'll see about
adding these kinds of things to it.

> So it seems to me like handling that case makes the software
> more robust for little cost.
> 
Yep, I also agree that it at least won't cost much, in terms of runtime
overhead.

Regards,
Dario

[-- Attachment #1.1.2: patch --]

commit 14430d25448aa85df7cef991b0f52de91e37440b
Author: Dario Faggioli <dario.faggioli@citrix.com>
Date:   Mon Mar 9 15:26:30 2015 +0100

    Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index a758a8b..ad7bbb5 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -296,6 +296,8 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
 static long cpupool_unassign_cpu_helper(void *info)
 {
     int cpu = cpupool_moving_cpu;
+    struct cpupool *c = info;
+    struct domain *d;
     long ret;
 
     cpupool_dprintk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
@@ -318,6 +320,11 @@ static long cpupool_unassign_cpu_helper(void *info)
         cpupool_cpu_moving = NULL;
     }
 
+    rcu_read_lock(&domlist_read_lock);
+    for_each_domain_in_cpupool(d, c)
+        domain_update_node_affinity(d);
+    rcu_read_unlock(&domlist_read_lock);
+
 out:
     spin_unlock(&cpupool_lock);
     cpupool_dprintk("cpupool_unassign_cpu ret=%ld\n", ret);
@@ -379,12 +386,6 @@ static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
     atomic_inc(&c->refcnt);
     cpupool_cpu_moving = c;
     cpumask_clear_cpu(cpu, c->cpu_valid);
-
-    rcu_read_lock(&domlist_read_lock);
-    for_each_domain_in_cpupool(d, c)
-        domain_update_node_affinity(d);
-    rcu_read_unlock(&domlist_read_lock);
-
     spin_unlock(&cpupool_lock);
 
     work_cpu = smp_processor_id();



* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-09  7:11       ` Justin Weaver
  2015-03-09 11:45         ` George Dunlap
@ 2015-03-10 13:23         ` Dario Faggioli
  1 sibling, 0 replies; 15+ messages in thread
From: Dario Faggioli @ 2015-03-10 13:23 UTC (permalink / raw)
  To: jtweaver; +Cc: xen-devel, George Dunlap, JBeulich, henric



On Sun, 2015-03-08 at 21:11 -1000, Justin Weaver wrote:
> Thanks to all for the comments! I've implemented most of the changes
> recommended here in the v2 review. I should have a new patch set ready
> this week (with an updated soft affinity patch as well). 
>
Great! :-)

> > Oh, and this is what was causing you troubles, in case source and
> > destination runqueue were the same... Help me understand, which call to
> > sched_move_irqs() in schedule.c were we missing? I'd say it is the one
> > in vcpu_migrate(), but that does not seem to care about vc->processor
> > (much rather about new_cpu)... what am I missing?
> >
> > However, if they are not the same, the call to migrate() will override
> > this right away, won't it? What I mean to say is, if this is required
> > only in case trqd and svc->rqd are equal, can we tweak this part of
> > csched2_vcpu_migrate() as follows?
> >
> >     if ( trqd != svc->rqd )
> >         migrate(ops, svc, trqd, NOW());
> >     else
> >         vc->processor = new_cpu;
> 
> You are right; it does not have anything to do with sched_move_irqs()
> not being called (like you said it doesn't care about vc->processor).
>
Ah, ok. :-)

> You are never going to believe any of my explanations now! :) 
>
EhEh... If I did that to everyone who fails to understand how things
work at the first or second attempt, I would stop believing myself! :-D :-D

> Without the processor assignment
> here the vcpu might go on being assigned to a processor it no longer
> is allowed to run on. 
>
Ok.

> In that case, function runq_candidate may only
> get called for the vcpu's old processor, and runq_candidate will no
> longer let a vcpu run on a processor that it's not allowed to run on
> (because of the hard affinity check first introduced in v1 of this
> patch). 
>
It mostly makes sense. Off the top of my head, it still looks like
there should be a pCPU that, when scheduling, would pick it up... I need
to think more about this...

> So in that condition the vcpu never gets to run. That's still
> somewhat of a vague explanation, but I have observed that that is what
> happens. 
>
Do you mean you _actually_ saw this, with some debugging printk-s, or
tracing, or something like this?

> Anyway I think everyone agrees that the processor assignment
> needs to be here, and I did move it to an else block for v3 like you
> recommended above.
>
Yes, that's the point: the assignment above is correct, IMO, so it
should be there, whether or not it is the cause of the issue :-)

> > I don't like the "_safe_" part, but that is not a big deal, I certainly
> > can live with it.
> 
> I changed it to _choose_valid_pcpu ... discuss! 
>
I'm fine with what George proposes in his email.

> (Also, I can send out
> a pre-patch to change the double underscores in the whole file)
> 
For static symbols, yes. As Jan says, it's George's call. If you're up
for it, I think it would be an improvement.

> >> > VCPU2ONLINE(svc->vcpu) would make the line shorter.
> 
> I agree. VCPU2ONLINE is defined in schedule.c ... do you want me to
> move it to a common header along with the other parts we discussed
> (__vcpu_has_soft_affinity, etc.)? 
>
Either that or, if you only need it once, just open-code it.

> Okay to move them to sched-if.h, or
> should I put them in a new header file?
> 
sched-if.h is ok for the step-wise load balancing macros, and it would
also be the proper place to move this to, if we go for moving it.

Thanks and Regards,
Dario



* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-02-09  3:45 [PATCH v2] sched: credit2: respect per-vcpu hard affinity Justin T. Weaver
  2015-03-03  3:15 ` Dario Faggioli
@ 2015-03-13 17:11 ` Dario Faggioli
  2015-03-14  3:48   ` Justin Weaver
  1 sibling, 1 reply; 15+ messages in thread
From: Dario Faggioli @ 2015-03-13 17:11 UTC (permalink / raw)
  To: jtweaver; +Cc: xen-devel, George Dunlap, henric



Hey, I came across this patch again for other reasons, and I realized
I've a few more (minor) comments:

On Sun, 2015-02-08 at 17:45 -1000, Justin T. Weaver wrote:
> From: "Justin T. Weaver" <jtweaver@hawaii.edu>

> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index cf53770..de8fb5a 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c

> @@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
>  
>      prv->load_window_shift = opt_load_window_shift;
>  
> +    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
> +    if ( cpumask == NULL )
> +        return -ENOMEM;
> +
This can probably be xzalloc_array(), or even xmalloc_array(), if we
don't care about zeroing all the elements (see below for why I don't
actually think we need to).

>      return 0;
>  }
>  
>  static void
>  csched2_deinit(const struct scheduler *ops)
>  {
> +    int i;
>      struct csched2_private *prv;
>  
>      prv = CSCHED2_PRIV(ops);
>      xfree(prv);
> +
> +    for ( i = 0; i < nr_cpu_ids; i++ )
> +        free_cpumask_var(cpumask[i]);
> +    xfree(cpumask);
>
Do we need the loop? I mean, all the pcpus go through
csched2_alloc_pdata(), when being activated under this scheduler, which
allocates their scratch masks. They then go through
csched2_free_pdata(), when deactivated from this scheduler, which frees
their scratch masks.

I would then expect that, when we get here, all the elements of the
scratch mask array that were allocated have also been freed, and hence
only freeing the array itself is necessary.

Am I missing something?

Regards,
Dario



* Re: [PATCH v2] sched: credit2: respect per-vcpu hard affinity
  2015-03-13 17:11 ` Dario Faggioli
@ 2015-03-14  3:48   ` Justin Weaver
  0 siblings, 0 replies; 15+ messages in thread
From: Justin Weaver @ 2015-03-14  3:48 UTC (permalink / raw)
  To: Dario Faggioli; +Cc: xen-devel, George Dunlap, henric

Dario,

>> Without the processor assignment
>> here the vcpu might go on being assigned to a processor it no longer
>> is allowed to run on.
>>
> Ok.
>
>> In that case, function runq_candidate may only
>> get called for the vcpu's old processor, and runq_candidate will no
>> longer let a vcpu run on a processor that it's not allowed to run on
>> (because of the hard affinity check first introduced in v1 of this
>> patch).
>>
> It mostly makes sense. Out of the top of my head, it still looks like
> there should be a pCPU that, when scheduling, would pick it up... I need
> to think more about this...
>
>> So in that condition the vcpu never gets to run. That's still
>> somewhat of a vague explanation, but I have observed that that is what
>> happens.
>>
> Do you mean you _actually_ saw this, with some debugging printk-s, or
> tracing, or something like this?

Yes, I saw the above behavior using printk-s. I commented out the line
that does the processor assignment when the run queues are the same,
and put some printks in key spots. For the test, I had only one guest
vcpu running; it had hard affinity with all cpus, and I used xl
vcpu-list to see that it was on cpu 15 (two runqs, 0-7 and 8-15). I
then pinned it to 9 and it became unresponsive (vcpu-list showed ---),
and the printk output showed, over and over again, runq_candidate
saying something like 'vcpu can't run on cpu 15, not in hard affinity
mask'. A call to runq_candidate for cpu 9 never came. Pinning it back
to 15 (or all) started it running again.
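
For reference, the messages came from the hard affinity check that this
patch adds to runq_candidate(), which is essentially:

    /* Only consider vcpus allowed to run on this processor. */
    if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
        continue;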


>> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
>> index cf53770..de8fb5a 100644
>> --- a/xen/common/sched_credit2.c
>> +++ b/xen/common/sched_credit2.c
>
>> @@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
>>
>>      prv->load_window_shift = opt_load_window_shift;
>>
>> +    cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
>> +    if ( cpumask == NULL )
>> +        return -ENOMEM;
>> +
> This can probably be xzalloc_array(), or even xmalloc_array(), if we
> don't care zeroing all elements (see below about why I actually don't
> think we need).

OK, I'll make this change. Yes, they don't have to be zeroed based on
what you point out below.

>>      return 0;
>>  }
>>
>>  static void
>>  csched2_deinit(const struct scheduler *ops)
>>  {
>> +    int i;
>>      struct csched2_private *prv;
>>
>>      prv = CSCHED2_PRIV(ops);
>>      xfree(prv);
>> +
>> +    for ( i = 0; i < nr_cpu_ids; i++ )
>> +        free_cpumask_var(cpumask[i]);
>> +    xfree(cpumask);
>>
> Do we need the loop? I mean, all the pcpus go through
> csched2_alloc_pdata(), when being activated under this scheduler, which
> allocates their scratch masks. They then go through
> csched2_free_pdata(), when deactivated from this scheduler, which frees
> their scratch masks.
>
> I would then expect that, when we get to here, all the elemtns of the
> scratch mask array that were allocated, have also been freed, and hence
> only freeing the array is necessary.
>
> Am I missing something?

I agree; what we allocate in csched2_init is what should be
deallocated in csched2_deinit. They should match. I'll make the
change.
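
That is, something like this for v3 (a sketch; the per-cpu masks
themselves are allocated and freed in csched2_alloc_pdata() and
csched2_free_pdata()):

    /* csched2_init(): just allocate the array of pointers. */
    cpumask = xzalloc_array(cpumask_t *, nr_cpu_ids);
    if ( cpumask == NULL )
        return -ENOMEM;

    /* csched2_deinit(): the per-cpu masks are already gone, so only
     * the array itself remains to be freed. */
    xfree(cpumask);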

Thanks,
Justin


end of thread, other threads:[~2015-03-14  3:48 UTC | newest]

Thread overview: 15+ messages
2015-02-09  3:45 [PATCH v2] sched: credit2: respect per-vcpu hard affinity Justin T. Weaver
2015-03-03  3:15 ` Dario Faggioli
2015-03-03  9:12   ` Jan Beulich
2015-03-04 11:03     ` Dario Faggioli
2015-03-04 12:50       ` Jan Beulich
2015-03-04 13:08         ` Dario Faggioli
2015-03-04 13:24           ` Jan Beulich
2015-03-06 15:18   ` George Dunlap
2015-03-06 17:02     ` Dario Faggioli
2015-03-09  7:11       ` Justin Weaver
2015-03-09 11:45         ` George Dunlap
2015-03-09 15:07           ` Dario Faggioli
2015-03-10 13:23         ` Dario Faggioli
2015-03-13 17:11 ` Dario Faggioli
2015-03-14  3:48   ` Justin Weaver
