All of lore.kernel.org
 help / color / mirror / Atom feed
From: Juergen Gross <jgross@suse.com>
To: xen-devel@lists.xenproject.org
Cc: Juergen Gross <jgross@suse.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Dario Faggioli <dfaggioli@suse.com>
Subject: [Xen-devel] [PATCH v6 16/20] xen/sched: support differing granularity in schedule_cpu_[add/rm]()
Date: Wed,  2 Oct 2019 09:27:41 +0200	[thread overview]
Message-ID: <20191002072745.24919-17-jgross@suse.com> (raw)
In-Reply-To: <20191002072745.24919-1-jgross@suse.com>

With core scheduling active schedule_cpu_[add/rm]() has to cope with
different scheduling granularity: a cpu not in any cpupool is subject
to granularity 1 (cpu scheduling), while a cpu in a cpupool might be
in a scheduling resource with more than one cpu.

Handle that by having arrays of old/new pdata and vdata and loop over
those where appropriate.

Additionally the scheduling resource(s) must either be merged or
split.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Dario Faggioli <dfaggioli@suse.com>
---
 xen/common/cpupool.c  |  18 ++--
 xen/common/schedule.c | 226 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 204 insertions(+), 40 deletions(-)

diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
index 13dffaadcf..04c3b3c04b 100644
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -536,6 +536,7 @@ static void cpupool_cpu_remove(unsigned int cpu)
         ret = cpupool_unassign_cpu_finish(cpupool0);
         BUG_ON(ret);
     }
+    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
 }
 
 /*
@@ -585,20 +586,19 @@ static void cpupool_cpu_remove_forced(unsigned int cpu)
     struct cpupool **c;
     int ret;
 
-    if ( cpumask_test_cpu(cpu, &cpupool_free_cpus) )
-        cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-    else
+    for_each_cpupool ( c )
     {
-        for_each_cpupool(c)
+        if ( cpumask_test_cpu(cpu, (*c)->cpu_valid) )
         {
-            if ( cpumask_test_cpu(cpu, (*c)->cpu_valid) )
-            {
-                ret = cpupool_unassign_cpu(*c, cpu);
-                BUG_ON(ret);
-            }
+            ret = cpupool_unassign_cpu_start(*c, cpu);
+            BUG_ON(ret);
+            ret = cpupool_unassign_cpu_finish(*c);
+            BUG_ON(ret);
         }
     }
 
+    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
+
     rcu_read_lock(&sched_res_rculock);
     sched_rm_cpu(cpu);
     rcu_read_unlock(&sched_res_rculock);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index efe077b01f..e411b6d03e 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -425,27 +425,30 @@ static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
     unit->runstate_cnt[v->runstate.state]++;
 }
 
-static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+static struct sched_unit *sched_alloc_unit_mem(void)
 {
-    struct sched_unit *unit, **prev_unit;
-    struct domain *d = v->domain;
-    unsigned int gran = cpupool_get_granularity(d->cpupool);
+    struct sched_unit *unit;
 
-    for_each_sched_unit ( d, unit )
-        if ( unit->unit_id / gran == v->vcpu_id / gran )
-            break;
+    unit = xzalloc(struct sched_unit);
+    if ( !unit )
+        return NULL;
 
-    if ( unit )
+    if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
+         !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
+         !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
     {
-        sched_unit_add_vcpu(unit, v);
-        return unit;
+        sched_free_unit_mem(unit);
+        unit = NULL;
     }
 
-    if ( (unit = xzalloc(struct sched_unit)) == NULL )
-        return NULL;
+    return unit;
+}
+
+static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
+{
+    struct sched_unit **prev_unit;
 
     unit->domain = d;
-    sched_unit_add_vcpu(unit, v);
 
     for ( prev_unit = &d->sched_unit_list; *prev_unit;
           prev_unit = &(*prev_unit)->next_in_list )
@@ -455,17 +458,31 @@ static struct sched_unit *sched_alloc_unit(struct vcpu *v)
 
     unit->next_in_list = *prev_unit;
     *prev_unit = unit;
+}
 
-    if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
-         !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
-         !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
-        goto fail;
+static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+{
+    struct sched_unit *unit;
+    struct domain *d = v->domain;
+    unsigned int gran = cpupool_get_granularity(d->cpupool);
 
-    return unit;
+    for_each_sched_unit ( d, unit )
+        if ( unit->unit_id / gran == v->vcpu_id / gran )
+            break;
 
- fail:
-    sched_free_unit(unit, v);
-    return NULL;
+    if ( unit )
+    {
+        sched_unit_add_vcpu(unit, v);
+        return unit;
+    }
+
+    if ( (unit = sched_alloc_unit_mem()) == NULL )
+        return NULL;
+
+    sched_unit_add_vcpu(unit, v);
+    sched_domain_insert_unit(unit, d);
+
+    return unit;
 }
 
 static unsigned int sched_select_initial_cpu(const struct vcpu *v)
@@ -2419,18 +2436,28 @@ static void poll_timer_fn(void *data)
         vcpu_unblock(v);
 }
 
-static int cpu_schedule_up(unsigned int cpu)
+static struct sched_resource *sched_alloc_res(void)
 {
     struct sched_resource *sr;
 
     sr = xzalloc(struct sched_resource);
     if ( sr == NULL )
-        return -ENOMEM;
+        return NULL;
     if ( !zalloc_cpumask_var(&sr->cpus) )
     {
         xfree(sr);
-        return -ENOMEM;
+        return NULL;
     }
+    return sr;
+}
+
+static int cpu_schedule_up(unsigned int cpu)
+{
+    struct sched_resource *sr;
+
+    sr = sched_alloc_res();
+    if ( sr == NULL )
+        return -ENOMEM;
 
     sr->master_cpu = cpu;
     cpumask_copy(sr->cpus, cpumask_of(cpu));
@@ -2480,6 +2507,8 @@ static void sched_res_free(struct rcu_head *head)
     struct sched_resource *sr = container_of(head, struct sched_resource, rcu);
 
     free_cpumask_var(sr->cpus);
+    if ( sr->sched_unit_idle )
+        sched_free_unit_mem(sr->sched_unit_idle);
     xfree(sr);
 }
 
@@ -2496,6 +2525,8 @@ static void cpu_schedule_down(unsigned int cpu)
     cpumask_clear_cpu(cpu, &sched_res_mask);
     set_sched_res(cpu, NULL);
 
+    /* Keep idle unit. */
+    sr->sched_unit_idle = NULL;
     call_rcu(&sr->rcu, sched_res_free);
 
     rcu_read_unlock(&sched_res_rculock);
@@ -2575,6 +2606,30 @@ static struct notifier_block cpu_schedule_nfb = {
     .notifier_call = cpu_schedule_callback
 };
 
+static const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt,
+                                              unsigned int cpu)
+{
+    const cpumask_t *mask;
+
+    switch ( opt )
+    {
+    case SCHED_GRAN_cpu:
+        mask = cpumask_of(cpu);
+        break;
+    case SCHED_GRAN_core:
+        mask = per_cpu(cpu_sibling_mask, cpu);
+        break;
+    case SCHED_GRAN_socket:
+        mask = per_cpu(cpu_core_mask, cpu);
+        break;
+    default:
+        ASSERT_UNREACHABLE();
+        return NULL;
+    }
+
+    return mask;
+}
+
 /* Initialise the data structures. */
 void __init scheduler_init(void)
 {
@@ -2730,6 +2785,46 @@ int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
      */
     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
+    if ( cpupool_get_granularity(c) > 1 )
+    {
+        const cpumask_t *mask;
+        unsigned int cpu_iter, idx = 0;
+        struct sched_unit *old_unit, *master_unit;
+        struct sched_resource *sr_old;
+
+        /*
+         * We need to merge multiple idle_vcpu units and sched_resource structs
+         * into one. As the free cpus all share the same lock we are fine doing
+         * that now. The worst which could happen would be someone waiting for
+         * the lock, thus dereferencing sched_res->schedule_lock. This is the
+         * reason we are freeing struct sched_res via call_rcu() to avoid the
+         * lock pointer suddenly disappearing.
+         */
+        mask = sched_get_opt_cpumask(c->gran, cpu);
+        master_unit = idle_vcpu[cpu]->sched_unit;
+
+        for_each_cpu ( cpu_iter, mask )
+        {
+            if ( idx )
+                cpumask_clear_cpu(cpu_iter, &sched_res_mask);
+
+            per_cpu(sched_res_idx, cpu_iter) = idx++;
+
+            if ( cpu == cpu_iter )
+                continue;
+
+            old_unit = idle_vcpu[cpu_iter]->sched_unit;
+            sr_old = get_sched_res(cpu_iter);
+            kill_timer(&sr_old->s_timer);
+            idle_vcpu[cpu_iter]->sched_unit = master_unit;
+            master_unit->runstate_cnt[RUNSTATE_running]++;
+            set_sched_res(cpu_iter, sr);
+            cpumask_set_cpu(cpu_iter, sr->cpus);
+
+            call_rcu(&sr_old->rcu, sched_res_free);
+        }
+    }
+
     new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
 
     sr->scheduler = new_ops;
@@ -2767,33 +2862,100 @@ out:
  */
 int schedule_cpu_rm(unsigned int cpu)
 {
-    struct vcpu *idle;
     void *ppriv_old, *vpriv_old;
-    struct sched_resource *sr;
+    struct sched_resource *sr, **sr_new = NULL;
+    struct sched_unit *unit;
     struct scheduler *old_ops;
     spinlock_t *old_lock;
     unsigned long flags;
+    int idx, ret = -ENOMEM;
+    unsigned int cpu_iter;
 
     rcu_read_lock(&sched_res_rculock);
 
     sr = get_sched_res(cpu);
     old_ops = sr->scheduler;
 
+    if ( sr->granularity > 1 )
+    {
+        sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
+        if ( !sr_new )
+            goto out;
+        for ( idx = 0; idx < sr->granularity - 1; idx++ )
+        {
+            sr_new[idx] = sched_alloc_res();
+            if ( sr_new[idx] )
+            {
+                sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
+                if ( !sr_new[idx]->sched_unit_idle )
+                {
+                    sched_res_free(&sr_new[idx]->rcu);
+                    sr_new[idx] = NULL;
+                }
+            }
+            if ( !sr_new[idx] )
+            {
+                for ( idx--; idx >= 0; idx-- )
+                    sched_res_free(&sr_new[idx]->rcu);
+                goto out;
+            }
+            sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
+            sr_new[idx]->scheduler = &sched_idle_ops;
+            sr_new[idx]->granularity = 1;
+
+            /* We want the lock not to change when replacing the resource. */
+            sr_new[idx]->schedule_lock = sr->schedule_lock;
+        }
+    }
+
+    ret = 0;
     ASSERT(sr->cpupool != NULL);
     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
     ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
 
-    idle = idle_vcpu[cpu];
-
     sched_do_tick_suspend(old_ops, cpu);
 
     /* See comment in schedule_cpu_add() regarding lock switching. */
     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
-    vpriv_old = idle->sched_unit->priv;
+    vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
     ppriv_old = sr->sched_priv;
 
-    idle->sched_unit->priv = NULL;
+    idx = 0;
+    for_each_cpu ( cpu_iter, sr->cpus )
+    {
+        per_cpu(sched_res_idx, cpu_iter) = 0;
+        if ( cpu_iter == cpu )
+        {
+            idle_vcpu[cpu_iter]->sched_unit->priv = NULL;
+        }
+        else
+        {
+            /* Initialize unit. */
+            unit = sr_new[idx]->sched_unit_idle;
+            unit->res = sr_new[idx];
+            unit->is_running = true;
+            sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
+            sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
+
+            /* Adjust cpu masks of resources (old and new). */
+            cpumask_clear_cpu(cpu_iter, sr->cpus);
+            cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
+
+            /* Init timer. */
+            init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
+
+            /* Last resource initializations and insert resource pointer. */
+            sr_new[idx]->master_cpu = cpu_iter;
+            set_sched_res(cpu_iter, sr_new[idx]);
+
+            /* Last action: set the new lock pointer. */
+            smp_mb();
+            sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
+
+            idx++;
+        }
+    }
     sr->scheduler = &sched_idle_ops;
     sr->sched_priv = NULL;
 
@@ -2811,9 +2973,11 @@ int schedule_cpu_rm(unsigned int cpu)
     sr->granularity = 1;
     sr->cpupool = NULL;
 
+out:
     rcu_read_unlock(&sched_res_rculock);
+    xfree(sr_new);
 
-    return 0;
+    return ret;
 }
 
 struct scheduler *scheduler_get_default(void)
-- 
2.16.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

  parent reply	other threads:[~2019-10-02  7:28 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-02  7:27 [Xen-devel] [PATCH v6 00/20] xen: add core scheduling support Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 01/20] xen/sched: add code to sync scheduling of all vcpus of a sched unit Juergen Gross
2019-10-02 13:56   ` Jürgen Groß
2019-10-02 14:38     ` Julien Grall
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 02/20] xen/sched: introduce unit_runnable_state() Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 03/20] xen/sched: add support for multiple vcpus per sched unit where missing Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 04/20] xen/sched: modify cpupool_domain_cpumask() to be an unit mask Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 05/20] xen/sched: support allocating multiple vcpus into one sched unit Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 06/20] xen/sched: add a percpu resource index Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 07/20] xen/sched: add fall back to idle vcpu when scheduling unit Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 08/20] xen/sched: make vcpu_wake() and vcpu_sleep() core scheduling aware Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 09/20] xen/sched: move per-cpu variable scheduler to struct sched_resource Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 10/20] xen/sched: move per-cpu variable cpupool " Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 11/20] xen/sched: reject switching smt on/off with core scheduling active Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 12/20] xen/sched: prepare per-cpupool scheduling granularity Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 13/20] xen/sched: split schedule_cpu_switch() Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 14/20] xen/sched: protect scheduling resource via rcu Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 15/20] xen/sched: support multiple cpus per scheduling resource Juergen Gross
2019-10-02  7:27 ` Juergen Gross [this message]
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 17/20] xen/sched: support core scheduling for moving cpus to/from cpupools Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 18/20] xen/sched: disable scheduling when entering ACPI deep sleep states Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 19/20] xen/sched: add scheduling granularity enum Juergen Gross
2019-10-02  7:27 ` [Xen-devel] [PATCH v6 20/20] docs: add "sched-gran" boot parameter documentation Juergen Gross
2019-10-03  9:47 ` [Xen-devel] [PATCH v6 00/20] xen: add core scheduling support Sergey Dyasli
2019-10-04  4:29   ` Jürgen Groß

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191002072745.24919-17-jgross@suse.com \
    --to=jgross@suse.com \
    --cc=dfaggioli@suse.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.