Xen-Devel Archive on lore.kernel.org
 help / color / Atom feed
From: Juergen Gross <jgross@suse.com>
To: xen-devel@lists.xenproject.org
Cc: Juergen Gross <jgross@suse.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Dario Faggioli <dfaggioli@suse.com>
Subject: [Xen-devel] [PATCH v4 3/4] xen/sched: add minimalistic idle scheduler for free cpus
Date: Sat, 14 Sep 2019 07:19:43 +0200
Message-ID: <20190914051944.21409-4-jgross@suse.com> (raw)
In-Reply-To: <20190914051944.21409-1-jgross@suse.com>

Instead of having a full blown scheduler running for the free cpus
add a very minimalistic scheduler for that purpose only ever scheduling
the related idle vcpu. This has the big advantage of not needing any
per-cpu, per-domain or per-scheduling unit data for free cpus and in
turn simplifying moving cpus to and from cpupools a lot.

Right now, CPUs that are not in any pool, still belong to Pool-0's
scheduler. This forces us to make, within the scheduler, extra effort
to avoid actually running vCPUs on those.

In the case of Credit1, this also cause issue to weights
(re)distribution, as the number of CPUs available to the scheduler is
wrong.

This is described in the changelog of commit e7191920261d ("xen:
credit2: never consider CPUs outside of our cpupool").

This new scheduler will just use a common lock for all free cpus.

As this new scheduler is not user selectable don't register it as an
official scheduler, but just include it in schedule.c.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Dario Faggioli <dfaggioli@suse.com>
---
V3:
- use ZERO_BLOCK_PTR instead of (void *)1 (Jan Beulich, Andrew Cooper)
---
 xen/common/sched_credit.c |   9 ---
 xen/common/sched_null.c   |   7 ---
 xen/common/schedule.c     | 153 +++++++++++++++++++++++-----------------------
 3 files changed, 75 insertions(+), 94 deletions(-)

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 81dee5e472..70fe718127 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -617,15 +617,6 @@ csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
 {
     unsigned long flags;
     struct csched_private *prv = CSCHED_PRIV(ops);
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-
-    /*
-     * This is called either during during boot, resume or hotplug, in
-     * case Credit1 is the scheduler chosen at boot. In such cases, the
-     * scheduler lock for cpu is already pointing to the default per-cpu
-     * spinlock, as Credit1 needs it, so there is no remapping to be done.
-     */
-    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
 
     spin_lock_irqsave(&prv->lock, flags);
     init_pdata(prv, pdata, cpu);
diff --git a/xen/common/sched_null.c b/xen/common/sched_null.c
index 26c6f0f129..6782ecda5c 100644
--- a/xen/common/sched_null.c
+++ b/xen/common/sched_null.c
@@ -167,17 +167,10 @@ static void init_pdata(struct null_private *prv, unsigned int cpu)
 static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
 {
     struct null_private *prv = null_priv(ops);
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
 
     /* alloc_pdata is not implemented, so we want this to be NULL. */
     ASSERT(!pdata);
 
-    /*
-     * The scheduler lock points already to the default per-cpu spinlock,
-     * so there is no remapping to be done.
-     */
-    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
-
     init_pdata(prv, cpu);
 }
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 93164c64f6..fdeec10c3b 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -54,6 +54,10 @@ boolean_param("sched_smt_power_savings", sched_smt_power_savings);
  * */
 int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
 integer_param("sched_ratelimit_us", sched_ratelimit_us);
+
+/* Common lock for free cpus. */
+static DEFINE_SPINLOCK(sched_free_cpu_lock);
+
 /* Various timer handlers. */
 static void s_timer_fn(void *unused);
 static void vcpu_periodic_timer_fn(void *data);
@@ -73,6 +77,58 @@ extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_arr
 
 static struct scheduler __read_mostly ops;
 
+static spinlock_t *
+sched_idle_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                        void *pdata, void *vdata)
+{
+    idle_vcpu[cpu]->sched_priv = NULL;
+
+    return &sched_free_cpu_lock;
+}
+
+static int
+sched_idle_cpu_pick(const struct scheduler *ops, struct vcpu *v)
+{
+    return v->processor;
+}
+
+static void *
+sched_idle_alloc_vdata(const struct scheduler *ops, struct vcpu *v,
+                       void *dd)
+{
+    /* Any non-NULL pointer is fine here. */
+    return ZERO_BLOCK_PTR;
+}
+
+static void
+sched_idle_free_vdata(const struct scheduler *ops, void *priv)
+{
+}
+
+static struct task_slice sched_idle_schedule(
+    const struct scheduler *ops, s_time_t now,
+    bool tasklet_work_scheduled)
+{
+    const unsigned int cpu = smp_processor_id();
+    struct task_slice ret = { .time = -1 };
+
+    ret.task = idle_vcpu[cpu];
+    return ret;
+}
+
+static struct scheduler sched_idle_ops = {
+    .name           = "Idle Scheduler",
+    .opt_name       = "idle",
+    .sched_data     = NULL,
+
+    .pick_cpu       = sched_idle_cpu_pick,
+    .do_schedule    = sched_idle_schedule,
+
+    .alloc_vdata    = sched_idle_alloc_vdata,
+    .free_vdata     = sched_idle_free_vdata,
+    .switch_sched   = sched_idle_switch_sched,
+};
+
 static inline struct scheduler *dom_scheduler(const struct domain *d)
 {
     if ( likely(d->cpupool != NULL) )
@@ -1587,12 +1643,10 @@ static void poll_timer_fn(void *data)
 static int cpu_schedule_up(unsigned int cpu)
 {
     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-    void *sched_priv;
 
-    per_cpu(scheduler, cpu) = &ops;
+    per_cpu(scheduler, cpu) = &sched_idle_ops;
     spin_lock_init(&sd->_lock);
-    sd->schedule_lock = &sd->_lock;
-    sd->curr = idle_vcpu[cpu];
+    sd->schedule_lock = &sched_free_cpu_lock;
     init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
     atomic_set(&sd->urgent_count, 0);
 
@@ -1602,40 +1656,19 @@ static int cpu_schedule_up(unsigned int cpu)
 
     if ( idle_vcpu[cpu] == NULL )
         vcpu_create(idle_vcpu[0]->domain, cpu, cpu);
-    else
-    {
-        struct vcpu *idle = idle_vcpu[cpu];
-
-        /*
-         * During (ACPI?) suspend the idle vCPU for this pCPU is not freed,
-         * while its scheduler specific data (what is pointed by sched_priv)
-         * is. Also, at this stage of the resume path, we attach the pCPU
-         * to the default scheduler, no matter in what cpupool it was before
-         * suspend. To avoid inconsistency, let's allocate default scheduler
-         * data for the idle vCPU here. If the pCPU was in a different pool
-         * with a different scheduler, it is schedule_cpu_switch(), invoked
-         * later, that will set things up as appropriate.
-         */
-        ASSERT(idle->sched_priv == NULL);
 
-        idle->sched_priv = sched_alloc_vdata(&ops, idle,
-                                             idle->domain->sched_priv);
-        if ( idle->sched_priv == NULL )
-            return -ENOMEM;
-    }
     if ( idle_vcpu[cpu] == NULL )
         return -ENOMEM;
 
     /*
-     * We don't want to risk calling xfree() on an sd->sched_priv
-     * (e.g., inside free_pdata, from cpu_schedule_down() called
-     * during CPU_UP_CANCELLED) that contains an IS_ERR value.
+     * No need to allocate any scheduler data, as cpus coming online are
+     * free initially and the idle scheduler doesn't need any data areas
+     * allocated.
      */
-    sched_priv = sched_alloc_pdata(&ops, cpu);
-    if ( IS_ERR(sched_priv) )
-        return PTR_ERR(sched_priv);
 
-    sd->sched_priv = sched_priv;
+    sd->curr = idle_vcpu[cpu];
+
+    sd->sched_priv = NULL;
 
     return 0;
 }
@@ -1643,13 +1676,6 @@ static int cpu_schedule_up(unsigned int cpu)
 static void cpu_schedule_down(unsigned int cpu)
 {
     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-    struct scheduler *sched = per_cpu(scheduler, cpu);
-
-    sched_free_pdata(sched, sd->sched_priv, cpu);
-    sched_free_vdata(sched, idle_vcpu[cpu]->sched_priv);
-
-    idle_vcpu[cpu]->sched_priv = NULL;
-    sd->sched_priv = NULL;
 
     kill_timer(&sd->s_timer);
 }
@@ -1657,14 +1683,11 @@ static void cpu_schedule_down(unsigned int cpu)
 void sched_rm_cpu(unsigned int cpu)
 {
     int rc;
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-    struct scheduler *sched = per_cpu(scheduler, cpu);
 
     rcu_read_lock(&domlist_read_lock);
     rc = cpu_disable_scheduler(cpu);
     BUG_ON(rc);
     rcu_read_unlock(&domlist_read_lock);
-    sched_deinit_pdata(sched, sd->sched_priv, cpu);
     cpu_schedule_down(cpu);
 }
 
@@ -1672,8 +1695,6 @@ static int cpu_schedule_callback(
     struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
     unsigned int cpu = (unsigned long)hcpu;
-    struct scheduler *sched = per_cpu(scheduler, cpu);
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
     int rc = 0;
 
     /*
@@ -1681,39 +1702,25 @@ static int cpu_schedule_callback(
      * allocating and initializing the per-pCPU scheduler specific data,
      * as well as "registering" this pCPU to the scheduler (which may
      * involve modifying some scheduler wide data structures).
-     * This happens by calling the alloc_pdata and init_pdata hooks, in
-     * this order. A scheduler that does not need to allocate any per-pCPU
-     * data can avoid implementing alloc_pdata. init_pdata may, however, be
-     * necessary/useful in this case too (e.g., it can contain the "register
-     * the pCPU to the scheduler" part). alloc_pdata (if present) is called
-     * during CPU_UP_PREPARE. init_pdata (if present) is called during
-     * CPU_STARTING.
+     * As new pCPUs always start as "free" cpus with the minimal idle
+     * scheduler being in charge, we don't need any of that.
      *
      * On the other hand, at teardown, we need to reverse what has been done
-     * during initialization, and then free the per-pCPU specific data. This
-     * happens by calling the deinit_pdata and free_pdata hooks, in this
+     * during initialization, and then free the per-pCPU specific data. A
+     * pCPU brought down is not forced through "free" cpus, so here we need to
+     * use the appropriate hooks.
+     *
+     * This happens by calling the deinit_pdata and free_pdata hooks, in this
      * order. If no per-pCPU memory was allocated, there is no need to
      * provide an implementation of free_pdata. deinit_pdata may, however,
      * be necessary/useful in this case too (e.g., it can undo something done
      * on scheduler wide data structure during init_pdata). Both deinit_pdata
      * and free_pdata are called during CPU_DEAD.
      *
-     * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED
-     * *before* having called init_pdata. In this case, as there is no
-     * initialization needing undoing, only free_pdata should be called.
-     * This means it is possible to call free_pdata just after alloc_pdata,
-     * without a init_pdata/deinit_pdata "cycle" in between the two.
-     *
-     * So, in summary, the usage pattern should look either
-     *  - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or
-     *  - alloc_pdata-->free_pdata.
+     * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED.
      */
     switch ( action )
     {
-    case CPU_STARTING:
-        if ( system_state != SYS_STATE_resume )
-            sched_init_pdata(sched, sd->sched_priv, cpu);
-        break;
     case CPU_UP_PREPARE:
         if ( system_state != SYS_STATE_resume )
             rc = cpu_schedule_up(cpu);
@@ -1824,9 +1831,7 @@ void __init scheduler_init(void)
     idle_domain->max_vcpus = nr_cpu_ids;
     if ( vcpu_create(idle_domain, 0, 0) == NULL )
         BUG();
-    this_cpu(schedule_data).sched_priv = sched_alloc_pdata(&ops, 0);
-    BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv));
-    sched_init_pdata(&ops, this_cpu(schedule_data).sched_priv, 0);
+    this_cpu(schedule_data).curr = idle_vcpu[0];
 }
 
 /*
@@ -1834,18 +1839,14 @@ void __init scheduler_init(void)
  * cpupool, or subject it to the scheduler of a new cpupool.
  *
  * For the pCPUs that are removed from their cpupool, their scheduler becomes
- * &ops (the default scheduler, selected at boot, which also services the
- * default cpupool). However, as these pCPUs are not really part of any pool,
- * there won't be any scheduling event on them, not even from the default
- * scheduler. Basically, they will just sit idle until they are explicitly
- * added back to a cpupool.
+ * &sched_idle_ops (the idle scheduler).
  */
 int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
 {
     struct vcpu *idle;
     void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
     struct scheduler *old_ops = per_cpu(scheduler, cpu);
-    struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
+    struct scheduler *new_ops = (c == NULL) ? &sched_idle_ops : c->sched;
     struct cpupool *old_pool = per_cpu(cpupool, cpu);
     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
     spinlock_t *old_lock, *new_lock;
@@ -1865,9 +1866,6 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
     ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) ||
            (c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid)));
 
-    if ( old_ops == new_ops )
-        goto out;
-
     /*
      * To setup the cpu for the new scheduler we need:
      *  - a valid instance of per-CPU scheduler specific data, as it is
@@ -1931,7 +1929,6 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
     sched_free_vdata(old_ops, vpriv_old);
     sched_free_pdata(old_ops, ppriv_old, cpu);
 
- out:
     per_cpu(cpupool, cpu) = c;
     /* When a cpu is added to a pool, trigger it to go pick up some work */
     if ( c != NULL )
-- 
2.16.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

  parent reply index

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-14  5:19 [Xen-devel] [PATCH v4 0/4] xen/sched: use new " Juergen Gross
2019-09-14  5:19 ` [Xen-devel] [PATCH v4 1/4] xen/sched: populate cpupool0 only after all cpus are up Juergen Gross
2019-09-14  5:19 ` [Xen-devel] [PATCH v4 2/4] xen/sched: remove cpu from pool0 before removing it Juergen Gross
2019-09-24 14:14   ` Dario Faggioli
2019-09-14  5:19 ` Juergen Gross [this message]
2019-09-14  5:19 ` [Xen-devel] [PATCH v4 4/4] xen/sched: switch to debugtrace in cpupool handling Juergen Gross

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190914051944.21409-4-jgross@suse.com \
    --to=jgross@suse.com \
    --cc=dfaggioli@suse.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Xen-Devel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/xen-devel/0 xen-devel/git/0.git
	git clone --mirror https://lore.kernel.org/xen-devel/1 xen-devel/git/1.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 xen-devel xen-devel/ https://lore.kernel.org/xen-devel \
		xen-devel@lists.xenproject.org xen-devel@lists.xen.org
	public-inbox-index xen-devel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.xenproject.lists.xen-devel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git