* [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
@ 2015-06-08 11:46 Dagaen Golomb
  2015-06-09 12:53 ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-08 11:46 UTC (permalink / raw)
  To: xen-devel
  Cc: wei.liu2, xisisu, george.dunlap, dario.faggioli, mengxu,
	jbeulich, lichong659

To do this, we create a new list that holds, for each
vcpu, the time least into the future that it may need to be
rescheduled. The scheduler chooses the lowest time off of this
list and waits until the specified time instead of running every
1 ms as it did before.

Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
---
 xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 222 insertions(+), 97 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 7c39a9e..25f0458 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -4,6 +4,7 @@
  *
  * by Sisu Xi, 2013, Washington University in Saint Louis
  * and Meng Xu, 2014, University of Pennsylvania
+ * and Dagaen Golomb, 2015, University of Pennsylvania
  *
  * based on the code of credit Scheduler
  */
@@ -134,6 +135,8 @@ struct rt_private {
     struct list_head sdom;      /* list of availalbe domains, used for dump */
     struct list_head runq;      /* ordered list of runnable vcpus */
     struct list_head depletedq; /* unordered list of depleted vcpus */
+    struct list_head timerq;    /* ascending list of next required scheduling 
+                                   decision */
     cpumask_t tickled;          /* cpus been tickled */
 };
 
@@ -143,6 +146,7 @@ struct rt_private {
 struct rt_vcpu {
     struct list_head q_elem;    /* on the runq/depletedq list */
     struct list_head sdom_elem; /* on the domain VCPU list */
+    struct list_head t_elem;    /* on the timerq */
 
     /* Up-pointers */
     struct rt_dom *sdom;
@@ -156,6 +160,7 @@ struct rt_vcpu {
     s_time_t cur_budget;        /* current budget */
     s_time_t last_start;        /* last start time */
     s_time_t cur_deadline;      /* current deadline for EDF */
+    s_time_t next_sched_needed; /* next time to make scheduling decision */
 
     unsigned flags;             /* mark __RTDS_scheduled, etc.. */
 };
@@ -197,6 +202,11 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops)
     return &rt_priv(ops)->depletedq;
 }
 
+static inline struct list_head *rt_timerq(const struct scheduler *ops)
+{
+    return &rt_priv(ops)->timerq;
+}
+
 /*
  * Queue helper functions for runq and depletedq
  */
@@ -212,6 +222,11 @@ __q_elem(struct list_head *elem)
     return list_entry(elem, struct rt_vcpu, q_elem);
 }
 
+static struct rt_vcpu * __t_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, t_elem);
+}
+
 /*
  * Debug related code, dump vcpu/cpu information
  */
@@ -231,7 +246,8 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
     printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
-           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
+           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime
+           " next_sched=%"PRI_stime"\n"
            " \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ",
             svc->vcpu->domain->domain_id,
             svc->vcpu->vcpu_id,
@@ -241,6 +257,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
             svc->cur_budget,
             svc->cur_deadline,
             svc->last_start,
+            svc->next_sched_needed,
             __vcpu_on_q(svc),
             vcpu_runnable(svc->vcpu),
             cpustr);
@@ -264,7 +281,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu)
 static void
 rt_dump(const struct scheduler *ops)
 {
-    struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter;
+    struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *timerq, *iter;
     struct rt_private *prv = rt_priv(ops);
     struct rt_vcpu *svc;
     struct rt_dom *sdom;
@@ -277,6 +294,7 @@ rt_dump(const struct scheduler *ops)
 
     runq = rt_runq(ops);
     depletedq = rt_depletedq(ops);
+    timerq = rt_timerq(ops);
 
     printk("Global RunQueue info:\n");
     list_for_each( iter, runq )
@@ -292,6 +310,14 @@ rt_dump(const struct scheduler *ops)
         rt_dump_vcpu(ops, svc);
     }
 
+    printk("Global TimerQueue info:\n");
+    list_for_each( iter, timerq )
+    {
+        svc = __t_elem(iter);
+        printk("\tvcpu %d next_sched=%"PRI_stime"\n", svc->vcpu->vcpu_id, 
+			   svc->next_sched_needed);
+    }
+
     printk("Domain info:\n");
     list_for_each( iter_sdom, &prv->sdom )
     {
@@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
         list_del_init(&svc->q_elem);
 }
 
+static inline void __t_remove(struct rt_vcpu *svc)
+{
+	if( !list_empty(&svc->t_elem) )
+		list_del_init(&svc->t_elem);
+}
+
 /*
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
@@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 }
 
 /*
+ * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
+ */
+static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *timerq = rt_timerq(ops);
+    struct list_head *iter = timerq;
+
+    ASSERT( spin_is_locked(&prv->lock) );
+
+    ASSERT( list_empty(&svc->t_elem) );
+
+	list_for_each(iter, timerq)
+	{
+		struct rt_vcpu * iter_svc = __t_elem(iter);
+		if ( svc->next_sched_needed <= iter_svc->next_sched_needed )
+				break;
+	}
+	list_add_tail(&svc->t_elem, iter);
+}
+
+/*
+ * Return how far the lowest time on the timerq (that is after NOW) is in the
+ * future.
+ * Lock must be grabbed before calling this.
+ */
+static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
+{
+    struct list_head *timerq = rt_timerq(ops);
+    struct list_head *iter, *tmp;
+
+    list_for_each_safe(iter, tmp, timerq)
+    {
+        struct rt_vcpu * iter_svc = __t_elem(iter);
+
+        if ( iter_svc->next_sched_needed > now )
+            return (iter_svc->next_sched_needed - now);
+        else
+            __t_remove(iter_svc);
+    }
+
+    return MAX_SCHEDULE;
+}
+
+/*
+ * Updates the next_sched_needed field for a vcpu, which is used for
+ * determining the next needed schedule time for this vcpu. Then updates
+ * timerq via reinsert.
+ */
+static void update_sched_time(const struct scheduler *ops, s_time_t now, 
+                              struct rt_vcpu *svc)
+{
+    /* update next needed schedule time */
+    if( test_bit(__RTDS_scheduled, &svc->flags) )
+        svc->next_sched_needed = now + svc->cur_budget;
+    else
+        svc->next_sched_needed = svc->cur_deadline;
+
+    /* Remove and reinsert to maintain sorted order in timerq */
+    __t_remove(svc);
+    __timerq_insert(ops, svc);
+
+    return;
+}
+
+/*
  * Init/Free related code
  */
 static int
@@ -413,6 +511,7 @@ rt_init(struct scheduler *ops)
     INIT_LIST_HEAD(&prv->sdom);
     INIT_LIST_HEAD(&prv->runq);
     INIT_LIST_HEAD(&prv->depletedq);
+    INIT_LIST_HEAD(&prv->timerq);
 
     cpumask_clear(&prv->tickled);
 
@@ -536,10 +635,12 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
 
     INIT_LIST_HEAD(&svc->q_elem);
     INIT_LIST_HEAD(&svc->sdom_elem);
+    INIT_LIST_HEAD(&svc->t_elem);
     svc->flags = 0U;
     svc->sdom = dd;
     svc->vcpu = vc;
     svc->last_start = 0;
+    svc->next_sched_needed = 0;
 
     svc->period = RTDS_DEFAULT_PERIOD;
     if ( !is_idle_vcpu(vc) )
@@ -579,7 +680,10 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
         rt_update_deadline(now, svc);
 
     if ( !__vcpu_on_q(svc) && vcpu_runnable(vc) && !vc->is_running )
+    {
         __runq_insert(ops, svc);
+        update_sched_time(ops, now, svc);
+    }
 
     /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
     list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
@@ -601,8 +705,8 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
     BUG_ON( sdom == NULL );
 
     lock = vcpu_schedule_lock_irq(vc);
-    if ( __vcpu_on_q(svc) )
-        __q_remove(svc);
+    __q_remove(svc);
+    __t_remove(svc);
     vcpu_schedule_unlock_irq(lock, vc);
 
     if ( !is_idle_vcpu(vc) )
@@ -763,6 +867,8 @@ __repl_update(const struct scheduler *ops, s_time_t now)
         /* reinsert the vcpu if its deadline is updated */
         __q_remove(svc);
         __runq_insert(ops, svc);
+        /* Update next needed sched time if deadline is updated */
+        update_sched_time(ops, now, svc);
     }
 
     list_for_each_safe(iter, tmp, depletedq)
@@ -773,11 +879,102 @@ __repl_update(const struct scheduler *ops, s_time_t now)
             rt_update_deadline(now, svc);
             __q_remove(svc); /* remove from depleted queue */
             __runq_insert(ops, svc); /* add to runq */
+            /* Update next needed sched time since deadline is updated */
+            update_sched_time(ops, now, svc);
         }
     }
 }
 
 /*
+ * Pick a cpu where to run a vcpu,
+ * possibly kicking out the vcpu running there
+ * Called by wake() and context_saved()
+ * We have a running candidate here, the kick logic is:
+ * Among all the cpus that are within the cpu affinity
+ * 1) if the new->cpu is idle, kick it. This could benefit cache hit
+ * 2) if there are any idle vcpu, kick it.
+ * 3) now all pcpus are busy;
+ *    among all the running vcpus, pick lowest priority one
+ *    if snext has higher priority, kick it.
+ *
+ * TODO:
+ * 1) what if these two vcpus belongs to the same domain?
+ *    replace a vcpu belonging to the same domain introduces more overhead
+ *
+ * lock is grabbed before calling this function
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
+    struct rt_vcpu *iter_svc;
+    struct vcpu *iter_vc;
+    int cpu = 0, cpu_to_tickle = 0;
+    cpumask_t not_tickled;
+    cpumask_t *online;
+
+    if ( new == NULL || is_idle_vcpu(new->vcpu) )
+        return;
+
+    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
+    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
+    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
+
+    /* 1) if new's previous cpu is idle, kick it for cache benefit */
+    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
+    {
+        cpu_to_tickle = new->vcpu->processor;
+        goto out;
+    }
+
+    /* 2) if there are any idle pcpu, kick it */
+    /* The same loop also find the one with lowest priority */
+    for_each_cpu(cpu, &not_tickled)
+    {
+        iter_vc = curr_on_cpu(cpu);
+        if ( is_idle_vcpu(iter_vc) )
+        {
+            cpu_to_tickle = cpu;
+            goto out;
+        }
+        iter_svc = rt_vcpu(iter_vc);
+        if ( latest_deadline_vcpu == NULL ||
+             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
+            latest_deadline_vcpu = iter_svc;
+    }
+
+    /* 3) candicate has higher priority, kick out lowest priority vcpu */
+    if ( latest_deadline_vcpu != NULL &&
+         new->cur_deadline < latest_deadline_vcpu->cur_deadline )
+    {
+        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
+        goto out;
+    }
+
+    /* didn't tickle any cpu */
+    SCHED_STAT_CRANK(tickle_idlers_none);
+    return;
+out:
+    /* TRACE */
+    {
+        struct {
+            unsigned cpu:16, pad:16;
+        } d;
+        d.cpu = cpu_to_tickle;
+        d.pad = 0;
+        trace_var(TRC_RTDS_TICKLE, 0,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
+    SCHED_STAT_CRANK(tickle_idlers_some);
+    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
+    return;
+}
+
+/*
  * schedule function for rt scheduler.
  * The lock is already grabbed in schedule.c, no need to lock here
  */
@@ -790,11 +987,9 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     struct rt_vcpu *snext = NULL;
     struct task_slice ret = { .migrated = 0 };
 
-    /* clear ticked bit now that we've been scheduled */
-    cpumask_clear_cpu(cpu, &prv->tickled);
-
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
+    update_sched_time(ops, now, scurr);
 
     __repl_update(ops, now);
 
@@ -828,6 +1023,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         if ( snext != scurr )
         {
             __q_remove(snext);
+            __t_remove(snext);
             set_bit(__RTDS_scheduled, &snext->flags);
         }
         if ( snext->vcpu->processor != cpu )
@@ -837,7 +1033,15 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         }
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    /* Check if we were tickled and clear bit if so. If not, tickle now since */
+    /* timer must have fired */
+    if ( cpumask_test_cpu(cpu, &prv->tickled) )
+        cpumask_clear_cpu(cpu, &prv->tickled);
+    else
+	    runq_tickle(ops, snext);
+
+    /* Use minimum from timerq */
+    ret.time = __timerq_next(ops, now);
     ret.task = snext->vcpu;
 
     /* TRACE */
@@ -879,95 +1083,8 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
         __q_remove(svc);
     else if ( test_bit(__RTDS_delayed_runq_add, &svc->flags) )
         clear_bit(__RTDS_delayed_runq_add, &svc->flags);
-}
-
-/*
- * Pick a cpu where to run a vcpu,
- * possibly kicking out the vcpu running there
- * Called by wake() and context_saved()
- * We have a running candidate here, the kick logic is:
- * Among all the cpus that are within the cpu affinity
- * 1) if the new->cpu is idle, kick it. This could benefit cache hit
- * 2) if there are any idle vcpu, kick it.
- * 3) now all pcpus are busy;
- *    among all the running vcpus, pick lowest priority one
- *    if snext has higher priority, kick it.
- *
- * TODO:
- * 1) what if these two vcpus belongs to the same domain?
- *    replace a vcpu belonging to the same domain introduces more overhead
- *
- * lock is grabbed before calling this function
- */
-static void
-runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
-{
-    struct rt_private *prv = rt_priv(ops);
-    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
-    struct rt_vcpu *iter_svc;
-    struct vcpu *iter_vc;
-    int cpu = 0, cpu_to_tickle = 0;
-    cpumask_t not_tickled;
-    cpumask_t *online;
-
-    if ( new == NULL || is_idle_vcpu(new->vcpu) )
-        return;
-
-    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
-    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
-    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
 
-    /* 1) if new's previous cpu is idle, kick it for cache benefit */
-    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
-    {
-        cpu_to_tickle = new->vcpu->processor;
-        goto out;
-    }
-
-    /* 2) if there are any idle pcpu, kick it */
-    /* The same loop also find the one with lowest priority */
-    for_each_cpu(cpu, &not_tickled)
-    {
-        iter_vc = curr_on_cpu(cpu);
-        if ( is_idle_vcpu(iter_vc) )
-        {
-            cpu_to_tickle = cpu;
-            goto out;
-        }
-        iter_svc = rt_vcpu(iter_vc);
-        if ( latest_deadline_vcpu == NULL ||
-             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
-            latest_deadline_vcpu = iter_svc;
-    }
-
-    /* 3) candicate has higher priority, kick out lowest priority vcpu */
-    if ( latest_deadline_vcpu != NULL &&
-         new->cur_deadline < latest_deadline_vcpu->cur_deadline )
-    {
-        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
-        goto out;
-    }
-
-    /* didn't tickle any cpu */
-    SCHED_STAT_CRANK(tickle_idlers_none);
-    return;
-out:
-    /* TRACE */
-    {
-        struct {
-            unsigned cpu:16, pad:16;
-        } d;
-        d.cpu = cpu_to_tickle;
-        d.pad = 0;
-        trace_var(TRC_RTDS_TICKLE, 0,
-                  sizeof(d),
-                  (unsigned char *)&d);
-    }
-
-    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
-    SCHED_STAT_CRANK(tickle_idlers_some);
-    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
-    return;
+    __t_remove(svc);
 }
 
 /*
@@ -988,6 +1105,8 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
 
     BUG_ON( is_idle_vcpu(vc) );
 
+    update_sched_time(ops, now, svc);
+
     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
     {
         SCHED_STAT_CRANK(vcpu_wake_running);
@@ -1024,6 +1143,9 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
 
     __repl_update(ops, now);
 
+    /* Update timerq */
+    update_sched_time(ops, now, svc);
+
     ASSERT(!list_empty(&prv->sdom));
     sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
     online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
@@ -1056,8 +1178,11 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
     if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
          likely(vcpu_runnable(vc)) )
     {
+        s_time_t now = NOW();
+
         __runq_insert(ops, svc);
-        __repl_update(ops, NOW());
+        __repl_update(ops, now);
+        update_sched_time(ops, now, svc);
 
         ASSERT(!list_empty(&prv->sdom));
         sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-- 
1.7.9.5


* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
@ 2015-06-09 12:53 ` Dario Faggioli
  2015-06-10  4:18   ` Dagaen Golomb
  2015-06-10  5:54   ` Meng Xu
  0 siblings, 2 replies; 21+ messages in thread
From: Dario Faggioli @ 2015-06-09 12:53 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: wei.liu2, xisisu, george.dunlap, xen-devel, mengxu, jbeulich, lichong659



Hello Dagaen,

Thanks for doing this. The first question I have is whether you run any
test/benchmark that you can show the result of here.

For instance, a few weeks ago, Meng (while trying to do a completely
different thing) sent here on the list some numbers about the frequency
of the scheduler being called, and the overhead caused by that... Would
it be hard to run the same experiments and collect the numbers, with and
without your patch?

On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> To do this, we create a new list that holds, for each
> vcpu, the time least into the future that it may need to be
> rescheduled.
>
Ok. Actually, what I really expected to see in this patch was, either:
 a) a new, scheduler wide, list of replenishment times _and_ a timer,
    always (re)programmed to fire at the most imminent one; or
 b) one timer for each vcpu, always (re)programmed to fire at the time
    instant when the replenishment for that vcpu should happen.

And note that, when I say "timer", I mean an actual Xen timer, i.e.,
those things that are started, stopped, and with a timer handling
routine being called when they expire. For an example, you can have a
look, in sched_credit.c, at 'ticker' or 'master_ticker'.
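
To make this a bit more concrete, a minimal sketch of option a) could look
like the snippet below. Note that this is only an illustration, not code from
the tree: repl_timer (a new field in struct rt_private), repl_timer_handler()
and do_replenishments_and_get_next() are all made-up names for things that
would have to be written.

#include <xen/timer.h>

/* Made-up helper: does what __repl_update() does today (replenish what is
 * due, tickle pcpus if needed) and returns the next replenishment instant. */
static s_time_t do_replenishments_and_get_next(const struct scheduler *ops,
                                               s_time_t now);

/* Handler of a single, scheduler-wide replenishment timer
 * (struct timer repl_timer, added to struct rt_private). */
static void repl_timer_handler(void *data)
{
    const struct scheduler *ops = data;
    struct rt_private *prv = rt_priv(ops);
    unsigned long flags;
    s_time_t next_repl;

    spin_lock_irqsave(&prv->lock, flags);

    next_repl = do_replenishments_and_get_next(ops, NOW());

    /* Reprogram the timer so it fires at the most imminent replenishment. */
    set_timer(&prv->repl_timer, next_repl);

    spin_unlock_irqrestore(&prv->lock, flags);
}

/* In rt_init(), after the queues are set up:
 *     init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, 0);
 * and kill_timer(&prv->repl_timer) in rt_deinit(). */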

Deciding between a) or b) isn't easy, without actually trying to
implement them. I personally prefer b), and I think it would be a
superior long term solution (see right below), but given the current
architecture of the RTDS scheduler (e.g., the fact that it has a global
runq, etc), I won't nack a), which would most likely be simpler.

Performance and overhead wise, again, hard to tell in advance. b) would
introduce more overhead (as there are more timers), but will likely
prove more scalable (which is not something of great importance for
this scheduler) and may allow for better latency (which _is_ something
of great importance for this scheduler :-) ), since there won't be the
need to lock the whole scheduler during the replenishments (which is,
OTOH, necessary with a)).

And that's it for the replenishments. For enforcing the budget, well, we
have the ret.time value of the task_slice struct returned by
rt_schedule, so that's event driven already, and we don't need to do
much about it.
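
Just for reference, here is (trimmed) what rt_schedule() already hands back,
as defined in xen/include/xen/sched-if.h; budget enforcement boils down to
what we put in the .time field:

struct task_slice {
    struct vcpu *task;      /* the vcpu to run until the next event        */
    s_time_t     time;      /* delay before the scheduler must run again   */
    bool_t       migrated;
};

/* Sketch, at the end of rt_schedule():
 *
 *     ret.task = snext->vcpu;
 *     ret.time = snext->cur_budget;    (i.e., the remaining budget)
 */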

Does all this make sense? Am I making myself clear enough?
If not, feel free to ask.

>  The scheduler chooses the lowest time off of this
> list and waits until the specified time instead of running every
> 1 ms as it did before.
> 
Yeah, well, I see what you mean and how this is actually succeeding
(at least AFAICT, by looking at the code, again, it would be nice to
have some numbers) in improving the scheduler behavior.

However, this transition toward event driven-ness has two goals:
 * improve the scheduler behavior [check, at least to some extent]
 * improve the code (readability, maintainability, etc.)
   [not check at all :-(]

As an example, the __repl_update() function: it's called 2 times inside
rt_schedule() and a third from rt_context_saved(), which is basically
like saying it's called 3 times from rt_schedule(), and this always made
very little sense to me. In fact, I think that scheduling decisions and
replenishment events shouldn't be so tightly coupled. There are
interdependencies, of course (a replenishment could call for a
scheduling decision to be taken), but not like this. That's why I say
I'd like to see a timer handling routine dealing with replenishment, and
let rt_schedule() do its job, which is:
 - enforcing the budget of the running vcpu. If it expires,
   _(re)program_ the replenishment timer
 - choose the best vcpu to run next, if necessary
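
Just to make the shape of this concrete, here is a very rough sketch (glossing
over idle vcpus, tasklets and migration, and with runq_pick_next() being a
made-up name that stands in for the existing __runq_pick() logic):

/* Made-up helper, standing in for the current __runq_pick() logic. */
static struct rt_vcpu *runq_pick_next(const struct scheduler *ops, s_time_t now);

static struct task_slice
rt_schedule(const struct scheduler *ops, s_time_t now,
            bool_t tasklet_work_scheduled)
{
    struct rt_vcpu *const scurr = rt_vcpu(current);
    struct rt_vcpu *snext;
    struct task_slice ret = { .migrated = 0 };

    /* 1) Enforce the budget of the vcpu that was running. */
    burn_budget(ops, scurr, now);

    /* 2) Pick the best (earliest deadline) runnable vcpu. No
     *    __repl_update() in here: replenishments happen in the timer
     *    handling routine, which tickles us when a rescheduling is due. */
    snext = runq_pick_next(ops, now);

    /* 3) Let it run until its budget is exhausted, or until we get
     *    tickled again. */
    ret.task = snext->vcpu;
    ret.time = snext->cur_budget;

    return ret;
}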

With this patch, you're still using rt_schedule() to do both scheduling
and replenishments, although you're now doing it at actual replenishment
times, instead of checking for them every millisecond.

Also, the bits and pieces that you need to add in order to deal with
this new queue is, really, making things even more complex than they are
now, which is undesirable.

So, all this being said, let me know what you think about it (and that
applies to everyone else as well, of course, in particular Meng).

I won't comment on the code in too much details, as it will require some
quite radical restructuring, but I'll skim through it and provide some
hints anyway.

> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> ---
>  xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 222 insertions(+), 97 deletions(-)
> 
First of all, it's a big patch. It's only changing one file and one
logical component, and for that reason, TBH, it's not that hard to
review. Still I think you can break it in at least two, and perhaps even
more, if you try to implement what I described above.

> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 7c39a9e..25f0458 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c

> @@ -156,6 +160,7 @@ struct rt_vcpu {
>      s_time_t cur_budget;        /* current budget */
>      s_time_t last_start;        /* last start time */
>      s_time_t cur_deadline;      /* current deadline for EDF */
> +    s_time_t next_sched_needed; /* next time to make scheduling decision */
>
As an example of why I said that things should become simpler, and are
instead becoming more complex: with my solution, you don't need anything
like this. In fact, budget enforcement is already being done ok in
rt_schedule(), so no need to do anything more/different about it.
Replenishments should be programmed to fire at cur_deadline, so again,
no need to add this new field, and multiplex its actual value between
budget and deadline (which, yes, counts as being something rather
complex for me! :-D).

> @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
>          list_del_init(&svc->q_elem);
>  }
>  
> +static inline void __t_remove(struct rt_vcpu *svc)
> +{
> +	if( !list_empty(&svc->t_elem) )
> +		list_del_init(&svc->t_elem);
> +}
> +
>
You're using hard tabs for indenting here. As you see everywhere else,
Xen wants 4 spaces for this.

>  /*
>   * Insert svc with budget in RunQ according to EDF:
>   * vcpus with smaller deadlines go first.
> @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  }
>  
>  /*
> + * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
> + */
> +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *timerq = rt_timerq(ops);
> +    struct list_head *iter = timerq;
> +
> +    ASSERT( spin_is_locked(&prv->lock) );
> +
> +    ASSERT( list_empty(&svc->t_elem) );
> +
The blank line between the asserts, I'd kill it.

> +	list_for_each(iter, timerq)
> +	{
>
You're still using tabs, and mixing them with spaces, making things look
even more cumbersome.

> +/*
> + * Return how far the lowest time on the timerq (that is after NOW) is in the
> + * future.
> + * Lock must be grabbed before calling this.
> + */
> +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> +{
> +    struct list_head *timerq = rt_timerq(ops);
> +    struct list_head *iter, *tmp;
> +
> +    list_for_each_safe(iter, tmp, timerq)
> +    {
> +        struct rt_vcpu * iter_svc = __t_elem(iter);
> +
> +        if ( iter_svc->next_sched_needed > now )
> +            return (iter_svc->next_sched_needed - now);
> +        else
> +            __t_remove(iter_svc);
> +    }
> +
> +    return MAX_SCHEDULE;
> +}
>
If using a timer, you can just get rid of items while, in the timer
routine, you handle the event associated to them.

Also, why is MAX_SCHEDULE still there?

> +/*
> + * Updates the next_sched_needed field for a vcpu, which is used for
> + * determining the next needed schedule time for this vcpu. Then updates
> + * timerq via reinsert.
> + */
> +static void update_sched_time(const struct scheduler *ops, s_time_t now, 
> +                              struct rt_vcpu *svc)
> +{
> +    /* update next needed schedule time */
> +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> +        svc->next_sched_needed = now + svc->cur_budget;
> +    else
> +        svc->next_sched_needed = svc->cur_deadline;
> +
> +    /* Remove and reinsert to maintain sorted order in timerq */
> +    __t_remove(svc);
> +    __timerq_insert(ops, svc);
> +
> +    return;
> +}
>
And here's the multiplexing, which --even worse-- (may) require
rearranging the queue! We really don't need anything like this.

>  /*
> + * Pick a cpu where to run a vcpu,
> + * possibly kicking out the vcpu running there
> + * Called by wake() and context_saved()
> + * We have a running candidate here, the kick logic is:
> + * Among all the cpus that are within the cpu affinity
> + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> + * 2) if there are any idle vcpu, kick it.
> + * 3) now all pcpus are busy;
> + *    among all the running vcpus, pick lowest priority one
> + *    if snext has higher priority, kick it.
> + *
> + * TODO:
> + * 1) what if these two vcpus belongs to the same domain?
> + *    replace a vcpu belonging to the same domain introduces more overhead
> + *
> + * lock is grabbed before calling this function
> + */
> +static void
> +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> +{
> +    struct rt_private *prv = rt_priv(ops);
> +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> +    struct rt_vcpu *iter_svc;
> +    struct vcpu *iter_vc;
> +    int cpu = 0, cpu_to_tickle = 0;
> +    cpumask_t not_tickled;
> +    cpumask_t *online;
> +
>
[snip]

And here you are moving a function up. In general, it is better to have
separate patches that do the moving, with the fact that they are all
about moving and that the code is not being changed, clearly stated in
the commit message. This is because it is really really hard to figure
out, e.g. from here, whether you also changed something in the function
while moving it, making reviewing a lot harder and more prone to error.

That being said, in this specific case, you're moving up runq_tickle(),
the purpose of which is to trigger a call to rt_schedule() (after going
through the softirq machinery)... in order to call it in rt_schedule().
That's pretty tricky, and is not how tickling should work.

Again, with an approach that really take advantage of timers, this won't
be necessary.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-09 12:53 ` Dario Faggioli
@ 2015-06-10  4:18   ` Dagaen Golomb
  2015-06-10 22:30     ` Dario Faggioli
  2015-06-10  5:54   ` Meng Xu
  1 sibling, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-10  4:18 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li



Thanks for the feedback.


> Thanks for doing this. The first question I have is whether you run any
> test/benchmark that you can show the result of here.
>
> For instance, a few weeks ago, Meng (while trying to do a completely
> different thing) sent here on the list some numbers about the frequency
> of the scheduler being called, and the overhead caused by that... Would
> it be hard to run the same experiments and collect the numbers, with and
> without your patch?
>

I could run some experiments to measure this. An easier quick test may be
to count invocations while running the same workload on each, making sure
each runs for a long time to remove biases.


> On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled.
> >
> Ok. Actually, what I really expected to see in this patch was, either:
>  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>     always (re)programmed to fire at the most imminent one; or
>  b) one timer for each vcpu, always (re)programmed to fire at the time
>     instant when the replenishment for that vcpu should happen.
>

a) is essentially what we have going on here. The list is exactly that: a
list of possible scheduling-change times (including replenishments, since they
are coalesced with the period), one per vcpu, and the timer is armed using
the earliest one.


> And note that, when I say "timer", I mean an actual Xen timer, i.e.,
> those things that are started, stopped, and with a timer handling
> routine being called when they expire. For an example, you can have a
> look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>

I will look at this. However, the current solution is likely functionally
equivalent and, with only one timer and a single list used to arm it, I'm
not sure if using a Xen timer is necessary. Do they incur any extra
overhead or coarsen granularity?


>
> Deciding between a) or b) isn't easy, without actually trying to
> implement them. I personally prefer b), and I think it would be a
> superior long term solution (see right below), but given te current
> architecture of the RTDS scheduler (e.g., the fact that it has a global
> runq, etc), I won't nack a), which would most likely be simpler.
>

I agree b) may well be better in the long run, but with the current
architecture a) is simpler. b) can be future work as the scheduler evolves.


> Performance and overhead wise, again, hard to tell in advance. b) would
> introduce more overhead (as there are more timers), but will likely
> prove more scalable (which is not something of great importance for
> this scheduler) and may allow for better latency (which _is_ something
> of great importance for this scheduler :-) ), since there won't be the
> need to lock the whole scheduler during the replenishments (which is,
> OTOH, necessary with a)).
>
> And that's it for the replenishments. For enforcing the budget, well, we
> have the ret.time value of the task_slice struct returned by
> rt_schedule, so that's event driven already, and we don't need to do
> much about it.
>

The timerq will also hold the end of the period if that is the next relevant
event, and if that entry is at the head of the timerq its value will be used
to arm the timer, letting the task run until budget exhaustion.


> Does all this make sense? Am I making myself clear enough?
> If not, feel free to ask.
>
> >  The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> >
> Yeah, well, I see what you mean and how this is actually succeeding
> (at least AFAICT, by looking at the code, again, it would be nice to
> have some numbers) in improving the scheduler behavior.
>
> However, this transition toward event driven-ness has two goals:
>  * improve the scheduler behavior [check, at least to some extent]
>  * improve the code (readability, maintainability, etc.)
>    [not check at all :-(]
>
> As an example, the __repl_update() function: it's called 2 times inside
> rt_schedule() and a third from rt_context_saved(), which is basically
> like saying it's called 3 times from rt_schedule(), and this always made
> very little sense to me. In fact, I think that scheduling decisions and
> replenishment events shouldn't be so tightly coupled. There are
> interdependencies, of course (a replenishment could call for a
> scheduling decision to be taken), but not like this. That's why I say
> I'd like to see a timer handling routine dealing with replenishment, and
> let rt_schedule() do its job, which is:
>  - enforcing the budget of the running vcpu. If it expires,
>    _(re)program_ the replenishment timer
>  - choose the best vcpu to run next, if necessary
>
> With this patch, you're still using rt_schedule() to do both scheduling
> and replenishments, although you're now doing it at actual replenishment
> times, instead of checking for them every millisecond.
>
>
I think once you understand that the timerq holds not only replenishments, but
any event that could cause a change in the scheduler behavior, it becomes
more palatable to have them intertwined. Really, the timerq and scheduling
aren't as intertwined as they appear; they are practically isolated
functionally. It's just that the timerq is most naturally serviced during
scheduling events, when the data that affects scheduling decisions is changing.
It's straightforward and efficient. Let me know your thoughts on this.


> Also, the bits and pieces that you need to add in order to deal with
> this new queue is, really, making things even more complex than they are
> now, which is undesirable.
>

While it does add some complexity, I don't feel that a single queue, and the
code to manage it, adds that much.


> So, all this being said, let me know what you think about it (and that
> applies to everyone else as well, of course, in particular Meng).
>
> I won't comment on the code in too much details, as it will require some
> quite radical restructuring, but I'll skim through it and provide some
> hints anyway.
>
> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---
> >  xen/common/sched_rt.c |  319
> ++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 222 insertions(+), 97 deletions(-)
> >
> First of all, it's a big patch. It's only changing one file and one
> logical component, and for that reason, TBH, it's not that hard to
> review. Still I think you can break it in at least two, and perhaps even
> more, if you try to implement what I described above.
>

Noted.
Note that it's not as large as it seems: a large chunk of the deletions, and
an equal number of insertions, come from moving a function so that it is
visible to other functions.


> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 7c39a9e..25f0458 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
>
> > @@ -156,6 +160,7 @@ struct rt_vcpu {
> >      s_time_t cur_budget;        /* current budget */
> >      s_time_t last_start;        /* last start time */
> >      s_time_t cur_deadline;      /* current deadline for EDF */
> > +    s_time_t next_sched_needed; /* next time to make scheduling
> decision */
> >
> As an example of why I said that things should become simpler, and are
> instead becoming more complex: with my solution, you don't need anything
> like this. In fact, budget enforcement is already being done ok in
> rt_schedule(), so no need to do anything more/different about it.
> Replenishments should be programmed to fire at cur_deadline, so again,
> no need to add this new field, and multiplex its actual value between
> budget and deadline (which, yes, counts as being something rather
> complex for me! :-D).
>

As mentioned, the timerq handles all events that could change the
scheduling decision, not just replenishments. It tracks the earliest time at
which anything could cause this vcpu to be scheduled differently, and could be
extended if any more insights are found. Budget enforcement is being done in
rt_schedule, but it is driven by this very list: a budget expiration is
one possible event that would require a vcpu reschedule.


>
> > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
> >          list_del_init(&svc->q_elem);
> >  }
> >
> > +static inline void __t_remove(struct rt_vcpu *svc)
> > +{
> > +     if( !list_empty(&svc->t_elem) )
> > +             list_del_init(&svc->t_elem);
> > +}
> > +
> >
> You're using hard tabs for indenting here. As you see everywhere else,
> Xen wants 4 spaces for this.
>

Sorry, I thought I had gotten them all changed to spaces. My editor wasn't
configured correctly at first so some are lurking around. I will
exterminate the remaining ones.


> >  /*
> >   * Insert svc with budget in RunQ according to EDF:
> >   * vcpus with smaller deadlines go first.
> > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct
> rt_vcpu *svc)
> >  }
> >
> >  /*
> > + * Insert svc into the timerq, maintaining ascending order by
> next_sched_needed.
> > + */
> > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu
> *svc)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter = timerq;
> > +
> > +    ASSERT( spin_is_locked(&prv->lock) );
> > +
> > +    ASSERT( list_empty(&svc->t_elem) );
> > +
> The blank line between the asserts, I'd kill it.
>

Can do.


> > +     list_for_each(iter, timerq)
> > +     {
> >
> You're still using tabs, and mixing them with spaces, making things look
> even more cumbersome.
>

Again, sorry. Will exterminate.


>
> > +/*
> > + * Return how far the lowest time on the timerq (that is after NOW) is
> in the
> > + * future.
> > + * Lock must be grabbed before calling this.
> > + */
> > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> > +{
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter, *tmp;
> > +
> > +    list_for_each_safe(iter, tmp, timerq)
> > +    {
> > +        struct rt_vcpu * iter_svc = __t_elem(iter);
> > +
> > +        if ( iter_svc->next_sched_needed > now )
> > +            return (iter_svc->next_sched_needed - now);
> > +        else
> > +            __t_remove(iter_svc);
> > +    }
> > +
> > +    return MAX_SCHEDULE;
> > +}
> >
> If using a timer, you can just get rid of items while, in the timer
> routine, you handle the event associated to them.
>
> Also, why is MAX_SCHEDULE still there?
>

Honestly, events do not have to be removed here, but it's done to prevent
walking a list of stale values to get to the newest one on the next call.
This is done more for performance. Their non-removal would be functionally
equivalent.
With the timer alternative you mention, where would the timer events and
their data be held? I think that could be extra complexity, unless I fail
to understand the idea completely.

The current approach is actually somewhat simple if you think about it,
requiring just one piece of information per vcpu (the next_sched_needed
field), which is updated as information becomes available. It also uses a
guaranteed O(1) increase in memory per vcpu, given how the queue and
next_sched_needed are set up, and most of the computation is rolled into
existing routines that need to run anyway.

MAX_SCHEDULE may not be required, but I have it there as a "just in case."
For example, it will cause the scheduler to be called after MAX_SCHEDULE even
when no events are occurring (such as when there are no vcpus). It's there to
ensure progress in case of any bugs or unexpected behavior.


> > +/*
> > + * Updates the next_sched_needed field for a vcpu, which is used for
> > + * determining the next needed schedule time for this vcpu. Then updates
> > + * timerq via reinsert.
> > + */
> > +static void update_sched_time(const struct scheduler *ops, s_time_t now,
> > +                              struct rt_vcpu *svc)
> > +{
> > +    /* update next needed schedule time */
> > +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> > +        svc->next_sched_needed = now + svc->cur_budget;
> > +    else
> > +        svc->next_sched_needed = svc->cur_deadline;
> > +
> > +    /* Remove and reinsert to maintain sorted order in timerq */
> > +    __t_remove(svc);
> > +    __timerq_insert(ops, svc);
> > +
> > +    return;
> > +}
> >
> And here's the multiplexing, which --even worse-- (may) require
> rearranging the queue! We really don't need anything like this.
>

I see what you mean here, but this doesn't seem bad to me. Right now it's
only these two events and a simple if-else, but what is great is that this
method can be updated to include any number of tricks or insights that
could help determine when something may need to be scheduled or not. Right now
it seems complex for what appears to be just two cases, but many more cases
or an entirely different accounting method could be placed here and it
would be fine. It makes for a nice "ground zero" for modifying the timer
update behavior.


> >  /*
> > + * Pick a cpu where to run a vcpu,
> > + * possibly kicking out the vcpu running there
> > + * Called by wake() and context_saved()
> > + * We have a running candidate here, the kick logic is:
> > + * Among all the cpus that are within the cpu affinity
> > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> > + * 2) if there are any idle vcpu, kick it.
> > + * 3) now all pcpus are busy;
> > + *    among all the running vcpus, pick lowest priority one
> > + *    if snext has higher priority, kick it.
> > + *
> > + * TODO:
> > + * 1) what if these two vcpus belongs to the same domain?
> > + *    replace a vcpu belonging to the same domain introduces more
> overhead
> > + *
> > + * lock is grabbed before calling this function
> > + */
> > +static void
> > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> > +    struct rt_vcpu *iter_svc;
> > +    struct vcpu *iter_vc;
> > +    int cpu = 0, cpu_to_tickle = 0;
> > +    cpumask_t not_tickled;
> > +    cpumask_t *online;
> > +
> >
> [snip]
>
> And here you are moving a function up. In general, it is better to have
> separate patches that do the moving, with the fact that they are all
> about moving and that the code is not being changed, clearly stated in
> the commit message. This is because it is really really hard to figure
> out, e.g. from here, whether you also changed something in the function
> while moving it, making reviewing a lot harder and more prone to error.
>

Ah, thanks for mentioning. The move was required but I didn't know a
separate patch was customary. I can make it so.


> That being said, in this specific case, you're moving up runq_tickle(),
> the purpose of which is to trigger a call to rt_schedule() (after going
> through the softirq machinery)... in order to call it in rt_schedule().
> That's pretty tricky, and is not how tickling should work.
>

It is set up to tickle only when needed. I'm not sure whether this is an issue.


> Again, with an approach that really take advantage of timers, this won't
> be necessary.
>

I think there could be some discussion on the approaches; hopefully others
comment as well. As mentioned, this is actually a logically quite simple
change, with good efficiency, and the "ground zero" makes changes that use lots
of information easy, although right now it was kept simple to get the change
out. Future improvements to the update function were somewhat expected as
future work.

Thanks for the comments.
~Dagaen



* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-09 12:53 ` Dario Faggioli
  2015-06-10  4:18   ` Dagaen Golomb
@ 2015-06-10  5:54   ` Meng Xu
  2015-06-10 17:46     ` Dario Faggioli
  1 sibling, 1 reply; 21+ messages in thread
From: Meng Xu @ 2015-06-10  5:54 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario and Dagaen,

2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>
> Hello Dagaen,
>
> Thanks for doing this. The first question I have is whether you run any
> test/benchmark that you can show the result of here.

Thank you very much Dario for your quick review!

>
> For instance, a few weeks ago, Meng (while trying to do a completely
> different thing) sent here on the list some numbers about the frequency
> of the scheduler being called, and the overhead caused by that... Would
> it be hard to run the same experiments and collect the numbers, with and
> without your patch?

I think there are two ways to evaluate the performance:
1) Using xentrace to trace the frequency of the scheduler being called,
and the overhead it causes, when we run workloads inside guest domains;
2) Using cyclictest, as Dario mentioned before, to test the real-time
performance seen by the end user. Dagaen, I can provide you the commands to
run it; it is actually quite simple to run.

>
> On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled.
> >
> Ok. Actually, what I really expected to see in this patch was, either:
>  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>     always (re)programmed to fire at the most imminent one; or
>  b) one timer for each vcpu, always (re)programmed to fire at the time
>     instant when the replenishment for that vcpu should happen.

Ah...

Here is what we are trying to do:
reuse the existing timer that is used by rt_schedule(), i.e., the timer
that invokes the schedule function (rt_schedule()). So that timer has
two purposes:
(a) budget enforcement, as it was;
(b) budget replenishment (and deadline enforcement, since deadline = period
for each VCPU).

When the timer fires, rt_schedule() is called: it enforces the
budget, replenishes the budget of the VCPU(s), and picks the (next) VCPU.

In other words, we can think about the scheduler in another way:
the scheduling decision happens at the following two time points:
(a) budget enforcement, as it was;
(b) budget replenishment and deadline enforcement.

Whenever either of the above two situations occurs, the scheduler may
need to pick the next VCPU to run or preempt the currently running VCPU.
Therefore, the logic for making a scheduling decision is unavoidable when
either of these two situations occurs, IMO.

In terms of performance/overhead, I think the option b) you pointed
out has the benefit of low overhead when updating the budget, because we
don't need to hold the lock. However, budget replenishment occurs when
the deadline of the VCPU changes (which means the next period of the
VCPU arrives). That means the scheduler (may) need to make a new
scheduling decision, in which case it will hold the lock for the
runq. So I'm curious about how much overhead option b) could actually
save over the current implementation/design Dagaen is doing.

Of course, it is hard to tell the performance difference until we
really run it. IMHO, we should at least do some mental exercise and
have good reasons to believe the performance could improve under some
situations.

>
> And note that, when I say "timer", I mean an actual Xen timer, i.e.,
> those things that are started, stopped, and with a timer handling
> routine being called when they expire. For an example, you can have a
> look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>
> Deciding between a) or b) isn't easy, without actually trying to
> implement them. I personally prefer b), and I think it would be a
> superior long term solution (see right below), but given the current
> architecture of the RTDS scheduler (e.g., the fact that it has a global
> runq, etc), I won't nack a), which would most likely be simpler.
>
> Performance and overhead wise, again, hard to tell in advance. b) would
> introduce more overhead (as there are more timers), but will likely
> prove more scalable (which is not something of great importance for
> this scheduler) and may allow for better latency (which _is_ something
> of great importance for this scheduler :-) ), since there won't be the
> need to lock the whole scheduler during the replenishments (which is,
> OTOH, necessary with a)).

Right. This is true. But when we are replenishing the budget, we are
also changing the deadline, which affects the VCPUs' priority and thus
affects the scheduling decision.

>
> And that's it for the replenishments. For enforcing the budget, well, we
> have the ret.time value of the task_slice struct returned by
> rt_schedule, so that's event driven already, and we don't need to do
> much about it.
>
> Does all this make sense? Am I making myself clear enough?
> If not, feel free to ask.

Thank you very much for your detailed explanation. :-)
I am just a little confused about how much overhead option b)
could actually save.

>
> >  The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> >
> > Yeah, well, I see what you mean and how this is actually succeeding
> (at least AFAICT, by looking at the code, again, it would be nice to
> have some numbers) in improving the scheduler behavior.
>
> However, this transition toward event driven-ness has two goals:
>  * improve the scheduler behavior [check, at least to some extent]
>  * improve the code (readability, maintainability, etc.)
>    [not check at all :-(]

I see. We did consider the readability and maintainability factor in
the design, but I think we just neglected the standards that "define" the
rules of readability and maintainability. Is there some documentation
that we could follow?

>
> As an example, the __repl_update() function: it's called 2 times inside
> rt_schedule() and a third from rt_context_saved(), which is basically
> like saying it's called 3 times from rt_schedule(), and this always made
> very little sense to me.

I see. This part should be improved. This is not good for sure.

> In fact, I think that scheduling decisions and
> replenishment events shouldn't be so tightly coupled. There are
> interdependencies, of course (a replenishment could call for a
> scheduling decision to be taken), but not like this.

I see. I think this is the root reason why we had one kind of design
and you had another kind of design. :-)

I see how you think these interdependencies should be handled (via Xen
timers per VCPU in option b), but I didn't quite get the
reasons/principles why you think the current design is not good at
handling such interdependencies. :-(

> That's why I say
> I'd like to see a timer handling routine dealing with replenishment, and
> let rt_schedule() do it's job, which is:
>  - enforcing the budget of the running vcpu. If it expires,
>    _(re)program_ the replenishment timer
>  - choose the best vcpu to run next, if necessary
>
> With this patch, you're still using rt_schedule() to do both scheduling
> and replenishments, although you're now doing it at actual replenishment
> times, instead of checking for them every millisecond.
>
> Also, the bits and pieces that you need to add in order to deal with
> this new queue is, really, making things even more complex than they are
> now, which is undesirable.
>
> So, all this being said, let me know what you think about it (and that
> applies to everyone else as well, of course, in particular Meng).

I think I didn't quite get the underlying principles/reasons why the
current design is not good to handle the interdependencies between
budget replenishment and scheduling decisions.

I can understand the strength of option b). I think I can have better
understanding of option b) if I could understand the underlining
principles/reasons/rationale why the current design is not good. (I'm
not saying/arguing the current design is good. I just tried to
understand the weakness of it so that we could know how to handle it
best.)

>
> I won't comment on the code in too much details, as it will require some
> quite radical restructuring, but I'll skim through it and provide some
> hints anyway.

Sure.

>
> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---
> >  xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 222 insertions(+), 97 deletions(-)
> >
> First of all, it's a big patch. It's only changing one file and one
> logical component, and for that reason, TBH, it's not that hard to
> review. Still I think you can break it in at least two, and perhaps even
> more, if you try to implement what I described above.
>
> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 7c39a9e..25f0458 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
>
> > @@ -156,6 +160,7 @@ struct rt_vcpu {
> >      s_time_t cur_budget;        /* current budget */
> >      s_time_t last_start;        /* last start time */
> >      s_time_t cur_deadline;      /* current deadline for EDF */
> > +    s_time_t next_sched_needed; /* next time to make scheduling decision */
> >
> As an example of why I said that things should become simpler, and are
> instead becoming more complex: with my solution, you don't need anything
> like this. In fact, budget enforcement is already being done ok in
> rt_schedule(), so no need to do anything more/different about it.
> Replenishments should be programmed to fire at cur_deadline, so again,
> no need to add this new field, and multiplex its actual value between
> budget and deadline (which, yes, counts as being something rather
> complex for me! :-D).
>
> > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
> >          list_del_init(&svc->q_elem);
> >  }
> >
> > +static inline void __t_remove(struct rt_vcpu *svc)
> > +{
> > +     if( !list_empty(&svc->t_elem) )
> > +             list_del_init(&svc->t_elem);
> > +}
> > +
> >
> > You're using hard tabs for indenting here. As you see everywhere else,
> Xen wants 4 spaces for this.
>
> >  /*
> >   * Insert svc with budget in RunQ according to EDF:
> >   * vcpus with smaller deadlines go first.
> > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> >  }
> >
> >  /*
> > + * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
> > + */
> > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter = timerq;
> > +
> > +    ASSERT( spin_is_locked(&prv->lock) );
> > +
> > +    ASSERT( list_empty(&svc->t_elem) );
> > +
> The blank line between the asserts, I'd kill it.
>
> > +     list_for_each(iter, timerq)
> > +     {
> >
> You're still using tabs, and mixing them with spaces, making things look
> even more cumbersome.
>
> > +/*
> > + * Return how far the lowest time on the timerq (that is after NOW) is in the
> > + * future.
> > + * Lock must be grabbed before calling this.
> > + */
> > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> > +{
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter, *tmp;
> > +
> > +    list_for_each_safe(iter, tmp, timerq)
> > +    {
> > +        struct rt_vcpu * iter_svc = __t_elem(iter);
> > +
> > +        if ( iter_svc->next_sched_needed > now )
> > +            return (iter_svc->next_sched_needed - now);
> > +        else
> > +            __t_remove(iter_svc);
> > +    }
> > +
> > +    return MAX_SCHEDULE;
> > +}
> >
> If using a timer, you can just get rid of items while, in the timer
> routine, you handle the event associated to them.
>
> Also, why is MAX_SCHEDULE still there?

Is it ok to use a BUG_ON() here, if that point will never be reached?
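
I.e., something along these lines, at the bottom of __timerq_next(),
instead of the MAX_SCHEDULE fallback (just to illustrate what I mean;
whether BUG_ON(), BUG() or an ASSERT is the most appropriate form is a
detail):

    /* Falling off the end of the loop means the timerq is now empty;
     * if that really can never happen, make it fatal when it does. */
    BUG_ON( list_empty(timerq) );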

>
> > +/*
> > + * Updates the next_sched_needed field for a vcpu, which is used for
> > + * determining the next needed schedule time for this vcpu. Then updates
> > + * timerq via reinsert.
> > + */
> > +static void update_sched_time(const struct scheduler *ops, s_time_t now,
> > +                              struct rt_vcpu *svc)
> > +{
> > +    /* update next needed schedule time */
> > +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> > +        svc->next_sched_needed = now + svc->cur_budget;
> > +    else
> > +        svc->next_sched_needed = svc->cur_deadline;
> > +
> > +    /* Remove and reinsert to maintain sorted order in timerq */
> > +    __t_remove(svc);
> > +    __timerq_insert(ops, svc);
> > +
> > +    return;
> > +}
> >
> And here's the multiplexing, which --even worse-- (may) require
> rearranging the queue! We really don't need anything like this.
>
> >  /*
> > + * Pick a cpu where to run a vcpu,
> > + * possibly kicking out the vcpu running there
> > + * Called by wake() and context_saved()
> > + * We have a running candidate here, the kick logic is:
> > + * Among all the cpus that are within the cpu affinity
> > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> > + * 2) if there are any idle vcpu, kick it.
> > + * 3) now all pcpus are busy;
> > + *    among all the running vcpus, pick lowest priority one
> > + *    if snext has higher priority, kick it.
> > + *
> > + * TODO:
> > + * 1) what if these two vcpus belongs to the same domain?
> > + *    replace a vcpu belonging to the same domain introduces more overhead
> > + *
> > + * lock is grabbed before calling this function
> > + */
> > +static void
> > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> > +    struct rt_vcpu *iter_svc;
> > +    struct vcpu *iter_vc;
> > +    int cpu = 0, cpu_to_tickle = 0;
> > +    cpumask_t not_tickled;
> > +    cpumask_t *online;
> > +
> >
> [snip]
>
> And here you are moving a function up. In general, it is better to have
> separate patches that do the moving, with the fact that they are all
> about moving and that the code is not being changed, clearly stated in
> the commit message. This is because it is really really hard to figure
> out, e.g. from here, whether you also changed something in the function
> while moving it, making reviewing a lot harder and more prone to error.
>
> That being said, in this specific case, you're moving up runq_tickle(),
> the purpose of which is to trigger a call to rt_schedule() (after going
> through the softirq machinery)... in order to call it in rt_schedule().
> That's pretty tricky, and is not how tickling should work.
>
> Again, with an approach that really take advantage of timers, this won't
> be necessary.

The reason why we need to call runq_tickle() here is to pick the "best"
CPU to run the next VCPU.
Consider the following situation:
The timer may be invoked on a CPU i, which has a medium-priority VCPU
running. However, we may have another CPU j, which has the lowest
priority VCPU running. The next VCPU should be placed onto CPU j to
run, instead of onto CPU i, based on the global EDF scheduling policy.
Because we do not control where the timer fires when we arm it (in the
current design), the timer may be invoked on CPU i, and that's why we
need runq_tickle() here.

But this is a (minor) issue, since we need to settle on the design first. :-)

Thank you again for your helpful comments!

Best regards,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10  5:54   ` Meng Xu
@ 2015-06-10 17:46     ` Dario Faggioli
  2015-06-11  5:50       ` Meng Xu
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-10 17:46 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb



On Tue, 2015-06-09 at 22:54 -0700, Meng Xu wrote:
> Hi Dario and Dagaen,
> 
Hi,

> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:

> > Thanks for doing this. The first question I have is whether you run any
> > test/benchmark that you can show the result of here.
> 
> Thank you very much Dario for your quick review!
> 
It wasn't as quick as I wanted it to be. I'll improve on that. :-)

> > For instance, a few weeks ago, Meng (while trying to do a completely
> > different thing) sent here on the list some numbers about the frequency
> > of the scheduler being called, and the overhead caused by that... Would
> > it be hard to run the same experiments and collect the numbers, with and
> > without your patch?
> 
> I think there should be two ways to evaluate the performance:
> 1) Using xentrace to trace the frequency of the scheduler being called
> and the overhead, when we run workload inside guest domains;
>
Yes, that counts more as a way of testing whether what is done in here
actually works, as we expect the scheduler to be invoked a lot less, and
that would tell us whether or not it is the case.

So, please, provide these numbers (at least the number/frequency of
invocation, overhead measurements can come later) as soon as practical.

> 2) Using cyclictest as Dario mentioned before to test the real-time
> performance at end user. Dagaen, I can provide you the commands to run
> it, which is actually quite simple to run.
> 
It is indeed, and that's more similar to a proper performance evaluation
of an aspect which is rather critical for this scheduler, and really
important for people wanting to use it.

So, yes, seeing some results would be great, independently from the
specific work done in this patch.

> > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > > To do this, we create a new list that holds, for each
> > > vcpu, the time least into the future that it may need to be
> > > rescheduled.
> > >
> > Ok. Actually, what I really expected to see in this patch was, either:
> >  a) a new, scheduler wide, list of replenishment times _and_ a timer,
> >     always (re)programmed to fire at the most imminent one; or
> >  b) one timer for each vcpu, always (re)programmed to fire at the time
> >     instant when the replenishment for that vcpu should happen.
> 
> Ah...
> 
> Here what we are trying to do is:
> Reuse the existing timer that is used by rt_schedule().
>
Yeah, and what I can't figure out is why you decided to do so. The
reason I don't like it is that things become a lot (more) complex to
understand, maintain and modify.

>  We use  that
> timer as the timer to invoke the schedule function (rt_schedule). So
> that timer has two purposes:
> (a) budget enforcement as it was;
> (b) budget replenish (and deadline enforcement since deadline = period
> for each VCPU).
> 
Exactly, the timer has two purposes. It actually has 3, as you're saying
yourself ("budget replenish (*and* deadline enforcement"). It should
have exactly one. :-)

> When the timer fires, rt_schedule() is called: enforce the
> budget, replenish the budget of VCPU(s), and pick the (next) VCPU.
> 
Right. OTOH, I think that all it should do is pick the next vcpu, if
necessary. In fact, the purpose of all the work that Dagaen is doing is to
dramatically reduce the calls to rt_schedule(), when that is not
necessary. Well, replenishing the budget of all running vcpus around is
not the core scheduling function's business, especially considering that
we're holding a global spinlock.

> In other words, we can think about the scheduler in another way:
> The scheduling decision happens at the following two time points:
> (a) budget enforcement as it was;
> (b) budget replenish and deadline enforcement;
> 
The scheduler is all that's in sched_rt.c, point here is how things
should be organized.

> Whenever any of the above two situations occurs, the scheduler may
> need to pick the next VCPU to run or preempt a current running VCPU.
> Therefore, the logic for scheduling decision is unavoidable when
> either of these two situation occurs, IMO.
> 
A budget replenishment for a vcpu with a big period, in a busy system,
may well end up not requiring any immediate rescheduling of any of the
lower-period vcpus, may it not?

So, with your approach, it is like this, when replenishment time for
vcpu X arrives:

  timer interrupt
    TIMER_SOFTIRQ raised
      process softirqs
        SCHEDULE_SOFTIRQ raised
          process softirqs
            rt_schedule()
              [spin_lock]
              burn_budget(scurr)
              __repl_update()  (goes through all runnable vcpus!!)
              snext = __runq_pick(): snext == scurr
              runq_tickle(snext)...WAIT, WHAT?!?! :-O

I mean, and I'm noticing this now, if the replenishments done during a
particular call to rt_schedule() are not enough to change the situation
on that particular pcpu, and hence the task which was running (and that
you are deliberately disturbing with _a_full_execution_ of the
scheduler's core function!) should continue to do so, you're calling
runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
newly replenished vcpu!

Am I missing or misreading something? Let's assume not, and see what
happens in this case...

Looking at runq_tickle(), if there are idle pcpus, you'll probably
tickle one of them, which will likely pick up the newly replenished
vcpu, like this (as a followup of the 'callchain' above):

  process softirqs
    SCHEDULE_SOFTIRQ
      rt_schedule()
        [spin_lock]
        __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
        snext = runq_pick(): snext == vcpu X
        [spin_unlock]

If there are no idle pcpus, you probably won't tickle anyone, which is
also fine.

So, yay, it works!! Is this abuse of runq_tickle() there by design? :-D

Jokes apart, if the above analysis is accurate, I think this is a good
enough example of what I meant when saying to Dagaen "this is making
things too complex".

> In terms of performance/overhead, I think the option b) you pointed
> out has the benefit of low overhead in updating the budget because we
> don't need to hold the lock. However, the budget replenish occurs when
> deadline of the VCPU is changed (which means the next period of the
> VCPU arrives). Then it means the scheduler (may) need to make a new
> scheduling decision, in which situation, the scheduler will hold the
> lock for the runq. So I'm curious about the source of overhead the
> option b) could save over the current implementation/design Dagaen is
> doing.
> 
Leave my b) option alone, for now. With a) done as I'm suggesting, for
one, you'll be removing __repl_update() from rt_schedule(), which means
no longer going through the list of all runnable vcpus with the global
scheduler spin lock held, which really is something we should aim at for
this scheduler, sooner rather than later.

Here's how I envision things to go. Again, I'm talking about sticking
with option a), so no per-vcpu timers, just 1 timer and a global queue,
which now is a replenishment queue:

  timer interrupt
    TIMER_SOFTIRQ raised
      process softirqs
        replenishment_timer_handler()
          [spin_lock]
            <for_each_replenishment_event(repl_time < NOW()) {
               replenish(vcpu)
               runq_tickle(vcpu)
             }>
          [spin_lock]

Then, on the tickled pcpus (if any):

  process softirqs
    SCHEDULE_SOFTIRQ
      rt_schedule()
        [spin_lock]
        snext = runq_pick(): snext == vcpu X
        [spin_unlock]

And get rid of __repl_update(), which makes my eyes bleed every time I
open sched_rt.c. :-)

Oh, and note that we probably can use a new spin lock for the
replenishment queue, different from the existing global scheduler
spinlock, lowering contention and increasing scalability even further!
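
To make this a bit more concrete, the replenishment handler could look
something like the sketch below. It is only meant to show the structure:
the replenishment queue (replq, kept sorted by cur_deadline), its
accessor replq_elem(), the re-queueing helper replq_reinsert() and the
repl_timer field in rt_private are all made-up names, while the budget
refill + deadline update helper (rt_update_deadline(), or however we end
up calling it), __q_remove()/__runq_insert() and runq_tickle() are the
pieces we already have:

    static void repl_handler(void *data)
    {
        const struct scheduler *ops = data;
        struct rt_private *prv = rt_priv(ops);
        struct list_head *replq = &prv->replq;
        struct list_head *iter, *tmp;
        s_time_t now = NOW();
        unsigned long flags;

        spin_lock_irqsave(&prv->lock, flags);

        list_for_each_safe(iter, tmp, replq)
        {
            struct rt_vcpu *svc = replq_elem(iter);

            /* The queue is kept sorted: stop at the first future event. */
            if ( svc->cur_deadline > now )
                break;

            /* Replenishment: refill the budget and move the deadline on. */
            rt_update_deadline(now, svc);

            /* The deadline changed, so re-sort the vcpu in the runq... */
            if ( !list_empty(&svc->q_elem) )
            {
                __q_remove(svc);
                __runq_insert(ops, svc);
            }
            /* ...and poke the pcpu that may now have to reschedule. */
            runq_tickle(ops, svc);

            /* Queue the next replenishment event for this vcpu. */
            replq_reinsert(ops, svc);
        }

        /* Reprogram the timer for the next replenishment event, if any. */
        if ( !list_empty(replq) )
            set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);

        spin_unlock_irqrestore(&prv->lock, flags);
    }

With something like this in place, rt_schedule() is left with budget
enforcement and runq_pick() only.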

> > >  The scheduler chooses the lowest time off of this
> > > list and waits until the specified time instead of running every
> > > 1 ms as it did before.
> > >
> > Yeah, well, I see what you mean and how this is actually succeeding
> > (at least AFAICT, by looking at the code, again, it would be nice to
> > have some numbers) in improving the scheduler behavior.
> >
> > However, this transition toward event driven-ness has two goals:
> >  * improve the scheduler behavior [check, at least to some extent]
> >  * improve the code (readability, maintainability, etc.)
> >    [not check at all :-(]
> 
> I see. We did consider the readability and maintainability factor in
> the design but I think we just neglect the standards that "define" the
> rules of readability and maintainability. Is there some documentation
> that we could follow?
> 
There is not, and probably never will be... and that's fine, because
that is not something you write down in a document.

It's trial and error, following suit, and making and looking at examples
(like the analysis I made above).

> I see how you think these interdependencies should be handled (via Xen
> timers per VCPU in option b), but I didn't quite get the
> reason/principles why you think the current design is not good to
> handle such interdependencies. :-(
> 
Right. I hope this is more clear now.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10  4:18   ` Dagaen Golomb
@ 2015-06-10 22:30     ` Dario Faggioli
  2015-06-13 20:33       ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-10 22:30 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li



No HTML, please.

On Wed, 2015-06-10 at 00:18 -0400, Dagaen Golomb wrote:


>         And note that, when I say "timer", I mean an actual Xen timer,
>         i.e.,
>         those things that are started, stopped, and with a timer
>         handling
>         routine being called when they expire. For an example, you can
>         have a
>         look, in sched_credit.c, at 'ticker' or 'master_ticker'.
> 
> 
> I will look at this. However, the current solution is likely
> functionally equivalent and, with only one timer and a single list
> used to arm it, I'm not sure if using a Xen timer is necessary. 
>
It is functionally equivalent, by chance (see the issue about calling
runq_tickle() on current vcpus in my reply to Meng). The reason why a
different approach is required is making the code look better, reducing
(rather than increasing) the complexity of sched_rt.c, lowering the
overhead caused by long running operations performed while holding the
global scheduler spin lock, and improving scalability.

> Do they incur any extra overhead or coarsen granularity?
>
"extra overhead or coarsen granularity" as compared to what? s_timer in
schedule.c (which is what you're using) is one of them already!

What I meant with "an actual timer" is that you should add a new one, and
move some of the stuff currently being done in rt_schedule() in its
handling routine, rather than just adding a new queue of events to be
serviced by abusing s_timer.
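
Concretely, something along these lines is what I have in mind (just a
sketch; repl_timer and repl_handler are made-up names, and where exactly
the timer gets initialized, rt_init() or when the first pcpu is added to
the scheduler, is one of the details to be figured out):

    /* in struct rt_private */
    struct timer repl_timer;    /* replenishment timer */

    /* at initialization time, e.g., somewhere in rt_init() */
    init_timer(&prv->repl_timer, repl_handler, (void *)ops, 0 /* cpu */);

    /* (re)armed to fire at the most imminent replenishment event, e.g. */
    set_timer(&prv->repl_timer, svc->cur_deadline);

The handling routine would then be something like the repl_handler()
sketch in my reply to Meng, i.e., the place where replenishments and
ticklings happen, leaving rt_schedule() with budget enforcement and
picking the next vcpu only.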

>         Deciding between a) or b) isn't easy, without actually trying
>         to
>         implement them. I personally prefer b), and I think it would
>         be a
>         superior long term solution (see right below), but given the
>         current
>         architecture of the RTDS scheduler (e.g., the fact that it has
>         a global
>         runq, etc), I won't nack a), which would most likely be
>         simpler.
> 
> 
I agree b) may well be better in the long run, but with the current
> architecture a) is simpler. b) can be future work as the scheduler
> evolves.
>
Sure. And in fact, I'm fine with a), if done properly.
>         
>         With this patch, you're still using rt_schedule() to do both
>         scheduling
>         and replenishments, although you're now doing it at actual
>         replenishment
>         times, instead of checking for them every millisecond.

> 
> I think once you understand that the timerq is not only
> replenishments, but any event that could cause a branch in the
> scheduler behavior, it becomes more palatable to have them
> intertwined. 
>
I got that, and I'm asking you to do it differently.

> Really, the timerq and scheduling aren't as intertwined as they
> appear; they are practically isolated functionally. It's just that the
> timerq is most naturally serviced during scheduling events, when data
> that affects scheduling decisions is changing. It's straightforward and
> efficient.
>
Yeah, replenishments are 'naturally serviced' in a 'straightforward and
efficient' way by looping on all runnable vcpus, in more than one place,
from within super-hot paths, with the global scheduler spin lock held.
Neat! :-/

Oh, and that's before your introduction of another list to be taken care
of, still under the same conditions. :-O

>         Also, the bits and pieces that you need to add in order to
>         deal with
>         this new queue is, really, making things even more complex
>         than they are
>         now, which is undesirable.

> While it does add some complexity, I don't feel a single queue and
> managing it is that much extra complexity.
>
But in fact the point of making the scheduler 'event driven' is to take
advantage of the chances that this offers for getting stuff *out* of
rt_schedule(); the purpose is not "not adding that much extra
complexity", it is making it _simpler_.


>         > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>         > index 7c39a9e..25f0458 100644
>         > --- a/xen/common/sched_rt.c
>         > +++ b/xen/common/sched_rt.c
>         
>         > @@ -156,6 +160,7 @@ struct rt_vcpu {
>         >      s_time_t cur_budget;        /* current budget */
>         >      s_time_t last_start;        /* last start time */
>         >      s_time_t cur_deadline;      /* current deadline for EDF
>         */
>         > +    s_time_t next_sched_needed; /* next time to make
>         scheduling decision */
>         >
>         As an example of why I said that things should become simpler,
>         and are
>         instead becoming more complex: with my solution, you don't
>         need anything
>         like this. In fact, budget enforcement is already being done
>         ok in
>         rt_schedule(), so no need to do anything more/different about
>         it.
>         Replenishments should be programmed to fire at cur_deadline,
>         so again,
>         no need to add this new field, and multiplex its actual value
>         between
>         budget and deadline (which, yes, counts as being something
>         rather
>         complex for me! :-D).
> 
> 
> As mentioned, the timerq is handling all events that could change the
> scheduling decision, not just replenishments. 
>
Yes, exactly, it's handling *too many* events. :-)

For example, have a look at 'struct vcpu', in 
xen/include/xen/sched.h. It's got 3 timers in it:

    struct timer     periodic_timer;
    struct timer     singleshot_timer;
    struct timer     poll_timer;    /* timeout for SCHEDOP_poll */

It probably would have been possible to just use one, with a single and
mixed event queue, as you're doing, a 1k-line handling routine, etc.

Do you think it would have been easier or harder to implement, understand,
debug and maintain? I bet on harder.

> This tracks the earliest time anything could cause this to be scheduled
> differently, and could be extended if any more insights are found.
> Budget enforcement is being done in rt_schedule(), but it's being done by
> this very list: a budget expiration is one possible event that would
> require a vcpu reschedule.
>
OMG, 'extended'... You're thinking to actually put more stuff in
there?!? :-O

It's a rather key and already too long and complex critical section, so
please, let's aim at shortening and making it simpler, i.e., at
improving things, not making them worse.
>         
>         > +/*
>         > + * Return how far the lowest time on the timerq (that is
>         after NOW) is in the
>         > + * future.
>         > + * Lock must be grabbed before calling this.
>         > + */
>         > +static s_time_t __timerq_next(const struct scheduler *ops,
>         s_time_t now)
>         > +{
>         > +    struct list_head *timerq = rt_timerq(ops);
>         > +    struct list_head *iter, *tmp;
>         > +
>         > +    list_for_each_safe(iter, tmp, timerq)
>         > +    {
>         > +        struct rt_vcpu * iter_svc = __t_elem(iter);
>         > +
>         > +        if ( iter_svc->next_sched_needed > now )
>         > +            return (iter_svc->next_sched_needed - now);
>         > +        else
>         > +            __t_remove(iter_svc);
>         > +    }
>         > +
>         > +    return MAX_SCHEDULE;
>         > +}
>         >
>         If using a timer, you can just get rid of items while, in the
>         timer
>         routine, you handle the event associated to them.
>         
>         Also, why is MAX_SCHEDULE still there?
> 
> 
> Honestly, events do not have to be removed here, but it's done to
> prevent walking a list of stale values to get to the newest one on the
> next call. This is done more for performance. Their non-removal would
> be functionally equivalent.
>
Of course you should remove the stale entries, I certainly was not
arguing that the list should just grow indefinitely!

Point is, again, that this is another list walk occurring in an hot
path, with a big spin lock held. We want to get rid of such thing, not
adding more of it.

> With the timer alternative you mention, where would the timer events
> and their data be held? I think that could be extra complexity, unless
> I fail to understand the idea completely.
>
In an event queue like yours, of course. But you won't go through it
and/or bookkeep it while in hot paths, with the scheduler lock held.

See my email to Meng for more details on what I have in mind, or
feel free to ask for more details.

> MAX_SCHEDULE may not be required, but I have it there as a "just in case."
> For example, it will cause the scheduler to be called after
> MAX_SCHEDULE even when no events are occurring (such as no vcpus). It's
> there to ensure progress in case of any bugs or unexpected behavior.
>
Wait, so, all this work, and then you still want to call rt_schedule()
every millisecond, when there's nothing to do?!?! :-O
 
>         That being said, in this specific case, you're moving up
>         runq_tickle(),
>         the purpose of which is to trigger a call to rt_schedule()
>         (after going
>         through the softirq machinery)... in order to call it in
>         rt_schedule().
>         That's pretty tricky, and is not how tickling should work.

> It is set up to only tickle if needed. I'm not sure if this is an issue,
>
It's wrong, AFAICT. See my reply to Meng and, please, comment by
replying to it, if you've got anything to say about this, to make the
discussion easier to follow.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10 17:46     ` Dario Faggioli
@ 2015-06-11  5:50       ` Meng Xu
  2015-06-12  9:28         ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Meng Xu @ 2015-06-11  5:50 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario,

First I think I got most of the points you raised/explained! They are
very very clear and thanks for the detailed explanation of your idea!

I have some minor questions about your comments/advices.

>
>> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>
>> > Thanks for doing this. The first question I have is whether you run any
>> > test/benchmark that you can show the result of here.
>>
>> Thank you very much Dario for your quick review!
>>
> It wasn't as quick as I wanted it to be. I'll improve on that. :-)
>
>> > For instance, a few weeks ago, Meng (while trying to do a completely
>> > different thing) sent here on the list some numbers about the frequency
>> > of the scheduler being called, and the overhead caused by that... Would
>> > it be hard to run the same experiments and collect the numbers, with and
>> > without your patch?
>>
>> I think there should be two ways to evaluate the performance:
>> 1) Using xentrace to trace the frequency of the scheduler being called
>> and the overhead, when we run workload inside guest domains;
>>
> Yes, that counts more as a way of testing whether what is done in here
> actually works, as we expect the scheduler to be invoked a lot less, and
> that would tell us whether or not it is the case.
>
> So, please, provide these numbers (at least the number/frequency of
> invocation, overhead measurements can come later) as soon as practical.
>
>> 2) Using cyclictest as Dario mentioned before to test the real-time
>> performance at end user. Dagaen, I can provide you the commands to run
>> it, which is actually quite simple to run.
>>
> It is indeed, and that's more similar to a proper performance evaluation
> of an aspect which is rather critical for this scheduler, and really
> important for people wanting to use it.
>
> So, yes, seeing some results would be great, independently from the
> specific work done in this patch.

Right! I have some ideas about this test, but I don't want to mess up the
focus of this thread. :-)
I will raise this test again when we come to it.


>
>> > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
>> > > To do this, we create a new list that holds, for each
>> > > vcpu, the time least into the future that it may need to be
>> > > rescheduled.
>> > >
>> > Ok. Actually, what I really expected to see in this patch was, either:
>> >  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>> >     always (re)programmed to fire at the most imminent one; or
>> >  b) one timer for each vcpu, always (re)programmed to fire at the time
>> >     instant when the replenishment for that vcpu should happen.
>>
>> Ah...
>>
>> Here what we are trying to do is:
>> Reuse the existing timer that is used by rt_schedule().
>>
> Yeah, and what I can't figure out is why you decided to do so. The
> reason I don't like it is that things become a lot (more) complex to
> understand, maintain and modify.

Now I get why you think it is harder to maintain. Right. Reusing the
timer will just make rt_schedule() complex and make the hot path
longer.  If other schedulers are using this timer only for budget
enforcement, it won't be a good idea to use it for other purposes in
the RTDS scheduler, considering the design consistency of different
schedulers.

>
>>  We use  that
>> timer as the timer to invoke the schedule function (rt_schedule). So
>> that timer has two purposes:
>> (a) budget enforcement as it was;
>> (b) budget replenish (and deadline enforcement since deadline = period
>> for each VCPU).
>>
> Exactly, the timer has two purposes. It actually has 3, as you're saying
> yourself ("budget replenish (*and* deadline enforcement"). It should
> have exactly one. :-)
>
>> When the timer fires, rt_schedule() is called: enforce the
>> budget, replenish the budget of VCPU(s), and pick the (next) VCPU.
>>
> Right. OTOH, I think that all it should do is pick the next vcpu, if
> necessary. In fact, the purpose of all the work that Dagaen is doing is to
> dramatically reduce the calls to rt_schedule(), when that is not
> necessary. Well, replenishing the budget of all running vcpus around is
> not the core scheduling function's business, especially considering that
> we're holding a global spinlock.
>
>> In other words, we can think about the scheduler in another way:
>> The scheduling decision happens at the following two time points:
>> (a) budget enforcement as it was;
>> (b) budget replenish and deadline enforcement;
>>
> The scheduler is all that's in sched_rt.c, point here is how things
> should be organized.

Right. Got it.

>
>> Whenever any of the above two situations occurs, the scheduler may
>> need to pick the next VCPU to run or preempt a current running VCPU.
>> Therefore, the logic for scheduling decision is unavoidable when
>> either of these two situation occurs, IMO.
>>
> A budget replenishment for a vcpu with a big period, in a busy system,
> may well end up not requiring any immediate rescheduling of any of the
> lower-period vcpus, may it not?

Right. VCPUs with larger period should get replenished with lower frequency.

>
> So, with your approach, it is like this, when replenishment time for
> vcpu X arrives:
>
>   timer interrupt
>     TIMER_SOFTIRQ raised
>       process softirqs
>         SCHEDULE_SOFTIRQ raised
>           process softirqs
>             rt_schedule()
>               [spin_lock]
>               burn_budget(scurr)
>               __repl_update()  (goes through all runnable vcpus!!)
>               snext = __runq_pick(): snext == scurr

Ah, I neglected this situation. As you said below, if scurr has higher
priority than the top VCPU of runq, we are going to tickle the scurr,
which should not happen.

>               runq_tickle(snext)...WAIT, WHAT?!?! :-O
>
> I mean, and I'm noticing this now, if the replenishments done during a
> particular call to rt_schedule() are not enough to change the situation
> on that particular pcpu, and hence the task which was running (and that
> you are deliberately disturbing with _a_full_execution_ of the
> scheduler's core function!) should continue to do so, you're calling
> runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
> newly replenished vcpu!
>
> Am I missing or misreading something? Let's assume not, and see what
> happens in this case...

You are right! Although this issue (i.e., tickling on scurr instead of
the next high priority VCPU) can be "fixed" (dirtily), it can be
avoided with the design option a) you said.

>
> Looking at runq_tickle(), if there are idle pcpus, you'll probably
> tickle one of them, which will likely pick up the newly replenished
> vcpu, like this (as a followup of the 'callchain' above):
>
>   process softirqs
>     SCHEDULE_SOFTIRQ
>       rt_schedule()
>         [spin_lock]
>         __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
>         snext = runq_pick(): snext == vcpu X
>         [spin_unlock]
>
> If there are no idle pcpus, you probably won't tickle anyone, which is
> also fine.
>
> So, yay, it works!! Is this abuse of runq_tickle() there by design? :-D

Yes. It abuses rt_schedule(), actually. Some of the
operations (e.g., __repl_update()) are unnecessarily called twice,
which is a bad thing in the hot path. :-(

>
> Jokes apart, if the above analysis is accurate, I think this is a good
> enough example of what I meant when saying to Dagaen "this is making
> things too complex".
Yes. The flow chart you drew is very clear! Thanks!

@Dagaen, what do you think? Please comment on Dario's reply with your
opinion and raise any of your concerns.

>
>> In terms of performance/overhead, I think the option b) you pointed
>> out has the benefit of low overhead in updating the budget because we
>> don't need to hold the lock. However, the budget replenish occurs when
>> deadline of the VCPU is changed (which means the next period of the
>> VCPU arrives). Then it means the scheduler (may) need to make a new
>> scheduling decision, in which situation, the scheduler will hold the
>> lock for the runq. So I'm curious about the source of overhead the
>> option b) could save over the current implementation/design Dagaen is
>> doing.
>>
> Leave my b) option alone, for now. With a) done as I'm suggesting, for
> one, you'll be removing __repl_update() from rt_schedule(), which means
> no longer going through the list of all runnable vcpus with the global
> scheduler spin lock held, which really is something we should aim at for
> this scheduler, sooner rather than later.
>
> Here's how I envision things to go. Again, I'm talking about sticking
> with option a), so no per-vcpu timers, just 1 timer and a global queue,
> which now is a replenishment queue:
>
>   timer interrupt
>     TIMER_SOFTIRQ raised
>       process softirqs
>         replenishment_timer_handler()
>           [spin_lock]
>             <for_each_replenishment_event(repl_time < NOW()) {
>                replenish(vcpu)
>                runq_tickle(vcpu)
>              }>
>           [spin_lock]
>
> Then, on the tickled pcpus (if any):
>
>   process softirqs
>     SCHEDULE_SOFTIRQ
>       rt_schedule()
>         [spin_lock]
>         snext = runq_pick(): snext == vcpu X
>         [spin_unlock]
>
> And get rid of __repl_update(), which makes my eyes bleed every time I
> open sched_rt.c. :-)
>
> Oh, and note that we probably can use a new spin lock for the
> replenishment queue, different from the existing global scheduler
> spinlock, lowering contention and increasing scalability even further!

Here is the main question I have about your advice.
If we are introducing a new spin lock for the replenishment queue,
what is the relation between the new lock and the old lock of the
runq?
Because the deadline decides the priority of VCPUs and thus decides
the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
on the runq as well. As shown in the workflow in your reply:
     >                replenish(vcpu)
     >                runq_tickle(vcpu)
The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
rt_schedule() will pick snext by runq_pick(). Therefore, the replenished
VCPU needs to be re-sorted in the runq. So replenish(vcpu) will operate
on the runq.

Another question in my mind is: do we really need the replenish queue
to record the replenish time? Because the period = deadline in the
current implementation, the deadline of VCPUs in runq is actually the
replenish time. (We are reusing the runq here and I'm unsure if it is
good or not in terms of maintenance.)

>
>> > >  The scheduler chooses the lowest time off of this
>> > > list and waits until the specified time instead of running every
>> > > 1 ms as it did before.
>> > >
> >> > Yeah, well, I see what you mean and how this is actually succeeding
>> > (at least AFAICT, by looking at the code, again, it would be nice to
>> > have some numbers) in improving the scheduler behavior.
>> >
>> > However, this transition toward event driven-ness has two goals:
>> >  * improve the scheduler behavior [check, at least to some extent]
>> >  * improve the code (readability, maintainability, etc.)
>> >    [not check at all :-(]
>>
>> I see. We did consider the readability and maintainability factor in
>> the design but I think we just neglect the standards that "define" the
>> rules of readability and maintainability. Is there some documentation
>> that we could follow?
>>
> There is not, and probably never will be... and that's fine, because
> that is not something you write down in a document.
>
> It's trial and error, following suit, and making and looking at examples
> (like the analysis I made above).

I see. I like the analysis you made very much!  Thanks! :-)

>
>> I see how you think these interdependencies should be handled (via Xen
>> timers per VCPU in option b), but I didn't quite get the
>> reason/principles why you think the current design is not good to
>> handle such interdependencies. :-(
>>
> Right. I hope this is more clear now.

Yes. I think I get your idea now. The explanation is very clear and neat! :-)

Thank you very much and best regards,

Meng
-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-11  5:50       ` Meng Xu
@ 2015-06-12  9:28         ` Dario Faggioli
  2015-06-13 17:21           ` Meng Xu
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-12  9:28 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb



On Wed, 2015-06-10 at 22:50 -0700, Meng Xu wrote:
> Hi Dario,
> 
> First I think I got most of the points you raised/explained! They are
> very very clear and thanks for the detailed explanation of your idea!
> 
Glad you got it and (sort of :-) ) agree.

> >> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
> >
> >> 2) Using cyclictest as Dario mentioned before to test the real-time
> >> performance at end user. Dagaen, I can provide you the commands to run
> >> it, which is actually quite simple to run.

> > So, yes, seeing some results would be great, independently from the
> > specific work done in this patch.
> 
> Right! I have some ideas about this test, but I don't want to mess up the
> focus of this thread. :-)
> I will raise this test again when we come to it.
> 
Ok. Looking forward to see this happening.

> > Yeah, and what I can't figure out is why you decided to do so. The
> > reason I don't like it is that things become a lot (more) complex to
> > understand, maintain and modify.
> 
> Now I get why you think it is harder to maintain. Right. Reusing the
> timer will just make rt_schedule() complex and make the hot path
> longer.
>
Exactly.

> >               runq_tickle(snext)...WAIT, WHAT?!?! :-O
> >
> > I mean, and I'm noticing this now, if the replenishments done during a
> > particular call to rt_schedule() are not enough to change the situation
> > on that particular pcpu, and hence the task which was running (and that
> > you are deliberately disturbing with _a_full_execution_ of the
> > scheduler's core function!) should continue to do so, you're calling
> > runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
> > newly replenished vcpu!
> >
> > Am I missing or misreading something? Let's assume not, and see what
> > happens in this case...
> 
> You are right! Although this issue (i.e., tickling on scurr instead of
> the next high priority VCPU) can be "fixed" (dirtily), it can be
> avoided with the design option a) you said.
> 
Of course it can be fixed.. Pretty much everything can! Point is the
reason why it happened, and how to make these things not happen and/or
easier to figure out. That's (one of) the point(s) of keeping things
simple and self contained, even within a single component (like a
scheduler), instead of "let's do everything in this 10k lines
function!" :-P

Glad that you saw what I meant.

> > Jokes apart, if the above analysis is accurate, I think this is a good
> > enough example of what I meant when saying to Dagaen "this is making
> > things too complex".
> Yes. The flow chart you drew is very clear! Thanks!
> 
> @Dagaen, what do you think? Please comment on Dario's reply with your
> opinion and raise any of your concerns.
> 
Indeed.

> > Here's how I envision things to go. Again, I'm talking about sticking
> > with option a), so no per-vcpu timers, just 1 timer and a global queue,
> > which now is a replenishment queue:
> >
> >   timer interrupt
> >     TIMER_SOFTIRQ raised
> >       process softirqs
> >         replenishment_timer_handler()
> >           [spin_lock]
> >             <for_each_replenishment_event(repl_time < NOW()) {
> >                replenish(vcpu)
> >                runq_tickle(vcpu)
> >              }>
> >           [spin_lock]
> >
> > Then, on the tickled pcpus (if any):
> >
> >   process softirqs
> >     SCHEDULE_SOFTIRQ
> >       rt_schedule()
> >         [spin_lock]
> >         snext = runq_pick(): snext == vcpu X
> >         [spin_unlock]
> >
> > And get rid of __repl_update(), which makes my eyes bleed every time I
> > open sched_rt.c. :-)
> >
> > Oh, and note that we probably can use a new spin lock for the
> > replenishment queue, different from the existing global scheduler
> > spinlock, lowering contention and increasing scalability even further!
> 
> Here is the main question I have about your advice.
> If we are introducing a new spin lock for the replenishment queue,
> what is the relation between the new lock and the old lock of the
> runq?
>
That's hard to tell in advance, you'll know it completely only when
trying to implement it. But, yes, when more locks are involved, it's
important to figure out the relationships between them, or we risk
introducing deadlocks or, even if things are correct, failing to improve the
performance (or do even worse!!).

The idea is, since the two operations (scheduling/budget enforcement and
replenishments) are logically independent, and if they're implemented in
a way that stresses this independence, then it makes sense to try to use
different spin locks.

As soon as you have more than one spin lock, what you should pay the
most attention to is, if they need to 'nest' (one is acquired when the
other is already being held), that has to happen consistently, or
deadlock will occur! :-/
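
Just to illustrate what "consistently" means here (purely a sketch, with
repl_lock being a made-up name for the hypothetical replenishment queue
lock):

    /*
     * Fine: every path that needs both locks takes them in the same
     * order, e.g., always the global scheduler lock first, then the
     * replenishment queue lock.
     */
    spin_lock_irqsave(&prv->lock, flags);
    spin_lock(&prv->repl_lock);

    /* ... touch both the runq and the replenishment queue ... */

    spin_unlock(&prv->repl_lock);
    spin_unlock_irqrestore(&prv->lock, flags);

    /*
     * Deadlock-prone: if some other path takes repl_lock first and then
     * prv->lock, two cpus can each grab one of the two locks and then
     * wait forever for the other one.
     */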

> Because the deadline decides the priority of VCPUs and thus decides
> the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
> on the runq as well. As shown in the workflow in your reply:
>      >                replenish(vcpu)
>      >                runq_tickle(vcpu)
> The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
> > rt_schedule() will pick snext by runq_pick(). Therefore, the replenished
> > VCPU needs to be re-sorted in the runq. So replenish(vcpu) will operate
> on the runq.
> 
Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly
must be done with the runqueue lock (which is the global scheduler lock,
at the moment) held.

Reading again what I wrote, I realize that I probably expressed myself
really badly, when hinting at that "let's use another lock" thing.

What I really wanted to say is that, if there will be a replenishment
queue, i.e., an event queue on which replenishment events are queued,
and that the timer handling routine scans, there may be some locking
required for consistently dealing with the queue itself. If that will be
the case, we probably can use _another_ spin lock, and let the timer
handling routine acquire the runqueue lock *only* when/if it gets to do
the insertions and the ticklings.
Sorry for this... I added that paragraph quickly, right before hitting
'send'... I probably should not have done so! :-P

Again, this may or may not be necessary, and may or may not be possible,
depending on whether the two locks would nest nicely everywhere they're
required. This also depends on how we'll deal with the replenishment
timer.

So, the bottom line of all this is: get down to it, and we'll see how it
will be better put together! :-D

> Another question in my mind is: do we really need the replenish queue
> to record the replenish time? Because the period = deadline in the
> current implementation, the deadline of VCPUs in runq is actually the
> > replenish time. (We are reusing the runq here and I'm unsure if it is
> > good or not in terms of maintenance.)
> 
Again, let's see. The timer will be always programmed to fire at the
most imminent replenishment event, so it seems to me that you need
something that will tell you, when servicing replenishment X, what's the
next time instant you want the timer to fire, to perform the next
replenishment.

Actually, the depleted queue you have now can well become the
replenishment queue (it will have to be kept sorted, though, I think).
Whether you want to keep it as the tail of the actual runqueue, or split
the two of them, it does not matter much, IMO.
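
And keeping it sorted is really the same pattern as __runq_insert(),
just keyed on the replenishment time which, as you say, is cur_deadline.
Something like this (again a sketch; rt_replq(), replq_elem() and the
replq_elem list head in rt_vcpu are made-up names):

    static void replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
    {
        struct list_head *replq = rt_replq(ops);
        struct list_head *iter;

        ASSERT( list_empty(&svc->replq_elem) );

        list_for_each(iter, replq)
        {
            struct rt_vcpu *iter_svc = replq_elem(iter);

            if ( svc->cur_deadline <= iter_svc->cur_deadline )
                break;
        }

        /* list_add_tail() on 'iter' inserts right in front of it, or at
         * the end of the queue if the loop ran to completion. */
        list_add_tail(&svc->replq_elem, iter);
    }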

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-12  9:28         ` Dario Faggioli
@ 2015-06-13 17:21           ` Meng Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-13 17:21 UTC (permalink / raw)
  To: Dario Faggioli, Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li

Hi Dario,

I think I got what you meant 100% now. :-)
I will ping Dagaen to see what he thinks and if he has any concerns.

>
> > > Here's how I envision things to go. Again, I'm talking about sticking
> > > with option a), so no per-vcpu timers, just 1 timer and a global queue,
> > > which now is a replenishment queue:
> > >
> > >   timer interrupt
> > >     TIMER_SOFTIRQ raised
> > >       process softirqs
> > >         replenishment_timer_handler()
> > >           [spin_lock]
> > >             <for_each_replenishment_event(repl_time < NOW()) {
> > >                replenish(vcpu)
> > >                runq_tickle(vcpu)
> > >              }>
> > >           [spin_lock]
> > >
> > > Then, on the tickled pcpus (if any):
> > >
> > >   process softirqs
> > >     SCHEDULE_SOFTIRQ
> > >       rt_schedule()
> > >         [spin_lock]
> > >         snext = runq_pick(): snext == vcpu X
> > >         [spin_unlock]
> > >
> > > And get rid of __repl_update(), which makes my eyes bleed every time I
> > > open sched_rt.c. :-)
> > >
> > > Oh, and note that we probably can use a new spin lock for the
> > > replenishment queue, different from the existing global scheduler
> > > spinlock, lowering contention and increasing scalability even further!
> >
> > Here is the main question I have about your advice.
> > If we are introducing a new spin lock for the replenishment queue,
> > what is the relation between the new lock and the old lock of the
> > runq?
> >
> That's hard to tell in advance, you'll know it completely only when
> trying to implement it. But, yes, when more locks are involved, it's
> important to figure out the relationships between them, or we risk
> introducing deadlocks or, even if things are correct, failing to improve the
> performance (or do even worse!!).
>
> The idea is, since the two operations (scheduling/budget enforcement and
> replenishments) are logically independent, and if they're implemented in
> a way that stresses this independence, then it makes sense to try to use
> different spin locks.
>
> As soon as you have more than one spin lock, what you should pay the
> most attention to is, if they need to 'nest' (one is acquired when the
> other is already being held), that has to happen consistently, or
> deadlock will occur! :-/
>
> > Because the deadline decides the priority of VCPUs and thus decides
> > the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
> > on the runq as well. As shown in the workflow in your reply:
> >      >                replenish(vcpu)
> >      >                runq_tickle(vcpu)
> > The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
> > rt_schedule() will pick snext by runq_pick(). Therefore, the replenished
> > VCPU needs to be re-sorted in the runq. So replenish(vcpu) will operate
> > on the runq.
> >
> Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly
> must be done with the runqueue lock (which is the global scheduler lock,
> at the moment) held.
>
> Reading again what I wrote, I realize that I probably expressed myself
> really badly, when hinting at that "let's use another lock" thing.
>
> What I really wanted to say is that, if there will be a replenishment
> queue, i.e., an event queue on which replenishment events are queued,
> and that the timer handling routine scans, there may be some locking
> required for consistently dealing with the queue itself. If that will be
> the case, we probably can use _another_ spin lock, and let the timer
> handling routine acquire the runqueue lock *only* when/if it gets to do
> the insertions and the ticklings.
> Sorry for this... I added that paragraph quickly, right before hitting
> 'send'... I probably should not have done so! :-P
>
> Again, this may or may not be necessary, and may or may not be possible,
> depending on whether the two locks would nest nicely everywhere they're
> required. This also depends on how we'll deal with the replenishment
> timer.
>
> So, the bottom line of all this is: get down to it, and we'll see how it
> will be better put together! :-D

I see. Got it. :-)

>
> > Another question in my mind is: do we really need the replenish queue
> > to record the replenish time? Because the period = deadline in the
> > current implementation, the deadline of VCPUs in runq is actually the
> > replenish time. (We are reusing the runq here and I'm unsure if it is
> > good or not in terms of maintenance.)
> >
> Again, let's see. The timer will be always programmed to fire at the
> most imminent replenishment event, so it seems to me that you need
> something that will tell you, when servicing replenishment X, what's the
> next time instant you want the timer to fire, to perform the next
> replenishment.
>
> Actually, the depleted queue you have now can well become the
> replenishment queue (it will have to be kept sorted, though, I think).
> Whether you want to keep it as the tail of the actual runqueue, or split
> the two of them, it does not matter much, IMO.

Right. The runq and the depleted queue actually have the information about
the next replenishment time for all VCPUs, once the depleted queue is
changed to be kept sorted.


Best regards,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10 22:30     ` Dario Faggioli
@ 2015-06-13 20:33       ` Dagaen Golomb
  2015-06-16 17:07         ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-13 20:33 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

> No HTML, please.

Got it, sorry.

>>         And note that, when I say "timer", I mean an actual Xen timer,
>>         i.e.,
>>         those things that are started, stopped, and with a timer
>>         handling
>>         routine being called when they expire. For an example, you can
>>         have a
>>         look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>>
>>
>> I will look at this. However, the current solution is likely
>> functionally equivalent and, with only one timer and a single list
>> used to arm it, I'm not sure if using a Xen timer is necessary.
>>
> It is functionally equivalent, by chance (see the issue about calling
> runq_tickle() on current vcpus in my reply to Meng). The reason why a
> different approach is required is making the code look better, reducing
> (rather than increasing) the complexity of sched_rt.c, lowering the
> overhead caused by long running operations performed while holding the
> global scheduler spin lock, and improving scalability.
>
>> Do they incur any extra overhead or coarsen granularity?
>>
> "extra overhead or coarsen granularity" as compared to what? s_timer in
> schedule.c (which is what you're using) is one of them already!
>
> What I meant with "an actual timer" is that you should add a new one, and
> move some of the stuff currently being done in rt_schedule() in its
> handling routine, rather than just adding a new queue of events to be
> serviced by abusing s_timer.

Okay, I was wondering what you meant by "an actual timer."

>
>>         Deciding between a) or b) isn't easy, without actually trying
>>         to
>>         implement them. I personally prefer b), and I think it would
>>         be a
>>         superior long term solution (see right below), but given the
>>         current
>>         architecture of the RTDS scheduler (e.g., the fact that it has
>>         a global
>>         runq, etc), I won't nack a), which would most likely be
>>         simpler.
>>
>>
>> I agree b) may well be better in the long run, but with the current
>> architecture a) is simpler. b) can be future work as the scheduler
>> evolves.
>>
> Sure. And in fact, I'm fine with a), if done properly.
>>
>>         With this patch, you're still using rt_schedule() to do both
>>         scheduling
>>         and replenishments, although you're now doing it at actual
>>         replenishment
>>         times, instead of checking for them every millisecond.
>
>>
>> I think once you understand that the timerq is not only
>> replenishments, but any event that could cause a branch in the
>> scheduler behavior, it becomes more palatable to have them
>> intertwined.
>>
> I got that, and I'm asking you to do it differently.
>
>> Really, the timerq and scheduling aren't as intertwined as they
>> appear; they are practically isolated functionally. It's just that the
>> timerq is most naturally serviced during scheduling events, when data
>> that affects scheduling decisions is changing. It's straightforward and
>> efficient.
>>
> Yeah, replenishments are 'naturally serviced' in a 'straightforward and
> efficient' way by looping on all runnable vcpus, in more than one place,
> from within super-hot paths, with the global scheduler spin lock held.
> Neat! :-/
>
> Oh, and that's before your introduction of another list to be taken care
> of, still under the same conditions. :-O

The paths are already hot, so adding a small amount of extra work is a small
percentage increase. I'm usually against this kind of thing, too, but separating
this into another timer, while it could run independently, may actually be more
work if it needs to share some of the same maintenance behavior. I guess the
main argument against it is maintainability.

>
>>         Also, the bits and pieces that you need to add in order to
>>         deal with
>>         this new queue is, really, making things even more complex
>>         than they are
>>         now, which is undesirable.
>
>> While it does add some complexity, I don't feel a single queue and
>> managing it is that much extra complexity.
>>
> But in fact the point of making the scheduler 'event driven' is to take
> advantage of the chances that this offers for getting stuff *out* of
> rt_schedule(), and the purpose is not "not adding that much extra
> complexity", it is making it _simpler_.

I understand where you are coming from now. I was looking at it mostly
from the perspective of performance. This explains our differences.

>
>
>>         > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>         > index 7c39a9e..25f0458 100644
>>         > --- a/xen/common/sched_rt.c
>>         > +++ b/xen/common/sched_rt.c
>>
>>         > @@ -156,6 +160,7 @@ struct rt_vcpu {
>>         >      s_time_t cur_budget;        /* current budget */
>>         >      s_time_t last_start;        /* last start time */
>>         >      s_time_t cur_deadline;      /* current deadline for EDF
>>         */
>>         > +    s_time_t next_sched_needed; /* next time to make
>>         scheduling decision */
>>         >
>>         As an example of why I said that things should become simpler,
>>         and are
>>         instead becoming more complex: with my solution, you don't
>>         need anything
>>         like this. In fact, budget enforcement is already being done
>>         ok in
>>         rt_schedule(), so no need to do anything more/different about
>>         it.
>>         Replenishments should be programmed to fire at cur_deadline,
>>         so again,
>>         no need to add this new field, and multiplex its actual value
>>         between
>>         budget and deadline (which, yes, counts as being something
>>         rather
>>         complex for me! :-D).
>>
>>
>> As mentioned, the timerq is handling all events that could change the
>> scheduling decision, not just replenishments.
>>
> Yes, exactly, it's handling *too many* events. :-)
>
> For example, have a look at 'struct vcpu', in
> xen/include/xen/sched.h. It's got 3 timers in it:
>
>     struct timer     periodic_timer;
>     struct timer     singleshot_timer;
>     struct timer     poll_timer;    /* timeout for SCHEDOP_poll */
>
> It probably would have been possible to just use one, with a single and
> mixed event queue, as you're doing, a 1k-line handling routine, etc.
>
> Do you think it would have been easier or harder to implement, understand,
> debug and maintain? I bet on harder.

Point taken.

>
>> This tracks the earliest time anything could cause this to be scheduled
>> differently, and could be extended if any more insights are found.
>> Budget enforcement is being done in rt_schedule but it's being done by
>> this very list: a budget expiration is one possible event that would
>> require a vcpu reschedule.
>>
> OMG, 'extended'... You're thinking to actually put more stuff in
> there?!? :-O
>
> It's a rather key and already too long and complex critical section, so
> please, let's aim at shortening and making it simpler, i.e., at
> improving things, not making them worse.

This can again be explained by our goal mismatch. I see where you are
coming from now.

>>
>>         > +/*
>>         > + * Return how far the lowest time on the timerq (that is
>>         after NOW) is in the
>>         > + * future.
>>         > + * Lock must be grabbed before calling this.
>>         > + */
>>         > +static s_time_t __timerq_next(const struct scheduler *ops,
>>         s_time_t now)
>>         > +{
>>         > +    struct list_head *timerq = rt_timerq(ops);
>>         > +    struct list_head *iter, *tmp;
>>         > +
>>         > +    list_for_each_safe(iter, tmp, timerq)
>>         > +    {
>>         > +        struct rt_vcpu * iter_svc = __t_elem(iter);
>>         > +
>>         > +        if ( iter_svc->next_sched_needed > now )
>>         > +            return (iter_svc->next_sched_needed - now);
>>         > +        else
>>         > +            __t_remove(iter_svc);
>>         > +    }
>>         > +
>>         > +    return MAX_SCHEDULE;
>>         > +}
>>         >
>>         If using a timer, you can just get rid of items while, in the
>>         timer
>>         routine, you handle the event associated to them.
>>
>>         Also, why is MAX_SCHEDULE still there?
>>
>>
>> Honestly, events do not have to be removed here, but it's done to
>> prevent walking a list of stale values to get to the newest one on the
>> next call. This is done more for performance. Their non-removal would
>> be functionally equivalent.
>>
> Of course you should remove the stale entries, I certainly was not
> arguing that the list should just grow indefinitely!
>
> Point is, again, that this is another list walk occurring in a hot
> path, with a big spin lock held. We want to get rid of such thing, not
> adding more of it.
>
>> With the timer alternative you mention, where would the timer events
>> and their data be held? I think that could be extra complexity, unless
>> I fail to understand the idea completely.
>>
> In an event queue like yours, of course. But you won't go through it
> and/or bookkeep it while in hot paths, with the scheduler lock held.
>
> See my email to Meng to have more details on what I have in mind, or
> feel free to ask for more details.
>
>> MAX_SCHEDULE may not be required, but I have it there as an "in case."
>> For example, it will cause the scheduler to be called after
>> MAX_SCHEDULE even when no events are occurring (such as no vcpus). It's
>> there to ensure progress in case of any bugs or unexpected behavior.
>>
> Wait, so, all this work, and then you still want to call rt_schedule()
> every millisecond, when there's nothing to do?!?! :-O
>
>>         That being said, in this specific case, you're moving up
>>         runq_tickle(),
>>         the purpose of which is to trigger a call to rt_schedule()
>>         (after going
>>         through the softirq machinery)... in order to call it in
>>         rt_schedule().
>>         That's pretty tricky, and is not how tickling should work.
>
>> It's set up to only tickle if needed. I'm not sure if this is an issue,
>>
> It's wrong, AFAICT. See my reply to Meng and, please, comment by
> replying to it, if you've got anything to say about this, to make the
> discussion easier to follow.
>
> Regards,
> Dario

I understand what caused our mismatch now. I think option a) you mentioned
makes sense given your values. I am going to look into the details of this
approach and get back with any questions or concerns.

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-13 20:33       ` Dagaen Golomb
@ 2015-06-16 17:07         ` Dagaen Golomb
  2015-06-17  0:20           ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-16 17:07 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

I'm not replying inline because this is a more general response that is not
limited to any particular comment.

Separating the replenishment from the scheduler may be problematic. The nature
of the gEDF scheduler is that when a priority changes it should be instantly
reflected. If replenishments were done by themselves, then the scheduler may
decide to wait for some period of time, and in this same time period a
replenishment could change what should be executing. Either the gEDF policy
will be violated, or the replenishment routine would have to notify the
scheduler after any replenishment, requiring some form of interaction and
possibly more scheduler invocations than the current structure. The other
option is for the scheduler to check for replenishments, as it does now.
Without any interaction a violation of the policy could ensue. The gEDF is
not like the credit scheduler, where there is an accounting period during
which the policy may be violated temporarily. That model is much easier in
the credit scheduler because there is a fixed slice. Imagine if we wanted an
accounting period of 0 for the scheduler. The only option would be to
recompute the replenishments before any scheduling and to run the scheduler
exactly when these replenishments occur. As in, replenishments and scheduling
would have to be coalesced. This is essentially what the current RTDS patch
is doing.

Further, with two timer routines we now need to deal with any ordering or
preemption between them (or logic to prevent/order such), which I could argue
is far more complexity than having them done at once as it is now. Being able
to reason about the execution with one thread in mind is easier for most
people than reasoning about several threads. If both are armed for the same
time, which should execute? The scheduler first, before a possibly applicable
replenishment? Or the replenishment always first? Both with added logic to
enforce this and prevent preemption, of course.

Due to this, it is my belief that, by the nature of the gEDF policy, the
current solution is better, mostly because it appears to actually be the
least complex way that is functionally correct. The gEDF policy just isn't
well suited for decoupling events, as the events are highly dependent on one
another, and particularly dependent at an instantaneous timescale. It would
be a hard pitch for a gEDF scheduler with a similar "accounting period" where
the gEDF policy could be violated. That is blasphemy in the real-time world.
Any other option for implementing the policy correctly would include more
headache due to a multiple-execution mindset and the enforcement and
interaction logic it requires.

It's also worth noting that the stereotypical textbook event-driven model is
what we have now: a single event loop that handles events. In this case the
scheduler is conceptually the main loop, and this makes perfect sense: it's
deciding what to run (think of the running VCPUs as callback functions and
the abstraction falls into place perfectly). The event loop needs some
information (about replenishments) before deciding, and collecting this would
be part of the decode-and-decision phase for an event signal.
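
In rough pseudo-C, the model I have in mind is just this (conceptual
only, the helpers are made up):

    for ( ;; )
    {
        ev = wait_until_next_event();  /* sleep until earliest timerq entry */
        handle(ev);                    /* replenish and/or re-pick what runs */
    }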

Not only is there a complexity issue, but as mentioned before this may be
the best-performing option, making the most information available and the
best timing decision possible. Adding a few extra cycles to a hot path, even
with a lock being held, is worth it if scheduling and context switching are
done less. I'm sure the entire pre-change scheduler is much longer than the
time we've added, and the other model proposed, with more timers, may require
just as much added time as well as the aforementioned complexity, or would
suffer from less intelligent decisions (hence more execution and/or policy
violation) as dependence and cooperation between the timers decreased.

For the above reasons I think you should reconsider the current
implementation, as it appears to be the least complex solution and the
easiest to reason about. This is not to say that the current patch doesn't
have issues. We would still aim to fix any of the concerns about multiple
softirqs or CPU tickles.

Let me know if I'm missing some key insight into how the behavior could be
implemented correctly and beautifully using the multiple-timer approach. I
simply don't see how it can be done without heavy interaction and information
sharing between the timers, which really defeats the purpose.

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-16 17:07         ` Dagaen Golomb
@ 2015-06-17  0:20           ` Dario Faggioli
  2015-06-17  5:24             ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  0:20 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


On Tue, 2015-06-16 at 13:07 -0400, Dagaen Golomb wrote:
> I'm not replying inline because this is a more general response that
> is not limited
> to any particular comment.
> 
That's fine.

Thanks for this actually... I love discussing these things; it reminds me
of the time when I was doing this stuff myself, and makes me feel
young! :-P

> Separating the replenishment from the scheduler may be problematic. The nature
> of the gEDF scheduler is that when a priority changes it should be instantly
> reflected. 
>
'instantly'. Is there such a thing? :-O

> If replenishments were done by themsevles, then the scheduler may
> decide to wait for some period of time, and in this same time period a
> replenishment
> could change what should be executing. 
>
Well, the scheduler then had better *not* decide to wait for any
period of time. It should act like this: as soon as a replenishment
happens, and a new deadline is assigned to a vCPU, put the vCPU in the
runqueue, in the proper position; and if such a newly replenished vCPU
should now preempt any other, currently running, vCPU, 'tickle' the pCPU
in question and let it reschedule, and pick up the newly replenished
vCPU.

> Either the gEDF policy will be
> violated or
> the replenishment routine would have to notify the scheduler after any
> replenishment, requiring some form of interaction and possibly more scheduler
> invocations than the current structure. 
>
Obviously not. In fact, it's your 'current structure' that already
invokes the scheduler for any replenishment. Doing more than that, would
be really hard (impossible, I should say).

In fact, it's the s_timer that you're making fire for any replenishment,
and replenishments happen inside rt_schedule() itself. This is
_the_definition_ of invoking the scheduler!

OTOH, by using a (some) replenishment timer(s), the scheduler *may*
*potentially* be invoked for any replenishment, yes, but it may also
very well not. In fact, if even after a replenishment, the new deadline
of the replenished vCPU is farther than the deadlines of all the
currently running vCPUs, there's no need to tickle any pCPU, no need to
call the scheduler.

BTW, from what you say, it seems to me that we are in agreement: we
_do_not_ want to call the scheduler upon any replenishment. Do as I say
and we'll achieve that.

> The other option is for the scheduler to
> check for replenishments, as it does now. 
>
Yeah, the wrong option. :-D :-D

> Without any interaction a violation of
> the policy could ensue. The gEDF is not like the credit scheduler where there is
> an accounting period where, during this time, policy may be violated
> temporarily.
>
Yep, I do have an idea of what EDF is.

> Further, with two timer routines we now need to deal with any ordering
> or preemption
> between them (or logic to prevent / order such) which I could argue is far more
> complexity than having them done at once as it is now. 
>
Mmm... I think I'm starting to lose you. Ordering between two timers? I
don't see what you mean.

Allow me, though, to walk through the behavior of your current approach.
I've done this already in a previous email, but no big deal re-doing it.
Oh, actually, the current approach is buggy, as you tickle the wrong
vCPU, but let's forget about that (let's assume it's fixed).

Let's say that time for a replenishment to vCPU w has come.

 on pCPU X:
 . timer interrupt
 .   TIMER_SOFTIRQ raised
 .     process softirqs
 .       SCHEDULE_SOFTIRQ raised
 .         process softirqs
 .           rt_schedule()
 .             [spin_lock]
 .             burn_budget(scurr)
 .             __repl_update()  (goes through all runnable vcpus!!)
 .             snext = __runq_pick() -> says vCPU w should run on pCPU Y
 .             runq_tickle(snext) ----> tickle pCPU Y [*]
 .             [spin_unlock]
 .
 on pCPU Y:
 . process softirqs
 .   SCHEDULE_SOFTIRQ
 .     rt_schedule()
 .       [spin_lock]
 .       burn_budget(scurr)
 .       __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
 .       snext = runq_pick() ----------> pick vCPU w
 .       [spin_unlock]
 .
 context switch: vCPU w now runs on pCPU Y

So, you forced pCPU X through a full execution of rt_schedule(), with
the spinlock, the scan of runqueue and depleted queue, and everything,
just for the sake of figuring out that pCPU Y should reschedule. Then
you (as soon as practical) actually reschedule on pCPU Y and context
switch in favour of vCPU w, enacting EDF.

That's 2 scheduler invocations, and rt_schedule() is still ugly and
complex to read, maintain and improve.

[*] and don't forget that this needs fixing, because right now it's just
incorrect, and (just speculating) making runq_pick() behave in the way
you want, may not be super-simple.

Oh, I was almost forgetting! Wonder what happens in the time interval
between when __repl_update() is called (from inside rt_schedule()) on
pCPU X, and the actual time instant of the context switch on pCPU Y?
Yeah, right, what happens is that you're violating EDF. :-/

This can be accounted as a form of overhead introduced by the
implementation, I guess. In this specific case, it's due to the fact
that you can't, from pCPU X, just change *instantly* what's running on
pCPU Y. No, you've got to send an IPI, then Y has to react and it has to
schedule, etc. This is of course dependent on the architecture of the
scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt
this, also does.

So, see? Transient  violation of EDF is just there, no matter the
approach!

Let's look at the timers way:

 on pCPU X:
 . timer interrupt
 .   TIMER_SOFTIRQ raised
 .     process softirqs
 .       replenishment_timer_handler()
 .         [spin_lock]
 .           <for_each_depleted_vcpu(i, i.repl_time < NOW()) {
 .              replenish(i) ---------> replenish vCPU w
 .              runq_tickle(i) -------> tickle pCPU Y
 .            }>
 .         [spin_lock]
 .
 on pCPU Y:
 . process softirqs
 .   SCHEDULE_SOFTIRQ
 .     rt_schedule()
 .       [spin_lock]
 .       burn_budget(scurr)
 .       snext = runq_pick() ----------> pick vCPU w
 .       [spin_unlock]
 .
 context switch: vCPU w now runs on pCPU Y

So this means only 1 invocation of the scheduler, and only on the pCPU
that actually needs to reschedule. There's some overhead due to the
timer, of course, but, notably, this allows for shorter critical
sections, not to mention better looking and easier to maintain code.

Are we still violating EDF? Of course we are, between the replenishment
and context switch time, as above. That's unavoidable, I'm afraid.

Summarizing, the two solutions are on par wrt temporary violation of the
theoretical algorithm, but the timer based approach has loads of other
benefits, so let's go with timers. :-)
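
To be even more concrete, the replenishment handler I have in mind would
look, very roughly, like this (just a sketch, not actual code: repl_timer,
next_pending_replenishment() and do_replenishment() are made-up names, and
corner cases, e.g. moving vCPUs off the depleted queue, are omitted):

    static void repl_timer_handler(void *data)
    {
        const struct scheduler *ops = data;
        struct rt_private *prv = rt_priv(ops);
        struct rt_vcpu *svc;
        unsigned long flags;

        spin_lock_irqsave(&prv->lock, flags);

        /* Service all the replenishments that are due by now. */
        while ( (svc = next_pending_replenishment(prv)) != NULL &&
                svc->cur_deadline <= NOW() )
        {
            do_replenishment(svc);    /* full budget, deadline += period */
            __runq_insert(ops, svc);  /* back in the runq, proper position */
            runq_tickle(ops, svc);    /* kick a pCPU only if a preemption
                                       * is actually needed */
        }

        /* Re-arm for the next pending replenishment, if there is one. */
        if ( svc != NULL )
            set_timer(&prv->repl_timer, svc->cur_deadline);

        spin_unlock_irqrestore(&prv->lock, flags);
    }

rt_schedule() is then left with burning budget and picking from the runq,
without ever having to scan for replenishments itself.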

> If both are armed for the same time, which should execute? Scheduler
> first before
> a possibly applicable replenishment? Or replenishment always first?
> Both with added
> logic to enforce this and prevent preemption, of course.
> 
I really don't see what you mean here. There won't be anything like that
to take care of. Xen offers timers as an abstraction, and deals with
them itself. You only need to take care of properly serializing
rt_schedule() and the timer handling routine, for the code sections that
require that.

> Due to this, it is my belief that by the nature of the gEDF policy the
> current solution is
> better, mostly because it appears to actually be the least complex way that is
> functionally correct. 
>
Not really, no. I wouldn't be convinced of this, even if what you have
right now were functionally correct.

> The gEDF policy just isn't well suited for
> decoupling events, as
> the events are highly dependent on one another, and particularly dependent at an
> instantaneous timescale. 
>
And here comes 'instantaneous timescale' again.

No, sorry, but I don't buy this. In fact, why does it not apply to
vCPUs' wakeups? I think it does, conceptually... So, should we move the
wakeup logic inside rt_schedule() too?

> It would be a hard pitch for a gEDF scheduler with a
> similar "accounting period" where the gEDF policy could be violated. 
>
There's no accounting period of any sort. It's called overhead!

What we need to do is to try to keep it as small as possible. And I'm
quite sure that an effective way to do so is to keep critical sections
short, especially when protected by (potentially) highly contented spin
locks. RTDS, currently, suffer from poor scalability (and, I suspect,
poor latency as well, at least under pressure), and one of the reasons
why the work you're doing is interesting is that it can alleviate
this... if done with that in mind, of course.

> That is
> blasphemy in the real-time world. 
>
Blasphemy, that's a nice one! :-D :-D

Well, I've never been good at religions, I have to admit. So, yes, I can
live with being called blasphemous, I guess. :-)

> Its also worth noting that the stereotypical textbook event-driven
> model is as we have
> now: a single event loop that handles events. In this case the scheduler is the
> conceptually the main loop and this makes perfect sense: its deciding
> what to run
> (think of the running VCPUs as callback functions and the abstraction
> falls into place
> perfectly). The event loop needs some information (about replenishments) before
> deciding, and collecting this would be part of the decode and decision
> phase for an
> event signal.
> 
Talking about 'stereotypes', I don't have any textbook describing an
event-driven model at hand right now. However, you RT-Xen people like
and use LitmusRT, don't you? Well, you're doing the right thing, because
it's a great piece of software!

If you're interested, have a look in there. Spoiler: you'll find a lot
of timers. :-D :-D

More seriously, it's of course rather different, as Linux is not Xen,
but since you're looking for stereotypes:

 - Linux (or at least LitmusRT patch 2014.2, i.e., what I'm looking at
   right now) has ticks. This means a timer fires every X milliseconds
   (on every CPU), and task_tick_litmus() is run as a consequence. That
   function does not (at all!) invoke the scheduler directly; rather, it
   just checks whether doing so is necessary, in this case
   because of some task having exhausted its budget. If it happened, it
   calls litmus_reschedule_local() (see below). I know, this is budget
   enforcement, not replenishment, but I think it works as a good
   example of my point about using timers.

 - Actually, budget enforcement can be done, in LitmusRT, in two ways.
   One is tick based (described above), the other, called 'precise', is
   timer based. To see that in action, check how the enforcement timer
   is handled, e.g., in update_enforcement_timer(). Look at
   on_enforcement_timeout(), and find that it also _does_not_ schedule.
   It again just asks for the scheduler to be invoked as soon as
   practical, via (again) litmus_reschedule_local().

 - Finally, a 'job release', in LitmusRT terminology, is probably what
   is closest to a budget replenishment for us here. In fact, when a
   new RT job is released, it's given full budget, and its scheduling
   parameters (deadline, in most cases) are updated accordingly, like it
   happens, for us, with replenishments.
   Check, therefore, how that happens in add_release() and
   __add_release(). You'll find a call to arm_release_timer() which
   calls, when firing, on_release_timer(). From there, go, e.g., to
   gsnedf_release_jobs(), and follow the call chain through
   check_for_preemptions() --> preempt() --> preempt_if_preemptable()
   --> litmus_reschedule(), which then calls either
   set_tsk_need_resched(current) or smp_send_reschedule(cpu).

litmus_reschedule_local(), and similar functions, do something very
similar to our tickling, i.e., they ask a specific CPU to go through the
scheduler. When this happens via set_tsk_need_resched(current) it's
like, in Xen, tickling the pCPU we're on. When it happens via
smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU
cpu.

Now, although some years have passed, I've interacted quite a bit with
LitmusRT folks. They're very smart, and usually very good at what they
do, and I'd be surprised to find them guilty of that much real-time
blasphemy. :-P

You may also know that there is an EDF implementation in mainline Linux,
and you may want to go have a look there too. I'm not including
any example from there because I'm one of the main authors of it, so
it's pretty obvious that it uses the approach I think is best (and,
more importantly, it would be rather inelegant to cite myself! :-D).

Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.

> Not only is there a complexity issue, but as mentioned before this may be the
> best performing option, making the most information available and the
> best timing
> decision possible. Adding a few extra cycles to a hot path, even with
> a lock being
> held, is worth it if the scheduler and context switching is done less.
>
Right, so let's use timers: they require fewer or equal scheduler
invocations wrt... always invoking the scheduler!! (Not sure why you
mention the number of context switches, that's independent of which of
the two approaches you choose.)

> For the above reasons I think you should reconsider the current
> implementation, as it
> appears it may be the least complex and easiest to reason about
> solution. 
>
Well, no, sorry. :-(

I mean, I appreciate that what you have now (once fixed), may be
effective in avoiding some unnecessary invocation of the scheduler, with
respect to what we currently have in the repo, so I'm not saying it
can't be seen as an improvement. It's a "nice hack", actually.

However, if we're still aiming at making this scheduler a first class
citizen within Xen, we're interested in fewer hacks, rather than in more
--no matter how nice they are. So, with that in mind, it would be a step
in the wrong direction, IMO.

> Let me know if I'm missing some key insight into how the behavior
> could be implemented
> correctly and beautifully using the multiple timer approach.
>
well, I tried. I really did.

> I simply
> don't see how it can
> be done without heavy interaction and information sharing between them
> which really
> defeats the purpose.
> 
No, it doesn't.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  0:20           ` Dario Faggioli
@ 2015-06-17  5:24             ` Dagaen Golomb
  2015-06-17  5:45               ` Meng Xu
  2015-06-17  8:27               ` Dario Faggioli
  0 siblings, 2 replies; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-17  5:24 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

> Thanks for this actually... I love discussing these things, it makes me
> remind the time when I was doing these stuff myself, and makes me feel
> young! :-P

And thank you for the very detailed and well-thought response!

>
>> Separating the replenishment from the scheduler may be problematic. The nature
>> of the gEDF scheduler is that when a priority changes it should be instantly
>> reflected.
>>
> 'instantly'. Is there such a thing? :-O

I meant no accounting period; implicitly it may still happen during scheduler
functions, but from your reply I know you understand this :P

>
>> If replenishments were done by themsevles, then the scheduler may
>> decide to wait for some period of time, and in this same time period a
>> replenishment
>> could change what should be executing.
>>
> Well, the scheduler then would better *not* decide to wait for any
> period of time. It better act like this: as soon as a replenishment
> happens, and a new deadline is assigned to a vCPU, put the vCPU in the
> runqueue, in the proper position; and if such a newly replenished vCPU
> should now preempt any other, currently running, vCPU, 'tickle' the pCPU
> in question and let it reschedule, and pick up the newly replenished
> vCPU.

Yes, this is an option. However, I thought this would actually be an option
you would not like. For the replenishment routine to know when to call the
scheduler and when not to, it basically has to perform the scheduler's logic,
at which point, if there is a change... why doesn't it just do it (i.e., be
the scheduler) instead of adding logic and possibly overhead to call it?

>
>> Either the gEDF policy will be
>> violated or
>> the replenishment routine would have to notify the scheduler after any
>> replenishment, requiring some form of interaction and possibly more scheduler
>> invocations than the current structure.
>>
> Obviously not. In fact, it's your 'current structure' that already
> invokes the scheduler for any replenishment. Doing more than that, would
> be really hard (impossible, I should say).
>
> I fact, it's the s_timer that you're making fire for any replenishment,
> and replenishments happen inside rt_schedule() itself. This is
> _the_definition_ of invoking the scheduler!
>
> OTOH, by using a (some) replenishment timer(s), the scheduler *may*
> *potentially* be invoked for any replenishment, yes, but it may also
> very well not. In fact, if even after a replenishment, the new deadline
> of the replenished vCPU is is father than the deadlines of all the
> currently running vCPUs, there's no need to tickle any pCPU, no need to
> call the scheduler.
>
> BTW, from what you say, it seems to me that we are in agreement: we
> _do_not_ want to call the scheduler upon any replenishment. Do as I say
> and we'll achieve that.

I'm glad we are in agreement on that :)
Also, I'd like to point out that when I said things like "more intelligent
decision handling", I meant exactly things like the above: using as much
information as possible to prevent scheduler invocations. The current patch
doesn't reschedule for any replenishment. For one, it doesn't even look at
the run queue: replenishments here can only happen on a miss.

>
>> The other option is for the scheduler to
>> check for replenishments, as it does now.
>>
> Yeah, the wrong option. :-D :-D
>
>> Without any interaction a violation of
>> the policy could ensue. The gEDF is not like the credit scheduler where there is
>> an accounting period where, during this time, policy may be violated
>> temporarily.
>>
> Yep, I do have an idea of what EDF is.
>
>> Further, with two timer routines we now need to deal with any ordering
>> or preemption
>> between them (or logic to prevent / order such) which I could argue is far more
>> complexity than having them done at once as it is now.
>>
> Mmm... I think I'm starting to lose you. Ordering between two timers? I
> don't see what you mean.
>
> Allow me, though, to walk through the behavior of your current approach.
> I've done this already in a previous email, but no big deal re-doing it.
> Oh, actually, the current approach is buggy, as you tickle the wrong
> vCPU, but let's forget about that (let's assume it's fixed).
>
> Let's say that time for a replenishment to vCPU w has come.
>
>  on pCPU X:
>  . timer interrupt
>  .   TIMER_SOFTIRQ raised
>  .     process softirqs
>  .       SCHEDULE_SOFTIRQ raised
>  .         process softirqs
>  .           rt_schedule()
>  .             [spin_lock]
>  .             burn_budget(scurr)
>  .             __repl_update()  (goes through all runnable vcpus!!)
>  .             snext = __runq_pick() -> says vCPU w should run on pCPU Y
>  .             runq_tickle(snext) ----> tickle pCPU Y [*]
>  .             [spin_unlock]
>  .
>  on pCPU Y:
>  . process softirqs
>  .   SCHEDULE_SOFTIRQ
>  .     rt_schedule()
>  .       [spin_lock]
>  .       burn_budget(scurr)
>  .       __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
>  .       snext = runq_pick() ----------> pick vCPU w
>  .       [spin_unlock]
>  .
>  context switch: vCPU w now runs on pCPU Y
>
> So, you forced pCPU X through a full execution of rt_schedule(), with
> the spinlock, the scan of runqueue and depleted queue, and everything,
> just for the sake of figuring out that pCPU Y should reschedule. Then
> you (as soon as practical) actually reschedule on pCPU Y and context
> switch in favour of vCPU w, enacting EDF.
>
> That's 2 scheduler invocations, and rt_schedule() is still ugly and
> complex to read, maintain and improve.
>
> [*] and don't forget that this needs fixing, because right now it's just
> incorrect, and (just speculating) making runq_pick() behave in the way
> you want, may not be super-simple.
>
> Oh, I was almost forgetting! Wonder what happens in the time interval
> between when __repl_update() is called (from inside rt_schedule()) on
> pCPU X, and the actual time instant of the context switch on pCPU Y?
> Yeah, right, what happens is that you're violating EDF. :-/
>
> This can be accounted as a form of overhead introduced by the
> implementation, I guess. In this specific case, it's due to the  fact
> that you can't, from pCPU X, just change *instantly* what's running on
> pCPU Y. No, you've got to send an IPI, then Y has to react and it has to
> schedule, etc. This is of course dependant on the architecture of the
> scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt
> this, also does.
>
> So, see? Transient  violation of EDF is just there, no matter the
> approach!

Of course, violation during the scheduler is possible for any implementation :(

>
> Let's look at the timers way:
>
>  on pCPU X:
>  . timer interrupt
>  .   TIMER_SOFTIRQ raised
>  .     process softirqs
>  .       replenishment_timer_handler()
>  .         [spin_lock]
>  .           <for_each_depleted_vcpu(i, i.repl_time < NOW()) {
>  .              replenish(i) ---------> replenish vCPU w
>  .              runq_tickle(i) -------> tickle pCPU Y
>  .            }>
>  .         [spin_lock]
>  .
>  on pCPU Y:
>  . process softirqs
>  .   SCHEDULE_SOFTIRQ
>  .     rt_schedule()
>  .       [spin_lock]
>  .       burn_budget(scurr)
>  .       snext = runq_pick() ----------> pick vCPU w
>  .       [spin_unlock]
>  .
>  context switch: vCPU w now runs on pCPU Y
>
> So this means only 1 invocation of the scheduler, and only on the pCPU
> that actually needs to reschedule. There's some overhead due to the
> timer, of course, but, notably, this allows for shorter critical
> sections, not to mention better looking and easier to maintain code.
>
> Are we still violating EDF? Of course we are, between the replenishment
> and context switch time, as above. That's unavoidable, I'm afraid.
>
> Summarizing, the two solutions are on par wrt temporary violation of the
> theoretical algorithm, but the timer based approach has loads of other
> benefits, so let's go with timers. :-)
>
>> If both are armed for the same time, which should execute? Scheduler
>> first before
>> a possibly applicable replenishment? Or replenishment always first?
>> Both with added
>> logic to enforce this and prevent preemption, of course.
>>
> I really don't see what you mean here. There won't be anything like that
> to take care of. Xen offers timers as an abstraction, and deals with
> them itself. You only need to take care of properly serializing
> rt_schedule() and the timer handling routine, for the code sections that
> require that.

This is good to know, thanks. The question would then become how Xen
handles such an occurrence, but I think this is irrelevant now that I know
which "side" you had in mind - that is, that the replenishment handler would
decide when the scheduler should be invoked.

>
>> Due to this, it is my belief that by the nature of the gEDF policy the
>> current solution is
>> better, mostly because it appears to actually be the least complex way that is
>> functionally correct.
>>
> Not really, no. I wouldn't be convinced of this, even if what you have
> right now were functionally correct.
>
>> The gEDF policy just isn't well suited for
>> decoupling events, as
>> the events are highly dependent on one another, and particularly dependent at an
>> instantaneous timescale.
>>
> And here it come 'instantaneous timescale' again.
>
> No, sorry, but I don't buy this. In fact, why does it not apply to
> vCPUs' wakeups? I think it does, conceptually... So, should we move the
> wakeup logic inside rt_schedule() too?
>
>> It would be a hard pitch for a gEDF scheduler with a
>> similar "accounting period" where the gEDF policy could be violated.
>>
> There's no accounting period of any sort. It's called overhead!

This part was referring to the case where we had two timers without extensive
communication, cooperation, or information/logic sharing.

>
> What we need to do is to try to keep it as small as possible. And I'm
> quite sure that an effective way to do so is to keep critical sections
> short, especially when protected by (potentially) highly contented spin
> locks. RTDS, currently, suffer from poor scalability (and, I suspect,
> poor latency as well, at least under pressure), and one of the reasons
> why the work you're doing is interesting is that it can alleviate
> this... if done with that in mind, of course.
>
>> That is
>> blasphemy in the real-time world.
>>
> Blasphemy, that's a nice one! :-D :-D
>
> Well, I've never been good at religions, I have to admit. So, yes, I can
> live with being called blasphemous, I guess. :-)

You get the compliment of blasphemy, I get that of "nice hack" :)

>
>> Its also worth noting that the stereotypical textbook event-driven
>> model is as we have
>> now: a single event loop that handles events. In this case the scheduler is the
>> conceptually the main loop and this makes perfect sense: its deciding
>> what to run
>> (think of the running VCPUs as callback functions and the abstraction
>> falls into place
>> perfectly). The event loop needs some information (about replenishments) before
>> deciding, and collecting this would be part of the decode and decision
>> phase for an
>> event signal.
>>
> Talking about 'stereotypes', I don't have any textbook describing an
> event-driven model at hand right now. However, you RT-Xen people like
> and use LitmusRT, don't you? Well, you're doing the right thing, because
> it's a great piece of software!
>
> If you're interested, have a look in there. Spoiler: you'll find a lot
> of timers. :-D :-D
>
> More seriously, it's of course rather different, as Linux is not Xen,
> but since you're looking for stereotypes:
>
>  - Linux (or at lease LitmusRT patch 2014.2, i.e., what I'm looking at
>    right now) has ticks. This means a timer fires every X milliseconds
>    (on every CPU), and task_tick_litmus() is run as a consequence. Such
>    function does not (at all!) invoke the scheduler directly, it much
>    rather just checks whether doing so is necessary, in this case
>    because of some task having exhausted its budget. If it happened, it
>    calls litmus_reschedule_local() (see below). I know, this is budget
>    enforcement, not replenishment, but I think it works as a good
>    example of my point about using timers.
>
>  - Actually, budget enforcement can be done, in LitmusRT, in two ways.
>    One is tick based (described above), the other, called 'precise', is
>    timer based. To see that in action, check how the enforcement timer
>    is handled, e.g., in update_enforcement_timer(). Look at
>    on_enforcement_timeout(), and find that it also _does_not_ schedule.
>    It again just asks for the scheduler to be invoked as soon as
>    practical, via (again) litmus_reschedule_local().
>
>  - Finally, a 'job release', in LitmusRT terminology, is probably what
>    is most close to a budget replenishment for us here. In fact, when a
>    new RT job is released, it's given full budget, and its scheduling
>    parameters (deadline, in most cases) are updated accordingly, like it
>    happens, for us, with replenishments.
>    Check, therefore, how that happens in add_release() and
>    __add_release(). You'll find a call to arm_release_timer() which
>    calls, when firing, on_release_timer(). From there, go, e.g., to
>    gsnedf_release_jobs(), and follow the call chain through
>    check_for_preemptions() --> preempt() --> preempt_if_preemptable()
>    --> litmus_reschedule(), which then calls either
>    set_tsk_need_resched(current) or smp_send_reschedule(cpu).
>
> litmus_reschedule_local(), and similar functions, do something very
> similar to our tickling, i.e., they ask a specific CPU to go through the
> scheduler. When this happens via set_tsk_need_resched(current) it's
> like, in Xen, tickling the pCPU we're on. When it happens via
> smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU
> cpu.
>
> Now, although some years have passed, I've interacted quite a bit with
> LitmusRT folks. They're very smart, and usually very good at what they
> do, and I'd be surprised to find them guilty of that much real-time
> blasphemy. :-P
>
> You also may know that there is an EDF implementation in mainline Linux,
> and you may would like to go have a look there too. I'm not including
> any example from there because I'm one of the  main authors of it, so
> it's pretty obvious that it uses the approach I think it's best (and,
> more important, it would be rather inelegant to cite myself! :-D).
>
> Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.

Thanks for the history and background lesson here :)

>
>> Not only is there a complexity issue, but as mentioned before this may be the
>> best performing option, making the most information available and the
>> best timing
>> decision possible. Adding a few extra cycles to a hot path, even with
>> a lock being
>> held, is worth it if the scheduler and context switching is done less.
>>
> Right, so let's use timers, they require less or equal scheduler
> invocations wrt to... Always invoking the scheduler!! (Not sure why you
> mention the number of context switches, that's independent on which of
> the two approaches you  choose.)
>
>> For the above reasons I think you should reconsider the current
>> implementation, as it
>> appears it may be the least complex and easiest to reason about
>> solution.
>>
> Well, no, sorry. :-(
>
> I mean, I appreciate that what you have now (once fixed), may be
> effective in avoiding some unnecessary invocation of the scheduler, with
> respect to what we currently have in the repo, so I'm not saying it
> can't be seen as an improvement. It's a "nice hack", actually.

"nice hack." heh, heh, thanks :P

>
> However, if we're  still aiming at making this scheduler a first class
> citizen within Xen, we're interested in less hacks, rather than in more
> --no matter how nice they are. So, with that in mind, it would be a step
> in the wrong direction, IMO.
>
>> Let me know if I'm missing some key insight into how the behavior
>> could be implemented
>> correctly and beautifully using the multiple timer approach.
>>
> well, I tried. I really did.
>
>> I simply
>> don't see how it can
>> be done without heavy interaction and information sharing between them
>> which really
>> defeats the purpose.
>>
> No, it doesn't.

Ok, this last line is the zinger.

Almost this entire email was built on the premise that you would NOT like the
idea of the replenishment handler having basically the same decision logic as
the scheduler and then tickling the pCPU to pick up the new vCPU. Actually,
with it done this way, I would have a hard time calling the tickle-invoked
method the "scheduler." It would be more like the mover, with the
replenishment function being essentially the scheduler. In this case, I'm not
too sure how much different this would be from what we have now. It feels
like, from what you want, that we could get the same behavior by modifying
rt_schedule to do replenishments first, then check if a reschedule is needed
(these two parts would be in this proposed replenishment timer routine), and
then perform any move if necessary. I know this isn't exactly what you want,
but that sounds close, right?

But the scheduler will have to decide which vCPU to move in, so that logic is
done twice. Also, if these are done back-to-back and require the locks, isn't
it really the same as having one long hot path? If you want maintainability,
couldn't we just do replenishment, then schedule and move (if necessary), in
one timer (the one we have now), and split them into functions? It seems this
can be done with one timer neatly.

So here's my proposal, let's see if it fits what you want:
1.) The scheduler does not do any timing.
2.) Replenishments are scheduled via a timer at each [next] replenishment
period. Each replenishment re-sorts the replenished vCPUs, and if any of
the first #CPUs in the runq change, we tickle a pCPU for each change.

In this case, we can use one timer. We could use the current one as a
replenishment timer, and make it so rt_schedule isn't the default invoked
method.
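
To make this concrete, here is a rough sketch of the handler I am
picturing. It is only an illustration: repl_timer, vcpu_replenish() and
next_replenishment() are made-up names, while the queue/tickle helpers
are meant to be the ones already in sched_rt.c.

    static void repl_timer_handler(void *data)
    {
        const struct scheduler *ops = data;
        struct rt_private *prv = rt_priv(ops);
        struct list_head *iter, *tmp;
        s_time_t now = NOW();
        unsigned long flags;

        spin_lock_irqsave(&prv->lock, flags);

        /* Replenish every depleted vcpu whose period has ended, put it
         * back on the runq in EDF order, and tickle a pCPU if that vcpu
         * should now preempt somebody. (A full version would also
         * refresh vcpus still sitting on the runq.) */
        list_for_each_safe ( iter, tmp, rt_depletedq(ops) )
        {
            struct rt_vcpu *svc = __q_elem(iter);

            if ( now >= svc->cur_deadline )
            {
                vcpu_replenish(svc);       /* new deadline + full budget */
                __q_remove(svc);
                __runq_insert(ops, svc);
                runq_tickle(ops, svc);
            }
        }

        /* One timer only: re-arm it for the next replenishment instant. */
        set_timer(&prv->repl_timer, next_replenishment(prv));

        spin_unlock_irqrestore(&prv->lock, flags);
    }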

Does this sound similar to what you are suggesting? I have to ask because
I thought you wouldn't like the idea, and it's not really *that* far off
from what we have now. It's a little restructuring so that replenishments
occur before any scheduling activity and the handler checks if switching
is needed (basically acting as the scheduler) and then calls tickle.
Sounds like what you had in mind?

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:24             ` Dagaen Golomb
@ 2015-06-17  5:45               ` Meng Xu
  2015-06-17  6:03                 ` Dagaen Golomb
  2015-06-17  9:20                 ` Dario Faggioli
  2015-06-17  8:27               ` Dario Faggioli
  1 sibling, 2 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-17  5:45 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

Hi Dagaen,

I'm just commenting on the summary of the scheduler design you proposed
at the end of the email. I'm looking forward to Dario's more insightful
reply.
>
>
> >
> >> I simply
> >> don't see how it can
> >> be done without heavy interaction and information sharing between them
> >> which really
> >> defeats the purpose.
> >>
> > No, it doesn't.
>
> Ok this last line is the zinger.
>
> Almost this entire email was built on the premise that you would NOT like the
> idea of the replenishment handler having basically the same decision logic
> as the scheduler and then tickling the pCPU to pick up the new vCPU. Actually,
> with it done this way, I would have a hard time calling the
> tickle-invoked method
> the "scheduler." It would be more like the mover, with the
> replenishment function
> being essentially the scheduler. In this case, I'm not too sure
> actually how much
> different this would be from what we have now. It feels like, from
> what you want,
> that we could get the same behavior by modifying rt_schedule to do
> replenishments
> first, then check if a reschedule is needed (these two parts would be in this
> proposed replenishment timer routine) and the perform any move if necessary. I
> know this isn't exactly what you want, but that sounds close right?
>
> But the scheduler will have to decide which to move in, so that logic
> is done twice.
> Also, if these are done back-to-back and require the locks, isn't it
> really the same
> as having one long hot path? If you want maintainability, couldn't we just do
> replenishment then schedule and move (if necessary) in one timer (the
> one we have
> now) and move them to functions. It seems this can be done with one
> timer neatly.
>
> So here's my proposal, lets see if it fits what you want:


I will leave this to Dario to answer if it fits what he wants. :-P


>
> 1.) The scheduler does not do any timing,


Not really. rt_schedule does the budget enforcement. When the current
VCPU runs out of budget, rt_schedule will be invoked by a timer (you can
refer to the schedule() function in xen/common/schedule.c to see how the
timer is armed/disarmed). When rt_schedule is invoked, it needs to:
a) update the budget of the currently running VCPU and move it from the
runq to the depleted queue;
b) pick the current highest-priority VCPU from the runq and return it as
the snext VCPU.

So scheduling logic is still involved in the rt_schedule function.
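
In other words, a very rough sketch of that enforcement path (the helper
names charge_budget(), move_to_depletedq() and pick_earliest_deadline()
are made up, standing in for the real code):

    /* Sketch only: what rt_schedule() keeps doing when the enforcement
     * timer fires, even with replenishments moved elsewhere. */
    static struct rt_vcpu *enforce_and_pick(const struct scheduler *ops,
                                            struct rt_vcpu *scurr,
                                            s_time_t now)
    {
        /* a) charge scurr for the time it has run; if its budget is
         *    now exhausted, move it from the runq to the depleted queue. */
        charge_budget(scurr, now);
        if ( scurr->cur_budget <= 0 )
            move_to_depletedq(ops, scurr);

        /* b) the runnable vcpu with the earliest cur_deadline is snext. */
        return pick_earliest_deadline(rt_runq(ops));
    }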

>
> 2.) replenishments are scheduled via timer at each [next]
> replenishment period. Each
> replenishment resorts the replenished vCPUs, and if any of the first
> #CPUs in the
> runq change, we tickle a pCPU for each change


This is right.

>
>
> In this case, we can use one timer.


We actually have two: one for budget enforcement and the other for
budget replenishment.


>
> We could use the current one as a
> replenishment
> timer, and make it so rt_schedule isn't the default invoked method.
>
> Does this sound similar to what you are suggesting?


I don't think so, but I will leave it for Dario to give his opinion.

In Dario's suggestion, you just simply take the update_budget function
out of rt_schedule. This is because budget enforcement, which is the work
of rt_schedule, does not naturally involve budget replenishment.

In order to achieve budget replenishment, we need another timer to invoke
another function (a budget_replenish function) that is dedicated to that.
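
With Xen's generic timer interface, that would look roughly like the
sketch below. The repl_timer field and budget_replenish() do not exist
yet; the names are just placeholders.

    /* Sketch: a dedicated replenishment timer using Xen's timer API.
     * prv->repl_timer and budget_replenish() are placeholders only. */
    static void budget_replenish(void *data);   /* the dedicated handler */

    static void start_repl_timer(struct scheduler *ops, s_time_t first)
    {
        struct rt_private *prv = rt_priv(ops);

        /* Done once, e.g. from rt_init(): bind the timer to its handler. */
        init_timer(&prv->repl_timer, budget_replenish, ops,
                   smp_processor_id());

        /* Armed here for the first replenishment instant; the handler
         * then re-arms it for the next one each time it runs. */
        set_timer(&prv->repl_timer, first);
    }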

>
> I have to ask
> because I thought
> you wouldn't like the idea,


I guess Dario won't like this idea. :-P (I'm kidding, but it is probably
the case.)


>
> and its not really *that* far off from
> what we have now, Its
> a little restructuring so that replenishments occur before any
> scheduling activity and
> the handler checks if switching is needed (basically acting as the
> scheduler) and then
> calls tickle. Sounds like what you had in mind?


Thanks and best regards,

Meng
-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:45               ` Meng Xu
@ 2015-06-17  6:03                 ` Dagaen Golomb
  2015-06-17  6:09                   ` Meng Xu
  2015-06-17  9:20                 ` Dario Faggioli
  1 sibling, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-17  6:03 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 4501 bytes --]

Thanks for the reply, budget enforcement in the scheduler timer makes
sense. I think I have an idea of what he wants done now.

~Dagaen
On Jun 17, 2015 1:45 AM, "Meng Xu" <xumengpanda@gmail.com> wrote:

> Hi Dagaen,
>
> I just comment on the summary of scheduler design you proposed at the
> end of the email. I'm looking forward to Dario's more insightful
> reply.
> >
> >
> > >
> > >> I simply
> > >> don't see how it can
> > >> be done without heavy interaction and information sharing between them
> > >> which really
> > >> defeats the purpose.
> > >>
> > > No, it doesn't.
> >
> > Ok this last line is the zinger.
> >
> > Almost this entire email was built on the premise that you would NOT
> like the
> > idea of the replenishment handler having basically the same decision
> logic
> > as the scheduler and then tickling the pCPU to pick up the new vCPU.
> Actually,
> > with it done this way, I would have a hard time calling the
> > tickle-invoked method
> > the "scheduler." It would be more like the mover, with the
> > replenishment function
> > being essentially the scheduler. In this case, I'm not too sure
> > actually how much
> > different this would be from what we have now. It feels like, from
> > what you want,
> > that we could get the same behavior by modifying rt_schedule to do
> > replenishments
> > first, then check if a reschedule is needed (these two parts would be in
> this
> > proposed replenishment timer routine) and the perform any move if
> necessary. I
> > know this isn't exactly what you want, but that sounds close right?
> >
> > But the scheduler will have to decide which to move in, so that logic
> > is done twice.
> > Also, if these are done back-to-back and require the locks, isn't it
> > really the same
> > as having one long hot path? If you want maintainability, couldn't we
> just do
> > replenishment then schedule and move (if necessary) in one timer (the
> > one we have
> > now) and move them to functions. It seems this can be done with one
> > timer neatly.
> >
> > So here's my proposal, lets see if it fits what you want:
>
>
> I will leave this to Dario to answer if it fits what he wants. :-P
>
>
> >
> > 1.) The scheduler does not do any timing,
>
>
> Not really. The rt_schedule does the budget enforcement. When a
> current VCPU runs out of budget, the rt_schedule will be invoked by a
> timer (you can refer to the schedule function in xen/sched/schedule.c
> to have a look how the timer is armed/disarmed.). When the rt_schedule
> is invoked, it needs to:
> a) update the budget of the current running VCPU and move it from runq
> to depleted queue;
> b) pick the current highest VCPU from runq and return it as the snext VCPU.
>
> So scheduling logic is still  involved in the rt_schedule function.
>
> >
> > 2.) replenishments are scheduled via timer at each [next]
> > replenishment period. Each
> > replenishment resorts the replenished vCPUs, and if any of the first
> > #CPUs in the
> > runq change, we tickle a pCPU for each change
>
>
> This is right.
>
> >
> >
> > In this case, we can use one timer.
>
>
> We actually have two: one for budget enforcement and the other for
> budget replenishment.
>
>
> >
> > We could use the current one as a
> > replenishment
> > timer, and make it so rt_schedule isn't the default invoked method.
> >
> > Does this sound similar to what you are suggesting?
>
>
> I don't think so, but I will leave it for Dario's for his opinion.
>
> In Dario's suggestion, you just simply remove the update_budget
> function out of rt_schedule. This is because budget enforcement, which
> is the work of rt_schedule, does not naturelly involves budget
> replenishment.
>
> In order to achieve budget replenishment, we need another timer to
> invoke another function (budget_replenish function), that is dedicated
> to that.
>
> >
> > I have to ask
> > because I thought
> > you wouldn't like the idea,
>
>
> I guess Dario won't like this idea. :-P (I'm kidding, but it should be
> the case.)
>
>
> >
> > and its not really *that* far off from
> > what we have now, Its
> > a little restructuring so that replenishments occur before any
> > scheduling activity and
> > the handler checks if switching is needed (basically acting as the
> > scheduler) and then
> > calls tickle. Sounds like what you had in mind?
>
>
> Thanks and best regards,
>
> Meng
> --
>
>
> -----------
> Meng Xu
> PhD Student in Computer and Information Science
> University of Pennsylvania
> http://www.cis.upenn.edu/~mengxu/
>

[-- Attachment #1.2: Type: text/html, Size: 5593 bytes --]


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  6:03                 ` Dagaen Golomb
@ 2015-06-17  6:09                   ` Meng Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-17  6:09 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

Hi Dagaen,

First, please try not to top-post on the ML. :-) As with the one you just
sent, it is better to reply inline to the comment instead of replying at
the top. :-)

2015-06-16 23:03 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> Thanks for the reply, budget enforcement in the scheduler timer makes sense.
> I think I have an idea of what he wants done now.

Great! It is better to summarize your understanding of the design and
send it to the ML to check whether your understanding is correct, as
Chong did for the improved RTDS scheduler toolstack. This will save
rounds of patches.

Thanks,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:24             ` Dagaen Golomb
  2015-06-17  5:45               ` Meng Xu
@ 2015-06-17  8:27               ` Dario Faggioli
  2015-06-18 18:07                 ` Dagaen Golomb
  1 sibling, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  8:27 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 9995 bytes --]

[Let me just reply to this... Then I'll jump to the end of the thread]

On Wed, 2015-06-17 at 01:24 -0400, Dagaen Golomb wrote:

> >> If replenishments were done by themsevles, then the scheduler may
> >> decide to wait for some period of time, and in this same time period a
> >> replenishment
> >> could change what should be executing.
> >>
> > Well, the scheduler then would better *not* decide to wait for any
> > period of time. It better act like this: as soon as a replenishment
> > happens, and a new deadline is assigned to a vCPU, put the vCPU in the
> > runqueue, in the proper position; and if such a newly replenished vCPU
> > should now preempt any other, currently running, vCPU, 'tickle' the pCPU
> > in question and let it reschedule, and pick up the newly replenished
> > vCPU.
> 
> Yes, this is an option. However, I thought this would actually be an
> option you would
> not like. 
>
How so... I've been arguing for this the whole time?!?! :-O

I'm sure I've put down a sketch of what I think the replenishment
function should do in my first or second email in the thread, and I'm
sure that involved a call to runq_tickle().

> For the replenishment routine to know when to call the
> scheduler and when
> not to, it basically has to perform the the scheduler's logic, at
> which point if there is a
> change... why doesn't it just do it (i.e., be the scheduler) instead
> of adding logic and
> possibly overhead to call it.
> 
It has to call runq_tickle(), which is not "the scheduler's logic"; it is
the function that decides whether a pCPU should reschedule.

runq_tickle(), right now, is called from within rt_vcpu_wake(), which
makes perfect sense, and from within rt_context_saved(), which makes no
sense. It should be called from within rt_vcpu_wake() and from the
replenishment handling routine.

Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that
checks whether scurr->cur_deadline<=snext->cur_deadline.
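
That is, roughly this check (a simplified sketch of the comparison, not
a literal quote of the code):

    /* scurr: the vcpu currently running here; snext: what __runq_pick()
     * returned. Keep scurr if it is runnable and its deadline is not
     * later than snext's. */
    if ( !is_idle_vcpu(scurr->vcpu) && vcpu_runnable(scurr->vcpu) &&
         scurr->cur_deadline <= snext->cur_deadline )
        snext = scurr;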

Anyway, I've zero interest in turning this into a fight over
terminology... If you want to call runq_tickle() "the scheduler", go
ahead, it would just make communication a bit more difficult, but I'm up
for the challenge! :-)

Oh, BTW, while we're here, __runq_pick() being called from a bunch of
places outside of rt_schedule() is also something I never liked (you
can go check that in the archives), and I also blame the confusion
between scheduling and replenishments for the fact that such a thing is
necessary. I seriously hope this can be fixed too.

> Also, I'd like to point out that when I said things like "make more intelligent"
> decision handling, I meant exactly things like the above: using as much
> information as possible to prevent scheduler invocations. The current patch
> doesn't reschedule for any replenishment.
>
How does it not? You're arming s_timer to fire at either the next
replenishment or the next budget-enforcement point in time...
>  Firstly, it doesn't even look at
> the run queue: replenishments here can only happen on miss.
> 
... Oh, wait, I know! I know! I know! You mean "it doesn't tickle at any
replenishment", or "it doesn't context switch at any replenishment",
don't you?

(See, I'm good at adapting to "the new" terminology! :-D)

> > So, see? Transient  violation of EDF is just there, no matter the
> > approach!
> 
> Of course, violation during the scheduler is possible for any implementation :(
> 
Yeah, well, it was you who said that we should stick to your design to
prevent violations, which would have crept in if going with my solution.
Bah, anyway, it's probably best to let it go...

> > I really don't see what you mean here. There won't be anything like that
> > to take care of. Xen offers timers as an abstraction, and deals with
> > them itself. You only need to take care of properly serializing
> > rt_schedule() and the timer handling routine, for the code sections that
> > require that.
> 
> This is good to know, thanks. The question would then change to how does
> Xen handle such an occurrence, but I think this is irrelevant now that I know
> which "side" you had in mind - that is, that the replenishment handler would
> decide when the scheduler should be invoked.
> 
It should call runq_tickle(), yes.

> >> It would be a hard pitch for a gEDF scheduler with a
> >> similar "accounting period" where the gEDF policy could be violated.
> >>
> > There's no accounting period of any sort. It's called overhead!
> 
> This part was referring to if we had two timers without extensive communication,
> cooperation, or information/logic sharing.
> 
I'm in the dark about what this "extensive communication, cooperation,
or information/logic sharing" could be, I have to admit.

If it means that the replenishment handler should call runq_tickle(),
then yes, let me state it once more: the replenishment handling routine
should call runq_tickle() (other places, with the only other exception
of rt_vcpu_wake(), shouldn't).

> > You also may know that there is an EDF implementation in mainline Linux,
> > and you may would like to go have a look there too. I'm not including
> > any example from there because I'm one of the  main authors of it, so
> > it's pretty obvious that it uses the approach I think it's best (and,
> > more important, it would be rather inelegant to cite myself! :-D).
> >
> > Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.
> 
> Thanks for the history and background lesson here :)
> 
NP. Did you have a look? What do you think, do you like it? :-D

> >> I simply
> >> don't see how it can
> >> be done without heavy interaction and information sharing between them
> >> which really
> >> defeats the purpose.
> >>
> > No, it doesn't.
> 
> Ok this last line is the zinger.
> 
> Almost this entire email was built on the premise that you would NOT like the
> idea of the replenishment handler having basically the same decision logic
> as the scheduler and then tickling the pCPU to pick up the new vCPU. 
>
Which is true: they're separate and different things, and they must have
separate and different logic. The fact that one may decide when it's
time to call the other is perfectly logical.

And in fact, I want __runq_pick() and related logic to be in
rt_schedule(), and nowhere else, while I want runq_tickle() to be done
from replenishment (and wakeup), and nowhere else.

> Actually,
> with it done this way, I would have a hard time calling the
> tickle-invoked method
> the "scheduler." 
>
With "tickle-invoked method" you mean rt_schedule(), I guess.

> In this case, I'm not too sure
> actually how much
> different this would be from what we have now. 
>
Very different. Mostly because functions will have a clear and well
defined semantic, and will be called to fulfill their own purpose, from
places where it makes sense to do so, not every now and then, like now:

 - rt_schedule() for picking a new vcpu to be run and for doing runtime
   accounting and enforcement;
 - __runq_pick() to do the actual pick operation, from within
   rt_schedule() only, not every now and then, like now;
 - __repl_update() to do replenishments, from within the replenishment
   handling routine, in an event-triggered fashion, not every now and
   then, like now;
 - runq_tickle() to check whether a pCPU needs to go through the
   scheduler, from either the vcpu wakeup path, or because of a
   replenishment, not every now and then, like now.
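
(And the tickling itself stays cheap: once runq_tickle() has decided
that some pCPU must go through the scheduler, what it does is,
simplified, just this:)

    /* Simplified tail of runq_tickle(): remember that this pCPU has
     * been tickled, and ask it to run the scheduler. */
    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);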

> It feels like, from
> what you want,
> that we could get the same behavior by modifying rt_schedule to do
> replenishments
> first, then check if a reschedule is needed (these two parts would be in this
> proposed replenishment timer routine) and the perform any move if necessary. I
> know this isn't exactly what you want, but that sounds close right?
>
Ehm... not really? It indeed sounds like the opposite of how I think
things should be done (and I'm running out of different ways in which
I'm able to tell that! :-O)

> But the scheduler will have to decide which to move in, so that logic
> is done twice.
>
Done twice? What's done twice?

> Also, if these are done back-to-back and require the locks, isn't it
> really the same
> as having one long hot path? 
>
What's back-to-back?

> If you want maintainability, couldn't we just do
> replenishment then schedule and move (if necessary) in one timer (the
> one we have
> now) and move them to functions. It seems this can be done with one
> timer neatly.
> 
Not to me, but I may have lost you again, sorry.

> So here's my proposal, lets see if it fits what you want:
>
Ok, let me leave this here, and deal with this last part by replying to
Meng's email.

Regards,
Dario

> 1.) The scheduler does not do any timing,
> 2.) replenishments are scheduled via timer at each [next]
> replenishment period. Each
> replenishment resorts the replenished vCPUs, and if any of the first
> #CPUs in the
> runq change, we tickle a pCPU for each change
> 
> In this case, we can use one timer. We could use the current one as a
> replenishment
> timer, and make it so rt_schedule isn't the default invoked method.
> 
> Does this sound similar to what you are suggesting? I have to ask
> because I thought
> you wouldn't like the idea, and its not really *that* far off from
> what we have now, Its
> a little restructuring so that replenishments occur before any
> scheduling activity and
> the handler checks if switching is needed (basically acting as the
> scheduler) and then
> calls tickle. Sounds like what you had in mind?
> 
> Regards,
> ~Dagaen

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:45               ` Meng Xu
  2015-06-17  6:03                 ` Dagaen Golomb
@ 2015-06-17  9:20                 ` Dario Faggioli
  1 sibling, 0 replies; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  9:20 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 6394 bytes --]

On Tue, 2015-06-16 at 22:45 -0700, Meng Xu wrote:

> > So here's my proposal, lets see if it fits what you want:
>
> I will leave this to Dario to answer if it fits what he wants. :-P

> > 1.) The scheduler does not do any timing,

> Not really. The rt_schedule does the budget enforcement. When a
> current VCPU runs out of budget, the rt_schedule will be invoked by a
> timer (you can refer to the schedule function in xen/sched/schedule.c
> to have a look how the timer is armed/disarmed.). When the rt_schedule
> is invoked, it needs to:
> a) update the budget of the current running VCPU and move it from runq
> to depleted queue;
> b) pick the current highest VCPU from runq and return it as the snext VCPU.
> 
> So scheduling logic is still  involved in the rt_schedule function.
> 
Correct, thanks Meng.

> > 2.) replenishments are scheduled via timer at each [next]
> > replenishment period. Each
> > replenishment resorts the replenished vCPUs, and if any of the first
> > #CPUs in the
> > runq change, we tickle a pCPU for each change

> This is right.
> 
Yeah, sort of. I mean, I'm not sure what "Each replenishment resorts the
replenished vCPUs" means. If it means that a replenished vCPU is given a
new deadline, and hence needs to move from depleted queue to runqueue,
in a position within runqueue that reflects its new deadline... then, I
agree.

This requires holding the runqueue lock, but we need it anyway, for now.

This may or may not require a pCPU to be tickled, and it's
runq_tickle()'s job to decide whether that is the case and, if yes,
which one.

Also, about this "we tickle a pCPU for each change": first of all, as I
just said, that's not _always_ the case, and we should avoid tickling
when it's not necessary. Also, when it's necessary in principle, it
makes little sense to tickle a pCPU twice in a very tight loop (i.e.,
without giving it the chance to reschedule).

So if, for instance, two different replenishments (no matter whether
they are serviced together, or in different invocations of the
replenishment timer handler) both determine that pCPU Y is the one that
should be tickled (because it's the one running the vCPU with the latest
deadline, and both replenished vCPUs have a more imminent deadline than
what's running on Y, but a later one than what's running everywhere
else), there's no point in tickling twice. Not that doing it would be
harmful, it just won't have any effect. What I mean, rather, is that you
can use this information to keep runq_tickle() simple/quick.

Note that this is pretty much what's happening already in runq_tickle(),
thanks to the use of the prv->tickled bitmap, so the point here is
making sure that "we tickle a pCPU for each change" doesn't mean
removing this, because I think that would be wrong. It's probably
unlikely that this was what Dagaen meant, but I thought I'd better state
things clearly. :-)
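
For reference, the filtering I mean is along these lines (a simplified
sketch, not a literal quote; 'online' stands for the mask of pCPUs this
scheduler is in charge of):

    /* Only consider pCPUs that have not been tickled since they last
     * went through the scheduler; a second tickle would be a no-op. */
    cpumask_t not_tickled;

    cpumask_andnot(&not_tickled, online, &prv->tickled);

    /* Then: prefer an idle pCPU in not_tickled; otherwise pick the one
     * in not_tickled running the vcpu with the latest deadline, and
     * only if that deadline is later than the replenished vcpu's. */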

> > In this case, we can use one timer.
>
> We actually have two: one for budget enforcement and the other for
> budget replenishment.
> 
EXACTLY!! :-)

> > We could use the current one as a
> > replenishment
> > timer, and make it so rt_schedule isn't the default invoked method.
> >
> > Does this sound similar to what you are suggesting?
>
> I don't think so, but I will leave it for Dario's for his opinion.
> 
"use current one as replenishment timer, and make it so rt_schedule
isn't the default invoked method"

Please, please, please, don't do that! :-P

> In Dario's suggestion, you just simply remove the update_budget
> function out of rt_schedule. This is because budget enforcement, which
> is the work of rt_schedule, does not naturelly involves budget
> replenishment.
> 
Yes. Or at least, that's the core of my suggestion, yes. In more
detail, my suggestion includes getting rid of a bunch of other
invocations of scheduling/picking and replenishment-related functions
from many other places, as I tried to make clear in my last email.

Perhaps the fact that I hadn't stated that clearly enough until now was
what was making it a bit harder for you to see what I meant by 'making
things simple', etc. I hope I have explained myself better now.

> In order to achieve budget replenishment, we need another timer to
> invoke another function (budget_replenish function), that is dedicated
> to that.
> 
Yep, indeed.

> > I have to ask
> > because I thought
> > you wouldn't like the idea,
>
> I guess Dario won't like this idea. :-P (I'm kidding, but it should be
> the case.)
> 
You're right, it's not. :-)

> >
> > and its not really *that* far off from
> > what we have now, Its
> > a little restructuring so that replenishments occur before any
> > scheduling activity and
> > the handler checks if switching is needed (basically acting as the
> > scheduler) and then
> > calls tickle. Sounds like what you had in mind?
> 
This need for a strict ordering between replenishments and scheduling
is something that you (Dagaen) are insisting on a lot, while I really
don't see why it would be a good thing.

I've stated countless times that they're independent, which also means
there's no strict ordering requirement between them.

Maybe what you're hinting at is that we may want to think of a better
way of handling rescheduling on the 'local' pCPU. That is to say, if
the replenishment timer fires on pCPU X, and runq_tickle() determines
that it's pCPU X itself that should reschedule, there may be a quicker
way to do so than raising the SCHEDULE_SOFTIRQ and waiting. I.e., some
kind of "hey man, we're here, let's just call (rt_)schedule() and get
done with it!".

If it's this, I thought about it too, many times, but whether it is an
actual issue, and whether it would actually be a good thing, is still
unclear. For instance, Linux has this, but, of course, we are not
Linux. :-)

In any case, this is independent from Dagaen's work, and it's even
general enough that other schedulers may be interested in it, so I'm
happy to leave this for future investigation.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  8:27               ` Dario Faggioli
@ 2015-06-18 18:07                 ` Dagaen Golomb
  2015-06-22  9:11                   ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-18 18:07 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

>> Yes, this is an option. However, I thought this would actually be an
>> option you would
>> not like.
>>
> How so... I've been arguing for this the whole time?!?! :-O
>
> I'm sure I've put down a sketch of what I think the replenishment
> function should do in my first or second email in the thread, and I'm
> sure that involved a call to runq_tickle()

I just thought that since you didn't like the scheduler handling
replenishments, you wouldn't be very fond of replenishment calling the
scheduler. I don't find either one an issue, so if you prefer one over
the other, that's fine of course.

>> For the replenishment routine to know when to call the
>> scheduler and when
>> not to, it basically has to perform the the scheduler's logic, at
>> which point if there is a
>> change... why doesn't it just do it (i.e., be the scheduler) instead
>> of adding logic and
>> possibly overhead to call it.
>>
> It has to call runq_tickle(), which is not "the scheduler's logic", is
> the function that decides whether a pCPU should reschedule.
>
> runq_tickle(), right now, is called from within rt_vcpu_wake(), which
> makes perfect sense, and from within rt_context_saved(), which makes no
> sense. It should be called from within rt_vcpu_wake() and from the
> replenishment handling routine.
>
> Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that
> checks whether scurr->cur_deadline<=snext->cur_deadline.
>
> Anyway, I've zero interest in turning this into a fight over
> terminology... If you want to call runq_tickle() "the scheduler", go
> ahead, it would just make communication a bit more difficult, but I'm up
> for the challenge! :-)
>
> Oh, BTW, while we're here, __runq_pick() being called from a bunch of
> places, outside of rt_schedule(), is also something I never liked (you
> can go check that in the archives), and I also blame the confusion
> between scheduling and replenishmets, for the fact that such a thing is
> necessary. I seriously hope this can be fixed too.

I have no interest in a terminology war either. I just figured that if we
want the replenishment function to only tickle when necessary, then it
should check that the replenished vCPU has a higher priority than the
currently running ones. At this point it has essentially made the entire
scheduling decision. I agree it can then tickle and let the pCPU pick it
up, and I find this a fine solution. As I said before, I guess I just
thought you wouldn't like the idea, but it's clear you do.

>> > I really don't see what you mean here. There won't be anything like that
>> > to take care of. Xen offers timers as an abstraction, and deals with
>> > them itself. You only need to take care of properly serializing
>> > rt_schedule() and the timer handling routine, for the code sections that
>> > require that.
>>
>> This is good to know, thanks. The question would then change to how does
>> Xen handle such an occurrence, but I think this is irrelevant now that I know
>> which "side" you had in mind - that is, that the replenishment handler would
>> decide when the scheduler should be invoked.
>>
> It should call runq_tickle(), yes.

Gotcha.

>> Thanks for the history and background lesson here :)
>>
> NP. Did you have a look? What do you think, do you like it? :-D
>
>> >> I simply
>> >> don't see how it can
>> >> be done without heavy interaction and information sharing between them
>> >> which really
>> >> defeats the purpose.
>> >>
>> > No, it doesn't.
>>
>> Ok this last line is the zinger.
>>
>> Almost this entire email was built on the premise that you would NOT like the
>> idea of the replenishment handler having basically the same decision logic
>> as the scheduler and then tickling the pCPU to pick up the new vCPU.
>>
> Which is true, they're separate and different things, the must have
> separate and different logic. The fact that one may decide when it's
> time to call the other is perfectly logical.
>
> And in fact, I want __runq_pick() and related logic to be in
> rt_schedule(), and nowhere else, while I want runq_tickle() to be done
> from replenishment (and wakeup), and nowhere else.

I guess the last remaining question is this: when the scheduler is
enforcing a budget and it runs out, should it check for a replenishment
before kicking the vCPU? Or assume that any relevant replenishments have
occurred? This is the issue I mentioned before about two timers armed at
the same time - I'm not sure what convention Xen uses to order them. I
would assume, from your very reason for mentioning this change, that you
don't want any replenishment in rt_schedule, but then it may kick a vCPU
that at that very instant is supposed to be replenished as well, and
should actually stay on the pCPU. This is assuming you still want
rt_schedule to be timer-triggered to enforce budget; is this correct?

However, I think this is a more minor issue that we can sort out by
inspecting the default Xen behavior, or by allowing a single
replenishment call before kicking (I don't expect you to like that
option :P), or some other method.

>
>> Actually,
>> with it done this way, I would have a hard time calling the
>> tickle-invoked method
>> the "scheduler."
>>
> With "tickle-invoked method" you mean rt_schedule(), I guess.
>
>> In this case, I'm not too sure
>> actually how much
>> different this would be from what we have now.
>>
> Very different. Mostly because functions will have a clear and well
> defined semantic, and will be called to fulfill their own purpose, from
> places where it makes sense to do so, not every now and then, like now:
>
>  - rt_schedule() for picking a new vcpu to be run and for doing runtime
>    accounting and enforcement;
>  - __runq_pick() to do the actual pick operation, from within
>    rt_schedule() only, not every now and then, like now;
>  - __repl_update() to do replenishments, from within the replenishment
>    handling routine, in an event-triggered fashion, not every now and
>    then, like now;
>  - runq_tickle() to check whether a pCPU needs to go through the
>    scheduler, from either the vcpu wakeup path, or because of a
>    replenishment, not every now and then, like now.

Okay, I understand this. My [poor] point was that how it is done now could
be better separated into logical parts with functions.

>> But the scheduler will have to decide which to move in, so that logic
>> is done twice.
>>
> Done twice? What's done twice?

I mean it has already moved the vCPU and knows it should go on pCPU X;
now when we call tickle, that function itself will have to choose which
pCPU to kick. It's doing this work twice, since we already knew which one
was new. It's not a lot of work and isn't an issue, it just feels like
the replenishment routine, in order to only tickle when necessary, is
basically acting as the scheduler. Just a weird way my mind views it.

>> Also, if these are done back-to-back and require the locks, isn't it
>> really the same
>> as having one long hot path?
>>
> What's back-to-back?

A replenishment that causes a resched - these will be back to back and
will both involve the lock. It's definitely not worse than the current
solution, just that it's likely going to be about the same amount of time
under the lock. I'm just saying the lock timing in the current solution
doesn't appear like it would be much worse, although we do have all the
maintainability and structure arguments against it, so this is a moot
point.

Overall, I see where you're coming from now. I think the only remaining
issue is whether rt_schedule should absolutely never check for
replenishments. I guess my last question is: when rt_schedule is
re-invoked by the timer because a budget is being exhausted, what do you
expect it to do? Nothing (besides moving the vCPU to the depletedq),
relying on the budget-replenish routine to handle it? Should it wait for
that routine or call it?

Besides this, I understand where you want this to go, and I think it's
worth pursuing.

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-18 18:07                 ` Dagaen Golomb
@ 2015-06-22  9:11                   ` Dario Faggioli
  2015-06-22 11:58                     ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-22  9:11 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 5071 bytes --]

On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote:

> > Anyway, I've zero interest in turning this into a fight over
> > terminology... If you want to call runq_tickle() "the scheduler", go
> > ahead, it would just make communication a bit more difficult, but I'm up
> > for the challenge! :-)
> >
> > Oh, BTW, while we're here, __runq_pick() being called from a bunch of
> > places, outside of rt_schedule(), is also something I never liked (you
> > can go check that in the archives), and I also blame the confusion
> > between scheduling and replenishmets, for the fact that such a thing is
> > necessary. I seriously hope this can be fixed too.
> 
> I have no interest in a terminology war either. I just figured that if we want
> the replenishment function to only tickle when necessary, then it should
> check that the replenished vCPU is a greater priority than current ones.
>
Exactly, we need a preemption check, which is part of the 'scheduling
logic'... but everything in the file is part of the scheduling logic (or
it will be in another one), and we don't want to put everything that is
in the file into one single function! :-)

And in fact, a preemption check is not the full scheduling logic; it's...
well... a preemption check. For example, if there are idle pCPUs, it's
something very quick.

Note that, right now, this preemption check is done by acquiring the
global lock and checking the deadlines of the currently running vCPUs on
all pCPUs. In the future, this can be modified/improved by using a
dedicated data structure, and a locking scheme that would reduce the
pressure on the global lock. I'm not saying that you should do this as
part of the work you're doing now. Rather, I'm saying that the work being
done now should go in the direction of making this easier, not harder.
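
Just to fix ideas, the check amounts to something like the sketch below,
where prv->online and running_vcpu_on() are made-up names used only for
illustration.

    /* Does the just-replenished (or just-woken) vcpu svc deserve a pCPU
     * right now? Sketch only, not actual sched_rt.c code. */
    static bool_t deserves_cpu(const struct scheduler *ops,
                               const struct rt_vcpu *svc)
    {
        struct rt_private *prv = rt_priv(ops);
        s_time_t latest = 0;
        unsigned int cpu;

        for_each_cpu ( cpu, &prv->online )
        {
            const struct rt_vcpu *cur = running_vcpu_on(prv, cpu);

            if ( cur == NULL )      /* an idle pCPU: trivially yes */
                return 1;
            if ( cur->cur_deadline > latest )
                latest = cur->cur_deadline;
        }

        /* Preempt only the latest-deadline runner, and only if svc's
         * deadline is earlier than that. */
        return svc->cur_deadline < latest;
    }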

> > And in fact, I want __runq_pick() and related logic to be in
> > rt_schedule(), and nowhere else, while I want runq_tickle() to be done
> > from replenishment (and wakeup), and nowhere else.
> 
> I guess the last remaining question is this: when the scheduler is
> enforcing a budget and times out, should it check for replenishment
> before kicking the vCPU? Or assume that any relevant replenishments
> have occurred?
>
This is an implementation detail that is quite hard to discuss without
seeing the code. In theory, if there wasn't any overhead, etc., you
shouldn't, because it should not be necessary. In practice, yes, it is
possible that various sources of overhead prevent a replenishment from
being notified in time, so that you end up with a vCPU running, and
consuming the last bits of its budget, _after_ a scheduled replenishment
instant, just as it is, I guess, possible that you find out during a
replenishment that the budget was exhausted and the vCPU had overrun a
bit (or not, it again depends on the actual implementation).

So, yes, you certainly need to take care of these corner cases, and you
can do it in the way you think is best, based on how the code ends up
looking. The important thing is that they're treated for what they are,
i.e., we should handle them, not design the code around them.

> This is the issue I mentioned before about two timers
> armed at the same time - I'm not sure what convention Xen uses to
> order them. 
>
No, I don't think this has much to do with the internals of the timers'
implementation; it's a corner case that, due to overhead, long critical
sections (or, in general, interrupts-off sections), etc., will always be
present and will need to be taken care of, no matter how the scheduler is
implemented.

> I would assume from your very reason for mentioning this
> change that you don't want any replenishment in rt_schedule, but then
> it may kick a vCPU who at that very instant is supposed to be replenished
> as well, and should actually stay on the pCPU.
>
That depends on the vCPU, on its budget and deadline, and on what's
running on the other pCPUs. And that is something unlikely, although
possible, and it should be the exception rather than the rule.

Anyway, I think it would be correct, for instance, to check during
rt_schedule() whether the vCPU running on the pCPU is already beyond
its deadline/replenishment time. If yes, log/count a deadline miss
(because that's what just happened!), replenish it and continue with
rt_schedule().
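
Something like this, early in rt_schedule() (a sketch: the miss counter
and rt_update_deadline() are placeholders):

    /* Corner case: because of overhead we may get here with scurr
     * already past its deadline. Account the miss, replenish, go on. */
    if ( now >= scurr->cur_deadline )
    {
        scurr->deadline_misses++;          /* placeholder statistic   */
        rt_update_deadline(now, scurr);    /* placeholder replenisher */
    }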

>  This is assuming you
> still want rt_schedule to be timer-triggered to enforce budget, is this
> correct?
> 
Of course.

> However, I think this is a more minor issue that we can sort out via
> inspecting the default Xen behavior, or allowing a single replenishment
> call before kicking (I don't expect you to like that option :P), or some
> other method.
> 
Exactly.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-22  9:11                   ` Dario Faggioli
@ 2015-06-22 11:58                     ` Dagaen Golomb
  0 siblings, 0 replies; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-22 11:58 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

Thank you. This clears things up a bit. I will work on this model and
make (or ignore) any corner-case decisions as I feel is adequate for now.
We can focus on them after the main structure is in place.
~Dagaen

On Mon, Jun 22, 2015 at 5:11 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote:
>
>> > Anyway, I've zero interest in turning this into a fight over
>> > terminology... If you want to call runq_tickle() "the scheduler", go
>> > ahead, it would just make communication a bit more difficult, but I'm up
>> > for the challenge! :-)
>> >
>> > Oh, BTW, while we're here, __runq_pick() being called from a bunch of
>> > places, outside of rt_schedule(), is also something I never liked (you
>> > can go check that in the archives), and I also blame the confusion
>> > between scheduling and replenishmets, for the fact that such a thing is
>> > necessary. I seriously hope this can be fixed too.
>>
>> I have no interest in a terminology war either. I just figured that if we want
>> the replenishment function to only tickle when necessary, then it should
>> check that the replenished vCPU is a greater priority than current ones.
>>
> Exactly, we need a preemption check, which is part of the 'scheduling
> logic'... but everything in the file is part of the scheduling logic (or
> it will be in another one), and we don't want to put everything that is
> in the file in one only function! :-)
>
> An in fact, a preemption check is not the full scheduling logic it's...
> well... a preemption check. For example, if there are idle pCPUs, it's
> something very quick.
>
> Note that, right now, this preemption check is done by acquiring the
> global lock and checking the deadlines of currently running vCPUs on all
> pCPUs. In future, this can be modified/improved, by using a dedicate
> data structure, and a locking scheme that would reduce the pressure on
> the global lock. I'm not saying that you should do this as part of the
> work you're doing now. Rather, I'm saying that the work being done now
> should go in the direction of making this easier, not harder.
>
>> > And in fact, I want __runq_pick() and related logic to be in
>> > rt_schedule(), and nowhere else, while I want runq_tickle() to be done
>> > from replenishment (and wakeup), and nowhere else.
>>
>> I guess the last remaining question is this: when the scheduler is
>> enforcing a budget and times out, should it check for replenishment
>> before kicking the vCPU? Or assume that any relevant replenishments
>> have occurred?
>>
> This is an implementation detail that is quite hard to discuss without
> seeing the code. In theory, if there wasn't any overhead, etc., you
> shouldn't, because it should not be necessary. In practise, yes, it is
> possible that various sources of overhead prevent a replenishment to be
> notified in time, and that you end up with a vCPU running, and consuming
> the last bits of its budget _after_ a scheduled replenishment instant as
> it is, I guess, possible that you figure during a replenishment that the
> budget was exhausted and the vCPU had overrun a bit, I guess (or not, it
> again depends on the actual implementation).
>
> So, yes, you certainly need to take care of this corner cases, and you
> can do it in the way you think it's best, basing on how the code ends up
> looking like. The important thing is that they're treated for what they
> are, i.e., we should handle them, not design the code around them.
>
>> This is the issue I mentioned before about two timers
>> armed at the same time - I'm not sure what convention Xen uses to
>> order them.
>>
> No, I don't think this has much to do with the internals of timers'
> implementation, it's a corner case that, due to overhead, long critical
> section (or, in general, interrupts-off sections), etc., will always be
> present and will need to be taken care of, no mater how the scheduler is
> implemented.
>
>> I would assume from your very reason for mentioning this
>> change that you don't want any replenishment in rt_schedule, but then
>> it may kick a vCPU who at that very instant is supposed to be replenished
>> as well, and should actually stay on the pCPU.
>>
> That depends of the vCPU, on it's budget and deadline, and on what's
> running on other pCPUs. And that is something unlikely, although
> possible, and it should be the exception rather than the rule.
>
> Anyway, I think it would be correct, for instance, to check during
> rt_schedule() whether the vCPU running on the pCPU is already beyond
> it's deadline/replenishment time. If yes, log/count a deadline miss
> (because that's what just happened!), replenish it and continue with
> rt_schedule().
>
>>  This is assuming you
>> still want rt_schedule to be timer-triggered to enforce budget, is this
>> correct?
>>
> Of course.
>
>> However, I think this is a more minor issue that we can sort out via
>> inspecting the default Xen behavior, or allowing a single replenishment
>> call before kicking (I don't expect you to like that option :P), or some
>> other method.
>>
> Exactly.
>
> Thanks and Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2015-06-22 11:58 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
2015-06-09 12:53 ` Dario Faggioli
2015-06-10  4:18   ` Dagaen Golomb
2015-06-10 22:30     ` Dario Faggioli
2015-06-13 20:33       ` Dagaen Golomb
2015-06-16 17:07         ` Dagaen Golomb
2015-06-17  0:20           ` Dario Faggioli
2015-06-17  5:24             ` Dagaen Golomb
2015-06-17  5:45               ` Meng Xu
2015-06-17  6:03                 ` Dagaen Golomb
2015-06-17  6:09                   ` Meng Xu
2015-06-17  9:20                 ` Dario Faggioli
2015-06-17  8:27               ` Dario Faggioli
2015-06-18 18:07                 ` Dagaen Golomb
2015-06-22  9:11                   ` Dario Faggioli
2015-06-22 11:58                     ` Dagaen Golomb
2015-06-10  5:54   ` Meng Xu
2015-06-10 17:46     ` Dario Faggioli
2015-06-11  5:50       ` Meng Xu
2015-06-12  9:28         ` Dario Faggioli
2015-06-13 17:21           ` Meng Xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).