xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
@ 2015-06-08 11:46 Dagaen Golomb
  2015-06-09 12:53 ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-08 11:46 UTC (permalink / raw)
  To: xen-devel
  Cc: wei.liu2, xisisu, george.dunlap, dario.faggioli, mengxu,
	jbeulich, lichong659

To do this, we create a new list that holds, for each
vcpu, the time least into the future that it may need to be
rescheduled. The scheduler chooses the lowest time off of this
list and waits until the specified time instead of running every
1 ms as it did before.

Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
---
 xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 222 insertions(+), 97 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 7c39a9e..25f0458 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -4,6 +4,7 @@
  *
  * by Sisu Xi, 2013, Washington University in Saint Louis
  * and Meng Xu, 2014, University of Pennsylvania
+ * and Dagaen Golomb, 2015, University of Pennsylvania
  *
  * based on the code of credit Scheduler
  */
@@ -134,6 +135,8 @@ struct rt_private {
     struct list_head sdom;      /* list of availalbe domains, used for dump */
     struct list_head runq;      /* ordered list of runnable vcpus */
     struct list_head depletedq; /* unordered list of depleted vcpus */
+    struct list_head timerq;    /* ascending list of next required scheduling 
+                                   decision */
     cpumask_t tickled;          /* cpus been tickled */
 };
 
@@ -143,6 +146,7 @@ struct rt_private {
 struct rt_vcpu {
     struct list_head q_elem;    /* on the runq/depletedq list */
     struct list_head sdom_elem; /* on the domain VCPU list */
+    struct list_head t_elem;    /* on the timerq */
 
     /* Up-pointers */
     struct rt_dom *sdom;
@@ -156,6 +160,7 @@ struct rt_vcpu {
     s_time_t cur_budget;        /* current budget */
     s_time_t last_start;        /* last start time */
     s_time_t cur_deadline;      /* current deadline for EDF */
+    s_time_t next_sched_needed; /* next time to make scheduling decision */
 
     unsigned flags;             /* mark __RTDS_scheduled, etc.. */
 };
@@ -197,6 +202,11 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops)
     return &rt_priv(ops)->depletedq;
 }
 
+static inline struct list_head *rt_timerq(const struct scheduler *ops)
+{
+    return &rt_priv(ops)->timerq;
+}
+
 /*
  * Queue helper functions for runq and depletedq
  */
@@ -212,6 +222,11 @@ __q_elem(struct list_head *elem)
     return list_entry(elem, struct rt_vcpu, q_elem);
 }
 
+static struct rt_vcpu * __t_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, t_elem);
+}
+
 /*
  * Debug related code, dump vcpu/cpu information
  */
@@ -231,7 +246,8 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
 
     cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
     printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
-           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
+           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime
+           " next_sched=%"PRI_stime"\n"
            " \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ",
             svc->vcpu->domain->domain_id,
             svc->vcpu->vcpu_id,
@@ -241,6 +257,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
             svc->cur_budget,
             svc->cur_deadline,
             svc->last_start,
+            svc->next_sched_needed,
             __vcpu_on_q(svc),
             vcpu_runnable(svc->vcpu),
             cpustr);
@@ -264,7 +281,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu)
 static void
 rt_dump(const struct scheduler *ops)
 {
-    struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter;
+    struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *timerq, *iter;
     struct rt_private *prv = rt_priv(ops);
     struct rt_vcpu *svc;
     struct rt_dom *sdom;
@@ -277,6 +294,7 @@ rt_dump(const struct scheduler *ops)
 
     runq = rt_runq(ops);
     depletedq = rt_depletedq(ops);
+    timerq = rt_timerq(ops);
 
     printk("Global RunQueue info:\n");
     list_for_each( iter, runq )
@@ -292,6 +310,14 @@ rt_dump(const struct scheduler *ops)
         rt_dump_vcpu(ops, svc);
     }
 
+    printk("Global TimerQueue info:\n");
+    list_for_each( iter, timerq )
+    {
+        svc = __t_elem(iter);
+        printk("\tvcpu %d next_sched=%"PRI_stime"\n", svc->vcpu->vcpu_id, 
+			   svc->next_sched_needed);
+    }
+
     printk("Domain info:\n");
     list_for_each( iter_sdom, &prv->sdom )
     {
@@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
         list_del_init(&svc->q_elem);
 }
 
+static inline void __t_remove(struct rt_vcpu *svc)
+{
+	if( !list_empty(&svc->t_elem) )
+		list_del_init(&svc->t_elem);
+}
+
 /*
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
@@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 }
 
 /*
+ * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
+ */
+static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *timerq = rt_timerq(ops);
+    struct list_head *iter = timerq;
+
+    ASSERT( spin_is_locked(&prv->lock) );
+
+    ASSERT( list_empty(&svc->t_elem) );
+
+	list_for_each(iter, timerq)
+	{
+		struct rt_vcpu * iter_svc = __t_elem(iter);
+		if ( svc->next_sched_needed <= iter_svc->next_sched_needed )
+				break;
+	}
+	list_add_tail(&svc->t_elem, iter);
+}
+
+/*
+ * Return how far the lowest time on the timerq (that is after NOW) is in the
+ * future.
+ * Lock must be grabbed before calling this.
+ */
+static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
+{
+    struct list_head *timerq = rt_timerq(ops);
+    struct list_head *iter, *tmp;
+
+    list_for_each_safe(iter, tmp, timerq)
+    {
+        struct rt_vcpu * iter_svc = __t_elem(iter);
+
+        if ( iter_svc->next_sched_needed > now )
+            return (iter_svc->next_sched_needed - now);
+        else
+            __t_remove(iter_svc);
+    }
+
+    return MAX_SCHEDULE;
+}
+
+/*
+ * Updates the next_sched_needed field for a vcpu, which is used for
+ * determining the next needed schedule time for this vcpu. Then updates
+ * timerq via reinsert.
+ */
+static void update_sched_time(const struct scheduler *ops, s_time_t now, 
+                              struct rt_vcpu *svc)
+{
+    /* update next needed schedule time */
+    if( test_bit(__RTDS_scheduled, &svc->flags) )
+        svc->next_sched_needed = now + svc->cur_budget;
+    else
+        svc->next_sched_needed = svc->cur_deadline;
+
+    /* Remove and reinsert to maintain sorted order in timerq */
+    __t_remove(svc);
+    __timerq_insert(ops, svc);
+
+    return;
+}
+
+/*
  * Init/Free related code
  */
 static int
@@ -413,6 +511,7 @@ rt_init(struct scheduler *ops)
     INIT_LIST_HEAD(&prv->sdom);
     INIT_LIST_HEAD(&prv->runq);
     INIT_LIST_HEAD(&prv->depletedq);
+    INIT_LIST_HEAD(&prv->timerq);
 
     cpumask_clear(&prv->tickled);
 
@@ -536,10 +635,12 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
 
     INIT_LIST_HEAD(&svc->q_elem);
     INIT_LIST_HEAD(&svc->sdom_elem);
+    INIT_LIST_HEAD(&svc->t_elem);
     svc->flags = 0U;
     svc->sdom = dd;
     svc->vcpu = vc;
     svc->last_start = 0;
+    svc->next_sched_needed = 0;
 
     svc->period = RTDS_DEFAULT_PERIOD;
     if ( !is_idle_vcpu(vc) )
@@ -579,7 +680,10 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
         rt_update_deadline(now, svc);
 
     if ( !__vcpu_on_q(svc) && vcpu_runnable(vc) && !vc->is_running )
+    {
         __runq_insert(ops, svc);
+        update_sched_time(ops, now, svc);
+    }
 
     /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
     list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
@@ -601,8 +705,8 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
     BUG_ON( sdom == NULL );
 
     lock = vcpu_schedule_lock_irq(vc);
-    if ( __vcpu_on_q(svc) )
-        __q_remove(svc);
+    __q_remove(svc);
+    __t_remove(svc);
     vcpu_schedule_unlock_irq(lock, vc);
 
     if ( !is_idle_vcpu(vc) )
@@ -763,6 +867,8 @@ __repl_update(const struct scheduler *ops, s_time_t now)
         /* reinsert the vcpu if its deadline is updated */
         __q_remove(svc);
         __runq_insert(ops, svc);
+        /* Update next needed sched time if deadline is updated */
+        update_sched_time(ops, now, svc);
     }
 
     list_for_each_safe(iter, tmp, depletedq)
@@ -773,11 +879,102 @@ __repl_update(const struct scheduler *ops, s_time_t now)
             rt_update_deadline(now, svc);
             __q_remove(svc); /* remove from depleted queue */
             __runq_insert(ops, svc); /* add to runq */
+            /* Update next needed sched time since deadline is updated */
+            update_sched_time(ops, now, svc);
         }
     }
 }
 
 /*
+ * Pick a cpu where to run a vcpu,
+ * possibly kicking out the vcpu running there
+ * Called by wake() and context_saved()
+ * We have a running candidate here, the kick logic is:
+ * Among all the cpus that are within the cpu affinity
+ * 1) if the new->cpu is idle, kick it. This could benefit cache hit
+ * 2) if there are any idle vcpu, kick it.
+ * 3) now all pcpus are busy;
+ *    among all the running vcpus, pick lowest priority one
+ *    if snext has higher priority, kick it.
+ *
+ * TODO:
+ * 1) what if these two vcpus belongs to the same domain?
+ *    replace a vcpu belonging to the same domain introduces more overhead
+ *
+ * lock is grabbed before calling this function
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
+    struct rt_vcpu *iter_svc;
+    struct vcpu *iter_vc;
+    int cpu = 0, cpu_to_tickle = 0;
+    cpumask_t not_tickled;
+    cpumask_t *online;
+
+    if ( new == NULL || is_idle_vcpu(new->vcpu) )
+        return;
+
+    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
+    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
+    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
+
+    /* 1) if new's previous cpu is idle, kick it for cache benefit */
+    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
+    {
+        cpu_to_tickle = new->vcpu->processor;
+        goto out;
+    }
+
+    /* 2) if there are any idle pcpu, kick it */
+    /* The same loop also find the one with lowest priority */
+    for_each_cpu(cpu, &not_tickled)
+    {
+        iter_vc = curr_on_cpu(cpu);
+        if ( is_idle_vcpu(iter_vc) )
+        {
+            cpu_to_tickle = cpu;
+            goto out;
+        }
+        iter_svc = rt_vcpu(iter_vc);
+        if ( latest_deadline_vcpu == NULL ||
+             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
+            latest_deadline_vcpu = iter_svc;
+    }
+
+    /* 3) candicate has higher priority, kick out lowest priority vcpu */
+    if ( latest_deadline_vcpu != NULL &&
+         new->cur_deadline < latest_deadline_vcpu->cur_deadline )
+    {
+        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
+        goto out;
+    }
+
+    /* didn't tickle any cpu */
+    SCHED_STAT_CRANK(tickle_idlers_none);
+    return;
+out:
+    /* TRACE */
+    {
+        struct {
+            unsigned cpu:16, pad:16;
+        } d;
+        d.cpu = cpu_to_tickle;
+        d.pad = 0;
+        trace_var(TRC_RTDS_TICKLE, 0,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
+    SCHED_STAT_CRANK(tickle_idlers_some);
+    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
+    return;
+}
+
+/*
  * schedule function for rt scheduler.
  * The lock is already grabbed in schedule.c, no need to lock here
  */
@@ -790,11 +987,9 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     struct rt_vcpu *snext = NULL;
     struct task_slice ret = { .migrated = 0 };
 
-    /* clear ticked bit now that we've been scheduled */
-    cpumask_clear_cpu(cpu, &prv->tickled);
-
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
+    update_sched_time(ops, now, scurr);
 
     __repl_update(ops, now);
 
@@ -828,6 +1023,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         if ( snext != scurr )
         {
             __q_remove(snext);
+            __t_remove(snext);
             set_bit(__RTDS_scheduled, &snext->flags);
         }
         if ( snext->vcpu->processor != cpu )
@@ -837,7 +1033,15 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         }
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    /* Check if we were tickled and clear bit if so. If not, tickle now since */
+    /* timer must have fired */
+    if ( cpumask_test_cpu(cpu, &prv->tickled) )
+        cpumask_clear_cpu(cpu, &prv->tickled);
+    else
+	    runq_tickle(ops, snext);
+
+    /* Use minimum from timerq */
+    ret.time = __timerq_next(ops, now);
     ret.task = snext->vcpu;
 
     /* TRACE */
@@ -879,95 +1083,8 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
         __q_remove(svc);
     else if ( test_bit(__RTDS_delayed_runq_add, &svc->flags) )
         clear_bit(__RTDS_delayed_runq_add, &svc->flags);
-}
-
-/*
- * Pick a cpu where to run a vcpu,
- * possibly kicking out the vcpu running there
- * Called by wake() and context_saved()
- * We have a running candidate here, the kick logic is:
- * Among all the cpus that are within the cpu affinity
- * 1) if the new->cpu is idle, kick it. This could benefit cache hit
- * 2) if there are any idle vcpu, kick it.
- * 3) now all pcpus are busy;
- *    among all the running vcpus, pick lowest priority one
- *    if snext has higher priority, kick it.
- *
- * TODO:
- * 1) what if these two vcpus belongs to the same domain?
- *    replace a vcpu belonging to the same domain introduces more overhead
- *
- * lock is grabbed before calling this function
- */
-static void
-runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
-{
-    struct rt_private *prv = rt_priv(ops);
-    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
-    struct rt_vcpu *iter_svc;
-    struct vcpu *iter_vc;
-    int cpu = 0, cpu_to_tickle = 0;
-    cpumask_t not_tickled;
-    cpumask_t *online;
-
-    if ( new == NULL || is_idle_vcpu(new->vcpu) )
-        return;
-
-    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
-    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
-    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
 
-    /* 1) if new's previous cpu is idle, kick it for cache benefit */
-    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
-    {
-        cpu_to_tickle = new->vcpu->processor;
-        goto out;
-    }
-
-    /* 2) if there are any idle pcpu, kick it */
-    /* The same loop also find the one with lowest priority */
-    for_each_cpu(cpu, &not_tickled)
-    {
-        iter_vc = curr_on_cpu(cpu);
-        if ( is_idle_vcpu(iter_vc) )
-        {
-            cpu_to_tickle = cpu;
-            goto out;
-        }
-        iter_svc = rt_vcpu(iter_vc);
-        if ( latest_deadline_vcpu == NULL ||
-             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
-            latest_deadline_vcpu = iter_svc;
-    }
-
-    /* 3) candicate has higher priority, kick out lowest priority vcpu */
-    if ( latest_deadline_vcpu != NULL &&
-         new->cur_deadline < latest_deadline_vcpu->cur_deadline )
-    {
-        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
-        goto out;
-    }
-
-    /* didn't tickle any cpu */
-    SCHED_STAT_CRANK(tickle_idlers_none);
-    return;
-out:
-    /* TRACE */
-    {
-        struct {
-            unsigned cpu:16, pad:16;
-        } d;
-        d.cpu = cpu_to_tickle;
-        d.pad = 0;
-        trace_var(TRC_RTDS_TICKLE, 0,
-                  sizeof(d),
-                  (unsigned char *)&d);
-    }
-
-    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
-    SCHED_STAT_CRANK(tickle_idlers_some);
-    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
-    return;
+    __t_remove(svc);
 }
 
 /*
@@ -988,6 +1105,8 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
 
     BUG_ON( is_idle_vcpu(vc) );
 
+    update_sched_time(ops, now, svc);
+
     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
     {
         SCHED_STAT_CRANK(vcpu_wake_running);
@@ -1024,6 +1143,9 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
 
     __repl_update(ops, now);
 
+    /* Update timerq */
+    update_sched_time(ops, now, svc);
+
     ASSERT(!list_empty(&prv->sdom));
     sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
     online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
@@ -1056,8 +1178,11 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
     if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
          likely(vcpu_runnable(vc)) )
     {
+        s_time_t now = NOW();
+
         __runq_insert(ops, svc);
-        __repl_update(ops, NOW());
+        __repl_update(ops, now);
+        update_sched_time(ops, now, svc);
 
         ASSERT(!list_empty(&prv->sdom));
         sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
@ 2015-06-09 12:53 ` Dario Faggioli
  2015-06-10  4:18   ` Dagaen Golomb
  2015-06-10  5:54   ` Meng Xu
  0 siblings, 2 replies; 21+ messages in thread
From: Dario Faggioli @ 2015-06-09 12:53 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: wei.liu2, xisisu, george.dunlap, xen-devel, mengxu, jbeulich, lichong659


[-- Attachment #1.1: Type: text/plain, Size: 11024 bytes --]

Hello Dagaen,

Thanks for doing this. The first question I have is whether you run any
test/benchmark that you can show the result of here.

For instance, a few weeks ago, Meng (while trying to do a completely
different thing) sent here on the list some numbers about the frequency
of the scheduler being called, and the overhead caused by that... Would
it be hard to run the same experiments and collect the numbers, with and
without your patch?

On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> To do this, we create a new list that holds, for each
> vcpu, the time least into the future that it may need to be
> rescheduled.
>
Ok. Actually, what I really expected to see in this patch was, either:
 a) a new, scheduler wide, list of replenishment times _and_ a timer,
    always (re)programmed to fire at the most imminent one; or
 b) one timer for each vcpu, always (re)programmed to fire at the time
    instant when the replenishment for that vcpu should happen.

And note that, when I say "timer", I mean an actual Xen timer, i.e.,
those things that are started, stopped, and with a timer handling
routine being called when they expire. For an example, you can have a
look, in sched_credit.c, at 'ticker' or 'master_ticker'.

Deciding between a) or b) isn't easy, without actually trying to
implement them. I personally prefer b), and I think it would be a
superior long term solution (see right below), but given te current
architecture of the RTDS scheduler (e.g., the fact that it has a global
runq, etc), I won't nack a), which would most likely be simpler.

Performance and overhead wise, again, hard to tell in advance. b) would
introduce more overhead (as there are more timers), but will likely
reveal more scalable (which is not something of great importance for
this scheduler) and may allow for better latency (which _is_ something
of great importance for this scheduler :-) ), since there won't be the
need to lock the whole scheduler during the replenishments (which is,
OTOH, necessary with a)).

And that's it for the replenishments. For enforcing the budget, well, we
have the ret.time value of the task_slice struct returned by
rt_schedule, so that's event driven already, and we don't need to do
much about it.

Does all this make sense? Am I making myself clear enough?
If not, feel free to ask.

>  The scheduler chooses the lowest time off of this
> list and waits until the specified time instead of running every
> 1 ms as it did before.
> 
Yeah, well, I see what you mean and how you this is actually succeeding
(at least AFAICT, by looking at the code, again, it would be nice to
have some numbers) in improving the scheduler behavior.

However, this transition toward event driven-ness has two goals:
 * improve the scheduler behavior [check, at least to some extent]
 * improve the code (readability, maintainability, etc.)
   [not check at all :-(]

As an example, the __repl_update() function: it's called 2 times inside
rt_schedule() and a third from rt_context_saved(), which is basically
like saying it's called 3 times from rt_schedule(), and this always made
very few sense to me. In fact, I think that scheduling decisions and
replenishment events shouldn't be so tightly coupled. There are
interdependencies, of course (a replenishment could call for a
scheduling decision to be taken), but not like this. That's why I say
I'd like to see a timer handling routine dealing with replenishment, and
let rt_schedule() do it's job, which is:
 - enforcing the budget of the running vcpu. If it expires,
   _(re)program_ the replenishment timer
 - choose the best vcpu to run next, if necessary

With this patch, you're still using rt_schedule() to do both scheduling
and replenishments, although you're now doing it at actual replenishment
times, instead of checking for them every millisecond.

Also, the bits and pieces that you need to add in order to deal with
this new queue is, really, making things even more complex than they are
now, which is undesirable.

So, all this being said, let me know what you think about it (and that
applies to everyone else as well, of course, in particular Meng).

I won't comment on the code in too much details, as it will require some
quite radical restructuring, but I'll skim through it and provide some
hints anyway.

> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> ---
>  xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 222 insertions(+), 97 deletions(-)
> 
First of all, it's a big patch. It's only changing one file and one
logical component, and for that reason, TBH, it's not that hard to
review. Still I think you can break it in at least two, and perhaps even
more, if you try to implement what I described above.

> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 7c39a9e..25f0458 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c

> @@ -156,6 +160,7 @@ struct rt_vcpu {
>      s_time_t cur_budget;        /* current budget */
>      s_time_t last_start;        /* last start time */
>      s_time_t cur_deadline;      /* current deadline for EDF */
> +    s_time_t next_sched_needed; /* next time to make scheduling decision */
>
As an example of why I said that things should become simpler, and are
instead becoming more complex: with my solution, you don't need anything
like this. In fact, budget enforcement is already being done ok in
rt_schedule(), so no need to do anything more/different about it.
Replenishments should be programmed to fire at cur_deadline, so again,
no need to add this new field, and multiplex its actual value between
budget and deadline (which, yes, counts as being something rather
complex for me! :-D).

> @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
>          list_del_init(&svc->q_elem);
>  }
>  
> +static inline void __t_remove(struct rt_vcpu *svc)
> +{
> +	if( !list_empty(&svc->t_elem) )
> +		list_del_init(&svc->t_elem);
> +}
> +
>
You're using hard tabs for indenting here. As you see everywhere esle,
Xen wants 4 spaces for this.

>  /*
>   * Insert svc with budget in RunQ according to EDF:
>   * vcpus with smaller deadlines go first.
> @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  }
>  
>  /*
> + * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
> + */
> +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *timerq = rt_timerq(ops);
> +    struct list_head *iter = timerq;
> +
> +    ASSERT( spin_is_locked(&prv->lock) );
> +
> +    ASSERT( list_empty(&svc->t_elem) );
> +
The blank line between the asserts, I'd kill it.

> +	list_for_each(iter, timerq)
> +	{
>
You're still using tabs, and mixing them with spaces, making things look
even more cumbersome.

> +/*
> + * Return how far the lowest time on the timerq (that is after NOW) is in the
> + * future.
> + * Lock must be grabbed before calling this.
> + */
> +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> +{
> +    struct list_head *timerq = rt_timerq(ops);
> +    struct list_head *iter, *tmp;
> +
> +    list_for_each_safe(iter, tmp, timerq)
> +    {
> +        struct rt_vcpu * iter_svc = __t_elem(iter);
> +
> +        if ( iter_svc->next_sched_needed > now )
> +            return (iter_svc->next_sched_needed - now);
> +        else
> +            __t_remove(iter_svc);
> +    }
> +
> +    return MAX_SCHEDULE;
> +}
>
If using a timer, you can just get rid of items while, in the timer
routine, you handle the event associated to them.

Also, why is MAX_SCHEDULE still there?

> +/*
> + * Updates the next_sched_needed field for a vcpu, which is used for
> + * determining the next needed schedule time for this vcpu. Then updates
> + * timerq via reinsert.
> + */
> +static void update_sched_time(const struct scheduler *ops, s_time_t now, 
> +                              struct rt_vcpu *svc)
> +{
> +    /* update next needed schedule time */
> +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> +        svc->next_sched_needed = now + svc->cur_budget;
> +    else
> +        svc->next_sched_needed = svc->cur_deadline;
> +
> +    /* Remove and reinsert to maintain sorted order in timerq */
> +    __t_remove(svc);
> +    __timerq_insert(ops, svc);
> +
> +    return;
> +}
>
And here's the multiplexing, which --even worse-- (may) require
rearranging the queue! We really don't need anything like this.

>  /*
> + * Pick a cpu where to run a vcpu,
> + * possibly kicking out the vcpu running there
> + * Called by wake() and context_saved()
> + * We have a running candidate here, the kick logic is:
> + * Among all the cpus that are within the cpu affinity
> + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> + * 2) if there are any idle vcpu, kick it.
> + * 3) now all pcpus are busy;
> + *    among all the running vcpus, pick lowest priority one
> + *    if snext has higher priority, kick it.
> + *
> + * TODO:
> + * 1) what if these two vcpus belongs to the same domain?
> + *    replace a vcpu belonging to the same domain introduces more overhead
> + *
> + * lock is grabbed before calling this function
> + */
> +static void
> +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> +{
> +    struct rt_private *prv = rt_priv(ops);
> +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> +    struct rt_vcpu *iter_svc;
> +    struct vcpu *iter_vc;
> +    int cpu = 0, cpu_to_tickle = 0;
> +    cpumask_t not_tickled;
> +    cpumask_t *online;
> +
>
[snip]

And here you are moving a function up. In general, it is better to have
separate patches that do the movings, with the fact that the are all
about moving and that the code is not being changed, clearly stated in
the commit message. This is because it is really really hard to figure
out, e.g. from here, whether you also changed something in the function
while moving it, making reviewing a lot harder and more prone to error.

That being said, in this specific case, you're moving up runq_tickle(),
the purpose of which is to trigger a call to rt_schedule() (after going
through the softirq machinery)... in order to call it in rt_schedule().
That's pretty tricky, and is not how tickling should work.

Again, with an approach that really take advantage of timers, this won't
be necessary.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-09 12:53 ` Dario Faggioli
@ 2015-06-10  4:18   ` Dagaen Golomb
  2015-06-10 22:30     ` Dario Faggioli
  2015-06-10  5:54   ` Meng Xu
  1 sibling, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-10  4:18 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 15723 bytes --]

Thanks for the feedback.


> Thanks for doing this. The first question I have is whether you run any
> test/benchmark that you can show the result of here.
>
> For instance, a few weeks ago, Meng (while trying to do a completely
> different thing) sent here on the list some numbers about the frequency
> of the scheduler being called, and the overhead caused by that... Would
> it be hard to run the same experiments and collect the numbers, with and
> without your patch?
>

I could run some experiments to measure this. An easier quick test may be
to count invocations and run the same workload on each, making sure each
run a long time to remove biases.


> On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled.
> >
> Ok. Actually, what I really expected to see in this patch was, either:
>  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>     always (re)programmed to fire at the most imminent one; or
>  b) one timer for each vcpu, always (re)programmed to fire at the time
>     instant when the replenishment for that vcpu should happen.
>

a) is essentially what we have going on here. The list is exactly that, a
list of possible scheduling change times (including replenishments as they
are coalesced with period), one per vcpu, and the timer is triggered using
the earliest one.


> And note that, when I say "timer", I mean an actual Xen timer, i.e.,
> those things that are started, stopped, and with a timer handling
> routine being called when they expire. For an example, you can have a
> look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>

I will look at this. However, the current solution is likely functionally
equivalent and, with only one timer and a single list used to arm it, I'm
not sure if using a Xen timer is necessary. Do they incur any extra
overhead or coarsen granularity?


>
> Deciding between a) or b) isn't easy, without actually trying to
> implement them. I personally prefer b), and I think it would be a
> superior long term solution (see right below), but given te current
> architecture of the RTDS scheduler (e.g., the fact that it has a global
> runq, etc), I won't nack a), which would most likely be simpler.
>

I agree b) would may be better in the long run, but with the current
architecture a) is simpler. b) can be future work as the scheduler evolves.


> Performance and overhead wise, again, hard to tell in advance. b) would
> introduce more overhead (as there are more timers), but will likely
> reveal more scalable (which is not something of great importance for
> this scheduler) and may allow for better latency (which _is_ something
> of great importance for this scheduler :-) ), since there won't be the
> need to lock the whole scheduler during the replenishments (which is,
> OTOH, necessary with a)).
>
> And that's it for the replenishments. For enforcing the budget, well, we
> have the ret.time value of the task_slice struct returned by
> rt_schedule, so that's event driven already, and we don't need to do
> much about it.
>

The timerq will also hold the end of period if that is the next relevant
event, and if at head of the timerq this value will be used to arm the
timer to let the task run until budget exhaustion.


> Does all this make sense? Am I making myself clear enough?
> If not, feel free to ask.
>
> >  The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> >
> Yeah, well, I see what you mean and how you this is actually succeeding
> (at least AFAICT, by looking at the code, again, it would be nice to
> have some numbers) in improving the scheduler behavior.
>
> However, this transition toward event driven-ness has two goals:
>  * improve the scheduler behavior [check, at least to some extent]
>  * improve the code (readability, maintainability, etc.)
>    [not check at all :-(]
>
> As an example, the __repl_update() function: it's called 2 times inside
> rt_schedule() and a third from rt_context_saved(), which is basically
> like saying it's called 3 times from rt_schedule(), and this always made
> very few sense to me. In fact, I think that scheduling decisions and
> replenishment events shouldn't be so tightly coupled. There are
> interdependencies, of course (a replenishment could call for a
> scheduling decision to be taken), but not like this. That's why I say
> I'd like to see a timer handling routine dealing with replenishment, and
> let rt_schedule() do it's job, which is:
>  - enforcing the budget of the running vcpu. If it expires,
>    _(re)program_ the replenishment timer
>  - choose the best vcpu to run next, if necessary
>
> With this patch, you're still using rt_schedule() to do both scheduling
> and replenishments, although you're now doing it at actual replenishment
> times, instead of checking for them every millisecond.
>
>
I think once you understand that the timerq is not only replenishments, but
any event that could cause a branch is the scheduler behavior, it becomes
more palatable to have them intertwined. Really, the timerq and scheduling
aren't as intertwined as they appear, they are piratically isolated
functionally. Its just that the timerq is most naturally serviced during
scheduling events when data that effects scheduling decisions is changing.
Its straightforward and efficient. Let me know your thoughts on this.


> Also, the bits and pieces that you need to add in order to deal with
> this new queue is, really, making things even more complex than they are
> now, which is undesirable.
>

While it does add some complexity, I don't feel a single queue and managing
it is that much extra complexity.


> So, all this being said, let me know what you think about it (and that
> applies to everyone else as well, of course, in particular Meng).
>
> I won't comment on the code in too much details, as it will require some
> quite radical restructuring, but I'll skim through it and provide some
> hints anyway.
>
> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---
> >  xen/common/sched_rt.c |  319
> ++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 222 insertions(+), 97 deletions(-)
> >
> First of all, it's a big patch. It's only changing one file and one
> logical component, and for that reason, TBH, it's not that hard to
> review. Still I think you can break it in at least two, and perhaps even
> more, if you try to implement what I described above.
>

Noted.
Note that its not as large as it seems, a large chunk of the deletions and
an equal number of insertions include moving a function due to visibility
to other functions.


> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 7c39a9e..25f0458 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
>
> > @@ -156,6 +160,7 @@ struct rt_vcpu {
> >      s_time_t cur_budget;        /* current budget */
> >      s_time_t last_start;        /* last start time */
> >      s_time_t cur_deadline;      /* current deadline for EDF */
> > +    s_time_t next_sched_needed; /* next time to make scheduling
> decision */
> >
> As an example of why I said that things should become simpler, and are
> instead becoming more complex: with my solution, you don't need anything
> like this. In fact, budget enforcement is already being done ok in
> rt_schedule(), so no need to do anything more/different about it.
> Replenishments should be programmed to fire at cur_deadline, so again,
> no need to add this new field, and multiplex its actual value between
> budget and deadline (which, yes, counts as being something rather
> complex for me! :-D).
>

As mentioned, the timerq is handling all events that could change the
scheduling decision, not just replenishments. This tracks the earliest time
anything cause this to be scheduled differently, and could be extended if
any more insights are found. Budget enforcement is being done in
rt_schedule but its being done by this very list: a budget expiration is
one possible event that would require a vcpu reschedule.


>
> > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
> >          list_del_init(&svc->q_elem);
> >  }
> >
> > +static inline void __t_remove(struct rt_vcpu *svc)
> > +{
> > +     if( !list_empty(&svc->t_elem) )
> > +             list_del_init(&svc->t_elem);
> > +}
> > +
> >
> You're using hard tabs for indenting here. As you see everywhere esle,
> Xen wants 4 spaces for this.
>

Sorry, I thought I had gotten them all changed to spaces. My editor wasn't
configured correctly at first so some are lurking around. I will
exterminate the remaining ones.


> >  /*
> >   * Insert svc with budget in RunQ according to EDF:
> >   * vcpus with smaller deadlines go first.
> > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct
> rt_vcpu *svc)
> >  }
> >
> >  /*
> > + * Insert svc into the timerq, maintaining ascending order by
> next_sched_needed.
> > + */
> > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu
> *svc)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter = timerq;
> > +
> > +    ASSERT( spin_is_locked(&prv->lock) );
> > +
> > +    ASSERT( list_empty(&svc->t_elem) );
> > +
> The blank line between the asserts, I'd kill it.
>

Can do.


> > +     list_for_each(iter, timerq)
> > +     {
> >
> You're still using tabs, and mixing them with spaces, making things look
> even more cumbersome.
>

Again, sorry. Will exterminate.


>
> > +/*
> > + * Return how far the lowest time on the timerq (that is after NOW) is
> in the
> > + * future.
> > + * Lock must be grabbed before calling this.
> > + */
> > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> > +{
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter, *tmp;
> > +
> > +    list_for_each_safe(iter, tmp, timerq)
> > +    {
> > +        struct rt_vcpu * iter_svc = __t_elem(iter);
> > +
> > +        if ( iter_svc->next_sched_needed > now )
> > +            return (iter_svc->next_sched_needed - now);
> > +        else
> > +            __t_remove(iter_svc);
> > +    }
> > +
> > +    return MAX_SCHEDULE;
> > +}
> >
> If using a timer, you can just get rid of items while, in the timer
> routine, you handle the event associated to them.
>
> Also, why is MAX_SCHEDULE still there?
>

Honestly, events do not have to be removed here, but its done to prevent
walking a list of stale values to get to the newest one on the next call.
This is done more for performance. Their non-removal would be functionally
equivalent.
With the timer alternative you mention, where would the timer events and
their data be held? I think that could be extra complexity, unless I fail
to understand the idea completely.

The current approach is actually somewhat simple if you think about it,
requiring just one piece of information per vcpu (the next_sched_needed
field) which is updated as information is available. It also uses a
guaranteed O(1) increase in memory with how the queue and next_sched_time
is set up, and most computational parts are rolled into existing routines
that need to be done anyways.

MAX_SCHEDULE may not be required, but I have it there as an "in case." For
example, it will cause the scheduler to be called after MAX_SCHEDULE even
when no events are occurring (such as no vcpus). Its there to ensure
progress in case of any bugs or unexpected behavior.


> > +/*
> > + * Updates the next_sched_needed field for a vcpu, which is used for
> > + * determining the next needed schedule time for this vcpu. Then updates
> > + * timerq via reinsert.
> > + */
> > +static void update_sched_time(const struct scheduler *ops, s_time_t now,
> > +                              struct rt_vcpu *svc)
> > +{
> > +    /* update next needed schedule time */
> > +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> > +        svc->next_sched_needed = now + svc->cur_budget;
> > +    else
> > +        svc->next_sched_needed = svc->cur_deadline;
> > +
> > +    /* Remove and reinsert to maintain sorted order in timerq */
> > +    __t_remove(svc);
> > +    __timerq_insert(ops, svc);
> > +
> > +    return;
> > +}
> >
> And here's the multiplexing, which --even worse-- (may) require
> rearranging the queue! We really don't need anything like this.
>

I see what you mean here, but this doesn't seem bad to me. Right now its
only these two events and a simple if-else, but what is great is that this
method can be updated to include any number of tricks or insights that
could help determine when something may need be scheduled or not. Right now
it seems complex for what appears to be just two cases, but many more cases
or an entirely different accounting method could be placed here and it
would be fine. It makes for a nice "ground zero" for modifying the timer
update behavior.


> >  /*
> > + * Pick a cpu where to run a vcpu,
> > + * possibly kicking out the vcpu running there
> > + * Called by wake() and context_saved()
> > + * We have a running candidate here, the kick logic is:
> > + * Among all the cpus that are within the cpu affinity
> > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> > + * 2) if there are any idle vcpu, kick it.
> > + * 3) now all pcpus are busy;
> > + *    among all the running vcpus, pick lowest priority one
> > + *    if snext has higher priority, kick it.
> > + *
> > + * TODO:
> > + * 1) what if these two vcpus belongs to the same domain?
> > + *    replace a vcpu belonging to the same domain introduces more
> overhead
> > + *
> > + * lock is grabbed before calling this function
> > + */
> > +static void
> > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> > +    struct rt_vcpu *iter_svc;
> > +    struct vcpu *iter_vc;
> > +    int cpu = 0, cpu_to_tickle = 0;
> > +    cpumask_t not_tickled;
> > +    cpumask_t *online;
> > +
> >
> [snip]
>
> And here you are moving a function up. In general, it is better to have
> separate patches that do the movings, with the fact that the are all
> about moving and that the code is not being changed, clearly stated in
> the commit message. This is because it is really really hard to figure
> out, e.g. from here, whether you also changed something in the function
> while moving it, making reviewing a lot harder and more prone to error.
>

Ah, thanks for mentioning. The move was required but I didn't know a
separate patch was customary. I can make it so.


> That being said, in this specific case, you're moving up runq_tickle(),
> the purpose of which is to trigger a call to rt_schedule() (after going
> through the softirq machinery)... in order to call it in rt_schedule().
> That's pretty tricky, and is not how tickling should work.
>

It set up to only tickle if needed. I'm not sure if this is an issue,


> Again, with an approach that really take advantage of timers, this won't
> be necessary.
>

I think there could be some discussion on the approaches. Hopefully others
comment as well. As mentioned this is actually a logically quite simple
change, with good efficiency, and the groud zero makes changes using lots
of information easy although right now it was kept simple to get the change
out. Future improvements to the update function was somewhat expected as
future work.

Thanks for the comments.
~Dagaen

[-- Attachment #1.2: Type: text/html, Size: 20508 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-09 12:53 ` Dario Faggioli
  2015-06-10  4:18   ` Dagaen Golomb
@ 2015-06-10  5:54   ` Meng Xu
  2015-06-10 17:46     ` Dario Faggioli
  1 sibling, 1 reply; 21+ messages in thread
From: Meng Xu @ 2015-06-10  5:54 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario and Dagaen,

2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>
> Hello Dagaen,
>
> Thanks for doing this. The first question I have is whether you run any
> test/benchmark that you can show the result of here.

Thank you very much Dario for your quick review!

>
> For instance, a few weeks ago, Meng (while trying to do a completely
> different thing) sent here on the list some numbers about the frequency
> of the scheduler being called, and the overhead caused by that... Would
> it be hard to run the same experiments and collect the numbers, with and
> without your patch?

I think there should be two ways to evaluate the performance:
1) Using xentrace to trace the frequency of the scheduler being called
and the overhead, when we run workload inside guest domains;
2) Using cyclictest as Dario mentioned before to test the real-time
performance at end user. Dagaen, I can provide you the commands to run
it, which is actually quite simple to run.

>
> On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled.
> >
> Ok. Actually, what I really expected to see in this patch was, either:
>  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>     always (re)programmed to fire at the most imminent one; or
>  b) one timer for each vcpu, always (re)programmed to fire at the time
>     instant when the replenishment for that vcpu should happen.

Ah...

Here what we are trying to do is:
Reuse the existing timer that is used by rt_schedule(). We use  that
timer as the timer to invoke the schedule function (rt_schedule). So
that timer has two purposes:
(a) budget enforcement as it was;
(b) budget replenish (and deadline enforcement since deadline = period
for each VCPU).

When the timer fires, the rt_schedule is called: enforcement the
budget, replenish the budget of VCPU(s), and pick the (next) VCPU.

In other words, we can think about the scheduler in another way:
The scheduling decision happens at the following two time points:
(a) budget enforcement as it was;
(b) budget replenish and deadline enforcement;

Whenever any of the above two situations occurs, the scheduler may
need to pick the next VCPU to run or preempt a current running VCPU.
Therefore, the logic for scheduling decision is unavoidable when
either of these two situation occurs, IMO.

In terms of performance/overhead, I think the option b) you pointed
out has the benefit of low overhead in updating the budget because we
don't need to hold the lock. However, the budget replenish occurs when
deadline of the VCPU is changed (which means the next period of the
VCPU arrives). Then it means the scheduler (may) need to make a new
scheduling decision, in which situation, the scheduler will hold the
lock for the runq. So I'm curious about the source of overhead the
option b) could save over the current implementation/design Dagaen is
doing.

Of course, it is hard to tell the performance difference until we
really run it. IMHO, we should at least have some mental excise and
have good reasons to believe the performance could improve under some
situations.

>
> And note that, when I say "timer", I mean an actual Xen timer, i.e.,
> those things that are started, stopped, and with a timer handling
> routine being called when they expire. For an example, you can have a
> look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>
> Deciding between a) or b) isn't easy, without actually trying to
> implement them. I personally prefer b), and I think it would be a
> superior long term solution (see right below), but given te current
> architecture of the RTDS scheduler (e.g., the fact that it has a global
> runq, etc), I won't nack a), which would most likely be simpler.
>
> Performance and overhead wise, again, hard to tell in advance. b) would
> introduce more overhead (as there are more timers), but will likely
> reveal more scalable (which is not something of great importance for
> this scheduler) and may allow for better latency (which _is_ something
> of great importance for this scheduler :-) ), since there won't be the
> need to lock the whole scheduler during the replenishments (which is,
> OTOH, necessary with a)).

Right. This is true. But when we are replenishing the budget, we are
also changing the deadline, which affect VCPUs' priority and then
affect schedule decision.

>
> And that's it for the replenishments. For enforcing the budget, well, we
> have the ret.time value of the task_slice struct returned by
> rt_schedule, so that's event driven already, and we don't need to do
> much about it.
>
> Does all this make sense? Am I making myself clear enough?
> If not, feel free to ask.

Thank you very much for your detailed explanation. :-)
I just had a little confusion on how much the overhead the option b)
could save.

>
> >  The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> >
> Yeah, well, I see what you mean and how you this is actually succeeding
> (at least AFAICT, by looking at the code, again, it would be nice to
> have some numbers) in improving the scheduler behavior.
>
> However, this transition toward event driven-ness has two goals:
>  * improve the scheduler behavior [check, at least to some extent]
>  * improve the code (readability, maintainability, etc.)
>    [not check at all :-(]

I see. We did consider the readability and maintainability factor in
the design but I think we just neglect the standards that "define" the
rules of readability and maintainability. Is there some documentation
that we could follow?

>
> As an example, the __repl_update() function: it's called 2 times inside
> rt_schedule() and a third from rt_context_saved(), which is basically
> like saying it's called 3 times from rt_schedule(), and this always made
> very few sense to me.

I see. This part should be improved. This is not good for sure.

> In fact, I think that scheduling decisions and
> replenishment events shouldn't be so tightly coupled. There are
> interdependencies, of course (a replenishment could call for a
> scheduling decision to be taken), but not like this.

I see. I think this is the root reason why we had one kind of design
and you had another kind of design. :-)

I see how you think this interdependecies should be handled (via Xen
timers per VCPU in option b), but I didn't quite get the
reason/principles why you think the current design is not good to
handle such interdependencies. :-(

> That's why I say
> I'd like to see a timer handling routine dealing with replenishment, and
> let rt_schedule() do it's job, which is:
>  - enforcing the budget of the running vcpu. If it expires,
>    _(re)program_ the replenishment timer
>  - choose the best vcpu to run next, if necessary
>
> With this patch, you're still using rt_schedule() to do both scheduling
> and replenishments, although you're now doing it at actual replenishment
> times, instead of checking for them every millisecond.
>
> Also, the bits and pieces that you need to add in order to deal with
> this new queue is, really, making things even more complex than they are
> now, which is undesirable.
>
> So, all this being said, let me know what you think about it (and that
> applies to everyone else as well, of course, in particular Meng).

I think I didn't quite get the underlining principles/reasons why the
current design is not good to handle the interdependencies between
budget replenishment and scheduling decisions.

I can understand the strength of option b). I think I can have better
understanding of option b) if I could understand the underlining
principles/reasons/rationale why the current design is not good. (I'm
not saying/arguing the current design is good. I just tried to
understand the weakness of it so that we could know how to handle it
best.)

>
> I won't comment on the code in too much details, as it will require some
> quite radical restructuring, but I'll skim through it and provide some
> hints anyway.

Sure.

>
> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---
> >  xen/common/sched_rt.c |  319 ++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 222 insertions(+), 97 deletions(-)
> >
> First of all, it's a big patch. It's only changing one file and one
> logical component, and for that reason, TBH, it's not that hard to
> review. Still I think you can break it in at least two, and perhaps even
> more, if you try to implement what I described above.
>
> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 7c39a9e..25f0458 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
>
> > @@ -156,6 +160,7 @@ struct rt_vcpu {
> >      s_time_t cur_budget;        /* current budget */
> >      s_time_t last_start;        /* last start time */
> >      s_time_t cur_deadline;      /* current deadline for EDF */
> > +    s_time_t next_sched_needed; /* next time to make scheduling decision */
> >
> As an example of why I said that things should become simpler, and are
> instead becoming more complex: with my solution, you don't need anything
> like this. In fact, budget enforcement is already being done ok in
> rt_schedule(), so no need to do anything more/different about it.
> Replenishments should be programmed to fire at cur_deadline, so again,
> no need to add this new field, and multiplex its actual value between
> budget and deadline (which, yes, counts as being something rather
> complex for me! :-D).
>
> > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
> >          list_del_init(&svc->q_elem);
> >  }
> >
> > +static inline void __t_remove(struct rt_vcpu *svc)
> > +{
> > +     if( !list_empty(&svc->t_elem) )
> > +             list_del_init(&svc->t_elem);
> > +}
> > +
> >
> You're using hard tabs for indenting here. As you see everywhere esle,
> Xen wants 4 spaces for this.
>
> >  /*
> >   * Insert svc with budget in RunQ according to EDF:
> >   * vcpus with smaller deadlines go first.
> > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> >  }
> >
> >  /*
> > + * Insert svc into the timerq, maintaining ascending order by next_sched_needed.
> > + */
> > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter = timerq;
> > +
> > +    ASSERT( spin_is_locked(&prv->lock) );
> > +
> > +    ASSERT( list_empty(&svc->t_elem) );
> > +
> The blank line between the asserts, I'd kill it.
>
> > +     list_for_each(iter, timerq)
> > +     {
> >
> You're still using tabs, and mixing them with spaces, making things look
> even more cumbersome.
>
> > +/*
> > + * Return how far the lowest time on the timerq (that is after NOW) is in the
> > + * future.
> > + * Lock must be grabbed before calling this.
> > + */
> > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now)
> > +{
> > +    struct list_head *timerq = rt_timerq(ops);
> > +    struct list_head *iter, *tmp;
> > +
> > +    list_for_each_safe(iter, tmp, timerq)
> > +    {
> > +        struct rt_vcpu * iter_svc = __t_elem(iter);
> > +
> > +        if ( iter_svc->next_sched_needed > now )
> > +            return (iter_svc->next_sched_needed - now);
> > +        else
> > +            __t_remove(iter_svc);
> > +    }
> > +
> > +    return MAX_SCHEDULE;
> > +}
> >
> If using a timer, you can just get rid of items while, in the timer
> routine, you handle the event associated to them.
>
> Also, why is MAX_SCHEDULE still there?

Is it ok to use a BUG_ON here if it will never be called?

>
> > +/*
> > + * Updates the next_sched_needed field for a vcpu, which is used for
> > + * determining the next needed schedule time for this vcpu. Then updates
> > + * timerq via reinsert.
> > + */
> > +static void update_sched_time(const struct scheduler *ops, s_time_t now,
> > +                              struct rt_vcpu *svc)
> > +{
> > +    /* update next needed schedule time */
> > +    if( test_bit(__RTDS_scheduled, &svc->flags) )
> > +        svc->next_sched_needed = now + svc->cur_budget;
> > +    else
> > +        svc->next_sched_needed = svc->cur_deadline;
> > +
> > +    /* Remove and reinsert to maintain sorted order in timerq */
> > +    __t_remove(svc);
> > +    __timerq_insert(ops, svc);
> > +
> > +    return;
> > +}
> >
> And here's the multiplexing, which --even worse-- (may) require
> rearranging the queue! We really don't need anything like this.
>
> >  /*
> > + * Pick a cpu where to run a vcpu,
> > + * possibly kicking out the vcpu running there
> > + * Called by wake() and context_saved()
> > + * We have a running candidate here, the kick logic is:
> > + * Among all the cpus that are within the cpu affinity
> > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> > + * 2) if there are any idle vcpu, kick it.
> > + * 3) now all pcpus are busy;
> > + *    among all the running vcpus, pick lowest priority one
> > + *    if snext has higher priority, kick it.
> > + *
> > + * TODO:
> > + * 1) what if these two vcpus belongs to the same domain?
> > + *    replace a vcpu belonging to the same domain introduces more overhead
> > + *
> > + * lock is grabbed before calling this function
> > + */
> > +static void
> > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> > +{
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
> > +    struct rt_vcpu *iter_svc;
> > +    struct vcpu *iter_vc;
> > +    int cpu = 0, cpu_to_tickle = 0;
> > +    cpumask_t not_tickled;
> > +    cpumask_t *online;
> > +
> >
> [snip]
>
> And here you are moving a function up. In general, it is better to have
> separate patches that do the movings, with the fact that the are all
> about moving and that the code is not being changed, clearly stated in
> the commit message. This is because it is really really hard to figure
> out, e.g. from here, whether you also changed something in the function
> while moving it, making reviewing a lot harder and more prone to error.
>
> That being said, in this specific case, you're moving up runq_tickle(),
> the purpose of which is to trigger a call to rt_schedule() (after going
> through the softirq machinery)... in order to call it in rt_schedule().
> That's pretty tricky, and is not how tickling should work.
>
> Again, with an approach that really take advantage of timers, this won't
> be necessary.

The reason why we need to call runq_tickle here is to pick the "best"
CPU to run the next VCPU.
Consider the following situation:
The timer may be invoked on a CPU i, which has a medium-priority VCPU
running. However, we may have another CPU j, which has the lowest
priority VCPU running. The next VCPU should be place onto the CPU j to
run instead of on CPU i based on the global EDF scheduling policy.
Because we did not control where the timer is placed when we arm the
timer (in the current design), the timer may be invoked on CPU i, and
that's why we need runq_tickle() here.

But this is a (minor) issue since we need to settle down the design first. :-)

Thank you again for your helpful comments!

Best regards,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10  5:54   ` Meng Xu
@ 2015-06-10 17:46     ` Dario Faggioli
  2015-06-11  5:50       ` Meng Xu
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-10 17:46 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 9741 bytes --]

On Tue, 2015-06-09 at 22:54 -0700, Meng Xu wrote:
> Hi Dario and Dagaen,
> 
Hi,

> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:

> > Thanks for doing this. The first question I have is whether you run any
> > test/benchmark that you can show the result of here.
> 
> Thank you very much Dario for your quick review!
> 
It wasn't as quick as I wanted it to be. I'll improve on that. :-)

> > For instance, a few weeks ago, Meng (while trying to do a completely
> > different thing) sent here on the list some numbers about the frequency
> > of the scheduler being called, and the overhead caused by that... Would
> > it be hard to run the same experiments and collect the numbers, with and
> > without your patch?
> 
> I think there should be two ways to evaluate the performance:
> 1) Using xentrace to trace the frequency of the scheduler being called
> and the overhead, when we run workload inside guest domains;
>
Yes, that counts more as a way of testing whether what is done in here
actually works, as we expect the scheduler to be invoked a lot less, and
that would tell us whether or not it is the case.

So, please, provide these numbers (at least the number/frequency of
invocation, overhead measurements can come later) as soon as practical.

> 2) Using cyclictest as Dario mentioned before to test the real-time
> performance at end user. Dagaen, I can provide you the commands to run
> it, which is actually quite simple to run.
> 
It is indeed, and that's more similar to a proper performance evaluation
of an aspect which is rather critical for this scheduler, and really
important for people wanting to use it.

So, yes, seeing some results would be great, independently from the
specific work done in this patch.

> > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
> > > To do this, we create a new list that holds, for each
> > > vcpu, the time least into the future that it may need to be
> > > rescheduled.
> > >
> > Ok. Actually, what I really expected to see in this patch was, either:
> >  a) a new, scheduler wide, list of replenishment times _and_ a timer,
> >     always (re)programmed to fire at the most imminent one; or
> >  b) one timer for each vcpu, always (re)programmed to fire at the time
> >     instant when the replenishment for that vcpu should happen.
> 
> Ah...
> 
> Here what we are trying to do is:
> Reuse the existing timer that is used by rt_schedule().
>
Yeah, and what I can't figure out is why you decided to do so. The
reason I don't like it is that things become a lot (more) complex to
understand, maintain and modify.

>  We use  that
> timer as the timer to invoke the schedule function (rt_schedule). So
> that timer has two purposes:
> (a) budget enforcement as it was;
> (b) budget replenish (and deadline enforcement since deadline = period
> for each VCPU).
> 
Exactly, the timer has two purposes. It actually has 3, as you're saying
yourself ("budget replenish (*and* deadline enforcement"). It should
have exactly one. :-)

> When the timer fires, the rt_schedule is called: enforcement the
> budget, replenish the budget of VCPU(s), and pick the (next) VCPU.
> 
Right. OTOH, I think that all it should do is pick the next vcpu, if
necessary. In fact, the purpose of all this that Dagaen is doing is to
dramatically reduce the calls to rt_scheduler(), when that is not
necessary. Well, replenishing the budget of all running vcpus around is
not the core scheduling function's business, especially considering that
we're holding a global spinlock.

> In other words, we can think about the scheduler in another way:
> The scheduling decision happens at the following two time points:
> (a) budget enforcement as it was;
> (b) budget replenish and deadline enforcement;
> 
The scheduler is all that's in sched_rt.c, point here is how things
should be organized.

> Whenever any of the above two situations occurs, the scheduler may
> need to pick the next VCPU to run or preempt a current running VCPU.
> Therefore, the logic for scheduling decision is unavoidable when
> either of these two situation occurs, IMO.
> 
A budget replenishment for a vcpu with a big period, in a busy system,
may well end up not requiring any immediate rescheduling of none of the
lower period vcpus, may not it?

So, with your approach, it is like this, when replenishment time for
vcpu X arrives:

  timer interrupt
    TIMER_SOFTIRQ raised
      process softirqs
        SCHEDULE_SOFTIRQ raised
          process softirqs
            rt_schedule()
              [spin_lock]
              burn_budget(scurr)
              __repl_update()  (goes through all runnable vcpus!!)
              snext = __runq_pick(): snext == scurr
              runq_tickle(snext)...WAIT, WHAT?!?! :-O

I mean, and I'm noticing this now, if the replenishments done during a
particular call to rt_schedule() are not enough to change the situation
on that particular pcpu, and hence the task which was running (and that
you are deliberately disturbing with _a_full_execution_ of the
scheduler's core function!) should continue to do so, you're calling
runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
newly replenished vcpu!

Am I missing or misreading something? Let's assume not, and see what
happens in this case...

Looking at runq_tickle(), if there are idle pcpus, you'll probably
tickle one of them, which will likely pick up the newly replenished
vcpu, like this (as a followup of the 'callchain' above):

  process softirqs
    SCHEDULE_SOFTIRQ
      rt_schedule()
        [spin_lock]
        __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
        snext = runq_pick(): snext == vcpu X
        [spin_unlock]

If there are no idle pcpus, you probably won't tickle anyone, which is
also fine.

So, yay, it works!! Is this abuse of runq_ticke() there by desing? :-D

Jokes apart, if the above analysis is accurate, I think this is a good
enough example of what I meant when saying to Dagaen "this is making
things too complex".

> In terms of performance/overhead, I think the option b) you pointed
> out has the benefit of low overhead in updating the budget because we
> don't need to hold the lock. However, the budget replenish occurs when
> deadline of the VCPU is changed (which means the next period of the
> VCPU arrives). Then it means the scheduler (may) need to make a new
> scheduling decision, in which situation, the scheduler will hold the
> lock for the runq. So I'm curious about the source of overhead the
> option b) could save over the current implementation/design Dagaen is
> doing.
> 
Leave my b) option alone, for now. With a) done as I'm suggesting, for
one, you'll be removing __repl_update() from rt_schedule(), which means
no longer going through the list of all runnable vcpus with the global
scheduler spin lock held, which really is something we should aim at for
this scheduler, sooner rather than later.

Here's how I envision things to go. Again, I'm talking about sticking
with option a), so no per-vcpu timers, just 1 timer and a global queue,
which now is a replenishment queue:

  timer interrupt
    TIMER_SOFTIRQ raised
      process softirqs
        replenishment_timer_handler()
          [spin_lock]
            <for_each_replenishment_event(repl_time < NOW()) {
               replenish(vcpu)
               runq_tickle(vcpu)
             }>
          [spin_lock]

Then, on the tickled pcpus (if any):

  process softirqs
    SCHEDULE_SOFTIRQ
      rt_schedule()
        [spin_lock]
        snext = runq_pick(): snext == vcpu X
        [spin_unlock]

And get rid of __repl_update(), which makes my eyes bleed every time I
open sched_rt.c. :-)

Oh, and note that we probably can use a new spin lock for the
replenishment queue, different from the existing global scheduler
spinlock, lowering contention and increasing scalabiliy even further!

> > >  The scheduler chooses the lowest time off of this
> > > list and waits until the specified time instead of running every
> > > 1 ms as it did before.
> > >
> > Yeah, well, I see what you mean and how you this is actually succeeding
> > (at least AFAICT, by looking at the code, again, it would be nice to
> > have some numbers) in improving the scheduler behavior.
> >
> > However, this transition toward event driven-ness has two goals:
> >  * improve the scheduler behavior [check, at least to some extent]
> >  * improve the code (readability, maintainability, etc.)
> >    [not check at all :-(]
> 
> I see. We did consider the readability and maintainability factor in
> the design but I think we just neglect the standards that "define" the
> rules of readability and maintainability. Is there some documentation
> that we could follow?
> 
There is not, and probably never will be... and that's fine, because
that is not something you write down in a document.

It's trial and error, following suits and making and looking at examples
(like the analysis I made above).

> I see how you think this interdependecies should be handled (via Xen
> timers per VCPU in option b), but I didn't quite get the
> reason/principles why you think the current design is not good to
> handle such interdependencies. :-(
> 
Right. I hope this is more clear now.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10  4:18   ` Dagaen Golomb
@ 2015-06-10 22:30     ` Dario Faggioli
  2015-06-13 20:33       ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-10 22:30 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 9922 bytes --]

No HTML, please.

On Wed, 2015-06-10 at 00:18 -0400, Dagaen Golomb wrote:


>         And note that, when I say "timer", I mean an actual Xen timer,
>         i.e.,
>         those things that are started, stopped, and with a timer
>         handling
>         routine being called when they expire. For an example, you can
>         have a
>         look, in sched_credit.c, at 'ticker' or 'master_ticker'.
> 
> 
> I will look at this. However, the current solution is likely
> functionally equivalent and, with only one timer and a single list
> used to arm it, I'm not sure if using a Xen timer is necessary. 
>
It is functionally equivalent, by chance (see the issue about calling
runq_tickle() on current vcpus in my reply to Meng). The reason why a
different approach is required is making the code look better, reducing
(instead than increasing) the complexity of sched_rt.c, lowering the
overhead caused by long running operations performed while holding the
global scheduler spin lock, and improving scalability.

> Do they incur any extra overhead or coarsen granularity?
>
"extra overhead or coarsen granularity" as compared to what? s_timer in
schedule.c (which is what you're using) is one of them already!

What I meant with "an actual timer" is that you should ad a new one, and
move some of the stuff currently being done in rt_schedule() in its
handling routine, rather than just adding a new queue of events to be
serviced by abusing s_timer.

>         Deciding between a) or b) isn't easy, without actually trying
>         to
>         implement them. I personally prefer b), and I think it would
>         be a
>         superior long term solution (see right below), but given te
>         current
>         architecture of the RTDS scheduler (e.g., the fact that it has
>         a global
>         runq, etc), I won't nack a), which would most likely be
>         simpler.
> 
> 
> I agree b) would may be better in the long run, but with the current
> architecture a) is simpler. b) can be future work as the scheduler
> evolves.
>
Sure. And in fact, I'm fine with a), if done properly.
>         
>         With this patch, you're still using rt_schedule() to do both
>         scheduling
>         and replenishments, although you're now doing it at actual
>         replenishment
>         times, instead of checking for them every millisecond.

> 
> I think once you understand that the timerq is not only
> replenishments, but any event that could cause a branch is the
> scheduler behavior, it becomes more palatable to have them
> intertwined. 
>
I got that, and I'm asking you to do it differently.

> Really, the timerq and scheduling aren't as intertwined as they
> appear, they are piratically isolated functionally. Its just that the
> timerq is most naturally serviced during scheduling events when data
> that effects scheduling decisions is changing. Its straightforward and
> efficient.
>
Yeah, replenishments are 'naturally serviced' in a 'straightforward and
efficient' way by looping on all runnable vcpus, in more than one place,
from within super-hot paths, with the global scheduler spin lock held.
Neat! :-/

Oh, and that's before you introducing of another list to be taken care
of, still under the same conditions. :-O

>         Also, the bits and pieces that you need to add in order to
>         deal with
>         this new queue is, really, making things even more complex
>         than they are
>         now, which is undesirable.

> While it does add some complexity, I don't feel a single queue and
> managing it is that much extra complexity.
>
But in fact the point of making the scheduler 'event driven' is to take
advantage of the chances that this offers for getting stuff *out* of
rt_schedule(), and the purpose is not "not adding that much extra
complexity", is making it _simpler_.


>         > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>         > index 7c39a9e..25f0458 100644
>         > --- a/xen/common/sched_rt.c
>         > +++ b/xen/common/sched_rt.c
>         
>         > @@ -156,6 +160,7 @@ struct rt_vcpu {
>         >      s_time_t cur_budget;        /* current budget */
>         >      s_time_t last_start;        /* last start time */
>         >      s_time_t cur_deadline;      /* current deadline for EDF
>         */
>         > +    s_time_t next_sched_needed; /* next time to make
>         scheduling decision */
>         >
>         As an example of why I said that things should become simpler,
>         and are
>         instead becoming more complex: with my solution, you don't
>         need anything
>         like this. In fact, budget enforcement is already being done
>         ok in
>         rt_schedule(), so no need to do anything more/different about
>         it.
>         Replenishments should be programmed to fire at cur_deadline,
>         so again,
>         no need to add this new field, and multiplex its actual value
>         between
>         budget and deadline (which, yes, counts as being something
>         rather
>         complex for me! :-D).
> 
> 
> As mentioned, the timerq is handling all events that could change the
> scheduling decision, not just replenishments. 
>
Yes, exactly, it's handling *too much* events. :-)

For example, have a look at 'struct vcpu', in 
xen/include/xen/sched.h. It's got 3 timers in it:

    struct timer     periodic_timer;
    struct timer     singleshot_timer;
    struct timer     poll_timer;    /* timeout for SCHEDOP_poll */

It probably would have been possible to just use one, with a single and
mixed event queue, as you're doing, a 1k lines handling routines, etc.

Do you it it would have been easier or harder to implement, understand,
debug and maintain? I bet on harder.

> This tracks the earliest time anything cause this to be scheduled
> differently, and could be extended if any more insights are found.
> Budget enforcement is being done in rt_schedule but its being done by
> this very list: a budget expiration is one possible event that would
> require a vcpu reschedule.
>
OMG, 'extended'... You're thinking to actually put more stuff in
there?!? :-O

It's a rather key and already too long and complex critical section, so
please, let's aim at shortening and making it simpler, i.e., at
improving things, not making them worse.
>         
>         > +/*
>         > + * Return how far the lowest time on the timerq (that is
>         after NOW) is in the
>         > + * future.
>         > + * Lock must be grabbed before calling this.
>         > + */
>         > +static s_time_t __timerq_next(const struct scheduler *ops,
>         s_time_t now)
>         > +{
>         > +    struct list_head *timerq = rt_timerq(ops);
>         > +    struct list_head *iter, *tmp;
>         > +
>         > +    list_for_each_safe(iter, tmp, timerq)
>         > +    {
>         > +        struct rt_vcpu * iter_svc = __t_elem(iter);
>         > +
>         > +        if ( iter_svc->next_sched_needed > now )
>         > +            return (iter_svc->next_sched_needed - now);
>         > +        else
>         > +            __t_remove(iter_svc);
>         > +    }
>         > +
>         > +    return MAX_SCHEDULE;
>         > +}
>         >
>         If using a timer, you can just get rid of items while, in the
>         timer
>         routine, you handle the event associated to them.
>         
>         Also, why is MAX_SCHEDULE still there?
> 
> 
> Honestly, events do not have to be removed here, but its done to
> prevent walking a list of stale values to get to the newest one on the
> next call. This is done more for performance. Their non-removal would
> be functionally equivalent.
>
Of course you should remove the stale entries, I certainly was not
arguing that the list should just grow indefinitely!

Point is, again, that this is another list walk occurring in an hot
path, with a big spin lock held. We want to get rid of such thing, not
adding more of it.

> With the timer alternative you mention, where would the timer events
> and their data be held? I think that could be extra complexity, unless
> I fail to understand the idea completely.
>
In an event queue like yours, of course. But you won't go through it
and/or bookkeep it while in hot paths, with the scheduler lock held.

See my email to Meng to have more details on what I have in mind, or
fell free to ask more details.

> MAX_SCHEDULE may not be required, but I have it there as an "in case."
> For example, it will cause the scheduler to be called after
> MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its
> there to ensure progress in case of any bugs or unexpected behavior.
>
Wait, so, all this work, and then you still want to call rt_schedule()
every millisecond, when there's nothing to do?!?! :-O
 
>         That being said, in this specific case, you're moving up
>         runq_tickle(),
>         the purpose of which is to trigger a call to rt_schedule()
>         (after going
>         through the softirq machinery)... in order to call it in
>         rt_schedule().
>         That's pretty tricky, and is not how tickling should work.

> It set up to only tickle if needed. I'm not sure if this is an issue,
>
It's wrong, AFAICT. See my reply to Meng and, please, comment by
replying to it, if you've got anything to say about this, to make the
discussion easier to follow.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10 17:46     ` Dario Faggioli
@ 2015-06-11  5:50       ` Meng Xu
  2015-06-12  9:28         ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Meng Xu @ 2015-06-11  5:50 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario,

First I think I got most of the points you raised/explained! They are
very very clear and thanks for the detailed explanation of your idea!

I have some minor questions about your comments/advices.

>
>> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>
>> > Thanks for doing this. The first question I have is whether you run any
>> > test/benchmark that you can show the result of here.
>>
>> Thank you very much Dario for your quick review!
>>
> It wasn't as quick as I wanted it to be. I'll improve on that. :-)
>
>> > For instance, a few weeks ago, Meng (while trying to do a completely
>> > different thing) sent here on the list some numbers about the frequency
>> > of the scheduler being called, and the overhead caused by that... Would
>> > it be hard to run the same experiments and collect the numbers, with and
>> > without your patch?
>>
>> I think there should be two ways to evaluate the performance:
>> 1) Using xentrace to trace the frequency of the scheduler being called
>> and the overhead, when we run workload inside guest domains;
>>
> Yes, that counts more as a way of testing whether what is done in here
> actually works, as we expect the scheduler to be invoked a lot less, and
> that would tell us whether or not it is the case.
>
> So, please, provide these numbers (at least the number/frequency of
> invocation, overhead measurements can come later) as soon as practical.
>
>> 2) Using cyclictest as Dario mentioned before to test the real-time
>> performance at end user. Dagaen, I can provide you the commands to run
>> it, which is actually quite simple to run.
>>
> It is indeed, and that's more similar to a proper performance evaluation
> of an aspect which is rather critical for this scheduler, and really
> important for people wanting to use it.
>
> So, yes, seeing some results would be great, independently from the
> specific work done in this patch.

Right! I have some ideas about this test but won't want to mess up the
focus of this thread. :-)
I will raise this test again when we come to it.


>
>> > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote:
>> > > To do this, we create a new list that holds, for each
>> > > vcpu, the time least into the future that it may need to be
>> > > rescheduled.
>> > >
>> > Ok. Actually, what I really expected to see in this patch was, either:
>> >  a) a new, scheduler wide, list of replenishment times _and_ a timer,
>> >     always (re)programmed to fire at the most imminent one; or
>> >  b) one timer for each vcpu, always (re)programmed to fire at the time
>> >     instant when the replenishment for that vcpu should happen.
>>
>> Ah...
>>
>> Here what we are trying to do is:
>> Reuse the existing timer that is used by rt_schedule().
>>
> Yeah, and what I can't figure out is why you decided to do so. The
> reason I don't like it is that things become a lot (more) complex to
> understand, maintain and modify.

Now I get why you think it is harder to maintain. Right. reusing the
timer will just make the rt_schedule complex and make the hot path
longer.  If other schedulers are using this timer only for the budget
enforcement, it won't be a good idea to use it for other purposes in
the RTDS scheduler, considering the design consistency of different
schedulers.

>
>>  We use  that
>> timer as the timer to invoke the schedule function (rt_schedule). So
>> that timer has two purposes:
>> (a) budget enforcement as it was;
>> (b) budget replenish (and deadline enforcement since deadline = period
>> for each VCPU).
>>
> Exactly, the timer has two purposes. It actually has 3, as you're saying
> yourself ("budget replenish (*and* deadline enforcement"). It should
> have exactly one. :-)
>
>> When the timer fires, the rt_schedule is called: enforcement the
>> budget, replenish the budget of VCPU(s), and pick the (next) VCPU.
>>
> Right. OTOH, I think that all it should do is pick the next vcpu, if
> necessary. In fact, the purpose of all this that Dagaen is doing is to
> dramatically reduce the calls to rt_scheduler(), when that is not
> necessary. Well, replenishing the budget of all running vcpus around is
> not the core scheduling function's business, especially considering that
> we're holding a global spinlock.
>
>> In other words, we can think about the scheduler in another way:
>> The scheduling decision happens at the following two time points:
>> (a) budget enforcement as it was;
>> (b) budget replenish and deadline enforcement;
>>
> The scheduler is all that's in sched_rt.c, point here is how things
> should be organized.

Right. Got it.

>
>> Whenever any of the above two situations occurs, the scheduler may
>> need to pick the next VCPU to run or preempt a current running VCPU.
>> Therefore, the logic for scheduling decision is unavoidable when
>> either of these two situation occurs, IMO.
>>
> A budget replenishment for a vcpu with a big period, in a busy system,
> may well end up not requiring any immediate rescheduling of none of the
> lower period vcpus, may not it?

Right. VCPUs with larger period should get replenished with lower frequency.

>
> So, with your approach, it is like this, when replenishment time for
> vcpu X arrives:
>
>   timer interrupt
>     TIMER_SOFTIRQ raised
>       process softirqs
>         SCHEDULE_SOFTIRQ raised
>           process softirqs
>             rt_schedule()
>               [spin_lock]
>               burn_budget(scurr)
>               __repl_update()  (goes through all runnable vcpus!!)
>               snext = __runq_pick(): snext == scurr

Ah, I neglected this situation. As you said below, if scurr has higher
priority than the top VCPU of runq, we are going to tickle the scurr,
which should not happen.

>               runq_tickle(snext)...WAIT, WHAT?!?! :-O
>
> I mean, and I'm noticing this now, if the replenishments done during a
> particular call to rt_schedule() are not enough to change the situation
> on that particular pcpu, and hence the task which was running (and that
> you are deliberately disturbing with _a_full_execution_ of the
> scheduler's core function!) should continue to do so, you're calling
> runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
> newly replenished vcpu!
>
> Am I missing or misreading something? Let's assume not, and see what
> happens in this case...

You are right! Although this issue (i.e., tickling on scurr instead of
the next high priority VCPU) can be "fixed" (dirtily), it can be
avoided with the design option a) you said.

>
> Looking at runq_tickle(), if there are idle pcpus, you'll probably
> tickle one of them, which will likely pick up the newly replenished
> vcpu, like this (as a followup of the 'callchain' above):
>
>   process softirqs
>     SCHEDULE_SOFTIRQ
>       rt_schedule()
>         [spin_lock]
>         __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
>         snext = runq_pick(): snext == vcpu X
>         [spin_unlock]
>
> If there are no idle pcpus, you probably won't tickle anyone, which is
> also fine.
>
> So, yay, it works!! Is this abuse of runq_ticke() there by desing? :-D

Yes. It abuse the use of rt_schedule() actually. Some of the
operations (e.g., __repl_update() ) are unnecessarily called twice,
which is a bad thing in the hot path. :-(

>
> Jokes apart, if the above analysis is accurate, I think this is a good
> enough example of what I meant when saying to Dagaen "this is making
> things too complex".
Yes. The flow chart you drew is very clear! Thanks!

@Dagaen, what do you think? Please comment on Dario's reply with your
opinion and raise any of your concerns.

>
>> In terms of performance/overhead, I think the option b) you pointed
>> out has the benefit of low overhead in updating the budget because we
>> don't need to hold the lock. However, the budget replenish occurs when
>> deadline of the VCPU is changed (which means the next period of the
>> VCPU arrives). Then it means the scheduler (may) need to make a new
>> scheduling decision, in which situation, the scheduler will hold the
>> lock for the runq. So I'm curious about the source of overhead the
>> option b) could save over the current implementation/design Dagaen is
>> doing.
>>
> Leave my b) option alone, for now. With a) done as I'm suggesting, for
> one, you'll be removing __repl_update() from rt_schedule(), which means
> no longer going through the list of all runnable vcpus with the global
> scheduler spin lock held, which really is something we should aim at for
> this scheduler, sooner rather than later.
>
> Here's how I envision things to go. Again, I'm talking about sticking
> with option a), so no per-vcpu timers, just 1 timer and a global queue,
> which now is a replenishment queue:
>
>   timer interrupt
>     TIMER_SOFTIRQ raised
>       process softirqs
>         replenishment_timer_handler()
>           [spin_lock]
>             <for_each_replenishment_event(repl_time < NOW()) {
>                replenish(vcpu)
>                runq_tickle(vcpu)
>              }>
>           [spin_lock]
>
> Then, on the tickled pcpus (if any):
>
>   process softirqs
>     SCHEDULE_SOFTIRQ
>       rt_schedule()
>         [spin_lock]
>         snext = runq_pick(): snext == vcpu X
>         [spin_unlock]
>
> And get rid of __repl_update(), which makes my eyes bleed every time I
> open sched_rt.c. :-)
>
> Oh, and note that we probably can use a new spin lock for the
> replenishment queue, different from the existing global scheduler
> spinlock, lowering contention and increasing scalabiliy even further!

Here is the main question I have about your advice.
If we are introducing a new spin lock for the replenishment queue,
what is the relation between the new lock and the old lock of the
runq?
Because the deadline decides the priority of VCPUs and thus decides
the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
on the runq as well. As shown in the workflow in your reply:
     >                replenish(vcpu)
     >                runq_tickle(vcpu)
The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
rt_schedule will pick snext by runq_pick(). Therefore, the replenished
VCPU need to be resorted in the runq. So replenish(vcpu) will operates
on the runq.

Another question in my mind is: do we really need the replenish queue
to record the replenish time? Because the period = deadline in the
current implementation, the deadline of VCPUs in runq is actually the
replenish time. (We are reusing the runq here and I'm unsure if it
good or not in terms of maintenance.)

>
>> > >  The scheduler chooses the lowest time off of this
>> > > list and waits until the specified time instead of running every
>> > > 1 ms as it did before.
>> > >
>> > Yeah, well, I see what you mean and how you this is actually succeeding
>> > (at least AFAICT, by looking at the code, again, it would be nice to
>> > have some numbers) in improving the scheduler behavior.
>> >
>> > However, this transition toward event driven-ness has two goals:
>> >  * improve the scheduler behavior [check, at least to some extent]
>> >  * improve the code (readability, maintainability, etc.)
>> >    [not check at all :-(]
>>
>> I see. We did consider the readability and maintainability factor in
>> the design but I think we just neglect the standards that "define" the
>> rules of readability and maintainability. Is there some documentation
>> that we could follow?
>>
> There is not, and probably never will be... and that's fine, because
> that is not something you write down in a document.
>
> It's trial and error, following suits and making and looking at examples
> (like the analysis I made above).

I see. I like the analysis you made very much!  Thanks! :-)

>
>> I see how you think this interdependecies should be handled (via Xen
>> timers per VCPU in option b), but I didn't quite get the
>> reason/principles why you think the current design is not good to
>> handle such interdependencies. :-(
>>
> Right. I hope this is more clear now.

Yes. I think I get your idea now. The explanation is very clear and neat! :-)

Thank you very much and best regards,

Meng
-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-11  5:50       ` Meng Xu
@ 2015-06-12  9:28         ` Dario Faggioli
  2015-06-13 17:21           ` Meng Xu
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-12  9:28 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 7868 bytes --]

On Wed, 2015-06-10 at 22:50 -0700, Meng Xu wrote:
> Hi Dario,
> 
> First I think I got most of the points you raised/explained! They are
> very very clear and thanks for the detailed explanation of your idea!
> 
Glad you got it and (sort of :-) ) agree.

> >> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
> >
> >> 2) Using cyclictest as Dario mentioned before to test the real-time
> >> performance at end user. Dagaen, I can provide you the commands to run
> >> it, which is actually quite simple to run.

> > So, yes, seeing some results would be great, independently from the
> > specific work done in this patch.
> 
> Right! I have some ideas about this test but won't want to mess up the
> focus of this thread. :-)
> I will raise this test again when we come to it.
> 
Ok. Looking forward to see this happening.

> > Yeah, and what I can't figure out is why you decided to do so. The
> > reason I don't like it is that things become a lot (more) complex to
> > understand, maintain and modify.
> 
> Now I get why you think it is harder to maintain. Right. reusing the
> timer will just make the rt_schedule complex and make the hot path
> longer.
>
Exactly.

> >               runq_tickle(snext)...WAIT, WHAT?!?! :-O
> >
> > I mean, and I'm noticing this now, if the replenishments done during a
> > particular call to rt_schedule() are not enough to change the situation
> > on that particular pcpu, and hence the task which was running (and that
> > you are deliberately disturbing with _a_full_execution_ of the
> > scheduler's core function!) should continue to do so, you're calling
> > runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a
> > newly replenished vcpu!
> >
> > Am I missing or misreading something? Let's assume not, and see what
> > happens in this case...
> 
> You are right! Although this issue (i.e., tickling on scurr instead of
> the next high priority VCPU) can be "fixed" (dirtily), it can be
> avoided with the design option a) you said.
> 
Of course it can be fixed.. Pretty much everything can! Point is the
reason why it happened, and how to make these things not happen and/or
easier to figure out. That's (one of) the point(s) of keeping things
simple and self contained, even within a single component (like a
scheduler), instead of "let's do everything in this 10k lines
function!" :-P

Glad that you saw what I meat.

> > Jokes apart, if the above analysis is accurate, I think this is a good
> > enough example of what I meant when saying to Dagaen "this is making
> > things too complex".
> Yes. The flow chart you drew is very clear! Thanks!
> 
> @Dagaen, what do you think? Please comment on Dario's reply with your
> opinion and raise any of your concerns.
> 
Indeed.

> > Here's how I envision things to go. Again, I'm talking about sticking
> > with option a), so no per-vcpu timers, just 1 timer and a global queue,
> > which now is a replenishment queue:
> >
> >   timer interrupt
> >     TIMER_SOFTIRQ raised
> >       process softirqs
> >         replenishment_timer_handler()
> >           [spin_lock]
> >             <for_each_replenishment_event(repl_time < NOW()) {
> >                replenish(vcpu)
> >                runq_tickle(vcpu)
> >              }>
> >           [spin_lock]
> >
> > Then, on the tickled pcpus (if any):
> >
> >   process softirqs
> >     SCHEDULE_SOFTIRQ
> >       rt_schedule()
> >         [spin_lock]
> >         snext = runq_pick(): snext == vcpu X
> >         [spin_unlock]
> >
> > And get rid of __repl_update(), which makes my eyes bleed every time I
> > open sched_rt.c. :-)
> >
> > Oh, and note that we probably can use a new spin lock for the
> > replenishment queue, different from the existing global scheduler
> > spinlock, lowering contention and increasing scalabiliy even further!
> 
> Here is the main question I have about your advice.
> If we are introducing a new spin lock for the replenishment queue,
> what is the relation between the new lock and the old lock of the
> runq?
>
That's hard to tell in advance, you'll know it completely only when
trying to implement it. But, yes, when more locks are involved, it's
impotant to figure out the relationships between each other, or we risk
introducing deadlock or, even if things are correct, fail to improve the
performance (or do even worse!!).

The idea is, since the two operations (scheduling/budget enforcement and
replenishments) are logically independent, and if they're implemented in
a way that stress this independence, then it make sens to try to use
different spin locks.

As soon as you have more than one spin lock, what you should pay the
most attention to is, if they need to 'nest' (one is acquired when the
other is already being held), that has to happen consistently, or
deadlock will occur! :-/

> Because the deadline decides the priority of VCPUs and thus decides
> the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
> on the runq as well. As shown in the workflow in your reply:
>      >                replenish(vcpu)
>      >                runq_tickle(vcpu)
> The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
> rt_schedule will pick snext by runq_pick(). Therefore, the replenished
> VCPU need to be resorted in the runq. So replenish(vcpu) will operates
> on the runq.
> 
Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly
must be done with the runqueue lock (which is the global scheduler lock,
at the moment) held.

Reading again what I wrote, I realize that I probably expressed myself
really bad, when hinting at that "let's use another lock" thing.

What I really wanted to say is that, if there will be a replenishment
queue, i.e., an event queue on which replenishment events are queued,
and that the timer handling routine scans, there may be some locking
required for consistently dealing with the queue itself. If that will be
the case, we probably can use _another_ spin lock, and let the timer
handling routine acquire the runqueue lock *only* when/if it gets to do
the insertions and the ticklings.
Sorry for this... I added that paragraph quickly, right before hitting
'send'... I probably would have better not done so! :-P

Again, this may or may not be necessary, and may or may not be possible,
depending on whether the two locks would nest nicely everywhere they're
required. This also depends on how we'll deal with the replenishment
timer.

So, the bottom line of all this is: get down to it, ad we'll see how it
will be better put together! :-D

> Another question in my mind is: do we really need the replenish queue
> to record the replenish time? Because the period = deadline in the
> current implementation, the deadline of VCPUs in runq is actually the
> replenish time. (We are reusing the runq here and I'm unsure if it
> good or not in terms of maintenance.)
> 
Again, let's see. The timer will be always programmed to fire at the
most imminent replenishment event, so it seems to me that you need
something that will tell you, when servicing replenishment X, what's the
next time instant you want the timer to fire, to perform the next
replenishment.

Actually, the depleted queue you have know, can well become the
replenishment queue (it will have to be kept sorted, though, I think).
Whether you want to keep it as the tail of the actual runqueue, or split
the two of them, it does not matter much, IMO.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-12  9:28         ` Dario Faggioli
@ 2015-06-13 17:21           ` Meng Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-13 17:21 UTC (permalink / raw)
  To: Dario Faggioli, Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li

Hi Dario,

I think I got what you meant 100% now. :-)
I will ping Dagaen to see what he thinks and if he has any concerns.

>
> > > Here's how I envision things to go. Again, I'm talking about sticking
> > > with option a), so no per-vcpu timers, just 1 timer and a global queue,
> > > which now is a replenishment queue:
> > >
> > >   timer interrupt
> > >     TIMER_SOFTIRQ raised
> > >       process softirqs
> > >         replenishment_timer_handler()
> > >           [spin_lock]
> > >             <for_each_replenishment_event(repl_time < NOW()) {
> > >                replenish(vcpu)
> > >                runq_tickle(vcpu)
> > >              }>
> > >           [spin_lock]
> > >
> > > Then, on the tickled pcpus (if any):
> > >
> > >   process softirqs
> > >     SCHEDULE_SOFTIRQ
> > >       rt_schedule()
> > >         [spin_lock]
> > >         snext = runq_pick(): snext == vcpu X
> > >         [spin_unlock]
> > >
> > > And get rid of __repl_update(), which makes my eyes bleed every time I
> > > open sched_rt.c. :-)
> > >
> > > Oh, and note that we probably can use a new spin lock for the
> > > replenishment queue, different from the existing global scheduler
> > > spinlock, lowering contention and increasing scalabiliy even further!
> >
> > Here is the main question I have about your advice.
> > If we are introducing a new spin lock for the replenishment queue,
> > what is the relation between the new lock and the old lock of the
> > runq?
> >
> That's hard to tell in advance, you'll know it completely only when
> trying to implement it. But, yes, when more locks are involved, it's
> impotant to figure out the relationships between each other, or we risk
> introducing deadlock or, even if things are correct, fail to improve the
> performance (or do even worse!!).
>
> The idea is, since the two operations (scheduling/budget enforcement and
> replenishments) are logically independent, and if they're implemented in
> a way that stress this independence, then it make sens to try to use
> different spin locks.
>
> As soon as you have more than one spin lock, what you should pay the
> most attention to is, if they need to 'nest' (one is acquired when the
> other is already being held), that has to happen consistently, or
> deadlock will occur! :-/
>
> > Because the deadline decides the priority of VCPUs and thus decides
> > the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate
> > on the runq as well. As shown in the workflow in your reply:
> >      >                replenish(vcpu)
> >      >                runq_tickle(vcpu)
> > The runq_tickle(vcpu) will pick the desired CPU. On that CPU,
> > rt_schedule will pick snext by runq_pick(). Therefore, the replenished
> > VCPU need to be resorted in the runq. So replenish(vcpu) will operates
> > on the runq.
> >
> Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly
> must be done with the runqueue lock (which is the global scheduler lock,
> at the moment) held.
>
> Reading again what I wrote, I realize that I probably expressed myself
> really bad, when hinting at that "let's use another lock" thing.
>
> What I really wanted to say is that, if there will be a replenishment
> queue, i.e., an event queue on which replenishment events are queued,
> and that the timer handling routine scans, there may be some locking
> required for consistently dealing with the queue itself. If that will be
> the case, we probably can use _another_ spin lock, and let the timer
> handling routine acquire the runqueue lock *only* when/if it gets to do
> the insertions and the ticklings.
> Sorry for this... I added that paragraph quickly, right before hitting
> 'send'... I probably would have better not done so! :-P
>
> Again, this may or may not be necessary, and may or may not be possible,
> depending on whether the two locks would nest nicely everywhere they're
> required. This also depends on how we'll deal with the replenishment
> timer.
>
> So, the bottom line of all this is: get down to it, ad we'll see how it
> will be better put together! :-D

I see. Got it. :-)

>
> > Another question in my mind is: do we really need the replenish queue
> > to record the replenish time? Because the period = deadline in the
> > current implementation, the deadline of VCPUs in runq is actually the
> > replenish time. (We are reusing the runq here and I'm unsure if it
> > good or not in terms of maintenance.)
> >
> Again, let's see. The timer will be always programmed to fire at the
> most imminent replenishment event, so it seems to me that you need
> something that will tell you, when servicing replenishment X, what's the
> next time instant you want the timer to fire, to perform the next
> replenishment.
>
> Actually, the depleted queue you have know, can well become the
> replenishment queue (it will have to be kept sorted, though, I think).
> Whether you want to keep it as the tail of the actual runqueue, or split
> the two of them, it does not matter much, IMO.

Right. The runq and the depleted queue actually has the information of
the next replenish time for all VCPUs, after the depleted queue is
changed to be sorted.


Best regards,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-10 22:30     ` Dario Faggioli
@ 2015-06-13 20:33       ` Dagaen Golomb
  2015-06-16 17:07         ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-13 20:33 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

> No HTML, please.

Got it, sorry.

>>         And note that, when I say "timer", I mean an actual Xen timer,
>>         i.e.,
>>         those things that are started, stopped, and with a timer
>>         handling
>>         routine being called when they expire. For an example, you can
>>         have a
>>         look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>>
>>
>> I will look at this. However, the current solution is likely
>> functionally equivalent and, with only one timer and a single list
>> used to arm it, I'm not sure if using a Xen timer is necessary.
>>
> It is functionally equivalent, by chance (see the issue about calling
> runq_tickle() on current vcpus in my reply to Meng). The reason why a
> different approach is required is making the code look better, reducing
> (instead than increasing) the complexity of sched_rt.c, lowering the
> overhead caused by long running operations performed while holding the
> global scheduler spin lock, and improving scalability.
>
>> Do they incur any extra overhead or coarsen granularity?
>>
> "extra overhead or coarsen granularity" as compared to what? s_timer in
> schedule.c (which is what you're using) is one of them already!
>
> What I meant with "an actual timer" is that you should ad a new one, and
> move some of the stuff currently being done in rt_schedule() in its
> handling routine, rather than just adding a new queue of events to be
> serviced by abusing s_timer.

Okay, I was wondering what you mean by "an actual timer."

>
>>         Deciding between a) or b) isn't easy, without actually trying
>>         to
>>         implement them. I personally prefer b), and I think it would
>>         be a
>>         superior long term solution (see right below), but given te
>>         current
>>         architecture of the RTDS scheduler (e.g., the fact that it has
>>         a global
>>         runq, etc), I won't nack a), which would most likely be
>>         simpler.
>>
>>
>> I agree b) would may be better in the long run, but with the current
>> architecture a) is simpler. b) can be future work as the scheduler
>> evolves.
>>
> Sure. And in fact, I'm fine with a), if done properly.
>>
>>         With this patch, you're still using rt_schedule() to do both
>>         scheduling
>>         and replenishments, although you're now doing it at actual
>>         replenishment
>>         times, instead of checking for them every millisecond.
>
>>
>> I think once you understand that the timerq is not only
>> replenishments, but any event that could cause a branch is the
>> scheduler behavior, it becomes more palatable to have them
>> intertwined.
>>
> I got that, and I'm asking you to do it differently.
>
>> Really, the timerq and scheduling aren't as intertwined as they
>> appear, they are piratically isolated functionally. Its just that the
>> timerq is most naturally serviced during scheduling events when data
>> that effects scheduling decisions is changing. Its straightforward and
>> efficient.
>>
> Yeah, replenishments are 'naturally serviced' in a 'straightforward and
> efficient' way by looping on all runnable vcpus, in more than one place,
> from within super-hot paths, with the global scheduler spin lock held.
> Neat! :-/
>
> Oh, and that's before you introducing of another list to be taken care
> of, still under the same conditions. :-O

The paths are already hot, so adding a small amount of extra work is a small
percentage increase. I'm usually against this kind of thing, too, but separating
to another timer, while runnable independently, may actually be more work if it
needs to use some of the same maintenance behavior. I guess the main argument
against is maintainability.

>
>>         Also, the bits and pieces that you need to add in order to
>>         deal with
>>         this new queue is, really, making things even more complex
>>         than they are
>>         now, which is undesirable.
>
>> While it does add some complexity, I don't feel a single queue and
>> managing it is that much extra complexity.
>>
> But in fact the point of making the scheduler 'event driven' is to take
> advantage of the chances that this offers for getting stuff *out* of
> rt_schedule(), and the purpose is not "not adding that much extra
> complexity", is making it _simpler_.

I understand where you are coming from now. I was looking at is mostly
from the perspective of performance. This explains our differences.

>
>
>>         > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>         > index 7c39a9e..25f0458 100644
>>         > --- a/xen/common/sched_rt.c
>>         > +++ b/xen/common/sched_rt.c
>>
>>         > @@ -156,6 +160,7 @@ struct rt_vcpu {
>>         >      s_time_t cur_budget;        /* current budget */
>>         >      s_time_t last_start;        /* last start time */
>>         >      s_time_t cur_deadline;      /* current deadline for EDF
>>         */
>>         > +    s_time_t next_sched_needed; /* next time to make
>>         scheduling decision */
>>         >
>>         As an example of why I said that things should become simpler,
>>         and are
>>         instead becoming more complex: with my solution, you don't
>>         need anything
>>         like this. In fact, budget enforcement is already being done
>>         ok in
>>         rt_schedule(), so no need to do anything more/different about
>>         it.
>>         Replenishments should be programmed to fire at cur_deadline,
>>         so again,
>>         no need to add this new field, and multiplex its actual value
>>         between
>>         budget and deadline (which, yes, counts as being something
>>         rather
>>         complex for me! :-D).
>>
>>
>> As mentioned, the timerq is handling all events that could change the
>> scheduling decision, not just replenishments.
>>
> Yes, exactly, it's handling *too much* events. :-)
>
> For example, have a look at 'struct vcpu', in
> xen/include/xen/sched.h. It's got 3 timers in it:
>
>     struct timer     periodic_timer;
>     struct timer     singleshot_timer;
>     struct timer     poll_timer;    /* timeout for SCHEDOP_poll */
>
> It probably would have been possible to just use one, with a single and
> mixed event queue, as you're doing, a 1k lines handling routines, etc.
>
> Do you it it would have been easier or harder to implement, understand,
> debug and maintain? I bet on harder.

Point taken.

>
>> This tracks the earliest time anything cause this to be scheduled
>> differently, and could be extended if any more insights are found.
>> Budget enforcement is being done in rt_schedule but its being done by
>> this very list: a budget expiration is one possible event that would
>> require a vcpu reschedule.
>>
> OMG, 'extended'... You're thinking to actually put more stuff in
> there?!? :-O
>
> It's a rather key and already too long and complex critical section, so
> please, let's aim at shortening and making it simpler, i.e., at
> improving things, not making them worse.

This can again be explained by our goal mismatch. I see where you are
coming from now.

>>
>>         > +/*
>>         > + * Return how far the lowest time on the timerq (that is
>>         after NOW) is in the
>>         > + * future.
>>         > + * Lock must be grabbed before calling this.
>>         > + */
>>         > +static s_time_t __timerq_next(const struct scheduler *ops,
>>         s_time_t now)
>>         > +{
>>         > +    struct list_head *timerq = rt_timerq(ops);
>>         > +    struct list_head *iter, *tmp;
>>         > +
>>         > +    list_for_each_safe(iter, tmp, timerq)
>>         > +    {
>>         > +        struct rt_vcpu * iter_svc = __t_elem(iter);
>>         > +
>>         > +        if ( iter_svc->next_sched_needed > now )
>>         > +            return (iter_svc->next_sched_needed - now);
>>         > +        else
>>         > +            __t_remove(iter_svc);
>>         > +    }
>>         > +
>>         > +    return MAX_SCHEDULE;
>>         > +}
>>         >
>>         If using a timer, you can just get rid of items while, in the
>>         timer
>>         routine, you handle the event associated to them.
>>
>>         Also, why is MAX_SCHEDULE still there?
>>
>>
>> Honestly, events do not have to be removed here, but its done to
>> prevent walking a list of stale values to get to the newest one on the
>> next call. This is done more for performance. Their non-removal would
>> be functionally equivalent.
>>
> Of course you should remove the stale entries, I certainly was not
> arguing that the list should just grow indefinitely!
>
> Point is, again, that this is another list walk occurring in an hot
> path, with a big spin lock held. We want to get rid of such thing, not
> adding more of it.
>
>> With the timer alternative you mention, where would the timer events
>> and their data be held? I think that could be extra complexity, unless
>> I fail to understand the idea completely.
>>
> In an event queue like yours, of course. But you won't go through it
> and/or bookkeep it while in hot paths, with the scheduler lock held.
>
> See my email to Meng to have more details on what I have in mind, or
> fell free to ask more details.
>
>> MAX_SCHEDULE may not be required, but I have it there as an "in case."
>> For example, it will cause the scheduler to be called after
>> MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its
>> there to ensure progress in case of any bugs or unexpected behavior.
>>
> Wait, so, all this work, and then you still want to call rt_schedule()
> every millisecond, when there's nothing to do?!?! :-O
>
>>         That being said, in this specific case, you're moving up
>>         runq_tickle(),
>>         the purpose of which is to trigger a call to rt_schedule()
>>         (after going
>>         through the softirq machinery)... in order to call it in
>>         rt_schedule().
>>         That's pretty tricky, and is not how tickling should work.
>
>> It set up to only tickle if needed. I'm not sure if this is an issue,
>>
> It's wrong, AFAICT. See my reply to Meng and, please, comment by
> replying to it, if you've got anything to say about this, to make the
> discussion easier to follow.
>
> Regards,
> Dario

I understand what caused our mismatch now. I think option a) you
mentioned makes sense
given your values. I am going to look into the details of this
approach and get back with any
questions or concerns.

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-13 20:33       ` Dagaen Golomb
@ 2015-06-16 17:07         ` Dagaen Golomb
  2015-06-17  0:20           ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-16 17:07 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

I'm not replying inline because this is a more general response that
is not limited
to any particular comment.

Separating the replenishment from the scheduler may be problematic. The nature
of the gEDF scheduler is that when a priority changes it should be instantly
reflected. If replenishments were done by themsevles, then the scheduler may
decide to wait for some period of time, and in this same time period a
replenishment
could change what should be executing. Either the gEDF policy will be
violated or
the replenishment routine would have to notify the scheduler after any
replenishment, requiring some form of interaction and possibly more scheduler
invocations than the current structure. The other option is for the scheduler to
check for replenishments, as it does now. Without any interaction a violation of
the policy could ensue. The gEDF is not like the credit scheduler where there is
an accounting period where, during this time, policy may be violated
temporarily.
This model is much easier in the credit scheduler because there is a
fixed slice.
Imagine if we wanted an accounting period of 0 for the scheduler. The
only option
would be to recompute the replenishments before any scheduling and to run the
scheduler at exactly when these replenishments would occur. As in,
replenishments
and scheduling would have to be coalesced. This is essentially what
the current RTDS
patch is doing.

Further, with two timer routines we now need to deal with any ordering
or preemption
between them (or logic to prevent / order such) which I could argue is far more
complexity than having them done at once as it is now. Being able to
reason about
the execution with one thread in mind is easier for most people than
several threads.
If both are armed for the same time, which should execute? Scheduler
first before
a possibly applicable replenishment? Or replenishment always first?
Both with added
logic to enforce this and prevent preemption, of course.

Due to this, it is my belief that by the nature of the gEDF policy the
current solution is
better, mostly because it appears to actually be the least complex way that is
functionally correct. The gEDF policy just isn't well suited for
decoupling events, as
the events are highly dependent on one another, and particularly dependent at an
instantaneous timescale. It would be a hard pitch for a gEDF scheduler with a
similar "accounting period" where the gEDF policy could be violated. That is
blasphemy in the real-time world. Any other option for implementing the policy
correctly would include more headache due to a multiple execution mindset and
any enforcement and interaction logic.

Its also worth noting that the stereotypical textbook event-driven
model is as we have
now: a single event loop that handles events. In this case the scheduler is the
conceptually the main loop and this makes perfect sense: its deciding
what to run
(think of the running VCPUs as callback functions and the abstraction
falls into place
perfectly). The event loop needs some information (about replenishments) before
deciding, and collecting this would be part of the decode and decision
phase for an
event signal.

Not only is there a complexity issue, but as mentioned before this may be the
best performing option, making the most information available and the
best timing
decision possible. Adding a few extra cycles to a hot path, even with
a lock being
held, is worth it if the scheduler and context switching is done less.
I'm sure the
entire pre-change scheduler is much longer than the time we've added, and the
other model proposed with more timers may require just as much added time as
well as aforementioned complexity, or would suffer from less
intelligent decisions
(hence more execution and/or policy violation) as dependance and
cooperation between
the timers decreased.

For the above reasons I think you should reconsider the current
implementation, as it
appears it may be the least complex and easiest to reason about
solution. This is not
to say that the current patch doesn't have issues. We would still aim
to fix any of the
concerns about multiple softirqs or CPU tickles.

Let me know if I'm missing some key insight into how the behavior
could be implemented
correctly and beautifully using the multiple timer approach. I simply
don't see how it can
be done without heavy interaction and information sharing between them
which really
defeats the purpose.

Regards,
~Dagaen

On Sat, Jun 13, 2015 at 4:33 PM, Dagaen Golomb <dgolomb@seas.upenn.edu> wrote:
>> No HTML, please.
>
> Got it, sorry.
>
>>>         And note that, when I say "timer", I mean an actual Xen timer,
>>>         i.e.,
>>>         those things that are started, stopped, and with a timer
>>>         handling
>>>         routine being called when they expire. For an example, you can
>>>         have a
>>>         look, in sched_credit.c, at 'ticker' or 'master_ticker'.
>>>
>>>
>>> I will look at this. However, the current solution is likely
>>> functionally equivalent and, with only one timer and a single list
>>> used to arm it, I'm not sure if using a Xen timer is necessary.
>>>
>> It is functionally equivalent, by chance (see the issue about calling
>> runq_tickle() on current vcpus in my reply to Meng). The reason why a
>> different approach is required is making the code look better, reducing
>> (instead than increasing) the complexity of sched_rt.c, lowering the
>> overhead caused by long running operations performed while holding the
>> global scheduler spin lock, and improving scalability.
>>
>>> Do they incur any extra overhead or coarsen granularity?
>>>
>> "extra overhead or coarsen granularity" as compared to what? s_timer in
>> schedule.c (which is what you're using) is one of them already!
>>
>> What I meant with "an actual timer" is that you should ad a new one, and
>> move some of the stuff currently being done in rt_schedule() in its
>> handling routine, rather than just adding a new queue of events to be
>> serviced by abusing s_timer.
>
> Okay, I was wondering what you mean by "an actual timer."
>
>>
>>>         Deciding between a) or b) isn't easy, without actually trying
>>>         to
>>>         implement them. I personally prefer b), and I think it would
>>>         be a
>>>         superior long term solution (see right below), but given te
>>>         current
>>>         architecture of the RTDS scheduler (e.g., the fact that it has
>>>         a global
>>>         runq, etc), I won't nack a), which would most likely be
>>>         simpler.
>>>
>>>
>>> I agree b) would may be better in the long run, but with the current
>>> architecture a) is simpler. b) can be future work as the scheduler
>>> evolves.
>>>
>> Sure. And in fact, I'm fine with a), if done properly.
>>>
>>>         With this patch, you're still using rt_schedule() to do both
>>>         scheduling
>>>         and replenishments, although you're now doing it at actual
>>>         replenishment
>>>         times, instead of checking for them every millisecond.
>>
>>>
>>> I think once you understand that the timerq is not only
>>> replenishments, but any event that could cause a branch is the
>>> scheduler behavior, it becomes more palatable to have them
>>> intertwined.
>>>
>> I got that, and I'm asking you to do it differently.
>>
>>> Really, the timerq and scheduling aren't as intertwined as they
>>> appear, they are piratically isolated functionally. Its just that the
>>> timerq is most naturally serviced during scheduling events when data
>>> that effects scheduling decisions is changing. Its straightforward and
>>> efficient.
>>>
>> Yeah, replenishments are 'naturally serviced' in a 'straightforward and
>> efficient' way by looping on all runnable vcpus, in more than one place,
>> from within super-hot paths, with the global scheduler spin lock held.
>> Neat! :-/
>>
>> Oh, and that's before you introducing of another list to be taken care
>> of, still under the same conditions. :-O
>
> The paths are already hot, so adding a small amount of extra work is a small
> percentage increase. I'm usually against this kind of thing, too, but separating
> to another timer, while runnable independently, may actually be more work if it
> needs to use some of the same maintenance behavior. I guess the main argument
> against is maintainability.
>
>>
>>>         Also, the bits and pieces that you need to add in order to
>>>         deal with
>>>         this new queue is, really, making things even more complex
>>>         than they are
>>>         now, which is undesirable.
>>
>>> While it does add some complexity, I don't feel a single queue and
>>> managing it is that much extra complexity.
>>>
>> But in fact the point of making the scheduler 'event driven' is to take
>> advantage of the chances that this offers for getting stuff *out* of
>> rt_schedule(), and the purpose is not "not adding that much extra
>> complexity", is making it _simpler_.
>
> I understand where you are coming from now. I was looking at is mostly
> from the perspective of performance. This explains our differences.
>
>>
>>
>>>         > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>>         > index 7c39a9e..25f0458 100644
>>>         > --- a/xen/common/sched_rt.c
>>>         > +++ b/xen/common/sched_rt.c
>>>
>>>         > @@ -156,6 +160,7 @@ struct rt_vcpu {
>>>         >      s_time_t cur_budget;        /* current budget */
>>>         >      s_time_t last_start;        /* last start time */
>>>         >      s_time_t cur_deadline;      /* current deadline for EDF
>>>         */
>>>         > +    s_time_t next_sched_needed; /* next time to make
>>>         scheduling decision */
>>>         >
>>>         As an example of why I said that things should become simpler,
>>>         and are
>>>         instead becoming more complex: with my solution, you don't
>>>         need anything
>>>         like this. In fact, budget enforcement is already being done
>>>         ok in
>>>         rt_schedule(), so no need to do anything more/different about
>>>         it.
>>>         Replenishments should be programmed to fire at cur_deadline,
>>>         so again,
>>>         no need to add this new field, and multiplex its actual value
>>>         between
>>>         budget and deadline (which, yes, counts as being something
>>>         rather
>>>         complex for me! :-D).
>>>
>>>
>>> As mentioned, the timerq is handling all events that could change the
>>> scheduling decision, not just replenishments.
>>>
>> Yes, exactly, it's handling *too much* events. :-)
>>
>> For example, have a look at 'struct vcpu', in
>> xen/include/xen/sched.h. It's got 3 timers in it:
>>
>>     struct timer     periodic_timer;
>>     struct timer     singleshot_timer;
>>     struct timer     poll_timer;    /* timeout for SCHEDOP_poll */
>>
>> It probably would have been possible to just use one, with a single and
>> mixed event queue, as you're doing, a 1k lines handling routines, etc.
>>
>> Do you it it would have been easier or harder to implement, understand,
>> debug and maintain? I bet on harder.
>
> Point taken.
>
>>
>>> This tracks the earliest time anything cause this to be scheduled
>>> differently, and could be extended if any more insights are found.
>>> Budget enforcement is being done in rt_schedule but its being done by
>>> this very list: a budget expiration is one possible event that would
>>> require a vcpu reschedule.
>>>
>> OMG, 'extended'... You're thinking to actually put more stuff in
>> there?!? :-O
>>
>> It's a rather key and already too long and complex critical section, so
>> please, let's aim at shortening and making it simpler, i.e., at
>> improving things, not making them worse.
>
> This can again be explained by our goal mismatch. I see where you are
> coming from now.
>
>>>
>>>         > +/*
>>>         > + * Return how far the lowest time on the timerq (that is
>>>         after NOW) is in the
>>>         > + * future.
>>>         > + * Lock must be grabbed before calling this.
>>>         > + */
>>>         > +static s_time_t __timerq_next(const struct scheduler *ops,
>>>         s_time_t now)
>>>         > +{
>>>         > +    struct list_head *timerq = rt_timerq(ops);
>>>         > +    struct list_head *iter, *tmp;
>>>         > +
>>>         > +    list_for_each_safe(iter, tmp, timerq)
>>>         > +    {
>>>         > +        struct rt_vcpu * iter_svc = __t_elem(iter);
>>>         > +
>>>         > +        if ( iter_svc->next_sched_needed > now )
>>>         > +            return (iter_svc->next_sched_needed - now);
>>>         > +        else
>>>         > +            __t_remove(iter_svc);
>>>         > +    }
>>>         > +
>>>         > +    return MAX_SCHEDULE;
>>>         > +}
>>>         >
>>>         If using a timer, you can just get rid of items while, in the
>>>         timer
>>>         routine, you handle the event associated to them.
>>>
>>>         Also, why is MAX_SCHEDULE still there?
>>>
>>>
>>> Honestly, events do not have to be removed here, but its done to
>>> prevent walking a list of stale values to get to the newest one on the
>>> next call. This is done more for performance. Their non-removal would
>>> be functionally equivalent.
>>>
>> Of course you should remove the stale entries, I certainly was not
>> arguing that the list should just grow indefinitely!
>>
>> Point is, again, that this is another list walk occurring in an hot
>> path, with a big spin lock held. We want to get rid of such thing, not
>> adding more of it.
>>
>>> With the timer alternative you mention, where would the timer events
>>> and their data be held? I think that could be extra complexity, unless
>>> I fail to understand the idea completely.
>>>
>> In an event queue like yours, of course. But you won't go through it
>> and/or bookkeep it while in hot paths, with the scheduler lock held.
>>
>> See my email to Meng to have more details on what I have in mind, or
>> fell free to ask more details.
>>
>>> MAX_SCHEDULE may not be required, but I have it there as an "in case."
>>> For example, it will cause the scheduler to be called after
>>> MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its
>>> there to ensure progress in case of any bugs or unexpected behavior.
>>>
>> Wait, so, all this work, and then you still want to call rt_schedule()
>> every millisecond, when there's nothing to do?!?! :-O
>>
>>>         That being said, in this specific case, you're moving up
>>>         runq_tickle(),
>>>         the purpose of which is to trigger a call to rt_schedule()
>>>         (after going
>>>         through the softirq machinery)... in order to call it in
>>>         rt_schedule().
>>>         That's pretty tricky, and is not how tickling should work.
>>
>>> It set up to only tickle if needed. I'm not sure if this is an issue,
>>>
>> It's wrong, AFAICT. See my reply to Meng and, please, comment by
>> replying to it, if you've got anything to say about this, to make the
>> discussion easier to follow.
>>
>> Regards,
>> Dario
>
> I understand what caused our mismatch now. I think option a) you
> mentioned makes sense
> given your values. I am going to look into the details of this
> approach and get back with any
> questions or concerns.
>
> Regards,
> ~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-16 17:07         ` Dagaen Golomb
@ 2015-06-17  0:20           ` Dario Faggioli
  2015-06-17  5:24             ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  0:20 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 14992 bytes --]

On Tue, 2015-06-16 at 13:07 -0400, Dagaen Golomb wrote:
> I'm not replying inline because this is a more general response that
> is not limited
> to any particular comment.
> 
That's fine.

Thanks for this actually... I love discussing these things, it makes me
remind the time when I was doing these stuff myself, and makes me feel
young! :-P

> Separating the replenishment from the scheduler may be problematic. The nature
> of the gEDF scheduler is that when a priority changes it should be instantly
> reflected. 
>
'instantly'. Is there such a thing? :-O

> If replenishments were done by themsevles, then the scheduler may
> decide to wait for some period of time, and in this same time period a
> replenishment
> could change what should be executing. 
>
Well, the scheduler then would better *not* decide to wait for any
period of time. It better act like this: as soon as a replenishment
happens, and a new deadline is assigned to a vCPU, put the vCPU in the
runqueue, in the proper position; and if such a newly replenished vCPU
should now preempt any other, currently running, vCPU, 'tickle' the pCPU
in question and let it reschedule, and pick up the newly replenished
vCPU.

> Either the gEDF policy will be
> violated or
> the replenishment routine would have to notify the scheduler after any
> replenishment, requiring some form of interaction and possibly more scheduler
> invocations than the current structure. 
>
Obviously not. In fact, it's your 'current structure' that already
invokes the scheduler for any replenishment. Doing more than that, would
be really hard (impossible, I should say).

I fact, it's the s_timer that you're making fire for any replenishment,
and replenishments happen inside rt_schedule() itself. This is
_the_definition_ of invoking the scheduler!

OTOH, by using a (some) replenishment timer(s), the scheduler *may*
*potentially* be invoked for any replenishment, yes, but it may also
very well not. In fact, if even after a replenishment, the new deadline
of the replenished vCPU is is father than the deadlines of all the
currently running vCPUs, there's no need to tickle any pCPU, no need to
call the scheduler.

BTW, from what you say, it seems to me that we are in agreement: we
_do_not_ want to call the scheduler upon any replenishment. Do as I say
and we'll achieve that.

> The other option is for the scheduler to
> check for replenishments, as it does now. 
>
Yeah, the wrong option. :-D :-D

> Without any interaction a violation of
> the policy could ensue. The gEDF is not like the credit scheduler where there is
> an accounting period where, during this time, policy may be violated
> temporarily.
>
Yep, I do have an idea of what EDF is.

> Further, with two timer routines we now need to deal with any ordering
> or preemption
> between them (or logic to prevent / order such) which I could argue is far more
> complexity than having them done at once as it is now. 
>
Mmm... I think I'm starting to lose you. Ordering between two timers? I
don't see what you mean.

Allow me, though, to walk through the behavior of your current approach.
I've done this already in a previous email, but no big deal re-doing it.
Oh, actually, the current approach is buggy, as you tickle the wrong
vCPU, but let's forget about that (let's assume it's fixed).

Let's say that time for a replenishment to vCPU w has come.

 on pCPU X:
 . timer interrupt
 .   TIMER_SOFTIRQ raised
 .     process softirqs
 .       SCHEDULE_SOFTIRQ raised
 .         process softirqs
 .           rt_schedule()
 .             [spin_lock]
 .             burn_budget(scurr)
 .             __repl_update()  (goes through all runnable vcpus!!)
 .             snext = __runq_pick() -> says vCPU w should run on pCPU Y
 .             runq_tickle(snext) ----> tickle pCPU Y [*]
 .             [spin_unlock]
 .
 on pCPU Y:
 . process softirqs
 .   SCHEDULE_SOFTIRQ
 .     rt_schedule()
 .       [spin_lock]
 .       burn_budget(scurr)
 .       __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
 .       snext = runq_pick() ----------> pick vCPU w
 .       [spin_unlock]
 .
 context switch: vCPU w now runs on pCPU Y

So, you forced pCPU X through a full execution of rt_schedule(), with
the spinlock, the scan of runqueue and depleted queue, and everything,
just for the sake of figuring out that pCPU Y should reschedule. Then
you (as soon as practical) actually reschedule on pCPU Y and context
switch in favour of vCPU w, enacting EDF.

That's 2 scheduler invocations, and rt_schedule() is still ugly and
complex to read, maintain and improve.

[*] and don't forget that this needs fixing, because right now it's just
incorrect, and (just speculating) making runq_pick() behave in the way
you want, may not be super-simple.

Oh, I was almost forgetting! Wonder what happens in the time interval
between when __repl_update() is called (from inside rt_schedule()) on
pCPU X, and the actual time instant of the context switch on pCPU Y?
Yeah, right, what happens is that you're violating EDF. :-/

This can be accounted as a form of overhead introduced by the
implementation, I guess. In this specific case, it's due to the  fact
that you can't, from pCPU X, just change *instantly* what's running on
pCPU Y. No, you've got to send an IPI, then Y has to react and it has to
schedule, etc. This is of course dependant on the architecture of the
scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt
this, also does.

So, see? Transient  violation of EDF is just there, no matter the
approach!

Let's look at the timers way:

 on pCPU X:
 . timer interrupt
 .   TIMER_SOFTIRQ raised
 .     process softirqs
 .       replenishment_timer_handler()
 .         [spin_lock]
 .           <for_each_depleted_vcpu(i, i.repl_time < NOW()) {
 .              replenish(i) ---------> replenish vCPU w
 .              runq_tickle(i) -------> tickle pCPU Y
 .            }>
 .         [spin_lock]
 .
 on pCPU Y:
 . process softirqs
 .   SCHEDULE_SOFTIRQ
 .     rt_schedule()
 .       [spin_lock]
 .       burn_budget(scurr)
 .       snext = runq_pick() ----------> pick vCPU w
 .       [spin_unlock]
 .
 context switch: vCPU x now runs on pCPU Y

So this means only 1 invocation of the scheduler, and only on the pCPU
that actually needs to reschedule. There's some overhead due to the
timer, of course, but, notably, this allows for shorter critical
sections, not to mention better looking and easier to maintain code.

Are we still violating EDF? Of course we are, between the replenishment
and context switch time, as above. That's unavoidable, I'm afraid.

Summarizing, the two solutions are on par wrt temporary violation of the
theoretical algorithm, but the timer based approach has loads of other
benefits, so let's go with timers. :-)

> If both are armed for the same time, which should execute? Scheduler
> first before
> a possibly applicable replenishment? Or replenishment always first?
> Both with added
> logic to enforce this and prevent preemption, of course.
> 
I really don't see what you mean here. There won't be anything like that
to take care of. Xen offers timers as an abstraction, and deals with
them itself. You only need to take care of properly serializing
rt_schedule() and the timer handling routine, for the code sections that
require that.

> Due to this, it is my belief that by the nature of the gEDF policy the
> current solution is
> better, mostly because it appears to actually be the least complex way that is
> functionally correct. 
>
Not really, no. I wouldn't be convinced of this, even if what you have
right now were functionally correct.

> The gEDF policy just isn't well suited for
> decoupling events, as
> the events are highly dependent on one another, and particularly dependent at an
> instantaneous timescale. 
>
And here it come 'instantaneous timescale' again.

No, sorry, but I don't buy this. In fact, why does it not apply to
vCPUs' wakeups? I think it does, conceptually... So, should we move the
wakeup logic inside rt_schedule() too?

> It would be a hard pitch for a gEDF scheduler with a
> similar "accounting period" where the gEDF policy could be violated. 
>
There's no accounting period of any sort. It's called overhead!

What we need to do is to try to keep it as small as possible. And I'm
quite sure that an effective way to do so is to keep critical sections
short, especially when protected by (potentially) highly contented spin
locks. RTDS, currently, suffer from poor scalability (and, I suspect,
poor latency as well, at least under pressure), and one of the reasons
why the work you're doing is interesting is that it can alleviate
this... if done with that in mind, of course.

> That is
> blasphemy in the real-time world. 
>
Blasphemy, that's a nice one! :-D :-D

Well, I've never been good at religions, I have to admit. So, yes, I can
live with being called blasphemous, I guess. :-)

> Its also worth noting that the stereotypical textbook event-driven
> model is as we have
> now: a single event loop that handles events. In this case the scheduler is the
> conceptually the main loop and this makes perfect sense: its deciding
> what to run
> (think of the running VCPUs as callback functions and the abstraction
> falls into place
> perfectly). The event loop needs some information (about replenishments) before
> deciding, and collecting this would be part of the decode and decision
> phase for an
> event signal.
> 
Talking about 'stereotypes', I don't have any textbook describing an
event-driven model at hand right now. However, you RT-Xen people like
and use LitmusRT, don't you? Well, you're doing the right thing, because
it's a great piece of software!

If you're interested, have a look in there. Spoiler: you'll find a lot
of timers. :-D :-D

More seriously, it's of course rather different, as Linux is not Xen,
but since you're looking for stereotypes:

 - Linux (or at lease LitmusRT patch 2014.2, i.e., what I'm looking at
   right now) has ticks. This means a timer fires every X milliseconds
   (on every CPU), and task_tick_litmus() is run as a consequence. Such
   function does not (at all!) invoke the scheduler directly, it much
   rather just checks whether doing so is necessary, in this case
   because of some task having exhausted its budget. If it happened, it
   calls litmus_reschedule_local() (see below). I know, this is budget
   enforcement, not replenishment, but I think it works as a good
   example of my point about using timers.

 - Actually, budget enforcement can be done, in LitmusRT, in two ways.
   One is tick based (described above), the other, called 'precise', is
   timer based. To see that in action, check how the enforcement timer
   is handled, e.g., in update_enforcement_timer(). Look at
   on_enforcement_timeout(), and find that it also _does_not_ schedule.
   It again just asks for the scheduler to be invoked as soon as
   practical, via (again) litmus_reschedule_local().

 - Finally, a 'job release', in LitmusRT terminology, is probably what
   is most close to a budget replenishment for us here. In fact, when a
   new RT job is released, it's given full budget, and its scheduling
   parameters (deadline, in most cases) are updated accordingly, like it
   happens, for us, with replenishments.
   Check, therefore, how that happens in add_release() and
   __add_release(). You'll find a call to arm_release_timer() which
   calls, when firing, on_release_timer(). From there, go, e.g., to
   gsnedf_release_jobs(), and follow the call chain through
   check_for_preemptions() --> preempt() --> preempt_if_preemptable()
   --> litmus_reschedule(), which then calls either
   set_tsk_need_resched(current) or smp_send_reschedule(cpu).

litmus_reschedule_local(), and similar functions, do something very
similar to our tickling, i.e., they ask a specific CPU to go through the
scheduler. When this happens via set_tsk_need_resched(current) it's
like, in Xen, tickling the pCPU we're on. When it happens via
smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU
cpu.

Now, although some years have passed, I've interacted quite a bit with
LitmusRT folks. They're very smart, and usually very good at what they
do, and I'd be surprised to find them guilty of that much real-time
blasphemy. :-P

You also may know that there is an EDF implementation in mainline Linux,
and you may would like to go have a look there too. I'm not including
any example from there because I'm one of the  main authors of it, so
it's pretty obvious that it uses the approach I think it's best (and,
more important, it would be rather inelegant to cite myself! :-D).

Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.

> Not only is there a complexity issue, but as mentioned before this may be the
> best performing option, making the most information available and the
> best timing
> decision possible. Adding a few extra cycles to a hot path, even with
> a lock being
> held, is worth it if the scheduler and context switching is done less.
>
Right, so let's use timers, they require less or equal scheduler
invocations wrt to... Always invoking the scheduler!! (Not sure why you
mention the number of context switches, that's independent on which of
the two approaches you  choose.)

> For the above reasons I think you should reconsider the current
> implementation, as it
> appears it may be the least complex and easiest to reason about
> solution. 
>
Well, no, sorry. :-(

I mean, I appreciate that what you have now (once fixed), may be
effective in avoiding some unnecessary invocation of the scheduler, with
respect to what we currently have in the repo, so I'm not saying it
can't be seen as an improvement. It's a "nice hack", actually.

However, if we're  still aiming at making this scheduler a first class
citizen within Xen, we're interested in less hacks, rather than in more
--no matter how nice they are. So, with that in mind, it would be a step
in the wrong direction, IMO.

> Let me know if I'm missing some key insight into how the behavior
> could be implemented
> correctly and beautifully using the multiple timer approach.
>
well, I tried. I really did.

> I simply
> don't see how it can
> be done without heavy interaction and information sharing between them
> which really
> defeats the purpose.
> 
No, it doesn't.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  0:20           ` Dario Faggioli
@ 2015-06-17  5:24             ` Dagaen Golomb
  2015-06-17  5:45               ` Meng Xu
  2015-06-17  8:27               ` Dario Faggioli
  0 siblings, 2 replies; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-17  5:24 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

> Thanks for this actually... I love discussing these things, it makes me
> remind the time when I was doing these stuff myself, and makes me feel
> young! :-P

And thank you for the very detailed and well-thought response!

>
>> Separating the replenishment from the scheduler may be problematic. The nature
>> of the gEDF scheduler is that when a priority changes it should be instantly
>> reflected.
>>
> 'instantly'. Is there such a thing? :-O

I meant no accounting period and implicitly during scheduler functions
it may happen,
but from your reply I know you understand this :P

>
>> If replenishments were done by themsevles, then the scheduler may
>> decide to wait for some period of time, and in this same time period a
>> replenishment
>> could change what should be executing.
>>
> Well, the scheduler then would better *not* decide to wait for any
> period of time. It better act like this: as soon as a replenishment
> happens, and a new deadline is assigned to a vCPU, put the vCPU in the
> runqueue, in the proper position; and if such a newly replenished vCPU
> should now preempt any other, currently running, vCPU, 'tickle' the pCPU
> in question and let it reschedule, and pick up the newly replenished
> vCPU.

Yes, this is an option. However, I thought this would actually be an
option you would
not like. For the replenishment routine to know when to call the
scheduler and when
not to, it basically has to perform the the scheduler's logic, at
which point if there is a
change... why doesn't it just do it (i.e., be the scheduler) instead
of adding logic and
possibly overhead to call it.

>
>> Either the gEDF policy will be
>> violated or
>> the replenishment routine would have to notify the scheduler after any
>> replenishment, requiring some form of interaction and possibly more scheduler
>> invocations than the current structure.
>>
> Obviously not. In fact, it's your 'current structure' that already
> invokes the scheduler for any replenishment. Doing more than that, would
> be really hard (impossible, I should say).
>
> I fact, it's the s_timer that you're making fire for any replenishment,
> and replenishments happen inside rt_schedule() itself. This is
> _the_definition_ of invoking the scheduler!
>
> OTOH, by using a (some) replenishment timer(s), the scheduler *may*
> *potentially* be invoked for any replenishment, yes, but it may also
> very well not. In fact, if even after a replenishment, the new deadline
> of the replenished vCPU is is father than the deadlines of all the
> currently running vCPUs, there's no need to tickle any pCPU, no need to
> call the scheduler.
>
> BTW, from what you say, it seems to me that we are in agreement: we
> _do_not_ want to call the scheduler upon any replenishment. Do as I say
> and we'll achieve that.

I'm glad we are in agreement on that :)
Also, I'd like to point out that when I said things like "make more intelligent"
decision handling, I meant exactly things like the above: using as much
information as possible to prevent scheduler invocations. The current patch
doesn't reschedule for any replenishment. Firstly, it doesn't even look at
the run queue: replenishments here can only happen on miss.

>
>> The other option is for the scheduler to
>> check for replenishments, as it does now.
>>
> Yeah, the wrong option. :-D :-D
>
>> Without any interaction a violation of
>> the policy could ensue. The gEDF is not like the credit scheduler where there is
>> an accounting period where, during this time, policy may be violated
>> temporarily.
>>
> Yep, I do have an idea of what EDF is.
>
>> Further, with two timer routines we now need to deal with any ordering
>> or preemption
>> between them (or logic to prevent / order such) which I could argue is far more
>> complexity than having them done at once as it is now.
>>
> Mmm... I think I'm starting to lose you. Ordering between two timers? I
> don't see what you mean.
>
> Allow me, though, to walk through the behavior of your current approach.
> I've done this already in a previous email, but no big deal re-doing it.
> Oh, actually, the current approach is buggy, as you tickle the wrong
> vCPU, but let's forget about that (let's assume it's fixed).
>
> Let's say that time for a replenishment to vCPU w has come.
>
>  on pCPU X:
>  . timer interrupt
>  .   TIMER_SOFTIRQ raised
>  .     process softirqs
>  .       SCHEDULE_SOFTIRQ raised
>  .         process softirqs
>  .           rt_schedule()
>  .             [spin_lock]
>  .             burn_budget(scurr)
>  .             __repl_update()  (goes through all runnable vcpus!!)
>  .             snext = __runq_pick() -> says vCPU w should run on pCPU Y
>  .             runq_tickle(snext) ----> tickle pCPU Y [*]
>  .             [spin_unlock]
>  .
>  on pCPU Y:
>  . process softirqs
>  .   SCHEDULE_SOFTIRQ
>  .     rt_schedule()
>  .       [spin_lock]
>  .       burn_budget(scurr)
>  .       __repl_update()  (goes through all runnable vcpus!! AGAIN!!)
>  .       snext = runq_pick() ----------> pick vCPU w
>  .       [spin_unlock]
>  .
>  context switch: vCPU w now runs on pCPU Y
>
> So, you forced pCPU X through a full execution of rt_schedule(), with
> the spinlock, the scan of runqueue and depleted queue, and everything,
> just for the sake of figuring out that pCPU Y should reschedule. Then
> you (as soon as practical) actually reschedule on pCPU Y and context
> switch in favour of vCPU w, enacting EDF.
>
> That's 2 scheduler invocations, and rt_schedule() is still ugly and
> complex to read, maintain and improve.
>
> [*] and don't forget that this needs fixing, because right now it's just
> incorrect, and (just speculating) making runq_pick() behave in the way
> you want, may not be super-simple.
>
> Oh, I was almost forgetting! Wonder what happens in the time interval
> between when __repl_update() is called (from inside rt_schedule()) on
> pCPU X, and the actual time instant of the context switch on pCPU Y?
> Yeah, right, what happens is that you're violating EDF. :-/
>
> This can be accounted as a form of overhead introduced by the
> implementation, I guess. In this specific case, it's due to the  fact
> that you can't, from pCPU X, just change *instantly* what's running on
> pCPU Y. No, you've got to send an IPI, then Y has to react and it has to
> schedule, etc. This is of course dependant on the architecture of the
> scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt
> this, also does.
>
> So, see? Transient  violation of EDF is just there, no matter the
> approach!

Of course, violation during the scheduler is possible for any implementation :(

>
> Let's look at the timers way:
>
>  on pCPU X:
>  . timer interrupt
>  .   TIMER_SOFTIRQ raised
>  .     process softirqs
>  .       replenishment_timer_handler()
>  .         [spin_lock]
>  .           <for_each_depleted_vcpu(i, i.repl_time < NOW()) {
>  .              replenish(i) ---------> replenish vCPU w
>  .              runq_tickle(i) -------> tickle pCPU Y
>  .            }>
>  .         [spin_lock]
>  .
>  on pCPU Y:
>  . process softirqs
>  .   SCHEDULE_SOFTIRQ
>  .     rt_schedule()
>  .       [spin_lock]
>  .       burn_budget(scurr)
>  .       snext = runq_pick() ----------> pick vCPU w
>  .       [spin_unlock]
>  .
>  context switch: vCPU x now runs on pCPU Y
>
> So this means only 1 invocation of the scheduler, and only on the pCPU
> that actually needs to reschedule. There's some overhead due to the
> timer, of course, but, notably, this allows for shorter critical
> sections, not to mention better looking and easier to maintain code.
>
> Are we still violating EDF? Of course we are, between the replenishment
> and context switch time, as above. That's unavoidable, I'm afraid.
>
> Summarizing, the two solutions are on par wrt temporary violation of the
> theoretical algorithm, but the timer based approach has loads of other
> benefits, so let's go with timers. :-)
>
>> If both are armed for the same time, which should execute? Scheduler
>> first before
>> a possibly applicable replenishment? Or replenishment always first?
>> Both with added
>> logic to enforce this and prevent preemption, of course.
>>
> I really don't see what you mean here. There won't be anything like that
> to take care of. Xen offers timers as an abstraction, and deals with
> them itself. You only need to take care of properly serializing
> rt_schedule() and the timer handling routine, for the code sections that
> require that.

This is good to know, thanks. The question would then change to how does
Xen handle such an occurrence, but I think this is irrelevant now that I know
which "side" you had in mind - that is, that the replenishment handler would
decide when the scheduler should be invoked.

>
>> Due to this, it is my belief that by the nature of the gEDF policy the
>> current solution is
>> better, mostly because it appears to actually be the least complex way that is
>> functionally correct.
>>
> Not really, no. I wouldn't be convinced of this, even if what you have
> right now were functionally correct.
>
>> The gEDF policy just isn't well suited for
>> decoupling events, as
>> the events are highly dependent on one another, and particularly dependent at an
>> instantaneous timescale.
>>
> And here it come 'instantaneous timescale' again.
>
> No, sorry, but I don't buy this. In fact, why does it not apply to
> vCPUs' wakeups? I think it does, conceptually... So, should we move the
> wakeup logic inside rt_schedule() too?
>
>> It would be a hard pitch for a gEDF scheduler with a
>> similar "accounting period" where the gEDF policy could be violated.
>>
> There's no accounting period of any sort. It's called overhead!

This part was referring to if we had two timers without extensive communication,
cooperation, or information/logic sharing.

>
> What we need to do is to try to keep it as small as possible. And I'm
> quite sure that an effective way to do so is to keep critical sections
> short, especially when protected by (potentially) highly contented spin
> locks. RTDS, currently, suffer from poor scalability (and, I suspect,
> poor latency as well, at least under pressure), and one of the reasons
> why the work you're doing is interesting is that it can alleviate
> this... if done with that in mind, of course.
>
>> That is
>> blasphemy in the real-time world.
>>
> Blasphemy, that's a nice one! :-D :-D
>
> Well, I've never been good at religions, I have to admit. So, yes, I can
> live with being called blasphemous, I guess. :-)

You get the complement of blasphemy, I get that of "nice hack" :)

>
>> Its also worth noting that the stereotypical textbook event-driven
>> model is as we have
>> now: a single event loop that handles events. In this case the scheduler is the
>> conceptually the main loop and this makes perfect sense: its deciding
>> what to run
>> (think of the running VCPUs as callback functions and the abstraction
>> falls into place
>> perfectly). The event loop needs some information (about replenishments) before
>> deciding, and collecting this would be part of the decode and decision
>> phase for an
>> event signal.
>>
> Talking about 'stereotypes', I don't have any textbook describing an
> event-driven model at hand right now. However, you RT-Xen people like
> and use LitmusRT, don't you? Well, you're doing the right thing, because
> it's a great piece of software!
>
> If you're interested, have a look in there. Spoiler: you'll find a lot
> of timers. :-D :-D
>
> More seriously, it's of course rather different, as Linux is not Xen,
> but since you're looking for stereotypes:
>
>  - Linux (or at lease LitmusRT patch 2014.2, i.e., what I'm looking at
>    right now) has ticks. This means a timer fires every X milliseconds
>    (on every CPU), and task_tick_litmus() is run as a consequence. Such
>    function does not (at all!) invoke the scheduler directly, it much
>    rather just checks whether doing so is necessary, in this case
>    because of some task having exhausted its budget. If it happened, it
>    calls litmus_reschedule_local() (see below). I know, this is budget
>    enforcement, not replenishment, but I think it works as a good
>    example of my point about using timers.
>
>  - Actually, budget enforcement can be done, in LitmusRT, in two ways.
>    One is tick based (described above), the other, called 'precise', is
>    timer based. To see that in action, check how the enforcement timer
>    is handled, e.g., in update_enforcement_timer(). Look at
>    on_enforcement_timeout(), and find that it also _does_not_ schedule.
>    It again just asks for the scheduler to be invoked as soon as
>    practical, via (again) litmus_reschedule_local().
>
>  - Finally, a 'job release', in LitmusRT terminology, is probably what
>    is most close to a budget replenishment for us here. In fact, when a
>    new RT job is released, it's given full budget, and its scheduling
>    parameters (deadline, in most cases) are updated accordingly, like it
>    happens, for us, with replenishments.
>    Check, therefore, how that happens in add_release() and
>    __add_release(). You'll find a call to arm_release_timer() which
>    calls, when firing, on_release_timer(). From there, go, e.g., to
>    gsnedf_release_jobs(), and follow the call chain through
>    check_for_preemptions() --> preempt() --> preempt_if_preemptable()
>    --> litmus_reschedule(), which then calls either
>    set_tsk_need_resched(current) or smp_send_reschedule(cpu).
>
> litmus_reschedule_local(), and similar functions, do something very
> similar to our tickling, i.e., they ask a specific CPU to go through the
> scheduler. When this happens via set_tsk_need_resched(current) it's
> like, in Xen, tickling the pCPU we're on. When it happens via
> smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU
> cpu.
>
> Now, although some years have passed, I've interacted quite a bit with
> LitmusRT folks. They're very smart, and usually very good at what they
> do, and I'd be surprised to find them guilty of that much real-time
> blasphemy. :-P
>
> You also may know that there is an EDF implementation in mainline Linux,
> and you may would like to go have a look there too. I'm not including
> any example from there because I'm one of the  main authors of it, so
> it's pretty obvious that it uses the approach I think it's best (and,
> more important, it would be rather inelegant to cite myself! :-D).
>
> Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.

Thanks for the history and background lesson here :)

>
>> Not only is there a complexity issue, but as mentioned before this may be the
>> best performing option, making the most information available and the
>> best timing
>> decision possible. Adding a few extra cycles to a hot path, even with
>> a lock being
>> held, is worth it if the scheduler and context switching is done less.
>>
> Right, so let's use timers, they require less or equal scheduler
> invocations wrt to... Always invoking the scheduler!! (Not sure why you
> mention the number of context switches, that's independent on which of
> the two approaches you  choose.)
>
>> For the above reasons I think you should reconsider the current
>> implementation, as it
>> appears it may be the least complex and easiest to reason about
>> solution.
>>
> Well, no, sorry. :-(
>
> I mean, I appreciate that what you have now (once fixed), may be
> effective in avoiding some unnecessary invocation of the scheduler, with
> respect to what we currently have in the repo, so I'm not saying it
> can't be seen as an improvement. It's a "nice hack", actually.

"nice hack." heh, heh, thanks :P

>
> However, if we're  still aiming at making this scheduler a first class
> citizen within Xen, we're interested in less hacks, rather than in more
> --no matter how nice they are. So, with that in mind, it would be a step
> in the wrong direction, IMO.
>
>> Let me know if I'm missing some key insight into how the behavior
>> could be implemented
>> correctly and beautifully using the multiple timer approach.
>>
> well, I tried. I really did.
>
>> I simply
>> don't see how it can
>> be done without heavy interaction and information sharing between them
>> which really
>> defeats the purpose.
>>
> No, it doesn't.

Ok this last line is the zinger.

Almost this entire email was built on the premise that you would NOT like the
idea of the replenishment handler having basically the same decision logic
as the scheduler and then tickling the pCPU to pick up the new vCPU. Actually,
with it done this way, I would have a hard time calling the
tickle-invoked method
the "scheduler." It would be more like the mover, with the
replenishment function
being essentially the scheduler. In this case, I'm not too sure
actually how much
different this would be from what we have now. It feels like, from
what you want,
that we could get the same behavior by modifying rt_schedule to do
replenishments
first, then check if a reschedule is needed (these two parts would be in this
proposed replenishment timer routine) and the perform any move if necessary. I
know this isn't exactly what you want, but that sounds close right?

But the scheduler will have to decide which to move in, so that logic
is done twice.
Also, if these are done back-to-back and require the locks, isn't it
really the same
as having one long hot path? If you want maintainability, couldn't we just do
replenishment then schedule and move (if necessary) in one timer (the
one we have
now) and move them to functions. It seems this can be done with one
timer neatly.

So here's my proposal, lets see if it fits what you want:
1.) The scheduler does not do any timing,
2.) replenishments are scheduled via timer at each [next]
replenishment period. Each
replenishment resorts the replenished vCPUs, and if any of the first
#CPUs in the
runq change, we tickle a pCPU for each change

In this case, we can use one timer. We could use the current one as a
replenishment
timer, and make it so rt_schedule isn't the default invoked method.

Does this sound similar to what you are suggesting? I have to ask
because I thought
you wouldn't like the idea, and its not really *that* far off from
what we have now, Its
a little restructuring so that replenishments occur before any
scheduling activity and
the handler checks if switching is needed (basically acting as the
scheduler) and then
calls tickle. Sounds like what you had in mind?

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:24             ` Dagaen Golomb
@ 2015-06-17  5:45               ` Meng Xu
  2015-06-17  6:03                 ` Dagaen Golomb
  2015-06-17  9:20                 ` Dario Faggioli
  2015-06-17  8:27               ` Dario Faggioli
  1 sibling, 2 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-17  5:45 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

Hi Dagaen,

I just comment on the summary of scheduler design you proposed at the
end of the email. I'm looking forward to Dario's more insightful
reply.
>
>
> >
> >> I simply
> >> don't see how it can
> >> be done without heavy interaction and information sharing between them
> >> which really
> >> defeats the purpose.
> >>
> > No, it doesn't.
>
> Ok this last line is the zinger.
>
> Almost this entire email was built on the premise that you would NOT like the
> idea of the replenishment handler having basically the same decision logic
> as the scheduler and then tickling the pCPU to pick up the new vCPU. Actually,
> with it done this way, I would have a hard time calling the
> tickle-invoked method
> the "scheduler." It would be more like the mover, with the
> replenishment function
> being essentially the scheduler. In this case, I'm not too sure
> actually how much
> different this would be from what we have now. It feels like, from
> what you want,
> that we could get the same behavior by modifying rt_schedule to do
> replenishments
> first, then check if a reschedule is needed (these two parts would be in this
> proposed replenishment timer routine) and the perform any move if necessary. I
> know this isn't exactly what you want, but that sounds close right?
>
> But the scheduler will have to decide which to move in, so that logic
> is done twice.
> Also, if these are done back-to-back and require the locks, isn't it
> really the same
> as having one long hot path? If you want maintainability, couldn't we just do
> replenishment then schedule and move (if necessary) in one timer (the
> one we have
> now) and move them to functions. It seems this can be done with one
> timer neatly.
>
> So here's my proposal, lets see if it fits what you want:


I will leave this to Dario to answer if it fits what he wants. :-P


>
> 1.) The scheduler does not do any timing,


Not really. The rt_schedule does the budget enforcement. When a
current VCPU runs out of budget, the rt_schedule will be invoked by a
timer (you can refer to the schedule function in xen/sched/schedule.c
to have a look how the timer is armed/disarmed.). When the rt_schedule
is invoked, it needs to:
a) update the budget of the current running VCPU and move it from runq
to depleted queue;
b) pick the current highest VCPU from runq and return it as the snext VCPU.

So scheduling logic is still  involved in the rt_schedule function.

>
> 2.) replenishments are scheduled via timer at each [next]
> replenishment period. Each
> replenishment resorts the replenished vCPUs, and if any of the first
> #CPUs in the
> runq change, we tickle a pCPU for each change


This is right.

>
>
> In this case, we can use one timer.


We actually have two: one for budget enforcement and the other for
budget replenishment.


>
> We could use the current one as a
> replenishment
> timer, and make it so rt_schedule isn't the default invoked method.
>
> Does this sound similar to what you are suggesting?


I don't think so, but I will leave it for Dario's for his opinion.

In Dario's suggestion, you just simply remove the update_budget
function out of rt_schedule. This is because budget enforcement, which
is the work of rt_schedule, does not naturelly involves budget
replenishment.

In order to achieve budget replenishment, we need another timer to
invoke another function (budget_replenish function), that is dedicated
to that.

>
> I have to ask
> because I thought
> you wouldn't like the idea,


I guess Dario won't like this idea. :-P (I'm kidding, but it should be
the case.)


>
> and its not really *that* far off from
> what we have now, Its
> a little restructuring so that replenishments occur before any
> scheduling activity and
> the handler checks if switching is needed (basically acting as the
> scheduler) and then
> calls tickle. Sounds like what you had in mind?


Thanks and best regards,

Meng
-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:45               ` Meng Xu
@ 2015-06-17  6:03                 ` Dagaen Golomb
  2015-06-17  6:09                   ` Meng Xu
  2015-06-17  9:20                 ` Dario Faggioli
  1 sibling, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-17  6:03 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 4501 bytes --]

Thanks for the reply, budget enforcement in the scheduler timer makes
sense. I think I have an idea of what he wants done now.

~Dagaen
On Jun 17, 2015 1:45 AM, "Meng Xu" <xumengpanda@gmail.com> wrote:

> Hi Dagaen,
>
> I just comment on the summary of scheduler design you proposed at the
> end of the email. I'm looking forward to Dario's more insightful
> reply.
> >
> >
> > >
> > >> I simply
> > >> don't see how it can
> > >> be done without heavy interaction and information sharing between them
> > >> which really
> > >> defeats the purpose.
> > >>
> > > No, it doesn't.
> >
> > Ok this last line is the zinger.
> >
> > Almost this entire email was built on the premise that you would NOT
> like the
> > idea of the replenishment handler having basically the same decision
> logic
> > as the scheduler and then tickling the pCPU to pick up the new vCPU.
> Actually,
> > with it done this way, I would have a hard time calling the
> > tickle-invoked method
> > the "scheduler." It would be more like the mover, with the
> > replenishment function
> > being essentially the scheduler. In this case, I'm not too sure
> > actually how much
> > different this would be from what we have now. It feels like, from
> > what you want,
> > that we could get the same behavior by modifying rt_schedule to do
> > replenishments
> > first, then check if a reschedule is needed (these two parts would be in
> this
> > proposed replenishment timer routine) and the perform any move if
> necessary. I
> > know this isn't exactly what you want, but that sounds close right?
> >
> > But the scheduler will have to decide which to move in, so that logic
> > is done twice.
> > Also, if these are done back-to-back and require the locks, isn't it
> > really the same
> > as having one long hot path? If you want maintainability, couldn't we
> just do
> > replenishment then schedule and move (if necessary) in one timer (the
> > one we have
> > now) and move them to functions. It seems this can be done with one
> > timer neatly.
> >
> > So here's my proposal, lets see if it fits what you want:
>
>
> I will leave this to Dario to answer if it fits what he wants. :-P
>
>
> >
> > 1.) The scheduler does not do any timing,
>
>
> Not really. The rt_schedule does the budget enforcement. When a
> current VCPU runs out of budget, the rt_schedule will be invoked by a
> timer (you can refer to the schedule function in xen/sched/schedule.c
> to have a look how the timer is armed/disarmed.). When the rt_schedule
> is invoked, it needs to:
> a) update the budget of the current running VCPU and move it from runq
> to depleted queue;
> b) pick the current highest VCPU from runq and return it as the snext VCPU.
>
> So scheduling logic is still  involved in the rt_schedule function.
>
> >
> > 2.) replenishments are scheduled via timer at each [next]
> > replenishment period. Each
> > replenishment resorts the replenished vCPUs, and if any of the first
> > #CPUs in the
> > runq change, we tickle a pCPU for each change
>
>
> This is right.
>
> >
> >
> > In this case, we can use one timer.
>
>
> We actually have two: one for budget enforcement and the other for
> budget replenishment.
>
>
> >
> > We could use the current one as a
> > replenishment
> > timer, and make it so rt_schedule isn't the default invoked method.
> >
> > Does this sound similar to what you are suggesting?
>
>
> I don't think so, but I will leave it for Dario's for his opinion.
>
> In Dario's suggestion, you just simply remove the update_budget
> function out of rt_schedule. This is because budget enforcement, which
> is the work of rt_schedule, does not naturelly involves budget
> replenishment.
>
> In order to achieve budget replenishment, we need another timer to
> invoke another function (budget_replenish function), that is dedicated
> to that.
>
> >
> > I have to ask
> > because I thought
> > you wouldn't like the idea,
>
>
> I guess Dario won't like this idea. :-P (I'm kidding, but it should be
> the case.)
>
>
> >
> > and its not really *that* far off from
> > what we have now, Its
> > a little restructuring so that replenishments occur before any
> > scheduling activity and
> > the handler checks if switching is needed (basically acting as the
> > scheduler) and then
> > calls tickle. Sounds like what you had in mind?
>
>
> Thanks and best regards,
>
> Meng
> --
>
>
> -----------
> Meng Xu
> PhD Student in Computer and Information Science
> University of Pennsylvania
> http://www.cis.upenn.edu/~mengxu/
>

[-- Attachment #1.2: Type: text/html, Size: 5593 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  6:03                 ` Dagaen Golomb
@ 2015-06-17  6:09                   ` Meng Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Meng Xu @ 2015-06-17  6:09 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

Hi Dagaen,

First, please try not top post in the ML. :-) As the one you just
sent, it is better to reply to the comment instead of top reply. :-)

2015-06-16 23:03 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> Thanks for the reply, budget enforcement in the scheduler timer makes sense.
> I think I have an idea of what he wants done now.

Great! It is better to summarize your understanding of the design and
send to the ML to check if your understanding is correct as Chong did
for the improved toolstack of RTDS scheduler. This will save rounds of
patches.

Thanks,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:24             ` Dagaen Golomb
  2015-06-17  5:45               ` Meng Xu
@ 2015-06-17  8:27               ` Dario Faggioli
  2015-06-18 18:07                 ` Dagaen Golomb
  1 sibling, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  8:27 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 9995 bytes --]

[Let me just reply to this... Then I'll jump to the end of the thread]

On Wed, 2015-06-17 at 01:24 -0400, Dagaen Golomb wrote:

> >> If replenishments were done by themsevles, then the scheduler may
> >> decide to wait for some period of time, and in this same time period a
> >> replenishment
> >> could change what should be executing.
> >>
> > Well, the scheduler then would better *not* decide to wait for any
> > period of time. It better act like this: as soon as a replenishment
> > happens, and a new deadline is assigned to a vCPU, put the vCPU in the
> > runqueue, in the proper position; and if such a newly replenished vCPU
> > should now preempt any other, currently running, vCPU, 'tickle' the pCPU
> > in question and let it reschedule, and pick up the newly replenished
> > vCPU.
> 
> Yes, this is an option. However, I thought this would actually be an
> option you would
> not like. 
>
How so... I've been arguing for this the whole time?!?! :-O

I'm sure I've put down a sketch of what I think the replenishment
function should do in my first or second email in the thread, and I'm
sure that involved a call to runq_tickle()

> For the replenishment routine to know when to call the
> scheduler and when
> not to, it basically has to perform the the scheduler's logic, at
> which point if there is a
> change... why doesn't it just do it (i.e., be the scheduler) instead
> of adding logic and
> possibly overhead to call it.
> 
It has to call runq_tickle(), which is not "the scheduler's logic", is
the function that decides whether a pCPU should reschedule.

runq_tickle(), right now, is called from within rt_vcpu_wake(), which
makes perfect sense, and from within rt_context_saved(), which makes no
sense. It should be called from within rt_vcpu_wake() and from the
replenishment handling routine.

Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that
checks whether scurr->cur_deadline<=snext->cur_deadline.

Anyway, I've zero interest in turning this into a fight over
terminology... If you want to call runq_tickle() "the scheduler", go
ahead, it would just make communication a bit more difficult, but I'm up
for the challenge! :-)

Oh, BTW, while we're here, __runq_pick() being called from a bunch of
places, outside of rt_schedule(), is also something I never liked (you
can go check that in the archives), and I also blame the confusion
between scheduling and replenishmets, for the fact that such a thing is
necessary. I seriously hope this can be fixed too.

> Also, I'd like to point out that when I said things like "make more intelligent"
> decision handling, I meant exactly things like the above: using as much
> information as possible to prevent scheduler invocations. The current patch
> doesn't reschedule for any replenishment.
>
How does it not? you're arming s_timer to fire at either next
replenishment or budget enforcement point in time...

>  Firstly, it doesn't even look at
> the run queue: replenishments here can only happen on miss.
> 
... Oh, wait, I know! I know! I know! You mean "it doesn't tickle at any
replenishment", or "it doesn't context switch at any replenishment",
don't you?

(See, I'm good at adapting to "the new" terminology! :-D)

> > So, see? Transient  violation of EDF is just there, no matter the
> > approach!
> 
> Of course, violation during the scheduler is possible for any implementation :(
> 
Yeah, well, it was you that said that which should stick to your design
to prevent violations, which would have crept in if going with my
solution. Bha, anyway, it's probably best to let go...

> > I really don't see what you mean here. There won't be anything like that
> > to take care of. Xen offers timers as an abstraction, and deals with
> > them itself. You only need to take care of properly serializing
> > rt_schedule() and the timer handling routine, for the code sections that
> > require that.
> 
> This is good to know, thanks. The question would then change to how does
> Xen handle such an occurrence, but I think this is irrelevant now that I know
> which "side" you had in mind - that is, that the replenishment handler would
> decide when the scheduler should be invoked.
> 
It should call runq_tickle(), yes.

> >> It would be a hard pitch for a gEDF scheduler with a
> >> similar "accounting period" where the gEDF policy could be violated.
> >>
> > There's no accounting period of any sort. It's called overhead!
> 
> This part was referring to if we had two timers without extensive communication,
> cooperation, or information/logic sharing.
> 
I'm in the dark about what this "extensive communication, cooperation,
or information/logic sharing" could be, I have to admit.

If it means that the replenishment handler should call runq_tickle(),
then yes, let me state it once more: replenishment handling routine
should call runq_tickle() (other places, with the only other exception
of rt_vcpu_wake(), shouldn't).

> > You also may know that there is an EDF implementation in mainline Linux,
> > and you may would like to go have a look there too. I'm not including
> > any example from there because I'm one of the  main authors of it, so
> > it's pretty obvious that it uses the approach I think it's best (and,
> > more important, it would be rather inelegant to cite myself! :-D).
> >
> > Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c.
> 
> Thanks for the history and background lesson here :)
> 
NP. Did you have a look? What do you think, do you like it? :-D

> >> I simply
> >> don't see how it can
> >> be done without heavy interaction and information sharing between them
> >> which really
> >> defeats the purpose.
> >>
> > No, it doesn't.
> 
> Ok this last line is the zinger.
> 
> Almost this entire email was built on the premise that you would NOT like the
> idea of the replenishment handler having basically the same decision logic
> as the scheduler and then tickling the pCPU to pick up the new vCPU. 
>
Which is true, they're separate and different things, the must have
separate and different logic. The fact that one may decide when it's
time to call the other is perfectly logical.

And in fact, I want __runq_pick() and related logic to be in
rt_schedule(), and nowhere else, while I want runq_tickle() to be done
from replenishment (and wakeup), and nowhere else.

> Actually,
> with it done this way, I would have a hard time calling the
> tickle-invoked method
> the "scheduler." 
>
With "tickle-invoked method" you mean rt_schedule(), I guess.

> In this case, I'm not too sure
> actually how much
> different this would be from what we have now. 
>
Very different. Mostly because functions will have a clear and well
defined semantic, and will be called to fulfill their own purpose, from
places where it makes sense to do so, not every now and then, like now:

 - rt_schedule() for picking a new vcpu to be run and for doing runtime
   accounting and enforcement;
 - __runq_pick() to do the actual pick operation, from within
   rt_schedule() only, not every now and then, like now;
 - __repl_update() to do replenishments, from within the replenishment
   handling routine, in an event-triggered fashion, not every now and
   then, like now;
 - runq_tickle() to check whether a pCPU needs to go through the
   scheduler, from either the vcpu wakeup path, or because of a
   replenishment, not every now and then, like now.

> It feels like, from
> what you want,
> that we could get the same behavior by modifying rt_schedule to do
> replenishments
> first, then check if a reschedule is needed (these two parts would be in this
> proposed replenishment timer routine) and the perform any move if necessary. I
> know this isn't exactly what you want, but that sounds close right?
>
Ehm... not really? It indeed sounds like the opposite of how I think
things should be done (and I'm running out of different ways in which
I'm able to tell that! :-O)

> But the scheduler will have to decide which to move in, so that logic
> is done twice.
>
Done twice? What's done twice?

> Also, if these are done back-to-back and require the locks, isn't it
> really the same
> as having one long hot path? 
>
What's back-to-back?

> If you want maintainability, couldn't we just do
> replenishment then schedule and move (if necessary) in one timer (the
> one we have
> now) and move them to functions. It seems this can be done with one
> timer neatly.
> 
Not to me, but I may have lost you again, sorry.

> So here's my proposal, lets see if it fits what you want:
>
Ok, let me leave this here, and deal with this last part by replying to
Meng's email.

Regards,
Dario

> 1.) The scheduler does not do any timing,
> 2.) replenishments are scheduled via timer at each [next]
> replenishment period. Each
> replenishment resorts the replenished vCPUs, and if any of the first
> #CPUs in the
> runq change, we tickle a pCPU for each change
> 
> In this case, we can use one timer. We could use the current one as a
> replenishment
> timer, and make it so rt_schedule isn't the default invoked method.
> 
> Does this sound similar to what you are suggesting? I have to ask
> because I thought
> you wouldn't like the idea, and its not really *that* far off from
> what we have now, Its
> a little restructuring so that replenishments occur before any
> scheduling activity and
> the handler checks if switching is needed (basically acting as the
> scheduler) and then
> calls tickle. Sounds like what you had in mind?
> 
> Regards,
> ~Dagaen

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  5:45               ` Meng Xu
  2015-06-17  6:03                 ` Dagaen Golomb
@ 2015-06-17  9:20                 ` Dario Faggioli
  1 sibling, 0 replies; 21+ messages in thread
From: Dario Faggioli @ 2015-06-17  9:20 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 6394 bytes --]

On Tue, 2015-06-16 at 22:45 -0700, Meng Xu wrote:

> > So here's my proposal, lets see if it fits what you want:
>
> I will leave this to Dario to answer if it fits what he wants. :-P

> > 1.) The scheduler does not do any timing,

> Not really. The rt_schedule does the budget enforcement. When a
> current VCPU runs out of budget, the rt_schedule will be invoked by a
> timer (you can refer to the schedule function in xen/sched/schedule.c
> to have a look how the timer is armed/disarmed.). When the rt_schedule
> is invoked, it needs to:
> a) update the budget of the current running VCPU and move it from runq
> to depleted queue;
> b) pick the current highest VCPU from runq and return it as the snext VCPU.
> 
> So scheduling logic is still  involved in the rt_schedule function.
> 
Correct, thanks Meng.

> > 2.) replenishments are scheduled via timer at each [next]
> > replenishment period. Each
> > replenishment resorts the replenished vCPUs, and if any of the first
> > #CPUs in the
> > runq change, we tickle a pCPU for each change

> This is right.
> 
Yeah, sort of. I mean, I'm not sure what "Each replenishment resorts the
replenished vCPUs" means. If it means that a replenished vCPU is given a
new deadline, and hence needs to move from depleted queue to runqueue,
in a position within runqueue that reflects its new deadline... then, I
agree.

This requires holding the runqueue lock, but we need it anyway, for now.

This may or may not require a pCPU to be tickled, and it's
runq_tickle()'s job to decide whether that is the case and, if yes,
which one.

Also, about this "we tickle a pCPU for each change": first of all, as I
just said, that's not _always_ the case, and we should avoid tickling
when it's not necessary. Also, when it's necessary in principle, it
makes few sense to tickle a pCPU twice in a very tight loop (i.e.,
without giving it the chance to reschedule).

So if, for instance, two different replenishments (no matter whether
being serviced together, or in different invocations of the
replenishment timer handler) both determine that pCPU Y is the one that
should be tickled (because it's the one running the vCPU with the latest
deadline, and both the two replenished vCPUs have more imminent deadline
than what's running on Y, but later than what's running everywhere
else), there's no point in tickling twice. Not that doing it would be
harmful, it just won't have any effect. What I mean is much rather that
you can use this information to keep runq_tickle() simple/quick.

Note that this is pretty much what's happening already in runq_tickle(),
thanks to the use of the prv->tickled bitmap, so the point here is
making sure that "we tickle a pCPU for each change" doesn't mean
removing this, because I think that would be wrong. It's probably
unlikely that this was what Dagaen meant, but I though I better state
things clearly. :-)

> > In this case, we can use one timer.
>
> We actually have two: one for budget enforcement and the other for
> budget replenishment.
> 
EXACTLY!! :-)

> > We could use the current one as a
> > replenishment
> > timer, and make it so rt_schedule isn't the default invoked method.
> >
> > Does this sound similar to what you are suggesting?
>
> I don't think so, but I will leave it for Dario's for his opinion.
> 
"use current one as replenishment timer, and make it so rt_schedule
isn't the default invoked method"

Please, please, please, don't do that! :-P

> In Dario's suggestion, you just simply remove the update_budget
> function out of rt_schedule. This is because budget enforcement, which
> is the work of rt_schedule, does not naturelly involves budget
> replenishment.
> 
Yes. Or at least, that's the core of my suggestion, yes. In more
details, my suggestion includes getting rid of a bunch of other
invocations of scheduling/picking and replenishment related functions
from many other places, as I tried to make clear in my last email.

Perhaps, the fact that I haven't stated that clearly enough since now,
was what was making it a bit harder for you to see what I meant with
'making things simple', etc. I hope I explained myself better now.

> In order to achieve budget replenishment, we need another timer to
> invoke another function (budget_replenish function), that is dedicated
> to that.
> 
Yep, indeed.

> > I have to ask
> > because I thought
> > you wouldn't like the idea,
>
> I guess Dario won't like this idea. :-P (I'm kidding, but it should be
> the case.)
> 
You're right, it's not. :-)

> >
> > and its not really *that* far off from
> > what we have now, Its
> > a little restructuring so that replenishments occur before any
> > scheduling activity and
> > the handler checks if switching is needed (basically acting as the
> > scheduler) and then
> > calls tickle. Sounds like what you had in mind?
> 
This need for an strict ordering between replenishments and scheduling
is something that you're (Dagaen) insisting a lot on, while I really
don't see why it would be a good thing.

I've stated countless time that they're independent, which also mean
there's no strict ordering requirement between them.

Maybe what you're hinting at is that we may want to think to a better
way of handling rescheduling on the 'local' pCPU. That is to say, if
replenishment timer fires on pCPU X, and runq_tickle() determines that
it's pCPU X itself that should reschedule, there may be a quicker way to
do so, than raising the SCHEDULE_SOFTIRQ and waiting. I.e., some kind of
"hey man, we're here, let's just call (rt_)schedule() and get done with
it!".

If it's this, I thought about it too, many times, but whether it is an
actualy issue, and whether it would actually be a good thing, it's still
unclear. For instance, Linux has this, but, of course, we are not
Linux. :-)

In any case, this is independent from Dagaen's work, and it's even
general enough that other schedulers may be interested in it, so I'm
happy to leave this for future investigation.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-17  8:27               ` Dario Faggioli
@ 2015-06-18 18:07                 ` Dagaen Golomb
  2015-06-22  9:11                   ` Dario Faggioli
  0 siblings, 1 reply; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-18 18:07 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

>> Yes, this is an option. However, I thought this would actually be an
>> option you would
>> not like.
>>
> How so... I've been arguing for this the whole time?!?! :-O
>
> I'm sure I've put down a sketch of what I think the replenishment
> function should do in my first or second email in the thread, and I'm
> sure that involved a call to runq_tickle()

I just thought that since you didn't like the scheduler handling replenishments,
that you wouldn't be very fond of replenishment calling the scheduler. I don't
find either an issue so if you like one over the other, then that's
fine of course.

>> For the replenishment routine to know when to call the
>> scheduler and when
>> not to, it basically has to perform the the scheduler's logic, at
>> which point if there is a
>> change... why doesn't it just do it (i.e., be the scheduler) instead
>> of adding logic and
>> possibly overhead to call it.
>>
> It has to call runq_tickle(), which is not "the scheduler's logic", is
> the function that decides whether a pCPU should reschedule.
>
> runq_tickle(), right now, is called from within rt_vcpu_wake(), which
> makes perfect sense, and from within rt_context_saved(), which makes no
> sense. It should be called from within rt_vcpu_wake() and from the
> replenishment handling routine.
>
> Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that
> checks whether scurr->cur_deadline<=snext->cur_deadline.
>
> Anyway, I've zero interest in turning this into a fight over
> terminology... If you want to call runq_tickle() "the scheduler", go
> ahead, it would just make communication a bit more difficult, but I'm up
> for the challenge! :-)
>
> Oh, BTW, while we're here, __runq_pick() being called from a bunch of
> places, outside of rt_schedule(), is also something I never liked (you
> can go check that in the archives), and I also blame the confusion
> between scheduling and replenishmets, for the fact that such a thing is
> necessary. I seriously hope this can be fixed too.

I have no interest in a terminology war either. I just figured that if we want
the replenishment function to only tickle when necessary, then it should
check that the replenished vCPU is a greater priority than current ones.
At this point it has essentially done the entire scheduling decision. I agree
it can then tickle and let pCPU pick it up, and I find this a fine solution. As
I said before I guess I just thought you wouldn't like the idea, but its clear
you do.

>> > I really don't see what you mean here. There won't be anything like that
>> > to take care of. Xen offers timers as an abstraction, and deals with
>> > them itself. You only need to take care of properly serializing
>> > rt_schedule() and the timer handling routine, for the code sections that
>> > require that.
>>
>> This is good to know, thanks. The question would then change to how does
>> Xen handle such an occurrence, but I think this is irrelevant now that I know
>> which "side" you had in mind - that is, that the replenishment handler would
>> decide when the scheduler should be invoked.
>>
> It should call runq_tickle(), yes.

Gotcha.

>> Thanks for the history and background lesson here :)
>>
> NP. Did you have a look? What do you think, do you like it? :-D
>
>> >> I simply
>> >> don't see how it can
>> >> be done without heavy interaction and information sharing between them
>> >> which really
>> >> defeats the purpose.
>> >>
>> > No, it doesn't.
>>
>> Ok this last line is the zinger.
>>
>> Almost this entire email was built on the premise that you would NOT like the
>> idea of the replenishment handler having basically the same decision logic
>> as the scheduler and then tickling the pCPU to pick up the new vCPU.
>>
> Which is true, they're separate and different things, the must have
> separate and different logic. The fact that one may decide when it's
> time to call the other is perfectly logical.
>
> And in fact, I want __runq_pick() and related logic to be in
> rt_schedule(), and nowhere else, while I want runq_tickle() to be done
> from replenishment (and wakeup), and nowhere else.

I guess the last remaining question is this: when the scheduler is
enforcing a budget and times out, should it check for replenishment
before kicking the vCPU? Or assume that any relevant replenishments
have occurred? This is the issue I mentioned before about two timers
armed at the same time - I'm not sure what convention Xen uses to
order them. I would assume from your very reason for mentioning this
change that you don't want any replenishment in rt_schedule, but then
it may kick a vCPU who at that very instant is supposed to be replenished
as well, and should actually stay on the pCPU. This is assuming you
still want rt_schedule to be timer-triggered to enforce budget, is this
correct?

However, I think this is a more minor issue that we can sort out via
inspecting the default Xen behavior, or allowing a single replenishment
call before kicking (I don't expect you to like that option :P), or some
other method.

>
>> Actually,
>> with it done this way, I would have a hard time calling the
>> tickle-invoked method
>> the "scheduler."
>>
> With "tickle-invoked method" you mean rt_schedule(), I guess.
>
>> In this case, I'm not too sure
>> actually how much
>> different this would be from what we have now.
>>
> Very different. Mostly because functions will have a clear and well
> defined semantic, and will be called to fulfill their own purpose, from
> places where it makes sense to do so, not every now and then, like now:
>
>  - rt_schedule() for picking a new vcpu to be run and for doing runtime
>    accounting and enforcement;
>  - __runq_pick() to do the actual pick operation, from within
>    rt_schedule() only, not every now and then, like now;
>  - __repl_update() to do replenishments, from within the replenishment
>    handling routine, in an event-triggered fashion, not every now and
>    then, like now;
>  - runq_tickle() to check whether a pCPU needs to go through the
>    scheduler, from either the vcpu wakeup path, or because of a
>    replenishment, not every now and then, like now.

Okay, I understand this. My [poor] point was that how it is done now could
be better separated into logical parts with functions.

>> But the scheduler will have to decide which to move in, so that logic
>> is done twice.
>>
> Done twice? What's done twice?

I mean its already moved the vCPU and knows it should go on pCPU X, now
when we call tickle that function itself will have to choose which to kick. It's
doing this work twice, we already knew which one was new. Its not a lot of
work and isn't an issue, just feels like the replenishment routine, in order
to only tickle when necessary, is basically acting as the scheduler. Just a
weird way my mind views it.

>> Also, if these are done back-to-back and require the locks, isn't it
>> really the same
>> as having one long hot path?
>>
> What's back-to-back?

A replenishment that causes a resched - these will be back to back and will both
involve the lock. Its definetly not worse than the current solution, just that
its likely going to be about the same amount of time in lock. I'm just saying
the lock timing in the current solution doesn't appear like it would be much
worse, although we do have all the maintainability and structure arguments
against it, so this is a moot point.

Overall, I see where you're coming from now. I think the only issue still
is whether rt_schedule should absolutely never check for replenishments.
I guess my last question is, when rt_schedule is reinvoked by timer because
a budget is being exhausted, what do you expect it to do? Nothing? (Besides
move it to the depletedq) and rely on the budget replenish routine to to handle
it? Should it wait for that routine or call it?

Besides this I understand where you want this to go, and I think its worth
pursuing.

Regards,
~Dagaen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-18 18:07                 ` Dagaen Golomb
@ 2015-06-22  9:11                   ` Dario Faggioli
  2015-06-22 11:58                     ` Dagaen Golomb
  0 siblings, 1 reply; 21+ messages in thread
From: Dario Faggioli @ 2015-06-22  9:11 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 5071 bytes --]

On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote:

> > Anyway, I've zero interest in turning this into a fight over
> > terminology... If you want to call runq_tickle() "the scheduler", go
> > ahead, it would just make communication a bit more difficult, but I'm up
> > for the challenge! :-)
> >
> > Oh, BTW, while we're here, __runq_pick() being called from a bunch of
> > places, outside of rt_schedule(), is also something I never liked (you
> > can go check that in the archives), and I also blame the confusion
> > between scheduling and replenishmets, for the fact that such a thing is
> > necessary. I seriously hope this can be fixed too.
> 
> I have no interest in a terminology war either. I just figured that if we want
> the replenishment function to only tickle when necessary, then it should
> check that the replenished vCPU is a greater priority than current ones.
>
Exactly, we need a preemption check, which is part of the 'scheduling
logic'... but everything in the file is part of the scheduling logic (or
it will be in another one), and we don't want to put everything that is
in the file in one only function! :-)

An in fact, a preemption check is not the full scheduling logic it's...
well... a preemption check. For example, if there are idle pCPUs, it's
something very quick.

Note that, right now, this preemption check is done by acquiring the
global lock and checking the deadlines of currently running vCPUs on all
pCPUs. In future, this can be modified/improved, by using a dedicate
data structure, and a locking scheme that would reduce the pressure on
the global lock. I'm not saying that you should do this as part of the
work you're doing now. Rather, I'm saying that the work being done now
should go in the direction of making this easier, not harder.

> > And in fact, I want __runq_pick() and related logic to be in
> > rt_schedule(), and nowhere else, while I want runq_tickle() to be done
> > from replenishment (and wakeup), and nowhere else.
> 
> I guess the last remaining question is this: when the scheduler is
> enforcing a budget and times out, should it check for replenishment
> before kicking the vCPU? Or assume that any relevant replenishments
> have occurred?
>
This is an implementation detail that is quite hard to discuss without
seeing the code. In theory, if there wasn't any overhead, etc., you
shouldn't, because it should not be necessary. In practise, yes, it is
possible that various sources of overhead prevent a replenishment to be
notified in time, and that you end up with a vCPU running, and consuming
the last bits of its budget _after_ a scheduled replenishment instant as
it is, I guess, possible that you figure during a replenishment that the
budget was exhausted and the vCPU had overrun a bit, I guess (or not, it
again depends on the actual implementation).

So, yes, you certainly need to take care of this corner cases, and you
can do it in the way you think it's best, basing on how the code ends up
looking like. The important thing is that they're treated for what they
are, i.e., we should handle them, not design the code around them.

> This is the issue I mentioned before about two timers
> armed at the same time - I'm not sure what convention Xen uses to
> order them. 
>
No, I don't think this has much to do with the internals of timers'
implementation, it's a corner case that, due to overhead, long critical
section (or, in general, interrupts-off sections), etc., will always be
present and will need to be taken care of, no mater how the scheduler is
implemented.

> I would assume from your very reason for mentioning this
> change that you don't want any replenishment in rt_schedule, but then
> it may kick a vCPU who at that very instant is supposed to be replenished
> as well, and should actually stay on the pCPU.
>
That depends of the vCPU, on it's budget and deadline, and on what's
running on other pCPUs. And that is something unlikely, although
possible, and it should be the exception rather than the rule.

Anyway, I think it would be correct, for instance, to check during
rt_schedule() whether the vCPU running on the pCPU is already beyond
it's deadline/replenishment time. If yes, log/count a deadline miss
(because that's what just happened!), replenish it and continue with
rt_schedule().

>  This is assuming you
> still want rt_schedule to be timer-triggered to enforce budget, is this
> correct?
> 
Of course.

> However, I think this is a more minor issue that we can sort out via
> inspecting the default Xen behavior, or allowing a single replenishment
> call before kicking (I don't expect you to like that option :P), or some
> other method.
> 
Exactly.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-22  9:11                   ` Dario Faggioli
@ 2015-06-22 11:58                     ` Dagaen Golomb
  0 siblings, 0 replies; 21+ messages in thread
From: Dagaen Golomb @ 2015-06-22 11:58 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li

Thank you. This clears things up a bit. I will work on this model and
make (or ignore) any corner case decisions I feel adequate for now.
We can focus on them after the main structure is in place.
~Dagaen

On Mon, Jun 22, 2015 at 5:11 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote:
>
>> > Anyway, I've zero interest in turning this into a fight over
>> > terminology... If you want to call runq_tickle() "the scheduler", go
>> > ahead, it would just make communication a bit more difficult, but I'm up
>> > for the challenge! :-)
>> >
>> > Oh, BTW, while we're here, __runq_pick() being called from a bunch of
>> > places, outside of rt_schedule(), is also something I never liked (you
>> > can go check that in the archives), and I also blame the confusion
>> > between scheduling and replenishmets, for the fact that such a thing is
>> > necessary. I seriously hope this can be fixed too.
>>
>> I have no interest in a terminology war either. I just figured that if we want
>> the replenishment function to only tickle when necessary, then it should
>> check that the replenished vCPU is a greater priority than current ones.
>>
> Exactly, we need a preemption check, which is part of the 'scheduling
> logic'... but everything in the file is part of the scheduling logic (or
> it will be in another one), and we don't want to put everything that is
> in the file in one only function! :-)
>
> An in fact, a preemption check is not the full scheduling logic it's...
> well... a preemption check. For example, if there are idle pCPUs, it's
> something very quick.
>
> Note that, right now, this preemption check is done by acquiring the
> global lock and checking the deadlines of currently running vCPUs on all
> pCPUs. In future, this can be modified/improved, by using a dedicate
> data structure, and a locking scheme that would reduce the pressure on
> the global lock. I'm not saying that you should do this as part of the
> work you're doing now. Rather, I'm saying that the work being done now
> should go in the direction of making this easier, not harder.
>
>> > And in fact, I want __runq_pick() and related logic to be in
>> > rt_schedule(), and nowhere else, while I want runq_tickle() to be done
>> > from replenishment (and wakeup), and nowhere else.
>>
>> I guess the last remaining question is this: when the scheduler is
>> enforcing a budget and times out, should it check for replenishment
>> before kicking the vCPU? Or assume that any relevant replenishments
>> have occurred?
>>
> This is an implementation detail that is quite hard to discuss without
> seeing the code. In theory, if there wasn't any overhead, etc., you
> shouldn't, because it should not be necessary. In practise, yes, it is
> possible that various sources of overhead prevent a replenishment to be
> notified in time, and that you end up with a vCPU running, and consuming
> the last bits of its budget _after_ a scheduled replenishment instant as
> it is, I guess, possible that you figure during a replenishment that the
> budget was exhausted and the vCPU had overrun a bit, I guess (or not, it
> again depends on the actual implementation).
>
> So, yes, you certainly need to take care of this corner cases, and you
> can do it in the way you think it's best, basing on how the code ends up
> looking like. The important thing is that they're treated for what they
> are, i.e., we should handle them, not design the code around them.
>
>> This is the issue I mentioned before about two timers
>> armed at the same time - I'm not sure what convention Xen uses to
>> order them.
>>
> No, I don't think this has much to do with the internals of timers'
> implementation, it's a corner case that, due to overhead, long critical
> section (or, in general, interrupts-off sections), etc., will always be
> present and will need to be taken care of, no mater how the scheduler is
> implemented.
>
>> I would assume from your very reason for mentioning this
>> change that you don't want any replenishment in rt_schedule, but then
>> it may kick a vCPU who at that very instant is supposed to be replenished
>> as well, and should actually stay on the pCPU.
>>
> That depends of the vCPU, on it's budget and deadline, and on what's
> running on other pCPUs. And that is something unlikely, although
> possible, and it should be the exception rather than the rule.
>
> Anyway, I think it would be correct, for instance, to check during
> rt_schedule() whether the vCPU running on the pCPU is already beyond
> it's deadline/replenishment time. If yes, log/count a deadline miss
> (because that's what just happened!), replenish it and continue with
> rt_schedule().
>
>>  This is assuming you
>> still want rt_schedule to be timer-triggered to enforce budget, is this
>> correct?
>>
> Of course.
>
>> However, I think this is a more minor issue that we can sort out via
>> inspecting the default Xen behavior, or allowing a single replenishment
>> call before kicking (I don't expect you to like that option :P), or some
>> other method.
>>
> Exactly.
>
> Thanks and Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2015-06-22 11:58 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
2015-06-09 12:53 ` Dario Faggioli
2015-06-10  4:18   ` Dagaen Golomb
2015-06-10 22:30     ` Dario Faggioli
2015-06-13 20:33       ` Dagaen Golomb
2015-06-16 17:07         ` Dagaen Golomb
2015-06-17  0:20           ` Dario Faggioli
2015-06-17  5:24             ` Dagaen Golomb
2015-06-17  5:45               ` Meng Xu
2015-06-17  6:03                 ` Dagaen Golomb
2015-06-17  6:09                   ` Meng Xu
2015-06-17  9:20                 ` Dario Faggioli
2015-06-17  8:27               ` Dario Faggioli
2015-06-18 18:07                 ` Dagaen Golomb
2015-06-22  9:11                   ` Dario Faggioli
2015-06-22 11:58                     ` Dagaen Golomb
2015-06-10  5:54   ` Meng Xu
2015-06-10 17:46     ` Dario Faggioli
2015-06-11  5:50       ` Meng Xu
2015-06-12  9:28         ` Dario Faggioli
2015-06-13 17:21           ` Meng Xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).