* [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. @ 2015-06-08 11:46 Dagaen Golomb 2015-06-09 12:53 ` Dario Faggioli 0 siblings, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-08 11:46 UTC (permalink / raw) To: xen-devel Cc: wei.liu2, xisisu, george.dunlap, dario.faggioli, mengxu, jbeulich, lichong659 To do this, we create a new list that holds, for each vcpu, the time least into the future that it may need to be rescheduled. The scheduler chooses the lowest time off of this list and waits until the specified time instead of running every 1 ms as it did before. Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> --- xen/common/sched_rt.c | 319 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 222 insertions(+), 97 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 7c39a9e..25f0458 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -4,6 +4,7 @@ * * by Sisu Xi, 2013, Washington University in Saint Louis * and Meng Xu, 2014, University of Pennsylvania + * and Dagaen Golomb, 2015, University of Pennsylvania * * based on the code of credit Scheduler */ @@ -134,6 +135,8 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ + struct list_head timerq; /* ascending list of next required scheduling + decision */ cpumask_t tickled; /* cpus been tickled */ }; @@ -143,6 +146,7 @@ struct rt_private { struct rt_vcpu { struct list_head q_elem; /* on the runq/depletedq list */ struct list_head sdom_elem; /* on the domain VCPU list */ + struct list_head t_elem; /* on the timerq */ /* Up-pointers */ struct rt_dom *sdom; @@ -156,6 +160,7 @@ struct rt_vcpu { s_time_t cur_budget; /* current budget */ s_time_t last_start; /* last start time */ s_time_t cur_deadline; /* 
current deadline for EDF */ + s_time_t next_sched_needed; /* next time to make scheduling decision */ unsigned flags; /* mark __RTDS_scheduled, etc.. */ }; @@ -197,6 +202,11 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return &rt_priv(ops)->depletedq; } +static inline struct list_head *rt_timerq(const struct scheduler *ops) +{ + return &rt_priv(ops)->timerq; +} + /* * Queue helper functions for runq and depletedq */ @@ -212,6 +222,11 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * __t_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_vcpu, t_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -231,7 +246,8 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity); printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," - " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n" + " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime + " next_sched=%"PRI_stime"\n" " \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, @@ -241,6 +257,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) svc->cur_budget, svc->cur_deadline, svc->last_start, + svc->next_sched_needed, __vcpu_on_q(svc), vcpu_runnable(svc->vcpu), cpustr); @@ -264,7 +281,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) static void rt_dump(const struct scheduler *ops) { - struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter; + struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *timerq, *iter; struct rt_private *prv = rt_priv(ops); struct rt_vcpu *svc; struct rt_dom *sdom; @@ -277,6 +294,7 @@ rt_dump(const struct scheduler *ops) runq = rt_runq(ops); depletedq = rt_depletedq(ops); + timerq = rt_timerq(ops); printk("Global RunQueue info:\n"); list_for_each( iter, runq ) @@ -292,6 +310,14 
@@ rt_dump(const struct scheduler *ops) rt_dump_vcpu(ops, svc); } + printk("Global TimerQueue info:\n"); + list_for_each( iter, timerq ) + { + svc = __t_elem(iter); + printk("\tvcpu %d next_sched=%"PRI_stime"\n", svc->vcpu->vcpu_id, + svc->next_sched_needed); + } + printk("Domain info:\n"); list_for_each( iter_sdom, &prv->sdom ) { @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc) list_del_init(&svc->q_elem); } +static inline void __t_remove(struct rt_vcpu *svc) +{ + if( !list_empty(&svc->t_elem) ) + list_del_init(&svc->t_elem); +} + /* * Insert svc with budget in RunQ according to EDF: * vcpus with smaller deadlines go first. @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) } /* + * Insert svc into the timerq, maintaining ascending order by next_sched_needed. + */ +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc) +{ + struct rt_private *prv = rt_priv(ops); + struct list_head *timerq = rt_timerq(ops); + struct list_head *iter = timerq; + + ASSERT( spin_is_locked(&prv->lock) ); + + ASSERT( list_empty(&svc->t_elem) ); + + list_for_each(iter, timerq) + { + struct rt_vcpu * iter_svc = __t_elem(iter); + if ( svc->next_sched_needed <= iter_svc->next_sched_needed ) + break; + } + list_add_tail(&svc->t_elem, iter); +} + +/* + * Return how far the lowest time on the timerq (that is after NOW) is in the + * future. + * Lock must be grabbed before calling this. + */ +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now) +{ + struct list_head *timerq = rt_timerq(ops); + struct list_head *iter, *tmp; + + list_for_each_safe(iter, tmp, timerq) + { + struct rt_vcpu * iter_svc = __t_elem(iter); + + if ( iter_svc->next_sched_needed > now ) + return (iter_svc->next_sched_needed - now); + else + __t_remove(iter_svc); + } + + return MAX_SCHEDULE; +} + +/* + * Updates the next_sched_needed field for a vcpu, which is used for + * determining the next needed schedule time for this vcpu. 
Then updates + * timerq via reinsert. + */ +static void update_sched_time(const struct scheduler *ops, s_time_t now, + struct rt_vcpu *svc) +{ + /* update next needed schedule time */ + if( test_bit(__RTDS_scheduled, &svc->flags) ) + svc->next_sched_needed = now + svc->cur_budget; + else + svc->next_sched_needed = svc->cur_deadline; + + /* Remove and reinsert to maintain sorted order in timerq */ + __t_remove(svc); + __timerq_insert(ops, svc); + + return; +} + +/* * Init/Free related code */ static int @@ -413,6 +511,7 @@ rt_init(struct scheduler *ops) INIT_LIST_HEAD(&prv->sdom); INIT_LIST_HEAD(&prv->runq); INIT_LIST_HEAD(&prv->depletedq); + INIT_LIST_HEAD(&prv->timerq); cpumask_clear(&prv->tickled); @@ -536,10 +635,12 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) INIT_LIST_HEAD(&svc->q_elem); INIT_LIST_HEAD(&svc->sdom_elem); + INIT_LIST_HEAD(&svc->t_elem); svc->flags = 0U; svc->sdom = dd; svc->vcpu = vc; svc->last_start = 0; + svc->next_sched_needed = 0; svc->period = RTDS_DEFAULT_PERIOD; if ( !is_idle_vcpu(vc) ) @@ -579,7 +680,10 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) rt_update_deadline(now, svc); if ( !__vcpu_on_q(svc) && vcpu_runnable(vc) && !vc->is_running ) + { __runq_insert(ops, svc); + update_sched_time(ops, now, svc); + } /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); @@ -601,8 +705,8 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) BUG_ON( sdom == NULL ); lock = vcpu_schedule_lock_irq(vc); - if ( __vcpu_on_q(svc) ) - __q_remove(svc); + __q_remove(svc); + __t_remove(svc); vcpu_schedule_unlock_irq(lock, vc); if ( !is_idle_vcpu(vc) ) @@ -763,6 +867,8 @@ __repl_update(const struct scheduler *ops, s_time_t now) /* reinsert the vcpu if its deadline is updated */ __q_remove(svc); __runq_insert(ops, svc); + /* Update next needed sched time if deadline is updated */ + update_sched_time(ops, now, svc); } list_for_each_safe(iter, 
tmp, depletedq) @@ -773,11 +879,102 @@ __repl_update(const struct scheduler *ops, s_time_t now) rt_update_deadline(now, svc); __q_remove(svc); /* remove from depleted queue */ __runq_insert(ops, svc); /* add to runq */ + /* Update next needed sched time since deadline is updated */ + update_sched_time(ops, now, svc); } } } /* + * Pick a cpu where to run a vcpu, + * possibly kicking out the vcpu running there + * Called by wake() and context_saved() + * We have a running candidate here, the kick logic is: + * Among all the cpus that are within the cpu affinity + * 1) if the new->cpu is idle, kick it. This could benefit cache hit + * 2) if there are any idle vcpu, kick it. + * 3) now all pcpus are busy; + * among all the running vcpus, pick lowest priority one + * if snext has higher priority, kick it. + * + * TODO: + * 1) what if these two vcpus belongs to the same domain? + * replace a vcpu belonging to the same domain introduces more overhead + * + * lock is grabbed before calling this function + */ +static void +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */ + struct rt_vcpu *iter_svc; + struct vcpu *iter_vc; + int cpu = 0, cpu_to_tickle = 0; + cpumask_t not_tickled; + cpumask_t *online; + + if ( new == NULL || is_idle_vcpu(new->vcpu) ) + return; + + online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool); + cpumask_and(¬_tickled, online, new->vcpu->cpu_hard_affinity); + cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); + + /* 1) if new's previous cpu is idle, kick it for cache benefit */ + if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) + { + cpu_to_tickle = new->vcpu->processor; + goto out; + } + + /* 2) if there are any idle pcpu, kick it */ + /* The same loop also find the one with lowest priority */ + for_each_cpu(cpu, ¬_tickled) + { + iter_vc = curr_on_cpu(cpu); + if ( is_idle_vcpu(iter_vc) ) + { + cpu_to_tickle = 
cpu; + goto out; + } + iter_svc = rt_vcpu(iter_vc); + if ( latest_deadline_vcpu == NULL || + iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline ) + latest_deadline_vcpu = iter_svc; + } + + /* 3) candicate has higher priority, kick out lowest priority vcpu */ + if ( latest_deadline_vcpu != NULL && + new->cur_deadline < latest_deadline_vcpu->cur_deadline ) + { + cpu_to_tickle = latest_deadline_vcpu->vcpu->processor; + goto out; + } + + /* didn't tickle any cpu */ + SCHED_STAT_CRANK(tickle_idlers_none); + return; +out: + /* TRACE */ + { + struct { + unsigned cpu:16, pad:16; + } d; + d.cpu = cpu_to_tickle; + d.pad = 0; + trace_var(TRC_RTDS_TICKLE, 0, + sizeof(d), + (unsigned char *)&d); + } + + cpumask_set_cpu(cpu_to_tickle, &prv->tickled); + SCHED_STAT_CRANK(tickle_idlers_some); + cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); + return; +} + +/* * schedule function for rt scheduler. * The lock is already grabbed in schedule.c, no need to lock here */ @@ -790,11 +987,9 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched struct rt_vcpu *snext = NULL; struct task_slice ret = { .migrated = 0 }; - /* clear ticked bit now that we've been scheduled */ - cpumask_clear_cpu(cpu, &prv->tickled); - /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); + update_sched_time(ops, now, scurr); __repl_update(ops, now); @@ -828,6 +1023,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched if ( snext != scurr ) { __q_remove(snext); + __t_remove(snext); set_bit(__RTDS_scheduled, &snext->flags); } if ( snext->vcpu->processor != cpu ) @@ -837,7 +1033,15 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } - ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ + /* Check if we were tickled and clear bit if so. 
If not, tickle now since */ + /* timer must have fired */ + if ( cpumask_test_cpu(cpu, &prv->tickled) ) + cpumask_clear_cpu(cpu, &prv->tickled); + else + runq_tickle(ops, snext); + + /* Use minimum from timerq */ + ret.time = __timerq_next(ops, now); ret.task = snext->vcpu; /* TRACE */ @@ -879,95 +1083,8 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) __q_remove(svc); else if ( test_bit(__RTDS_delayed_runq_add, &svc->flags) ) clear_bit(__RTDS_delayed_runq_add, &svc->flags); -} - -/* - * Pick a cpu where to run a vcpu, - * possibly kicking out the vcpu running there - * Called by wake() and context_saved() - * We have a running candidate here, the kick logic is: - * Among all the cpus that are within the cpu affinity - * 1) if the new->cpu is idle, kick it. This could benefit cache hit - * 2) if there are any idle vcpu, kick it. - * 3) now all pcpus are busy; - * among all the running vcpus, pick lowest priority one - * if snext has higher priority, kick it. - * - * TODO: - * 1) what if these two vcpus belongs to the same domain? 
- * replace a vcpu belonging to the same domain introduces more overhead - * - * lock is grabbed before calling this function - */ -static void -runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */ - struct rt_vcpu *iter_svc; - struct vcpu *iter_vc; - int cpu = 0, cpu_to_tickle = 0; - cpumask_t not_tickled; - cpumask_t *online; - - if ( new == NULL || is_idle_vcpu(new->vcpu) ) - return; - - online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool); - cpumask_and(¬_tickled, online, new->vcpu->cpu_hard_affinity); - cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); - /* 1) if new's previous cpu is idle, kick it for cache benefit */ - if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) - { - cpu_to_tickle = new->vcpu->processor; - goto out; - } - - /* 2) if there are any idle pcpu, kick it */ - /* The same loop also find the one with lowest priority */ - for_each_cpu(cpu, ¬_tickled) - { - iter_vc = curr_on_cpu(cpu); - if ( is_idle_vcpu(iter_vc) ) - { - cpu_to_tickle = cpu; - goto out; - } - iter_svc = rt_vcpu(iter_vc); - if ( latest_deadline_vcpu == NULL || - iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline ) - latest_deadline_vcpu = iter_svc; - } - - /* 3) candicate has higher priority, kick out lowest priority vcpu */ - if ( latest_deadline_vcpu != NULL && - new->cur_deadline < latest_deadline_vcpu->cur_deadline ) - { - cpu_to_tickle = latest_deadline_vcpu->vcpu->processor; - goto out; - } - - /* didn't tickle any cpu */ - SCHED_STAT_CRANK(tickle_idlers_none); - return; -out: - /* TRACE */ - { - struct { - unsigned cpu:16, pad:16; - } d; - d.cpu = cpu_to_tickle; - d.pad = 0; - trace_var(TRC_RTDS_TICKLE, 0, - sizeof(d), - (unsigned char *)&d); - } - - cpumask_set_cpu(cpu_to_tickle, &prv->tickled); - SCHED_STAT_CRANK(tickle_idlers_some); - cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); - return; + __t_remove(svc); } /* @@ 
-988,6 +1105,8 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) BUG_ON( is_idle_vcpu(vc) ); + update_sched_time(ops, now, svc); + if ( unlikely(curr_on_cpu(vc->processor) == vc) ) { SCHED_STAT_CRANK(vcpu_wake_running); @@ -1024,6 +1143,9 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) __repl_update(ops, now); + /* Update timerq */ + update_sched_time(ops, now, svc); + ASSERT(!list_empty(&prv->sdom)); sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); online = cpupool_scheduler_cpumask(sdom->dom->cpupool); @@ -1056,8 +1178,11 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && likely(vcpu_runnable(vc)) ) { + s_time_t now = NOW(); + __runq_insert(ops, svc); - __repl_update(ops, NOW()); + __repl_update(ops, now); + update_sched_time(ops, now, svc); ASSERT(!list_empty(&prv->sdom)); sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -- 1.7.9.5 ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb @ 2015-06-09 12:53 ` Dario Faggioli 2015-06-10 4:18 ` Dagaen Golomb 2015-06-10 5:54 ` Meng Xu 0 siblings, 2 replies; 21+ messages in thread From: Dario Faggioli @ 2015-06-09 12:53 UTC (permalink / raw) To: Dagaen Golomb Cc: wei.liu2, xisisu, george.dunlap, xen-devel, mengxu, jbeulich, lichong659 [-- Attachment #1.1: Type: text/plain, Size: 11024 bytes --] Hello Dagaen, Thanks for doing this. The first question I have is whether you run any test/benchmark that you can show the result of here. For instance, a few weeks ago, Meng (while trying to do a completely different thing) sent here on the list some numbers about the frequency of the scheduler being called, and the overhead caused by that... Would it be hard to run the same experiments and collect the numbers, with and without your patch? On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote: > To do this, we create a new list that holds, for each > vcpu, the time least into the future that it may need to be > rescheduled. > Ok. Actually, what I really expected to see in this patch was, either: a) a new, scheduler wide, list of replenishment times _and_ a timer, always (re)programmed to fire at the most imminent one; or b) one timer for each vcpu, always (re)programmed to fire at the time instant when the replenishment for that vcpu should happen. And note that, when I say "timer", I mean an actual Xen timer, i.e., those things that are started, stopped, and with a timer handling routine being called when they expire. For an example, you can have a look, in sched_credit.c, at 'ticker' or 'master_ticker'. Deciding between a) or b) isn't easy, without actually trying to implement them. 
I personally prefer b), and I think it would be a superior long term solution (see right below), but given the current architecture of the RTDS scheduler (e.g., the fact that it has a global runq, etc), I won't nack a), which would most likely be simpler. Performance and overhead wise, again, hard to tell in advance. b) would introduce more overhead (as there are more timers), but will likely prove more scalable (which is not something of great importance for this scheduler) and may allow for better latency (which _is_ something of great importance for this scheduler :-) ), since there won't be the need to lock the whole scheduler during the replenishments (which is, OTOH, necessary with a)). And that's it for the replenishments. For enforcing the budget, well, we have the ret.time value of the task_slice struct returned by rt_schedule, so that's event driven already, and we don't need to do much about it. Does all this make sense? Am I making myself clear enough? If not, feel free to ask. > The scheduler chooses the lowest time off of this > list and waits until the specified time instead of running every > 1 ms as it did before. > Yeah, well, I see what you mean and how this is actually succeeding (at least AFAICT, by looking at the code, again, it would be nice to have some numbers) in improving the scheduler behavior. However, this transition toward event driven-ness has two goals: * improve the scheduler behavior [check, at least to some extent] * improve the code (readability, maintainability, etc.) [not check at all :-(] As an example, the __repl_update() function: it's called 2 times inside rt_schedule() and a third from rt_context_saved(), which is basically like saying it's called 3 times from rt_schedule(), and this always made very little sense to me. In fact, I think that scheduling decisions and replenishment events shouldn't be so tightly coupled. 
There are interdependencies, of course (a replenishment could call for a scheduling decision to be taken), but not like this. That's why I say I'd like to see a timer handling routine dealing with replenishment, and let rt_schedule() do it's job, which is: - enforcing the budget of the running vcpu. If it expires, _(re)program_ the replenishment timer - choose the best vcpu to run next, if necessary With this patch, you're still using rt_schedule() to do both scheduling and replenishments, although you're now doing it at actual replenishment times, instead of checking for them every millisecond. Also, the bits and pieces that you need to add in order to deal with this new queue is, really, making things even more complex than they are now, which is undesirable. So, all this being said, let me know what you think about it (and that applies to everyone else as well, of course, in particular Meng). I won't comment on the code in too much details, as it will require some quite radical restructuring, but I'll skim through it and provide some hints anyway. > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> > --- > xen/common/sched_rt.c | 319 ++++++++++++++++++++++++++++++++++--------------- > 1 file changed, 222 insertions(+), 97 deletions(-) > First of all, it's a big patch. It's only changing one file and one logical component, and for that reason, TBH, it's not that hard to review. Still I think you can break it in at least two, and perhaps even more, if you try to implement what I described above. 
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c > index 7c39a9e..25f0458 100644 > --- a/xen/common/sched_rt.c > +++ b/xen/common/sched_rt.c > @@ -156,6 +160,7 @@ struct rt_vcpu { > s_time_t cur_budget; /* current budget */ > s_time_t last_start; /* last start time */ > s_time_t cur_deadline; /* current deadline for EDF */ > + s_time_t next_sched_needed; /* next time to make scheduling decision */ > As an example of why I said that things should become simpler, and are instead becoming more complex: with my solution, you don't need anything like this. In fact, budget enforcement is already being done ok in > rt_schedule(), so no need to do anything more/different about it. > Replenishments should be programmed to fire at cur_deadline, so again, > no need to add this new field, and multiplex its actual value between > budget and deadline (which, yes, counts as being something rather > complex for me! :-D). > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc) > list_del_init(&svc->q_elem); > } > > +static inline void __t_remove(struct rt_vcpu *svc) > +{ > + if( !list_empty(&svc->t_elem) ) > + list_del_init(&svc->t_elem); > +} > + > You're using hard tabs for indenting here. As you see everywhere else, Xen wants 4 spaces for this. > /* > * Insert svc with budget in RunQ according to EDF: > * vcpus with smaller deadlines go first. > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) > } > > /* > + * Insert svc into the timerq, maintaining ascending order by next_sched_needed. > + */ > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc) > +{ > + struct rt_private *prv = rt_priv(ops); > + struct list_head *timerq = rt_timerq(ops); > + struct list_head *iter = timerq; > + > + ASSERT( spin_is_locked(&prv->lock) ); > + > + ASSERT( list_empty(&svc->t_elem) ); > + The blank line between the asserts, I'd kill it. 
> + list_for_each(iter, timerq) > + { > You're still using tabs, and mixing them with spaces, making things look even more cumbersome. > +/* > + * Return how far the lowest time on the timerq (that is after NOW) is in the > + * future. > + * Lock must be grabbed before calling this. > + */ > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now) > +{ > + struct list_head *timerq = rt_timerq(ops); > + struct list_head *iter, *tmp; > + > + list_for_each_safe(iter, tmp, timerq) > + { > + struct rt_vcpu * iter_svc = __t_elem(iter); > + > + if ( iter_svc->next_sched_needed > now ) > + return (iter_svc->next_sched_needed - now); > + else > + __t_remove(iter_svc); > + } > + > + return MAX_SCHEDULE; > +} > If using a timer, you can just get rid of items while, in the timer routine, you handle the event associated to them. Also, why is MAX_SCHEDULE still there? > +/* > + * Updates the next_sched_needed field for a vcpu, which is used for > + * determining the next needed schedule time for this vcpu. Then updates > + * timerq via reinsert. > + */ > +static void update_sched_time(const struct scheduler *ops, s_time_t now, > + struct rt_vcpu *svc) > +{ > + /* update next needed schedule time */ > + if( test_bit(__RTDS_scheduled, &svc->flags) ) > + svc->next_sched_needed = now + svc->cur_budget; > + else > + svc->next_sched_needed = svc->cur_deadline; > + > + /* Remove and reinsert to maintain sorted order in timerq */ > + __t_remove(svc); > + __timerq_insert(ops, svc); > + > + return; > +} > And here's the multiplexing, which --even worse-- (may) require rearranging the queue! We really don't need anything like this. > /* > + * Pick a cpu where to run a vcpu, > + * possibly kicking out the vcpu running there > + * Called by wake() and context_saved() > + * We have a running candidate here, the kick logic is: > + * Among all the cpus that are within the cpu affinity > + * 1) if the new->cpu is idle, kick it. 
This could benefit cache hit > + * 2) if there are any idle vcpu, kick it. > + * 3) now all pcpus are busy; > + * among all the running vcpus, pick lowest priority one > + * if snext has higher priority, kick it. > + * > + * TODO: > + * 1) what if these two vcpus belongs to the same domain? > + * replace a vcpu belonging to the same domain introduces more overhead > + * > + * lock is grabbed before calling this function > + */ > +static void > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) > +{ > + struct rt_private *prv = rt_priv(ops); > + struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */ > + struct rt_vcpu *iter_svc; > + struct vcpu *iter_vc; > + int cpu = 0, cpu_to_tickle = 0; > + cpumask_t not_tickled; > + cpumask_t *online; > + > [snip] And here you are moving a function up. In general, it is better to have separate patches that do the movings, with the fact that the are all about moving and that the code is not being changed, clearly stated in the commit message. This is because it is really really hard to figure out, e.g. from here, whether you also changed something in the function while moving it, making reviewing a lot harder and more prone to error. That being said, in this specific case, you're moving up runq_tickle(), the purpose of which is to trigger a call to rt_schedule() (after going through the softirq machinery)... in order to call it in rt_schedule(). That's pretty tricky, and is not how tickling should work. Again, with an approach that really take advantage of timers, this won't be necessary. 
Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-09 12:53 ` Dario Faggioli @ 2015-06-10 4:18 ` Dagaen Golomb 2015-06-10 22:30 ` Dario Faggioli 2015-06-10 5:54 ` Meng Xu 1 sibling, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-10 4:18 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 15723 bytes --] Thanks for the feedback. > Thanks for doing this. The first question I have is whether you run any > test/benchmark that you can show the result of here. > > For instance, a few weeks ago, Meng (while trying to do a completely > different thing) sent here on the list some numbers about the frequency > of the scheduler being called, and the overhead caused by that... Would > it be hard to run the same experiments and collect the numbers, with and > without your patch? > I could run some experiments to measure this. An easier quick test may be to count invocations and run the same workload on each, making sure each runs a long time to remove biases. > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote: > > To do this, we create a new list that holds, for each > > vcpu, the time least into the future that it may need to be > > rescheduled. > > > Ok. Actually, what I really expected to see in this patch was, either: > a) a new, scheduler wide, list of replenishment times _and_ a timer, > always (re)programmed to fire at the most imminent one; or > b) one timer for each vcpu, always (re)programmed to fire at the time > instant when the replenishment for that vcpu should happen. > a) is essentially what we have going on here. The list is exactly that, a list of possible scheduling change times (including replenishments as they are coalesced with period), one per vcpu, and the timer is triggered using the earliest one. 
> And note that, when I say "timer", I mean an actual Xen timer, i.e., > those things that are started, stopped, and with a timer handling > routine being called when they expire. For an example, you can have a > look, in sched_credit.c, at 'ticker' or 'master_ticker'. > I will look at this. However, the current solution is likely functionally equivalent and, with only one timer and a single list used to arm it, I'm not sure if using a Xen timer is necessary. Do they incur any extra overhead or coarsen granularity? > > Deciding between a) or b) isn't easy, without actually trying to > implement them. I personally prefer b), and I think it would be a > superior long term solution (see right below), but given te current > architecture of the RTDS scheduler (e.g., the fact that it has a global > runq, etc), I won't nack a), which would most likely be simpler. > I agree b) would may be better in the long run, but with the current architecture a) is simpler. b) can be future work as the scheduler evolves. > Performance and overhead wise, again, hard to tell in advance. b) would > introduce more overhead (as there are more timers), but will likely > reveal more scalable (which is not something of great importance for > this scheduler) and may allow for better latency (which _is_ something > of great importance for this scheduler :-) ), since there won't be the > need to lock the whole scheduler during the replenishments (which is, > OTOH, necessary with a)). > > And that's it for the replenishments. For enforcing the budget, well, we > have the ret.time value of the task_slice struct returned by > rt_schedule, so that's event driven already, and we don't need to do > much about it. > The timerq will also hold the end of period if that is the next relevant event, and if at head of the timerq this value will be used to arm the timer to let the task run until budget exhaustion. > Does all this make sense? Am I making myself clear enough? > If not, feel free to ask. 
> > The scheduler chooses the lowest time off of this > > list and waits until the specified time instead of running every > > 1 ms as it did before. > > > Yeah, well, I see what you mean and how you this is actually succeeding > (at least AFAICT, by looking at the code, again, it would be nice to > have some numbers) in improving the scheduler behavior. > > However, this transition toward event driven-ness has two goals: > * improve the scheduler behavior [check, at least to some extent] > * improve the code (readability, maintainability, etc.) > [not check at all :-(] > > As an example, the __repl_update() function: it's called 2 times inside > rt_schedule() and a third from rt_context_saved(), which is basically > like saying it's called 3 times from rt_schedule(), and this always made > very few sense to me. In fact, I think that scheduling decisions and > replenishment events shouldn't be so tightly coupled. There are > interdependencies, of course (a replenishment could call for a > scheduling decision to be taken), but not like this. That's why I say > I'd like to see a timer handling routine dealing with replenishment, and > let rt_schedule() do it's job, which is: > - enforcing the budget of the running vcpu. If it expires, > _(re)program_ the replenishment timer > - choose the best vcpu to run next, if necessary > > With this patch, you're still using rt_schedule() to do both scheduling > and replenishments, although you're now doing it at actual replenishment > times, instead of checking for them every millisecond. > > I think once you understand that the timerq is not only replenishments, but any event that could cause a branch in the scheduler behavior, it becomes more palatable to have them intertwined. Really, the timerq and scheduling aren't as intertwined as they appear, they are practically isolated functionally. It's just that the timerq is most naturally serviced during scheduling events when data that affects scheduling decisions is changing. 
Its straightforward and efficient. Let me know your thoughts on this. > Also, the bits and pieces that you need to add in order to deal with > this new queue is, really, making things even more complex than they are > now, which is undesirable. > While it does add some complexity, I don't feel a single queue and managing it is that much extra complexity. > So, all this being said, let me know what you think about it (and that > applies to everyone else as well, of course, in particular Meng). > > I won't comment on the code in too much details, as it will require some > quite radical restructuring, but I'll skim through it and provide some > hints anyway. > > > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> > > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> > > --- > > xen/common/sched_rt.c | 319 > ++++++++++++++++++++++++++++++++++--------------- > > 1 file changed, 222 insertions(+), 97 deletions(-) > > > First of all, it's a big patch. It's only changing one file and one > logical component, and for that reason, TBH, it's not that hard to > review. Still I think you can break it in at least two, and perhaps even > more, if you try to implement what I described above. > Noted. Note that its not as large as it seems, a large chunk of the deletions and an equal number of insertions include moving a function due to visibility to other functions. > > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c > > index 7c39a9e..25f0458 100644 > > --- a/xen/common/sched_rt.c > > +++ b/xen/common/sched_rt.c > > > @@ -156,6 +160,7 @@ struct rt_vcpu { > > s_time_t cur_budget; /* current budget */ > > s_time_t last_start; /* last start time */ > > s_time_t cur_deadline; /* current deadline for EDF */ > > + s_time_t next_sched_needed; /* next time to make scheduling > decision */ > > > As an example of why I said that things should become simpler, and are > instead becoming more complex: with my solution, you don't need anything > like this. 
In fact, budget enforcement is already being done ok in > rt_schedule(), so no need to do anything more/different about it. > Replenishments should be programmed to fire at cur_deadline, so again, > no need to add this new field, and multiplex its actual value between > budget and deadline (which, yes, counts as being something rather > complex for me! :-D). > As mentioned, the timerq is handling all events that could change the scheduling decision, not just replenishments. This tracks the earliest time anything cause this to be scheduled differently, and could be extended if any more insights are found. Budget enforcement is being done in rt_schedule but its being done by this very list: a budget expiration is one possible event that would require a vcpu reschedule. > > > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc) > > list_del_init(&svc->q_elem); > > } > > > > +static inline void __t_remove(struct rt_vcpu *svc) > > +{ > > + if( !list_empty(&svc->t_elem) ) > > + list_del_init(&svc->t_elem); > > +} > > + > > > You're using hard tabs for indenting here. As you see everywhere esle, > Xen wants 4 spaces for this. > Sorry, I thought I had gotten them all changed to spaces. My editor wasn't configured correctly at first so some are lurking around. I will exterminate the remaining ones. > > /* > > * Insert svc with budget in RunQ according to EDF: > > * vcpus with smaller deadlines go first. > > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct > rt_vcpu *svc) > > } > > > > /* > > + * Insert svc into the timerq, maintaining ascending order by > next_sched_needed. 
> > + */ > > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu > *svc) > > +{ > > + struct rt_private *prv = rt_priv(ops); > > + struct list_head *timerq = rt_timerq(ops); > > + struct list_head *iter = timerq; > > + > > + ASSERT( spin_is_locked(&prv->lock) ); > > + > > + ASSERT( list_empty(&svc->t_elem) ); > > + > The blank line between the asserts, I'd kill it. > Can do. > > + list_for_each(iter, timerq) > > + { > > > You're still using tabs, and mixing them with spaces, making things look > even more cumbersome. > Again, sorry. Will exterminate. > > > +/* > > + * Return how far the lowest time on the timerq (that is after NOW) is > in the > > + * future. > > + * Lock must be grabbed before calling this. > > + */ > > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now) > > +{ > > + struct list_head *timerq = rt_timerq(ops); > > + struct list_head *iter, *tmp; > > + > > + list_for_each_safe(iter, tmp, timerq) > > + { > > + struct rt_vcpu * iter_svc = __t_elem(iter); > > + > > + if ( iter_svc->next_sched_needed > now ) > > + return (iter_svc->next_sched_needed - now); > > + else > > + __t_remove(iter_svc); > > + } > > + > > + return MAX_SCHEDULE; > > +} > > > If using a timer, you can just get rid of items while, in the timer > routine, you handle the event associated to them. > > Also, why is MAX_SCHEDULE still there? > Honestly, events do not have to be removed here, but its done to prevent walking a list of stale values to get to the newest one on the next call. This is done more for performance. Their non-removal would be functionally equivalent. With the timer alternative you mention, where would the timer events and their data be held? I think that could be extra complexity, unless I fail to understand the idea completely. The current approach is actually somewhat simple if you think about it, requiring just one piece of information per vcpu (the next_sched_needed field) which is updated as information is available. 
It also uses a guaranteed O(1) increase in memory with how the queue and next_sched_time is set up, and most computational parts are rolled into existing routines that need to be done anyways. MAX_SCHEDULE may not be required, but I have it there as an "in case." For example, it will cause the scheduler to be called after MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its there to ensure progress in case of any bugs or unexpected behavior. > > +/* > > + * Updates the next_sched_needed field for a vcpu, which is used for > > + * determining the next needed schedule time for this vcpu. Then updates > > + * timerq via reinsert. > > + */ > > +static void update_sched_time(const struct scheduler *ops, s_time_t now, > > + struct rt_vcpu *svc) > > +{ > > + /* update next needed schedule time */ > > + if( test_bit(__RTDS_scheduled, &svc->flags) ) > > + svc->next_sched_needed = now + svc->cur_budget; > > + else > > + svc->next_sched_needed = svc->cur_deadline; > > + > > + /* Remove and reinsert to maintain sorted order in timerq */ > > + __t_remove(svc); > > + __timerq_insert(ops, svc); > > + > > + return; > > +} > > > And here's the multiplexing, which --even worse-- (may) require > rearranging the queue! We really don't need anything like this. > I see what you mean here, but this doesn't seem bad to me. Right now its only these two events and a simple if-else, but what is great is that this method can be updated to include any number of tricks or insights that could help determine when something may need be scheduled or not. Right now it seems complex for what appears to be just two cases, but many more cases or an entirely different accounting method could be placed here and it would be fine. It makes for a nice "ground zero" for modifying the timer update behavior. 
> > /* > > + * Pick a cpu where to run a vcpu, > > + * possibly kicking out the vcpu running there > > + * Called by wake() and context_saved() > > + * We have a running candidate here, the kick logic is: > > + * Among all the cpus that are within the cpu affinity > > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit > > + * 2) if there are any idle vcpu, kick it. > > + * 3) now all pcpus are busy; > > + * among all the running vcpus, pick lowest priority one > > + * if snext has higher priority, kick it. > > + * > > + * TODO: > > + * 1) what if these two vcpus belongs to the same domain? > > + * replace a vcpu belonging to the same domain introduces more > overhead > > + * > > + * lock is grabbed before calling this function > > + */ > > +static void > > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) > > +{ > > + struct rt_private *prv = rt_priv(ops); > > + struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */ > > + struct rt_vcpu *iter_svc; > > + struct vcpu *iter_vc; > > + int cpu = 0, cpu_to_tickle = 0; > > + cpumask_t not_tickled; > > + cpumask_t *online; > > + > > > [snip] > > And here you are moving a function up. In general, it is better to have > separate patches that do the movings, with the fact that the are all > about moving and that the code is not being changed, clearly stated in > the commit message. This is because it is really really hard to figure > out, e.g. from here, whether you also changed something in the function > while moving it, making reviewing a lot harder and more prone to error. > Ah, thanks for mentioning. The move was required but I didn't know a separate patch was customary. I can make it so. > That being said, in this specific case, you're moving up runq_tickle(), > the purpose of which is to trigger a call to rt_schedule() (after going > through the softirq machinery)... in order to call it in rt_schedule(). > That's pretty tricky, and is not how tickling should work. 
> It set up to only tickle if needed. I'm not sure if this is an issue, > Again, with an approach that really take advantage of timers, this won't > be necessary. > I think there could be some discussion on the approaches. Hopefully others comment as well. As mentioned this is actually a logically quite simple change, with good efficiency, and the ground zero makes changes using lots of information easy although right now it was kept simple to get the change out. Future improvements to the update function were somewhat expected as future work. Thanks for the comments. ~Dagaen [-- Attachment #1.2: Type: text/html, Size: 20508 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-10 4:18 ` Dagaen Golomb @ 2015-06-10 22:30 ` Dario Faggioli 2015-06-13 20:33 ` Dagaen Golomb 0 siblings, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-10 22:30 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 9922 bytes --] No HTML, please. On Wed, 2015-06-10 at 00:18 -0400, Dagaen Golomb wrote: > And note that, when I say "timer", I mean an actual Xen timer, > i.e., > those things that are started, stopped, and with a timer > handling > routine being called when they expire. For an example, you can > have a > look, in sched_credit.c, at 'ticker' or 'master_ticker'. > > > I will look at this. However, the current solution is likely > functionally equivalent and, with only one timer and a single list > used to arm it, I'm not sure if using a Xen timer is necessary. > It is functionally equivalent, by chance (see the issue about calling runq_tickle() on current vcpus in my reply to Meng). The reason why a different approach is required is making the code look better, reducing (instead than increasing) the complexity of sched_rt.c, lowering the overhead caused by long running operations performed while holding the global scheduler spin lock, and improving scalability. > Do they incur any extra overhead or coarsen granularity? > "extra overhead or coarsen granularity" as compared to what? s_timer in schedule.c (which is what you're using) is one of them already! What I meant with "an actual timer" is that you should ad a new one, and move some of the stuff currently being done in rt_schedule() in its handling routine, rather than just adding a new queue of events to be serviced by abusing s_timer. > Deciding between a) or b) isn't easy, without actually trying > to > implement them. 
I personally prefer b), and I think it would > be a > superior long term solution (see right below), but given te > current > architecture of the RTDS scheduler (e.g., the fact that it has > a global > runq, etc), I won't nack a), which would most likely be > simpler. > > > I agree b) would may be better in the long run, but with the current > architecture a) is simpler. b) can be future work as the scheduler > evolves. > Sure. And in fact, I'm fine with a), if done properly. > > With this patch, you're still using rt_schedule() to do both > scheduling > and replenishments, although you're now doing it at actual > replenishment > times, instead of checking for them every millisecond. > > I think once you understand that the timerq is not only > replenishments, but any event that could cause a branch is the > scheduler behavior, it becomes more palatable to have them > intertwined. > I got that, and I'm asking you to do it differently. > Really, the timerq and scheduling aren't as intertwined as they > appear, they are piratically isolated functionally. Its just that the > timerq is most naturally serviced during scheduling events when data > that effects scheduling decisions is changing. Its straightforward and > efficient. > Yeah, replenishments are 'naturally serviced' in a 'straightforward and efficient' way by looping on all runnable vcpus, in more than one place, from within super-hot paths, with the global scheduler spin lock held. Neat! :-/ Oh, and that's before you introducing of another list to be taken care of, still under the same conditions. :-O > Also, the bits and pieces that you need to add in order to > deal with > this new queue is, really, making things even more complex > than they are > now, which is undesirable. > While it does add some complexity, I don't feel a single queue and > managing it is that much extra complexity. 
> But in fact the point of making the scheduler 'event driven' is to take advantage of the chances that this offers for getting stuff *out* of rt_schedule(), and the purpose is not "not adding that much extra complexity", is making it _simpler_. > > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c > > index 7c39a9e..25f0458 100644 > > --- a/xen/common/sched_rt.c > > +++ b/xen/common/sched_rt.c > > > @@ -156,6 +160,7 @@ struct rt_vcpu { > > s_time_t cur_budget; /* current budget */ > > s_time_t last_start; /* last start time */ > > s_time_t cur_deadline; /* current deadline for EDF > */ > > + s_time_t next_sched_needed; /* next time to make > scheduling decision */ > > > As an example of why I said that things should become simpler, > and are > instead becoming more complex: with my solution, you don't > need anything > like this. In fact, budget enforcement is already being done > ok in > rt_schedule(), so no need to do anything more/different about > it. > Replenishments should be programmed to fire at cur_deadline, > so again, > no need to add this new field, and multiplex its actual value > between > budget and deadline (which, yes, counts as being something > rather > complex for me! :-D). > > > As mentioned, the timerq is handling all events that could change the > scheduling decision, not just replenishments. > Yes, exactly, it's handling *too much* events. :-) For example, have a look at 'struct vcpu', in xen/include/xen/sched.h. It's got 3 timers in it: struct timer periodic_timer; struct timer singleshot_timer; struct timer poll_timer; /* timeout for SCHEDOP_poll */ It probably would have been possible to just use one, with a single and mixed event queue, as you're doing, a 1k lines handling routines, etc. Do you it it would have been easier or harder to implement, understand, debug and maintain? I bet on harder. 
> This tracks the earliest time anything cause this to be scheduled > differently, and could be extended if any more insights are found. > Budget enforcement is being done in rt_schedule but its being done by > this very list: a budget expiration is one possible event that would > require a vcpu reschedule. > OMG, 'extended'... You're thinking to actually put more stuff in there?!? :-O It's a rather key and already too long and complex critical section, so please, let's aim at shortening and making it simpler, i.e., at improving things, not making them worse. > > > +/* > > + * Return how far the lowest time on the timerq (that is > after NOW) is in the > > + * future. > > + * Lock must be grabbed before calling this. > > + */ > > +static s_time_t __timerq_next(const struct scheduler *ops, > s_time_t now) > > +{ > > + struct list_head *timerq = rt_timerq(ops); > > + struct list_head *iter, *tmp; > > + > > + list_for_each_safe(iter, tmp, timerq) > > + { > > + struct rt_vcpu * iter_svc = __t_elem(iter); > > + > > + if ( iter_svc->next_sched_needed > now ) > > + return (iter_svc->next_sched_needed - now); > > + else > > + __t_remove(iter_svc); > > + } > > + > > + return MAX_SCHEDULE; > > +} > > > If using a timer, you can just get rid of items while, in the > timer > routine, you handle the event associated to them. > > Also, why is MAX_SCHEDULE still there? > > > Honestly, events do not have to be removed here, but its done to > prevent walking a list of stale values to get to the newest one on the > next call. This is done more for performance. Their non-removal would > be functionally equivalent. > Of course you should remove the stale entries, I certainly was not arguing that the list should just grow indefinitely! Point is, again, that this is another list walk occurring in an hot path, with a big spin lock held. We want to get rid of such thing, not adding more of it. > With the timer alternative you mention, where would the timer events > and their data be held? 
I think that could be extra complexity, unless > I fail to understand the idea completely. > In an event queue like yours, of course. But you won't go through it and/or bookkeep it while in hot paths, with the scheduler lock held. See my email to Meng to have more details on what I have in mind, or fell free to ask more details. > MAX_SCHEDULE may not be required, but I have it there as an "in case." > For example, it will cause the scheduler to be called after > MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its > there to ensure progress in case of any bugs or unexpected behavior. > Wait, so, all this work, and then you still want to call rt_schedule() every millisecond, when there's nothing to do?!?! :-O > That being said, in this specific case, you're moving up > runq_tickle(), > the purpose of which is to trigger a call to rt_schedule() > (after going > through the softirq machinery)... in order to call it in > rt_schedule(). > That's pretty tricky, and is not how tickling should work. > It set up to only tickle if needed. I'm not sure if this is an issue, > It's wrong, AFAICT. See my reply to Meng and, please, comment by replying to it, if you've got anything to say about this, to make the discussion easier to follow. Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-10 22:30 ` Dario Faggioli @ 2015-06-13 20:33 ` Dagaen Golomb 2015-06-16 17:07 ` Dagaen Golomb 0 siblings, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-13 20:33 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li > No HTML, please. Got it, sorry. >> And note that, when I say "timer", I mean an actual Xen timer, >> i.e., >> those things that are started, stopped, and with a timer >> handling >> routine being called when they expire. For an example, you can >> have a >> look, in sched_credit.c, at 'ticker' or 'master_ticker'. >> >> >> I will look at this. However, the current solution is likely >> functionally equivalent and, with only one timer and a single list >> used to arm it, I'm not sure if using a Xen timer is necessary. >> > It is functionally equivalent, by chance (see the issue about calling > runq_tickle() on current vcpus in my reply to Meng). The reason why a > different approach is required is making the code look better, reducing > (instead than increasing) the complexity of sched_rt.c, lowering the > overhead caused by long running operations performed while holding the > global scheduler spin lock, and improving scalability. > >> Do they incur any extra overhead or coarsen granularity? >> > "extra overhead or coarsen granularity" as compared to what? s_timer in > schedule.c (which is what you're using) is one of them already! > > What I meant with "an actual timer" is that you should ad a new one, and > move some of the stuff currently being done in rt_schedule() in its > handling routine, rather than just adding a new queue of events to be > serviced by abusing s_timer. Okay, I was wondering what you mean by "an actual timer." > >> Deciding between a) or b) isn't easy, without actually trying >> to >> implement them. 
I personally prefer b), and I think it would >> be a >> superior long term solution (see right below), but given te >> current >> architecture of the RTDS scheduler (e.g., the fact that it has >> a global >> runq, etc), I won't nack a), which would most likely be >> simpler. >> >> >> I agree b) would may be better in the long run, but with the current >> architecture a) is simpler. b) can be future work as the scheduler >> evolves. >> > Sure. And in fact, I'm fine with a), if done properly. >> >> With this patch, you're still using rt_schedule() to do both >> scheduling >> and replenishments, although you're now doing it at actual >> replenishment >> times, instead of checking for them every millisecond. > >> >> I think once you understand that the timerq is not only >> replenishments, but any event that could cause a branch is the >> scheduler behavior, it becomes more palatable to have them >> intertwined. >> > I got that, and I'm asking you to do it differently. > >> Really, the timerq and scheduling aren't as intertwined as they >> appear, they are piratically isolated functionally. Its just that the >> timerq is most naturally serviced during scheduling events when data >> that effects scheduling decisions is changing. Its straightforward and >> efficient. >> > Yeah, replenishments are 'naturally serviced' in a 'straightforward and > efficient' way by looping on all runnable vcpus, in more than one place, > from within super-hot paths, with the global scheduler spin lock held. > Neat! :-/ > > Oh, and that's before you introducing of another list to be taken care > of, still under the same conditions. :-O The paths are already hot, so adding a small amount of extra work is a small percentage increase. I'm usually against this kind of thing, too, but separating to another timer, while runnable independently, may actually be more work if it needs to use some of the same maintenance behavior. I guess the main argument against is maintainability. 
> >> Also, the bits and pieces that you need to add in order to >> deal with >> this new queue is, really, making things even more complex >> than they are >> now, which is undesirable. > >> While it does add some complexity, I don't feel a single queue and >> managing it is that much extra complexity. >> > But in fact the point of making the scheduler 'event driven' is to take > advantage of the chances that this offers for getting stuff *out* of > rt_schedule(), and the purpose is not "not adding that much extra > complexity", is making it _simpler_. I understand where you are coming from now. I was looking at is mostly from the perspective of performance. This explains our differences. > > >> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c >> > index 7c39a9e..25f0458 100644 >> > --- a/xen/common/sched_rt.c >> > +++ b/xen/common/sched_rt.c >> >> > @@ -156,6 +160,7 @@ struct rt_vcpu { >> > s_time_t cur_budget; /* current budget */ >> > s_time_t last_start; /* last start time */ >> > s_time_t cur_deadline; /* current deadline for EDF >> */ >> > + s_time_t next_sched_needed; /* next time to make >> scheduling decision */ >> > >> As an example of why I said that things should become simpler, >> and are >> instead becoming more complex: with my solution, you don't >> need anything >> like this. In fact, budget enforcement is already being done >> ok in >> rt_schedule(), so no need to do anything more/different about >> it. >> Replenishments should be programmed to fire at cur_deadline, >> so again, >> no need to add this new field, and multiplex its actual value >> between >> budget and deadline (which, yes, counts as being something >> rather >> complex for me! :-D). >> >> >> As mentioned, the timerq is handling all events that could change the >> scheduling decision, not just replenishments. >> > Yes, exactly, it's handling *too much* events. :-) > > For example, have a look at 'struct vcpu', in > xen/include/xen/sched.h. 
It's got 3 timers in it: > > struct timer periodic_timer; > struct timer singleshot_timer; > struct timer poll_timer; /* timeout for SCHEDOP_poll */ > > It probably would have been possible to just use one, with a single and > mixed event queue, as you're doing, a 1k lines handling routines, etc. > > Do you it it would have been easier or harder to implement, understand, > debug and maintain? I bet on harder. Point taken. > >> This tracks the earliest time anything cause this to be scheduled >> differently, and could be extended if any more insights are found. >> Budget enforcement is being done in rt_schedule but its being done by >> this very list: a budget expiration is one possible event that would >> require a vcpu reschedule. >> > OMG, 'extended'... You're thinking to actually put more stuff in > there?!? :-O > > It's a rather key and already too long and complex critical section, so > please, let's aim at shortening and making it simpler, i.e., at > improving things, not making them worse. This can again be explained by our goal mismatch. I see where you are coming from now. >> >> > +/* >> > + * Return how far the lowest time on the timerq (that is >> after NOW) is in the >> > + * future. >> > + * Lock must be grabbed before calling this. >> > + */ >> > +static s_time_t __timerq_next(const struct scheduler *ops, >> s_time_t now) >> > +{ >> > + struct list_head *timerq = rt_timerq(ops); >> > + struct list_head *iter, *tmp; >> > + >> > + list_for_each_safe(iter, tmp, timerq) >> > + { >> > + struct rt_vcpu * iter_svc = __t_elem(iter); >> > + >> > + if ( iter_svc->next_sched_needed > now ) >> > + return (iter_svc->next_sched_needed - now); >> > + else >> > + __t_remove(iter_svc); >> > + } >> > + >> > + return MAX_SCHEDULE; >> > +} >> > >> If using a timer, you can just get rid of items while, in the >> timer >> routine, you handle the event associated to them. >> >> Also, why is MAX_SCHEDULE still there? 
>> >> >> Honestly, events do not have to be removed here, but its done to >> prevent walking a list of stale values to get to the newest one on the >> next call. This is done more for performance. Their non-removal would >> be functionally equivalent. >> > Of course you should remove the stale entries, I certainly was not > arguing that the list should just grow indefinitely! > > Point is, again, that this is another list walk occurring in an hot > path, with a big spin lock held. We want to get rid of such thing, not > adding more of it. > >> With the timer alternative you mention, where would the timer events >> and their data be held? I think that could be extra complexity, unless >> I fail to understand the idea completely. >> > In an event queue like yours, of course. But you won't go through it > and/or bookkeep it while in hot paths, with the scheduler lock held. > > See my email to Meng to have more details on what I have in mind, or > fell free to ask more details. > >> MAX_SCHEDULE may not be required, but I have it there as an "in case." >> For example, it will cause the scheduler to be called after >> MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its >> there to ensure progress in case of any bugs or unexpected behavior. >> > Wait, so, all this work, and then you still want to call rt_schedule() > every millisecond, when there's nothing to do?!?! :-O > >> That being said, in this specific case, you're moving up >> runq_tickle(), >> the purpose of which is to trigger a call to rt_schedule() >> (after going >> through the softirq machinery)... in order to call it in >> rt_schedule(). >> That's pretty tricky, and is not how tickling should work. > >> It set up to only tickle if needed. I'm not sure if this is an issue, >> > It's wrong, AFAICT. See my reply to Meng and, please, comment by > replying to it, if you've got anything to say about this, to make the > discussion easier to follow. 
> > Regards, > Dario I understand what caused our mismatch now. I think option a) you mentioned makes sense given your values. I am going to look into the details of this approach and get back with any questions or concerns. Regards, ~Dagaen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-13 20:33 ` Dagaen Golomb @ 2015-06-16 17:07 ` Dagaen Golomb 2015-06-17 0:20 ` Dario Faggioli 0 siblings, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-16 17:07 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li I'm not replying inline because this is a more general response that is not limited to any particular comment. Separating the replenishment from the scheduler may be problematic. The nature of the gEDF scheduler is that when a priority changes it should be instantly reflected. If replenishments were done by themselves, then the scheduler may decide to wait for some period of time, and in this same time period a replenishment could change what should be executing. Either the gEDF policy will be violated or the replenishment routine would have to notify the scheduler after any replenishment, requiring some form of interaction and possibly more scheduler invocations than the current structure. The other option is for the scheduler to check for replenishments, as it does now. Without any interaction a violation of the policy could ensue. The gEDF is not like the credit scheduler where there is an accounting period where, during this time, policy may be violated temporarily. This model is much easier in the credit scheduler because there is a fixed slice. Imagine if we wanted an accounting period of 0 for the scheduler. The only option would be to recompute the replenishments before any scheduling and to run the scheduler at exactly when these replenishments would occur. As in, replenishments and scheduling would have to be coalesced. This is essentially what the current RTDS patch is doing. 
Further, with two timer routines we now need to deal with any ordering or preemption between them (or logic to prevent / order such) which I could argue is far more complexity than having them done at once as it is now. Being able to reason about the execution with one thread in mind is easier for most people than several threads. If both are armed for the same time, which should execute? Scheduler first before a possibly applicable replenishment? Or replenishment always first? Both with added logic to enforce this and prevent preemption, of course. Due to this, it is my belief that by the nature of the gEDF policy the current solution is better, mostly because it appears to actually be the least complex way that is functionally correct. The gEDF policy just isn't well suited for decoupling events, as the events are highly dependent on one another, and particularly dependent at an instantaneous timescale. It would be a hard pitch for a gEDF scheduler with a similar "accounting period" where the gEDF policy could be violated. That is blasphemy in the real-time world. Any other option for implementing the policy correctly would include more headache due to a multiple execution mindset and any enforcement and interaction logic. Its also worth noting that the stereotypical textbook event-driven model is as we have now: a single event loop that handles events. In this case the scheduler is the conceptually the main loop and this makes perfect sense: its deciding what to run (think of the running VCPUs as callback functions and the abstraction falls into place perfectly). The event loop needs some information (about replenishments) before deciding, and collecting this would be part of the decode and decision phase for an event signal. Not only is there a complexity issue, but as mentioned before this may be the best performing option, making the most information available and the best timing decision possible. 
Adding a few extra cycles to a hot path, even with a lock being held, is worth it if the scheduler and context switching is done less. I'm sure the entire pre-change scheduler is much longer than the time we've added, and the other model proposed with more timers may require just as much added time as well as aforementioned complexity, or would suffer from less intelligent decisions (hence more execution and/or policy violation) as dependance and cooperation between the timers decreased. For the above reasons I think you should reconsider the current implementation, as it appears it may be the least complex and easiest to reason about solution. This is not to say that the current patch doesn't have issues. We would still aim to fix any of the concerns about multiple softirqs or CPU tickles. Let me know if I'm missing some key insight into how the behavior could be implemented correctly and beautifully using the multiple timer approach. I simply don't see how it can be done without heavy interaction and information sharing between them which really defeats the purpose. Regards, ~Dagaen On Sat, Jun 13, 2015 at 4:33 PM, Dagaen Golomb <dgolomb@seas.upenn.edu> wrote: >> No HTML, please. > > Got it, sorry. > >>> And note that, when I say "timer", I mean an actual Xen timer, >>> i.e., >>> those things that are started, stopped, and with a timer >>> handling >>> routine being called when they expire. For an example, you can >>> have a >>> look, in sched_credit.c, at 'ticker' or 'master_ticker'. >>> >>> >>> I will look at this. However, the current solution is likely >>> functionally equivalent and, with only one timer and a single list >>> used to arm it, I'm not sure if using a Xen timer is necessary. >>> >> It is functionally equivalent, by chance (see the issue about calling >> runq_tickle() on current vcpus in my reply to Meng). 
The reason why a >> different approach is required is making the code look better, reducing >> (instead of increasing) the complexity of sched_rt.c, lowering the >> overhead caused by long running operations performed while holding the >> global scheduler spin lock, and improving scalability. >> >>> Do they incur any extra overhead or coarsen granularity? >>> >> "extra overhead or coarsen granularity" as compared to what? s_timer in >> schedule.c (which is what you're using) is one of them already! >> >> What I meant with "an actual timer" is that you should add a new one, and >> move some of the stuff currently being done in rt_schedule() in its >> handling routine, rather than just adding a new queue of events to be >> serviced by abusing s_timer. > > Okay, I was wondering what you mean by "an actual timer." > >> >>> Deciding between a) or b) isn't easy, without actually trying >>> to >>> implement them. I personally prefer b), and I think it would >>> be a >>> superior long term solution (see right below), but given the >>> current >>> architecture of the RTDS scheduler (e.g., the fact that it has >>> a global >>> runq, etc), I won't nack a), which would most likely be >>> simpler. >>> >>> >>> I agree b) may be better in the long run, but with the current >>> architecture a) is simpler. b) can be future work as the scheduler >>> evolves. >>> >> Sure. And in fact, I'm fine with a), if done properly. >>> >>> With this patch, you're still using rt_schedule() to do both >>> scheduling >>> and replenishments, although you're now doing it at actual >>> replenishment >>> times, instead of checking for them every millisecond. >> >>> >>> I think once you understand that the timerq is not only >>> replenishments, but any event that could cause a branch in the >>> scheduler behavior, it becomes more palatable to have them >>> intertwined. >>> >> I got that, and I'm asking you to do it differently. 
>> >>> Really, the timerq and scheduling aren't as intertwined as they >>> appear, they are practically isolated functionally. It's just that the >>> timerq is most naturally serviced during scheduling events when data >>> that affects scheduling decisions is changing. It's straightforward and >>> efficient. >>> >> Yeah, replenishments are 'naturally serviced' in a 'straightforward and >> efficient' way by looping on all runnable vcpus, in more than one place, >> from within super-hot paths, with the global scheduler spin lock held. >> Neat! :-/ >> >> Oh, and that's before you introduce another list to be taken care >> of, still under the same conditions. :-O > > The paths are already hot, so adding a small amount of extra work is a small > percentage increase. I'm usually against this kind of thing, too, but separating > to another timer, while runnable independently, may actually be more work if it > needs to use some of the same maintenance behavior. I guess the main argument > against is maintainability. > >> >>> Also, the bits and pieces that you need to add in order to >>> deal with >>> this new queue is, really, making things even more complex >>> than they are >>> now, which is undesirable. >> >>> While it does add some complexity, I don't feel a single queue and >>> managing it is that much extra complexity. >>> >> But in fact the point of making the scheduler 'event driven' is to take >> advantage of the chances that this offers for getting stuff *out* of >> rt_schedule(), and the purpose is not "not adding that much extra >> complexity", is making it _simpler_. > > I understand where you are coming from now. I was looking at it mostly > from the perspective of performance. This explains our differences. 
> >> >> >>> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c >>> > index 7c39a9e..25f0458 100644 >>> > --- a/xen/common/sched_rt.c >>> > +++ b/xen/common/sched_rt.c >>> >>> > @@ -156,6 +160,7 @@ struct rt_vcpu { >>> > s_time_t cur_budget; /* current budget */ >>> > s_time_t last_start; /* last start time */ >>> > s_time_t cur_deadline; /* current deadline for EDF >>> */ >>> > + s_time_t next_sched_needed; /* next time to make >>> scheduling decision */ >>> > >>> As an example of why I said that things should become simpler, >>> and are >>> instead becoming more complex: with my solution, you don't >>> need anything >>> like this. In fact, budget enforcement is already being done >>> ok in >>> rt_schedule(), so no need to do anything more/different about >>> it. >>> Replenishments should be programmed to fire at cur_deadline, >>> so again, >>> no need to add this new field, and multiplex its actual value >>> between >>> budget and deadline (which, yes, counts as being something >>> rather >>> complex for me! :-D). >>> >>> >>> As mentioned, the timerq is handling all events that could change the >>> scheduling decision, not just replenishments. >>> >> Yes, exactly, it's handling *too much* events. :-) >> >> For example, have a look at 'struct vcpu', in >> xen/include/xen/sched.h. It's got 3 timers in it: >> >> struct timer periodic_timer; >> struct timer singleshot_timer; >> struct timer poll_timer; /* timeout for SCHEDOP_poll */ >> >> It probably would have been possible to just use one, with a single and >> mixed event queue, as you're doing, a 1k lines handling routines, etc. >> >> Do you it it would have been easier or harder to implement, understand, >> debug and maintain? I bet on harder. > > Point taken. > >> >>> This tracks the earliest time anything cause this to be scheduled >>> differently, and could be extended if any more insights are found. 
>>> Budget enforcement is being done in rt_schedule but its being done by >>> this very list: a budget expiration is one possible event that would >>> require a vcpu reschedule. >>> >> OMG, 'extended'... You're thinking to actually put more stuff in >> there?!? :-O >> >> It's a rather key and already too long and complex critical section, so >> please, let's aim at shortening and making it simpler, i.e., at >> improving things, not making them worse. > > This can again be explained by our goal mismatch. I see where you are > coming from now. > >>> >>> > +/* >>> > + * Return how far the lowest time on the timerq (that is >>> after NOW) is in the >>> > + * future. >>> > + * Lock must be grabbed before calling this. >>> > + */ >>> > +static s_time_t __timerq_next(const struct scheduler *ops, >>> s_time_t now) >>> > +{ >>> > + struct list_head *timerq = rt_timerq(ops); >>> > + struct list_head *iter, *tmp; >>> > + >>> > + list_for_each_safe(iter, tmp, timerq) >>> > + { >>> > + struct rt_vcpu * iter_svc = __t_elem(iter); >>> > + >>> > + if ( iter_svc->next_sched_needed > now ) >>> > + return (iter_svc->next_sched_needed - now); >>> > + else >>> > + __t_remove(iter_svc); >>> > + } >>> > + >>> > + return MAX_SCHEDULE; >>> > +} >>> > >>> If using a timer, you can just get rid of items while, in the >>> timer >>> routine, you handle the event associated to them. >>> >>> Also, why is MAX_SCHEDULE still there? >>> >>> >>> Honestly, events do not have to be removed here, but its done to >>> prevent walking a list of stale values to get to the newest one on the >>> next call. This is done more for performance. Their non-removal would >>> be functionally equivalent. >>> >> Of course you should remove the stale entries, I certainly was not >> arguing that the list should just grow indefinitely! >> >> Point is, again, that this is another list walk occurring in an hot >> path, with a big spin lock held. We want to get rid of such thing, not >> adding more of it. 
>> >>> With the timer alternative you mention, where would the timer events >>> and their data be held? I think that could be extra complexity, unless >>> I fail to understand the idea completely. >>> >> In an event queue like yours, of course. But you won't go through it >> and/or bookkeep it while in hot paths, with the scheduler lock held. >> >> See my email to Meng to have more details on what I have in mind, or >> fell free to ask more details. >> >>> MAX_SCHEDULE may not be required, but I have it there as an "in case." >>> For example, it will cause the scheduler to be called after >>> MAX_SCHEDULE even when no events are occurring (such as no vcpus). Its >>> there to ensure progress in case of any bugs or unexpected behavior. >>> >> Wait, so, all this work, and then you still want to call rt_schedule() >> every millisecond, when there's nothing to do?!?! :-O >> >>> That being said, in this specific case, you're moving up >>> runq_tickle(), >>> the purpose of which is to trigger a call to rt_schedule() >>> (after going >>> through the softirq machinery)... in order to call it in >>> rt_schedule(). >>> That's pretty tricky, and is not how tickling should work. >> >>> It set up to only tickle if needed. I'm not sure if this is an issue, >>> >> It's wrong, AFAICT. See my reply to Meng and, please, comment by >> replying to it, if you've got anything to say about this, to make the >> discussion easier to follow. >> >> Regards, >> Dario > > I understand what caused our mismatch now. I think option a) you > mentioned makes sense > given your values. I am going to look into the details of this > approach and get back with any > questions or concerns. > > Regards, > ~Dagaen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-16 17:07 ` Dagaen Golomb @ 2015-06-17 0:20 ` Dario Faggioli 2015-06-17 5:24 ` Dagaen Golomb 0 siblings, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-17 0:20 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 14992 bytes --] On Tue, 2015-06-16 at 13:07 -0400, Dagaen Golomb wrote: > I'm not replying inline because this is a more general response that > is not limited > to any particular comment. > That's fine. Thanks for this actually... I love discussing these things, it makes me remind the time when I was doing these stuff myself, and makes me feel young! :-P > Separating the replenishment from the scheduler may be problematic. The nature > of the gEDF scheduler is that when a priority changes it should be instantly > reflected. > 'instantly'. Is there such a thing? :-O > If replenishments were done by themsevles, then the scheduler may > decide to wait for some period of time, and in this same time period a > replenishment > could change what should be executing. > Well, the scheduler then would better *not* decide to wait for any period of time. It better act like this: as soon as a replenishment happens, and a new deadline is assigned to a vCPU, put the vCPU in the runqueue, in the proper position; and if such a newly replenished vCPU should now preempt any other, currently running, vCPU, 'tickle' the pCPU in question and let it reschedule, and pick up the newly replenished vCPU. > Either the gEDF policy will be > violated or > the replenishment routine would have to notify the scheduler after any > replenishment, requiring some form of interaction and possibly more scheduler > invocations than the current structure. > Obviously not. In fact, it's your 'current structure' that already invokes the scheduler for any replenishment. 
Doing more than that, would be really hard (impossible, I should say). In fact, it's the s_timer that you're making fire for any replenishment, and replenishments happen inside rt_schedule() itself. This is _the_definition_ of invoking the scheduler! OTOH, by using a (some) replenishment timer(s), the scheduler *may* *potentially* be invoked for any replenishment, yes, but it may also very well not. In fact, if even after a replenishment, the new deadline of the replenished vCPU is farther than the deadlines of all the currently running vCPUs, there's no need to tickle any pCPU, no need to call the scheduler. BTW, from what you say, it seems to me that we are in agreement: we _do_not_ want to call the scheduler upon any replenishment. Do as I say and we'll achieve that. > The other option is for the scheduler to > check for replenishments, as it does now. > Yeah, the wrong option. :-D :-D > Without any interaction a violation of > the policy could ensue. The gEDF is not like the credit scheduler where there is > an accounting period where, during this time, policy may be violated > temporarily. > Yep, I do have an idea of what EDF is. > Further, with two timer routines we now need to deal with any ordering > or preemption > between them (or logic to prevent / order such) which I could argue is far more > complexity than having them done at once as it is now. > Mmm... I think I'm starting to lose you. Ordering between two timers? I don't see what you mean. Allow me, though, to walk through the behavior of your current approach. I've done this already in a previous email, but no big deal re-doing it. Oh, actually, the current approach is buggy, as you tickle the wrong vCPU, but let's forget about that (let's assume it's fixed). Let's say that time for a replenishment to vCPU w has come. on pCPU X: . timer interrupt . TIMER_SOFTIRQ raised . process softirqs . SCHEDULE_SOFTIRQ raised . process softirqs . rt_schedule() . [spin_lock] . burn_budget(scurr) . 
__repl_update() (goes through all runnable vcpus!!) . snext = __runq_pick() -> says vCPU w should run on pCPU Y . runq_tickle(snext) ----> tickle pCPU Y [*] . [spin_unlock] . on pCPU Y: . process softirqs . SCHEDULE_SOFTIRQ . rt_schedule() . [spin_lock] . burn_budget(scurr) . __repl_update() (goes through all runnable vcpus!! AGAIN!!) . snext = runq_pick() ----------> pick vCPU w . [spin_unlock] . context switch: vCPU w now runs on pCPU Y So, you forced pCPU X through a full execution of rt_schedule(), with the spinlock, the scan of runqueue and depleted queue, and everything, just for the sake of figuring out that pCPU Y should reschedule. Then you (as soon as practical) actually reschedule on pCPU Y and context switch in favour of vCPU w, enacting EDF. That's 2 scheduler invocations, and rt_schedule() is still ugly and complex to read, maintain and improve. [*] and don't forget that this needs fixing, because right now it's just incorrect, and (just speculating) making runq_pick() behave in the way you want, may not be super-simple. Oh, I was almost forgetting! Wonder what happens in the time interval between when __repl_update() is called (from inside rt_schedule()) on pCPU X, and the actual time instant of the context switch on pCPU Y? Yeah, right, what happens is that you're violating EDF. :-/ This can be accounted as a form of overhead introduced by the implementation, I guess. In this specific case, it's due to the fact that you can't, from pCPU X, just change *instantly* what's running on pCPU Y. No, you've got to send an IPI, then Y has to react and it has to schedule, etc. This is of course dependant on the architecture of the scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt this, also does. So, see? Transient violation of EDF is just there, no matter the approach! Let's look at the timers way: on pCPU X: . timer interrupt . TIMER_SOFTIRQ raised . process softirqs . replenishment_timer_handler() . [spin_lock] . 
<for_each_depleted_vcpu(i, i.repl_time < NOW()) { . replenish(i) ---------> replenish vCPU w . runq_tickle(i) -------> tickle pCPU Y . }> . [spin_lock] . on pCPU Y: . process softirqs . SCHEDULE_SOFTIRQ . rt_schedule() . [spin_lock] . burn_budget(scurr) . snext = runq_pick() ----------> pick vCPU w . [spin_unlock] . context switch: vCPU x now runs on pCPU Y So this means only 1 invocation of the scheduler, and only on the pCPU that actually needs to reschedule. There's some overhead due to the timer, of course, but, notably, this allows for shorter critical sections, not to mention better looking and easier to maintain code. Are we still violating EDF? Of course we are, between the replenishment and context switch time, as above. That's unavoidable, I'm afraid. Summarizing, the two solutions are on par wrt temporary violation of the theoretical algorithm, but the timer based approach has loads of other benefits, so let's go with timers. :-) > If both are armed for the same time, which should execute? Scheduler > first before > a possibly applicable replenishment? Or replenishment always first? > Both with added > logic to enforce this and prevent preemption, of course. > I really don't see what you mean here. There won't be anything like that to take care of. Xen offers timers as an abstraction, and deals with them itself. You only need to take care of properly serializing rt_schedule() and the timer handling routine, for the code sections that require that. > Due to this, it is my belief that by the nature of the gEDF policy the > current solution is > better, mostly because it appears to actually be the least complex way that is > functionally correct. > Not really, no. I wouldn't be convinced of this, even if what you have right now were functionally correct. > The gEDF policy just isn't well suited for > decoupling events, as > the events are highly dependent on one another, and particularly dependent at an > instantaneous timescale. 
> And here it come 'instantaneous timescale' again. No, sorry, but I don't buy this. In fact, why does it not apply to vCPUs' wakeups? I think it does, conceptually... So, should we move the wakeup logic inside rt_schedule() too? > It would be a hard pitch for a gEDF scheduler with a > similar "accounting period" where the gEDF policy could be violated. > There's no accounting period of any sort. It's called overhead! What we need to do is to try to keep it as small as possible. And I'm quite sure that an effective way to do so is to keep critical sections short, especially when protected by (potentially) highly contented spin locks. RTDS, currently, suffer from poor scalability (and, I suspect, poor latency as well, at least under pressure), and one of the reasons why the work you're doing is interesting is that it can alleviate this... if done with that in mind, of course. > That is > blasphemy in the real-time world. > Blasphemy, that's a nice one! :-D :-D Well, I've never been good at religions, I have to admit. So, yes, I can live with being called blasphemous, I guess. :-) > Its also worth noting that the stereotypical textbook event-driven > model is as we have > now: a single event loop that handles events. In this case the scheduler is the > conceptually the main loop and this makes perfect sense: its deciding > what to run > (think of the running VCPUs as callback functions and the abstraction > falls into place > perfectly). The event loop needs some information (about replenishments) before > deciding, and collecting this would be part of the decode and decision > phase for an > event signal. > Talking about 'stereotypes', I don't have any textbook describing an event-driven model at hand right now. However, you RT-Xen people like and use LitmusRT, don't you? Well, you're doing the right thing, because it's a great piece of software! If you're interested, have a look in there. Spoiler: you'll find a lot of timers. 
:-D :-D More seriously, it's of course rather different, as Linux is not Xen, but since you're looking for stereotypes: - Linux (or at lease LitmusRT patch 2014.2, i.e., what I'm looking at right now) has ticks. This means a timer fires every X milliseconds (on every CPU), and task_tick_litmus() is run as a consequence. Such function does not (at all!) invoke the scheduler directly, it much rather just checks whether doing so is necessary, in this case because of some task having exhausted its budget. If it happened, it calls litmus_reschedule_local() (see below). I know, this is budget enforcement, not replenishment, but I think it works as a good example of my point about using timers. - Actually, budget enforcement can be done, in LitmusRT, in two ways. One is tick based (described above), the other, called 'precise', is timer based. To see that in action, check how the enforcement timer is handled, e.g., in update_enforcement_timer(). Look at on_enforcement_timeout(), and find that it also _does_not_ schedule. It again just asks for the scheduler to be invoked as soon as practical, via (again) litmus_reschedule_local(). - Finally, a 'job release', in LitmusRT terminology, is probably what is most close to a budget replenishment for us here. In fact, when a new RT job is released, it's given full budget, and its scheduling parameters (deadline, in most cases) are updated accordingly, like it happens, for us, with replenishments. Check, therefore, how that happens in add_release() and __add_release(). You'll find a call to arm_release_timer() which calls, when firing, on_release_timer(). From there, go, e.g., to gsnedf_release_jobs(), and follow the call chain through check_for_preemptions() --> preempt() --> preempt_if_preemptable() --> litmus_reschedule(), which then calls either set_tsk_need_resched(current) or smp_send_reschedule(cpu). 
litmus_reschedule_local(), and similar functions, do something very similar to our tickling, i.e., they ask a specific CPU to go through the scheduler. When this happens via set_tsk_need_resched(current) it's like, in Xen, tickling the pCPU we're on. When it happens via smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU cpu. Now, although some years have passed, I've interacted quite a bit with LitmusRT folks. They're very smart, and usually very good at what they do, and I'd be surprised to find them guilty of that much real-time blasphemy. :-P You also may know that there is an EDF implementation in mainline Linux, and you may would like to go have a look there too. I'm not including any example from there because I'm one of the main authors of it, so it's pretty obvious that it uses the approach I think it's best (and, more important, it would be rather inelegant to cite myself! :-D). Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c. > Not only is there a complexity issue, but as mentioned before this may be the > best performing option, making the most information available and the > best timing > decision possible. Adding a few extra cycles to a hot path, even with > a lock being > held, is worth it if the scheduler and context switching is done less. > Right, so let's use timers, they require less or equal scheduler invocations wrt to... Always invoking the scheduler!! (Not sure why you mention the number of context switches, that's independent on which of the two approaches you choose.) > For the above reasons I think you should reconsider the current > implementation, as it > appears it may be the least complex and easiest to reason about > solution. > Well, no, sorry. :-( I mean, I appreciate that what you have now (once fixed), may be effective in avoiding some unnecessary invocation of the scheduler, with respect to what we currently have in the repo, so I'm not saying it can't be seen as an improvement. 
It's a "nice hack", actually. However, if we're still aiming at making this scheduler a first class citizen within Xen, we're interested in less hacks, rather than in more --no matter how nice they are. So, with that in mind, it would be a step in the wrong direction, IMO. > Let me know if I'm missing some key insight into how the behavior > could be implemented > correctly and beautifully using the multiple timer approach. > well, I tried. I really did. > I simply > don't see how it can > be done without heavy interaction and information sharing between them > which really > defeats the purpose. > No, it doesn't. Thanks and Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 0:20 ` Dario Faggioli @ 2015-06-17 5:24 ` Dagaen Golomb 2015-06-17 5:45 ` Meng Xu 2015-06-17 8:27 ` Dario Faggioli 0 siblings, 2 replies; 21+ messages in thread From: Dagaen Golomb @ 2015-06-17 5:24 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li > Thanks for this actually... I love discussing these things, it makes me > remind the time when I was doing these stuff myself, and makes me feel > young! :-P And thank you for the very detailed and well-thought response! > >> Separating the replenishment from the scheduler may be problematic. The nature >> of the gEDF scheduler is that when a priority changes it should be instantly >> reflected. >> > 'instantly'. Is there such a thing? :-O I meant no accounting period and implicitly during scheduler functions it may happen, but from your reply I know you understand this :P > >> If replenishments were done by themselves, then the scheduler may >> decide to wait for some period of time, and in this same time period a >> replenishment >> could change what should be executing. >> > Well, the scheduler then would better *not* decide to wait for any > period of time. It better act like this: as soon as a replenishment > happens, and a new deadline is assigned to a vCPU, put the vCPU in the > runqueue, in the proper position; and if such a newly replenished vCPU > should now preempt any other, currently running, vCPU, 'tickle' the pCPU > in question and let it reschedule, and pick up the newly replenished > vCPU. Yes, this is an option. However, I thought this would actually be an option you would not like. For the replenishment routine to know when to call the scheduler and when not to, it basically has to perform the scheduler's logic, at which point if there is a change... 
why doesn't it just do it (i.e., be the scheduler) instead of adding logic and possibly overhead to call it. > >> Either the gEDF policy will be >> violated or >> the replenishment routine would have to notify the scheduler after any >> replenishment, requiring some form of interaction and possibly more scheduler >> invocations than the current structure. >> > Obviously not. In fact, it's your 'current structure' that already > invokes the scheduler for any replenishment. Doing more than that, would > be really hard (impossible, I should say). > > I fact, it's the s_timer that you're making fire for any replenishment, > and replenishments happen inside rt_schedule() itself. This is > _the_definition_ of invoking the scheduler! > > OTOH, by using a (some) replenishment timer(s), the scheduler *may* > *potentially* be invoked for any replenishment, yes, but it may also > very well not. In fact, if even after a replenishment, the new deadline > of the replenished vCPU is is father than the deadlines of all the > currently running vCPUs, there's no need to tickle any pCPU, no need to > call the scheduler. > > BTW, from what you say, it seems to me that we are in agreement: we > _do_not_ want to call the scheduler upon any replenishment. Do as I say > and we'll achieve that. I'm glad we are in agreement on that :) Also, I'd like to point out that when I said things like "make more intelligent" decision handling, I meant exactly things like the above: using as much information as possible to prevent scheduler invocations. The current patch doesn't reschedule for any replenishment. Firstly, it doesn't even look at the run queue: replenishments here can only happen on miss. > >> The other option is for the scheduler to >> check for replenishments, as it does now. >> > Yeah, the wrong option. :-D :-D > >> Without any interaction a violation of >> the policy could ensue. 
The gEDF is not like the credit scheduler where there is >> an accounting period where, during this time, policy may be violated >> temporarily. >> > Yep, I do have an idea of what EDF is. > >> Further, with two timer routines we now need to deal with any ordering >> or preemption >> between them (or logic to prevent / order such) which I could argue is far more >> complexity than having them done at once as it is now. >> > Mmm... I think I'm starting to lose you. Ordering between two timers? I > don't see what you mean. > > Allow me, though, to walk through the behavior of your current approach. > I've done this already in a previous email, but no big deal re-doing it. > Oh, actually, the current approach is buggy, as you tickle the wrong > vCPU, but let's forget about that (let's assume it's fixed). > > Let's say that time for a replenishment to vCPU w has come. > > on pCPU X: > . timer interrupt > . TIMER_SOFTIRQ raised > . process softirqs > . SCHEDULE_SOFTIRQ raised > . process softirqs > . rt_schedule() > . [spin_lock] > . burn_budget(scurr) > . __repl_update() (goes through all runnable vcpus!!) > . snext = __runq_pick() -> says vCPU w should run on pCPU Y > . runq_tickle(snext) ----> tickle pCPU Y [*] > . [spin_unlock] > . > on pCPU Y: > . process softirqs > . SCHEDULE_SOFTIRQ > . rt_schedule() > . [spin_lock] > . burn_budget(scurr) > . __repl_update() (goes through all runnable vcpus!! AGAIN!!) > . snext = runq_pick() ----------> pick vCPU w > . [spin_unlock] > . > context switch: vCPU w now runs on pCPU Y > > So, you forced pCPU X through a full execution of rt_schedule(), with > the spinlock, the scan of runqueue and depleted queue, and everything, > just for the sake of figuring out that pCPU Y should reschedule. Then > you (as soon as practical) actually reschedule on pCPU Y and context > switch in favour of vCPU w, enacting EDF. > > That's 2 scheduler invocations, and rt_schedule() is still ugly and > complex to read, maintain and improve. 
> > [*] and don't forget that this needs fixing, because right now it's just > incorrect, and (just speculating) making runq_pick() behave in the way > you want, may not be super-simple. > > Oh, I was almost forgetting! Wonder what happens in the time interval > between when __repl_update() is called (from inside rt_schedule()) on > pCPU X, and the actual time instant of the context switch on pCPU Y? > Yeah, right, what happens is that you're violating EDF. :-/ > > This can be accounted as a form of overhead introduced by the > implementation, I guess. In this specific case, it's due to the fact > that you can't, from pCPU X, just change *instantly* what's running on > pCPU Y. No, you've got to send an IPI, then Y has to react and it has to > schedule, etc. This is of course dependant on the architecture of the > scheduler. Xen's scheduler works like this. FWIW, Linux's scheduler, wrt > this, also does. > > So, see? Transient violation of EDF is just there, no matter the > approach! Of course, violation during the scheduler is possible for any implementation :( > > Let's look at the timers way: > > on pCPU X: > . timer interrupt > . TIMER_SOFTIRQ raised > . process softirqs > . replenishment_timer_handler() > . [spin_lock] > . <for_each_depleted_vcpu(i, i.repl_time < NOW()) { > . replenish(i) ---------> replenish vCPU w > . runq_tickle(i) -------> tickle pCPU Y > . }> > . [spin_lock] > . > on pCPU Y: > . process softirqs > . SCHEDULE_SOFTIRQ > . rt_schedule() > . [spin_lock] > . burn_budget(scurr) > . snext = runq_pick() ----------> pick vCPU w > . [spin_unlock] > . > context switch: vCPU x now runs on pCPU Y > > So this means only 1 invocation of the scheduler, and only on the pCPU > that actually needs to reschedule. There's some overhead due to the > timer, of course, but, notably, this allows for shorter critical > sections, not to mention better looking and easier to maintain code. > > Are we still violating EDF? 
Of course we are, between the replenishment > and context switch time, as above. That's unavoidable, I'm afraid. > > Summarizing, the two solutions are on par wrt temporary violation of the > theoretical algorithm, but the timer based approach has loads of other > benefits, so let's go with timers. :-) > >> If both are armed for the same time, which should execute? Scheduler >> first before >> a possibly applicable replenishment? Or replenishment always first? >> Both with added >> logic to enforce this and prevent preemption, of course. >> > I really don't see what you mean here. There won't be anything like that > to take care of. Xen offers timers as an abstraction, and deals with > them itself. You only need to take care of properly serializing > rt_schedule() and the timer handling routine, for the code sections that > require that. This is good to know, thanks. The question would then change to how does Xen handle such an occurrence, but I think this is irrelevant now that I know which "side" you had in mind - that is, that the replenishment handler would decide when the scheduler should be invoked. > >> Due to this, it is my belief that by the nature of the gEDF policy the >> current solution is >> better, mostly because it appears to actually be the least complex way that is >> functionally correct. >> > Not really, no. I wouldn't be convinced of this, even if what you have > right now were functionally correct. > >> The gEDF policy just isn't well suited for >> decoupling events, as >> the events are highly dependent on one another, and particularly dependent at an >> instantaneous timescale. >> > And here it come 'instantaneous timescale' again. > > No, sorry, but I don't buy this. In fact, why does it not apply to > vCPUs' wakeups? I think it does, conceptually... So, should we move the > wakeup logic inside rt_schedule() too? > >> It would be a hard pitch for a gEDF scheduler with a >> similar "accounting period" where the gEDF policy could be violated. 
>> > There's no accounting period of any sort. It's called overhead! This part was referring to if we had two timers without extensive communication, cooperation, or information/logic sharing. > > What we need to do is to try to keep it as small as possible. And I'm > quite sure that an effective way to do so is to keep critical sections > short, especially when protected by (potentially) highly contended spin > locks. RTDS, currently, suffers from poor scalability (and, I suspect, > poor latency as well, at least under pressure), and one of the reasons > why the work you're doing is interesting is that it can alleviate > this... if done with that in mind, of course. > >> That is >> blasphemy in the real-time world. >> > Blasphemy, that's a nice one! :-D :-D > > Well, I've never been good at religions, I have to admit. So, yes, I can > live with being called blasphemous, I guess. :-) You get the compliment of blasphemy, I get that of "nice hack" :) > >> It's also worth noting that the stereotypical textbook event-driven >> model is as we have >> now: a single event loop that handles events. In this case the scheduler is >> conceptually the main loop and this makes perfect sense: it's deciding >> what to run >> (think of the running VCPUs as callback functions and the abstraction >> falls into place >> perfectly). The event loop needs some information (about replenishments) before >> deciding, and collecting this would be part of the decode and decision >> phase for an >> event signal. >> > Talking about 'stereotypes', I don't have any textbook describing an > event-driven model at hand right now. However, you RT-Xen people like > and use LitmusRT, don't you? Well, you're doing the right thing, because > it's a great piece of software! > > If you're interested, have a look in there. Spoiler: you'll find a lot > of timers. 
:-D :-D > > More seriously, it's of course rather different, as Linux is not Xen, > but since you're looking for stereotypes: > > - Linux (or at lease LitmusRT patch 2014.2, i.e., what I'm looking at > right now) has ticks. This means a timer fires every X milliseconds > (on every CPU), and task_tick_litmus() is run as a consequence. Such > function does not (at all!) invoke the scheduler directly, it much > rather just checks whether doing so is necessary, in this case > because of some task having exhausted its budget. If it happened, it > calls litmus_reschedule_local() (see below). I know, this is budget > enforcement, not replenishment, but I think it works as a good > example of my point about using timers. > > - Actually, budget enforcement can be done, in LitmusRT, in two ways. > One is tick based (described above), the other, called 'precise', is > timer based. To see that in action, check how the enforcement timer > is handled, e.g., in update_enforcement_timer(). Look at > on_enforcement_timeout(), and find that it also _does_not_ schedule. > It again just asks for the scheduler to be invoked as soon as > practical, via (again) litmus_reschedule_local(). > > - Finally, a 'job release', in LitmusRT terminology, is probably what > is most close to a budget replenishment for us here. In fact, when a > new RT job is released, it's given full budget, and its scheduling > parameters (deadline, in most cases) are updated accordingly, like it > happens, for us, with replenishments. > Check, therefore, how that happens in add_release() and > __add_release(). You'll find a call to arm_release_timer() which > calls, when firing, on_release_timer(). From there, go, e.g., to > gsnedf_release_jobs(), and follow the call chain through > check_for_preemptions() --> preempt() --> preempt_if_preemptable() > --> litmus_reschedule(), which then calls either > set_tsk_need_resched(current) or smp_send_reschedule(cpu). 
> > litmus_reschedule_local(), and similar functions, do something very > similar to our tickling, i.e., they ask a specific CPU to go through the > scheduler. When this happens via set_tsk_need_resched(current) it's > like, in Xen, tickling the pCPU we're on. When it happens via > smp_send_reschedule(cpu) it's like, in Xen, tickling the remote pCPU > cpu. > > Now, although some years have passed, I've interacted quite a bit with > LitmusRT folks. They're very smart, and usually very good at what they > do, and I'd be surprised to find them guilty of that much real-time > blasphemy. :-P > > You also may know that there is an EDF implementation in mainline Linux, > and you may would like to go have a look there too. I'm not including > any example from there because I'm one of the main authors of it, so > it's pretty obvious that it uses the approach I think it's best (and, > more important, it would be rather inelegant to cite myself! :-D). > > Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c. Thanks for the history and background lesson here :) > >> Not only is there a complexity issue, but as mentioned before this may be the >> best performing option, making the most information available and the >> best timing >> decision possible. Adding a few extra cycles to a hot path, even with >> a lock being >> held, is worth it if the scheduler and context switching is done less. >> > Right, so let's use timers, they require less or equal scheduler > invocations wrt to... Always invoking the scheduler!! (Not sure why you > mention the number of context switches, that's independent on which of > the two approaches you choose.) > >> For the above reasons I think you should reconsider the current >> implementation, as it >> appears it may be the least complex and easiest to reason about >> solution. >> > Well, no, sorry. 
:-( > > I mean, I appreciate that what you have now (once fixed), may be > effective in avoiding some unnecessary invocation of the scheduler, with > respect to what we currently have in the repo, so I'm not saying it > can't be seen as an improvement. It's a "nice hack", actually. "nice hack." heh, heh, thanks :P > > However, if we're still aiming at making this scheduler a first class > citizen within Xen, we're interested in less hacks, rather than in more > --no matter how nice they are. So, with that in mind, it would be a step > in the wrong direction, IMO. > >> Let me know if I'm missing some key insight into how the behavior >> could be implemented >> correctly and beautifully using the multiple timer approach. >> > well, I tried. I really did. > >> I simply >> don't see how it can >> be done without heavy interaction and information sharing between them >> which really >> defeats the purpose. >> > No, it doesn't. Ok this last line is the zinger. Almost this entire email was built on the premise that you would NOT like the idea of the replenishment handler having basically the same decision logic as the scheduler and then tickling the pCPU to pick up the new vCPU. Actually, with it done this way, I would have a hard time calling the tickle-invoked method the "scheduler." It would be more like the mover, with the replenishment function being essentially the scheduler. In this case, I'm not too sure actually how much different this would be from what we have now. It feels like, from what you want, that we could get the same behavior by modifying rt_schedule to do replenishments first, then check if a reschedule is needed (these two parts would be in this proposed replenishment timer routine) and then perform any move if necessary. I know this isn't exactly what you want, but that sounds close right? But the scheduler will have to decide which to move in, so that logic is done twice. 
Also, if these are done back-to-back and require the locks, isn't it really the same as having one long hot path? If you want maintainability, couldn't we just do replenishment then schedule and move (if necessary) in one timer (the one we have now) and move them to functions. It seems this can be done with one timer neatly. So here's my proposal, lets see if it fits what you want: 1.) The scheduler does not do any timing, 2.) replenishments are scheduled via timer at each [next] replenishment period. Each replenishment resorts the replenished vCPUs, and if any of the first #CPUs in the runq change, we tickle a pCPU for each change In this case, we can use one timer. We could use the current one as a replenishment timer, and make it so rt_schedule isn't the default invoked method. Does this sound similar to what you are suggesting? I have to ask because I thought you wouldn't like the idea, and its not really *that* far off from what we have now, Its a little restructuring so that replenishments occur before any scheduling activity and the handler checks if switching is needed (basically acting as the scheduler) and then calls tickle. Sounds like what you had in mind? Regards, ~Dagaen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 5:24 ` Dagaen Golomb @ 2015-06-17 5:45 ` Meng Xu 2015-06-17 6:03 ` Dagaen Golomb 2015-06-17 9:20 ` Dario Faggioli 2015-06-17 8:27 ` Dario Faggioli 1 sibling, 2 replies; 21+ messages in thread From: Meng Xu @ 2015-06-17 5:45 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel, Meng Xu, Jan Beulich, Chong Li Hi Dagaen, I just comment on the summary of scheduler design you proposed at the end of the email. I'm looking forward to Dario's more insightful reply. > > > > > >> I simply > >> don't see how it can > >> be done without heavy interaction and information sharing between them > >> which really > >> defeats the purpose. > >> > > No, it doesn't. > > Ok this last line is the zinger. > > Almost this entire email was built on the premise that you would NOT like the > idea of the replenishment handler having basically the same decision logic > as the scheduler and then tickling the pCPU to pick up the new vCPU. Actually, > with it done this way, I would have a hard time calling the > tickle-invoked method > the "scheduler." It would be more like the mover, with the > replenishment function > being essentially the scheduler. In this case, I'm not too sure > actually how much > different this would be from what we have now. It feels like, from > what you want, > that we could get the same behavior by modifying rt_schedule to do > replenishments > first, then check if a reschedule is needed (these two parts would be in this > proposed replenishment timer routine) and the perform any move if necessary. I > know this isn't exactly what you want, but that sounds close right? > > But the scheduler will have to decide which to move in, so that logic > is done twice. > Also, if these are done back-to-back and require the locks, isn't it > really the same > as having one long hot path? 
If you want maintainability, couldn't we just do > replenishment then schedule and move (if necessary) in one timer (the > one we have > now) and move them to functions. It seems this can be done with one > timer neatly. > > So here's my proposal, let's see if it fits what you want: I will leave this to Dario to answer if it fits what he wants. :-P > > 1.) The scheduler does not do any timing, Not really. The rt_schedule does the budget enforcement. When a current VCPU runs out of budget, the rt_schedule will be invoked by a timer (you can refer to the schedule function in xen/sched/schedule.c to have a look how the timer is armed/disarmed.). When the rt_schedule is invoked, it needs to: a) update the budget of the current running VCPU and move it from runq to depleted queue; b) pick the current highest VCPU from runq and return it as the snext VCPU. So scheduling logic is still involved in the rt_schedule function. > > 2.) replenishments are scheduled via timer at each [next] > replenishment period. Each > replenishment resorts the replenished vCPUs, and if any of the first > #CPUs in the > runq change, we tickle a pCPU for each change This is right. > > > In this case, we can use one timer. We actually have two: one for budget enforcement and the other for budget replenishment. > > We could use the current one as a > replenishment > timer, and make it so rt_schedule isn't the default invoked method. > > Does this sound similar to what you are suggesting? I don't think so, but I will leave it for Dario for his opinion. In Dario's suggestion, you just simply remove the update_budget function out of rt_schedule. This is because budget enforcement, which is the work of rt_schedule, does not naturally involve budget replenishment. In order to achieve budget replenishment, we need another timer to invoke another function (budget_replenish function), that is dedicated to that. 
> > I have to ask > because I thought > you wouldn't like the idea, I guess Dario won't like this idea. :-P (I'm kidding, but it should be the case.) > > and its not really *that* far off from > what we have now, Its > a little restructuring so that replenishments occur before any > scheduling activity and > the handler checks if switching is needed (basically acting as the > scheduler) and then > calls tickle. Sounds like what you had in mind? Thanks and best regards, Meng -- ----------- Meng Xu PhD Student in Computer and Information Science University of Pennsylvania http://www.cis.upenn.edu/~mengxu/ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 5:45 ` Meng Xu @ 2015-06-17 6:03 ` Dagaen Golomb 2015-06-17 6:09 ` Meng Xu 2015-06-17 9:20 ` Dario Faggioli 1 sibling, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-17 6:03 UTC (permalink / raw) To: Meng Xu Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 4501 bytes --] Thanks for the reply, budget enforcement in the scheduler timer makes sense. I think I have an idea of what he wants done now. ~Dagaen On Jun 17, 2015 1:45 AM, "Meng Xu" <xumengpanda@gmail.com> wrote: > Hi Dagaen, > > I just comment on the summary of scheduler design you proposed at the > end of the email. I'm looking forward to Dario's more insightful > reply. > > > > > > > > > >> I simply > > >> don't see how it can > > >> be done without heavy interaction and information sharing between them > > >> which really > > >> defeats the purpose. > > >> > > > No, it doesn't. > > > > Ok this last line is the zinger. > > > > Almost this entire email was built on the premise that you would NOT > like the > > idea of the replenishment handler having basically the same decision > logic > > as the scheduler and then tickling the pCPU to pick up the new vCPU. > Actually, > > with it done this way, I would have a hard time calling the > > tickle-invoked method > > the "scheduler." It would be more like the mover, with the > > replenishment function > > being essentially the scheduler. In this case, I'm not too sure > > actually how much > > different this would be from what we have now. It feels like, from > > what you want, > > that we could get the same behavior by modifying rt_schedule to do > > replenishments > > first, then check if a reschedule is needed (these two parts would be in > this > > proposed replenishment timer routine) and the perform any move if > necessary. 
I > > know this isn't exactly what you want, but that sounds close right? > > > > But the scheduler will have to decide which to move in, so that logic > > is done twice. > > Also, if these are done back-to-back and require the locks, isn't it > > really the same > > as having one long hot path? If you want maintainability, couldn't we > just do > > replenishment then schedule and move (if necessary) in one timer (the > > one we have > > now) and move them to functions. It seems this can be done with one > > timer neatly. > > > > So here's my proposal, lets see if it fits what you want: > > > I will leave this to Dario to answer if it fits what he wants. :-P > > > > > > 1.) The scheduler does not do any timing, > > > Not really. The rt_schedule does the budget enforcement. When a > current VCPU runs out of budget, the rt_schedule will be invoked by a > timer (you can refer to the schedule function in xen/sched/schedule.c > to have a look how the timer is armed/disarmed.). When the rt_schedule > is invoked, it needs to: > a) update the budget of the current running VCPU and move it from runq > to depleted queue; > b) pick the current highest VCPU from runq and return it as the snext VCPU. > > So scheduling logic is still involved in the rt_schedule function. > > > > > 2.) replenishments are scheduled via timer at each [next] > > replenishment period. Each > > replenishment resorts the replenished vCPUs, and if any of the first > > #CPUs in the > > runq change, we tickle a pCPU for each change > > > This is right. > > > > > > > In this case, we can use one timer. > > > We actually have two: one for budget enforcement and the other for > budget replenishment. > > > > > > We could use the current one as a > > replenishment > > timer, and make it so rt_schedule isn't the default invoked method. > > > > Does this sound similar to what you are suggesting? > > > I don't think so, but I will leave it for Dario's for his opinion. 
> > In Dario's suggestion, you just simply remove the update_budget > function out of rt_schedule. This is because budget enforcement, which > is the work of rt_schedule, does not naturelly involves budget > replenishment. > > In order to achieve budget replenishment, we need another timer to > invoke another function (budget_replenish function), that is dedicated > to that. > > > > > I have to ask > > because I thought > > you wouldn't like the idea, > > > I guess Dario won't like this idea. :-P (I'm kidding, but it should be > the case.) > > > > > > and its not really *that* far off from > > what we have now, Its > > a little restructuring so that replenishments occur before any > > scheduling activity and > > the handler checks if switching is needed (basically acting as the > > scheduler) and then > > calls tickle. Sounds like what you had in mind? > > > Thanks and best regards, > > Meng > -- > > > ----------- > Meng Xu > PhD Student in Computer and Information Science > University of Pennsylvania > http://www.cis.upenn.edu/~mengxu/ > [-- Attachment #1.2: Type: text/html, Size: 5593 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 6:03 ` Dagaen Golomb @ 2015-06-17 6:09 ` Meng Xu 0 siblings, 0 replies; 21+ messages in thread From: Meng Xu @ 2015-06-17 6:09 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel, Meng Xu, Jan Beulich, Chong Li Hi Dagaen, First, please try not top post in the ML. :-) As the one you just sent, it is better to reply to the comment instead of top reply. :-) 2015-06-16 23:03 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>: > Thanks for the reply, budget enforcement in the scheduler timer makes sense. > I think I have an idea of what he wants done now. Great! It is better to summarize your understanding of the design and send to the ML to check if your understanding is correct as Chong did for the improved toolstack of RTDS scheduler. This will save rounds of patches. Thanks, Meng ----------- Meng Xu PhD Student in Computer and Information Science University of Pennsylvania http://www.cis.upenn.edu/~mengxu/ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 5:45 ` Meng Xu 2015-06-17 6:03 ` Dagaen Golomb @ 2015-06-17 9:20 ` Dario Faggioli 1 sibling, 0 replies; 21+ messages in thread From: Dario Faggioli @ 2015-06-17 9:20 UTC (permalink / raw) To: Meng Xu Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li, Dagaen Golomb [-- Attachment #1.1: Type: text/plain, Size: 6394 bytes --] On Tue, 2015-06-16 at 22:45 -0700, Meng Xu wrote: > > So here's my proposal, lets see if it fits what you want: > > I will leave this to Dario to answer if it fits what he wants. :-P > > 1.) The scheduler does not do any timing, > Not really. The rt_schedule does the budget enforcement. When a > current VCPU runs out of budget, the rt_schedule will be invoked by a > timer (you can refer to the schedule function in xen/sched/schedule.c > to have a look how the timer is armed/disarmed.). When the rt_schedule > is invoked, it needs to: > a) update the budget of the current running VCPU and move it from runq > to depleted queue; > b) pick the current highest VCPU from runq and return it as the snext VCPU. > > So scheduling logic is still involved in the rt_schedule function. > Correct, thanks Meng. > > 2.) replenishments are scheduled via timer at each [next] > > replenishment period. Each > > replenishment resorts the replenished vCPUs, and if any of the first > > #CPUs in the > > runq change, we tickle a pCPU for each change > This is right. > Yeah, sort of. I mean, I'm not sure what "Each replenishment resorts the replenished vCPUs" means. If it means that a replenished vCPU is given a new deadline, and hence needs to move from depleted queue to runqueue, in a position within runqueue that reflects its new deadline... then, I agree. This requires holding the runqueue lock, but we need it anyway, for now. 
This may or may not require a pCPU to be tickled, and it's runq_tickle()'s job to decide whether that is the case and, if yes, which one. Also, about this "we tickle a pCPU for each change": first of all, as I just said, that's not _always_ the case, and we should avoid tickling when it's not necessary. Also, when it's necessary in principle, it makes few sense to tickle a pCPU twice in a very tight loop (i.e., without giving it the chance to reschedule). So if, for instance, two different replenishments (no matter whether being serviced together, or in different invocations of the replenishment timer handler) both determine that pCPU Y is the one that should be tickled (because it's the one running the vCPU with the latest deadline, and both the two replenished vCPUs have more imminent deadline than what's running on Y, but later than what's running everywhere else), there's no point in tickling twice. Not that doing it would be harmful, it just won't have any effect. What I mean is much rather that you can use this information to keep runq_tickle() simple/quick. Note that this is pretty much what's happening already in runq_tickle(), thanks to the use of the prv->tickled bitmap, so the point here is making sure that "we tickle a pCPU for each change" doesn't mean removing this, because I think that would be wrong. It's probably unlikely that this was what Dagaen meant, but I though I better state things clearly. :-) > > In this case, we can use one timer. > > We actually have two: one for budget enforcement and the other for > budget replenishment. > EXACTLY!! :-) > > We could use the current one as a > > replenishment > > timer, and make it so rt_schedule isn't the default invoked method. > > > > Does this sound similar to what you are suggesting? > > I don't think so, but I will leave it for Dario's for his opinion. > "use current one as replenishment timer, and make it so rt_schedule isn't the default invoked method" Please, please, please, don't do that! 
:-P > In Dario's suggestion, you just simply remove the update_budget > function out of rt_schedule. This is because budget enforcement, which > is the work of rt_schedule, does not naturelly involves budget > replenishment. > Yes. Or at least, that's the core of my suggestion, yes. In more details, my suggestion includes getting rid of a bunch of other invocations of scheduling/picking and replenishment related functions from many other places, as I tried to make clear in my last email. Perhaps, the fact that I haven't stated that clearly enough since now, was what was making it a bit harder for you to see what I meant with 'making things simple', etc. I hope I explained myself better now. > In order to achieve budget replenishment, we need another timer to > invoke another function (budget_replenish function), that is dedicated > to that. > Yep, indeed. > > I have to ask > > because I thought > > you wouldn't like the idea, > > I guess Dario won't like this idea. :-P (I'm kidding, but it should be > the case.) > You're right, it's not. :-) > > > > and its not really *that* far off from > > what we have now, Its > > a little restructuring so that replenishments occur before any > > scheduling activity and > > the handler checks if switching is needed (basically acting as the > > scheduler) and then > > calls tickle. Sounds like what you had in mind? > This need for an strict ordering between replenishments and scheduling is something that you're (Dagaen) insisting a lot on, while I really don't see why it would be a good thing. I've stated countless time that they're independent, which also mean there's no strict ordering requirement between them. Maybe what you're hinting at is that we may want to think to a better way of handling rescheduling on the 'local' pCPU. 
That is to say, if replenishment timer fires on pCPU X, and runq_tickle() determines that it's pCPU X itself that should reschedule, there may be a quicker way to do so, than raising the SCHEDULE_SOFTIRQ and waiting. I.e., some kind of "hey man, we're here, let's just call (rt_)schedule() and get done with it!". If it's this, I thought about it too, many times, but whether it is an actualy issue, and whether it would actually be a good thing, it's still unclear. For instance, Linux has this, but, of course, we are not Linux. :-) In any case, this is independent from Dagaen's work, and it's even general enough that other schedulers may be interested in it, so I'm happy to leave this for future investigation. Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 5:24 ` Dagaen Golomb 2015-06-17 5:45 ` Meng Xu @ 2015-06-17 8:27 ` Dario Faggioli 2015-06-18 18:07 ` Dagaen Golomb 1 sibling, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-17 8:27 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 9995 bytes --] [Let me just reply to this... Then I'll jump to the end of the thread] On Wed, 2015-06-17 at 01:24 -0400, Dagaen Golomb wrote: > >> If replenishments were done by themsevles, then the scheduler may > >> decide to wait for some period of time, and in this same time period a > >> replenishment > >> could change what should be executing. > >> > > Well, the scheduler then would better *not* decide to wait for any > > period of time. It better act like this: as soon as a replenishment > > happens, and a new deadline is assigned to a vCPU, put the vCPU in the > > runqueue, in the proper position; and if such a newly replenished vCPU > > should now preempt any other, currently running, vCPU, 'tickle' the pCPU > > in question and let it reschedule, and pick up the newly replenished > > vCPU. > > Yes, this is an option. However, I thought this would actually be an > option you would > not like. > How so... I've been arguing for this the whole time?!?! :-O I'm sure I've put down a sketch of what I think the replenishment function should do in my first or second email in the thread, and I'm sure that involved a call to runq_tickle() > For the replenishment routine to know when to call the > scheduler and when > not to, it basically has to perform the the scheduler's logic, at > which point if there is a > change... why doesn't it just do it (i.e., be the scheduler) instead > of adding logic and > possibly overhead to call it. 
> It has to call runq_tickle(), which is not "the scheduler's logic", is the function that decides whether a pCPU should reschedule. runq_tickle(), right now, is called from within rt_vcpu_wake(), which makes perfect sense, and from within rt_context_saved(), which makes no sense. It should be called from within rt_vcpu_wake() and from the replenishment handling routine. Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that checks whether scurr->cur_deadline<=snext->cur_deadline. Anyway, I've zero interest in turning this into a fight over terminology... If you want to call runq_tickle() "the scheduler", go ahead, it would just make communication a bit more difficult, but I'm up for the challenge! :-) Oh, BTW, while we're here, __runq_pick() being called from a bunch of places, outside of rt_schedule(), is also something I never liked (you can go check that in the archives), and I also blame the confusion between scheduling and replenishmets, for the fact that such a thing is necessary. I seriously hope this can be fixed too. > Also, I'd like to point out that when I said things like "make more intelligent" > decision handling, I meant exactly things like the above: using as much > information as possible to prevent scheduler invocations. The current patch > doesn't reschedule for any replenishment. > How does it not? you're arming s_timer to fire at either next replenishment or budget enforcement point in time... > Firstly, it doesn't even look at > the run queue: replenishments here can only happen on miss. > ... Oh, wait, I know! I know! I know! You mean "it doesn't tickle at any replenishment", or "it doesn't context switch at any replenishment", don't you? (See, I'm good at adapting to "the new" terminology! :-D) > > So, see? Transient violation of EDF is just there, no matter the > > approach! 
> > Of course, violation during the scheduler is possible for any implementation :( > Yeah, well, it was you that said that which should stick to your design to prevent violations, which would have crept in if going with my solution. Bha, anyway, it's probably best to let go... > > I really don't see what you mean here. There won't be anything like that > > to take care of. Xen offers timers as an abstraction, and deals with > > them itself. You only need to take care of properly serializing > > rt_schedule() and the timer handling routine, for the code sections that > > require that. > > This is good to know, thanks. The question would then change to how does > Xen handle such an occurrence, but I think this is irrelevant now that I know > which "side" you had in mind - that is, that the replenishment handler would > decide when the scheduler should be invoked. > It should call runq_tickle(), yes. > >> It would be a hard pitch for a gEDF scheduler with a > >> similar "accounting period" where the gEDF policy could be violated. > >> > > There's no accounting period of any sort. It's called overhead! > > This part was referring to if we had two timers without extensive communication, > cooperation, or information/logic sharing. > I'm in the dark about what this "extensive communication, cooperation, or information/logic sharing" could be, I have to admit. If it means that the replenishment handler should call runq_tickle(), then yes, let me state it once more: replenishment handling routine should call runq_tickle() (other places, with the only other exception of rt_vcpu_wake(), shouldn't). > > You also may know that there is an EDF implementation in mainline Linux, > > and you may would like to go have a look there too. I'm not including > > any example from there because I'm one of the main authors of it, so > > it's pretty obvious that it uses the approach I think it's best (and, > > more important, it would be rather inelegant to cite myself! :-D). 
> > > > Anyway, if you fancy doing that, the core is in kernel/sched/deadline.c. > > Thanks for the history and background lesson here :) > NP. Did you have a look? What do you think, do you like it? :-D > >> I simply > >> don't see how it can > >> be done without heavy interaction and information sharing between them > >> which really > >> defeats the purpose. > >> > > No, it doesn't. > > Ok this last line is the zinger. > > Almost this entire email was built on the premise that you would NOT like the > idea of the replenishment handler having basically the same decision logic > as the scheduler and then tickling the pCPU to pick up the new vCPU. > Which is true, they're separate and different things, the must have separate and different logic. The fact that one may decide when it's time to call the other is perfectly logical. And in fact, I want __runq_pick() and related logic to be in rt_schedule(), and nowhere else, while I want runq_tickle() to be done from replenishment (and wakeup), and nowhere else. > Actually, > with it done this way, I would have a hard time calling the > tickle-invoked method > the "scheduler." > With "tickle-invoked method" you mean rt_schedule(), I guess. > In this case, I'm not too sure > actually how much > different this would be from what we have now. > Very different. 
Mostly because functions will have a clear and well defined semantic, and will be called to fulfill their own purpose, from places where it makes sense to do so, not every now and then, like now: - rt_schedule() for picking a new vcpu to be run and for doing runtime accounting and enforcement; - __runq_pick() to do the actual pick operation, from within rt_schedule() only, not every now and then, like now; - __repl_update() to do replenishments, from within the replenishment handling routine, in an event-triggered fashion, not every now and then, like now; - runq_tickle() to check whether a pCPU needs to go through the scheduler, from either the vcpu wakeup path, or because of a replenishment, not every now and then, like now. > It feels like, from > what you want, > that we could get the same behavior by modifying rt_schedule to do > replenishments > first, then check if a reschedule is needed (these two parts would be in this > proposed replenishment timer routine) and the perform any move if necessary. I > know this isn't exactly what you want, but that sounds close right? > Ehm... not really? It indeed sounds like the opposite of how I think things should be done (and I'm running out of different ways in which I'm able to tell that! :-O) > But the scheduler will have to decide which to move in, so that logic > is done twice. > Done twice? What's done twice? > Also, if these are done back-to-back and require the locks, isn't it > really the same > as having one long hot path? > What's back-to-back? > If you want maintainability, couldn't we just do > replenishment then schedule and move (if necessary) in one timer (the > one we have > now) and move them to functions. It seems this can be done with one > timer neatly. > Not to me, but I may have lost you again, sorry. > So here's my proposal, lets see if it fits what you want: > Ok, let me leave this here, and deal with this last part by replying to Meng's email. Regards, Dario > 1.) 
The scheduler does not do any timing, > 2.) replenishments are scheduled via timer at each [next] > replenishment period. Each > replenishment resorts the replenished vCPUs, and if any of the first > #CPUs in the > runq change, we tickle a pCPU for each change > > In this case, we can use one timer. We could use the current one as a > replenishment > timer, and make it so rt_schedule isn't the default invoked method. > > Does this sound similar to what you are suggesting? I have to ask > because I thought > you wouldn't like the idea, and its not really *that* far off from > what we have now, Its > a little restructuring so that replenishments occur before any > scheduling activity and > the handler checks if switching is needed (basically acting as the > scheduler) and then > calls tickle. Sounds like what you had in mind? > > Regards, > ~Dagaen -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-17 8:27 ` Dario Faggioli @ 2015-06-18 18:07 ` Dagaen Golomb 2015-06-22 9:11 ` Dario Faggioli 0 siblings, 1 reply; 21+ messages in thread From: Dagaen Golomb @ 2015-06-18 18:07 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li >> Yes, this is an option. However, I thought this would actually be an >> option you would >> not like. >> > How so... I've been arguing for this the whole time?!?! :-O > > I'm sure I've put down a sketch of what I think the replenishment > function should do in my first or second email in the thread, and I'm > sure that involved a call to runq_tickle() I just thought that since you didn't like the scheduler handling replenishments, that you wouldn't be very fond of replenishment calling the scheduler. I don't find either an issue so if you like one over the other, then that's fine of course. >> For the replenishment routine to know when to call the >> scheduler and when >> not to, it basically has to perform the the scheduler's logic, at >> which point if there is a >> change... why doesn't it just do it (i.e., be the scheduler) instead >> of adding logic and >> possibly overhead to call it. >> > It has to call runq_tickle(), which is not "the scheduler's logic", is > the function that decides whether a pCPU should reschedule. > > runq_tickle(), right now, is called from within rt_vcpu_wake(), which > makes perfect sense, and from within rt_context_saved(), which makes no > sense. It should be called from within rt_vcpu_wake() and from the > replenishment handling routine. > > Scheduling, to me, is __runq_pick() + the if() in rt_schedule() that > checks whether scurr->cur_deadline<=snext->cur_deadline. > > Anyway, I've zero interest in turning this into a fight over > terminology... 
If you want to call runq_tickle() "the scheduler", go > ahead, it would just make communication a bit more difficult, but I'm up > for the challenge! :-) > > Oh, BTW, while we're here, __runq_pick() being called from a bunch of > places, outside of rt_schedule(), is also something I never liked (you > can go check that in the archives), and I also blame the confusion > between scheduling and replenishmets, for the fact that such a thing is > necessary. I seriously hope this can be fixed too. I have no interest in a terminology war either. I just figured that if we want the replenishment function to only tickle when necessary, then it should check that the replenished vCPU is a greater priority than current ones. At this point it has essentially done the entire scheduling decision. I agree it can then tickle and let pCPU pick it up, and I find this a fine solution. As I said before I guess I just thought you wouldn't like the idea, but its clear you do. >> > I really don't see what you mean here. There won't be anything like that >> > to take care of. Xen offers timers as an abstraction, and deals with >> > them itself. You only need to take care of properly serializing >> > rt_schedule() and the timer handling routine, for the code sections that >> > require that. >> >> This is good to know, thanks. The question would then change to how does >> Xen handle such an occurrence, but I think this is irrelevant now that I know >> which "side" you had in mind - that is, that the replenishment handler would >> decide when the scheduler should be invoked. >> > It should call runq_tickle(), yes. Gotcha. >> Thanks for the history and background lesson here :) >> > NP. Did you have a look? What do you think, do you like it? :-D > >> >> I simply >> >> don't see how it can >> >> be done without heavy interaction and information sharing between them >> >> which really >> >> defeats the purpose. >> >> >> > No, it doesn't. >> >> Ok this last line is the zinger. 
>> >> Almost this entire email was built on the premise that you would NOT like the >> idea of the replenishment handler having basically the same decision logic >> as the scheduler and then tickling the pCPU to pick up the new vCPU. >> > Which is true, they're separate and different things, the must have > separate and different logic. The fact that one may decide when it's > time to call the other is perfectly logical. > > And in fact, I want __runq_pick() and related logic to be in > rt_schedule(), and nowhere else, while I want runq_tickle() to be done > from replenishment (and wakeup), and nowhere else. I guess the last remaining question is this: when the scheduler is enforcing a budget and times out, should it check for replenishment before kicking the vCPU? Or assume that any relevant replenishments have occurred? This is the issue I mentioned before about two timers armed at the same time - I'm not sure what convention Xen uses to order them. I would assume from your very reason for mentioning this change that you don't want any replenishment in rt_schedule, but then it may kick a vCPU who at that very instant is supposed to be replenished as well, and should actually stay on the pCPU. This is assuming you still want rt_schedule to be timer-triggered to enforce budget, is this correct? However, I think this is a more minor issue that we can sort out via inspecting the default Xen behavior, or allowing a single replenishment call before kicking (I don't expect you to like that option :P), or some other method. > >> Actually, >> with it done this way, I would have a hard time calling the >> tickle-invoked method >> the "scheduler." >> > With "tickle-invoked method" you mean rt_schedule(), I guess. > >> In this case, I'm not too sure >> actually how much >> different this would be from what we have now. >> > Very different. 
Mostly because functions will have a clear and well > defined semantic, and will be called to fulfill their own purpose, from > places where it makes sense to do so, not every now and then, like now: > > - rt_schedule() for picking a new vcpu to be run and for doing runtime > accounting and enforcement; > - __runq_pick() to do the actual pick operation, from within > rt_schedule() only, not every now and then, like now; > - __repl_update() to do replenishments, from within the replenishment > handling routine, in an event-triggered fashion, not every now and > then, like now; > - runq_tickle() to check whether a pCPU needs to go through the > scheduler, from either the vcpu wakeup path, or because of a > replenishment, not every now and then, like now. Okay, I understand this. My [poor] point was that how it is done now could be better separated into logical parts with functions. >> But the scheduler will have to decide which to move in, so that logic >> is done twice. >> > Done twice? What's done twice? I mean its already moved the vCPU and knows it should go on pCPU X, now when we call tickle that function itself will have to choose which to kick. It's doing this work twice, we already knew which one was new. Its not a lot of work and isn't an issue, just feels like the replenishment routine, in order to only tickle when necessary, is basically acting as the scheduler. Just a weird way my mind views it. >> Also, if these are done back-to-back and require the locks, isn't it >> really the same >> as having one long hot path? >> > What's back-to-back? A replenishment that causes a resched - these will be back to back and will both involve the lock. Its definetly not worse than the current solution, just that its likely going to be about the same amount of time in lock. 
I'm just saying the lock timing in the current solution doesn't appear like it would be much worse, although we do have all the maintainability and structure arguments against it, so this is a moot point. Overall, I see where you're coming from now. I think the only issue still is whether rt_schedule should absolutely never check for replenishments. I guess my last question is, when rt_schedule is reinvoked by timer because a budget is being exhausted, what do you expect it to do? Nothing? (Besides move it to the depletedq) and rely on the budget replenish routine to to handle it? Should it wait for that routine or call it? Besides this I understand where you want this to go, and I think its worth pursuing. Regards, ~Dagaen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-18 18:07 ` Dagaen Golomb @ 2015-06-22 9:11 ` Dario Faggioli 2015-06-22 11:58 ` Dagaen Golomb 0 siblings, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-22 9:11 UTC (permalink / raw) To: Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li [-- Attachment #1.1: Type: text/plain, Size: 5071 bytes --] On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote: > > Anyway, I've zero interest in turning this into a fight over > > terminology... If you want to call runq_tickle() "the scheduler", go > > ahead, it would just make communication a bit more difficult, but I'm up > > for the challenge! :-) > > > > Oh, BTW, while we're here, __runq_pick() being called from a bunch of > > places, outside of rt_schedule(), is also something I never liked (you > > can go check that in the archives), and I also blame the confusion > > between scheduling and replenishmets, for the fact that such a thing is > > necessary. I seriously hope this can be fixed too. > > I have no interest in a terminology war either. I just figured that if we want > the replenishment function to only tickle when necessary, then it should > check that the replenished vCPU is a greater priority than current ones. > Exactly, we need a preemption check, which is part of the 'scheduling logic'... but everything in the file is part of the scheduling logic (or it will be in another one), and we don't want to put everything that is in the file in one only function! :-) An in fact, a preemption check is not the full scheduling logic it's... well... a preemption check. For example, if there are idle pCPUs, it's something very quick. Note that, right now, this preemption check is done by acquiring the global lock and checking the deadlines of currently running vCPUs on all pCPUs. 
In future, this can be modified/improved, by using a dedicate data structure, and a locking scheme that would reduce the pressure on the global lock. I'm not saying that you should do this as part of the work you're doing now. Rather, I'm saying that the work being done now should go in the direction of making this easier, not harder. > > And in fact, I want __runq_pick() and related logic to be in > > rt_schedule(), and nowhere else, while I want runq_tickle() to be done > > from replenishment (and wakeup), and nowhere else. > > I guess the last remaining question is this: when the scheduler is > enforcing a budget and times out, should it check for replenishment > before kicking the vCPU? Or assume that any relevant replenishments > have occurred? > This is an implementation detail that is quite hard to discuss without seeing the code. In theory, if there wasn't any overhead, etc., you shouldn't, because it should not be necessary. In practise, yes, it is possible that various sources of overhead prevent a replenishment to be notified in time, and that you end up with a vCPU running, and consuming the last bits of its budget _after_ a scheduled replenishment instant as it is, I guess, possible that you figure during a replenishment that the budget was exhausted and the vCPU had overrun a bit, I guess (or not, it again depends on the actual implementation). So, yes, you certainly need to take care of this corner cases, and you can do it in the way you think it's best, basing on how the code ends up looking like. The important thing is that they're treated for what they are, i.e., we should handle them, not design the code around them. > This is the issue I mentioned before about two timers > armed at the same time - I'm not sure what convention Xen uses to > order them. 
> No, I don't think this has much to do with the internals of timers' implementation, it's a corner case that, due to overhead, long critical section (or, in general, interrupts-off sections), etc., will always be present and will need to be taken care of, no mater how the scheduler is implemented. > I would assume from your very reason for mentioning this > change that you don't want any replenishment in rt_schedule, but then > it may kick a vCPU who at that very instant is supposed to be replenished > as well, and should actually stay on the pCPU. > That depends of the vCPU, on it's budget and deadline, and on what's running on other pCPUs. And that is something unlikely, although possible, and it should be the exception rather than the rule. Anyway, I think it would be correct, for instance, to check during rt_schedule() whether the vCPU running on the pCPU is already beyond it's deadline/replenishment time. If yes, log/count a deadline miss (because that's what just happened!), replenish it and continue with rt_schedule(). > This is assuming you > still want rt_schedule to be timer-triggered to enforce budget, is this > correct? > Of course. > However, I think this is a more minor issue that we can sort out via > inspecting the default Xen behavior, or allowing a single replenishment > call before kicking (I don't expect you to like that option :P), or some > other method. > Exactly. 
Thanks and Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-22 9:11 ` Dario Faggioli @ 2015-06-22 11:58 ` Dagaen Golomb 0 siblings, 0 replies; 21+ messages in thread From: Dagaen Golomb @ 2015-06-22 11:58 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich, Chong Li Thank you. This clears things up a bit. I will work on this model and make (or ignore) any corner case decisions I feel adequate for now. We can focus on them after the main structure is in place. ~Dagaen On Mon, Jun 22, 2015 at 5:11 AM, Dario Faggioli <dario.faggioli@citrix.com> wrote: > On Thu, 2015-06-18 at 14:07 -0400, Dagaen Golomb wrote: > >> > Anyway, I've zero interest in turning this into a fight over >> > terminology... If you want to call runq_tickle() "the scheduler", go >> > ahead, it would just make communication a bit more difficult, but I'm up >> > for the challenge! :-) >> > >> > Oh, BTW, while we're here, __runq_pick() being called from a bunch of >> > places, outside of rt_schedule(), is also something I never liked (you >> > can go check that in the archives), and I also blame the confusion >> > between scheduling and replenishmets, for the fact that such a thing is >> > necessary. I seriously hope this can be fixed too. >> >> I have no interest in a terminology war either. I just figured that if we want >> the replenishment function to only tickle when necessary, then it should >> check that the replenished vCPU is a greater priority than current ones. >> > Exactly, we need a preemption check, which is part of the 'scheduling > logic'... but everything in the file is part of the scheduling logic (or > it will be in another one), and we don't want to put everything that is > in the file in one only function! :-) > > An in fact, a preemption check is not the full scheduling logic it's... > well... a preemption check. For example, if there are idle pCPUs, it's > something very quick. 
> > Note that, right now, this preemption check is done by acquiring the > global lock and checking the deadlines of currently running vCPUs on all > pCPUs. In future, this can be modified/improved, by using a dedicate > data structure, and a locking scheme that would reduce the pressure on > the global lock. I'm not saying that you should do this as part of the > work you're doing now. Rather, I'm saying that the work being done now > should go in the direction of making this easier, not harder. > >> > And in fact, I want __runq_pick() and related logic to be in >> > rt_schedule(), and nowhere else, while I want runq_tickle() to be done >> > from replenishment (and wakeup), and nowhere else. >> >> I guess the last remaining question is this: when the scheduler is >> enforcing a budget and times out, should it check for replenishment >> before kicking the vCPU? Or assume that any relevant replenishments >> have occurred? >> > This is an implementation detail that is quite hard to discuss without > seeing the code. In theory, if there wasn't any overhead, etc., you > shouldn't, because it should not be necessary. In practise, yes, it is > possible that various sources of overhead prevent a replenishment to be > notified in time, and that you end up with a vCPU running, and consuming > the last bits of its budget _after_ a scheduled replenishment instant as > it is, I guess, possible that you figure during a replenishment that the > budget was exhausted and the vCPU had overrun a bit, I guess (or not, it > again depends on the actual implementation). > > So, yes, you certainly need to take care of this corner cases, and you > can do it in the way you think it's best, basing on how the code ends up > looking like. The important thing is that they're treated for what they > are, i.e., we should handle them, not design the code around them. 
> >> This is the issue I mentioned before about two timers >> armed at the same time - I'm not sure what convention Xen uses to >> order them. >> > No, I don't think this has much to do with the internals of timers' > implementation, it's a corner case that, due to overhead, long critical > section (or, in general, interrupts-off sections), etc., will always be > present and will need to be taken care of, no mater how the scheduler is > implemented. > >> I would assume from your very reason for mentioning this >> change that you don't want any replenishment in rt_schedule, but then >> it may kick a vCPU who at that very instant is supposed to be replenished >> as well, and should actually stay on the pCPU. >> > That depends of the vCPU, on it's budget and deadline, and on what's > running on other pCPUs. And that is something unlikely, although > possible, and it should be the exception rather than the rule. > > Anyway, I think it would be correct, for instance, to check during > rt_schedule() whether the vCPU running on the pCPU is already beyond > it's deadline/replenishment time. If yes, log/count a deadline miss > (because that's what just happened!), replenish it and continue with > rt_schedule(). > >> This is assuming you >> still want rt_schedule to be timer-triggered to enforce budget, is this >> correct? >> > Of course. > >> However, I think this is a more minor issue that we can sort out via >> inspecting the default Xen behavior, or allowing a single replenishment >> call before kicking (I don't expect you to like that option :P), or some >> other method. >> > Exactly. > > Thanks and Regards, > Dario > > -- > <<This happens because I choose it to happen!>> (Raistlin Majere) > ----------------------------------------------------------------- > Dario Faggioli, Ph.D, http://about.me/dario.faggioli > Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-09 12:53 ` Dario Faggioli 2015-06-10 4:18 ` Dagaen Golomb @ 2015-06-10 5:54 ` Meng Xu 2015-06-10 17:46 ` Dario Faggioli 1 sibling, 1 reply; 21+ messages in thread From: Meng Xu @ 2015-06-10 5:54 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich, Chong Li, Dagaen Golomb Hi Dario and Dagaen, 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>: > > Hello Dagaen, > > Thanks for doing this. The first question I have is whether you run any > test/benchmark that you can show the result of here. Thank you very much Dario for your quick review! > > For instance, a few weeks ago, Meng (while trying to do a completely > different thing) sent here on the list some numbers about the frequency > of the scheduler being called, and the overhead caused by that... Would > it be hard to run the same experiments and collect the numbers, with and > without your patch? I think there should be two ways to evaluate the performance: 1) Using xentrace to trace the frequency of the scheduler being called and the overhead, when we run workload inside guest domains; 2) Using cyclictest as Dario mentioned before to test the real-time performance at end user. Dagaen, I can provide you the commands to run it, which is actually quite simple to run. > > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote: > > To do this, we create a new list that holds, for each > > vcpu, the time least into the future that it may need to be > > rescheduled. > > > Ok. Actually, what I really expected to see in this patch was, either: > a) a new, scheduler wide, list of replenishment times _and_ a timer, > always (re)programmed to fire at the most imminent one; or > b) one timer for each vcpu, always (re)programmed to fire at the time > instant when the replenishment for that vcpu should happen. Ah... 
Here what we are trying to do is: Reuse the existing timer that is used by rt_schedule(). We use that timer as the timer to invoke the schedule function (rt_schedule). So that timer has two purposes: (a) budget enforcement as it was; (b) budget replenish (and deadline enforcement since deadline = period for each VCPU). When the timer fires, the rt_schedule is called: enforcement the budget, replenish the budget of VCPU(s), and pick the (next) VCPU. In other words, we can think about the scheduler in another way: The scheduling decision happens at the following two time points: (a) budget enforcement as it was; (b) budget replenish and deadline enforcement; Whenever any of the above two situations occurs, the scheduler may need to pick the next VCPU to run or preempt a current running VCPU. Therefore, the logic for scheduling decision is unavoidable when either of these two situation occurs, IMO. In terms of performance/overhead, I think the option b) you pointed out has the benefit of low overhead in updating the budget because we don't need to hold the lock. However, the budget replenish occurs when deadline of the VCPU is changed (which means the next period of the VCPU arrives). Then it means the scheduler (may) need to make a new scheduling decision, in which situation, the scheduler will hold the lock for the runq. So I'm curious about the source of overhead the option b) could save over the current implementation/design Dagaen is doing. Of course, it is hard to tell the performance difference until we really run it. IMHO, we should at least have some mental excise and have good reasons to believe the performance could improve under some situations. > > And note that, when I say "timer", I mean an actual Xen timer, i.e., > those things that are started, stopped, and with a timer handling > routine being called when they expire. For an example, you can have a > look, in sched_credit.c, at 'ticker' or 'master_ticker'. 
> > Deciding between a) or b) isn't easy, without actually trying to > implement them. I personally prefer b), and I think it would be a > superior long term solution (see right below), but given te current > architecture of the RTDS scheduler (e.g., the fact that it has a global > runq, etc), I won't nack a), which would most likely be simpler. > > Performance and overhead wise, again, hard to tell in advance. b) would > introduce more overhead (as there are more timers), but will likely > reveal more scalable (which is not something of great importance for > this scheduler) and may allow for better latency (which _is_ something > of great importance for this scheduler :-) ), since there won't be the > need to lock the whole scheduler during the replenishments (which is, > OTOH, necessary with a)). Right. This is true. But when we are replenishing the budget, we are also changing the deadline, which affect VCPUs' priority and then affect schedule decision. > > And that's it for the replenishments. For enforcing the budget, well, we > have the ret.time value of the task_slice struct returned by > rt_schedule, so that's event driven already, and we don't need to do > much about it. > > Does all this make sense? Am I making myself clear enough? > If not, feel free to ask. Thank you very much for your detailed explanation. :-) I just had a little confusion on how much the overhead the option b) could save. > > > The scheduler chooses the lowest time off of this > > list and waits until the specified time instead of running every > > 1 ms as it did before. > > > Yeah, well, I see what you mean and how you this is actually succeeding > (at least AFAICT, by looking at the code, again, it would be nice to > have some numbers) in improving the scheduler behavior. > > However, this transition toward event driven-ness has two goals: > * improve the scheduler behavior [check, at least to some extent] > * improve the code (readability, maintainability, etc.) 
> [not check at all :-(] I see. We did consider the readability and maintainability factor in the design but I think we just neglect the standards that "define" the rules of readability and maintainability. Is there some documentation that we could follow? > > As an example, the __repl_update() function: it's called 2 times inside > rt_schedule() and a third from rt_context_saved(), which is basically > like saying it's called 3 times from rt_schedule(), and this always made > very few sense to me. I see. This part should be improved. This is not good for sure. > In fact, I think that scheduling decisions and > replenishment events shouldn't be so tightly coupled. There are > interdependencies, of course (a replenishment could call for a > scheduling decision to be taken), but not like this. I see. I think this is the root reason why we had one kind of design and you had another kind of design. :-) I see how you think this interdependecies should be handled (via Xen timers per VCPU in option b), but I didn't quite get the reason/principles why you think the current design is not good to handle such interdependencies. :-( > That's why I say > I'd like to see a timer handling routine dealing with replenishment, and > let rt_schedule() do it's job, which is: > - enforcing the budget of the running vcpu. If it expires, > _(re)program_ the replenishment timer > - choose the best vcpu to run next, if necessary > > With this patch, you're still using rt_schedule() to do both scheduling > and replenishments, although you're now doing it at actual replenishment > times, instead of checking for them every millisecond. > > Also, the bits and pieces that you need to add in order to deal with > this new queue is, really, making things even more complex than they are > now, which is undesirable. > > So, all this being said, let me know what you think about it (and that > applies to everyone else as well, of course, in particular Meng). 
I think I didn't quite get the underlining principles/reasons why the current design is not good to handle the interdependencies between budget replenishment and scheduling decisions. I can understand the strength of option b). I think I can have better understanding of option b) if I could understand the underlining principles/reasons/rationale why the current design is not good. (I'm not saying/arguing the current design is good. I just tried to understand the weakness of it so that we could know how to handle it best.) > > I won't comment on the code in too much details, as it will require some > quite radical restructuring, but I'll skim through it and provide some > hints anyway. Sure. > > > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> > > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> > > --- > > xen/common/sched_rt.c | 319 ++++++++++++++++++++++++++++++++++--------------- > > 1 file changed, 222 insertions(+), 97 deletions(-) > > > First of all, it's a big patch. It's only changing one file and one > logical component, and for that reason, TBH, it's not that hard to > review. Still I think you can break it in at least two, and perhaps even > more, if you try to implement what I described above. > > > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c > > index 7c39a9e..25f0458 100644 > > --- a/xen/common/sched_rt.c > > +++ b/xen/common/sched_rt.c > > > @@ -156,6 +160,7 @@ struct rt_vcpu { > > s_time_t cur_budget; /* current budget */ > > s_time_t last_start; /* last start time */ > > s_time_t cur_deadline; /* current deadline for EDF */ > > + s_time_t next_sched_needed; /* next time to make scheduling decision */ > > > As an example of why I said that things should become simpler, and are > instead becoming more complex: with my solution, you don't need anything > like this. In fact, budget enforcement is already being done ok in > rt_schedule(), so no need to do anything more/different about it. 
> Replenishments should be programmed to fire at cur_deadline, so again, > no need to add this new field, and multiplex its actual value between > budget and deadline (which, yes, counts as being something rather > complex for me! :-D). > > > @@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc) > > list_del_init(&svc->q_elem); > > } > > > > +static inline void __t_remove(struct rt_vcpu *svc) > > +{ > > + if( !list_empty(&svc->t_elem) ) > > + list_del_init(&svc->t_elem); > > +} > > + > > > You're using hard tabs for indenting here. As you see everywhere esle, > Xen wants 4 spaces for this. > > > /* > > * Insert svc with budget in RunQ according to EDF: > > * vcpus with smaller deadlines go first. > > @@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) > > } > > > > /* > > + * Insert svc into the timerq, maintaining ascending order by next_sched_needed. > > + */ > > +static void __timerq_insert(const struct scheduler *ops, struct rt_vcpu *svc) > > +{ > > + struct rt_private *prv = rt_priv(ops); > > + struct list_head *timerq = rt_timerq(ops); > > + struct list_head *iter = timerq; > > + > > + ASSERT( spin_is_locked(&prv->lock) ); > > + > > + ASSERT( list_empty(&svc->t_elem) ); > > + > The blank line between the asserts, I'd kill it. > > > + list_for_each(iter, timerq) > > + { > > > You're still using tabs, and mixing them with spaces, making things look > even more cumbersome. > > > +/* > > + * Return how far the lowest time on the timerq (that is after NOW) is in the > > + * future. > > + * Lock must be grabbed before calling this. 
> > + */ > > +static s_time_t __timerq_next(const struct scheduler *ops, s_time_t now) > > +{ > > + struct list_head *timerq = rt_timerq(ops); > > + struct list_head *iter, *tmp; > > + > > + list_for_each_safe(iter, tmp, timerq) > > + { > > + struct rt_vcpu * iter_svc = __t_elem(iter); > > + > > + if ( iter_svc->next_sched_needed > now ) > > + return (iter_svc->next_sched_needed - now); > > + else > > + __t_remove(iter_svc); > > + } > > + > > + return MAX_SCHEDULE; > > +} > > > If using a timer, you can just get rid of items while, in the timer > routine, you handle the event associated to them. > > Also, why is MAX_SCHEDULE still there? Is it ok to use a BUG_ON here if it will never be called? > > > +/* > > + * Updates the next_sched_needed field for a vcpu, which is used for > > + * determining the next needed schedule time for this vcpu. Then updates > > + * timerq via reinsert. > > + */ > > +static void update_sched_time(const struct scheduler *ops, s_time_t now, > > + struct rt_vcpu *svc) > > +{ > > + /* update next needed schedule time */ > > + if( test_bit(__RTDS_scheduled, &svc->flags) ) > > + svc->next_sched_needed = now + svc->cur_budget; > > + else > > + svc->next_sched_needed = svc->cur_deadline; > > + > > + /* Remove and reinsert to maintain sorted order in timerq */ > > + __t_remove(svc); > > + __timerq_insert(ops, svc); > > + > > + return; > > +} > > > And here's the multiplexing, which --even worse-- (may) require > rearranging the queue! We really don't need anything like this. > > > /* > > + * Pick a cpu where to run a vcpu, > > + * possibly kicking out the vcpu running there > > + * Called by wake() and context_saved() > > + * We have a running candidate here, the kick logic is: > > + * Among all the cpus that are within the cpu affinity > > + * 1) if the new->cpu is idle, kick it. This could benefit cache hit > > + * 2) if there are any idle vcpu, kick it. 
> > + * 3) now all pcpus are busy; > > + * among all the running vcpus, pick lowest priority one > > + * if snext has higher priority, kick it. > > + * > > + * TODO: > > + * 1) what if these two vcpus belongs to the same domain? > > + * replace a vcpu belonging to the same domain introduces more overhead > > + * > > + * lock is grabbed before calling this function > > + */ > > +static void > > +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) > > +{ > > + struct rt_private *prv = rt_priv(ops); > > + struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */ > > + struct rt_vcpu *iter_svc; > > + struct vcpu *iter_vc; > > + int cpu = 0, cpu_to_tickle = 0; > > + cpumask_t not_tickled; > > + cpumask_t *online; > > + > > > [snip] > > And here you are moving a function up. In general, it is better to have > separate patches that do the movings, with the fact that the are all > about moving and that the code is not being changed, clearly stated in > the commit message. This is because it is really really hard to figure > out, e.g. from here, whether you also changed something in the function > while moving it, making reviewing a lot harder and more prone to error. > > That being said, in this specific case, you're moving up runq_tickle(), > the purpose of which is to trigger a call to rt_schedule() (after going > through the softirq machinery)... in order to call it in rt_schedule(). > That's pretty tricky, and is not how tickling should work. > > Again, with an approach that really take advantage of timers, this won't > be necessary. The reason why we need to call runq_tickle here is to pick the "best" CPU to run the next VCPU. Consider the following situation: The timer may be invoked on a CPU i, which has a medium-priority VCPU running. However, we may have another CPU j, which has the lowest priority VCPU running. The next VCPU should be place onto the CPU j to run instead of on CPU i based on the global EDF scheduling policy. 
Because we did not control where the timer is placed when we arm the timer (in the current design), the timer may be invoked on CPU i, and that's why we need runq_tickle() here. But this is a (minor) issue since we need to settle on the design first. :-) Thank you again for your helpful comments! Best regards, Meng ----------- Meng Xu PhD Student in Computer and Information Science University of Pennsylvania http://www.cis.upenn.edu/~mengxu/ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-10 5:54 ` Meng Xu @ 2015-06-10 17:46 ` Dario Faggioli 2015-06-11 5:50 ` Meng Xu 0 siblings, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-10 17:46 UTC (permalink / raw) To: Meng Xu Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich, Chong Li, Dagaen Golomb [-- Attachment #1.1: Type: text/plain, Size: 9741 bytes --] On Tue, 2015-06-09 at 22:54 -0700, Meng Xu wrote: > Hi Dario and Dagaen, > Hi, > 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>: > > Thanks for doing this. The first question I have is whether you run any > > test/benchmark that you can show the result of here. > > Thank you very much Dario for your quick review! > It wasn't as quick as I wanted it to be. I'll improve on that. :-) > > For instance, a few weeks ago, Meng (while trying to do a completely > > different thing) sent here on the list some numbers about the frequency > > of the scheduler being called, and the overhead caused by that... Would > > it be hard to run the same experiments and collect the numbers, with and > > without your patch? > > I think there should be two ways to evaluate the performance: > 1) Using xentrace to trace the frequency of the scheduler being called > and the overhead, when we run workload inside guest domains; > Yes, that counts more as a way of testing whether what is done in here actually works, as we expect the scheduler to be invoked a lot less, and that would tell us whether or not it is the case. So, please, provide these numbers (at least the number/frequency of invocation, overhead measurements can come later) as soon as practical. > 2) Using cyclictest as Dario mentioned before to test the real-time > performance at end user. Dagaen, I can provide you the commands to run > it, which is actually quite simple to run. 
> It is indeed, and that's more similar to a proper performance evaluation of an aspect which is rather critical for this scheduler, and really important for people wanting to use it. So, yes, seeing some results would be great, independently from the specific work done in this patch. > > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote: > > > To do this, we create a new list that holds, for each > > > vcpu, the time least into the future that it may need to be > > > rescheduled. > > > > > Ok. Actually, what I really expected to see in this patch was, either: > > a) a new, scheduler wide, list of replenishment times _and_ a timer, > > always (re)programmed to fire at the most imminent one; or > > b) one timer for each vcpu, always (re)programmed to fire at the time > > instant when the replenishment for that vcpu should happen. > > Ah... > > Here what we are trying to do is: > Reuse the existing timer that is used by rt_schedule(). > Yeah, and what I can't figure out is why you decided to do so. The reason I don't like it is that things become a lot (more) complex to understand, maintain and modify. > We use that > timer as the timer to invoke the schedule function (rt_schedule). So > that timer has two purposes: > (a) budget enforcement as it was; > (b) budget replenish (and deadline enforcement since deadline = period > for each VCPU). > Exactly, the timer has two purposes. It actually has 3, as you're saying yourself ("budget replenish (*and* deadline enforcement"). It should have exactly one. :-) > When the timer fires, the rt_schedule is called: enforcement the > budget, replenish the budget of VCPU(s), and pick the (next) VCPU. > Right. OTOH, I think that all it should do is pick the next vcpu, if necessary. In fact, the purpose of all this that Dagaen is doing is to dramatically reduce the calls to rt_scheduler(), when that is not necessary. 
Well, replenishing the budget of all running vcpus around is not the core scheduling function's business, especially considering that we're holding a global spinlock. > In other words, we can think about the scheduler in another way: > The scheduling decision happens at the following two time points: > (a) budget enforcement as it was; > (b) budget replenish and deadline enforcement; > The scheduler is all that's in sched_rt.c, point here is how things should be organized. > Whenever any of the above two situations occurs, the scheduler may > need to pick the next VCPU to run or preempt a current running VCPU. > Therefore, the logic for scheduling decision is unavoidable when > either of these two situation occurs, IMO. > A budget replenishment for a vcpu with a big period, in a busy system, may well end up not requiring any immediate rescheduling of none of the lower period vcpus, may not it? So, with your approach, it is like this, when replenishment time for vcpu X arrives: timer interrupt TIMER_SOFTIRQ raised process softirqs SCHEDULE_SOFTIRQ raised process softirqs rt_schedule() [spin_lock] burn_budget(scurr) __repl_update() (goes through all runnable vcpus!!) snext = __runq_pick(): snext == scurr runq_tickle(snext)...WAIT, WHAT?!?! :-O I mean, and I'm noticing this now, if the replenishments done during a particular call to rt_schedule() are not enough to change the situation on that particular pcpu, and hence the task which was running (and that you are deliberately disturbing with _a_full_execution_ of the scheduler's core function!) should continue to do so, you're calling runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a newly replenished vcpu! Am I missing or misreading something? Let's assume not, and see what happens in this case... 
Looking at runq_tickle(), if there are idle pcpus, you'll probably tickle one of them, which will likely pick up the newly replenished vcpu, like this (as a followup of the 'callchain' above): process softirqs SCHEDULE_SOFTIRQ rt_schedule() [spin_lock] __repl_update() (goes through all runnable vcpus!! AGAIN!!) snext = runq_pick(): snext == vcpu X [spin_unlock] If there are no idle pcpus, you probably won't tickle anyone, which is also fine. So, yay, it works!! Is this abuse of runq_tickle() there by design? :-D Jokes apart, if the above analysis is accurate, I think this is a good enough example of what I meant when saying to Dagaen "this is making things too complex". > In terms of performance/overhead, I think the option b) you pointed > out has the benefit of low overhead in updating the budget because we > don't need to hold the lock. However, the budget replenish occurs when > deadline of the VCPU is changed (which means the next period of the > VCPU arrives). Then it means the scheduler (may) need to make a new > scheduling decision, in which situation, the scheduler will hold the > lock for the runq. So I'm curious about the source of overhead the > option b) could save over the current implementation/design Dagaen is > doing. > Leave my b) option alone, for now. With a) done as I'm suggesting, for one, you'll be removing __repl_update() from rt_schedule(), which means no longer going through the list of all runnable vcpus with the global scheduler spin lock held, which really is something we should aim at for this scheduler, sooner rather than later. Here's how I envision things to go. 
Again, I'm talking about sticking with option a), so no per-vcpu timers, just 1 timer and a global queue, which now is a replenishment queue: timer interrupt TIMER_SOFTIRQ raised process softirqs replenishment_timer_handler() [spin_lock] <for_each_replenishment_event(repl_time < NOW()) { replenish(vcpu) runq_tickle(vcpu) }> [spin_lock] Then, on the tickled pcpus (if any): process softirqs SCHEDULE_SOFTIRQ rt_schedule() [spin_lock] snext = runq_pick(): snext == vcpu X [spin_unlock] And get rid of __repl_update(), which makes my eyes bleed every time I open sched_rt.c. :-) Oh, and note that we probably can use a new spin lock for the replenishment queue, different from the existing global scheduler spinlock, lowering contention and increasing scalabiliy even further! > > > The scheduler chooses the lowest time off of this > > > list and waits until the specified time instead of running every > > > 1 ms as it did before. > > > > > Yeah, well, I see what you mean and how you this is actually succeeding > > (at least AFAICT, by looking at the code, again, it would be nice to > > have some numbers) in improving the scheduler behavior. > > > > However, this transition toward event driven-ness has two goals: > > * improve the scheduler behavior [check, at least to some extent] > > * improve the code (readability, maintainability, etc.) > > [not check at all :-(] > > I see. We did consider the readability and maintainability factor in > the design but I think we just neglect the standards that "define" the > rules of readability and maintainability. Is there some documentation > that we could follow? > There is not, and probably never will be... and that's fine, because that is not something you write down in a document. It's trial and error, following suits and making and looking at examples (like the analysis I made above). 
> I see how you think these interdependencies should be handled (via Xen > timers per VCPU in option b), but I didn't quite get the > reason/principles why you think the current design is not good to > handle such interdependencies. :-( > Right. I hope this is more clear now. Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-10 17:46 ` Dario Faggioli @ 2015-06-11 5:50 ` Meng Xu 2015-06-12 9:28 ` Dario Faggioli 0 siblings, 1 reply; 21+ messages in thread From: Meng Xu @ 2015-06-11 5:50 UTC (permalink / raw) To: Dario Faggioli Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich, Chong Li, Dagaen Golomb Hi Dario, First I think I got most of the points you raised/explained! They are very very clear and thanks for the detailed explanation of your idea! I have some minor questions about your comments/advices. > >> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>: > >> > Thanks for doing this. The first question I have is whether you run any >> > test/benchmark that you can show the result of here. >> >> Thank you very much Dario for your quick review! >> > It wasn't as quick as I wanted it to be. I'll improve on that. :-) > >> > For instance, a few weeks ago, Meng (while trying to do a completely >> > different thing) sent here on the list some numbers about the frequency >> > of the scheduler being called, and the overhead caused by that... Would >> > it be hard to run the same experiments and collect the numbers, with and >> > without your patch? >> >> I think there should be two ways to evaluate the performance: >> 1) Using xentrace to trace the frequency of the scheduler being called >> and the overhead, when we run workload inside guest domains; >> > Yes, that counts more as a way of testing whether what is done in here > actually works, as we expect the scheduler to be invoked a lot less, and > that would tell us whether or not it is the case. > > So, please, provide these numbers (at least the number/frequency of > invocation, overhead measurements can come later) as soon as practical. > >> 2) Using cyclictest as Dario mentioned before to test the real-time >> performance at end user. 
Dagaen, I can provide you the commands to run >> it, which is actually quite simple to run. >> > It is indeed, and that's more similar to a proper performance evaluation > of an aspect which is rather critical for this scheduler, and really > important for people wanting to use it. > > So, yes, seeing some results would be great, independently from the > specific work done in this patch. Right! I have some ideas about this test but won't want to mess up the focus of this thread. :-) I will raise this test again when we come to it. > >> > On Mon, 2015-06-08 at 07:46 -0400, Dagaen Golomb wrote: >> > > To do this, we create a new list that holds, for each >> > > vcpu, the time least into the future that it may need to be >> > > rescheduled. >> > > >> > Ok. Actually, what I really expected to see in this patch was, either: >> > a) a new, scheduler wide, list of replenishment times _and_ a timer, >> > always (re)programmed to fire at the most imminent one; or >> > b) one timer for each vcpu, always (re)programmed to fire at the time >> > instant when the replenishment for that vcpu should happen. >> >> Ah... >> >> Here what we are trying to do is: >> Reuse the existing timer that is used by rt_schedule(). >> > Yeah, and what I can't figure out is why you decided to do so. The > reason I don't like it is that things become a lot (more) complex to > understand, maintain and modify. Now I get why you think it is harder to maintain. Right. reusing the timer will just make the rt_schedule complex and make the hot path longer. If other schedulers are using this timer only for the budget enforcement, it won't be a good idea to use it for other purposes in the RTDS scheduler, considering the design consistency of different schedulers. > >> We use that >> timer as the timer to invoke the schedule function (rt_schedule). 
So >> that timer has two purposes: >> (a) budget enforcement as it was; >> (b) budget replenish (and deadline enforcement since deadline = period >> for each VCPU). >> > Exactly, the timer has two purposes. It actually has 3, as you're saying > yourself ("budget replenish (*and* deadline enforcement"). It should > have exactly one. :-) > >> When the timer fires, the rt_schedule is called: enforcement the >> budget, replenish the budget of VCPU(s), and pick the (next) VCPU. >> > Right. OTOH, I think that all it should do is pick the next vcpu, if > necessary. In fact, the purpose of all this that Dagaen is doing is to > dramatically reduce the calls to rt_scheduler(), when that is not > necessary. Well, replenishing the budget of all running vcpus around is > not the core scheduling function's business, especially considering that > we're holding a global spinlock. > >> In other words, we can think about the scheduler in another way: >> The scheduling decision happens at the following two time points: >> (a) budget enforcement as it was; >> (b) budget replenish and deadline enforcement; >> > The scheduler is all that's in sched_rt.c, point here is how things > should be organized. Right. Got it. > >> Whenever any of the above two situations occurs, the scheduler may >> need to pick the next VCPU to run or preempt a current running VCPU. >> Therefore, the logic for scheduling decision is unavoidable when >> either of these two situation occurs, IMO. >> > A budget replenishment for a vcpu with a big period, in a busy system, > may well end up not requiring any immediate rescheduling of none of the > lower period vcpus, may not it? Right. VCPUs with larger period should get replenished with lower frequency. 
> > So, with your approach, it is like this, when replenishment time for > vcpu X arrives: > > timer interrupt > TIMER_SOFTIRQ raised > process softirqs > SCHEDULE_SOFTIRQ raised > process softirqs > rt_schedule() > [spin_lock] > burn_budget(scurr) > __repl_update() (goes through all runnable vcpus!!) > snext = __runq_pick(): snext == scurr Ah, I neglected this situation. As you said below, if scurr has higher priority than the top VCPU of runq, we are going to tickle the scurr, which should not happen. > runq_tickle(snext)...WAIT, WHAT?!?! :-O > > I mean, and I'm noticing this now, if the replenishments done during a > particular call to rt_schedule() are not enough to change the situation > on that particular pcpu, and hence the task which was running (and that > you are deliberately disturbing with _a_full_execution_ of the > scheduler's core function!) should continue to do so, you're calling > runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a > newly replenished vcpu! > > Am I missing or misreading something? Let's assume not, and see what > happens in this case... You are right! Although this issue (i.e., tickling on scurr instead of the next high priority VCPU) can be "fixed" (dirtily), it can be avoided with the design option a) you said. > > Looking at runq_tickle(), if there are idle pcpus, you'll probably > tickle one of them, which will likely pick up the newly replenished > vcpu, like this (as a followup of the 'callchain' above): > > process softirqs > SCHEDULE_SOFTIRQ > rt_schedule() > [spin_lock] > __repl_update() (goes through all runnable vcpus!! AGAIN!!) > snext = runq_pick(): snext == vcpu X > [spin_unlock] > > If there are no idle pcpus, you probably won't tickle anyone, which is > also fine. > > So, yay, it works!! Is this abuse of runq_ticke() there by desing? :-D Yes. It abuse the use of rt_schedule() actually. 
Some of the operations (e.g., __repl_update() ) are unnecessarily called twice, which is a bad thing in the hot path. :-( > > Jokes apart, if the above analysis is accurate, I think this is a good > enough example of what I meant when saying to Dagaen "this is making > things too complex". Yes. The flow chart you drew is very clear! Thanks! @Dagaen, what do you think? Please comment on Dario's reply with your opinion and raise any of your concerns. > >> In terms of performance/overhead, I think the option b) you pointed >> out has the benefit of low overhead in updating the budget because we >> don't need to hold the lock. However, the budget replenish occurs when >> deadline of the VCPU is changed (which means the next period of the >> VCPU arrives). Then it means the scheduler (may) need to make a new >> scheduling decision, in which situation, the scheduler will hold the >> lock for the runq. So I'm curious about the source of overhead the >> option b) could save over the current implementation/design Dagaen is >> doing. >> > Leave my b) option alone, for now. With a) done as I'm suggesting, for > one, you'll be removing __repl_update() from rt_schedule(), which means > no longer going through the list of all runnable vcpus with the global > scheduler spin lock held, which really is something we should aim at for > this scheduler, sooner rather than later. > > Here's how I envision things to go. 
Again, I'm talking about sticking > with option a), so no per-vcpu timers, just 1 timer and a global queue, > which now is a replenishment queue: > > timer interrupt > TIMER_SOFTIRQ raised > process softirqs > replenishment_timer_handler() > [spin_lock] > <for_each_replenishment_event(repl_time < NOW()) { > replenish(vcpu) > runq_tickle(vcpu) > }> > [spin_lock] > > Then, on the tickled pcpus (if any): > > process softirqs > SCHEDULE_SOFTIRQ > rt_schedule() > [spin_lock] > snext = runq_pick(): snext == vcpu X > [spin_unlock] > > And get rid of __repl_update(), which makes my eyes bleed every time I > open sched_rt.c. :-) > > Oh, and note that we probably can use a new spin lock for the > replenishment queue, different from the existing global scheduler > spinlock, lowering contention and increasing scalabiliy even further! Here is the main question I have about your advice. If we are introducing a new spin lock for the replenishment queue, what is the relation between the new lock and the old lock of the runq? Because the deadline decides the priority of VCPUs and thus decides the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate on the runq as well. As shown in the workflow in your reply: > replenish(vcpu) > runq_tickle(vcpu) The runq_tickle(vcpu) will pick the desired CPU. On that CPU, rt_schedule will pick snext by runq_pick(). Therefore, the replenished VCPU need to be resorted in the runq. So replenish(vcpu) will operates on the runq. Another question in my mind is: do we really need the replenish queue to record the replenish time? Because the period = deadline in the current implementation, the deadline of VCPUs in runq is actually the replenish time. (We are reusing the runq here and I'm unsure if it good or not in terms of maintenance.) > >> > > The scheduler chooses the lowest time off of this >> > > list and waits until the specified time instead of running every >> > > 1 ms as it did before. 
>> > > >> > Yeah, well, I see what you mean and how this is actually succeeding >> > (at least AFAICT, by looking at the code, again, it would be nice to >> > have some numbers) in improving the scheduler behavior. >> > >> > However, this transition toward event driven-ness has two goals: >> > * improve the scheduler behavior [check, at least to some extent] >> > * improve the code (readability, maintainability, etc.) >> > [not check at all :-(] >> >> I see. We did consider the readability and maintainability factor in >> the design but I think we just neglected the standards that "define" the >> rules of readability and maintainability. Is there some documentation >> that we could follow? >> > There is not, and probably never will be... and that's fine, because > that is not something you write down in a document. > > It's trial and error, following suits and making and looking at examples > (like the analysis I made above). I see. I like the analysis you made very much! Thanks! :-) > >> I see how you think these interdependencies should be handled (via Xen >> timers per VCPU in option b), but I didn't quite get the >> reason/principles why you think the current design is not good to >> handle such interdependencies. :-( >> > Right. I hope this is more clear now. Yes. I think I get your idea now. The explanation is very clear and neat! :-) Thank you very much and best regards, Meng -- ----------- Meng Xu PhD Student in Computer and Information Science University of Pennsylvania http://www.cis.upenn.edu/~mengxu/ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-11 5:50 ` Meng Xu @ 2015-06-12 9:28 ` Dario Faggioli 2015-06-13 17:21 ` Meng Xu 0 siblings, 1 reply; 21+ messages in thread From: Dario Faggioli @ 2015-06-12 9:28 UTC (permalink / raw) To: Meng Xu Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich, Chong Li, Dagaen Golomb [-- Attachment #1.1: Type: text/plain, Size: 7868 bytes --] On Wed, 2015-06-10 at 22:50 -0700, Meng Xu wrote: > Hi Dario, > > First I think I got most of the points you raised/explained! They are > very very clear and thanks for the detailed explanation of your idea! > Glad you got it and (sort of :-) ) agree. > >> 2015-06-09 5:53 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>: > > > >> 2) Using cyclictest as Dario mentioned before to test the real-time > >> performance at end user. Dagaen, I can provide you the commands to run > >> it, which is actually quite simple to run. > > So, yes, seeing some results would be great, independently from the > > specific work done in this patch. > > Right! I have some ideas about this test but won't want to mess up the > focus of this thread. :-) > I will raise this test again when we come to it. > Ok. Looking forward to see this happening. > > Yeah, and what I can't figure out is why you decided to do so. The > > reason I don't like it is that things become a lot (more) complex to > > understand, maintain and modify. > > Now I get why you think it is harder to maintain. Right. reusing the > timer will just make the rt_schedule complex and make the hot path > longer. > Exactly. > > runq_tickle(snext)...WAIT, WHAT?!?! :-O > > > > I mean, and I'm noticing this now, if the replenishments done during a > > particular call to rt_schedule() are not enough to change the situation > > on that particular pcpu, and hence the task which was running (and that > > you are deliberately disturbing with _a_full_execution_ of the > > scheduler's core function!) 
should continue to do so, you're calling > > runq_tickle() right on it. So, AFAICT, you're tickling scurr, not a > > newly replenished vcpu! > > > > Am I missing or misreading something? Let's assume not, and see what > > happens in this case... > > You are right! Although this issue (i.e., tickling on scurr instead of > the next high priority VCPU) can be "fixed" (dirtily), it can be > avoided with the design option a) you said. > Of course it can be fixed.. Pretty much everything can! Point is the reason why it happened, and how to make these things not happen and/or easier to figure out. That's (one of) the point(s) of keeping things simple and self contained, even within a single component (like a scheduler), instead of "let's do everything in this 10k lines function!" :-P Glad that you saw what I meant. > > Jokes apart, if the above analysis is accurate, I think this is a good > > enough example of what I meant when saying to Dagaen "this is making > > things too complex". > Yes. The flow chart you drew is very clear! Thanks! > > @Dagaen, what do you think? Please comment on Dario's reply with your > opinion and raise any of your concerns. > Indeed. > > Here's how I envision things to go. Again, I'm talking about sticking > > with option a), so no per-vcpu timers, just 1 timer and a global queue, > > which now is a replenishment queue: > > > > timer interrupt > > TIMER_SOFTIRQ raised > > process softirqs > > replenishment_timer_handler() > > [spin_lock] > > <for_each_replenishment_event(repl_time < NOW()) { > > replenish(vcpu) > > runq_tickle(vcpu) > > }> > > [spin_lock] > > > > Then, on the tickled pcpus (if any): > > > > process softirqs > > SCHEDULE_SOFTIRQ > > rt_schedule() > > [spin_lock] > > snext = runq_pick(): snext == vcpu X > > [spin_unlock] > > > > And get rid of __repl_update(), which makes my eyes bleed every time I > > open sched_rt.c. 
:-) > > > > Oh, and note that we probably can use a new spin lock for the > > replenishment queue, different from the existing global scheduler > > spinlock, lowering contention and increasing scalability even further! > > Here is the main question I have about your advice. > If we are introducing a new spin lock for the replenishment queue, > what is the relation between the new lock and the old lock of the > runq? > That's hard to tell in advance, you'll know it completely only when trying to implement it. But, yes, when more locks are involved, it's important to figure out the relationships between each other, or we risk introducing deadlock or, even if things are correct, fail to improve the performance (or do even worse!!). The idea is, since the two operations (scheduling/budget enforcement and replenishments) are logically independent, and if they're implemented in a way that stresses this independence, then it makes sense to try to use different spin locks. As soon as you have more than one spin lock, what you should pay the most attention to is, if they need to 'nest' (one is acquired when the other is already being held), that has to happen consistently, or deadlock will occur! :-/ > Because the deadline decides the priority of VCPUs and thus decides > the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate > on the runq as well. As shown in the workflow in your reply: > > replenish(vcpu) > > runq_tickle(vcpu) > The runq_tickle(vcpu) will pick the desired CPU. On that CPU, > rt_schedule will pick snext by runq_pick(). Therefore, the replenished > VCPU needs to be re-sorted in the runq. So replenish(vcpu) will operate > on the runq. > Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly must be done with the runqueue lock (which is the global scheduler lock, at the moment) held. Reading again what I wrote, I realize that I probably expressed myself really badly, when hinting at that "let's use another lock" thing. 
What I really wanted to say is that, if there will be a replenishment queue, i.e., an event queue on which replenishment events are queued, and that the timer handling routine scans, there may be some locking required for consistently dealing with the queue itself. If that will be the case, we probably can use _another_ spin lock, and let the timer handling routine acquire the runqueue lock *only* when/if it gets to do the insertions and the ticklings. Sorry for this... I added that paragraph quickly, right before hitting 'send'... I probably would have better not done so! :-P Again, this may or may not be necessary, and may or may not be possible, depending on whether the two locks would nest nicely everywhere they're required. This also depends on how we'll deal with the replenishment timer. So, the bottom line of all this is: get down to it, and we'll see how it will be better put together! :-D > Another question in my mind is: do we really need the replenish queue > to record the replenish time? Because the period = deadline in the > current implementation, the deadline of VCPUs in runq is actually the > replenish time. (We are reusing the runq here and I'm unsure if it > is good or not in terms of maintenance.) > Again, let's see. The timer will be always programmed to fire at the most imminent replenishment event, so it seems to me that you need something that will tell you, when servicing replenishment X, what's the next time instant you want the timer to fire, to perform the next replenishment. Actually, the depleted queue you have now, can well become the replenishment queue (it will have to be kept sorted, though, I think). Whether you want to keep it as the tail of the actual runqueue, or split the two of them, it does not matter much, IMO. 
Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) [-- Attachment #1.2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 181 bytes --] [-- Attachment #2: Type: text/plain, Size: 126 bytes --] _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling. 2015-06-12 9:28 ` Dario Faggioli @ 2015-06-13 17:21 ` Meng Xu 0 siblings, 0 replies; 21+ messages in thread From: Meng Xu @ 2015-06-13 17:21 UTC (permalink / raw) To: Dario Faggioli, Dagaen Golomb Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich, Chong Li Hi Dario, I think I got what you meant 100% now. :-) I will ping Dagaen to see what he thinks and if he has any concerns. > > > > Here's how I envision things to go. Again, I'm talking about sticking > > > with option a), so no per-vcpu timers, just 1 timer and a global queue, > > > which now is a replenishment queue: > > > > > > timer interrupt > > > TIMER_SOFTIRQ raised > > > process softirqs > > > replenishment_timer_handler() > > > [spin_lock] > > > <for_each_replenishment_event(repl_time < NOW()) { > > > replenish(vcpu) > > > runq_tickle(vcpu) > > > }> > > > [spin_lock] > > > > > > Then, on the tickled pcpus (if any): > > > > > > process softirqs > > > SCHEDULE_SOFTIRQ > > > rt_schedule() > > > [spin_lock] > > > snext = runq_pick(): snext == vcpu X > > > [spin_unlock] > > > > > > And get rid of __repl_update(), which makes my eyes bleed every time I > > > open sched_rt.c. :-) > > > > > > Oh, and note that we probably can use a new spin lock for the > > > replenishment queue, different from the existing global scheduler > > > spinlock, lowering contention and increasing scalabiliy even further! > > > > Here is the main question I have about your advice. > > If we are introducing a new spin lock for the replenishment queue, > > what is the relation between the new lock and the old lock of the > > runq? > > > That's hard to tell in advance, you'll know it completely only when > trying to implement it. 
But, yes, when more locks are involved, it's > impotant to figure out the relationships between each other, or we risk > introducing deadlock or, even if things are correct, fail to improve the > performance (or do even worse!!). > > The idea is, since the two operations (scheduling/budget enforcement and > replenishments) are logically independent, and if they're implemented in > a way that stress this independence, then it make sens to try to use > different spin locks. > > As soon as you have more than one spin lock, what you should pay the > most attention to is, if they need to 'nest' (one is acquired when the > other is already being held), that has to happen consistently, or > deadlock will occur! :-/ > > > Because the deadline decides the priority of VCPUs and thus decides > > the ordering of VCPUs in the runq, the "replenish(vcpu)" will operate > > on the runq as well. As shown in the workflow in your reply: > > > replenish(vcpu) > > > runq_tickle(vcpu) > > The runq_tickle(vcpu) will pick the desired CPU. On that CPU, > > rt_schedule will pick snext by runq_pick(). Therefore, the replenished > > VCPU need to be resorted in the runq. So replenish(vcpu) will operates > > on the runq. > > > Oh, sure, absolutely. Calling __runq_insert() and runq_tickle(), clearly > must be done with the runqueue lock (which is the global scheduler lock, > at the moment) held. > > Reading again what I wrote, I realize that I probably expressed myself > really bad, when hinting at that "let's use another lock" thing. > > What I really wanted to say is that, if there will be a replenishment > queue, i.e., an event queue on which replenishment events are queued, > and that the timer handling routine scans, there may be some locking > required for consistently dealing with the queue itself. 
If that will be > the case, we probably can use _another_ spin lock, and let the timer > handling routine acquire the runqueue lock *only* when/if it gets to do > the insertions and the ticklings. > Sorry for this... I added that paragraph quickly, right before hitting > 'send'... I probably would have better not done so! :-P > > Again, this may or may not be necessary, and may or may not be possible, > depending on whether the two locks would nest nicely everywhere they're > required. This also depends on how we'll deal with the replenishment > timer. > > So, the bottom line of all this is: get down to it, ad we'll see how it > will be better put together! :-D I see. Got it. :-) > > > Another question in my mind is: do we really need the replenish queue > > to record the replenish time? Because the period = deadline in the > > current implementation, the deadline of VCPUs in runq is actually the > > replenish time. (We are reusing the runq here and I'm unsure if it > > good or not in terms of maintenance.) > > > Again, let's see. The timer will be always programmed to fire at the > most imminent replenishment event, so it seems to me that you need > something that will tell you, when servicing replenishment X, what's the > next time instant you want the timer to fire, to perform the next > replenishment. > > Actually, the depleted queue you have know, can well become the > replenishment queue (it will have to be kept sorted, though, I think). > Whether you want to keep it as the tail of the actual runqueue, or split > the two of them, it does not matter much, IMO. Right. The runq and the depleted queue actually has the information of the next replenish time for all VCPUs, after the depleted queue is changed to be sorted. Best regards, Meng ----------- Meng Xu PhD Student in Computer and Information Science University of Pennsylvania http://www.cis.upenn.edu/~mengxu/ ^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2015-06-22 11:58 UTC | newest] Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2015-06-08 11:46 [PATCH] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb 2015-06-09 12:53 ` Dario Faggioli 2015-06-10 4:18 ` Dagaen Golomb 2015-06-10 22:30 ` Dario Faggioli 2015-06-13 20:33 ` Dagaen Golomb 2015-06-16 17:07 ` Dagaen Golomb 2015-06-17 0:20 ` Dario Faggioli 2015-06-17 5:24 ` Dagaen Golomb 2015-06-17 5:45 ` Meng Xu 2015-06-17 6:03 ` Dagaen Golomb 2015-06-17 6:09 ` Meng Xu 2015-06-17 9:20 ` Dario Faggioli 2015-06-17 8:27 ` Dario Faggioli 2015-06-18 18:07 ` Dagaen Golomb 2015-06-22 9:11 ` Dario Faggioli 2015-06-22 11:58 ` Dagaen Golomb 2015-06-10 5:54 ` Meng Xu 2015-06-10 17:46 ` Dario Faggioli 2015-06-11 5:50 ` Meng Xu 2015-06-12 9:28 ` Dario Faggioli 2015-06-13 17:21 ` Meng Xu
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).