All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4] xen: sched: convert RTDS from time to event driven model
@ 2016-02-01  4:32 Tianyang Chen
  2016-02-02 15:08 ` Dario Faggioli
  0 siblings, 1 reply; 14+ messages in thread
From: Tianyang Chen @ 2016-02-01  4:32 UTC (permalink / raw)
  To: xen-devel
  Cc: dario.faggioli, Tianyang Chen, george.dunlap, Dagaen Golomb, Meng Xu

v4 is meant for discussion on the addition of replq.

Changes since v3:
	removed running queue.
	added repl queue to keep track of repl events.
	timer is now per scheduler.
	timer is init on a valid cpu in a cpupool.

Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a
pool and added to another, the lock equality assert in free_pdata()
fails when Pool-0 is using rtds.

This patch is based on master branch after commit 2e46e3
x86/mce: fix misleading indentation in init_nonfatal_mce_checker()

Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
---
 xen/common/sched_rt.c |  262 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 192 insertions(+), 70 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 2e5430f..c36e5de 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -16,6 +16,7 @@
 #include <xen/delay.h>
 #include <xen/event.h>
 #include <xen/time.h>
+#include <xen/timer.h>
 #include <xen/perfc.h>
 #include <xen/sched-if.h>
 #include <xen/softirq.h>
@@ -87,7 +88,7 @@
 #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
 
 #define UPDATE_LIMIT_SHIFT      10
-#define MAX_SCHEDULE            (MILLISECS(1))
+
 /*
  * Flags
  */
@@ -142,6 +143,9 @@ static cpumask_var_t *_cpumask_scratch;
  */
 static unsigned int nr_rt_ops;
 
+/* handler for the replenishment timer */
+static void repl_handler(void *data);
+
 /*
  * Systme-wide private data, include global RunQueue/DepletedQ
  * Global lock is referenced by schedule_data.schedule_lock from all
@@ -152,7 +156,9 @@ struct rt_private {
     struct list_head sdom;      /* list of availalbe domains, used for dump */
     struct list_head runq;      /* ordered list of runnable vcpus */
     struct list_head depletedq; /* unordered list of depleted vcpus */
+    struct list_head replq;     /* ordered list of vcpus that need repl */
     cpumask_t tickled;          /* cpus been tickled */
+    struct timer *repl_timer;   /* replenishment timer */
 };
 
 /*
@@ -160,6 +166,7 @@ struct rt_private {
  */
 struct rt_vcpu {
     struct list_head q_elem;    /* on the runq/depletedq list */
+    struct list_head p_elem;    /* on the repl event list */
 
     /* Up-pointers */
     struct rt_dom *sdom;
@@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops)
     return &rt_priv(ops)->depletedq;
 }
 
+static inline struct list_head *rt_replq(const struct scheduler *ops)
+{
+    return &rt_priv(ops)->replq;
+}
+
 /*
- * Queue helper functions for runq and depletedq
+ * Queue helper functions for runq, depletedq
+ * and repl event q
  */
 static int
 __vcpu_on_q(const struct rt_vcpu *svc)
@@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
     return list_entry(elem, struct rt_vcpu, q_elem);
 }
 
+static struct rt_vcpu *
+__p_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, p_elem);
+}
+
+static int
+__vcpu_on_p(const struct rt_vcpu *svc)
+{
+   return !list_empty(&svc->p_elem);
+}
+
 /*
  * Debug related code, dump vcpu/cpu information
  */
@@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc)
         list_del_init(&svc->q_elem);
 }
 
+static inline void
+__p_remove(struct rt_vcpu *svc)
+{
+    if ( __vcpu_on_p(svc) )
+        list_del_init(&svc->p_elem);
+}
+
 /*
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
@@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 }
 
 /*
+ * Insert svc into the repl even list:
+ * vcpus that needs to be repl earlier go first.
+ */
+static void
+__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *replq = rt_replq(ops);
+    struct list_head *iter;
+
+    ASSERT( spin_is_locked(&prv->lock) );
+
+    ASSERT( !__vcpu_on_p(svc) );
+
+    list_for_each(iter, replq)
+    {
+        struct rt_vcpu * iter_svc = __p_elem(iter);
+        if ( svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+    }
+
+    list_add_tail(&svc->p_elem, iter);
+}
+
+
+/*
  * Init/Free related code
  */
 static int
@@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
     INIT_LIST_HEAD(&prv->sdom);
     INIT_LIST_HEAD(&prv->runq);
     INIT_LIST_HEAD(&prv->depletedq);
+    INIT_LIST_HEAD(&prv->replq);
 
     cpumask_clear(&prv->tickled);
 
@@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
         xfree(_cpumask_scratch);
         _cpumask_scratch = NULL;
     }
+
+    kill_timer(prv->repl_timer);
+
     xfree(prv);
 }
 
@@ -586,6 +648,7 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
         return NULL;
 
     INIT_LIST_HEAD(&svc->q_elem);
+    INIT_LIST_HEAD(&svc->p_elem);
     svc->flags = 0U;
     svc->sdom = dd;
     svc->vcpu = vc;
@@ -618,6 +681,10 @@ static void
 rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
 {
     struct rt_vcpu *svc = rt_vcpu(vc);
+    struct rt_private *prv = rt_priv(ops);
+    struct timer *repl_timer;
+    int cpu;
+
     s_time_t now = NOW();
     spinlock_t *lock;
 
@@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
     vcpu_schedule_unlock_irq(lock, vc);
 
     SCHED_STAT_CRANK(vcpu_insert);
+
+    if( prv->repl_timer == NULL )
+    {   
+        /* vc->processor has been set in schedule.c */
+        cpu = vc->processor;
+
+        repl_timer = xzalloc(struct timer);
+
+        prv->repl_timer = repl_timer;
+        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
+    }
 }
 
 /*
@@ -785,44 +863,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
 }
 
 /*
- * Update vcpu's budget and
- * sort runq by insert the modifed vcpu back to runq
- * lock is grabbed before calling this function
- */
-static void
-__repl_update(const struct scheduler *ops, s_time_t now)
-{
-    struct list_head *runq = rt_runq(ops);
-    struct list_head *depletedq = rt_depletedq(ops);
-    struct list_head *iter;
-    struct list_head *tmp;
-    struct rt_vcpu *svc = NULL;
-
-    list_for_each_safe(iter, tmp, runq)
-    {
-        svc = __q_elem(iter);
-        if ( now < svc->cur_deadline )
-            break;
-
-        rt_update_deadline(now, svc);
-        /* reinsert the vcpu if its deadline is updated */
-        __q_remove(svc);
-        __runq_insert(ops, svc);
-    }
-
-    list_for_each_safe(iter, tmp, depletedq)
-    {
-        svc = __q_elem(iter);
-        if ( now >= svc->cur_deadline )
-        {
-            rt_update_deadline(now, svc);
-            __q_remove(svc); /* remove from depleted queue */
-            __runq_insert(ops, svc); /* add to runq */
-        }
-    }
-}
-
-/*
  * schedule function for rt scheduler.
  * The lock is already grabbed in schedule.c, no need to lock here
  */
@@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
 
-    __repl_update(ops, now);
 
     if ( tasklet_work_scheduled )
     {
@@ -868,6 +907,8 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         set_bit(__RTDS_delayed_runq_add, &scurr->flags);
 
     snext->last_start = now;
+
+    ret.time =  -1; /* if an idle vcpu is picked */ 
     if ( !is_idle_vcpu(snext->vcpu) )
     {
         if ( snext != scurr )
@@ -880,9 +921,11 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
             snext->vcpu->processor = cpu;
             ret.migrated = 1;
         }
+
+        ret.time = snext->budget; /* invoke the scheduler next time */
+
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
     ret.task = snext->vcpu;
 
     /* TRACE */
@@ -924,6 +967,10 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
         __q_remove(svc);
     else if ( svc->flags & RTDS_delayed_runq_add )
         clear_bit(__RTDS_delayed_runq_add, &svc->flags);
+
+    /* stop tracking the repl time of this vcpu */
+   /* if( __vcpu_on_p(svc) )
+       __p_remove(svc); */
 }
 
 /*
@@ -1027,23 +1074,21 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
     struct rt_vcpu * const svc = rt_vcpu(vc);
     s_time_t now = NOW();
     struct rt_private *prv = rt_priv(ops);
-    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
-    struct rt_dom *sdom = NULL;
-    cpumask_t *online;
+    struct timer *repl_timer = prv->repl_timer;
 
     BUG_ON( is_idle_vcpu(vc) );
 
     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
     {
         SCHED_STAT_CRANK(vcpu_wake_running);
-        return;
+        goto out;
     }
 
     /* on RunQ/DepletedQ, just update info is ok */
     if ( unlikely(__vcpu_on_q(svc)) )
     {
         SCHED_STAT_CRANK(vcpu_wake_onrunq);
-        return;
+        goto out;
     }
 
     if ( likely(vcpu_runnable(vc)) )
@@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
     if ( unlikely(svc->flags & RTDS_scheduled) )
     {
         set_bit(__RTDS_delayed_runq_add, &svc->flags);
-        return;
+        goto out;
     }
 
+    /* budget repl here is needed before inserting back to runq. If so,
+     * it should be taken out of replq and put back. This can potentially
+     * cause the timer handler to replenish no vcpu when it triggers if
+     * the replenishment is done here already.
+     */
     if ( now >= svc->cur_deadline)
+    {   
         rt_update_deadline(now, svc);
+        if( __vcpu_on_p(svc) )
+            __p_remove(svc);
+    }
 
     /* insert svc to runq/depletedq because svc is not in queue now */
     __runq_insert(ops, svc);
 
-    __repl_update(ops, now);
-
-    ASSERT(!list_empty(&prv->sdom));
-    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-    online = cpupool_domain_cpumask(sdom->dom);
-    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
-
-    runq_tickle(ops, snext);
+    runq_tickle(ops, svc);
 
-    return;
+out:
+    /* a newly waken-up vcpu could have an earlier release time 
+     * or it could be the first to program the timer
+     */
+    if( repl_timer->expires == 0 || repl_timer->expires > svc->cur_deadline )
+        set_timer(repl_timer,svc->cur_deadline);
+
+    /* start tracking the repl time of this vcpu here 
+    * it stays on the replq unless it goes to sleep
+    * or marked as not-runnable
+    */
+    if( !__vcpu_on_p(svc) )
+       __replq_insert(ops, svc);
 }
 
 /*
@@ -1087,10 +1146,7 @@ static void
 rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
 {
     struct rt_vcpu *svc = rt_vcpu(vc);
-    struct rt_vcpu *snext = NULL;
-    struct rt_dom *sdom = NULL;
-    struct rt_private *prv = rt_priv(ops);
-    cpumask_t *online;
+
     spinlock_t *lock = vcpu_schedule_lock_irq(vc);
 
     clear_bit(__RTDS_scheduled, &svc->flags);
@@ -1100,17 +1156,9 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
 
     if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
          likely(vcpu_runnable(vc)) )
-    {
-        __runq_insert(ops, svc);
-        __repl_update(ops, NOW());
 
-        ASSERT(!list_empty(&prv->sdom));
-        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-        online = cpupool_domain_cpumask(sdom->dom);
-        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
+        __runq_insert(ops, svc);
 
-        runq_tickle(ops, snext);
-    }
 out:
     vcpu_schedule_unlock_irq(lock, vc);
 }
@@ -1168,6 +1216,80 @@ rt_dom_cntl(
     return rc;
 }
 
+/* The replenishment timer handler picks vcpus
+ * from the replq and does the actual replenishment
+ * the replq only keeps track of runnable vcpus
+ */
+static void repl_handler(void *data){
+    unsigned long flags;
+    s_time_t now = NOW();
+    s_time_t t_next = LONG_MAX; /* the next time timer should be triggered */
+    struct scheduler *ops = data; 
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *replq = rt_replq(ops);
+    struct timer *repl_timer = prv->repl_timer;
+    struct list_head *iter, *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    stop_timer(repl_timer);
+
+    spin_lock_irqsave(&prv->lock, flags);
+ 
+    list_for_each_safe(iter, tmp, replq)
+    {
+        svc = __p_elem(iter);
+
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+
+            if( t_next > svc->cur_deadline )
+                t_next = svc->cur_deadline;
+            
+            /* when the replenishment happens
+             * svc is either on a pcpu or on
+             * runq/depletedq
+             */
+            if( __vcpu_on_q(svc) )
+            {
+                /* put back to runq */
+                __q_remove(svc);
+                __runq_insert(ops, svc);
+                runq_tickle(ops, svc);
+            }
+
+            /* resort replq, keep track of this
+             * vcpu if it's still runnable. If 
+             * at this point no vcpu are, then
+             * the timer waits for wake() to
+             * program it.
+             */
+            __p_remove(svc);
+
+            if( vcpu_runnable(svc->vcpu) )
+                __replq_insert(ops, svc);
+        }
+
+        else
+        {
+            /* Break out of the loop if a vcpu's
+             * cur_deadline is bigger than now.
+             * Also when some replenishment is done
+             * in wake(), the code could end up here
+             * if the time fires earlier than it should
+            */
+            if( t_next > svc->cur_deadline )
+                t_next = svc->cur_deadline;
+            break;
+        }
+    }
+
+    set_timer(repl_timer, t_next);
+   
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+}
+
 static struct rt_private _rt_priv;
 
 static const struct scheduler sched_rtds_def = {
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-01  4:32 [PATCH v4] xen: sched: convert RTDS from time to event driven model Tianyang Chen
@ 2016-02-02 15:08 ` Dario Faggioli
  2016-02-02 18:09   ` Tianyang Chen
  2016-02-03  3:33   ` Meng Xu
  0 siblings, 2 replies; 14+ messages in thread
From: Dario Faggioli @ 2016-02-02 15:08 UTC (permalink / raw)
  To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 16612 bytes --]

On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
> v4 is meant for discussion on the addition of replq.
> 
In which cases, you should always put something like RFC in the
subject, so people knows what the intent is even by just skimming their
inboxes.

> Changes since v3:
> 	removed running queue.
> 	added repl queue to keep track of repl events.
> 	timer is now per scheduler.
> 	timer is init on a valid cpu in a cpupool.
> 
> Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a
> pool and added to another, the lock equality assert in free_pdata()
> fails when Pool-0 is using rtds.
> 
Known issue. I will fix it for both Credit2 and RTDS, so just ignore
it.

> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 2e5430f..c36e5de 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
> @@ -16,6 +16,7 @@
>  #include <xen/delay.h>
>  #include <xen/event.h>
>  #include <xen/time.h>
> +#include <xen/timer.h>
>  #include <xen/perfc.h>
>  #include <xen/sched-if.h>
>  #include <xen/softirq.h>
> @@ -87,7 +88,7 @@
>  #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
>  
>  #define UPDATE_LIMIT_SHIFT      10
> -#define MAX_SCHEDULE            (MILLISECS(1))
> +
>
Unrelated to the topic. Can we have a dedicated patch that adds a
comment above UPDATE_LIMIT_SHIFT, explainig what it is and how it's
used.

This is something low priority, of course.

> @@ -160,6 +166,7 @@ struct rt_private {
>   */
>  struct rt_vcpu {
>      struct list_head q_elem;    /* on the runq/depletedq list */
> +    struct list_head p_elem;    /* on the repl event list */
>  
Mmm... 'p_elem' because the previous one was 'q_elem', and 'p' follows
'q', I guess. It feels a bit obscure, though, and I'd prefer a more
'talking' name.

For now, and at this level, I guess I can live with it. But in other
places, things need to be improved (see below).

> @@ -213,8 +220,14 @@ static inline struct list_head
> *rt_depletedq(const struct scheduler *ops)
>      return &rt_priv(ops)->depletedq;
>  }
>  
> +static inline struct list_head *rt_replq(const struct scheduler
> *ops)
> +{
> +    return &rt_priv(ops)->replq;
> +}
> +
>  /*
> - * Queue helper functions for runq and depletedq
> + * Queue helper functions for runq, depletedq
> + * and repl event q
>   */
>
So, in comments, can we use full words as much as possible. It makes
them a lot easier to read (so, e.g., "replenishment events queue" or
"queue of the replenishment events").

I know this very file's style is not like that, from this point of
view... and, in fact, this is something I would like to change, when we
get the chance (i.e., when touching the code for other reasons, like in
this case).

> @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
>      return list_entry(elem, struct rt_vcpu, q_elem);
>  }
>  
> +static struct rt_vcpu *
> +__p_elem(struct list_head *elem)
> +{
> +    return list_entry(elem, struct rt_vcpu, p_elem);
> +}
> +
So, while this may well still be fine...

> +static int
> +__vcpu_on_p(const struct rt_vcpu *svc)
> +{
> +   return !list_empty(&svc->p_elem);
> +}
> +
>
...here I really would like to go for something that will make it much
more obvious, just by giving a look at the code, where we are actually
checking the vcpu to be, without having to remember that p is the
replenishment queue.

So, I don't know, maybe something like vcpu_on_replq(), or
vcpu_needs_replenish() ?

> @@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc)
>          list_del_init(&svc->q_elem);
>  }
>  
> +static inline void
> +__p_remove(struct rt_vcpu *svc)
> +{
> +    if ( __vcpu_on_p(svc) )
> +        list_del_init(&svc->p_elem);
> +}
> +
replq_remove(), or vcpu_repl_cancel() (or something else) ?

> @@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops,
> struct rt_vcpu *svc)
>  }
>  
>  /*
> + * Insert svc into the repl even list:
> + * vcpus that needs to be repl earlier go first.
> + */
> +static void
> +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
>
This is exactly what I've been trying to say above: __replq_insert() is
ok, much better than __q_insert()! Do the same everywhere else. :-)

> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *replq = rt_replq(ops);
> +    struct list_head *iter;
> +
> +    ASSERT( spin_is_locked(&prv->lock) );
> +
Is this really useful? Considering where this is called, I don't think
it is.

> +    ASSERT( !__vcpu_on_p(svc) );
> +
> +    list_for_each(iter, replq)
> +    {
> +        struct rt_vcpu * iter_svc = __p_elem(iter);
> +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
> +                break;
> +    }
> +
> +    list_add_tail(&svc->p_elem, iter);
> +}
This looks ok. The list_for_each()+list_ad_tail() part would be
basically identical to what we have inside __runq_insert(), thgough,
wouldn't it?

It may make sense to define an _ordered_queue_insert() (or
_deadline_queue_insert(), since it's alwas the deadline that is
compared) utility function that we can then call in both cases.

> @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
>      INIT_LIST_HEAD(&prv->sdom);
>      INIT_LIST_HEAD(&prv->runq);
>      INIT_LIST_HEAD(&prv->depletedq);
> +    INIT_LIST_HEAD(&prv->replq);
>  
Is there a reason why you don't do at least the allocation of the timer
here? Because, to me, this looks the best place for that.

>      cpumask_clear(&prv->tickled);
>  
> @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
>          xfree(_cpumask_scratch);
>          _cpumask_scratch = NULL;
>      }
> +
> +    kill_timer(prv->repl_timer);
> +
>      xfree(prv);
>
And here, you're freeing prv, without having freed prv->timer, which is
therefore leaked.
 
> @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
> struct vcpu *vc)
>      vcpu_schedule_unlock_irq(lock, vc);
>  
>      SCHED_STAT_CRANK(vcpu_insert);
> +
> +    if( prv->repl_timer == NULL )
> +    {   
> +        /* vc->processor has been set in schedule.c */
> +        cpu = vc->processor;
> +
> +        repl_timer = xzalloc(struct timer);
> +
> +        prv->repl_timer = repl_timer;
> +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
> +    }
>  }
>  
This belong to rt_init, IMO.

> @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops, s_time_t
> now, bool_t tasklet_work_sched
>      /* burn_budget would return for IDLE VCPU */
>      burn_budget(ops, scurr, now);
>  
> -    __repl_update(ops, now);
>  
__repl_update() going away is of course fine. But we need to enact the
budget enforcement logic somewhere in here, don't we?

> @@ -924,6 +967,10 @@ rt_vcpu_sleep(const struct scheduler *ops,
> struct vcpu *vc)
>          __q_remove(svc);
>      else if ( svc->flags & RTDS_delayed_runq_add )
>          clear_bit(__RTDS_delayed_runq_add, &svc->flags);
> +
> +    /* stop tracking the repl time of this vcpu */
> +   /* if( __vcpu_on_p(svc) )
> +       __p_remove(svc); */
>  }
>  
Is it ok to kill the replenishment in this case?

This is a genuine question. What does the dynamic DS algorithm that
you're trying to implement here says about this? Meng, maybe you can
help here?

Is it ok to do this _because_ you then handle the situation of a
replenishment having to had happened while the vcpu was asleep in
rt_vcpu_wake (the '(now>=svc->cur_deadline)' thing)? Yes... it probably
is ok for that reason...

> @@ -1027,23 +1074,21 @@ rt_vcpu_wake(const struct scheduler *ops,
> struct vcpu *vc)
>      struct rt_vcpu * const svc = rt_vcpu(vc);
>      s_time_t now = NOW();
>      struct rt_private *prv = rt_priv(ops);
> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
> -    struct rt_dom *sdom = NULL;
> -    cpumask_t *online;
> +    struct timer *repl_timer = prv->repl_timer;
>  
>      BUG_ON( is_idle_vcpu(vc) );
>  
>      if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>      {
>          SCHED_STAT_CRANK(vcpu_wake_running);
> -        return;
> +        goto out;
>      }
>  
>      /* on RunQ/DepletedQ, just update info is ok */
>      if ( unlikely(__vcpu_on_q(svc)) )
>      {
>          SCHED_STAT_CRANK(vcpu_wake_onrunq);
> -        return;
> +        goto out;
>      }
>  
>      if ( likely(vcpu_runnable(vc)) )
>
Mmm.. no. At least the first two cases, they should really just be
'return'-s.

As you say yourself in the comment further down, right below the 'out:'
label "a newly waken-up vcpu could xxx". If this is running on a pCPU,
or it is in a runqueue already, it is not a newly woken-up vcpu.

In fact, we need to check for this cases, but let's at least made them
act like NOPs, as all other schedulers do and as it's correct.

> @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
> struct vcpu *vc)
>      if ( unlikely(svc->flags & RTDS_scheduled) )
>      {
>          set_bit(__RTDS_delayed_runq_add, &svc->flags);
> -        return;
> +        goto out;
>      }
>
In this case, I'm less sure. I think this should just return as well,
but it may depend on how other stuff are done (like budget
enforcement), so it's hard to think at it right now, but consider this
when working on next version.

> +    /* budget repl here is needed before inserting back to runq. If
> so,
> +     * it should be taken out of replq and put back. This can
> potentially
> +     * cause the timer handler to replenish no vcpu when it triggers
> if
> +     * the replenishment is done here already.
> +     */
>
Coding style for long comments wants them to have both "wings" (you're
missing the opening one, '/*').

>      if ( now >= svc->cur_deadline)
> +    {   
>          rt_update_deadline(now, svc);
> +        if( __vcpu_on_p(svc) )
> +            __p_remove(svc);
> +    }
>  
Well, then maybe the __p_remove() function can be made smart enough to:
 - check whether the one being removed is the first element of the
   queue;
 - if it is, disarm the timer
 - if there still are elements in the queue, rearm the timer to fire
at 
   the new first's deadline

This will be useful at least in another case (rt_vcpu_sleep()).

>      /* insert svc to runq/depletedq because svc is not in queue now
> */
>      __runq_insert(ops, svc);
>  
> -    __repl_update(ops, now);
> -
> -    ASSERT(!list_empty(&prv->sdom));
> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> -    online = cpupool_domain_cpumask(sdom->dom);
> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
> cpus */
> -
> -    runq_tickle(ops, snext);
> +    runq_tickle(ops, svc);
>  
> -    return;
> +out:
> +    /* a newly waken-up vcpu could have an earlier release time 
> +     * or it could be the first to program the timer
> +     */
> +    if( repl_timer->expires == 0 || repl_timer->expires > svc-
> >cur_deadline )
> +        set_timer(repl_timer,svc->cur_deadline);
> +
And again, can't this logic be part of __rqplq_insert()? I.e.:
 - do the insertion
 - if the inserted element is the new head of the queue, reprogram the 
   timer

> +    /* start tracking the repl time of this vcpu here 
> +    * it stays on the replq unless it goes to sleep
> +    * or marked as not-runnable
> +    */
> +    if( !__vcpu_on_p(svc) )
> +       __replq_insert(ops, svc);
>  }

> @@ -1168,6 +1216,80 @@ rt_dom_cntl(
>      return rc;
>  }
>  
> +/* The replenishment timer handler picks vcpus
> + * from the replq and does the actual replenishment
> + * the replq only keeps track of runnable vcpus
> + */
> +static void repl_handler(void *data){
> +    unsigned long flags;
> +    s_time_t now = NOW();
> +    s_time_t t_next = LONG_MAX; /* the next time timer should be
> triggered */
> +    struct scheduler *ops = data; 
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *replq = rt_replq(ops);
> +    struct timer *repl_timer = prv->repl_timer;
> +    struct list_head *iter, *tmp;
> +    struct rt_vcpu *svc = NULL;
> +
> +    stop_timer(repl_timer);
> +
> +    spin_lock_irqsave(&prv->lock, flags);
> + 
> +    list_for_each_safe(iter, tmp, replq)
> +    {
> +        svc = __p_elem(iter);
> +
> +        if ( now >= svc->cur_deadline )
> +        {
> +            rt_update_deadline(now, svc);
> +
> +            if( t_next > svc->cur_deadline )
> +                t_next = svc->cur_deadline;
> +            
Why do you need to do this? The next time at which the timer needs to
be reprogrammed is, after you'll have done all the replenishments that
needed to be done, here in this loop, the deadline of the new head of
the replenishment queue (if there still are elements in there).

> +            /* when the replenishment happens
> +             * svc is either on a pcpu or on
> +             * runq/depletedq
> +             */
> +            if( __vcpu_on_q(svc) )
> +            {
> +                /* put back to runq */
> +                __q_remove(svc);
> +                __runq_insert(ops, svc);
> +                runq_tickle(ops, svc);
> +            }
> +
Aha! So, it looks the budget enforcement logic ended up here? Mmm...
no, I think that doing it in rt_schedule(), as we agreed, would make
things simpler and better looking, both here and there.

This is the replenishment timer handler, and it really should do one
thins: handle replenishment events. That's it! ;-P

> +            /* resort replq, keep track of this
> +             * vcpu if it's still runnable. If 
> +             * at this point no vcpu are, then
> +             * the timer waits for wake() to
> +             * program it.
> +             */
> +            __p_remove(svc);
> +
> +            if( vcpu_runnable(svc->vcpu) )
> +                __replq_insert(ops, svc);
> +        }
> +
> +        else
> +        {
> +            /* Break out of the loop if a vcpu's
> +             * cur_deadline is bigger than now.
> +             * Also when some replenishment is done
> +             * in wake(), the code could end up here
> +             * if the time fires earlier than it should
> +            */
> +            if( t_next > svc->cur_deadline )
> +                t_next = svc->cur_deadline;
> +            break;
> +        }
> +    }
> +
> +    set_timer(repl_timer, t_next);
> +
Well, what if there are no more replenishments to be performed? You're
still reprogramming the timer, either at t_next or at cur_deadline,
which looks arbitrary.

If there are no replenishments, let's just keep the timer off.

So, all in all, this is a very very good first attempt at implementing
what we agreed upon in previous email... Good work, can't wait to see
v5! :-D

Let me know if there is anything unclear with any of the comments.

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-02 15:08 ` Dario Faggioli
@ 2016-02-02 18:09   ` Tianyang Chen
  2016-02-03 12:39     ` Dario Faggioli
  2016-02-03  3:33   ` Meng Xu
  1 sibling, 1 reply; 14+ messages in thread
From: Tianyang Chen @ 2016-02-02 18:09 UTC (permalink / raw)
  To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu



On 2/2/2016 10:08 AM, Dario Faggioli wrote:
> On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
>> v4 is meant for discussion on the addition of replq.
>>
> In which cases, you should always put something like RFC in the
> subject, so people knows what the intent is even by just skimming their
> inboxes.
>
Got it.
>> Changes since v3:
>> 	removed running queue.
>> 	added repl queue to keep track of repl events.
>> 	timer is now per scheduler.
>> 	timer is init on a valid cpu in a cpupool.
>>
>> Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a
>> pool and added to another, the lock equality assert in free_pdata()
>> fails when Pool-0 is using rtds.
>>
> Known issue. I will fix it for both Credit2 and RTDS, so just ignore
> it.
>

Yep. I was wondering if I should fix this before sending out v4.

>> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>> index 2e5430f..c36e5de 100644
>> --- a/xen/common/sched_rt.c
>> +++ b/xen/common/sched_rt.c
>> @@ -16,6 +16,7 @@
>>   #include <xen/delay.h>
>>   #include <xen/event.h>
>>   #include <xen/time.h>
>> +#include <xen/timer.h>
>>   #include <xen/perfc.h>
>>   #include <xen/sched-if.h>
>>   #include <xen/softirq.h>
>> @@ -87,7 +88,7 @@
>>   #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
>>
>>   #define UPDATE_LIMIT_SHIFT      10
>> -#define MAX_SCHEDULE            (MILLISECS(1))
>> +
>>
> Unrelated to the topic. Can we have a dedicated patch that adds a
> comment above UPDATE_LIMIT_SHIFT, explainig what it is and how it's
> used.
>
> This is something low priority, of course.
>

Sure.

>> @@ -160,6 +166,7 @@ struct rt_private {
>>    */
>>   struct rt_vcpu {
>>       struct list_head q_elem;    /* on the runq/depletedq list */
>> +    struct list_head p_elem;    /* on the repl event list */
>>
> Mmm... 'p_elem' because the previous one was 'q_elem', and 'p' follows
> 'q', I guess. It feels a bit obscure, though, and I'd prefer a more
> 'talking' name.
>
> For now, and at this level, I guess I can live with it. But in other
> places, things need to be improved (see below).
>
>> @@ -213,8 +220,14 @@ static inline struct list_head
>> *rt_depletedq(const struct scheduler *ops)
>>       return &rt_priv(ops)->depletedq;
>>   }
>>
>> +static inline struct list_head *rt_replq(const struct scheduler
>> *ops)
>> +{
>> +    return &rt_priv(ops)->replq;
>> +}
>> +
>>   /*
>> - * Queue helper functions for runq and depletedq
>> + * Queue helper functions for runq, depletedq
>> + * and repl event q
>>    */
>>
> So, in comments, can we use full words as much as possible. It makes
> them a lot easier to read (so, e.g., "replenishment events queue" or
> "queue of the replenishment events").
>
> I know this very file's style is not like that, from this point of
> view... and, in fact, this is something I would like to change, when we
> get the chance (i.e., when touching the code for other reasons, like in
> this case).
>
>> @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
>>       return list_entry(elem, struct rt_vcpu, q_elem);
>>   }
>>
>> +static struct rt_vcpu *
>> +__p_elem(struct list_head *elem)
>> +{
>> +    return list_entry(elem, struct rt_vcpu, p_elem);
>> +}
>> +
> So, while this may well still be fine...
>
>> +static int
>> +__vcpu_on_p(const struct rt_vcpu *svc)
>> +{
>> +   return !list_empty(&svc->p_elem);
>> +}
>> +
>>
> ...here I really would like to go for something that will make it much
> more obvious, just by giving a look at the code, where we are actually
> checking the vcpu to be, without having to remember that p is the
> replenishment queue.
>
> So, I don't know, maybe something like vcpu_on_replq(), or
> vcpu_needs_replenish() ?
>
>> @@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc)
>>           list_del_init(&svc->q_elem);
>>   }
>>
>> +static inline void
>> +__p_remove(struct rt_vcpu *svc)
>> +{
>> +    if ( __vcpu_on_p(svc) )
>> +        list_del_init(&svc->p_elem);
>> +}
>> +
> replq_remove(), or vcpu_repl_cancel() (or something else) ?
>
>> @@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops,
>> struct rt_vcpu *svc)
>>   }
>>
>>   /*
>> + * Insert svc into the repl even list:
>> + * vcpus that needs to be repl earlier go first.
>> + */
>> +static void
>> +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>> +{
>>
> This is exactly what I've been trying to say above: __replq_insert() is
> ok, much better than __q_insert()! Do the same everywhere else. :-)
>

Indeed, some of the names are confusing the first time I looked at RTDS.

>> +    struct rt_private *prv = rt_priv(ops);
>> +    struct list_head *replq = rt_replq(ops);
>> +    struct list_head *iter;
>> +
>> +    ASSERT( spin_is_locked(&prv->lock) );
>> +
> Is this really useful? Considering where this is called, I don't think
> it is.
>

This basically comes from __q_insert(). I have been searching for the 
definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq(). Do 
you know where they are defined? I did some name search in the entire 
xen directory but still nothing.

>> +    ASSERT( !__vcpu_on_p(svc) );
>> +
>> +    list_for_each(iter, replq)
>> +    {
>> +        struct rt_vcpu * iter_svc = __p_elem(iter);
>> +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
>> +                break;
>> +    }
>> +
>> +    list_add_tail(&svc->p_elem, iter);
>> +}
> This looks ok. The list_for_each()+list_ad_tail() part would be
> basically identical to what we have inside __runq_insert(), thgough,
> wouldn't it?
>
> It may make sense to define an _ordered_queue_insert() (or
> _deadline_queue_insert(), since it's alwas the deadline that is
> compared) utility function that we can then call in both cases.
>
>> @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
>>       INIT_LIST_HEAD(&prv->sdom);
>>       INIT_LIST_HEAD(&prv->runq);
>>       INIT_LIST_HEAD(&prv->depletedq);
>> +    INIT_LIST_HEAD(&prv->replq);
>>
> Is there a reason why you don't do at least the allocation of the timer
> here? Because, to me, this looks the best place for that.
>
>>       cpumask_clear(&prv->tickled);
>>
>> @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
>>           xfree(_cpumask_scratch);
>>           _cpumask_scratch = NULL;
>>       }
>> +
>> +    kill_timer(prv->repl_timer);
>> +
>>       xfree(prv);
>>
> And here, you're freeing prv, without having freed prv->timer, which is
> therefore leaked.
>
>> @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
>> struct vcpu *vc)
>>       vcpu_schedule_unlock_irq(lock, vc);
>>
>>       SCHED_STAT_CRANK(vcpu_insert);
>> +
>> +    if( prv->repl_timer == NULL )
>> +    {
>> +        /* vc->processor has been set in schedule.c */
>> +        cpu = vc->processor;
>> +
>> +        repl_timer = xzalloc(struct timer);
>> +
>> +        prv->repl_timer = repl_timer;
>> +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
>> +    }
>>   }
>>
> This belong to rt_init, IMO.
>

When a new pool is created, the corresponding scheduler is also 
initialized. But there is no pcpu assigned to that pool. If init_timer() 
happens in rt_init, how does it know which cpu to pick?
When move_domain() is called, there must be some pcpus in the 
destination pool and then vcpu_insert happens.

>> @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops, s_time_t
>> now, bool_t tasklet_work_sched
>>       /* burn_budget would return for IDLE VCPU */
>>       burn_budget(ops, scurr, now);
>>
>> -    __repl_update(ops, now);
>>
> __repl_update() going away is of course fine. But we need to enact the
> budget enforcement logic somewhere in here, don't we?
>

Sorry about that, I meant to add it in this version.

>> @@ -924,6 +967,10 @@ rt_vcpu_sleep(const struct scheduler *ops,
>> struct vcpu *vc)
>>           __q_remove(svc);
>>       else if ( svc->flags & RTDS_delayed_runq_add )
>>           clear_bit(__RTDS_delayed_runq_add, &svc->flags);
>> +
>> +    /* stop tracking the repl time of this vcpu */
>> +   /* if( __vcpu_on_p(svc) )
>> +       __p_remove(svc); */
>>   }
>>
> Is it ok to kill the replenishment in this case?
>
> This is a genuine question. What does the dynamic DS algorithm that
> you're trying to implement here says about this? Meng, maybe you can
> help here?
>
> Is it ok to do this _because_ you then handle the situation of a
> replenishment having to had happened while the vcpu was asleep in
> rt_vcpu_wake (the '(now>=svc->cur_deadline)' thing)? Yes... it probably
> is ok for that reason...
>

I had some discussions with Meng earlier and we haven't settled on this 
yet. But yes, sleeping vcpus will be replenished in wake().

>> @@ -1027,23 +1074,21 @@ rt_vcpu_wake(const struct scheduler *ops,
>> struct vcpu *vc)
>>       struct rt_vcpu * const svc = rt_vcpu(vc);
>>       s_time_t now = NOW();
>>       struct rt_private *prv = rt_priv(ops);
>> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
>> -    struct rt_dom *sdom = NULL;
>> -    cpumask_t *online;
>> +    struct timer *repl_timer = prv->repl_timer;
>>
>>       BUG_ON( is_idle_vcpu(vc) );
>>
>>       if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>>       {
>>           SCHED_STAT_CRANK(vcpu_wake_running);
>> -        return;
>> +        goto out;
>>       }
>>
>>       /* on RunQ/DepletedQ, just update info is ok */
>>       if ( unlikely(__vcpu_on_q(svc)) )
>>       {
>>           SCHED_STAT_CRANK(vcpu_wake_onrunq);
>> -        return;
>> +        goto out;
>>       }
>>
>>       if ( likely(vcpu_runnable(vc)) )
>>
> Mmm.. no. At least the first two cases, they should really just be
> 'return'-s.
>
> As you say yourself in the comment further down, right below the 'out:'
> label "a newly waken-up vcpu could xxx". If this is running on a pCPU,
> or it is in a runqueue already, it is not a newly woken-up vcpu.
>
> In fact, we need to check for this cases, but let's at least made them
> act like NOPs, as all other schedulers do and as it's correct.
>
>> @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
>> struct vcpu *vc)
>>       if ( unlikely(svc->flags & RTDS_scheduled) )
>>       {
>>           set_bit(__RTDS_delayed_runq_add, &svc->flags);
>> -        return;
>> +        goto out;
>>       }
>>
> In this case, I'm less sure. I think this should just return as well,
> but it may depend on how other stuff are done (like budget
> enforcement), so it's hard to think at it right now, but consider this
> when working on next version.
>

OK. For the second case, I guess there is nothing preventing a vcpu that 
sleeps for a long time on runq to be picked before other vcpus.
And also, repl_handler removes vcpus that are not runnable and if they 
are not added back here, where should we start tracking them? This goes 
back to if it's necessary to remove a un-runnable vcpu.

>> +    /* budget repl here is needed before inserting back to runq. If
>> so,
>> +     * it should be taken out of replq and put back. This can
>> potentially
>> +     * cause the timer handler to replenish no vcpu when it triggers
>> if
>> +     * the replenishment is done here already.
>> +     */
>>
> Coding style for long comments wants them to have both "wings" (you're
> missing the opening one, '/*').
>
>>       if ( now >= svc->cur_deadline)
>> +    {
>>           rt_update_deadline(now, svc);
>> +        if( __vcpu_on_p(svc) )
>> +            __p_remove(svc);
>> +    }
>>
> Well, then maybe the __p_remove() function can be made smart enough to:
>   - check whether the one being removed is the first element of the
>     queue;
>   - if it is, disarm the timer
>   - if there still are elements in the queue, rearm the timer to fire
> at
>     the new first's deadline
>
> This will be useful at least in another case (rt_vcpu_sleep()).
>
>>       /* insert svc to runq/depletedq because svc is not in queue now
>> */
>>       __runq_insert(ops, svc);
>>
>> -    __repl_update(ops, now);
>> -
>> -    ASSERT(!list_empty(&prv->sdom));
>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> -    online = cpupool_domain_cpumask(sdom->dom);
>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
>> cpus */
>> -
>> -    runq_tickle(ops, snext);
>> +    runq_tickle(ops, svc);
>>
>> -    return;
>> +out:
>> +    /* a newly waken-up vcpu could have an earlier release time
>> +     * or it could be the first to program the timer
>> +     */
>> +    if( repl_timer->expires == 0 || repl_timer->expires > svc-
>>> cur_deadline )
>> +        set_timer(repl_timer,svc->cur_deadline);
>> +
> And again, can't this logic be part of __rqplq_insert()? I.e.:
>   - do the insertion
>   - if the inserted element is the new head of the queue, reprogram the
>     timer
>

Yeah I agree that insert() and remove() can be handle some of that.

>> +    /* start tracking the repl time of this vcpu here
>> +    * it stays on the replq unless it goes to sleep
>> +    * or marked as not-runnable
>> +    */
>> +    if( !__vcpu_on_p(svc) )
>> +       __replq_insert(ops, svc);
>>   }
>
>> @@ -1168,6 +1216,80 @@ rt_dom_cntl(
>>       return rc;
>>   }
>>
>> +/* The replenishment timer handler picks vcpus
>> + * from the replq and does the actual replenishment
>> + * the replq only keeps track of runnable vcpus
>> + */
>> +static void repl_handler(void *data){
>> +    unsigned long flags;
>> +    s_time_t now = NOW();
>> +    s_time_t t_next = LONG_MAX; /* the next time timer should be
>> triggered */
>> +    struct scheduler *ops = data;
>> +    struct rt_private *prv = rt_priv(ops);
>> +    struct list_head *replq = rt_replq(ops);
>> +    struct timer *repl_timer = prv->repl_timer;
>> +    struct list_head *iter, *tmp;
>> +    struct rt_vcpu *svc = NULL;
>> +
>> +    stop_timer(repl_timer);
>> +
>> +    spin_lock_irqsave(&prv->lock, flags);
>> +
>> +    list_for_each_safe(iter, tmp, replq)
>> +    {
>> +        svc = __p_elem(iter);
>> +
>> +        if ( now >= svc->cur_deadline )
>> +        {
>> +            rt_update_deadline(now, svc);
>> +
>> +            if( t_next > svc->cur_deadline )
>> +                t_next = svc->cur_deadline;
>> +
> Why do you need to do this? The next time at which the timer needs to
> be reprogrammed is, after you'll have done all the replenishments that
> needed to be done, here in this loop, the deadline of the new head of
> the replenishment queue (if there still are elements in there).
>

Agree.

>> +            /* when the replenishment happens
>> +             * svc is either on a pcpu or on
>> +             * runq/depletedq
>> +             */
>> +            if( __vcpu_on_q(svc) )
>> +            {
>> +                /* put back to runq */
>> +                __q_remove(svc);
>> +                __runq_insert(ops, svc);
>> +                runq_tickle(ops, svc);
>> +            }
>> +
> Aha! So, it looks the budget enforcement logic ended up here? Mmm...
> no, I think that doing it in rt_schedule(), as we agreed, would make
> things simpler and better looking, both here and there.
>
> This is the replenishment timer handler, and it really should do one
> thins: handle replenishment events. That's it! ;-P
>

Oh I might be misunderstanding what should be put in rt_schedule(). Was 
it specifically for handling a vcpu that shouldn't be taken off a core?
Why are you referring this piece of code as budget enforcement?

Isn't it necessary to modify the runq here after budget replenishment 
has happened? If runq goes out of order here, will rt_schedule() be 
sorting it?

>> +            /* resort replq, keep track of this
>> +             * vcpu if it's still runnable. If
>> +             * at this point no vcpu are, then
>> +             * the timer waits for wake() to
>> +             * program it.
>> +             */
>> +            __p_remove(svc);
>> +
>> +            if( vcpu_runnable(svc->vcpu) )
>> +                __replq_insert(ops, svc);
>> +        }
>> +
>> +        else
>> +        {
>> +            /* Break out of the loop if a vcpu's
>> +             * cur_deadline is bigger than now.
>> +             * Also when some replenishment is done
>> +             * in wake(), the code could end up here
>> +             * if the time fires earlier than it should
>> +            */
>> +            if( t_next > svc->cur_deadline )
>> +                t_next = svc->cur_deadline;
>> +            break;
>> +        }
>> +    }
>> +
>> +    set_timer(repl_timer, t_next);
>> +
> Well, what if there are no more replenishments to be performed? You're
> still reprogramming the timer, either at t_next or at cur_deadline,
> which looks arbitrary.
>
> If there are no replenishments, let's just keep the timer off.
>

The timer fires earlier than it should when some replenishment is done 
in wake(). In this situation, no replenishment will be done and the 
timer will not be programmed until a vcpu wakes up. What if all vcpus 
wake up and get replenished before the timer fires, it replies on wake() 
solely to program it if it's done here. If none of the vcpus is going to 
sleep afterwards but they all have replenished their budget, no one will 
do the replenishment forever. That's why I set the default value of 
t_next to be the first svc on replq.

Thanks,

Tianyang

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-02 15:08 ` Dario Faggioli
  2016-02-02 18:09   ` Tianyang Chen
@ 2016-02-03  3:33   ` Meng Xu
  2016-02-03 12:30     ` Dario Faggioli
  1 sibling, 1 reply; 14+ messages in thread
From: Meng Xu @ 2016-02-03  3:33 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: xen-devel, Tianyang Chen, George Dunlap, Dagaen Golomb, Meng Xu

Hi Dario and Tianyang,

I will just comment on the parts Tianyang hasn't replied.

On Tue, Feb 2, 2016 at 10:08 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
>
> On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
> > v4 is meant for discussion on the addition of replq.
> >
> In which cases, you should always put something like RFC in the
> subject, so people knows what the intent is even by just skimming their
> inboxes.
>
> > Changes since v3:
> >       removed running queue.
> >       added repl queue to keep track of repl events.
> >       timer is now per scheduler.
> >       timer is init on a valid cpu in a cpupool.
> >
> > Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a
> > pool and added to another, the lock equality assert in free_pdata()
> > fails when Pool-0 is using rtds.
> >
> Known issue. I will fix it for both Credit2 and RTDS, so just ignore
> it.
>
> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 2e5430f..c36e5de 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
> > @@ -16,6 +16,7 @@
> >  #include <xen/delay.h>
> >  #include <xen/event.h>
> >  #include <xen/time.h>
> > +#include <xen/timer.h>
> >  #include <xen/perfc.h>
> >  #include <xen/sched-if.h>
> >  #include <xen/softirq.h>
> > @@ -87,7 +88,7 @@
> >  #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
> >
> >  #define UPDATE_LIMIT_SHIFT      10
> > -#define MAX_SCHEDULE            (MILLISECS(1))
> > +
> >
> Unrelated to the topic. Can we have a dedicated patch that adds a
> comment above UPDATE_LIMIT_SHIFT, explainig what it is and how it's
> used.
>
> This is something low priority, of course.
>
> > @@ -160,6 +166,7 @@ struct rt_private {
> >   */
> >  struct rt_vcpu {
> >      struct list_head q_elem;    /* on the runq/depletedq list */
> > +    struct list_head p_elem;    /* on the repl event list */
> >
> Mmm... 'p_elem' because the previous one was 'q_elem', and 'p' follows
> 'q', I guess. It feels a bit obscure, though, and I'd prefer a more
> 'talking' name.
>
> For now, and at this level, I guess I can live with it. But in other
> places, things need to be improved (see below).
>
> > @@ -213,8 +220,14 @@ static inline struct list_head
> > *rt_depletedq(const struct scheduler *ops)
> >      return &rt_priv(ops)->depletedq;
> >  }
> >
> > +static inline struct list_head *rt_replq(const struct scheduler
> > *ops)
> > +{
> > +    return &rt_priv(ops)->replq;
> > +}
> > +
> >  /*
> > - * Queue helper functions for runq and depletedq
> > + * Queue helper functions for runq, depletedq
> > + * and repl event q
> >   */
> >
> So, in comments, can we use full words as much as possible. It makes
> them a lot easier to read (so, e.g., "replenishment events queue" or
> "queue of the replenishment events").
>
> I know this very file's style is not like that, from this point of
> view... and, in fact, this is something I would like to change, when we
> get the chance (i.e., when touching the code for other reasons, like in
> this case).
>
> > @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem)
> >      return list_entry(elem, struct rt_vcpu, q_elem);
> >  }
> >
> > +static struct rt_vcpu *
> > +__p_elem(struct list_head *elem)
> > +{
> > +    return list_entry(elem, struct rt_vcpu, p_elem);
> > +}
> > +
> So, while this may well still be fine...
>
> > +static int
> > +__vcpu_on_p(const struct rt_vcpu *svc)
> > +{
> > +   return !list_empty(&svc->p_elem);
> > +}
> > +
> >
> ...here I really would like to go for something that will make it much
> more obvious, just by giving a look at the code, where we are actually
> checking the vcpu to be, without having to remember that p is the
> replenishment queue.
>
> So, I don't know, maybe something like vcpu_on_replq(), or
> vcpu_needs_replenish() ?
>
> > @@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc)
> >          list_del_init(&svc->q_elem);
> >  }
> >
> > +static inline void
> > +__p_remove(struct rt_vcpu *svc)
> > +{
> > +    if ( __vcpu_on_p(svc) )
> > +        list_del_init(&svc->p_elem);
> > +}
> > +
> replq_remove(), or vcpu_repl_cancel() (or something else) ?
>
> > @@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops,
> > struct rt_vcpu *svc)
> >  }
> >
> >  /*
> > + * Insert svc into the repl even list:
> > + * vcpus that needs to be repl earlier go first.
> > + */
> > +static void
> > +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> > +{
> >
> This is exactly what I've been trying to say above: __replq_insert() is
> ok, much better than __q_insert()! Do the same everywhere else. :-)
>
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *replq = rt_replq(ops);
> > +    struct list_head *iter;
> > +
> > +    ASSERT( spin_is_locked(&prv->lock) );
> > +
> Is this really useful? Considering where this is called, I don't think
> it is.
>
> > +    ASSERT( !__vcpu_on_p(svc) );
> > +
> > +    list_for_each(iter, replq)
> > +    {
> > +        struct rt_vcpu * iter_svc = __p_elem(iter);
> > +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
> > +                break;
> > +    }
> > +
> > +    list_add_tail(&svc->p_elem, iter);
> > +}
> This looks ok. The list_for_each()+list_ad_tail() part would be
> basically identical to what we have inside __runq_insert(), thgough,
> wouldn't it?
>
> It may make sense to define an _ordered_queue_insert() (or
> _deadline_queue_insert(), since it's alwas the deadline that is
> compared) utility function that we can then call in both cases.


I will prefer the _deadline_queue_insert() because it is based on the
EDF scheduling policy.


>
>
> > @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
> >      INIT_LIST_HEAD(&prv->sdom);
> >      INIT_LIST_HEAD(&prv->runq);
> >      INIT_LIST_HEAD(&prv->depletedq);
> > +    INIT_LIST_HEAD(&prv->replq);
> >
> Is there a reason why you don't do at least the allocation of the timer
> here? Because, to me, this looks the best place for that.
>
> >      cpumask_clear(&prv->tickled);
> >
> > @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
> >          xfree(_cpumask_scratch);
> >          _cpumask_scratch = NULL;
> >      }
> > +
> > +    kill_timer(prv->repl_timer);
> > +
> >      xfree(prv);
> >
> And here, you're freeing prv, without having freed prv->timer, which is
> therefore leaked.
>
> > @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
> > struct vcpu *vc)
> >      vcpu_schedule_unlock_irq(lock, vc);
> >
> >      SCHED_STAT_CRANK(vcpu_insert);
> > +
> > +    if( prv->repl_timer == NULL )
> > +    {
> > +        /* vc->processor has been set in schedule.c */
> > +        cpu = vc->processor;
> > +
> > +        repl_timer = xzalloc(struct timer);
> > +
> > +        prv->repl_timer = repl_timer;
> > +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
> > +    }
> >  }
> >
> This belong to rt_init, IMO.
>
> > @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops, s_time_t
> > now, bool_t tasklet_work_sched
> >      /* burn_budget would return for IDLE VCPU */
> >      burn_budget(ops, scurr, now);
> >
> > -    __repl_update(ops, now);
> >
> __repl_update() going away is of course fine. But we need to enact the
> budget enforcement logic somewhere in here, don't we?
>
> > @@ -924,6 +967,10 @@ rt_vcpu_sleep(const struct scheduler *ops,
> > struct vcpu *vc)
> >          __q_remove(svc);
> >      else if ( svc->flags & RTDS_delayed_runq_add )
> >          clear_bit(__RTDS_delayed_runq_add, &svc->flags);
> > +
> > +    /* stop tracking the repl time of this vcpu */
> > +   /* if( __vcpu_on_p(svc) )
> > +       __p_remove(svc); */
> >  }
> >
> Is it ok to kill the replenishment in this case?
>
> This is a genuine question. What does the dynamic DS algorithm that
> you're trying to implement here says about this? Meng, maybe you can
> help here?


Based on the DS algorithm (in theory), each VCPU should have its
budget replenished at the beginning of its next period.

However, we are thinking that when a VCPU is put to sleep, no one is
supposed to use it. Do we really need to keep updating the VCPU's
budget? Can we just update the VCPU's budget when it is waken up
later? This could potentially save some implementation overhead, IMHO.
That's why we decided not to update the budget of VCPUs that are put into sleep.


>
>
> Is it ok to do this _because_ you then handle the situation of a
> replenishment having to had happened while the vcpu was asleep in
> rt_vcpu_wake (the '(now>=svc->cur_deadline)' thing)? Yes... it probably
> is ok for that reason...


Yes, exactly. We hope this could reduce the frequency of invoking the
replenishment timer when the system is (kind of) idle.


>
>
> > @@ -1027,23 +1074,21 @@ rt_vcpu_wake(const struct scheduler *ops,
> > struct vcpu *vc)
> >      struct rt_vcpu * const svc = rt_vcpu(vc);
> >      s_time_t now = NOW();
> >      struct rt_private *prv = rt_priv(ops);
> > -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
> > -    struct rt_dom *sdom = NULL;
> > -    cpumask_t *online;
> > +    struct timer *repl_timer = prv->repl_timer;
> >
> >      BUG_ON( is_idle_vcpu(vc) );
> >
> >      if ( unlikely(curr_on_cpu(vc->processor) == vc) )
> >      {
> >          SCHED_STAT_CRANK(vcpu_wake_running);
> > -        return;
> > +        goto out;
> >      }
> >
> >      /* on RunQ/DepletedQ, just update info is ok */
> >      if ( unlikely(__vcpu_on_q(svc)) )
> >      {
> >          SCHED_STAT_CRANK(vcpu_wake_onrunq);
> > -        return;
> > +        goto out;
> >      }
> >
> >      if ( likely(vcpu_runnable(vc)) )
> >
> Mmm.. no. At least the first two cases, they should really just be
> 'return'-s.
>
> As you say yourself in the comment further down, right below the 'out:'
> label "a newly waken-up vcpu could xxx". If this is running on a pCPU,
> or it is in a runqueue already, it is not a newly woken-up vcpu.
>
> In fact, we need to check for this cases, but let's at least made them
> act like NOPs, as all other schedulers do and as it's correct.
>
> > @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
> > struct vcpu *vc)
> >      if ( unlikely(svc->flags & RTDS_scheduled) )
> >      {
> >          set_bit(__RTDS_delayed_runq_add, &svc->flags);
> > -        return;
> > +        goto out;
> >      }
> >
> In this case, I'm less sure. I think this should just return as well,
> but it may depend on how other stuff are done (like budget
> enforcement), so it's hard to think at it right now, but consider this
> when working on next version.
>
> > +    /* budget repl here is needed before inserting back to runq. If
> > so,
> > +     * it should be taken out of replq and put back. This can
> > potentially
> > +     * cause the timer handler to replenish no vcpu when it triggers
> > if
> > +     * the replenishment is done here already.
> > +     */
> >
> Coding style for long comments wants them to have both "wings" (you're
> missing the opening one, '/*').


BTW, each line should have less than 80 characters.


>
>
> >      if ( now >= svc->cur_deadline)
> > +    {
> >          rt_update_deadline(now, svc);
> > +        if( __vcpu_on_p(svc) )
> > +            __p_remove(svc);
> > +    }
> >
> Well, then maybe the __p_remove() function can be made smart enough to:
>  - check whether the one being removed is the first element of the
>    queue;
>  - if it is, disarm the timer
>  - if there still are elements in the queue, rearm the timer to fire
> at
>    the new first's deadline
>
> This will be useful at least in another case (rt_vcpu_sleep()).
>
> >      /* insert svc to runq/depletedq because svc is not in queue now
> > */
> >      __runq_insert(ops, svc);
> >
> > -    __repl_update(ops, now);
> > -
> > -    ASSERT(!list_empty(&prv->sdom));
> > -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> > -    online = cpupool_domain_cpumask(sdom->dom);
> > -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
> > cpus */
> > -
> > -    runq_tickle(ops, snext);
> > +    runq_tickle(ops, svc);
> >
> > -    return;
> > +out:
> > +    /* a newly waken-up vcpu could have an earlier release time
> > +     * or it could be the first to program the timer
> > +     */
> > +    if( repl_timer->expires == 0 || repl_timer->expires > svc-
> > >cur_deadline )
> > +        set_timer(repl_timer,svc->cur_deadline);
> > +
> And again, can't this logic be part of __rqplq_insert()? I.e.:
>  - do the insertion
>  - if the inserted element is the new head of the queue, reprogram the
>    timer
>
> > +    /* start tracking the repl time of this vcpu here
> > +    * it stays on the replq unless it goes to sleep
> > +    * or marked as not-runnable
> > +    */
> > +    if( !__vcpu_on_p(svc) )
> > +       __replq_insert(ops, svc);
> >  }
>
> > @@ -1168,6 +1216,80 @@ rt_dom_cntl(
> >      return rc;
> >  }
> >
> > +/* The replenishment timer handler picks vcpus
> > + * from the replq and does the actual replenishment
> > + * the replq only keeps track of runnable vcpus
> > + */
> > +static void repl_handler(void *data){
> > +    unsigned long flags;
> > +    s_time_t now = NOW();
> > +    s_time_t t_next = LONG_MAX; /* the next time timer should be
> > triggered */
> > +    struct scheduler *ops = data;
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *replq = rt_replq(ops);
> > +    struct timer *repl_timer = prv->repl_timer;
> > +    struct list_head *iter, *tmp;
> > +    struct rt_vcpu *svc = NULL;
> > +
> > +    stop_timer(repl_timer);
> > +
> > +    spin_lock_irqsave(&prv->lock, flags);
> > +
> > +    list_for_each_safe(iter, tmp, replq)
> > +    {
> > +        svc = __p_elem(iter);
> > +
> > +        if ( now >= svc->cur_deadline )
> > +        {
> > +            rt_update_deadline(now, svc);
> > +
> > +            if( t_next > svc->cur_deadline )
> > +                t_next = svc->cur_deadline;
> > +
> Why do you need to do this? The next time at which the timer needs to
> be reprogrammed is, after you'll have done all the replenishments that
> needed to be done, here in this loop, the deadline of the new head of
> the replenishment queue (if there still are elements in there).
>
> > +            /* when the replenishment happens
> > +             * svc is either on a pcpu or on
> > +             * runq/depletedq
> > +             */
> > +            if( __vcpu_on_q(svc) )
> > +            {
> > +                /* put back to runq */
> > +                __q_remove(svc);
> > +                __runq_insert(ops, svc);
> > +                runq_tickle(ops, svc);
> > +            }
> > +
> Aha! So, it looks the budget enforcement logic ended up here? Mmm...
> no, I think that doing it in rt_schedule(), as we agreed, would make
> things simpler and better looking, both here and there.
>
> This is the replenishment timer handler, and it really should do one
> thins: handle replenishment events. That's it! ;-P
>
> > +            /* resort replq, keep track of this
> > +             * vcpu if it's still runnable. If
> > +             * at this point no vcpu are, then
> > +             * the timer waits for wake() to
> > +             * program it.
> > +             */
> > +            __p_remove(svc);
> > +
> > +            if( vcpu_runnable(svc->vcpu) )
> > +                __replq_insert(ops, svc);
> > +        }
> > +
> > +        else
> > +        {
> > +            /* Break out of the loop if a vcpu's
> > +             * cur_deadline is bigger than now.
> > +             * Also when some replenishment is done
> > +             * in wake(), the code could end up here
> > +             * if the time fires earlier than it should
> > +            */
> > +            if( t_next > svc->cur_deadline )
> > +                t_next = svc->cur_deadline;
> > +            break;
> > +        }
> > +    }
> > +
> > +    set_timer(repl_timer, t_next);
> > +
> Well, what if there are no more replenishments to be performed? You're
> still reprogramming the timer, either at t_next or at cur_deadline,
> which looks arbitrary.
>
> If there are no replenishments, let's just keep the timer off.
>
> So, all in all, this is a very very good first attempt at implementing
> what we agreed upon in previous email... Good work, can't wait to see
> v5! :-D
>
> Let me know if there is anything unclear with any of the comments.


Thanks and Best Regards,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-03  3:33   ` Meng Xu
@ 2016-02-03 12:30     ` Dario Faggioli
  0 siblings, 0 replies; 14+ messages in thread
From: Dario Faggioli @ 2016-02-03 12:30 UTC (permalink / raw)
  To: Meng Xu; +Cc: xen-devel, Tianyang Chen, George Dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 2185 bytes --]

On Tue, 2016-02-02 at 22:33 -0500, Meng Xu wrote:
> On Tue, Feb 2, 2016 at 10:08 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > Is it ok to kill the replenishment in this case?
> > 
> > This is a genuine question. What does the dynamic DS algorithm that
> > you're trying to implement here says about this? Meng, maybe you
> > can
> > help here?
> 
> 
> Based on the DS algorithm (in theory), each VCPU should have its
> budget replenished at the beginning of its next period.
> 
> However, we are thinking that when a VCPU is put to sleep, no one is
> supposed to use it. Do we really need to keep updating the VCPU's
> budget? Can we just update the VCPU's budget when it is waken up
> later? This could potentially save some implementation overhead,
> IMHO.
> That's why we decided not to update the budget of VCPUs that are put
> into sleep.
> 
> > Is it ok to do this _because_ you then handle the situation of a
> > replenishment having to had happened while the vcpu was asleep in
> > rt_vcpu_wake (the '(now>=svc->cur_deadline)' thing)? Yes... it
> > probably
> > is ok for that reason...
> 
> 
> Yes, exactly. We hope this could reduce the frequency of invoking the
> replenishment timer when the system is (kind of) idle.
> 
I see what you mean. I wonder, however, given how big and complicated
this re-structuring we are doing is, whether it would not be easier to
just leave this optimization for the future, and just implement the
algorithm in the new event-driven fashion, as first step.

Early optimization is known to be a bad thing in software.

Note that I'm not saying that the optimization should happen in 6
months, it can be patch number 2 of the same series doing the event-
driven transformation... I'm just saying that we should probably have a
patch 1 which does only that, for ease of both  doing and reviewing.

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-02 18:09   ` Tianyang Chen
@ 2016-02-03 12:39     ` Dario Faggioli
  2016-02-04  2:23       ` Tianyang Chen
  2016-02-04  4:03       ` Meng Xu
  0 siblings, 2 replies; 14+ messages in thread
From: Dario Faggioli @ 2016-02-03 12:39 UTC (permalink / raw)
  To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 15822 bytes --]

On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote:
> On 2/2/2016 10:08 AM, Dario Faggioli wrote:
> > On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
> > > 
> > > +    struct rt_private *prv = rt_priv(ops);
> > > +    struct list_head *replq = rt_replq(ops);
> > > +    struct list_head *iter;
> > > +
> > > +    ASSERT( spin_is_locked(&prv->lock) );
> > > +
> > Is this really useful? Considering where this is called, I don't
> > think
> > it is.
> > 
> 
> This basically comes from __q_insert(). 
>
I see. Well it's still not necessary, IMO, so don't add it. At some
point, we may want to remove it from __runq_insert() too.

The bottom line is: prv->lock is, currently, the runqueue lock (it's
the lock for everything! :-D). It is pretty obvious that you need the
runq lock to manipulate the runqueue.

It is not the runqueue that you are manipulating here, it is the
replenishment queue, but as a matter of fact, prv->lock serializes that
too. So, if anything, it would be more useful to add a comment
somewhere, noting (reinforcing, I should probably say) the point that
also this new queue for replenishment is being serialized by prv->lock, 
than an assert like this one.

> I have been searching for the 
> definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq().
> Do 
> you know where they are defined? I did some name search in the
> entire 
> xen directory but still nothing.
> 
They are auto-generated. Look in xen/include/xen/sched-if.h.

> > > +    ASSERT( !__vcpu_on_p(svc) );
> > > +
> > > +    list_for_each(iter, replq)
> > > +    {
> > > +        struct rt_vcpu * iter_svc = __p_elem(iter);
> > > +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
> > > +                break;
> > > +    }
> > > +
> > > +    list_add_tail(&svc->p_elem, iter);
> > > +}
> > This looks ok. The list_for_each()+list_ad_tail() part would be
> > basically identical to what we have inside __runq_insert(),
> > thgough,
> > wouldn't it?
> > 
> > It may make sense to define an _ordered_queue_insert() (or
> > _deadline_queue_insert(), since it's alwas the deadline that is
> > compared) utility function that we can then call in both cases.
> > 
> > > @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
> > >       INIT_LIST_HEAD(&prv->sdom);
> > >       INIT_LIST_HEAD(&prv->runq);
> > >       INIT_LIST_HEAD(&prv->depletedq);
> > > +    INIT_LIST_HEAD(&prv->replq);
> > > 
> > Is there a reason why you don't do at least the allocation of the
> > timer
> > here? Because, to me, this looks the best place for that.
> > 
> > >       cpumask_clear(&prv->tickled);
> > > 
> > > @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
> > >           xfree(_cpumask_scratch);
> > >           _cpumask_scratch = NULL;
> > >       }
> > > +
> > > +    kill_timer(prv->repl_timer);
> > > +
> > >       xfree(prv);
> > > 
> > And here, you're freeing prv, without having freed prv->timer,
> > which is
> > therefore leaked.
> > 
> > > @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
> > > struct vcpu *vc)
> > >       vcpu_schedule_unlock_irq(lock, vc);
> > > 
> > >       SCHED_STAT_CRANK(vcpu_insert);
> > > +
> > > +    if( prv->repl_timer == NULL )
> > > +    {
> > > +        /* vc->processor has been set in schedule.c */
> > > +        cpu = vc->processor;
> > > +
> > > +        repl_timer = xzalloc(struct timer);
> > > +
> > > +        prv->repl_timer = repl_timer;
> > > +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
> > > +    }
> > >   }
> > > 
> > This belong to rt_init, IMO.
> > 
> 
> When a new pool is created, the corresponding scheduler is also 
> initialized. But there is no pcpu assigned to that pool. 
>
Sure, and in fact, above, when commenting on rt_init() I said "Is there
a reason why you don't do at least the allocation of the timer here?"

But you're right, here I put it like all should belong to rt_init(), I
should have been more clear.

> If init_timer() 
> happens in rt_init, how does it know which cpu to pick?
> When move_domain() is called, there must be some pcpus in the 
> destination pool and then vcpu_insert happens.
> 
Allocating memory for the rt_priv copy of this particular scheduler
instance, definitely belong in rt_init().

Then, in this case, we could do the allocation in rt_init() and do the
initialization later, perhaps here, or the first time that the timer
needs to be started, or when the first pCPU is assigned to this
scheduler (by being assigned to the cpupool this scheduler services).

I honestly prefer the latter. That is to say, in rt_alloc_pdata(), you
check whether prv->repl_timer is NULL and you allocate and initialize
it for the pCPU being brought up. I also would put an explicit 'prv-
>repl_timer=NULL', with a comment that proper allocation and
initialization will happen later, and the reason why that is the case,
in rt_init().

What do you think?

> > > @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops,
> > > s_time_t
> > > now, bool_t tasklet_work_sched
> > >       /* burn_budget would return for IDLE VCPU */
> > >       burn_budget(ops, scurr, now);
> > > 
> > > -    __repl_update(ops, now);
> > > 
> > __repl_update() going away is of course fine. But we need to enact
> > the
> > budget enforcement logic somewhere in here, don't we?
> > 
> 
> Sorry about that, I meant to add it in this version.
> 
More on this later. But then I'm curios. With this patch applied, it
you set b=400/p=1000 for a domain, was the 40% utilization being
enforced properly?

> > > @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
> > > struct vcpu *vc)
> > >       if ( unlikely(svc->flags & RTDS_scheduled) )
> > >       {
> > >           set_bit(__RTDS_delayed_runq_add, &svc->flags);
> > > -        return;
> > > +        goto out;
> > >       }
> > > 
> > In this case, I'm less sure. I think this should just return as
> > well,
> > but it may depend on how other stuff are done (like budget
> > enforcement), so it's hard to think at it right now, but consider
> > this
> > when working on next version.
> > 
> 
> OK. For the second case, I guess there is nothing preventing a vcpu
> that 
> sleeps for a long time on runq to be picked before other vcpus.
>
By "the second case" you mean this one about __RTDS_delayed_runq_add?
If yes, I'm not following what you mean with "a vcpu sleeps for a long
time on runq etc."

This is what we are talking about here. A vCPU is selected to be
preempted, during rt_schedule(), but it is still runnable, so we need
to put it back to the runqueue. You can't just do that (put it back in
the runqueue), because, in RTDS, like in Credit2, the runqueue is
shared between multiple pCPUs (see commit be9a0806e197f4fd3fa6).

So what do we do, we raise the *_delayed_runq_add flag and continue and
leave the vcpu alone. This happens, for instance, when there is a race
between scheduling out a vcpu that is going to sleep, and the vcpu in
question being woken back up already! It's not frequent, but we need to
take care of that. At some point (sooner rather tan later, most likely)
the context switch will actually happen, and the vcpu that is waking up
is put in the runqueue.

> And also, repl_handler removes vcpus that are not runnable and if
> they 
> are not added back here, where should we start tracking them? This
> goes 
> back to if it's necessary to remove a un-runnable vcpu.
> 
I also don't get what you mean by "tracking" (here and in other
places).

Anyway, what I meant when I said that what to do in the "delayed add"
case depends on other things is, for instance, this: the vcpu is waking
up right now, but it is going to be inserted in the runqueue later.
When (as per "where in the code", "in what function") do you want to
program the timer, here in rt_vcpu_wake(), or changing
__replq_insert(), as said somewhere else, or in rt_context_saved(),
where the vcpu is actually added to the runq? This kind of things...


All this being said, taking a bit of a step back, and considering, not
only the replenishment event and/or timer (re)programming issue, but
also the deadline that needs updating, I think you actually need to
find a way to check for both, in this patch.

In fact, if we aim at simplifying rt_context_saved() too --and we
certainly are-- in this case that a vcpu is actually being woken up,
but it's not being put back in the runqueue until context switch, when
it is that we check if it's deadline passed and it hence refreshed
scheduling parameters?


So, big recap, I think a possible variant of rt_vcpu_wake() would look
as follows:

 - if vcpu is running or in a q     ==> /* nothing. */

 - esle if vcpu is delayed_runq_add ==> if (now >= deadline?) {
                                          update_deadline();
                                          if ( __vcpu_on_p(svc) )
                                            _replq_remove();
                                        }
                                        _replq_insert();

 - else                             ==> if (now >= deadline?) {
                                          update_deadline();
                                          if ( __vcpu_on_p(svc) )
      
                                     _replq_remove();
                 
                      }
                                        _runq_insert();
                                        _replq_insert();
                                        runq_tickle();

This would mean something like this:

static void
rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
    struct rt_vcpu * const svc = rt_vcpu(vc);
    s_time_t now = NOW();
    struct rt_private *prv = rt_priv(ops);

    BUG_ON( is_idle_vcpu(vc) );

    if ( unlikely(curr_on_cpu(vc->processor) == vc) )
    {
        SCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }

    /* on RunQ/DepletedQ, just update info is ok */
    if ( unlikely(__vcpu_on_q(svc)) )
    {
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(vc)) )
        SCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);


    if ( now >= svc->cur_deadline)
    {
        rt_update_deadline(now, svc);
        __replq_remove(svc);
    }

    __replq_insert(svc);

    /* If context hasn't been saved for this vcpu yet, we can't put it on
     * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
     * put on the Runqueue/DepletedQ after the context has been saved.
     */
    if ( unlikely(svc->flags & RTDS_scheduled) )
    {
        set_bit(__RTDS_delayed_runq_add, &svc->flags);
        return;
    }

    /* insert svc to runq/depletedq because svc is not in queue now */
    __runq_insert(ops, svc);
    runq_tickle(ops, snext);

    return;
}

And, at this point, I'm starting thinking again that we want to call
runq_tickle() in rt_context_saved(), at least in case
__RTDS_delayed_runq_add was set as, if we don't, we're missing tickling
for the vcpu being inserted because of that.

Of course, all the above, taken with a little bit of salt... i.e., I do
not expect to have got all the details right, especially in the code,
but it should at least be effective in showing the idea.

Thoughts?


> > > +            /* when the replenishment happens
> > > +             * svc is either on a pcpu or on
> > > +             * runq/depletedq
> > > +             */
> > > +            if( __vcpu_on_q(svc) )
> > > +            {
> > > +                /* put back to runq */
> > > +                __q_remove(svc);
> > > +                __runq_insert(ops, svc);
> > > +                runq_tickle(ops, svc);
> > > +            }
> > > +
> > Aha! So, it looks the budget enforcement logic ended up here?
> > Mmm...
> > no, I think that doing it in rt_schedule(), as we agreed, would
> > make
> > things simpler and better looking, both here and there.
> > 
> > This is the replenishment timer handler, and it really should do
> > one
> > thins: handle replenishment events. That's it! ;-P
> > 
> 
> Oh I might be misunderstanding what should be put in rt_schedule().
> Was 
> it specifically for handling a vcpu that shouldn't be taken off a
> core?
> Why are you referring this piece of code as budget enforcement?
> 
> Isn't it necessary to modify the runq here after budget
> replenishment 
> has happened? If runq goes out of order here, will rt_schedule() be 
> sorting it?
> 
Ops, yes, you are right, this is not budget enforcement. So, budget
enforcement looks still to be missing from this patch (and that is why
I asked whether it is actually working).

And this chunk of code (and perhaps this whole function) is probably
ok, it's just a bit hard to understand what it does...


So, do you mind if we take a step back again, and analyze the
situation. When the timer fires, and this handler is executed and a
replenishment event taken off the replenishment queue, the following
situations are possible:

 1) the replenishment is for a running vcpu
 2) the replenishment is for a runnable vcpu in the runq
 3) the replenishment is for a runnable vcpu in the depletedq
 4) the replenishment is for a sleeping vcpu

So, 4) is never going to happen, right?, as we're stopping the timer
when a vcpu blocks. If we go for this, let's put an ASSERT() and a
comment as a guard for that. If we do the optimization in this very
patch. If we leave it for another patch, we need to handle this case, I
guess.

In all 1), 2) and 3) cases, we need to remove the replenishment event
from the replenishment queue, so just (in the code) do it upfront.

What other actions need to happen in each case, 1), 2) and 3)? Can you
summarize then in here, so we can decide how to handle each situation?

I'd say that, in both case 2) and case 3), the vcpu needs to be taken
out of the queue where they are, and (re-)inserted in the runqueue, and
then we need to tickle... Is that correct?

I'd also say that, in case 1), we also need to tickle because the
deadline may now be have become bigger than some other vcpu that is in
the runqueue?

Also, under what conditions we need, as soon as replenishment for vcpu
X happened, to queue another replenishment for the same vcpu, at its
next deadline? Always, I would say (unless the vcpu is sleeping,
because of the optimization you introduced)?

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-03 12:39     ` Dario Faggioli
@ 2016-02-04  2:23       ` Tianyang Chen
  2016-02-05 14:39         ` Dario Faggioli
  2016-02-04  4:03       ` Meng Xu
  1 sibling, 1 reply; 14+ messages in thread
From: Tianyang Chen @ 2016-02-04  2:23 UTC (permalink / raw)
  To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu



On 2/3/2016 7:39 AM, Dario Faggioli wrote:
> On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote:
>> On 2/2/2016 10:08 AM, Dario Faggioli wrote:
>>> On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
>>>>
>>>> +    struct rt_private *prv = rt_priv(ops);
>>>> +    struct list_head *replq = rt_replq(ops);
>>>> +    struct list_head *iter;
>>>> +
>>>> +    ASSERT( spin_is_locked(&prv->lock) );
>>>> +
>>> Is this really useful? Considering where this is called, I don't
>>> think
>>> it is.
>>>
>>
>> This basically comes from __q_insert().
>>
> I see. Well it's still not necessary, IMO, so don't add it. At some
> point, we may want to remove it from __runq_insert() too.
>
> The bottom line is: prv->lock is, currently, the runqueue lock (it's
> the lock for everything! :-D). It is pretty obvious that you need the
> runq lock to manipulate the runqueue.
>
> It is not the runqueue that you are manipulating here, it is the
> replenishment queue, but as a matter of fact, prv->lock serializes that
> too. So, if anything, it would be more useful to add a comment
> somewhere, noting (reinforcing, I should probably say) the point that
> also this new queue for replenishment is being serialized by prv->lock,
> than an assert like this one.
>

Sure

>> I have been searching for the
>> definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq().
>> Do
>> you know where they are defined? I did some name search in the
>> entire
>> xen directory but still nothing.
>>
> They are auto-generated. Look in xen/include/xen/sched-if.h.
>
>>>> +    ASSERT( !__vcpu_on_p(svc) );
>>>> +
>>>> +    list_for_each(iter, replq)
>>>> +    {
>>>> +        struct rt_vcpu * iter_svc = __p_elem(iter);
>>>> +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
>>>> +                break;
>>>> +    }
>>>> +
>>>> +    list_add_tail(&svc->p_elem, iter);
>>>> +}
>>> This looks ok. The list_for_each()+list_ad_tail() part would be
>>> basically identical to what we have inside __runq_insert(),
>>> thgough,
>>> wouldn't it?
>>>
>>> It may make sense to define an _ordered_queue_insert() (or
>>> _deadline_queue_insert(), since it's alwas the deadline that is
>>> compared) utility function that we can then call in both cases.
>>>
>>>> @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
>>>>        INIT_LIST_HEAD(&prv->sdom);
>>>>        INIT_LIST_HEAD(&prv->runq);
>>>>        INIT_LIST_HEAD(&prv->depletedq);
>>>> +    INIT_LIST_HEAD(&prv->replq);
>>>>
>>> Is there a reason why you don't do at least the allocation of the
>>> timer
>>> here? Because, to me, this looks the best place for that.
>>>
>>>>        cpumask_clear(&prv->tickled);
>>>>
>>>> @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
>>>>            xfree(_cpumask_scratch);
>>>>            _cpumask_scratch = NULL;
>>>>        }
>>>> +
>>>> +    kill_timer(prv->repl_timer);
>>>> +
>>>>        xfree(prv);
>>>>
>>> And here, you're freeing prv, without having freed prv->timer,
>>> which is
>>> therefore leaked.
>>>
>>>> @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
>>>> struct vcpu *vc)
>>>>        vcpu_schedule_unlock_irq(lock, vc);
>>>>
>>>>        SCHED_STAT_CRANK(vcpu_insert);
>>>> +
>>>> +    if( prv->repl_timer == NULL )
>>>> +    {
>>>> +        /* vc->processor has been set in schedule.c */
>>>> +        cpu = vc->processor;
>>>> +
>>>> +        repl_timer = xzalloc(struct timer);
>>>> +
>>>> +        prv->repl_timer = repl_timer;
>>>> +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
>>>> +    }
>>>>    }
>>>>
>>> This belong to rt_init, IMO.
>>>
>>
>> When a new pool is created, the corresponding scheduler is also
>> initialized. But there is no pcpu assigned to that pool.
>>
> Sure, and in fact, above, when commenting on rt_init() I said "Is there
> a reason why you don't do at least the allocation of the timer here?"
>
> But you're right, here I put it like all should belong to rt_init(), I
> should have been more clear.
>
>> If init_timer()
>> happens in rt_init, how does it know which cpu to pick?
>> When move_domain() is called, there must be some pcpus in the
>> destination pool and then vcpu_insert happens.
>>
> Allocating memory for the rt_priv copy of this particular scheduler
> instance, definitely belong in rt_init().
>
> Then, in this case, we could do the allocation in rt_init() and do the
> initialization later, perhaps here, or the first time that the timer
> needs to be started, or when the first pCPU is assigned to this
> scheduler (by being assigned to the cpupool this scheduler services).
>
> I honestly prefer the latter. That is to say, in rt_alloc_pdata(), you
> check whether prv->repl_timer is NULL and you allocate and initialize
> it for the pCPU being brought up. I also would put an explicit 'prv-
>> repl_timer=NULL', with a comment that proper allocation and
> initialization will happen later, and the reason why that is the case,
> in rt_init().
>
> What do you think?
>

Yeah it makes sense to put some comments in rt_init to remind readers 
that the timer will be taken care of later in alloc_pdata().

>>>> @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops,
>>>> s_time_t
>>>> now, bool_t tasklet_work_sched
>>>>        /* burn_budget would return for IDLE VCPU */
>>>>        burn_budget(ops, scurr, now);
>>>>
>>>> -    __repl_update(ops, now);
>>>>
>>> __repl_update() going away is of course fine. But we need to enact
>>> the
>>> budget enforcement logic somewhere in here, don't we?
>>>
>>
>> Sorry about that, I meant to add it in this version.
>>
> More on this later. But then I'm curios. With this patch applied, it
> you set b=400/p=1000 for a domain, was the 40% utilization being
> enforced properly?
>
>>>> @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
>>>> struct vcpu *vc)
>>>>        if ( unlikely(svc->flags & RTDS_scheduled) )
>>>>        {
>>>>            set_bit(__RTDS_delayed_runq_add, &svc->flags);
>>>> -        return;
>>>> +        goto out;
>>>>        }
>>>>
>>> In this case, I'm less sure. I think this should just return as
>>> well,
>>> but it may depend on how other stuff are done (like budget
>>> enforcement), so it's hard to think at it right now, but consider
>>> this
>>> when working on next version.
>>>
>>
>> OK. For the second case, I guess there is nothing preventing a vcpu
>> that
>> sleeps for a long time on runq to be picked before other vcpus.
>>
> By "the second case" you mean this one about __RTDS_delayed_runq_add?
> If yes, I'm not following what you mean with "a vcpu sleeps for a long
> time on runq etc."
>
> This is what we are talking about here. A vCPU is selected to be
> preempted, during rt_schedule(), but it is still runnable, so we need
> to put it back to the runqueue. You can't just do that (put it back in
> the runqueue), because, in RTDS, like in Credit2, the runqueue is
> shared between multiple pCPUs (see commit be9a0806e197f4fd3fa6).
>
> So what do we do, we raise the *_delayed_runq_add flag and continue and
> leave the vcpu alone. This happens, for instance, when there is a race
> between scheduling out a vcpu that is going to sleep, and the vcpu in
> question being woken back up already! It's not frequent, but we need to
> take care of that. At some point (sooner rather tan later, most likely)
> the context switch will actually happen, and the vcpu that is waking up
> is put in the runqueue.
>

Yeah I agree with this situation. But I meant to say when a vcpu is 
waking up while it's still on a queue. See below.

>> And also, repl_handler removes vcpus that are not runnable and if
>> they
>> are not added back here, where should we start tracking them? This
>> goes
>> back to if it's necessary to remove a un-runnable vcpu.
>>
> I also don't get what you mean by "tracking" (here and in other
> places).
>

So "tracking" basically means adding a vcpu to the replenishment queue. 
"Stop tracking" means removing a vcpu from the replenishment queue.

> Anyway, what I meant when I said that what to do in the "delayed add"
> case depends on other things is, for instance, this: the vcpu is waking
> up right now, but it is going to be inserted in the runqueue later.
> When (as per "where in the code", "in what function") do you want to
> program the timer, here in rt_vcpu_wake(), or changing
> __replq_insert(), as said somewhere else, or in rt_context_saved(),
> where the vcpu is actually added to the runq? This kind of things...
>
>
> All this being said, taking a bit of a step back, and considering, not
> only the replenishment event and/or timer (re)programming issue, but
> also the deadline that needs updating, I think you actually need to
> find a way to check for both, in this patch.
>

> In fact, if we aim at simplifying rt_context_saved() too --and we
> certainly are-- in this case that a vcpu is actually being woken up,
> but it's not being put back in the runqueue until context switch, when
> it is that we check if it's deadline passed and it hence refreshed
> scheduling parameters?
>
>
> So, big recap, I think a possible variant of rt_vcpu_wake() would look
> as follows:
>
>   - if vcpu is running or in a q     ==> /* nothing. */
>
>   - esle if vcpu is delayed_runq_add ==> if (now >= deadline?) {
>                                            update_deadline();
>                                            if ( __vcpu_on_p(svc) )
>                                              _replq_remove();
>                                          }
>                                          _replq_insert();
>
>   - else                             ==> if (now >= deadline?) {
>                                            update_deadline();
>                                            if ( __vcpu_on_p(svc) )
>
>                                       _replq_remove();
>
>                        }
>                                          _runq_insert();
>                                          _replq_insert();
>                                          runq_tickle();
>
> This would mean something like this:
>
> static void
> rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> {
>      struct rt_vcpu * const svc = rt_vcpu(vc);
>      s_time_t now = NOW();
>      struct rt_private *prv = rt_priv(ops);
>
>      BUG_ON( is_idle_vcpu(vc) );
>
>      if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>      {
>          SCHED_STAT_CRANK(vcpu_wake_running);
>          return;
>      }
>

For the this situation, a vcpu is waking up on a pcpu. It could be taken 
off from the replq inside the timer handler at the end. So, if we decide 
to not replenish any sleeping/un-runnable vcpus any further, we should 
probably add it back to replq here right?

>      /* on RunQ/DepletedQ, just update info is ok */
>      if ( unlikely(__vcpu_on_q(svc)) )
>      {
>          SCHED_STAT_CRANK(vcpu_wake_onrunq);
>          return;
>      }
>

For this situation, a vcpu could be going through the following phases:

on runq --> sleep --> (for a long time) --> wake up

When it wakes up, it could stay in the front of the runq because 
cur_deadline hasn't been updated. It will, inherently have some high 
priority-ness (because of EDF) and be picked over other vcpus right? Do 
we want to update it's deadline and budget here and re-insert it 
accordingly?

>      if ( likely(vcpu_runnable(vc)) )
>          SCHED_STAT_CRANK(vcpu_wake_runnable);
>      else
>          SCHED_STAT_CRANK(vcpu_wake_not_runnable);
>
>
>      if ( now >= svc->cur_deadline)
>      {
>          rt_update_deadline(now, svc);
>          __replq_remove(svc);
>      }
>
>      __replq_insert(svc);
>

For here, consider this scenario:

on pcpu (originally on replq) --> sleep --> context_saved() but still 
asleep --> wake()

Timer handler isn't triggered before it awakes. Then this vcpu will be 
added to replq again while it's already there. I'm not 100% positive if 
this situation is possible though. But judging from rt_context_saved(), 
it checks if a vcpu is runnable before adding back to the runq.

>      /* If context hasn't been saved for this vcpu yet, we can't put it on
>       * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
>       * put on the Runqueue/DepletedQ after the context has been saved.
>       */
>      if ( unlikely(svc->flags & RTDS_scheduled) )
>      {
>          set_bit(__RTDS_delayed_runq_add, &svc->flags);
>          return;
>      }
>

I agree with this. Since replenishment happens above already, it can be 
inserted back to runq correctly in rt_context_saved().

>      /* insert svc to runq/depletedq because svc is not in queue now */
>      __runq_insert(ops, svc);
>      runq_tickle(ops, snext);
>
>      return;
> }
>
> And, at this point, I'm starting thinking again that we want to call
> runq_tickle() in rt_context_saved(), at least in case
> __RTDS_delayed_runq_add was set as, if we don't, we're missing tickling
> for the vcpu being inserted because of that.
>
> Of course, all the above, taken with a little bit of salt... i.e., I do
> not expect to have got all the details right, especially in the code,
> but it should at least be effective in showing the idea.
>
> Thoughts?
>
>

So just to wrap up my thoughts for budget replenishment inside of 
wake(), a vcpu can be in different states(on pcpu, on queue etc.) when 
it wakes up.


1)wake up on a pcpu:
Do nothing as it may have some remaining budget and we just ignore it's 
cur_deadline when it wakes up. I can't find a corresponding pattern in 
DS but this can be treated like a server being pushed back because of 
the nap.

2)wake up on a queue:
Update the deadline(refill budget) of it and re-insert it into runq for 
the reason stated above.

3)wake up in the middle of context switching:
Update the deadline(refill budget) of it so it can be re-inserted into 
runq in rt_context_saved().

4)wake up on nowhere ( sleep on pcpu--> context_saved() --> wake()):
Replenish it and re-insert it back to runq.

5)other? I can't think of any for now.

As for replenishment queue manipulation, we should always try and add it 
back to replq because we don't know if the timer handler has taken it 
off or not.

The timer handler and wake() is coupled in a sense that they both 
manipulate replq.


>>>> +            /* when the replenishment happens
>>>> +             * svc is either on a pcpu or on
>>>> +             * runq/depletedq
>>>> +             */
>>>> +            if( __vcpu_on_q(svc) )
>>>> +            {
>>>> +                /* put back to runq */
>>>> +                __q_remove(svc);
>>>> +                __runq_insert(ops, svc);
>>>> +                runq_tickle(ops, svc);
>>>> +            }
>>>> +
>>> Aha! So, it looks the budget enforcement logic ended up here?
>>> Mmm...
>>> no, I think that doing it in rt_schedule(), as we agreed, would
>>> make
>>> things simpler and better looking, both here and there.
>>>
>>> This is the replenishment timer handler, and it really should do
>>> one
>>> thins: handle replenishment events. That's it! ;-P
>>>
>>
>> Oh I might be misunderstanding what should be put in rt_schedule().
>> Was
>> it specifically for handling a vcpu that shouldn't be taken off a
>> core?
>> Why are you referring this piece of code as budget enforcement?
>>
>> Isn't it necessary to modify the runq here after budget
>> replenishment
>> has happened? If runq goes out of order here, will rt_schedule() be
>> sorting it?
>>
> Ops, yes, you are right, this is not budget enforcement. So, budget
> enforcement looks still to be missing from this patch (and that is why
> I asked whether it is actually working).
>
> And this chunk of code (and perhaps this whole function) is probably
> ok, it's just a bit hard to understand what it does...
>

So, the burn_budget() is still there right? I'm not sure what you are 
referring to as budget enforcement. Correct me if I'm wrong, budget 
enforcement means that each vcpu uses its assigned/remaining budget. Is 
there any specific situation I'm missing?

>
> So, do you mind if we take a step back again, and analyze the
> situation. When the timer fires, and this handler is executed and a
> replenishment event taken off the replenishment queue, the following
> situations are possible:
>
>   1) the replenishment is for a running vcpu
>   2) the replenishment is for a runnable vcpu in the runq
>   3) the replenishment is for a runnable vcpu in the depletedq
>   4) the replenishment is for a sleeping vcpu
>
> So, 4) is never going to happen, right?, as we're stopping the timer
> when a vcpu blocks. If we go for this, let's put an ASSERT() and a
> comment as a guard for that. If we do the optimization in this very
> patch. If we leave it for another patch, we need to handle this case, I
> guess.
>
> In all 1), 2) and 3) cases, we need to remove the replenishment event
> from the replenishment queue, so just (in the code) do it upfront.
>

I don't think we need to take them of the replenishment queue? See the 
last couple paragraphs.

> What other actions need to happen in each case, 1), 2) and 3)? Can you
> summarize then in here, so we can decide how to handle each situation?
>

Here we go.
1) the replenishment is for a running vcpu
Replenish the vcpu. Keep it on the replenishment queue. Tickle it as it 
may have a later deadline.

2) the replenishment is for a runnable vcpu in the runq
Replenish the vcpu. Keep it on the replenishment queue. (re-)insert it 
into runq. Tickle it.

3) the replenishment is for a runnable vcpu in the depletedq
Same as 2)

4) the replenishment is for a sleeping vcpu
Take it off from the replenishment queue and don't do replenishment. As 
a matter of fact, this patch does the replenishment first and then take 
it off.

> I'd say that, in both case 2) and case 3), the vcpu needs to be taken
> out of the queue where they are, and (re-)inserted in the runqueue, and
> then we need to tickle... Is that correct?
>

correct^.

> I'd also say that, in case 1), we also need to tickle because the
> deadline may now be have become bigger than some other vcpu that is in
> the runqueue?
>

correct. I missed this case^.

> Also, under what conditions we need, as soon as replenishment for vcpu
> X happened, to queue another replenishment for the same vcpu, at its
> next deadline? Always, I would say (unless the vcpu is sleeping,
> because of the optimization you introduced)?

So this is kinda confusing. If in case 1) 2) and 3) they are never taken 
off from the replenishment queue, a series of replenishment events are 
automatically updated after the replenishment. There is no need, as 
least one less reason, to keep track of when to queue the next vcpus for 
replenishment.

The side effect of this is that all vcpus could be going to sleep right 
after the timer was programmed at the end of the handler. If they don't 
wake up before the timer fires next time, the handler is called to 
replenish nobody. If that's the case, timer will not be programmed again 
in the handler. Wake() kicks in and programs it instead. And the 
downside of this is that we always need to check if an awaking vcpu is 
on replq or not. It not add it back(hence the notion of starting to 
track it's replenishment events again). When adding back, check if there 
is a need to reprogram the timer.

I really like the idea of replenishment queue compared to running queue 
in that the places that we need to manipulate the queue are only in 
wake() and repl_handler().

Thanks,
Tianyang

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-03 12:39     ` Dario Faggioli
  2016-02-04  2:23       ` Tianyang Chen
@ 2016-02-04  4:03       ` Meng Xu
  2016-02-05 14:45         ` Dario Faggioli
  1 sibling, 1 reply; 14+ messages in thread
From: Meng Xu @ 2016-02-04  4:03 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: xen-devel, Tianyang Chen, George Dunlap, Dagaen Golomb, Meng Xu

On Wed, Feb 3, 2016 at 7:39 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
>
> On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote:
> > On 2/2/2016 10:08 AM, Dario Faggioli wrote:
> > > On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote:
> > > >
> > > > +    struct rt_private *prv = rt_priv(ops);
> > > > +    struct list_head *replq = rt_replq(ops);
> > > > +    struct list_head *iter;
> > > > +
> > > > +    ASSERT( spin_is_locked(&prv->lock) );
> > > > +
> > > Is this really useful? Considering where this is called, I don't
> > > think
> > > it is.
> > >
> >
> > This basically comes from __q_insert().
> >
> I see. Well it's still not necessary, IMO, so don't add it. At some
> point, we may want to remove it from __runq_insert() too.
>
> The bottom line is: prv->lock is, currently, the runqueue lock (it's
> the lock for everything! :-D). It is pretty obvious that you need the
> runq lock to manipulate the runqueue.
>
> It is not the runqueue that you are manipulating here, it is the
> replenishment queue, but as a matter of fact, prv->lock serializes that
> too. So, if anything, it would be more useful to add a comment
> somewhere, noting (reinforcing, I should probably say) the point that
> also this new queue for replenishment is being serialized by prv->lock,
> than an assert like this one.
>
> > I have been searching for the
> > definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq().
> > Do
> > you know where they are defined? I did some name search in the
> > entire
> > xen directory but still nothing.
> >
> They are auto-generated. Look in xen/include/xen/sched-if.h.
>
> > > > +    ASSERT( !__vcpu_on_p(svc) );
> > > > +
> > > > +    list_for_each(iter, replq)
> > > > +    {
> > > > +        struct rt_vcpu * iter_svc = __p_elem(iter);
> > > > +        if ( svc->cur_deadline <= iter_svc->cur_deadline )
> > > > +                break;
> > > > +    }
> > > > +
> > > > +    list_add_tail(&svc->p_elem, iter);
> > > > +}
> > > This looks ok. The list_for_each()+list_ad_tail() part would be
> > > basically identical to what we have inside __runq_insert(),
> > > thgough,
> > > wouldn't it?
> > >
> > > It may make sense to define an _ordered_queue_insert() (or
> > > _deadline_queue_insert(), since it's alwas the deadline that is
> > > compared) utility function that we can then call in both cases.
> > >
> > > > @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops)
> > > >       INIT_LIST_HEAD(&prv->sdom);
> > > >       INIT_LIST_HEAD(&prv->runq);
> > > >       INIT_LIST_HEAD(&prv->depletedq);
> > > > +    INIT_LIST_HEAD(&prv->replq);
> > > >
> > > Is there a reason why you don't do at least the allocation of the
> > > timer
> > > here? Because, to me, this looks the best place for that.
> > >
> > > >       cpumask_clear(&prv->tickled);
> > > >
> > > > @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops)
> > > >           xfree(_cpumask_scratch);
> > > >           _cpumask_scratch = NULL;
> > > >       }
> > > > +
> > > > +    kill_timer(prv->repl_timer);
> > > > +
> > > >       xfree(prv);
> > > >
> > > And here, you're freeing prv, without having freed prv->timer,
> > > which is
> > > therefore leaked.
> > >
> > > > @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops,
> > > > struct vcpu *vc)
> > > >       vcpu_schedule_unlock_irq(lock, vc);
> > > >
> > > >       SCHED_STAT_CRANK(vcpu_insert);
> > > > +
> > > > +    if( prv->repl_timer == NULL )
> > > > +    {
> > > > +        /* vc->processor has been set in schedule.c */
> > > > +        cpu = vc->processor;
> > > > +
> > > > +        repl_timer = xzalloc(struct timer);
> > > > +
> > > > +        prv->repl_timer = repl_timer;
> > > > +        init_timer(repl_timer, repl_handler, (void *)ops, cpu);
> > > > +    }
> > > >   }
> > > >
> > > This belong to rt_init, IMO.
> > >
> >
> > When a new pool is created, the corresponding scheduler is also
> > initialized. But there is no pcpu assigned to that pool.
> >
> Sure, and in fact, above, when commenting on rt_init() I said "Is there
> a reason why you don't do at least the allocation of the timer here?"
>
> But you're right, here I put it like all should belong to rt_init(), I
> should have been more clear.
>
> > If init_timer()
> > happens in rt_init, how does it know which cpu to pick?
> > When move_domain() is called, there must be some pcpus in the
> > destination pool and then vcpu_insert happens.
> >
> Allocating memory for the rt_priv copy of this particular scheduler
> instance, definitely belong in rt_init().
>
> Then, in this case, we could do the allocation in rt_init() and do the
> initialization later, perhaps here, or the first time that the timer
> needs to be started, or when the first pCPU is assigned to this
> scheduler (by being assigned to the cpupool this scheduler services).
>
> I honestly prefer the latter. That is to say, in rt_alloc_pdata(), you
> check whether prv->repl_timer is NULL and you allocate and initialize
> it for the pCPU being brought up. I also would put an explicit 'prv-
> >repl_timer=NULL', with a comment that proper allocation and
> initialization will happen later, and the reason why that is the case,
> in rt_init().
>
> What do you think?
>
> > > > @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops,
> > > > s_time_t
> > > > now, bool_t tasklet_work_sched
> > > >       /* burn_budget would return for IDLE VCPU */
> > > >       burn_budget(ops, scurr, now);
> > > >
> > > > -    __repl_update(ops, now);
> > > >
> > > __repl_update() going away is of course fine. But we need to enact
> > > the
> > > budget enforcement logic somewhere in here, don't we?
> > >
> >
> > Sorry about that, I meant to add it in this version.
> >
> More on this later. But then I'm curios. With this patch applied, it
> you set b=400/p=1000 for a domain, was the 40% utilization being
> enforced properly?
>
> > > > @@ -1058,25 +1103,39 @@ rt_vcpu_wake(const struct scheduler *ops,
> > > > struct vcpu *vc)
> > > >       if ( unlikely(svc->flags & RTDS_scheduled) )
> > > >       {
> > > >           set_bit(__RTDS_delayed_runq_add, &svc->flags);
> > > > -        return;
> > > > +        goto out;
> > > >       }
> > > >
> > > In this case, I'm less sure. I think this should just return as
> > > well,
> > > but it may depend on how other stuff are done (like budget
> > > enforcement), so it's hard to think at it right now, but consider
> > > this
> > > when working on next version.
> > >
> >
> > OK. For the second case, I guess there is nothing preventing a vcpu
> > that
> > sleeps for a long time on runq to be picked before other vcpus.
> >
> By "the second case" you mean this one about __RTDS_delayed_runq_add?
> If yes, I'm not following what you mean with "a vcpu sleeps for a long
> time on runq etc."
>
> This is what we are talking about here. A vCPU is selected to be
> preempted, during rt_schedule(), but it is still runnable, so we need
> to put it back to the runqueue. You can't just do that (put it back in
> the runqueue), because, in RTDS, like in Credit2, the runqueue is
> shared between multiple pCPUs (see commit be9a0806e197f4fd3fa6).
>
> So what do we do, we raise the *_delayed_runq_add flag and continue and
> leave the vcpu alone. This happens, for instance, when there is a race
> between scheduling out a vcpu that is going to sleep, and the vcpu in
> question being woken back up already! It's not frequent, but we need to
> take care of that. At some point (sooner rather tan later, most likely)
> the context switch will actually happen, and the vcpu that is waking up
> is put in the runqueue.
>
> > And also, repl_handler removes vcpus that are not runnable and if
> > they
> > are not added back here, where should we start tracking them? This
> > goes
> > back to if it's necessary to remove a un-runnable vcpu.
> >
> I also don't get what you mean by "tracking" (here and in other
> places).
>
> Anyway, what I meant when I said that what to do in the "delayed add"
> case depends on other things is, for instance, this: the vcpu is waking
> up right now, but it is going to be inserted in the runqueue later.
> When (as per "where in the code", "in what function") do you want to
> program the timer, here in rt_vcpu_wake(), or changing
> __replq_insert(), as said somewhere else, or in rt_context_saved(),
> where the vcpu is actually added to the runq? This kind of things...
>
>
> All this being said, taking a bit of a step back, and considering, not
> only the replenishment event and/or timer (re)programming issue, but
> also the deadline that needs updating, I think you actually need to
> find a way to check for both, in this patch.
>
> In fact, if we aim at simplifying rt_context_saved() too --and we
> certainly are-- in this case that a vcpu is actually being woken up,
> but it's not being put back in the runqueue until context switch, when
> it is that we check if it's deadline passed and it hence refreshed
> scheduling parameters?
>
>
> So, big recap, I think a possible variant of rt_vcpu_wake() would look
> as follows:
>
>  - if vcpu is running or in a q     ==> /* nothing. */
>
>  - esle if vcpu is delayed_runq_add ==> if (now >= deadline?) {
>                                           update_deadline();
>                                           if ( __vcpu_on_p(svc) )
>                                             _replq_remove();
>                                         }
>                                         _replq_insert();
>
>  - else                             ==> if (now >= deadline?) {
>                                           update_deadline();
>                                           if ( __vcpu_on_p(svc) )
>
>                                      _replq_remove();
>
>                       }
>                                         _runq_insert();
>                                         _replq_insert();
>                                         runq_tickle();
>
> This would mean something like this:
>
> static void
> rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> {
>     struct rt_vcpu * const svc = rt_vcpu(vc);
>     s_time_t now = NOW();
>     struct rt_private *prv = rt_priv(ops);
>
>     BUG_ON( is_idle_vcpu(vc) );
>
>     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>     {
>         SCHED_STAT_CRANK(vcpu_wake_running);
>         return;
>     }
>
>     /* on RunQ/DepletedQ, just update info is ok */
>     if ( unlikely(__vcpu_on_q(svc)) )
>     {
>         SCHED_STAT_CRANK(vcpu_wake_onrunq);
>         return;
>     }
>
>     if ( likely(vcpu_runnable(vc)) )
>         SCHED_STAT_CRANK(vcpu_wake_runnable);
>     else
>         SCHED_STAT_CRANK(vcpu_wake_not_runnable);
>
>
>     if ( now >= svc->cur_deadline)
>     {
>         rt_update_deadline(now, svc);
>         __replq_remove(svc);
>     }
>
>     __replq_insert(svc);
>
>     /* If context hasn't been saved for this vcpu yet, we can't put it on
>      * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
>      * put on the Runqueue/DepletedQ after the context has been saved.
>      */
>     if ( unlikely(svc->flags & RTDS_scheduled) )
>     {
>         set_bit(__RTDS_delayed_runq_add, &svc->flags);
>         return;
>     }
>
>     /* insert svc to runq/depletedq because svc is not in queue now */
>     __runq_insert(ops, svc);
>     runq_tickle(ops, snext);
>
>     return;
> }
>
> And, at this point, I'm starting thinking again that we want to call
> runq_tickle() in rt_context_saved(), at least in case
> __RTDS_delayed_runq_add was set as, if we don't, we're missing tickling
> for the vcpu being inserted because of that.
>
> Of course, all the above, taken with a little bit of salt... i.e., I do
> not expect to have got all the details right, especially in the code,
> but it should at least be effective in showing the idea.
>
> Thoughts?


This is smart and we should be able to put it into this patch series.

When a VCPU is stopped on a core, hasn't saved its context and has not
been added back to runq/depletedq, we should delay the tickle until
its context has been saved and it has been put back to the
(runq/depletedq) queue.
>
>
>
> > > > +            /* when the replenishment happens
> > > > +             * svc is either on a pcpu or on
> > > > +             * runq/depletedq
> > > > +             */
> > > > +            if( __vcpu_on_q(svc) )
> > > > +            {
> > > > +                /* put back to runq */
> > > > +                __q_remove(svc);
> > > > +                __runq_insert(ops, svc);
> > > > +                runq_tickle(ops, svc);
> > > > +            }
> > > > +
> > > Aha! So, it looks the budget enforcement logic ended up here?
> > > Mmm...
> > > no, I think that doing it in rt_schedule(), as we agreed, would
> > > make
> > > things simpler and better looking, both here and there.
> > >
> > > This is the replenishment timer handler, and it really should do
> > > one
> > > thins: handle replenishment events. That's it! ;-P
> > >
> >
> > Oh I might be misunderstanding what should be put in rt_schedule().
> > Was
> > it specifically for handling a vcpu that shouldn't be taken off a
> > core?
> > Why are you referring this piece of code as budget enforcement?
> >
> > Isn't it necessary to modify the runq here after budget
> > replenishment
> > has happened? If runq goes out of order here, will rt_schedule() be
> > sorting it?
> >
> Ops, yes, you are right, this is not budget enforcement. So, budget
> enforcement looks still to be missing from this patch (and that is why
> I asked whether it is actually working).
>
> And this chunk of code (and perhaps this whole function) is probably
> ok, it's just a bit hard to understand what it does...
>
>
> So, do you mind if we take a step back again, and analyze the
> situation. When the timer fires, and this handler is executed and a
> replenishment event taken off the replenishment queue, the following
> situations are possible:
>
>  1) the replenishment is for a running vcpu
>  2) the replenishment is for a runnable vcpu in the runq
>  3) the replenishment is for a runnable vcpu in the depletedq
>  4) the replenishment is for a sleeping vcpu
>
> So, 4) is never going to happen, right?, as we're stopping the timer
> when a vcpu blocks. If we go for this, let's put an ASSERT() and a
> comment as a guard for that. If we do the optimization in this very
> patch. If we leave it for another patch, we need to handle this case, I
> guess.
>
> In all 1), 2) and 3) cases, we need to remove the replenishment event
> from the replenishment queue, so just (in the code) do it upfront.
>
> What other actions need to happen in each case, 1), 2) and 3)? Can you
> summarize then in here, so we can decide how to handle each situation?
>
> I'd say that, in both case 2) and case 3), the vcpu needs to be taken
> out of the queue where they are, and (re-)inserted in the runqueue, and
> then we need to tickle... Is that correct?
>
> I'd also say that, in case 1), we also need to tickle because the
> deadline may now be have become bigger than some other vcpu that is in
> the runqueue?
>
> Also, under what conditions we need, as soon as replenishment for vcpu
> X happened, to queue another replenishment for the same vcpu, at its
> next deadline? Always, I would say (unless the vcpu is sleeping,
> because of the optimization you introduced)?



I'm thinking the following example as to why we need to rearm the
timer for the just-replenished VCPU (Although it's very likely I
missed something.):

A runnable VCPU (period = 10ms, budget = 4ms) is scheduled to run on a
core at t = 0;
at t = 4, its budget depleted and it's added to depletedq;
at t=10, the replenish timer should fire and replenish the budget;
I think we should arm the replenish timer for t = 20 so that its
budget will get replenished at t = 20 for the period [20, 30];
If we don't arm the timer at t = 20, how will the VCPU's budget be
replenished at t=20?

Please correct me if I'm wrong.

Thanks and Best Regards,

Meng


-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-04  2:23       ` Tianyang Chen
@ 2016-02-05 14:39         ` Dario Faggioli
  2016-02-06  4:27           ` Tianyang Chen
  0 siblings, 1 reply; 14+ messages in thread
From: Dario Faggioli @ 2016-02-05 14:39 UTC (permalink / raw)
  To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 14412 bytes --]

On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote:
> 
> On 2/3/2016 7:39 AM, Dario Faggioli wrote:
> > On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote:
> > > 
> > So what do we do, we raise the *_delayed_runq_add flag and continue
> > and
> > leave the vcpu alone. This happens, for instance, when there is a
> > race
> > between scheduling out a vcpu that is going to sleep, and the vcpu
> > in
> > question being woken back up already! It's not frequent, but we
> > need to
> > take care of that. At some point (sooner rather tan later, most
> > likely)
> > the context switch will actually happen, and the vcpu that is
> > waking up
> > is put in the runqueue.
> > 
> 
> Yeah I agree with this situation. But I meant to say when a vcpu is 
> waking up while it's still on a queue. See below.
> 
So, if a vcpu wakes up while it is running on a pcpu already, or while
it is already woken and has already been put in the runqueue, these are
SPURIOUS WAKEUP, which should be dealt with by just doing nothing, as
every scheduler is doing.... This is what I'm tring to say since a
couple of email, let's see if I've said it clear enough this time. :-D

> > static void
> > rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> > {
> >      struct rt_vcpu * const svc = rt_vcpu(vc);
> >      s_time_t now = NOW();
> >      struct rt_private *prv = rt_priv(ops);
> > 
> >      BUG_ON( is_idle_vcpu(vc) );
> > 
> >      if ( unlikely(curr_on_cpu(vc->processor) == vc) )
> >      {
> >          SCHED_STAT_CRANK(vcpu_wake_running);
> >          return;
> >      }
> > 
> 
> For this situation, a vcpu is waking up on a pcpu. It could be taken 
> off from the replq inside the timer handler at the end. So, if we
> decide 
> to not replenish any sleeping/un-runnable vcpus any further, we
> should 
> probably add it back to replq here right?
> 
I'm sorry, I don't understand what you're saying here. The vcpu woke up
a few time ago. We went through here. At the time it was not on any
pcpu and it was not in the runqueue, so we did plan a replenishment
event for it in the replenishment queue, at it's next deadline.

Now we're here again, for some reason. We figure out that the vcpu is
running already and, most likely (feel free to add an ASSERT() inside
the if to that effect), it will also still have its replenishment event
queued.

So we realized that this is just a spurious wakeup, and get back to
what we were doing.

What's wrong with this I just said?

> >      /* on RunQ/DepletedQ, just update info is ok */
> >      if ( unlikely(__vcpu_on_q(svc)) )
> >      {
> >          SCHED_STAT_CRANK(vcpu_wake_onrunq);
> >          return;
> >      }
> > 
> 
> For this situation, a vcpu could be going through the following
> phases:
> 
> on runq --> sleep --> (for a long time) --> wake up
> 
Well, but then we're not here, because when it went to sleep, it's been
taken off the runqueue, hasn't it?

Actually, I do think that you need to be running to go to sleep, but
anyway, even if a vcpu could go to sleep or block from the runqueue,
it's not going to stay in the runqueue, and the first wakeup that
occurs does not find it on the runqueue, and hence is not covered by
this case... Is it?

> When it wakes up, it could stay in the front of the runq because 
> cur_deadline hasn't been updated. It will, inherently have some high 
> priority-ness (because of EDF) and be picked over other vcpus right?
> Do 
> we want to update it's deadline and budget here and re-insert it 
> accordingly?
> 
When it wakes up and it is found not to be either running or in the
runqueue already, yes, we certainly want to do that!

But if some other (again, spurious) wakeup occurs, and find it already
in the runqueue (i.e., this case) we actually *don't* want to update
its deadline again.

> >      if ( likely(vcpu_runnable(vc)) )
> >          SCHED_STAT_CRANK(vcpu_wake_runnable);
> >      else
> >          SCHED_STAT_CRANK(vcpu_wake_not_runnable);
> > 
> > 
> >      if ( now >= svc->cur_deadline)
> >      {
> >          rt_update_deadline(now, svc);
> >          __replq_remove(svc);
> >      }
> > 
> >      __replq_insert(svc);
> > 
> 
> For here, consider this scenario:
> 
> on pcpu (originally on replq) --> sleep --> context_saved() but
> still 
> asleep --> wake()
> 
> Timer handler isn't triggered before it awakes. Then this vcpu will
> be 
> added to replq again while it's already there. I'm not 100% positive
> if 
> this situation is possible though. But judging from
> rt_context_saved(), 
> it checks if a vcpu is runnable before adding back to the runq.
> 
I'm also having a bit of an hard time understanding this... especially
what context_saved() has to do with it.

Are you saying that it is possible for a vcpu to start running with
deadline td, and hence a replenishment is programmed to happen at td,
and then to go to sleep at ta<td and wakeup at tb that is also <td?

Well, in that case, yes, whether to update the deadline or not, and
whether to reuse the already existing replenishment or not, it's up to
you guys, as it's part of the algorithm. What does the algorithm say in
this case?

In this implementation, you are removing replenishment when a vcpu
sleep, so it looks to me that you don't have much choice than re-
instating it at wakeup (we've gone through this already, remember?).

OTOH, if you wouldn't stop the timer on sleep, then yes, we ma need
some logic here, to understand whether the replenishment happened
during sleeping/blocking time, or if it's still pending.

If that was not what you were saying... then, sorry, but I just did not
get it...

> > Thoughts?
> > 
> > 
> So just to wrap up my thoughts for budget replenishment inside of 
> wake(), a vcpu can be in different states(on pcpu, on queue etc.)
> when 
> it wakes up.
> 
Yes, and only one is usually interesting: when it's not in any runqueue
nor on any pcpu. In out case, the case where it's in _delayed_runq add
needs some care as well (unlike, e.g., in Credit2).

> 1)wake up on a pcpu:
> Do nothing as it may have some remaining budget and we just ignore
> it's 
> cur_deadline when it wakes up. I can't find a corresponding pattern
> in 
> DS but this can be treated like a server being pushed back because
> of 
> the nap.
> 
Well, I'd say: do nothing because it's a spurious wakeup.

> 2)wake up on a queue:
> Update the deadline(refill budget) of it and re-insert it into runq
> for 
> the reason stated above.
> 
Do nothing because it's a spurious wakeup.

> 3)wake up in the middle of context switching:
> Update the deadline(refill budget) of it so it can be re-inserted
> into 
> runq in rt_context_saved().
> 
Ok.

> 4)wake up on nowhere ( sleep on pcpu--> context_saved() --> wake()):
> Replenish it and re-insert it back to runq.
> 
Here it is the one we care about. And yes, I agree on what we should do
in this case.

> As for replenishment queue manipulation, we should always try and add
> it 
> back to replq because we don't know if the timer handler has taken
> it 
> off or not.
> 
Mmm... no, I think we should know/figure out whether a replenishment is
pending already and we are ok with it, or if we need a new one.

> > > Oh I might be misunderstanding what should be put in
> > > rt_schedule().
> > > Was
> > > it specifically for handling a vcpu that shouldn't be taken off a
> > > core?
> > > Why are you referring this piece of code as budget enforcement?
> > > 
> > > Isn't it necessary to modify the runq here after budget
> > > replenishment
> > > has happened? If runq goes out of order here, will rt_schedule()
> > > be
> > > sorting it?
> > > 
> > Ops, yes, you are right, this is not budget enforcement. So, budget
> > enforcement looks still to be missing from this patch (and that is
> > why
> > I asked whether it is actually working).
> > 
> > And this chunk of code (and perhaps this whole function) is
> > probably
> > ok, it's just a bit hard to understand what it does...
> > 
> 
> So, the burn_budget() is still there right? I'm not sure what you
> are 
> referring to as budget enforcement. Correct me if I'm wrong, budget 
> enforcement means that each vcpu uses its assigned/remaining budget.
> Is 
> there any specific situation I'm missing?
> 
That's correct, and I saw that burn_budget() is still there. What I
failed to see was logic, within rt_schedule() for stopping a vcpu that
has exhausted its budget until its next replenishment.

Before, this was done in __repl_update(), which is (and that's a good
thing) no longer there.

But maybe it's me that missed something...

> > So, do you mind if we take a step back again, and analyze the
> > situation. When the timer fires, and this handler is executed and a
> > replenishment event taken off the replenishment queue, the
> > following
> > situations are possible:
> > 
> >   1) the replenishment is for a running vcpu
> >   2) the replenishment is for a runnable vcpu in the runq
> >   3) the replenishment is for a runnable vcpu in the depletedq
> >   4) the replenishment is for a sleeping vcpu
> > 
> > So, 4) is never going to happen, right?, as we're stopping the
> > timer
> > when a vcpu blocks. If we go for this, let's put an ASSERT() and a
> > comment as a guard for that. If we do the optimization in this very
> > patch. If we leave it for another patch, we need to handle this
> > case, I
> > guess.
> > 
> > In all 1), 2) and 3) cases, we need to remove the replenishment
> > event
> > from the replenishment queue, so just (in the code) do it upfront.
> > 
> 
> I don't think we need to take them of the replenishment queue? See
> the 
> last couple paragraphs.
> 
I meant removing it for reinserting it again with a different
replenishment time, due to the fact that, as a consequence of the
replenishment that is just happening, the vcpu is being given a new
deadline.

> > What other actions need to happen in each case, 1), 2) and 3)? Can
> > you
> > summarize then in here, so we can decide how to handle each
> > situation?
> > 
> 
> Here we go.
> 1) the replenishment is for a running vcpu
> Replenish the vcpu. Keep it on the replenishment queue. Tickle it as
> it 
> may have a later deadline.
> 
What do you mean keep it in the repl queue? This particular
replenishment just happened, which means the time at which it was
scheduler passed, there is no reason for keeping it in the queue like
this. Do you mean inserting a new replenishment event in the queue, for
this same vcpu, with a new replenishment time corresponding to its
deadline? If yes, then yes, I agree with that.

> 2) the replenishment is for a runnable vcpu in the runq
> Replenish the vcpu. Keep it on the replenishment queue.
>
Same as above, I guess?

> (re-)insert it 
> into runq. Tickle it.
> 
Ok.

> 3) the replenishment is for a runnable vcpu in the depletedq
> Same as 2)
> 
Same as 2), but you insert it in the runq (not again in the depletedq),
right?

> 4) the replenishment is for a sleeping vcpu
> Take it off from the replenishment queue and don't do replenishment.
> As 
> a matter of fact, this patch does the replenishment first and then
> take 
> it off.
> 
Ah, so can this happen? Aren't you removing the queued replenishment
event when a vcpu goes to sleep?

> > I'd say that, in both case 2) and case 3), the vcpu needs to be
> > taken
> > out of the queue where they are, and (re-)inserted in the runqueue,
> > and
> > then we need to tickle... Is that correct?
> > 
> 
> correct^.
> 
Ok. :-)

> > I'd also say that, in case 1), we also need to tickle because the
> > deadline may now be have become bigger than some other vcpu that is
> > in
> > the runqueue?
> > 
> 
> correct. I missed this case^.
> 
Ok. :-) :-)

> > Also, under what conditions we need, as soon as replenishment for
> > vcpu
> > X happened, to queue another replenishment for the same vcpu, at
> > its
> > next deadline? Always, I would say (unless the vcpu is sleeping,
> > because of the optimization you introduced)?
> 
> So this is kinda confusing. If in case 1) 2) and 3) they are never
> taken 
> off from the replenishment queue, a series of replenishment events
> are 
> automatically updated after the replenishment.
>
This is mostly about the both of us saying the same thing differently.

I'm saying that you should remove the vcpu from the replenishment runqueue and then insert it in there again with an updated replenishment time, computed out of the vcpu's new deadline.

You are saying that the replenishment events are updated.

Are we (talking about the same thing)? :-)

>  There is no need, as 
> least one less reason, to keep track of when to queue the next vcpus
> for 
> replenishment.
> 
I don't understand what you mean here.

> The side effect of this is that all vcpus could be going to sleep
> right 
> after the timer was programmed at the end of the handler. If they
> don't 
> wake up before the timer fires next time, the handler is called to 
> replenish nobody. If that's the case, timer will not be programmed
> again 
> in the handler. Wake() kicks in and programs it instead. And the 
> downside of this is that we always need to check if an awaking vcpu
> is 
> on replq or not. It not add it back(hence the notion of starting to 
> track it's replenishment events again). When adding back, check if
> there 
> is a need to reprogram the timer.
> 
This all sounds fine to me.

> I really like the idea of replenishment queue compared to running
> queue 
> in that the places that we need to manipulate the queue are only in 
> wake() and repl_handler().
> 
Indeed. :-)

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-04  4:03       ` Meng Xu
@ 2016-02-05 14:45         ` Dario Faggioli
  0 siblings, 0 replies; 14+ messages in thread
From: Dario Faggioli @ 2016-02-05 14:45 UTC (permalink / raw)
  To: Meng Xu; +Cc: xen-devel, Tianyang Chen, George Dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 2758 bytes --]

On Wed, 2016-02-03 at 23:03 -0500, Meng Xu wrote:
> On Wed, Feb 3, 2016 at 7:39 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > And, at this point, I'm starting thinking again that we want to
> > call
> > runq_tickle() in rt_context_saved(), at least in case
> > __RTDS_delayed_runq_add was set as, if we don't, we're missing
> > tickling
> > for the vcpu being inserted because of that.
> > 
> > Of course, all the above, taken with a little bit of salt... i.e.,
> > I do
> > not expect to have got all the details right, especially in the
> > code,
> > but it should at least be effective in showing the idea.
> > 
> > Thoughts?
> 
> 
> This is smart and we should be able to put it into this patch series.
> 
It's in the code already...

> When a VCPU is stopped on a core, hasn't saved its context and has
> not
> been added back to runq/depletedq, we should delay the tickle until
> its context has been saved and it has been put back to the
> (runq/depletedq) queue.
>
...It's only this bit (the tickling) that we need to get right. :-)

> > Also, under what conditions we need, as soon as replenishment for
> > vcpu
> > X happened, to queue another replenishment for the same vcpu, at
> > its
> > next deadline? Always, I would say (unless the vcpu is sleeping,
> > because of the optimization you introduced)?
> 
> 
> 
> I'm thinking the following example as to why we need to rearm the
> timer for the just-replenished VCPU (Although it's very likely I
> missed something.):
> 
> A runnable VCPU (period = 10ms, budget = 4ms) is scheduled to run on
> a
> core at t = 0;
> at t = 4, its budget depleted and it's added to depletedq;
> at t=10, the replenish timer should fire and replenish the budget;
> I think we should arm the replenish timer for t = 20 so that its
> budget will get replenished at t = 20 for the period [20, 30];
> If we don't arm the timer at t = 20, how will the VCPU's budget be
> replenished at t=20?
> 
> Please correct me if I'm wrong.
> 
No, I think you're right. It was me that, during the first one or two
emails, was probably thinking with my CBS mindset! :-/

Now I see that we indeed need to have a replenishment event pretty much
always running for every (runnable) vcpu, and that's also what I'm
saying when replying to Tianyang in the other email...

Let's see if we can sort the terminology and spurious wakeup quirks,
and check in this code!! :-P

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-05 14:39         ` Dario Faggioli
@ 2016-02-06  4:27           ` Tianyang Chen
  2016-02-08 11:27             ` Dario Faggioli
  0 siblings, 1 reply; 14+ messages in thread
From: Tianyang Chen @ 2016-02-06  4:27 UTC (permalink / raw)
  To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu



On 2/5/2016 9:39 AM, Dario Faggioli wrote:
> On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote:
>>
>> On 2/3/2016 7:39 AM, Dario Faggioli wrote:
>>> On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote:
>>>>
>>> So what do we do, we raise the *_delayed_runq_add flag and continue
>>> and
>>> leave the vcpu alone. This happens, for instance, when there is a
>>> race
>>> between scheduling out a vcpu that is going to sleep, and the vcpu
>>> in
>>> question being woken back up already! It's not frequent, but we
>>> need to
>>> take care of that. At some point (sooner rather tan later, most
>>> likely)
>>> the context switch will actually happen, and the vcpu that is
>>> waking up
>>> is put in the runqueue.
>>>
>>
>> Yeah I agree with this situation. But I meant to say when a vcpu is
>> waking up while it's still on a queue. See below.
>>
> So, if a vcpu wakes up while it is running on a pcpu already, or while
> it is already woken and has already been put in the runqueue, these are
> SPURIOUS WAKEUP, which should be dealt with by just doing nothing, as
> every scheduler is doing.... This is what I'm tring to say since a
> couple of email, let's see if I've said it clear enough this time. :-D
>

I see. So I'm just curious what can cause spurious wakeup? Does it only 
happen to a running vcpu (currently on pcpu), and a vcpu that is on 
either runq or depletedq?

>>> static void
>>> rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>> {
>>>       struct rt_vcpu * const svc = rt_vcpu(vc);
>>>       s_time_t now = NOW();
>>>       struct rt_private *prv = rt_priv(ops);
>>>
>>>       BUG_ON( is_idle_vcpu(vc) );
>>>
>>>       if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>>>       {
>>>           SCHED_STAT_CRANK(vcpu_wake_running);
>>>           return;
>>>       }
>>>
>>
>> For this situation, a vcpu is waking up on a pcpu. It could be taken
>> off from the replq inside the timer handler at the end. So, if we
>> decide
>> to not replenish any sleeping/un-runnable vcpus any further, we
>> should
>> probably add it back to replq here right?
>>
> I'm sorry, I don't understand what you're saying here. The vcpu woke up
> a few time ago. We went through here. At the time it was not on any
> pcpu and it was not in the runqueue, so we did plan a replenishment
> event for it in the replenishment queue, at it's next deadline.
>
> Now we're here again, for some reason. We figure out that the vcpu is
> running already and, most likely (feel free to add an ASSERT() inside
> the if to that effect), it will also still have its replenishment event
> queued.
>
> So we realized that this is just a spurious wakeup, and get back to
> what we were doing.
>
> What's wrong with this I just said?
>

The reason why I wanted to add a vcpu back to replq is because it could 
be taken off from the timer handler. Now, because of the spurious 
wakeup, I think the timer shouldn't take any vcpus off of replq, in 
which case wake() should be doing nothing just like other schedulers 
when it's a spurious wakeup. Also, only sleep() should remove events 
from replq.

>>>       /* on RunQ/DepletedQ, just update info is ok */
>>>       if ( unlikely(__vcpu_on_q(svc)) )
>>>       {
>>>           SCHED_STAT_CRANK(vcpu_wake_onrunq);
>>>           return;
>>>       }
>>>
>>
>> For this situation, a vcpu could be going through the following
>> phases:
>>
>> on runq --> sleep --> (for a long time) --> wake up
>>
> Well, but then we're not here, because when it went to sleep, it's been
> taken off the runqueue, hasn't it?
>
> Actually, I do think that you need to be running to go to sleep, but
> anyway, even if a vcpu could go to sleep or block from the runqueue,
> it's not going to stay in the runqueue, and the first wakeup that
> occurs does not find it on the runqueue, and hence is not covered by
> this case... Is it?
>

uh, I missed the __q_remove() from sleep. It will not end up here in the 
case I mentioned.

>> When it wakes up, it could stay in the front of the runq because
>> cur_deadline hasn't been updated. It will, inherently have some high
>> priority-ness (because of EDF) and be picked over other vcpus right?
>> Do
>> we want to update it's deadline and budget here and re-insert it
>> accordingly?
>>
> When it wakes up and it is found not to be either running or in the
> runqueue already, yes, we certainly want to do that!
>
> But if some other (again, spurious) wakeup occurs, and find it already
> in the runqueue (i.e., this case) we actually *don't* want to update
> its deadline again.
>

Agree.

>>>       if ( likely(vcpu_runnable(vc)) )
>>>           SCHED_STAT_CRANK(vcpu_wake_runnable);
>>>       else
>>>           SCHED_STAT_CRANK(vcpu_wake_not_runnable);
>>>
>>>
>>>       if ( now >= svc->cur_deadline)
>>>       {
>>>           rt_update_deadline(now, svc);
>>>           __replq_remove(svc);
>>>       }
>>>
>>>       __replq_insert(svc);
>>>
>>
>> For here, consider this scenario:
>>
>> on pcpu (originally on replq) --> sleep --> context_saved() but
>> still
>> asleep --> wake()
>>
>> Timer handler isn't triggered before it awakes. Then this vcpu will
>> be
>> added to replq again while it's already there. I'm not 100% positive
>> if
>> this situation is possible though. But judging from
>> rt_context_saved(),
>> it checks if a vcpu is runnable before adding back to the runq.
>>
> I'm also having a bit of an hard time understanding this... especially
> what context_saved() has to do with it.
>

I guess I shouldn't have put /* */ around the removal of a vcpu on replq 
in sleep(). So the original patch won't actually take a vcpu off if it 
goes to sleep. Like Meng said, it should. And I see what you mean by 
spurious wakeup now. I will put a summary for wake() somewhere below.

> Are you saying that it is possible for a vcpu to start running with
> deadline td, and hence a replenishment is programmed to happen at td,
> and then to go to sleep at ta<td and wakeup at tb that is also <td?
>
> Well, in that case, yes, whether to update the deadline or not, and
> whether to reuse the already existing replenishment or not, it's up to
> you guys, as it's part of the algorithm. What does the algorithm say in
> this case?
>
> In this implementation, you are removing replenishment when a vcpu
> sleep, so it looks to me that you don't have much choice than re-
> instating it at wakeup (we've gone through this already, remember?).
>
> OTOH, if you wouldn't stop the timer on sleep, then yes, we ma need
> some logic here, to understand whether the replenishment happened
> during sleeping/blocking time, or if it's still pending.
>
> If that was not what you were saying... then, sorry, but I just did not
> get it...
>
>>> Thoughts?
>>>
>>>
>> So just to wrap up my thoughts for budget replenishment inside of
>> wake(), a vcpu can be in different states(on pcpu, on queue etc.)
>> when
>> it wakes up.
>>
> Yes, and only one is usually interesting: when it's not in any runqueue
> nor on any pcpu. In out case, the case where it's in _delayed_runq add
> needs some care as well (unlike, e.g., in Credit2).
>

See below.

>> 1)wake up on a pcpu:
>> Do nothing as it may have some remaining budget and we just ignore
>> it's
>> cur_deadline when it wakes up. I can't find a corresponding pattern
>> in
>> DS but this can be treated like a server being pushed back because
>> of
>> the nap.
>>
> Well, I'd say: do nothing because it's a spurious wakeup.
>

Yes.

>> 2)wake up on a queue:
>> Update the deadline(refill budget) of it and re-insert it into runq
>> for
>> the reason stated above.
>>
> Do nothing because it's a spurious wakeup.
>

Yes.

>> 3)wake up in the middle of context switching:
>> Update the deadline(refill budget) of it so it can be re-inserted
>> into
>> runq in rt_context_saved().
>>
> Ok.
>
>> 4)wake up on nowhere ( sleep on pcpu--> context_saved() --> wake()):
>> Replenish it and re-insert it back to runq.
>>
> Here it is the one we care about. And yes, I agree on what we should do
> in this case.
>
>> As for replenishment queue manipulation, we should always try and add
>> it
>> back to replq because we don't know if the timer handler has taken
>> it
>> off or not.
>>
> Mmm... no, I think we should know/figure out whether a replenishment is
> pending already and we are ok with it, or if we need a new one.
>

Just to make sure, spurious sleep/wakeup are for a vcpu that is on pcpu 
or any queue right?


If a real sleep happens, it should be taken off from replq. However, in 
spurious wakeup (which I assume corresponds to a spurious sleep?), it 
shouldn't be taken off from replq. Adding back to replq should happen 
for those vcpus which were taken off because of "real sleep".

So here is a summary to make sure everyone's on the same page :)
This is basically your code with a slight modification before 
replq_insert().

static void
rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
     struct rt_vcpu * const svc = rt_vcpu(vc);
     s_time_t now = NOW();

     BUG_ON( is_idle_vcpu(vc) );

     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
     {
	/*
	 * spurious wakeup so do nothing as it is
	 * not removed from replq
   	 */
         SCHED_STAT_CRANK(vcpu_wake_running);
         return;
     }

     /* on RunQ/DepletedQ, just update info is ok */
     if ( unlikely(__vcpu_on_q(svc)) )
     {
	/*
	 * spurious wakeup so do nothing as it is
	 * not removed from replq
	 */
         SCHED_STAT_CRANK(vcpu_wake_onrunq);
         return;
     }

     if ( likely(vcpu_runnable(vc)) )
         SCHED_STAT_CRANK(vcpu_wake_runnable);
     else
         SCHED_STAT_CRANK(vcpu_wake_not_runnable);


     /*
      * sleep() might have taken it off from replq before
      * conext_saved(). Three cases:
      * 1) sleep on pcpu--> context_saved() --> wake()
      * 2) sleep on pcpu--> wake() --> context_saved()
      * 3) sleep before context_saved() -->wake().
      * The first two cases sleep() doesn't remove this vcpu
      * so add a check before inserting.
      */
     if ( now >= svc->cur_deadline)
     {
         rt_update_deadline(now, svc);
         __replq_remove(svc);
     }

     if( !__vcpu_on_replq(svc) )
         __replq_insert(svc);

     /* If context hasn't been saved for this vcpu yet, we can't put it on
      * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
      * put on the Runqueue/DepletedQ after the context has been saved.
      */
     if ( unlikely(svc->flags & RTDS_scheduled) )
     {
         set_bit(__RTDS_delayed_runq_add, &svc->flags);

         return;
     }

     /* insert svc to runq/depletedq because svc is not in queue now */
     __runq_insert(ops, svc);
     runq_tickle(ops, snext);

     return;
}


static void
rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
     struct rt_vcpu * const svc = rt_vcpu(vc);

     BUG_ON( is_idle_vcpu(vc) );
     SCHED_STAT_CRANK(vcpu_sleep);

     /*
      * If a vcpu is not sleeping on a pcpu or a queue,
      * it should be taken off
      * from the replenishment queue
      * Case 1: sleep on a pcpu
      * Case 2: sleep on a queue
      * Case 3: sleep before context switch is finished
      */
     if ( curr_on_cpu(vc->processor) == vc )
         cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
     else if ( __vcpu_on_q(svc) )
         __q_remove(svc);
     else if ( svc->flags & RTDS_delayed_runq_add )
     {
         clear_bit(__RTDS_delayed_runq_add, &svc->flags);

         if( __vcpu_on_replq(svc) )
            __replq_remove(svc);
     }
}



>>>> Oh I might be misunderstanding what should be put in
>>>> rt_schedule().
>>>> Was
>>>> it specifically for handling a vcpu that shouldn't be taken off a
>>>> core?
>>>> Why are you referring this piece of code as budget enforcement?
>>>>
>>>> Isn't it necessary to modify the runq here after budget
>>>> replenishment
>>>> has happened? If runq goes out of order here, will rt_schedule()
>>>> be
>>>> sorting it?
>>>>
>>> Ops, yes, you are right, this is not budget enforcement. So, budget
>>> enforcement looks still to be missing from this patch (and that is
>>> why
>>> I asked whether it is actually working).
>>>
>>> And this chunk of code (and perhaps this whole function) is
>>> probably
>>> ok, it's just a bit hard to understand what it does...
>>>
>>
>> So, the burn_budget() is still there right? I'm not sure what you
>> are
>> referring to as budget enforcement. Correct me if I'm wrong, budget
>> enforcement means that each vcpu uses its assigned/remaining budget.
>> Is
>> there any specific situation I'm missing?
>>
> That's correct, and I saw that burn_budget() is still there. What I
> failed to see was logic, within rt_schedule() for stopping a vcpu that
> has exhausted its budget until its next replenishment.
>
> Before, this was done in __repl_update(), which is (and that's a good
> thing) no longer there.
>
> But maybe it's me that missed something...
>

__repl_update() only scans runq and depletedq for replenishment right? 
And right after burn_budget(), if the current vcpu runs out of budget,
snext will be picked here because of:

/* if scurr has higher priority and budget, still pick scurr */
if ( !is_idle_vcpu(current) && vcpu_runnable(current) &&
       scurr->cur_budget > 0 && ( is_idle_vcpu(snext->vcpu) ||
     scurr->cur_deadline <= snext->cur_deadline ) )
     snext = scurr;

if scurr->cur_budget == 0, snext will be picked immediately.
Is that what you looking for?

>>> So, do you mind if we take a step back again, and analyze the
>>> situation. When the timer fires, and this handler is executed and a
>>> replenishment event taken off the replenishment queue, the
>>> following
>>> situations are possible:
>>>
>>>    1) the replenishment is for a running vcpu
>>>    2) the replenishment is for a runnable vcpu in the runq
>>>    3) the replenishment is for a runnable vcpu in the depletedq
>>>    4) the replenishment is for a sleeping vcpu
>>>
>>> So, 4) is never going to happen, right?, as we're stopping the
>>> timer
>>> when a vcpu blocks. If we go for this, let's put an ASSERT() and a
>>> comment as a guard for that. If we do the optimization in this very
>>> patch. If we leave it for another patch, we need to handle this
>>> case, I
>>> guess.
>>>
>>> In all 1), 2) and 3) cases, we need to remove the replenishment
>>> event
>>> from the replenishment queue, so just (in the code) do it upfront.
>>>
>>
>> I don't think we need to take them of the replenishment queue? See
>> the
>> last couple paragraphs.
>>
> I meant removing it for reinserting it again with a different
> replenishment time, due to the fact that, as a consequence of the
> replenishment that is just happening, the vcpu is being given a new
> deadline.
>

Ok so we are talking about the same thing.

>>> What other actions need to happen in each case, 1), 2) and 3)? Can
>>> you
>>> summarize then in here, so we can decide how to handle each
>>> situation?
>>>
>>
>> Here we go.
>> 1) the replenishment is for a running vcpu
>> Replenish the vcpu. Keep it on the replenishment queue. Tickle it as
>> it
>> may have a later deadline.
>>
> What do you mean keep it in the repl queue? This particular
> replenishment just happened, which means the time at which it was
> scheduler passed, there is no reason for keeping it in the queue like
> this. Do you mean inserting a new replenishment event in the queue, for
> this same vcpu, with a new replenishment time corresponding to its
> deadline? If yes, then yes, I agree with that.
>

Exactly. It's the way I expressed it. When I said keep it on, it 
includes remove() + insert(). lol

>> 2) the replenishment is for a runnable vcpu in the runq
>> Replenish the vcpu. Keep it on the replenishment queue.
>>
> Same as above, I guess?
>

Yes.

>> (re-)insert it
>> into runq. Tickle it.
>>
> Ok.
>
>> 3) the replenishment is for a runnable vcpu in the depletedq
>> Same as 2)
>>
> Same as 2), but you insert it in the runq (not again in the depletedq),
> right?
>

Right. _q_insert() checks if a vcpu has budget. If so it's put on the 
runq. Otherwise on the depletedq.

>> 4) the replenishment is for a sleeping vcpu
>> Take it off from the replenishment queue and don't do replenishment.
>> As
>> a matter of fact, this patch does the replenishment first and then
>> take
>> it off.
>>
> Ah, so can this happen? Aren't you removing the queued replenishment
> event when a vcpu goes to sleep?
>

Now, if we all agree on removing a vcpu in sleep(), this is not going to 
happen. So, there are three functions manipulating the replq now.

>>> I'd say that, in both case 2) and case 3), the vcpu needs to be
>>> taken
>>> out of the queue where they are, and (re-)inserted in the runqueue,
>>> and
>>> then we need to tickle... Is that correct?
>>>
>>
>> correct^.
>>
> Ok. :-)
>
>>> I'd also say that, in case 1), we also need to tickle because the
>>> deadline may now be have become bigger than some other vcpu that is
>>> in
>>> the runqueue?
>>>
>>
>> correct. I missed this case^.
>>
> Ok. :-) :-)
>
>>> Also, under what conditions we need, as soon as replenishment for
>>> vcpu
>>> X happened, to queue another replenishment for the same vcpu, at
>>> its
>>> next deadline? Always, I would say (unless the vcpu is sleeping,
>>> because of the optimization you introduced)?
>>
>> So this is kinda confusing. If in case 1) 2) and 3) they are never
>> taken
>> off from the replenishment queue, a series of replenishment events
>> are
>> automatically updated after the replenishment.
>>
> This is mostly about the both of us saying the same thing differently.
>
> I'm saying that you should remove the vcpu from the replenishment runqueue and then insert it in there again with an updated replenishment time, computed out of the vcpu's new deadline.
>
> You are saying that the replenishment events are updated.
>
> Are we (talking about the same thing)? :-)
>

Yeah.

>>   There is no need, as
>> least one less reason, to keep track of when to queue the next vcpus
>> for
>> replenishment.
>>
> I don't understand what you mean here.
>
>> The side effect of this is that all vcpus could be going to sleep
>> right
>> after the timer was programmed at the end of the handler. If they
>> don't
>> wake up before the timer fires next time, the handler is called to
>> replenish nobody. If that's the case, timer will not be programmed
>> again
>> in the handler. Wake() kicks in and programs it instead. And the
>> downside of this is that we always need to check if an awaking vcpu
>> is
>> on replq or not. It not add it back(hence the notion of starting to
>> track it's replenishment events again). When adding back, check if
>> there
>> is a need to reprogram the timer.
>>
> This all sounds fine to me.
>
Great. I will put together v5 soon. If we all agree on the rt_wake() and 
rt_sleep().

Thanks,
Tianyang

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-06  4:27           ` Tianyang Chen
@ 2016-02-08 11:27             ` Dario Faggioli
  2016-02-08 19:04               ` Tianyang Chen
  0 siblings, 1 reply; 14+ messages in thread
From: Dario Faggioli @ 2016-02-08 11:27 UTC (permalink / raw)
  To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 9648 bytes --]

On Fri, 2016-02-05 at 23:27 -0500, Tianyang Chen wrote:
> 
> On 2/5/2016 9:39 AM, Dario Faggioli wrote:
> > On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote:
> > > 
> I see. So I'm just curious what can cause spurious wakeup? Does it
> only 
> happen to a running vcpu (currently on pcpu), and a vcpu that is on 
> either runq or depletedq?
> 
Hehe, "that's virt, baby!" :-P

No, seriously, vcpu being already running or already in a queue, is
indeed what makes me call a wakeup a "spurious" wakeup. You can check
at the performance counters that we update in those cases, to see when
they happen most. In my experience, on Credit, I've seen them happening
mostly when a vcpu is running. For instance:

root@Zhaman:~# xenperf |grep vcpu_wake
sched: vcpu_wake_running            T=        39 
sched: vcpu_wake_onrunq             T=         0 
sched: vcpu_wake_runnable           T=     59875 
sched: vcpu_wake_not_runnable       T=         0

> > So we realized that this is just a spurious wakeup, and get back to
> > what we were doing.
> > 
> > What's wrong with this I just said?
> > 
> 
> The reason why I wanted to add a vcpu back to replq is because it
> could 
> be taken off from the timer handler. Now, because of the spurious 
> wakeup, I think the timer shouldn't take any vcpus off of replq, in 
> which case wake() should be doing nothing just like other schedulers 
> when it's a spurious wakeup. Also, only sleep() should remove events 
> from replq.
> 
Err... well, that depends on the how the code ends up looking like, and
it's start to become difficult to reason on it like this.

Perhaps, considering that it looks to me that we are in agreement on
all the important aspects, you can draft a new version and we'll reason
and comment on top of that?

> > Mmm... no, I think we should know/figure out whether a
> > replenishment is
> > pending already and we are ok with it, or if we need a new one.
> > 
> 
> Just to make sure, spurious sleep/wakeup are for a vcpu that is on
> pcpu 
> or any queue right?
> 
Yes, spurious wakeup is how I'm calling wakeups that needs no action
from the schedule, because the vcpu is already awake (and either
running or runnable).

I don't think there is such a thing as spurious sleep... When sleep is
called, we go to sleep. There are cases where the sleep-->wakeup
turnaround is really really fast, but I would not call them "spurious
sleeps", and they should not need much special treatment, not in
sched_rt.c at least.

Oh, maybe you mean the cases where you wakeup and you find out that
context_saved() has not run yet (just occurred by looking at the code
below)? Well, yes... But I wouldn't call those "spurious sleeps"
either.

> If a real sleep happens, it should be taken off from replq. However,
> in 
> spurious wakeup (which I assume corresponds to a spurious sleep?),
> it 
> shouldn't be taken off from replq. Adding back to replq should
> happen 
> for those vcpus which were taken off because of "real sleep".
> 
I don't really follow, but I have the feeling that you're making it
sound more complicated like it is in reality... :-)

> So here is a summary to make sure everyone's on the same page :)
> This is basically your code with a slight modification before 
> replq_insert().
> 
> static void
> rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> {
>      struct rt_vcpu * const svc = rt_vcpu(vc);
>      s_time_t now = NOW();
> 
>      BUG_ON( is_idle_vcpu(vc) );
> 
>      if ( unlikely(curr_on_cpu(vc->processor) == vc) )
>      {
> 	/*
> 	 * spurious wakeup so do nothing as it is
> 	 * not removed from replq
>    	 */
>
Kill the comment (and below too), it's pretty useless.

>          SCHED_STAT_CRANK(vcpu_wake_running);
>          return;
>      }
> 
>      /* on RunQ/DepletedQ, just update info is ok */
>      if ( unlikely(__vcpu_on_q(svc)) )
>      {
> 	/*
> 	 * spurious wakeup so do nothing as it is
> 	 * not removed from replq
> 	 */
>          SCHED_STAT_CRANK(vcpu_wake_onrunq);
>          return;
>      }
> 
>      if ( likely(vcpu_runnable(vc)) )
>          SCHED_STAT_CRANK(vcpu_wake_runnable);
>      else
>          SCHED_STAT_CRANK(vcpu_wake_not_runnable);
> 
> 
>      /*
>       * sleep() might have taken it off from replq before
>       * conext_saved(). Three cases:
>       * 1) sleep on pcpu--> context_saved() --> wake()
>       * 2) sleep on pcpu--> wake() --> context_saved()
>       * 3) sleep before context_saved() -->wake().
>       * The first two cases sleep() doesn't remove this vcpu
>       * so add a check before inserting.
>       */
>
I'm not sure why you're saying that, in cases 1) and 2), sleep should
not remove the replenishment event from the queue...

>      if ( now >= svc->cur_deadline)
>      {
>          rt_update_deadline(now, svc);
>          __replq_remove(svc);
>      }
> 
>      if( !__vcpu_on_replq(svc) )
>          __replq_insert(svc);
> 
And despite the comment above being a bit unclear to me, I *think* this
code is ok. :-)

>      /* If context hasn't been saved for this vcpu yet, we can't put
> it on
>       * the Runqueue/DepletedQ. Instead, we set a flag so that it
> will be
>       * put on the Runqueue/DepletedQ after the context has been
> saved.
>       */
>      if ( unlikely(svc->flags & RTDS_scheduled) )
>      {
>          set_bit(__RTDS_delayed_runq_add, &svc->flags);
> 
>          return;
>      }
> 
>      /* insert svc to runq/depletedq because svc is not in queue now
> */
>      __runq_insert(ops, svc);
>      runq_tickle(ops, snext);
> 
>      return;
> }
> 
> 
> static void
> rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
> {
>      struct rt_vcpu * const svc = rt_vcpu(vc);
> 
>      BUG_ON( is_idle_vcpu(vc) );
>      SCHED_STAT_CRANK(vcpu_sleep);
> 
>      /*
>       * If a vcpu is not sleeping on a pcpu or a queue,
>       * it should be taken off
>       * from the replenishment queue
>       * Case 1: sleep on a pcpu
>       * Case 2: sleep on a queue
>       * Case 3: sleep before context switch is finished
>       */
>      if ( curr_on_cpu(vc->processor) == vc )
>          cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
>      else if ( __vcpu_on_q(svc) )
>          __q_remove(svc);
>      else if ( svc->flags & RTDS_delayed_runq_add )
>      {
>          clear_bit(__RTDS_delayed_runq_add, &svc->flags);
> 
>          if( __vcpu_on_replq(svc) )
>             __replq_remove(svc);
>      }
> }
> 
Yep, as I was saying above, why you're only removing from the
replenishment queue in the latest case?

For instance, if a vcpu is running on a pcpu, and it goes to sleep (the
first if), do you want the timer to fire while it is still asleep or
not?

My impression is that, no, you don't want that, and you compensate for
it in rt_vcpu_wake(). Well, if this is the case, then you should remove
the replenishment event (and, potentially, stop or reprogram the timer)
in any of these cases.

Don't you think?

Note that the if-s here are not to be matched with the ones in _wake().
As I said, when you sleep you sleep, and there's no such thing as
spurious sleep. The reason why we have these if-s, is that different
actions need to be undertaken depending on whether the vcpu that is
going to sleep was running, in a queue, or waiting to be re-queued.

For instance, you only want the pcpu to go through schedule again, only
if the vcpu was running. Also, you want to be sure that the vcpu is not
queued back in context_saved(), if it was flagged to be. Etc.

> > Before, this was done in __repl_update(), which is (and that's a
> > good
> > thing) no longer there.
> > 
> > But maybe it's me that missed something...
> > 
> 
> __repl_update() only scans runq and depletedq for replenishment
> right? 
> And right after burn_budget(), if the current vcpu runs out of
> budget,
> snext will be picked here because of:
> 
> /* if scurr has higher priority and budget, still pick scurr */
> if ( !is_idle_vcpu(current) && vcpu_runnable(current) &&
>        scurr->cur_budget > 0 && ( is_idle_vcpu(snext->vcpu) ||
>      scurr->cur_deadline <= snext->cur_deadline ) )
>      snext = scurr;
> 
> if scurr->cur_budget == 0, snext will be picked immediately.
> Is that what you looking for?
> 
Ok, but then scurr need to go to the depleted queue. Double checking, I
see that this is happening in __runq_insert(), so, yes, this code also
looks fine.

Thanks for bearing with me. :-D

> > This all sounds fine to me.
> > 
> Great. I will put together v5 soon. If we all agree on the rt_wake()
> and 
> rt_sleep().
> 
We're probably quite close. :-P

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-08 11:27             ` Dario Faggioli
@ 2016-02-08 19:04               ` Tianyang Chen
  2016-02-08 21:23                 ` Dario Faggioli
  0 siblings, 1 reply; 14+ messages in thread
From: Tianyang Chen @ 2016-02-08 19:04 UTC (permalink / raw)
  To: Dario Faggioli, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu



On 2/8/2016 6:27 AM, Dario Faggioli wrote:
> On Fri, 2016-02-05 at 23:27 -0500, Tianyang Chen wrote:
>>
>> On 2/5/2016 9:39 AM, Dario Faggioli wrote:
>>> On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote:
>>>>
>> I see. So I'm just curious what can cause spurious wakeup? Does it
>> only
>> happen to a running vcpu (currently on pcpu), and a vcpu that is on
>> either runq or depletedq?
>>
> Hehe, "that's virt, baby!" :-P
>
> No, seriously, vcpu being already running or already in a queue, is
> indeed what makes me call a wakeup a "spurious" wakeup. You can check
> at the performance counters that we update in those cases, to see when
> they happen most. In my experience, on Credit, I've seen them happening
> mostly when a vcpu is running. For instance:
>
> root@Zhaman:~# xenperf |grep vcpu_wake
> sched: vcpu_wake_running            T=        39
> sched: vcpu_wake_onrunq             T=         0
> sched: vcpu_wake_runnable           T=     59875
> sched: vcpu_wake_not_runnable       T=         0
>
>>> So we realized that this is just a spurious wakeup, and get back to
>>> what we were doing.
>>>
>>> What's wrong with this I just said?
>>>
>>
>> The reason why I wanted to add a vcpu back to replq is because it
>> could
>> be taken off from the timer handler. Now, because of the spurious
>> wakeup, I think the timer shouldn't take any vcpus off of replq, in
>> which case wake() should be doing nothing just like other schedulers
>> when it's a spurious wakeup. Also, only sleep() should remove events
>> from replq.
>>
> Err... well, that depends on the how the code ends up looking like, and
> it's start to become difficult to reason on it like this.
>
> Perhaps, considering that it looks to me that we are in agreement on
> all the important aspects, you can draft a new version and we'll reason
> and comment on top of that?
>
>>> Mmm... no, I think we should know/figure out whether a
>>> replenishment is
>>> pending already and we are ok with it, or if we need a new one.
>>>
>>
>> Just to make sure, spurious sleep/wakeup are for a vcpu that is on
>> pcpu
>> or any queue right?
>>
> Yes, spurious wakeup is how I'm calling wakeups that needs no action
> from the schedule, because the vcpu is already awake (and either
> running or runnable).
>
> I don't think there is such a thing as spurious sleep... When sleep is
> called, we go to sleep. There are cases where the sleep-->wakeup
> turnaround is really really fast, but I would not call them "spurious
> sleeps", and they should not need much special treatment, not in
> sched_rt.c at least.
>
> Oh, maybe you mean the cases where you wakeup and you find out that
> context_saved() has not run yet (just occurred by looking at the code
> below)? Well, yes... But I wouldn't call those "spurious sleeps"
> either.
>
>> If a real sleep happens, it should be taken off from replq. However,
>> in
>> spurious wakeup (which I assume corresponds to a spurious sleep?),
>> it
>> shouldn't be taken off from replq. Adding back to replq should
>> happen
>> for those vcpus which were taken off because of "real sleep".
>>
> I don't really follow, but I have the feeling that you're making it
> sound more complicated like it is in reality... :-)
>

So my reasoning is that, when sleep() is called in sched_rt.c, a vcpu 
will need to be taken off from the replenishment event queue. However, a 
vcpu can be put to sleep/not-runnable without even calling sleep(), 
which corresponds to a later "spurious wakeup". Is there any way that 
RTDS can know when this happens? If not, in rt_vcpu_wake(), it needs to 
check, in all situations, if a vcpu is on the replenishment event queue 
or not. If not, add it back. I'm I right?

vcpu running      --> sleep(), taken off from replq	
vcpu on queue     --> sleep(), taken off from replq
vcpu not on queue --> sleep(), taken off from replq


However,

vcpu running/on queque/not on queue --> just not runnable anymore (still 
on replq) --> spurious wakeup (still on replq)

vcpus shouldn't be added back to replq again.

So in all cases, rt_vcpu_wake() always need to check if a vcpu is on the 
replq or not.

Now I think because of the addition of a replenishment queue, spurious 
wake up cannot be simply treated as NOP. Just like V4, the "Out" label 
basically maintains the replenishment queue.

Anyways, I agree it's gonna be easier to reason on v5 where all changes 
are applied.

Thanks,
Tianyang

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] xen: sched: convert RTDS from time to event driven model
  2016-02-08 19:04               ` Tianyang Chen
@ 2016-02-08 21:23                 ` Dario Faggioli
  0 siblings, 0 replies; 14+ messages in thread
From: Dario Faggioli @ 2016-02-08 21:23 UTC (permalink / raw)
  To: Tianyang Chen, xen-devel; +Cc: george.dunlap, Dagaen Golomb, Meng Xu


[-- Attachment #1.1: Type: text/plain, Size: 2714 bytes --]

On Mon, 2016-02-08 at 14:04 -0500, Tianyang Chen wrote:
> 
> On 2/8/2016 6:27 AM, Dario Faggioli wrote:
> > 
> > I don't really follow, but I have the feeling that you're making it
> > sound more complicated like it is in reality... :-)
> > 
> 
> So my reasoning is that, when sleep() is called in sched_rt.c, a
> vcpu 
> will need to be taken off from the replenishment event queue.
> However, a 
> vcpu can be put to sleep/not-runnable without even calling sleep(), 
>
Well, sure... for instance, it can block.

> which corresponds to a later "spurious wakeup". Is there any way
> that 
> RTDS can know when this happens? 
>
There is no way no one can tell what will happen in future but,
luckily, we don't need to care! There is no such thing as the
correspondence between spurious wakeups and sleeps.

> If not, in rt_vcpu_wake(), it needs to 
> check, in all situations, if a vcpu is on the replenishment event
> queue 
> or not. If not, add it back. I'm I right?
> 
> vcpu running      --> sleep(), taken off from replq	
> vcpu on queue     --> sleep(), taken off from replq
> vcpu not on queue --> sleep(), taken off from replq
> 
No. What happens is, more or less, like this:

 - sleep  : take off from runq, take off from replq
 - wakeup : put back in runq, put back in replq
 - wakeup : already on runq, already on replq ==> nothing!
 ...
 [becomes running]
 ...
 - wakeup : on a cpu, already on replq ==> nothing!

> However,
> 
> vcpu running/on queque/not on queue --> just not runnable anymore
> (still 
> on replq) --> spurious wakeup (still on replq)
> 
> vcpus shouldn't be added back to replq again.
> 
Exactly, and in fact, you do nothing. :-)

> So in all cases, rt_vcpu_wake() always need to check if a vcpu is on
> the 
> replq or not.
> 
Eheh... looks like we agree on the fact that we disagree. :-)

> Now I think because of the addition of a replenishment queue,
> spurious 
> wake up cannot be simply treated as NOP. Just like V4, the "Out"
> label 
> basically maintains the replenishment queue.
> 
I do think so. I also think that we may need to think about the
blocking case (i.e., whether to do something for removing the vcpu from
the replenishment queue or not). But really...

> Anyways, I agree it's gonna be easier to reason on v5 where all
> changes 
> are applied.
> 
... let's do this! :-)

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)


[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-02-08 21:23 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-01  4:32 [PATCH v4] xen: sched: convert RTDS from time to event driven model Tianyang Chen
2016-02-02 15:08 ` Dario Faggioli
2016-02-02 18:09   ` Tianyang Chen
2016-02-03 12:39     ` Dario Faggioli
2016-02-04  2:23       ` Tianyang Chen
2016-02-05 14:39         ` Dario Faggioli
2016-02-06  4:27           ` Tianyang Chen
2016-02-08 11:27             ` Dario Faggioli
2016-02-08 19:04               ` Tianyang Chen
2016-02-08 21:23                 ` Dario Faggioli
2016-02-04  4:03       ` Meng Xu
2016-02-05 14:45         ` Dario Faggioli
2016-02-03  3:33   ` Meng Xu
2016-02-03 12:30     ` Dario Faggioli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.