All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
@ 2015-06-27 19:46 Dagaen Golomb
  2015-06-27 19:53 ` Dagaen Golomb
  2015-06-28  7:06 ` Meng Xu
  0 siblings, 2 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-06-27 19:46 UTC (permalink / raw)
  To: xen-devel, wei.liu2, xisisu, george.dunlap, mengxu, jbeulich, lichong659
  Cc: Dagaen Golomb

To do this, we create a new list that holds, for each
vcpu, the time least into the future that it may need to be
rescheduled. The scheduler chooses the lowest time off of this
list and waits until the specified time instead of running every
1 ms as it did before.

Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
---
 xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 7 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 4372486..cce3446 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -4,6 +4,7 @@
  *
  * by Sisu Xi, 2013, Washington University in Saint Louis
  * and Meng Xu, 2014, University of Pennsylvania
+ * and Dagaen Golomb, 2015, University of Pennsylvania
  *
  * based on the code of credit Scheduler
  */
@@ -152,6 +153,8 @@ struct rt_private {
     struct list_head sdom;      /* list of availalbe domains, used for dump */
     struct list_head runq;      /* ordered list of runnable vcpus */
     struct list_head depletedq; /* unordered list of depleted vcpus */
+    struct timer replenishment_timer;
+    unsigned NUM_CPUS;
     cpumask_t tickled;          /* cpus been tickled */
 };
 
@@ -178,6 +181,8 @@ struct rt_vcpu {
     unsigned flags;             /* mark __RTDS_scheduled, etc.. */
 };
 
+static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
+
 /*
  * Domain
  */
@@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
  * Insert svc without budget in DepletedQ unsorted;
+ *
+ * Returns the position, 1-indexed, of where the item was inserted.
+ * Returns a positive index if placed on runq, and a negative index (the index
+ *   in the depletedq * -1) if placed on the depletedq
  */
-static void
+static int
 __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 {
     struct rt_private *prv = rt_priv(ops);
     struct list_head *runq = rt_runq(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
     struct list_head *iter;
+    unsigned inserted_index = 1;
 
     ASSERT( spin_is_locked(&prv->lock) );
 
@@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
             struct rt_vcpu * iter_svc = __q_elem(iter);
             if ( svc->cur_deadline <= iter_svc->cur_deadline )
                     break;
+            else
+                ++inserted_index;
          }
         list_add_tail(&svc->q_elem, iter);
+        return inserted_index;
     }
     else
     {
-        list_add(&svc->q_elem, &prv->depletedq);
+        // If we are inserting into previously empty depletedq, rearm
+        //   rearm replenish timer. It will handle further scheduling until
+        //   the depletedq is empty again (if ever)
+        if( list_empty(depletedq) )
+            set_timer( &prv->replenishment_timer, svc->cur_deadline );
+
+        list_for_each(iter, depletedq)
+        {
+            struct rt_vcpu * iter_svc = __q_elem(iter);
+            if ( svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+            else
+                ++inserted_index;
+         }
+        list_add_tail(&svc->q_elem, iter);
+        return -inserted_index;
     }
 }
 
+static void replenishment_handler(void* data)
+{
+    const struct scheduler *ops = data;
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+    s_time_t now = NOW();
+    int new_position = 0;
+    unsigned long flags;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    // Replenish the vCPU that triggered this, and resort it into runq
+    list_for_each_safe(iter, tmp, depletedq)
+    {
+        svc = __q_elem(iter);
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+            __q_remove(svc); /* remove from depleted queue */
+            new_position = __runq_insert(ops, svc); /* add to runq */
+        }
+        else break; // leave list_for_each_safe
+    }
+
+    // If we become one of top [# CPUs] in the runq, tickle it
+    // TODO: make this work when multiple tickles are required
+    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
+        runq_tickle(ops, svc);
+
+    // Use first item on deletedq to schedule next replenishment.
+    // If no replenishments are pending, disable timer for now
+    if( !list_empty(depletedq) )
+    {
+        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
+        set_timer( &prv->replenishment_timer,
+                   soonest_to_replenish->cur_deadline );
+    }
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+}
+
 /*
  * Init/Free related code
  */
@@ -427,6 +500,7 @@ static int
 rt_init(struct scheduler *ops)
 {
     struct rt_private *prv = xzalloc(struct rt_private);
+    int cpu = 0;
 
     printk("Initializing RTDS scheduler\n"
            "WARNING: This is experimental software in development.\n"
@@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
     INIT_LIST_HEAD(&prv->runq);
     INIT_LIST_HEAD(&prv->depletedq);
 
+    // Replenishment timer, for now always on CPU 0
+    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
+    // Calculate number of pCPUs present.
+    // Note: assumes does not change online.
+    for_each_present_cpu(cpu)
+    {
+        ++(prv->NUM_CPUS);
+    }
+    
     cpumask_clear(&prv->tickled);
 
     ops->sched_data = prv;
@@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
 {
     struct rt_private *prv = rt_priv(ops);
 
+    kill_timer(&prv->replenishment_timer);
+
     ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
 
     if ( (--nr_rt_ops) == 0 )
@@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
     BUG_ON( sdom == NULL );
 
     lock = vcpu_schedule_lock_irq(vc);
-    if ( __vcpu_on_q(svc) )
-        __q_remove(svc);
+    __q_remove(svc);
     vcpu_schedule_unlock_irq(lock, vc);
 
     if ( !is_idle_vcpu(vc) )
@@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
 {
     const int cpu = smp_processor_id();
     struct rt_private *prv = rt_priv(ops);
+    struct list_head *runq = rt_runq(ops);
     struct rt_vcpu *const scurr = rt_vcpu(current);
     struct rt_vcpu *snext = NULL;
     struct task_slice ret = { .migrated = 0 };
@@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
 
-    __repl_update(ops, now);
-
     if ( tasklet_work_scheduled )
     {
         snext = rt_vcpu(idle_vcpu[cpu]);
@@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         }
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    /* Use minimum from runq */
+    ret.time = MAX_SCHEDULE;
+    if( !list_empty(runq) )
+        ret.time = __q_elem(runq->next)->cur_deadline;
     ret.task = snext->vcpu;
 
     /* TRACE */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-27 19:46 [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
@ 2015-06-27 19:53 ` Dagaen Golomb
  2015-06-28  7:10   ` Meng Xu
  2015-06-28  7:06 ` Meng Xu
  1 sibling, 1 reply; 25+ messages in thread
From: Dagaen Golomb @ 2015-06-27 19:53 UTC (permalink / raw)
  To: xen-devel, Wei Liu, Sisu Xi, George Dunlap, Meng Xu, Jan Beulich,
	Chong Li

All,

Sorry for the repeated send to the mailing list, I forgot to add some
cc's and wanted to make sure
everyone was included.

This is the new patch that works towards how Dario suggested
structuring the event-driven move.
It uses the previous timer to drive the rt_schedule function, which
enforces budget depletions and
performs scheduling decisions.

There is an added timer that only handles replenishments, which is
called at the time the next
replenishment will occur. To do this, we now also keep the depletedq
sorted. If it detects it has
moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
runq with the added vCPU.
If the depletedq becomes empty the timer is stopped, and if the
scheduler moves a vCPU into
a previously empty depletedq it restarts the timer.

This may have some issues with corner cases that were discussed
earlier, such as unexpected
behavior if the two timers are armed for the same time. It should be
correct for the common case.

Dario, let me know if this is closer to what you envisioned.

~Dagaen

On Sat, Jun 27, 2015 at 3:46 PM, Dagaen Golomb <dgolomb@seas.upenn.edu> wrote:
> To do this, we create a new list that holds, for each
> vcpu, the time least into the future that it may need to be
> rescheduled. The scheduler chooses the lowest time off of this
> list and waits until the specified time instead of running every
> 1 ms as it did before.
>
> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> ---
>  xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 4372486..cce3446 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
> @@ -4,6 +4,7 @@
>   *
>   * by Sisu Xi, 2013, Washington University in Saint Louis
>   * and Meng Xu, 2014, University of Pennsylvania
> + * and Dagaen Golomb, 2015, University of Pennsylvania
>   *
>   * based on the code of credit Scheduler
>   */
> @@ -152,6 +153,8 @@ struct rt_private {
>      struct list_head sdom;      /* list of availalbe domains, used for dump */
>      struct list_head runq;      /* ordered list of runnable vcpus */
>      struct list_head depletedq; /* unordered list of depleted vcpus */
> +    struct timer replenishment_timer;
> +    unsigned NUM_CPUS;
>      cpumask_t tickled;          /* cpus been tickled */
>  };
>
> @@ -178,6 +181,8 @@ struct rt_vcpu {
>      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
>  };
>
> +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
> +
>  /*
>   * Domain
>   */
> @@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
>   * Insert svc with budget in RunQ according to EDF:
>   * vcpus with smaller deadlines go first.
>   * Insert svc without budget in DepletedQ unsorted;
> + *
> + * Returns the position, 1-indexed, of where the item was inserted.
> + * Returns a positive index if placed on runq, and a negative index (the index
> + *   in the depletedq * -1) if placed on the depletedq
>   */
> -static void
> +static int
>  __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  {
>      struct rt_private *prv = rt_priv(ops);
>      struct list_head *runq = rt_runq(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
>      struct list_head *iter;
> +    unsigned inserted_index = 1;
>
>      ASSERT( spin_is_locked(&prv->lock) );
>
> @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>              struct rt_vcpu * iter_svc = __q_elem(iter);
>              if ( svc->cur_deadline <= iter_svc->cur_deadline )
>                      break;
> +            else
> +                ++inserted_index;
>           }
>          list_add_tail(&svc->q_elem, iter);
> +        return inserted_index;
>      }
>      else
>      {
> -        list_add(&svc->q_elem, &prv->depletedq);
> +        // If we are inserting into previously empty depletedq, rearm
> +        //   rearm replenish timer. It will handle further scheduling until
> +        //   the depletedq is empty again (if ever)
> +        if( list_empty(depletedq) )
> +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
> +
> +        list_for_each(iter, depletedq)
> +        {
> +            struct rt_vcpu * iter_svc = __q_elem(iter);
> +            if ( svc->cur_deadline <= iter_svc->cur_deadline )
> +                break;
> +            else
> +                ++inserted_index;
> +         }
> +        list_add_tail(&svc->q_elem, iter);
> +        return -inserted_index;
>      }
>  }
>
> +static void replenishment_handler(void* data)
> +{
> +    const struct scheduler *ops = data;
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +    s_time_t now = NOW();
> +    int new_position = 0;
> +    unsigned long flags;
> +
> +    spin_lock_irqsave(&prv->lock, flags);
> +
> +    // Replenish the vCPU that triggered this, and resort it into runq
> +    list_for_each_safe(iter, tmp, depletedq)
> +    {
> +        svc = __q_elem(iter);
> +        if ( now >= svc->cur_deadline )
> +        {
> +            rt_update_deadline(now, svc);
> +            __q_remove(svc); /* remove from depleted queue */
> +            new_position = __runq_insert(ops, svc); /* add to runq */
> +        }
> +        else break; // leave list_for_each_safe
> +    }
> +
> +    // If we become one of top [# CPUs] in the runq, tickle it
> +    // TODO: make this work when multiple tickles are required
> +    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
> +        runq_tickle(ops, svc);
> +
> +    // Use first item on deletedq to schedule next replenishment.
> +    // If no replenishments are pending, disable timer for now
> +    if( !list_empty(depletedq) )
> +    {
> +        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
> +        set_timer( &prv->replenishment_timer,
> +                   soonest_to_replenish->cur_deadline );
> +    }
> +
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +}
> +
>  /*
>   * Init/Free related code
>   */
> @@ -427,6 +500,7 @@ static int
>  rt_init(struct scheduler *ops)
>  {
>      struct rt_private *prv = xzalloc(struct rt_private);
> +    int cpu = 0;
>
>      printk("Initializing RTDS scheduler\n"
>             "WARNING: This is experimental software in development.\n"
> @@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
>      INIT_LIST_HEAD(&prv->runq);
>      INIT_LIST_HEAD(&prv->depletedq);
>
> +    // Replenishment timer, for now always on CPU 0
> +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
> +    // Calculate number of pCPUs present.
> +    // Note: assumes does not change online.
> +    for_each_present_cpu(cpu)
> +    {
> +        ++(prv->NUM_CPUS);
> +    }
> +
>      cpumask_clear(&prv->tickled);
>
>      ops->sched_data = prv;
> @@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
>  {
>      struct rt_private *prv = rt_priv(ops);
>
> +    kill_timer(&prv->replenishment_timer);
> +
>      ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
>
>      if ( (--nr_rt_ops) == 0 )
> @@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
>      BUG_ON( sdom == NULL );
>
>      lock = vcpu_schedule_lock_irq(vc);
> -    if ( __vcpu_on_q(svc) )
> -        __q_remove(svc);
> +    __q_remove(svc);
>      vcpu_schedule_unlock_irq(lock, vc);
>
>      if ( !is_idle_vcpu(vc) )
> @@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>  {
>      const int cpu = smp_processor_id();
>      struct rt_private *prv = rt_priv(ops);
> +    struct list_head *runq = rt_runq(ops);
>      struct rt_vcpu *const scurr = rt_vcpu(current);
>      struct rt_vcpu *snext = NULL;
>      struct task_slice ret = { .migrated = 0 };
> @@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>      /* burn_budget would return for IDLE VCPU */
>      burn_budget(ops, scurr, now);
>
> -    __repl_update(ops, now);
> -
>      if ( tasklet_work_scheduled )
>      {
>          snext = rt_vcpu(idle_vcpu[cpu]);
> @@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>          }
>      }
>
> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> +    /* Use minimum from runq */
> +    ret.time = MAX_SCHEDULE;
> +    if( !list_empty(runq) )
> +        ret.time = __q_elem(runq->next)->cur_deadline;
>      ret.task = snext->vcpu;
>
>      /* TRACE */
> --
> 1.7.9.5
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-27 19:46 [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
  2015-06-27 19:53 ` Dagaen Golomb
@ 2015-06-28  7:06 ` Meng Xu
  2015-07-06 17:30   ` Dario Faggioli
  1 sibling, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-06-28  7:06 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	mengxu, Jan Beulich, Chong Li

[Adding Dario into the cc list...]

Dario, this patch has some coding issues and some other minor issues.
I tried to point out some in this email. IMHO, the main purpose of
this patch is to confirm that  the implementation fully reflects what
you suggests in the previous version. Once we make sure the direction
we are heading on is correct, we can improve the minor issues again.
(Of course, these minor issues are not actually minor things and
should be fixed before sending out the patch. Since it has been sent
out, let's first make sure the top issue is solved. :-( )

Dagaen, you can write down the cc list in a text file so that you
won't miss anyone when you send the patch.

Dagaen, please see my comments below:

2015-06-27 12:46 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> To do this, we create a new list that holds, for each
> vcpu, the time least into the future that it may need to be
> rescheduled. The scheduler chooses the lowest time off of this
> list and waits until the specified time instead of running every
> 1 ms as it did before.

Can you summarize the design discussion with Dario on the first
version of the patch? The above information is ok, but it does not say
much about the design. As you can see in our discussion with Dario,
there exist several design choices to achieve this. Explicitly state
it here can help people understand what this patch is doing.

>
> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> ---
>  xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 4372486..cce3446 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
> @@ -4,6 +4,7 @@
>   *
>   * by Sisu Xi, 2013, Washington University in Saint Louis
>   * and Meng Xu, 2014, University of Pennsylvania
> + * and Dagaen Golomb, 2015, University of Pennsylvania
>   *
>   * based on the code of credit Scheduler
>   */
> @@ -152,6 +153,8 @@ struct rt_private {
>      struct list_head sdom;      /* list of availalbe domains, used for dump */
>      struct list_head runq;      /* ordered list of runnable vcpus */
>      struct list_head depletedq; /* unordered list of depleted vcpus */
> +    struct timer replenishment_timer;
> +    unsigned NUM_CPUS;

First, it does not follow the convention for variable names. UPPER CASE
should be reserved for macro definitions and constants, which is not the case here.
You can use a more proper name, such as ncpus, here and add what it is
used for.

I saw how you use this below. I will comment more there.

>      cpumask_t tickled;          /* cpus been tickled */
>  };
>
> @@ -178,6 +181,8 @@ struct rt_vcpu {
>      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
>  };
>
> +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);

Not sure if declaring the function is the best approach or having
another patch to move the function forward. But anyway, we don't have
to worry about this yet right now, considering there are more
important issues to tackle for this patch.
> +
>  /*
>   * Domain
>   */
> @@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
>   * Insert svc with budget in RunQ according to EDF:
>   * vcpus with smaller deadlines go first.
>   * Insert svc without budget in DepletedQ unsorted;
> + *
> + * Returns the position, 1-indexed, of where the item was inserted.
> + * Returns a positive index if placed on runq, and a negative index (the index
> + *   in the depletedq * -1) if placed on the depletedq
>   */
> -static void
> +static int
>  __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  {
>      struct rt_private *prv = rt_priv(ops);
>      struct list_head *runq = rt_runq(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
>      struct list_head *iter;
> +    unsigned inserted_index = 1;
>
>      ASSERT( spin_is_locked(&prv->lock) );
>
> @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>              struct rt_vcpu * iter_svc = __q_elem(iter);
>              if ( svc->cur_deadline <= iter_svc->cur_deadline )
>                      break;
> +            else
> +                ++inserted_index;
>           }
>          list_add_tail(&svc->q_elem, iter);
> +        return inserted_index;
>      }
>      else
>      {
> -        list_add(&svc->q_elem, &prv->depletedq);
> +        // If we are inserting into previously empty depletedq, rearm
> +        //   rearm replenish timer. It will handle further scheduling until
> +        //   the depletedq is empty again (if ever)

The format of the comment is like
/**
 *
 */
instead of //.

The coding style of this patch need to improve. I know that this
version is to make sure the implementation is in the right track but
when the patch is sent out, it is supposed to be ready to apply (as I
learned this from Dario last year). :-) So please correct the coding
style in the next version. :-)


> +        if( list_empty(depletedq) )
> +            set_timer( &prv->replenishment_timer, svc->cur_deadline );

Hmm, can you handle the set_timer outside of this function? It should
be possible and make this runq_insert() function serve for only one
purpose.

> +
> +        list_for_each(iter, depletedq)
> +        {
> +            struct rt_vcpu * iter_svc = __q_elem(iter);
> +            if ( svc->cur_deadline <= iter_svc->cur_deadline )
> +                break;
> +            else
> +                ++inserted_index;
> +         }
> +        list_add_tail(&svc->q_elem, iter);
> +        return -inserted_index;
>      }
>  }
>
> +static void replenishment_handler(void* data)
> +{
> +    const struct scheduler *ops = data;
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +    s_time_t now = NOW();
> +    int new_position = 0;
> +    unsigned long flags;
> +
> +    spin_lock_irqsave(&prv->lock, flags);
> +
> +    // Replenish the vCPU that triggered this, and resort it into runq
> +    list_for_each_safe(iter, tmp, depletedq)
> +    {
> +        svc = __q_elem(iter);
> +        if ( now >= svc->cur_deadline )
> +        {
> +            rt_update_deadline(now, svc);
> +            __q_remove(svc); /* remove from depleted queue */
> +            new_position = __runq_insert(ops, svc); /* add to runq */
> +        }
> +        else break; // leave list_for_each_safe
> +    }
> +
> +    // If we become one of top [# CPUs] in the runq, tickle it
> +    // TODO: make this work when multiple tickles are required

Why do you need multiple tickles?

> +    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
> +        runq_tickle(ops, svc);
> +
> +    // Use first item on deletedq to schedule next replenishment.
> +    // If no replenishments are pending, disable timer for now
> +    if( !list_empty(depletedq) )
> +    {
> +        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
> +        set_timer( &prv->replenishment_timer,
> +                   soonest_to_replenish->cur_deadline );
> +    }


What if the inserted VCPU in the depletedq is not at the top of the
depletedq?  You deactivate the current timer and then activate the same
timer, which is unnecessary and costly. Since this is in the hot path,
you can avoid this deactivate-reactivate step by using an "if".


> +
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +}
> +
>  /*
>   * Init/Free related code
>   */
> @@ -427,6 +500,7 @@ static int
>  rt_init(struct scheduler *ops)
>  {
>      struct rt_private *prv = xzalloc(struct rt_private);
> +    int cpu = 0;
>
>      printk("Initializing RTDS scheduler\n"
>             "WARNING: This is experimental software in development.\n"
> @@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
>      INIT_LIST_HEAD(&prv->runq);
>      INIT_LIST_HEAD(&prv->depletedq);
>
> +    // Replenishment timer, for now always on CPU 0
> +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
> +    // Calculate number of pCPUs present.
> +    // Note: assumes does not change online.

Dario, is this a valid assumption? I'm not sure if Xen supports cpu
hot plug and if cpu-hotplug (in case xen supports it) will invalidate
this assumption.

> +    for_each_present_cpu(cpu)
> +    {
> +        ++(prv->NUM_CPUS);
> +    }
> +

This is incorrect. for_each_present_cpu() lists all cpus in the
system; it enumerates the cpu_present_map.

What you want here is the number of cpus for the scheduler in the
current cpupool. You can increase this number when  rt_alloc_pdata()
is called.


>      cpumask_clear(&prv->tickled);
>
>      ops->sched_data = prv;
> @@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
>  {
>      struct rt_private *prv = rt_priv(ops);
>
> +    kill_timer(&prv->replenishment_timer);
> +
>      ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
>
>      if ( (--nr_rt_ops) == 0 )
> @@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
>      BUG_ON( sdom == NULL );
>
>      lock = vcpu_schedule_lock_irq(vc);
> -    if ( __vcpu_on_q(svc) )
> -        __q_remove(svc);
> +    __q_remove(svc);
>      vcpu_schedule_unlock_irq(lock, vc);
>
>      if ( !is_idle_vcpu(vc) )
> @@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>  {
>      const int cpu = smp_processor_id();
>      struct rt_private *prv = rt_priv(ops);
> +    struct list_head *runq = rt_runq(ops);
>      struct rt_vcpu *const scurr = rt_vcpu(current);
>      struct rt_vcpu *snext = NULL;
>      struct task_slice ret = { .migrated = 0 };
> @@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>      /* burn_budget would return for IDLE VCPU */
>      burn_budget(ops, scurr, now);
>
> -    __repl_update(ops, now);
> -
>      if ( tasklet_work_scheduled )
>      {
>          snext = rt_vcpu(idle_vcpu[cpu]);
> @@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>          }
>      }
>
> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> +    /* Use minimum from runq */
> +    ret.time = MAX_SCHEDULE;

Why is it not snext->budget? Budget enforcement only needs to be
invoked when the VCPU scheduled to run runs out of budget;

> +    if( !list_empty(runq) )
> +        ret.time = __q_elem(runq->next)->cur_deadline;

This function is for budget enforcement. Why do you need this?
Maybe you want to consider the situation when a VCPU still has budget
remaining at the end of the current deadline. You want to use the
cur_deadline to enforce the budget, since the RTDS scheduler does not
allow a VCPU overrun its budget when it misses its deadline and "The
VCPU discards its unused budget at the end of each period."

If that is the case, you can use snext->vcpu to get the cur_deadline
instead of using runq->next.

In addition, this rt_schedule function does not handle the above logic
right now. Although it still works (due to how  __repl_update is
implemented), it is better to explicitly have it in rt_schedule(),
IMHO.

>      ret.task = snext->vcpu;
>
>      /* TRACE */
> --
> 1.7.9.5
>



-- 


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-27 19:53 ` Dagaen Golomb
@ 2015-06-28  7:10   ` Meng Xu
  2015-06-28 17:49     ` Dagaen Golomb
  0 siblings, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-06-28  7:10 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

[Dario was forgotten in this email. Adding him back..:-( ]



2015-06-27 12:53 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> All,
>
> Sorry for the repeated send to the mailing list, I forgot to add some
> cc's and wanted to make sure
> everyone was included.
>
> This is the new patch that works towards how Dario suggested
> structuring the event-driven move.
> It uses the previous timer to drive the rt_schedule function, which
> enforces budget depletions and
> performs scheduling decisions.

This is more like a cover letter. Next time, can you use the option
--compose to send a cover letter with the detailed design along with
the patch?

You can also link to the discussion we had about the design.

>
> There is an added timer that only handles replenishments, which is
> called at the time the next
> replenishment will occur. To do this, we now also keep the depletedq
> sorted. If it detects it has
> moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
> runq with the added vCPU.
> If the depletedq becomes empty the timer is stopped, and if the
> scheduler moves a vCPU into
> a previously empty depletedq it restarts the timer.
>
> This may have some issues with corner cases that were discussed
> earlier, such as unexpected
> behavior if the two timers are armed for the same time. It should be
> correct for the common case.

Could you elaborate more about when two timers can be armed for the same time?

Thanks,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28  7:10   ` Meng Xu
@ 2015-06-28 17:49     ` Dagaen Golomb
  2015-06-28 19:11       ` Meng Xu
  0 siblings, 1 reply; 25+ messages in thread
From: Dagaen Golomb @ 2015-06-28 17:49 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 2396 bytes --]

> [Dario was forgotten in this email. Adding him back..:-( ]

Gah, how did I forget this! I've been meaning to make a text file of the
command so I never forget anyone.

> > Sorry for the repeated send to the mailing list, I forgot to add some
> > cc's and wanted to make sure
> > everyone was included.
> >
> > This is the new patch that works towards how Dario suggested
> > structuring the event-driven move.
> > It uses the previous timer to drive the rt_schedule function, which
> > enforces budget depletions and
> > performs scheduling decisions.
>
> This is more like a cover letter. Next time, can you use the option
> --compose to send a cover letter with the detailed design along with
> the patch?
>
> You can also link to the discusssion we had about the design.

I guess this could be useful to be mentioned in a cover letter.

> >
> > There is an added timer that only handles replenishments, which is
> > called at the time the next
> > replenishment will occur. To do this, we now also keep the depletedq
> > sorted. If it detects it has
> > moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
> > runq with the added vCPU.
> > If the depletedq becomes empty the timer is stopped, and if the
> > scheduler moves a vCPU into
> > a previously empty depletedq it restarts the timer.
> >
> > This may have some issues with corner cases that were discussed
> > earlier, such as unexpected
> > behavior if the two timers are armed for the same time. It should be
> > correct for the common case.
>
> Could you elaborate more about when two timers can be armed for the same
time?

Since the two timers are independent now, if a task on the depletedq has
deadline at time X (so the replenishment timer will run) and another task
on a CPU runs out of budget at time X (so scheduler should run), its not
clear what will happen. If the replenishment goes first it probably isn't a
big deal. However, if rt_schedule goes first it may kick a vcpu that is
about to get a replenishment that would cause it to remain a top priority.
One easy option is to check replenishments before kicking a vcpu, but
that's exactly the kind of stuff we wanted to avoid with this
restructuring. Additional logic to enforce a replenishment always goes
first may be more than we would like. I'll have to look more into the Xen
timer behavior with these regards to this matter.

~Dagaen Golomb

[-- Attachment #1.2: Type: text/html, Size: 2863 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28 17:49     ` Dagaen Golomb
@ 2015-06-28 19:11       ` Meng Xu
  2015-06-28 20:17         ` Dagaen Golomb
  0 siblings, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-06-28 19:11 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

>> >
>> > There is an added timer that only handles replenishments, which is
>> > called at the time the next
>> > replenishment will occur. To do this, we now also keep the depletedq
>> > sorted. If it detects it has
>> > moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
>> > runq with the added vCPU.
>> > If the depletedq becomes empty the timer is stopped, and if the
>> > scheduler moves a vCPU into
>> > a previously empty depletedq it restarts the timer.
>> >
>> > This may have some issues with corner cases that were discussed
>> > earlier, such as unexpected
>> > behavior if the two timers are armed for the same time. It should be
>> > correct for the common case.
>>
>> Could you elaborate more about when two timers can be armed for the same
>> time?

I don't think the scenario you described below will happen. Here is my argument:

>
> Since the two timers are independent now, if a task on the depletedq has
> deadline at time X (so the replenishment timer will run) and another task on
> a CPU runs out of budget at time X (so scheduler should run), its not clear
> what will happen. If the replenishment goes first it probably isn't a big
> deal.

OK.

> However, if rt_schedule goes first it may kick a vcpu that is about to
> get a replenishment that would cause it to remain a top priority.

So a VCPU i in the runq kicks the currently running VCPU j that is
about to get a replenishment. Right?
It means that the cur_deadline of VCPU i is less than the
cur_deadline of VCPU j; otherwise, VCPU i won't preempt VCPU j.

When VCPU j gets a replenishment, it means the deadline of VCPU j will
be increased by at least one period. Since the cur_deadline of VCPU i is
already smaller than the cur_deadline of VCPU j, VCPU i will still
have higher priority than VCPU j even when VCPU j get budget
replenishment in the near future.

Therefore, this scenario won't happen. :-)
Do you have another scenario in mind? :-P

The scenario in my mind that will potentially invoke one more rt_schedule  is:
VCPU j currently runs out of budget and will have top priority once it
get budget replenishment.
If replenishment runs first, rt_schedule will be invoked for only once.
If rt_schedule runs first and schedule a VCPU to run, rt_schedule will
be invoked again when replenishment is invoked.

I'm not sure if this is the only kind of scenario that may arm two
timers for the same time.

Thanks,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28 19:11       ` Meng Xu
@ 2015-06-28 20:17         ` Dagaen Golomb
  2015-06-28 20:45           ` Meng Xu
  2015-07-06 17:41           ` Dario Faggioli
  0 siblings, 2 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-06-28 20:17 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

>>> > There is an added timer that only handles replenishments, which is
>>> > called at the time the next
>>> > replenishment will occur. To do this, we now also keep the depletedq
>>> > sorted. If it detects it has
>>> > moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
>>> > runq with the added vCPU.
>>> > If the depletedq becomes empty the timer is stopped, and if the
>>> > scheduler moves a vCPU into
>>> > a previously empty depletedq it restarts the timer.
>>> >
>>> > This may have some issues with corner cases that were discussed
>>> > earlier, such as unexpected
>>> > behavior if the two timers are armed for the same time. It should be
>>> > correct for the common case.
>>>
>>> Could you elaborate more about when two timers can be armed for the same
>>> time?
>
> I don't think the scenario you described below will happen. Here is my argument:

It does take some thinking as to whether this will occur or not. I
also am not sure
if Xen will let the timer handlers preempt each other and, if so, if
this is even a
problem.

>>
>> Since the two timers are independent now, if a task on the depletedq has
>> deadline at time X (so the replenishment timer will run) and another task on
>> a CPU runs out of budget at time X (so scheduler should run), its not clear
>> what will happen. If the replenishment goes first it probably isn't a big
>> deal.
>
> OK.
>
>> However, if rt_schedule goes first it may kick a vcpu that is about to
>> get a replenishment that would cause it to remain a top priority.
>
> So a VCPU i in the runq kicks the currently running VCPU j that is
> about to get a replenishment. Right?
> It means that the cur_deadline of VCPU is is less than the
> cur_deadline of VCPU j; otherwise, VCPU i won't preempt VCPU j.
>
> When VCPU j get replenishiment, it means the deadline of VCPU j will
> be added by at least one period. Since the cur_deadline of VCPU i is
> already smaller than the cur_deadline of VCPU j, VCPU i will still
> have higher priority than VCPU j even when VCPU j get budget
> replenishment in the near future.
>
> Therefore, this scenario won't happen. :-)
> Do you have another scenario in mind? :-P

In this scenario you are correct.
I was thinking more about a budget depletion. The kicked vCPU may be
because it has run out of budget. Imagine a vCPU completing its work just
before its deadline, and imagine it has a very high priority. It may finish
its work at time X, and then some other vCPU may have a replenishment
scheduled for time X. Now, if rt_schedule goes first it will kick it to the
depletedq, then it will get replenished, then it will get rescheduled since
it is high priority and should be scheduled again. We just swapped out and
back in a vCPU that should have just stayed there. It does appear it would
still be functionally correct, though. Replenishment first also shouldn't be
a problem since it performs replenishment on both queues and then the
current vCPU wouldn't be swapped out.

> The scenario in my mind that will potentially invoke one more rt_schedule  is:
> VCPU j currently runs out of budget and will have top priority once it
> get budget replenishment.
> If replenishment runs first, rt_schedule will be invoked for only once.
> If rt_schedule runs first and schedule a VCPU to run, rt_schedule will
> be invoked again when replenishment is invoked.

This is a good point here. The ordering in this case doesn't seem to cause
any functional/behavior problems, but it will cause rt_schedule to run twice
when it could have been run once. So, even as a corner case, it would seem
that its a performance corner case and not a behavior one.

~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28 20:17         ` Dagaen Golomb
@ 2015-06-28 20:45           ` Meng Xu
  2015-07-06 17:37             ` Dario Faggioli
  2015-07-06 17:41           ` Dario Faggioli
  1 sibling, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-06-28 20:45 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	Meng Xu, Jan Beulich, Chong Li

2015-06-28 13:17 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
>
> >>> > There is an added timer that only handles replenishments, which is
> >>> > called at the time the next
> >>> > replenishment will occur. To do this, we now also keep the depletedq
> >>> > sorted. If it detects it has
> >>> > moved a vCPU into the first [# CPUs] slots in the runq, it tickles the
> >>> > runq with the added vCPU.
> >>> > If the depletedq becomes empty the timer is stopped, and if the
> >>> > scheduler moves a vCPU into
> >>> > a previously empty depletedq it restarts the timer.
> >>> >
> >>> > This may have some issues with corner cases that were discussed
> >>> > earlier, such as unexpected
> >>> > behavior if the two timers are armed for the same time. It should be
> >>> > correct for the common case.
> >>>
> >>> Could you elaborate more about when two timers can be armed for the same
> >>> time?
> >
> > I don't think the scenario you described below will happen. Here is my argument:
>
> It does take some thinking as to whether this will occur or not. I
> also am not sure
> if Xen will let the timer handlers preempt each other and, if so, if
> this is even a
> problem.
>
> >>
> >> Since the two timers are independent now, if a task on the depletedq has
> >> deadline at time X (so the replenishment timer will run) and another task on
> >> a CPU runs out of budget at time X (so scheduler should run), its not clear
> >> what will happen. If the replenishment goes first it probably isn't a big
> >> deal.
> >
> > OK.
> >
> >> However, if rt_schedule goes first it may kick a vcpu that is about to
> >> get a replenishment that would cause it to remain a top priority.
> >
> > So a VCPU i in the runq kicks the currently running VCPU j that is
> > about to get a replenishment. Right?
> > It means that the cur_deadline of VCPU is is less than the
> > cur_deadline of VCPU j; otherwise, VCPU i won't preempt VCPU j.
> >
> > When VCPU j get replenishiment, it means the deadline of VCPU j will
> > be added by at least one period. Since the cur_deadline of VCPU i is
> > already smaller than the cur_deadline of VCPU j, VCPU i will still
> > have higher priority than VCPU j even when VCPU j get budget
> > replenishment in the near future.
> >
> > Therefore, this scenario won't happen. :-)
> > Do you have another scenario in mind? :-P
>
> In this scenario you are correct.
> I was thinking more about a budget depletion. The kicked vCPU may be
> because it has run out of budget. Imagine a vCPU competing its work just
> before its deadline, and imagine it has a very high priority. It may finish
> its work at time X, and then some other vCPU may have a replenishment
> scheduled for time X. Now, if rt_schedule goes first it will kick it to the
> depletedq, then it will get replenished, then it will get rescheduled since
> it is high priority and should be scheduled again. We just swapped out and
> back in a vCPU that should have just stayed there. It does appear it would
> still be functionally correct, though. Replenishment first also shouldn't be
> a problem since it performs replenishment on both queues and then the
> current vCPU wouldn't be swapped out.

Right!  This is the same scenario as described in my previous reply
below. Glad that we are on the same page. :-P

>
> > The scenario in my mind that will potentially invoke one more rt_schedule  is:
> > VCPU j currently runs out of budget and will have top priority once it
> > get budget replenishment.
> > If replenishment runs first, rt_schedule will be invoked for only once.
> > If rt_schedule runs first and schedule a VCPU to run, rt_schedule will
> > be invoked again when replenishment is invoked.
>
> This is a good point here. The ordering in this case doesn't seem to cause
> any functional/behavior problems, but it will cause rt_schedule to run twice
> when it could have been ran once. So, even as a corner case, it would seem
> that its a performance corner case and not a behavior one.

Right.

Let's look forward to the reply from others, especially Dario's.  :-)

Best,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28  7:06 ` Meng Xu
@ 2015-07-06 17:30   ` Dario Faggioli
  2015-07-07  5:51     ` Meng Xu
  2015-07-09 16:17     ` Dagaen Golomb
  0 siblings, 2 replies; 25+ messages in thread
From: Dario Faggioli @ 2015-07-06 17:30 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 11415 bytes --]

On Sun, 2015-06-28 at 00:06 -0700, Meng Xu wrote:
> [Adding Dario into the cc list...]
> 
> Dario, this patch has some coding issues and some other minor issues.
> I tried to point out some in this email. IMHO, the main purpose of
> this patch is to confirm that  the implementation fully reflects what
> you suggests in the previous version. Once we make sure the direction
> we are heading on is correct, we can improve the minor issues again.
>
So, all in all, this looks a lot better than the previous one. It's easier
to understand, review, etc., as far as the patch itself is concerned,
and I believe it makes the code look (and, in the long run, perform)
better, once it (or something similar to this) is applied.

> (Of course, these minor issues are not actually minor things and
> should be fixed before sending out the patch. Since it has been sent
> out, let's first make sure the top issue is solved. :-( )
> 
That is fine. The way one usually marks a similar patch or series, is to
put RFC as a tag. Then, provided you describe, in the cover and/or in
the single patches, what the issues are and how that RFC should be
interpreted, you're fine to send to the list (almost) anything. :-)

> 2015-06-27 12:46 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled. The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> 
> Can you summarize the design discussion with Dario on the first
> version of the patch? The above information is ok, but it does not say
> much about the design. As you can see in our discussion with Dario,
> there exist several design choices to achieve this. Explicitly state
> it here can help people understand what this patch is doing.
> 
Yes, that would be helpful. It's not required to summarize all the
various ML thread in the actual changelog(s), but, really, a summary and
maybe a few links and pointers in the cover letter, would be useful, to
give interested people that may join late the chance to get some
context.

Note that, even for single patch and not only for series, it is ok to
include a cover letter (i.e., a [PATCH 0/x]), e.g., right for this
purpose.

So, as I was saying, I like the architecture now. There are issues, or
at least I have some questions about specific decisions or code chunks,
but I think this is on the right track now.

> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---

> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
> > @@ -4,6 +4,7 @@
> >   *
> >   * by Sisu Xi, 2013, Washington University in Saint Louis
> >   * and Meng Xu, 2014, University of Pennsylvania
> > + * and Dagaen Golomb, 2015, University of Pennsylvania
> >   *
> >   * based on the code of credit Scheduler
> >   */
> > @@ -152,6 +153,8 @@ struct rt_private {
> >      struct list_head sdom;      /* list of availalbe domains, used for dump */
> >      struct list_head runq;      /* ordered list of runnable vcpus */
> >      struct list_head depletedq; /* unordered list of depleted vcpus */
> > +    struct timer replenishment_timer;
> > +    unsigned NUM_CPUS;
> 
> First, it does not follow the convension of variable names. UPPER CASE
> should be the macro definition and a constant, which is not the case.
> You can use a more proper name, such as ncpus, here and add what it is
> used for.
> 
Agreed.

> >      cpumask_t tickled;          /* cpus been tickled */
> >  };
> >
> > @@ -178,6 +181,8 @@ struct rt_vcpu {
> >      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
> >  };
> >
> > +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
> 
> Not sure if declaring the function is the best approach or having
> another patch to move the function forward. But anyway, we don't have
> to worry about this yet right now, considering there are more
> important issues to tackle for this patch.
>
Sure.

Personally, I would like moving the function better (in a separate
patch). Forward declarations make it (slightly, yes, but still...) more
difficult to find (and switch, like in editors) between call sites and
where the actual code is.

> > @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> >              struct rt_vcpu * iter_svc = __q_elem(iter);
> >              if ( svc->cur_deadline <= iter_svc->cur_deadline )
> >                      break;
> > +            else
> > +                ++inserted_index;
> >           }
> >          list_add_tail(&svc->q_elem, iter);
> > +        return inserted_index;
> >      }
> >      else
> >      {
> > -        list_add(&svc->q_elem, &prv->depletedq);
> > +        // If we are inserting into previously empty depletedq, rearm
> > +        //   rearm replenish timer. It will handle further scheduling until
> > +        //   the depletedq is empty again (if ever)
> > +        if( list_empty(depletedq) )
> > +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
> 
> Hmm, can you handle the set_timer outside of this function? It should
> be possible and make this runq_insert() function serve for only one
> purpose.
> 
TBH, I think I like it being here. I mean, __runq_insert is called in
many places (at least right now), and I wouldn't like to have the check
and the arming repeated everywhere.

Then, that being said, I'm still hoping that some of these calls could
go away, and maybe there are call sites where we know that the timer
won't be armed in any case... So, we'll have to see about this.

So, just take this comment as "I don't dislike the timer machinery to be
in here". :-)

> > +static void replenishment_handler(void* data)
> > +{
> > +    const struct scheduler *ops = data;
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *depletedq = rt_depletedq(ops);
> > +    struct list_head *iter;
> > +    struct list_head *tmp;
> > +    struct rt_vcpu *svc = NULL;
> > +    s_time_t now = NOW();
> > +    int new_position = 0;
> > +    unsigned long flags;
> > +
> > +    spin_lock_irqsave(&prv->lock, flags);
> > +
> > +    // Replenish the vCPU that triggered this, and resort it into runq
> > +    list_for_each_safe(iter, tmp, depletedq)
> > +    {
> > +        svc = __q_elem(iter);
> > +        if ( now >= svc->cur_deadline )
> > +        {
> > +            rt_update_deadline(now, svc);
> > +            __q_remove(svc); /* remove from depleted queue */
> > +            new_position = __runq_insert(ops, svc); /* add to runq */
> > +        }
> > +        else break; // leave list_for_each_safe
> > +    }
> > +
> > +    // If we become one of top [# CPUs] in the runq, tickle it
> > +    // TODO: make this work when multiple tickles are required
> 
> Why do you need multiple tickles?
> 
Because the loop above may have been done more than one replenishment,
which makes sense.

While we are here. I've said that I don't like the fact that we have one
big spinlock for the whole scheduler. However, we do have it right now,
so let's make good use of it.

So (but please double check what I'm about to say, and provide your
view), it should be safe to access the various svc->vc->processor from
inside here, since we're holding the big lock. If that's the case, then
it should be fairly easy to figure out which are the pCPUs that needs to
reschedule (playing with the index, etc), and then use their
vc->processor to decide what pCPU to actually tickle, isn't this the
case?

(let me know if I've explained myself...)

> > @@ -427,6 +500,7 @@ static int
> >  rt_init(struct scheduler *ops)
> >  {
> >      struct rt_private *prv = xzalloc(struct rt_private);
> > +    int cpu = 0;
> >
> >      printk("Initializing RTDS scheduler\n"
> >             "WARNING: This is experimental software in development.\n"
> > @@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
> >      INIT_LIST_HEAD(&prv->runq);
> >      INIT_LIST_HEAD(&prv->depletedq);
> >
> > +    // Replenishment timer, for now always on CPU 0
> > +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
>
Oh, yes, I wanted to discuss this! It is true that in Xen, timers
are bound to cpus (as they are, e.g., in Linux, although both the interface
and the implementation are different).

So, for now, you're doing the right thing in just putting the
replenishment timer on one pCPU. We'll se how to improve this later on.

> > +    // Calculate number of pCPUs present.
> > +    // Note: assumes does not change online.
> 
> Dario, is this a valid assumption? I'm not sure if Xen supports cpu
> hot plug and if cpu-hotplug (in case xen supports it) will invalidate
> this assumption.
> 
It's not, but...

> > +    for_each_present_cpu(cpu)
> > +    {
> > +        ++(prv->NUM_CPUS);
> > +    }
> > +
> 
> This is incorrect. The for_each_present_cpu() list all cpus in the
> system. it enumerate the cpu_present_map.
> 
> What you want here is the number of cpus for the scheduler in the
> current cpupool. You can increase this number when  rt_alloc_pdata()
> is called.
> 
... yes, exactly, that's something rather easy to fix. Just use the pCPU
allocation and initialization routines (and the matching de-allocation
ones) to keep the count updated.

> > @@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >  {
> >      const int cpu = smp_processor_id();
> >      struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *runq = rt_runq(ops);
> >      struct rt_vcpu *const scurr = rt_vcpu(current);
> >      struct rt_vcpu *snext = NULL;
> >      struct task_slice ret = { .migrated = 0 };
> > @@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >      /* burn_budget would return for IDLE VCPU */
> >      burn_budget(ops, scurr, now);
> >
> > -    __repl_update(ops, now);
> > -
> >      if ( tasklet_work_scheduled )
> >      {
> >          snext = rt_vcpu(idle_vcpu[cpu]);
> > @@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >          }
> >      }
> >
> > -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> > +    /* Use minimum from runq */
> > +    ret.time = MAX_SCHEDULE;
> 
> why it is not snext->budget? The budget enforcement only need to be
> invoked when the VCPU scheduled to run runs out of budget;
> 
Yep, I was thinking to make right the same comment. :-)

Allow me to say this: thanks (to both Daegan and Meng) for doing this
work. It's been a pleasure to have so much an interesting architectural
discussion, and it's great to see results already. :-D

Regards,
Dario

PS. I've got a few other comments... I'll gather them and send a new
email...
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28 20:45           ` Meng Xu
@ 2015-07-06 17:37             ` Dario Faggioli
  0 siblings, 0 replies; 25+ messages in thread
From: Dario Faggioli @ 2015-07-06 17:37 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 1527 bytes --]

On Sun, 2015-06-28 at 13:45 -0700, Meng Xu wrote:
> 2015-06-28 13:17 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:

> > > The scenario in my mind that will potentially invoke one more rt_schedule  is:
> > > VCPU j currently runs out of budget and will have top priority once it
> > > get budget replenishment.
> > > If replenishment runs first, rt_schedule will be invoked for only once.
> > > If rt_schedule runs first and schedule a VCPU to run, rt_schedule will
> > > be invoked again when replenishment is invoked.
> >
> > This is a good point here. The ordering in this case doesn't seem to cause
> > any functional/behavior problems, but it will cause rt_schedule to run twice
> > when it could have been ran once. So, even as a corner case, it would seem
> > that its a performance corner case and not a behavior one.
> 
> Right.
> 
> Let's look forward to the reply from others, especially Dario's.  :-)
> 
I concur. There might be situations where more scheduler invocations than
in the ideal (theoretical?) case will be involved, but that's a problem
(in different ways) with any actual implementation (we discussed this at
length already :-) ).

But really, from a correctness point of view, I see no issues.

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-06-28 20:17         ` Dagaen Golomb
  2015-06-28 20:45           ` Meng Xu
@ 2015-07-06 17:41           ` Dario Faggioli
  2015-07-09 16:19             ` Dagaen Golomb
  1 sibling, 1 reply; 25+ messages in thread
From: Dario Faggioli @ 2015-07-06 17:41 UTC (permalink / raw)
  To: Dagaen Golomb
  Cc: Sisu Xi, George Dunlap, xen-devel, Meng Xu, Meng Xu, Chong Li


[-- Attachment #1.1: Type: text/plain, Size: 1015 bytes --]

On Sun, 2015-06-28 at 16:17 -0400, Dagaen Golomb wrote:
> It does take some thinking as to whether this will occur or not. I
> also am not sure
> if Xen will let the timer handlers preempt each other and, if so, if
> this is even a
> problem.
> 
I was about to tell you... But, really, if you want to try going there
and finding it yourself, I think it would be more useful, as you'll
learn something and you won't forget the answer after 5 minutes, as it
happens when someone just tells you something, without you knowing out
of own personal investigation, why that is the case. :-)

So, as you wish. If you just wanna know, ask again. If you fancy going down
in the code and check, that, IMO, would be best. :-D

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-06 17:30   ` Dario Faggioli
@ 2015-07-07  5:51     ` Meng Xu
  2015-07-07 14:03       ` Dario Faggioli
  2015-07-09 16:17     ` Dagaen Golomb
  1 sibling, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-07-07  5:51 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

[I agree with most of Dario's comment and only reply on the comments
that I didn't quite understand/agree.]

>> > @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>> >              struct rt_vcpu * iter_svc = __q_elem(iter);
>> >              if ( svc->cur_deadline <= iter_svc->cur_deadline )
>> >                      break;
>> > +            else
>> > +                ++inserted_index;
>> >           }
>> >          list_add_tail(&svc->q_elem, iter);
>> > +        return inserted_index;
>> >      }
>> >      else
>> >      {
>> > -        list_add(&svc->q_elem, &prv->depletedq);
>> > +        // If we are inserting into previously empty depletedq, rearm
>> > +        //   rearm replenish timer. It will handle further scheduling until
>> > +        //   the depletedq is empty again (if ever)
>> > +        if( list_empty(depletedq) )
>> > +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
>>
>> Hmm, can you handle the set_timer outside of this function? It should
>> be possible and make this runq_insert() function serve for only one
>> purpose.
>>
> TBH, I think I like it being here. I mean, __runq_insert is called in
> many places (at least right now), and I wouldn't like to have the check
> and the arming repeated everywhere.
>
> Then, that being said, I'm still hoping that some of these calls could
> go away, and maybe there are call sites where we know that the timer
> won't be armed in any case... So, we'll have to see about this.
>
> So, just take this comment as "I don't dislike the timer machinery to be
> in here". :-)

I see and agree. :-)


>
>> > +static void replenishment_handler(void* data)
>> > +{
>> > +    const struct scheduler *ops = data;
>> > +    struct rt_private *prv = rt_priv(ops);
>> > +    struct list_head *depletedq = rt_depletedq(ops);
>> > +    struct list_head *iter;
>> > +    struct list_head *tmp;
>> > +    struct rt_vcpu *svc = NULL;
>> > +    s_time_t now = NOW();
>> > +    int new_position = 0;
>> > +    unsigned long flags;
>> > +
>> > +    spin_lock_irqsave(&prv->lock, flags);
>> > +
>> > +    // Replenish the vCPU that triggered this, and resort it into runq
>> > +    list_for_each_safe(iter, tmp, depletedq)
>> > +    {
>> > +        svc = __q_elem(iter);
>> > +        if ( now >= svc->cur_deadline )
>> > +        {
>> > +            rt_update_deadline(now, svc);
>> > +            __q_remove(svc); /* remove from depleted queue */
>> > +            new_position = __runq_insert(ops, svc); /* add to runq */
>> > +        }
>> > +        else break; // leave list_for_each_safe
>> > +    }
>> > +
>> > +    // If we become one of top [# CPUs] in the runq, tickle it
>> > +    // TODO: make this work when multiple tickles are required
>>
>> Why do you need multiple tickles?
>>
> Because the loop above may have been done more than one replenishment,
> which makes sense.

Ah-ha, I got the reason. I should have read it twice to figure it out. :-)
>
> While we are here. I've said that I don't like the fact that we have one
> big spinlock for the whole scheduler. However, we do have it right now,
> so let's make good use of it.

OK.

>
> So (but please double check what I'm about to say, and provide your
> view), it should be safe to access the various svc->vc->processor from
> inside here, since we're holding the big log. If that's the case, then
> it should be fairly easy to figure out which are the pCPUs that needs to
> reschedule (playing with the index, etc), and then use their
> vc->processor to decide what pCPU to actually tickle, isn't this the
> case?

Hmm, I don't quite get what you meant here although I read it for
three times. :-(

Did you mean we can decide which PCPUs to tickle for the several
"highest" priority vcpus after their budgets get replenished?
If so, doesn't that mean that we will "duplicate" (part of) the
runq_tickle code to find the pCPUs to preempt? Is it better than the
approach that just call runq_tickle each time whenever a high priority
"ready" VCPU is found?

>
> (let me know if I've explained myself...)

Sure. I'm not sure I really got the idea of how to handle the multiple
tickles scenario you mentioned above. ;-(

>
> Allow me to say this: thanks (to both Daegan and Meng) for doing this
> work. It's been a pleasure to have so much an interesting architectural
> discussion, and it's great to see results already. :-D

Thank you very much for your always very helpful suggestions/advices! :-D

Best regards,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-07  5:51     ` Meng Xu
@ 2015-07-07 14:03       ` Dario Faggioli
  2015-07-08  5:56         ` Meng Xu
  0 siblings, 1 reply; 25+ messages in thread
From: Dario Faggioli @ 2015-07-07 14:03 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 12242 bytes --]

On Mon, 2015-07-06 at 22:51 -0700, Meng Xu wrote:

> >> > +static void replenishment_handler(void* data)
> >> > +{
> >> > +    const struct scheduler *ops = data;
> >> > +    struct rt_private *prv = rt_priv(ops);
> >> > +    struct list_head *depletedq = rt_depletedq(ops);
> >> > +    struct list_head *iter;
> >> > +    struct list_head *tmp;
> >> > +    struct rt_vcpu *svc = NULL;
> >> > +    s_time_t now = NOW();
> >> > +    int new_position = 0;
> >> > +    unsigned long flags;
> >> > +
> >> > +    spin_lock_irqsave(&prv->lock, flags);
> >> > +
> >> > +    // Replenish the vCPU that triggered this, and resort it into runq
> >> > +    list_for_each_safe(iter, tmp, depletedq)
> >> > +    {
> >> > +        svc = __q_elem(iter);
> >> > +        if ( now >= svc->cur_deadline )
> >> > +        {
> >> > +            rt_update_deadline(now, svc);
> >> > +            __q_remove(svc); /* remove from depleted queue */
> >> > +            new_position = __runq_insert(ops, svc); /* add to runq */
> >> > +        }
> >> > +        else break; // leave list_for_each_safe
> >> > +    }
> >> > +
> >> > +    // If we become one of top [# CPUs] in the runq, tickle it
> >> > +    // TODO: make this work when multiple tickles are required
> >>
> >> Why do you need multiple tickles?
> >>
> > Because the loop above may have been done more than one replenishment,
> > which makes sense.
> 
> Ah-ha, I got the reason.I should have read it twice to figure it out. :-)
> >
> > While we are here. I've said that I don't like the fact that we have one
> > big spinlock for the whole scheduler. However, we do have it right now,
> > so let's make good use of it.
> 
> OK.
> 
> >
> > So (but please double check what I'm about to say, and provide your
> > view), it should be safe to access the various svc->vc->processor from
> > inside here, since we're holding the big log. If that's the case, then
> > it should be fairly easy to figure out which are the pCPUs that needs to
> > reschedule (playing with the index, etc), and then use their
> > vc->processor to decide what pCPU to actually tickle, isn't this the
> > case?
> 
> Hmm, I don't quite get what you meant here although I read it for
> three times. :-(
> 
Yeah, sorry, my bad. :-/

> Did you mean we can decide which PCPUs to tickle for the several
> "highest" priority vcpus after their budgets get replenished?
> If so, doesn't that mean that we will "duplicate" (part of) the
> runq_tickle code to find the pCPUs to preempt? Is it better than the
> approach that just call runq_tickle each time whenever a high priority
> "ready" VCPU is found?
> 
I was just sketching a possible improvement, which as usual is something
trivially done, without actually writing the code. However, let me try
again.

Let's look at runq_tickle(). It is, right now, called from:
 (1) rt_vcpu_wake(), and it _does_belong_ in there (at least, the logic
     it implements does), but it's called in a way that I really don't
     like. In fact, there is a call to __repl_update(), followed by a
     __runq_pick(), and I think both should go away;
 (2) in rt_context_saved(), again, following a replenishment update and
     a pick. Same as above, I'd love for the update+pick to be kick out
     of here as well;
 (3) with Dagaen patch, it's called from the replenishment handler, and
     again I think the rescheduling request logic (i.e., what
     runq_tickle() currently implements) is fine in there, although how
     this is done needs to improve to support tickling multiple pCPUs.

Let's look at these one by one.

1) rt_vcpu_wake():
   You are waking up vCPU i, and you're doing it right until the call to
   __runq_insert(), where you insert the vCPU on the runq. But then, why
   do you call __repl_update()? Why, at every vCPU wakeup, you want to
   scan the whole runqueue updating everyone's budget? I mean, I (not
   completely, but, well...) understand why you've been doing this
   _until_now_, but after Dagaen patch, we have the replenishment timer,
   and replenishments will be done by the replenishment timer handler,
   so you don't need to do this in here any longer!

   So, basically, I would like Dagaen, as a part of this really cool
   work he's doing, to remove completely the __repl_update() and
   __runq_pick() logic from rt_vcpu_wake().  Then, it is ok to call
   runq_tickle(), but you'll be calling it like this:

     runq_tickle(ops, svc);

  i.e., you'll go checking whether the vCPU that just woke up needs to  
  preempt any other one currently running on a pCPU, AND THAT'S IT!
  We're dealing with a vCPU waking up, not with a wakeup+replenishment
  +pick+reschedule... Let's keep functions focused on their own purpose,
  as we were saying in the previous thread, ok? :-)

  So, as far as this function is concerned, it is ok to call
  runq_tickle(), and runq_tickle() is doing the right thing already, in
  the way it's currently implemented, provided it's called _on_ the
  waking vCPU, not on something else.

2) rt_context_saved()
   This looks a lot similar to (1). In fact, the replenishment update
   and the pick should be removed from here completely. In fact, with a 
   similar reasoning to the above case, what is this function doing? It
   is inserting a vCPU in the runq. It's actually rather similar to a
   wake-up, the only difference being the wake-up is an actual insertion
   (i.e., the vCPU --most times-- was not there before), while in this
   case it is a re-insertion (i.e., the vCPU was there already, but we
   needed to delay fiddling with it).

   So, indeed, everything just said in (1) applies here too, and I'd
   like this function to be restructured accordingly (kill
   __repl_update(), kill __runq_pick(), kill snext, and call
   runq_tickle(ops, svc)).

Now, allow me to stop for a sec, and make some considerations. Dagaen,
in his patch, is changing __runq_insert(), making it return the position
in the runqueue where the vCPU has been inserted. This is actually a
good thing, IMO. 

In different, and generally more scalable implementations of global EDF,
e.g., where you have multiple runqueues, and hence multiple spinlocks,
doing something like this is an absolute nightmare. More specifically,
what is really really difficult (not to implement, but to implement in a
way that does not defeat the scalability benefits of splitting
runqueues) is to actually have a reliable and low overhead way of
looking at other queues and/or pCPUs, to figure out what vCPUs are
running there, and whether or not the one we are dealing with (e.g.,
during a vCPU wakeup) should preempt any one of them. E.g., if you look
at the Linux EDF implementation, which uses different queues, there is a
whole data structure dedicate to that, implemented in its own source
file!

Here, everything is below the umbrella of the big, scheduler wide,
spinlock... So, really, it's rather easy to keep track of that, and that
is actually what runq_tickle() is doing (well, it does not keep track of
it, it goes figuring that out), and what Dagaen is doing by returning
the index is exactly another way to do that.

That's actually what I was (badly) trying to hint at in my previous
mail. Certainly, we don't want to duplicate code from runq_tickle() by
cutting-&-pasting it around... But maybe we can change things in a way
that they're even simpler than they are now (which is really what I
hoped, when thinking to the outcomes of this work, and at using this
design for it! :-D).

So, it looks to me that, as far as (1) and (2) are concerned, since we
are "just" inserting a vCPU in the runq, if we have M pCPUs, and we know
whether we inserted it within the first M spots, we already have what we
want, or am I missing something? And if __runq_insert() now (with Dagaen
patch) tells us this, well, we can simplify the tickling logic, can't
we?

With an example:
We are waking up (or re-inserting, in rt_context_saved()) vCPU j. We
have 6 pCPUs. __runq_insert() tells us that it put vCPU j at the 3rd
place in the runq. This means vCPU j should be set to run as soon as
possible. So, if vCPU j is 3rd in runq, either
 (a) there are only 3 runnable vCPUs (i.e., if we are waking up j, there
     were 2 of them, and j is the third; if we are in context_saved,
     there already where 3, and j just got it's deadline postponed, or
     someone else got its one replenished);
 (b) there are more than 3 runnable vCPUs, i.e., there is at least a 4th
     vCPU --say vCPU k-- in the runq, which was the 3rd before vCPU j
     was woken (or re-inserted), but now became the 4th, because
     deadline(j)<deadline(k).
In case (a), there are for sure idle pCPUs, and we should tickle one of
them. In case (b) there may be idle pCPUs (and, if that's the case, we
should tickle one of them, of course) or not. If not, we need to go
figure out which pCPU to tickle, which is exactly what runq_tickle()
does, but we at least know for sure that we want to tickle the pCPU
where vCPU k runs, or others where vCPUs with deadline greater than vCPU
k run.

Does this make sense? If yes, I think this means we can (and hence
should) restructure runq_tickle() in order to make it behave as above,
as I think it would be more quick and effective. It's probably necessary
to turn the for_each_cpu() loop in there, in a runq scan, but one that
only starts from a specific vCPU (the one that is ->next wrt the newly
inserted vCPU, k, in the example above), and stopping at the M-eth vCPU
in the queue. It's probably necessary to have not only the index, but
also an rt_vcpu pointer, from __runq_insert() (or maybe not, if svc is
already available, and we can check and traverse its q_elem->next
pointer), and to pass either (or both) to runq_tickle()... But this is
better determined while implementing things.

Honestly, I originally thought that runq_tickle() could be simplified
even more, by looking at the svc->vc->processor field of the vCPU that
our vCPU is inserted before. I see now, however, that this can't be
done, as we, not always tickle exactly the pCPU of the vCPU that is now
next to us (i.e., the pCPU where vCPU k runs, in the example), but we
may need to tickle the one that is running the latest deadline vCPU,
which may be another one.

Still, I think I gave enough material for an actual optimization. What
do you think?

Oh, just off the top of my head, I think that to implement the above
more easily and effectively, it would make sense to track the idle and
the already tickled pCPUs at a scheduler-wide level. If that reveals to
be true, have a look at how Credit2 is doing it, it's not that
difficult.

Let's go back to cases above, and look at (3). Well, right now, I don't
think I see much alternatives to putting the tickling in the loop, and
(potentially) issuing one call to runq_tickle() for each vCPU that an
execution instance of the replenishment handler actually replenishes.
That may not look cheap, but:
 - it won't be so common that we'll replenish tens of vCPUs in a single
   instance. Ideally, each execution of the handler will replenish (and
   hence insert) only one vCPU (and hence tickle only one pCPU), with
   the loop being there to cover for non-ideal scenarios due to
   overhead, etc;
 - with the optimization to the tickling logic suggested above, it
   should be a bit cheaper to issue multiple tickling calls;

So, I think I've said everything I wanted to say, and I hope this is a
bit more clear now.

Dagaen, Meng, any question?

I really think that, if we manage to implement all this, code quality
and performance would improve a lot. Oh, considering all the various and
(logically) different changes that I've hinted at, the patch needs to
become a patch series! :-D

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-07 14:03       ` Dario Faggioli
@ 2015-07-08  5:56         ` Meng Xu
  2015-07-08  8:01           ` Dario Faggioli
  2015-07-09 16:31           ` Dagaen Golomb
  0 siblings, 2 replies; 25+ messages in thread
From: Meng Xu @ 2015-07-08  5:56 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario,

(I understand and agree most of your comments, but have one concern
about your comment. I will comment below.)

Hi Dagaen,
Please comment on my comment if you have any concern/idea.

2015-07-07 7:03 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>
> On Mon, 2015-07-06 at 22:51 -0700, Meng Xu wrote:
>
> > >> > +static void replenishment_handler(void* data)
> > >> > +{
> > >> > +    const struct scheduler *ops = data;
> > >> > +    struct rt_private *prv = rt_priv(ops);
> > >> > +    struct list_head *depletedq = rt_depletedq(ops);
> > >> > +    struct list_head *iter;
> > >> > +    struct list_head *tmp;
> > >> > +    struct rt_vcpu *svc = NULL;
> > >> > +    s_time_t now = NOW();
> > >> > +    int new_position = 0;
> > >> > +    unsigned long flags;
> > >> > +
> > >> > +    spin_lock_irqsave(&prv->lock, flags);
> > >> > +
> > >> > +    // Replenish the vCPU that triggered this, and resort it into runq
> > >> > +    list_for_each_safe(iter, tmp, depletedq)
> > >> > +    {
> > >> > +        svc = __q_elem(iter);
> > >> > +        if ( now >= svc->cur_deadline )
> > >> > +        {
> > >> > +            rt_update_deadline(now, svc);
> > >> > +            __q_remove(svc); /* remove from depleted queue */
> > >> > +            new_position = __runq_insert(ops, svc); /* add to runq */
> > >> > +        }
> > >> > +        else break; // leave list_for_each_safe
> > >> > +    }
> > >> > +
> > >> > +    // If we become one of top [# CPUs] in the runq, tickle it
> > >> > +    // TODO: make this work when multiple tickles are required
> > >>
> > >> Why do you need multiple tickles?
> > >>
> > > Because the loop above may have been done more than one replenishment,
> > > which makes sense.
> >
> > Ah-ha, I got the reason. I should have read it twice to figure it out. :-)
> > >
> > > While we are here. I've said that I don't like the fact that we have one
> > > big spinlock for the whole scheduler. However, we do have it right now,
> > > so let's make good use of it.
> >
> > OK.
> >
> > >
> > > So (but please double check what I'm about to say, and provide your
> > > view), it should be safe to access the various svc->vc->processor from
> > > inside here, since we're holding the big log. If that's the case, then
> > > it should be fairly easy to figure out which are the pCPUs that needs to
> > > reschedule (playing with the index, etc), and then use their
> > > vc->processor to decide what pCPU to actually tickle, isn't this the
> > > case?
> >
> > Hmm, I don't quite get what you meant here although I read it for
> > three times. :-(
> >
> Yeah, sorry, my bad. :-/

No problem at all. :-) Thank you so much for your such detailed
explanation!  :-D

>
> > Did you mean we can decide which PCPUs to tickle for the several
> > "highest" priority vcpus after their budgets get replenished?
> > If so, doesn't that mean that we will "duplicate" (part of) the
> > runq_tickle code to find the pCPUs to preempt? Is it better than the
> > approach that just call runq_tickle each time whenever a high priority
> > "ready" VCPU is found?
> >
> I was just sketching a possible improvement, which as usual is something
> ttivially done, without actually writing the code. However, let me try
> again.

Thank you very much! This is "really" helpful and explains a lot!

>
> Let's look at runq_tickle(). It is, right now, called from:
>  (1) rt_vcpu_wake(), and it _does_belong_ in there (at least, the logic
>      it implements does), but it's called in a way that I really don't
>      like. In fact, there is a call to __repl_update(), followed by a
>      __runq_pick(), and I think both should go away;
>  (2) in rt_context_saved(), again, following a replenishment update and
>      a pick. Same as above, I'd love for the update+pick to be kick out
>      of here as well;
>  (3) with Dagaen patch, it's called from the replenishment handler, and
>      again I think the rescheduling request logic (i.e., what
>      runq_tickle() currently implements) is fine in there, although how
>      this is done needs to improve to support tickling multiple pCPUs.

Generally speaking, I agree that runq_tickle can go into these three
functions and the implementation will be a little different but more
efficient.

>
> Let's look at these one by one.
>
> 1) rt_vcpu_wake():
>    You are waking up vCPU i, and you're doing it right until the call to
>    __runq_insert(), where you insert the vCPU on the runq. But then, why
>    do you call __repl_update()? Why, at every vCPU wakeup, you want to
>    scan the whole runqueue updating everyone's budget? I mean, I (not
>    completely, but, well...) understand why you've been doing this
>    _until_now_, but after Dagaen patch, we have the replenishment timer,
>    and replenishments will be done by the replenishment timer handler,
>    so you don't need to do this in here any longer!
>
>    So, basically, I would like Dagaen, as a part of this really cool
>    work he's doing, to remove completely the __repl_update() and
>    __runq_pick() logic from rt_vcpu_wake().  Then, it is ok to call
>    runq_tickle(), but you'll be calling it like this:
>
>      runq_tickle(ops, svc);
>
>   i.e., you'll go checking whether the vCPU that just woke up needs to
>   preempt any other one currently running on a pCPU, AND THAT'S IT!
>   We're dealing with a vCPU waking up, not with a wakeup+replenishment
>   +pick+reschedule... Let's keep functions focused on their own purpose,
>   as we were saying in the previous thread, ok? :-)
>
>   So, as far as this function is concerned, it is ok to call
>   runq_tickle(), and runq_tickle() is doing the right thing already, in
>   the way it's currently implemented, provided it's called _on_ the
>   waking vCPU, not on something else.

Agree! Thanks! :-)

>
> 2) rt_context_saved()
>    This looks a lot similar to (1). In fact, the replenishment update
>    and the pick should be removed from here completely. In fact, with a
>    similar reasoning to the above case, what is this function doing? It
>    is inserting a vCPU in the runq. It's actually rather similar to a
>    wake-up, the only difference being the wake-up is an actual insertion
>    (i.e., the vCPU --most times-- was not there before), while in this
>    case it is a re-insertion (i.e., the vCPU was there already, but we
>    needed to delay fiddling with it).
>
>    So, indeed, everything just said in (1) applies here too, and I'd
>    like this function to be restructured accordingly (kill
>    __repl_update(), kill __runq_pick(), kill snext, and call
>    runq_tickle(ops, svc)).
>
> Now, allow me to stop for a sec, and make some considerations. Dagaen,
> in his patch, is changing __runq_insert(), making it return the position
> in the runqueue where the vCPU has been inserted. This is actually a
> good thing, IMO.
>
> In different, and generally more scalable implementations of global EDF,
> e.g., where you have multiple runqueues, and hence multiple spinlocks,
> doing something like this is an absolute nightmare. More specifically,
> what is really really difficult (not to implement, but to implement in a
> way that does not defeat the scalability benefits of splitting
> runqueues) is to actually have a reliable and low overhead way of
> looking at other queues and/or pCPUs, to figure out what vCPUs are
> running there, and whether or not the one we are dealing with (e.g.,
> during a vCPU wakeup) should preempt any one of them. E.g., if you look
> at the Linux EDF implementation, which uses different queues, there is a
> whole data structure dedicate to that, implemented in its own source
> file!
>
> Here, everything is below the umbrella of the big, scheduler wide,
> spinlock... So, really, it's rather easy to keep track of that, and that
> is actually what runq_tickle() is doing (well, it does not keep track of
> it, it goes figuring that out), and what Dagaen is doing by returning
> the index is exactly another way to do that.
>
> That's actually what I was (badly) trying to hint at in my previous
> mail. Certainly, we don't want to duplicate code from runq_tickle() by
> cutting-&-pasting it around... But maybe we can change things in a way
> that they're even simpler than they are now (which is really what I
> hoped, when thinking to the outcomes of this work, and at using this
> design for it! :-D).

Agree up to this point.

>
> So, it looks to me that, as far as (1) and (2) are concerned, since we
> are "just" inserting a vCPU in the runq, if we have M pCPUs, and we know
> whether we inserted it within the first M spots, we already have what we
> want, or am I missing something? And if __runq_insert() now (with Dagaen
> patch) tells us this, well, we can simplify the tickling logic, can't
> we?

I think you might assume that the first M VCPUs  in the runq are the
current running VCPUs on the M pCPUs. Am I correct? (From what you
described in the following example, I think I'm correct. ;-) )

>
> With an example:
> We are waking up (or re-inserting, in rt_context_saved()) vCPU j. We
> have 6 pCPUs. __runq_insert() tells us that it put vCPU j at the 3rd
> place in the runq. This means vCPU j should be set to run as soon as
> possible. So, if vCPU j is 3rd in runq, either
>  (a) there are only 3 runnable vCPUs (i.e., if we are waking up j, there
>      were 2 of them, and j is the third; if we are in context_saved,
> >      there already were 3, and j just got its deadline postponed, or
>      someone else got its one replenished);
>  (b) there are more than 3 runnable vCPUs, i.e., there is at least a 4th
>      vCPU --say vCPU k-- in the runq, which was the 3rd before vCPU j
>      were woken (or re-inserted), but now became the 4th, because
>      deadline(j)<deadline(k).
> In case (a), there are for sure idle pCPUs, and we should tickle one of
> them.

I can tell that you make the above assumption from here.

However, in the current implementation, runq does not hold the current
running VCPUs on the pCPUs. We remove the vcpu from runq in
rt_schedule() function. What you described above make perfect sense
"if" we decide to make runq hold the current running VCPUs.

Actually, after thinking about the example you described, I think we
can hold the current running VCPUs *and* the current idle pCPUs in the
scheduler-wide structure; In other words, we can have another runningq
(not runq) and a idle_pcpu list in the rt_private; Now all VCPUs are
stored in three queues: runningq, runq, and depletedq, in increasing
priority order.

When we make the tickle decision, we only need to scan the idle_pcpu
and then runningq to figure out which pCPU to tickle. All of other
design you describe still hold here, except that the position where a
VCPU is inserted into runq cannot directly give us which pCPU to
tickle. What do you think?


> In case (b) there may be idle pCPUs (and, if that's the case, we
> should tickle one of them, of course) or not. If not, we need to go
> figure out which pCPU to tickle, which is exactly what runq_tickle()
> does, but we at least know for sure that we want to tickle the pCPU
> where vCPU k runs, or others where vCPUs with deadline greater than vCPU
> k run.
>
> Does this make sense?

Yes, if we decide to hold the currently running VCPUs in
scheduler-wide structure: it can be runq or runningq.

> If yes, I think this means we can (and hence
> should) restructure runq_tickle() in order to make it behave as above,
> as I think it would be more quick and effective. It's probably necessary
> to turn the for_each_cpu() loop in there, in a runq scan, but one that
> only starts from a specific vCPU (the one that is ->next wrt the newly
> inserted vCPU, k, in the example above), and stopping at the M-eth vCPU
> in the queue. It's probably necessary to have not only the index, but
> also an rt_vcpu pointer, from __runq_insert() (or maybe not, if svc is
> already available, and we can check and traverse its q_elem->next
> pointer), and to pass either (or both) to runq_tickle()... But this is
> better determined while implementing things.
>
> Honestly, I originally thought that runq_tickle() could be simplified
> even more, by looking at the svc->vc->processor field of the vCPU that
> our vCPU is inserted before. I see now, however, that this can't be
> done, as we, not always tickle exactly the pCPU of the vCPU that is now
> next to us (i.e., the pCPU where vCPU k runs, in the example), but we
> may need to tickle the one that is running the latest deadline vCPU,
> which may be another one.

Right. any pCPU might be tickled. It depends on which vCPU is running
on it and what the priority the vCPU has.

>
> Still, I think I gave enough material for an actual optimization. What
> do you think?

Yes. It is very clear.
The only thing is how we are going to "realize" your assumption. :-)
I'm not so sure if we do need the runningq  to hold the current
running VCPUs since we can still use "for_each_cpu()" to find all
pCPUs used in the scheduler and get the current running VCPU on it.
Adding a new field to hold all such running VCPUs might help speed up
the searching, but need more space. So there is a balance between
space and time.

What is your opinion?  :-)


>
> Oh, just out of the top of my head, I think that to implement the above
> more easily and effectively, it would make sense to track the idle and
> the already tickled pCPUs at a scheduler-wide level. If that reveals to
> be true, have a look at how Credit2 is doing it, it's not that
> difficult.

Right.

>
> Let's go back to cases above, and look at (3). Well, right now, I don't
> think I see much alternatives to putting the tickling in the loop, and
> (potentially) issuing one call to runq_tickle() for each vCPU that an
> execution instance of the replenishment handler actually replenishes.
> That may not look cheap, but:
>  - it won't be so common that we'll replenish tens of vCPUs in a single
>    instance. Ideally, each execution of the handler will replenish (and
>    hence insert) only one vCPU (and hence tickle only one pCPU), with
>    the loop being there to cover for non-ideal scenarios due to
>    overhead, etc;
>  - with the optimization to the tickling logic suggested above, it
>    should be a bit cheaper to issue multiple tickling calls;
>
> So, I think I've said everything I wanted to say, and I hope this is a
> bit more clear now.

Thank you again! It is very clear and I'm clear which part is unclear now. :-D

>
> Dagaen, Meng, any question?
>
> I really think that, if we manage to implement all this, code quality
> and performance would improve a lot. Oh, considering all the various and
> (logically) different changes that I've hinted at, the patch needs to
> become a patch series! :-D

Sure! Dagaen, what do you think?

>
> Thanks and Regards,

Thank you for your guidance and best regards,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-08  5:56         ` Meng Xu
@ 2015-07-08  8:01           ` Dario Faggioli
  2015-07-09  3:46             ` Meng Xu
  2015-07-09 16:35             ` Dagaen Golomb
  2015-07-09 16:31           ` Dagaen Golomb
  1 sibling, 2 replies; 25+ messages in thread
From: Dario Faggioli @ 2015-07-08  8:01 UTC (permalink / raw)
  To: Meng Xu
  Cc: Sisu Xi, George Dunlap, xen-devel, mengxu, Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 6114 bytes --]

[Trimming the Cc-list a bit, to avoid bothering Wei and Jan]

On Tue, 2015-07-07 at 22:56 -0700, Meng Xu wrote:
> Hi Dario,
> 
Hi,

> 2015-07-07 7:03 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
> >
> > On Mon, 2015-07-06 at 22:51 -0700, Meng Xu wrote:

> > So, it looks to me that, as far as (1) and (2) are concerned, since we
> > are "just" inserting a vCPU in the runq, if we have M pCPUs, and we know
> > whether we inserted it within the first M spots, we already have what we
> > want, or am I missing something? And if __runq_insert() now (with Dagaen
> > patch) tells us this, well, we can simplify the tickling logic, can't
> > we?
> 
> I think you might assume that the first M VCPUs  in the runq are the
> current running VCPUs on the M pCPUs. Am I correct? (From what you
> described in the following example, I think I'm correct. ;-) )
> 
Mmm... Interesting. Yes, I was. I was basing this assumption on this
chunk on Dagaen's patch:

    // If we become one of top [# CPUs] in the runq, tickle it
    // TODO: make this work when multiple tickles are required
    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
        runq_tickle(ops, svc);

And forgot (and did not go check) about the __q_remove() in
rt_schedule(). My bad again.

But then, since we don't have the running vCPUs in the runq, how the
code above is supposed to be correct?

> > With an example:
> > We are waking up (or re-inserting, in rt_context_saved()) vCPU j. We
> > have 6 pCPUs. __runq_insert() tells us that it put vCPU j at the 3rd
> > place in the runq. This means vCPU j should be set to run as soon as
> > possible. So, if vCPU j is 3rd in runq, either
> >  (a) there are only 3 runnable vCPUs (i.e., if we are waking up j, there
> >      were 2 of them, and j is the third; if we are in context_saved,
> >      there already were 3, and j just got its deadline postponed, or
> >      someone else got its one replenished);
> >  (b) there are more than 3 runnable vCPUs, i.e., there is at least a 4th
> >      vCPU --say vCPU k-- in the runq, which was the 3rd before vCPU j
> >      were woken (or re-inserted), but now became the 4th, because
> >      deadline(j)<deadline(k).
> > In case (a), there are for sure idle pCPUs, and we should tickle one of
> > them.
> 
> I tell that you make the above assumption from here.
> 
> However, in the current implementation, runq does not hold the current
> running VCPUs on the pCPUs. We remove the vcpu from runq in
> rt_schedule() function. What you described above make perfect sense
> "if" we decide to make runq hold the current running VCPUs.
> 
Yep. And it indeed seems to me that we may well think about doing so. It
will make it possible to base on the position for making/optimizing
scheduling decisions, and at the same time I don't think I see much
downsides in that, do you?

> Actually, after thinking about the example you described, I think we
> can hold the current running VCPUs *and* the current idle pCPUs in the
> scheduler-wide structure; 
>
What do you mean with 'current idle pCPUs'? I said something similar as
well, and what I meant was a cpumask with bit i set if the i-th pCPU is
idle, do you also mean this?

About the running vCPUs, why just not leave them in the actual runq?

> In other words, we can have another runningq
> (not runq) and a idle_pcpu list in the rt_private; Now all VCPUs are
> stored in three queues: runningq, runq, and depletedq, in increasing
> priority order.
> 
Perhaps, but I'm not sure I see the need for another list. Again, why
just not leave them in runq? I appreciate this is a rather big  change
(although, perhaps it looks bigger said than done), but I think it could
be worth pursuing.

For double checking, asserting, and making sure that we are able to
identify the running svc-s, we have the __RTDS_scheduled flag.

> When we make the tickle decision, we only need to scan the idle_pcpu
> and then runningq to figure out which pCPU to tickle. All of other
> design you describe still hold here, except that the position where a
> VCPU is inserted into runq cannot directly give us which pCPU to
> tickle. What do you think?
> 
I think that I'd like to know why you think adding another queue is
necessary, instead of just leaving the vCPUs in the actual runq. Is
there something bad about that which I'm missing?

> > In case (b) there may be idle pCPUs (and, if that's the case, we
> > should tickle one of them, of course) or not. If not, we need to go
> > figure out which pCPU to tickle, which is exactly what runq_tickle()
> > does, but we at least know for sure that we want to tickle the pCPU
> > where vCPU k runs, or others where vCPUs with deadline greater than vCPU
> > k run.
> >
> > Does this make sense?
> 
> Yes, if we decide to hold the currently running VCPUs in
> scheduler-wide structure: it can be runq or runningq.
> 
Yes, but if we use two queues, we defeat at least part of this
optimization/simplification.

> > Still, I think I gave enough material for an actual optimization. What
> > do you think?
> 
> Yes. It is very clear.
> The only thing is how we are going to "realize" your assumption. :-)
>
:-D

> I'm not so sure if we do need the runningq  to hold the current
> running VCPUs since we can still use "for_each_cpu()" to find all
> pCPUs used in the scheduler and get the current running VCPU on it.
> Adding a new field to hold all such running VCPUs might help speed up
> the searching, but need more space. So there is a balance between
> space and time.
> 
I don't think the added space would be a problem, but I don't see why we
need it.

> Thank you for your guidance and best regards,
> 
Thanks for you for actually doing this work. :-)

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-08  8:01           ` Dario Faggioli
@ 2015-07-09  3:46             ` Meng Xu
  2015-07-09 16:39               ` Dagaen Golomb
  2015-07-10 10:07               ` Dario Faggioli
  2015-07-09 16:35             ` Dagaen Golomb
  1 sibling, 2 replies; 25+ messages in thread
From: Meng Xu @ 2015-07-09  3:46 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Sisu Xi, George Dunlap, xen-devel, mengxu, Chong Li, Dagaen Golomb

2015-07-08 1:01 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
> [Trimming the Cc-list a bit, to avoid bothering Wei and Jan]
>
> On Tue, 2015-07-07 at 22:56 -0700, Meng Xu wrote:
>> Hi Dario,
>>
> Hi,
>
>> 2015-07-07 7:03 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:
>> >
>> > On Mon, 2015-07-06 at 22:51 -0700, Meng Xu wrote:
>
>> > So, it looks to me that, as far as (1) and (2) are concerned, since we
>> > are "just" inserting a vCPU in the runq, if we have M pCPUs, and we know
>> > whether we inserted it within the first M spots, we already have what we
>> > want, or am I missing something? And if __runq_insert() now (with Dagaen
>> > patch) tells us this, well, we can simplify the tickling logic, can't
>> > we?
>>
>> I think you might assume that the first M VCPUs  in the runq are the
>> current running VCPUs on the M pCPUs. Am I correct? (From what you
>> described in the following example, I think I'm correct. ;-) )
>>
> Mmm... Interesting. Yes, I was. I was basing this assumption on this
> chunk on Dagaen's patch:
>
>     // If we become one of top [# CPUs] in the runq, tickle it
>     // TODO: make this work when multiple tickles are required
>     if ( new_position > 0 && new_position <= prv->NUM_CPUS )
>         runq_tickle(ops, svc);
>
> And forgot (and did not go check) about the __q_remove() in
> rt_schedule(). My bad again.

Actually this assumption is good! :-D

>
> But then, since we don't have the running vCPUs in the runq, how the
> code above is supposed to be correct?

That's why at the very beginning of this thread, I asked Dagaen why we
need to return the position where a VCPU is inserted in the runq.
Without this assumption, I don't see how we can use the returned
position of the vcpu in runq in the current patch could be useful.
But with this assumption, it could be very useful!


>
>> > With an example:
>> > We are waking up (or re-inserting, in rt_context_saved()) vCPU j. We
>> > have 6 pCPUs. __runq_insert() tells us that it put vCPU j at the 3rd
>> > place in the runq. This means vCPU j should be set to run as soon as
>> > possible. So, if vCPU j is 3rd in runq, either
>> >  (a) there are only 3 runnable vCPUs (i.e., if we are waking up j, there
>> >      were 2 of them, and j is the third; if we are in context_saved,
>> >      there already were 3, and j just got its deadline postponed, or
>> >      someone else got its one replenished);
>> >  (b) there are more than 3 runnable vCPUs, i.e., there is at least a 4th
>> >      vCPU --say vCPU k-- in the runq, which was the 3rd before vCPU j
>> >      were woken (or re-inserted), but now became the 4th, because
>> >      deadline(j)<deadline(k).
>> > In case (a), there are for sure idle pCPUs, and we should tickle one of
>> > them.
>>
>> I tell that you make the above assumption from here.
>>
>> However, in the current implementation, runq does not hold the current
>> running VCPUs on the pCPUs. We remove the vcpu from runq in
>> rt_schedule() function. What you described above make perfect sense
>> "if" we decide to make runq hold the current running VCPUs.
>>
> Yep. And it indeed seems to me that we may well think about doing so. It
> will make it possible to base on the position for making/optimizing
> scheduling decisions, and at the same time I don't think I see much
> downsides in that, do you?

No. Actually, I think it is better to keep the running VCPUs in the
runq (not running queue) so that we can better use this position of
VCPU in the runq. This could speed up the process to find a CPU to
tickle.

>
>> Actually, after thinking about the example you described, I think we
>> can hold the current running VCPUs *and* the current idle pCPUs in the
>> scheduler-wide structure;
>>
> What do you mean with 'current idle pCPUs'? I said something similar as
> well, and what I meant was a cpumask with bit i set if the i-th pCPU is
> idle, do you also mean this?

Yes.

>
> About the running vCPUs, why just not leave them in the actual runq?
>
>> In other words, we can have another runningq
>> (not runq) and a idle_pcpu list in the rt_private; Now all VCPUs are
>> stored in three queues: runningq, runq, and depletedq, in increasing
>> priority order.
>>
> Perhaps, but I'm not sure I see the need for another list. Again, why
> just not leave them in runq?

When I proposed the runningq, I was thinking about the situation when
we decide to split the (old) runq in RT-Xen to the (current) runq and
depletedq; I was thinking the same reason why we split to runq and
depletedq may still hold here when we add runningq.

But after thinking twice, maybe runq approach is a better way since it
just make the position information more meaningful. As you described
in the previous email, we can compare the position of a vcpu inserted
into the runq with the number of pCPUs so as to quickly know which
pCPU to tickle.

> I appreciate this is a rather big  change
> (although, perhaps it looks bigger said than done), but I think it could
> be worth pursuing.

It is worth pursuing since it simplifies the cpu tickling process a lot.

>
> For double checking, asserting, and making sure that we are able to
> identify the running svc-s, we have the __RTDS_scheduled flag.

Yes. In rt_schedule(), we set the flag for the next vcpu
"set_bit(__RTDS_scheduled, &snext->flags);"

>
>> When we make the tickle decision, we only need to scan the idle_pcpu
>> and then runningq to figure out which pCPU to tickle. All of other
>> design you describe still hold here, except that the position where a
>> VCPU is inserted into runq cannot directly give us which pCPU to
>> tickle. What do you think?
>>
> I think that I'd like to know why you think adding another queue is
> necessary, instead of just leaving the vCPUs in the actual runq. Is
> there something bad about that which I'm missing?

Actually, nothing bad. I just recalled the situation when we split a
> runq to runq and depletedq. I was thinking it might be the case here
as well. (Yes, it is different here since we can get more useful
information to tickle cpu if we put vCPUs into runq instead of adding
one more queue.) :-)
>
>> > In case (b) there may be idle pCPUs (and, if that's the case, we
>> > should tickle one of them, of course) or not. If not, we need to go
>> > figure out which pCPU to tickle, which is exactly what runq_tickle()
>> > does, but we at least know for sure that we want to tickle the pCPU
>> > where vCPU k runs, or others where vCPUs with deadline greater than vCPU
>> > k run.
>> >
>> > Does this make sense?
>>
>> Yes, if we decide to hold the currently running VCPUs in
>> scheduler-wide structure: it can be runq or runningq.
>>
> Yes, but if we use two queues, we defeat at least part of this
> optimization/simplification.

Agree. Thanks! :-D

>
>> > Still, I think I gave enough material for an actual optimization. What
>> > do you think?
>>
>> Yes. It is very clear.
>> The only thing is how we are going to "realize" your assumption. :-)
>>
> :-D
>
>> I'm not so sure if we do need the runningq  to hold the current
>> running VCPUs since we can still use "for_each_cpu()" to find all
>> pCPUs used in the scheduler and get the current running VCPU on it.
>> Adding a new field to hold all such running VCPUs might help speed up
>> the searching, but need more space. So there is a balance between
>> space and time.
>>
> I don't think the added space would be a problem, but I don't see why we
> need it.

If we don't add another queue, no extra space. So we get free lunch here. :-)

Thanks and best regards,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-06 17:30   ` Dario Faggioli
  2015-07-07  5:51     ` Meng Xu
@ 2015-07-09 16:17     ` Dagaen Golomb
  1 sibling, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-07-09 16:17 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Wei Liu, Sisu Xi, George Dunlap, xen-devel, Meng Xu, mengxu,
	Jan Beulich, Chong Li

>> Can you summarize the design discussion with Dario on the first
>> version of the patch? The above information is ok, but it does not say
>> much about the design. As you can see in our discussion with Dario,
>> there exist several design choices to achieve this. Explicitly state
>> it here can help people understand what this patch is doing.
>>
> Yes, that would be helpful. It's not required to summarize all the
> various ML thread in the actual changelog(s), but, really, a summary and
> maybe a few links and pointers in the cover letter, would be useful, to
> give interested people that may join late the chance to get some
> context.
>
> Note that, even for single patch and not only for series, it is ok to
> include a cover letter (i.e., a [PATCH 0/x]), e.g., right for this
> purpose.
>
> So, as I was saying, I like the architecture now. There are issues, or
> at least I have some questions about specific decisions or code chunks,
> but I think this is on the right track now.

I'll strengthen the summary/cover letter on future patches.

>> > +    struct timer replenishment_timer;
>> > +    unsigned NUM_CPUS;
>>
>> First, it does not follow the convention of variable names. UPPER CASE
>> should be the macro definition and a constant, which is not the case.
>> You can use a more proper name, such as ncpus, here and add what it is
>> used for.
>>
> Agreed.

Will change.

>> >      cpumask_t tickled;          /* cpus been tickled */
>> >  };
>> >
>> > @@ -178,6 +181,8 @@ struct rt_vcpu {
>> >      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
>> >  };
>> >
>> > +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
>>
>> Not sure if declaring the function is the best approach or having
>> another patch to move the function forward. But anyway, we don't have
>> to worry about this yet right now, considering there are more
>> important issues to tackle for this patch.
>>
> Sure.
>
> Personally, I would like moving the function better (in a separate
> patch). Forward declarations make it (slightly, yes, but still...) more
> difficult to find (and switch, like in editors) between call sites and
> where the actual code is.

I'd like to move it, but as mentioned that can be a separate patch.

>> > @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>> >              struct rt_vcpu * iter_svc = __q_elem(iter);
>> >              if ( svc->cur_deadline <= iter_svc->cur_deadline )
>> >                      break;
>> > +            else
>> > +                ++inserted_index;
>> >           }
>> >          list_add_tail(&svc->q_elem, iter);
>> > +        return inserted_index;
>> >      }
>> >      else
>> >      {
>> > -        list_add(&svc->q_elem, &prv->depletedq);
>> > +        // If we are inserting into previously empty depletedq, rearm
>> > +        //   rearm replenish timer. It will handle further scheduling until
>> > +        //   the depletedq is empty again (if ever)
>> > +        if( list_empty(depletedq) )
>> > +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
>>
>> Hmm, can you handle the set_timer outside of this function? It should
>> be possible and make this runq_insert() function serve for only one
>> purpose.
>>
> TBH, I think I like it being here. I mean, __runq_insert is called in
> many places (at least right now), and I wouldn't like to have the check
> and the arming repeated everywhere.
>
> Then, that being said, I'm still hoping that some of these calls could
> go away, and maybe there are call sites where we know that the timer
> won't be armed in any case... So, we'll have to see about this.
>
> So, just take this comment as "I don't dislike the timer machinery to be
> in here". :-)

I thought it was an acceptable way of accomplishing this. We can focus on
trimming more later if this doesn't seem ideal.


>> > +    // If we become one of top [# CPUs] in the runq, tickle it
>> > +    // TODO: make this work when multiple tickles are required
>>
>> Why do you need multiple tickles?
>>
> Because the loop above may have done more than one replenishment,
> which makes sense.
>
> While we are here. I've said that I don't like the fact that we have one
> big spinlock for the whole scheduler. However, we do have it right now,
> so let's make good use of it.
>
> So (but please double check what I'm about to say, and provide your
> view), it should be safe to access the various svc->vc->processor from
> inside here, since we're holding the big lock. If that's the case, then
> it should be fairly easy to figure out which are the pCPUs that need to
> reschedule (playing with the index, etc), and then use their
> vc->processor to decide what pCPU to actually tickle, isn't this the
> case?
>
> (let me know if I've explained myself...)

>> > +    // Replenishment timer, for now always on CPU 0
>> > +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
>>
> Oh, yes, I wanted to discuss about this! It is true that in Xen, timers
> are bound to cpus (unlike, e.g., in Linux, where both the interface and the
> implementation are different).
>
> So, for now, you're doing the right thing in just putting the
> replenishment timer on one pCPU. We'll se how to improve this later on.

I expected this would be a possible later improvement.

>> > +    // Calculate number of pCPUs present.
>> > +    // Note: assumes does not change online.
>>
>> Dario, is this a valid assumption? I'm not sure if Xen supports cpu
>> hot plug and if cpu-hotplug (in case xen supports it) will invalidate
>> this assumption.
>>
> It's not, but...
>
>> > +    for_each_present_cpu(cpu)
>> > +    {
>> > +        ++(prv->NUM_CPUS);
>> > +    }
>> > +
>>
>> This is incorrect. The for_each_present_cpu() list all cpus in the
>> system. it enumerate the cpu_present_map.
>>
>> What you want here is the number of cpus for the scheduler in the
>> current cpupool. You can increase this number when  rt_alloc_pdata()
>> is called.
>>
> ... yes, exactly, that's something rather easy to fix. Just use the pCPU
> allocation and initialization routines (and the matching de-allocation
> ones) to keep the count updated.

Good idea. I knew it wasn't a purely valid assumption but wanted to
get a working patch out.

>> > -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>> > +    /* Use minimum from runq */
>> > +    ret.time = MAX_SCHEDULE;
>>
>> why it is not snext->budget? The budget enforcement only need to be
>> invoked when the VCPU scheduled to run runs out of budget;
>>
> Yep, I was thinking to make right the same comment. :-)

I believe I put that there with the intention of having some
invocations even with an empty queue,
but this may be a bug. I don't think we should need a minimum
invocation period so I may
remove this and the associated constant.

Regards
~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-06 17:41           ` Dario Faggioli
@ 2015-07-09 16:19             ` Dagaen Golomb
  0 siblings, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-07-09 16:19 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Sisu Xi, George Dunlap, xen-devel, Meng Xu, Meng Xu, Chong Li

>> It does take some thinking as to whether this will occur or not. I
>> also am not sure
>> if Xen will let the timer handlers preempt each other and, if so, if
>> this is even a
>> problem.
>>
> I was about to tell you... But, really, if you want to try going there
> and finding it yourself, I think it would be more useful, as you'll
> learn something and you won't forget the answer after 5 minutes, as it
> happens when someone just tells you something, without you knowing out
> of own personal investigation, why that is the case. :-)
>
> So, as you wish. If you just wanna know, as again. If you fancy go down
> in the code and check, that, IMO, would be best. :-D

I will research myself. :)

Regards,
~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-08  5:56         ` Meng Xu
  2015-07-08  8:01           ` Dario Faggioli
@ 2015-07-09 16:31           ` Dagaen Golomb
  1 sibling, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-07-09 16:31 UTC (permalink / raw)
  To: Meng Xu
  Cc: Wei Liu, Sisu Xi, George Dunlap, Dario Faggioli, xen-devel,
	mengxu, Jan Beulich, Chong Li

> I think you might assume that the first M VCPUs  in the runq are the
> current running VCPUs on the M pCPUs. Am I correct? (From what you
> described in the following example, I think I'm correct. ;-) )

This is an assumption he gathered from my implementation, because I
wrote code that uses this "fact." Right now the code is wrong since
current vcpus are not kept in the queue.

> I tell that you make the above assumption from here.
>
> However, in the current implementation, runq does not hold the current
> running VCPUs on the pCPUs. We remove the vcpu from runq in
> rt_schedule() function. What you described above make perfect sense
> "if" we decide to make runq hold the current running VCPUs.
>
> Actually, after thinking about the example you described, I think we
> can hold the current running VCPUs *and* the current idle pCPUs in the
> scheduler-wide structure; In other words, we can have another runningq
> (not runq) and a idle_pcpu list in the rt_private; Now all VCPUs are
> stored in three queues: runningq, runq, and depletedq, in increasing
> priority order.
>
> When we make the tickle decision, we only need to scan the idle_pcpu
> and then runningq to figure out which pCPU to tickle. All of other
> design you describe still hold here, except that the position where a
> VCPU is inserted into runq cannot directly give us which pCPU to
> tickle. What do you think?

This is a good idea too. I think having the running vcpus simply at the
beginning of the runq could work, too.

>> In case (b) there may be idle pCPUs (and, if that's the case, we
>> should tickle one of them, of course) or not. If not, we need to go
>> figure out which pCPU to tickle, which is exactly what runq_tickle()
>> does, but we at least know for sure that we want to tickle the pCPU
>> where vCPU k runs, or others where vCPUs with deadline greater than vCPU
>> k run.
>>
>> Does this make sense?
>
> Yes, if we decide to hold the currently running VCPUs in
> scheduler-wide structure: it can be runq or runningq.

Right. I think for now I'll just keep them in runq to keep most of the
selection logic the same.

> Thank you again! It is very clear and I'm clear which part is unclear now. :-D
>
>>
>> Dagaen, Meng, any question?
>>
>> I really think that, if we manage to implement all this, code quality
>> and performance would improve a lot. Oh, considering all the various and
>> (logically) different changes that I've hinted at, the patch needs to
>> become a patch series! :-D
>
> Sure! Dagaen, what do you think?

Yes, this may become a series with several changes like this. For now
I am going to get it working with the running vcpus in the runq.
I thought returning the inserted index was a good way of checking if
we need to tickle, and moving the running vcpus to runq will make
the scheduler almost done as far as functional correctness goes.
Various other features hinted at would be a series on top of this.

Regards,
~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-08  8:01           ` Dario Faggioli
  2015-07-09  3:46             ` Meng Xu
@ 2015-07-09 16:35             ` Dagaen Golomb
  1 sibling, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-07-09 16:35 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Sisu Xi, George Dunlap, xen-devel, Meng Xu, mengxu, Chong Li

>> I think you might assume that the first M VCPUs  in the runq are the
>> current running VCPUs on the M pCPUs. Am I correct? (From what you
>> described in the following example, I think I'm correct. ;-) )
>>
> Mmm... Interesting. Yes, I was. I was basing this assumption on this
> chunk on Dagaen's patch:
>
>     // If we become one of top [# CPUs] in the runq, tickle it
>     // TODO: make this work when multiple tickles are required
>     if ( new_position > 0 && new_position <= prv->NUM_CPUS )
>         runq_tickle(ops, svc);
>
> And forgot (and did not go check) about the __q_remove() in
> rt_schedule(). My bad again.
>
> But then, since we don't have the running vCPUs in the runq, how the
> code above is supposed to be correct?

This was my bad. I need to change so running vcpus are in runq, and
this would then be correct.

>> I tell that you make the above assumption from here.
>>
>> However, in the current implementation, runq does not hold the current
>> running VCPUs on the pCPUs. We remove the vcpu from runq in
>> rt_schedule() function. What you described above make perfect sense
>> "if" we decide to make runq hold the current running VCPUs.
>>
> Yep. And it indeed seems to me that we may well think about doing so. It
> will make it possible to base on the position for making/optimizing
> scheduling decisions, and at the same time I don't think I see much
> downsides in that, do you?
>
>> Actually, after thinking about the example you described, I think we
>> can hold the current running VCPUs *and* the current idle pCPUs in the
>> scheduler-wide structure;
>>
> What do you mean with 'current idle pCPUs'? I said something similar as
> well, and what I meant was a cpumask with bit i set if i-eth pCPU is
> idle, do you also mean this?
>
> About the running vCPUs, why just not leave them in the actual runq?

This seemed like the straightforward way to me, too.

>> In other words, we can have another runningq
>> (not runq) and a idle_pcpu list in the rt_private; Now all VCPUs are
>> stored in three queues: runningq, runq, and depletedq, in increasing
>> priority order.
>>
> Perhaps, but I'm not sure I see the need for another list. Again, why
> just not leave them in runq? I appreciate this is a rather big  change
> (although, perhaps it looks bigger said than done), but I think it could
> be worth pursuing.
>
> For double checking, asserting, and making sure that we are able to
> identify the running svc-s, we have the __RTDS_scheduled flag.

I also don't feel we need another list.

Regards,
~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-09  3:46             ` Meng Xu
@ 2015-07-09 16:39               ` Dagaen Golomb
  2015-07-10 10:07               ` Dario Faggioli
  1 sibling, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-07-09 16:39 UTC (permalink / raw)
  To: Meng Xu
  Cc: Sisu Xi, George Dunlap, Dario Faggioli, xen-devel, mengxu, Chong Li

> That's why at the very beginning of this thread, I asked Dagaen why we
> need to return the position where a VCPU is inserted in the runq.
> Without this assumption, I don't see how we can use the returned
> position of the vcpu in runq in the current patch could be useful.
> But with this assumption, it could be very useful!

Exactly, we keep the runq as-is, but now position/sorting actually
mean something.
If you are in the first # CPUs in the runq it makes sense to be on a pCPU.


> When I proposed the runningq, I was thinking about the situation when
> we decide to split the (old) runq in RT-Xen to the (current) runq and
> depletedq; I was thinking the same reason why we split to runq and
> depletedq may still hold here when we add runningq.
>
> But after thinking twice, maybe runq approach is a better way since it
> just make the position information more meaningful. As you described
> in the previous email, we can compare the position of a vcpu inserted
> into the runq with the number of pCPUs so as to quickly know which
> pCPU to tickle.

[...]

> Actually, nothing bad. I just recalled the situation when we split a
> runq to runq and depletedq. I was thinking it might be the case here
> as well. (Yes, it is different here since we can get more useful
> information to tickle cpu if we put vCPUs into runq instead of adding
> one more queue.) :-)

I think this is straightforward to simply use the runq. I will work on
this implementation.

Regards,
~Dagaen Golomb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-09  3:46             ` Meng Xu
  2015-07-09 16:39               ` Dagaen Golomb
@ 2015-07-10 10:07               ` Dario Faggioli
  1 sibling, 0 replies; 25+ messages in thread
From: Dario Faggioli @ 2015-07-10 10:07 UTC (permalink / raw)
  To: Meng Xu
  Cc: Sisu Xi, George Dunlap, xen-devel, mengxu, Chong Li, Dagaen Golomb


[-- Attachment #1.1: Type: text/plain, Size: 2575 bytes --]

On Wed, 2015-07-08 at 20:46 -0700, Meng Xu wrote:
> 2015-07-08 1:01 GMT-07:00 Dario Faggioli <dario.faggioli@citrix.com>:

> But after thinking twice, maybe runq approach is a better way since it
> just make the position information more meaningful. As you described
> in the previous email, we can compare the position of a vcpu inserted
> into the runq with the number of pCPUs so as to quickly know which
> pCPU to tickle.
> 
> > I appreciate this is a rather big  change
> > (although, perhaps it looks bigger said than done), but I think it could
> > be worth pursuing.
> 
> It is worth pursuing since it simplify the cpu tickling process a lot.
> 
Great! :-)

> > I think that I'd like to know why you think adding another queue is
> > necessary, instead of just leaving the vCPUs in the actual runq. Is
> > there something bad about that which I'm missing?
> 
> Actually, nothing bad. I just recalled the situation when we split a
> runq to runq and depletedq. I was thinking it might be the case here
> as well. (Yes, it is different here since we can get more useful
> information to tickle cpu if we put vCPUs into runq instead of adding
> one more queue.) :-)
> >
IMO, having things split could be beneficial for scalability *BUT* only
in case we also get rid of the big spin lock. Until we won't get there
(which is not that urgent) using the same queue is fine.

For runq and depletedq, the same applies, with the small difference
that, since both need to be sorted, having two actual queues looks
easier than having just one with a marker, but that's only an
implementation detail, it's fine however it is. OTOH, in this case,
using runq only, without introducing another queue, actually helps
making things simpler!

> > Yes, but if we use two queues, we defeat at least part of this
> > optimization/simplification.
> 
> Agree. Thanks! :-D
> 
Perfect. Looking also at Dagaen replies, it sounds like we have a plan!
Looking forward to see the code! :-D

> > I don't think the added space would be a problem, but I don't see why we
> > need it.
> 
> If we don't add another queue, no extra space. So we get free lunch here. :-)
> 
Yeah, but let's not exaggerate with free lunches, I'm trying to loose
some weight! :-P

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
  2015-07-03  5:12 Meng Xu
@ 2015-07-06 11:51 ` Dario Faggioli
  0 siblings, 0 replies; 25+ messages in thread
From: Dario Faggioli @ 2015-07-06 11:51 UTC (permalink / raw)
  To: Meng Xu; +Cc: George Dunlap, Dagaen Golomb, Sisu Xi, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 9776 bytes --]

On Thu, 2015-07-02 at 22:12 -0700, Meng Xu wrote:
> Hi Dario,
> 
Hi,

> [Since I have commented on this thread in previous email, I just
> top-post it for reminder.]
> 
> Just in case, this email is out of your radar... :-)
> 
It is on my radar. It's just that I really had to drain my queue of
bugfixes. Sorry for the delay, I'm looking at it today.

Regards,
Dario

> I had discussed this patch with Dagaen as shown in
> http://www.gossamer-threads.com/lists/xen/devel/386651
> 
> You don't need any detailed comment for this patch. We only need your
> confirmation that this is in the correct direction and there is no
> "serious" design issue in this implementation.
> 
> Once we are confirmed with that, we can send the next version with the
> minor stuff (such as coding style) done soon.
> 
> 
> 2015-06-27 12:46 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> > To do this, we create a new list that holds, for each
> > vcpu, the time least into the future that it may need to be
> > rescheduled. The scheduler chooses the lowest time off of this
> > list and waits until the specified time instead of running every
> > 1 ms as it did before.
> >
> > Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> > Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> > ---
> >  xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 93 insertions(+), 7 deletions(-)
> >
> > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> > index 4372486..cce3446 100644
> > --- a/xen/common/sched_rt.c
> > +++ b/xen/common/sched_rt.c
> > @@ -4,6 +4,7 @@
> >   *
> >   * by Sisu Xi, 2013, Washington University in Saint Louis
> >   * and Meng Xu, 2014, University of Pennsylvania
> > + * and Dagaen Golomb, 2015, University of Pennsylvania
> >   *
> >   * based on the code of credit Scheduler
> >   */
> > @@ -152,6 +153,8 @@ struct rt_private {
> >      struct list_head sdom;      /* list of availalbe domains, used for dump */
> >      struct list_head runq;      /* ordered list of runnable vcpus */
> >      struct list_head depletedq; /* unordered list of depleted vcpus */
> > +    struct timer replenishment_timer;
> > +    unsigned NUM_CPUS;
> >      cpumask_t tickled;          /* cpus been tickled */
> >  };
> >
> > @@ -178,6 +181,8 @@ struct rt_vcpu {
> >      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
> >  };
> >
> > +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
> > +
> >  /*
> >   * Domain
> >   */
> > @@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
> >   * Insert svc with budget in RunQ according to EDF:
> >   * vcpus with smaller deadlines go first.
> >   * Insert svc without budget in DepletedQ unsorted;
> > + *
> > + * Returns the position, 1-indexed, of where the item was inserted.
> > + * Returns a positive index if placed on runq, and a negative index (the index
> > + *   in the depletedq * -1) if placed on the depletedq
> >   */
> > -static void
> > +static int
> >  __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> >  {
> >      struct rt_private *prv = rt_priv(ops);
> >      struct list_head *runq = rt_runq(ops);
> > +    struct list_head *depletedq = rt_depletedq(ops);
> >      struct list_head *iter;
> > +    unsigned inserted_index = 1;
> >
> >      ASSERT( spin_is_locked(&prv->lock) );
> >
> > @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> >              struct rt_vcpu * iter_svc = __q_elem(iter);
> >              if ( svc->cur_deadline <= iter_svc->cur_deadline )
> >                      break;
> > +            else
> > +                ++inserted_index;
> >           }
> >          list_add_tail(&svc->q_elem, iter);
> > +        return inserted_index;
> >      }
> >      else
> >      {
> > -        list_add(&svc->q_elem, &prv->depletedq);
> > +        // If we are inserting into previously empty depletedq, rearm
> > +        //   rearm replenish timer. It will handle further scheduling until
> > +        //   the depletedq is empty again (if ever)
> > +        if( list_empty(depletedq) )
> > +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
> > +
> > +        list_for_each(iter, depletedq)
> > +        {
> > +            struct rt_vcpu * iter_svc = __q_elem(iter);
> > +            if ( svc->cur_deadline <= iter_svc->cur_deadline )
> > +                break;
> > +            else
> > +                ++inserted_index;
> > +         }
> > +        list_add_tail(&svc->q_elem, iter);
> > +        return -inserted_index;
> >      }
> >  }
> >
> > +static void replenishment_handler(void* data)
> > +{
> > +    const struct scheduler *ops = data;
> > +    struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *depletedq = rt_depletedq(ops);
> > +    struct list_head *iter;
> > +    struct list_head *tmp;
> > +    struct rt_vcpu *svc = NULL;
> > +    s_time_t now = NOW();
> > +    int new_position = 0;
> > +    unsigned long flags;
> > +
> > +    spin_lock_irqsave(&prv->lock, flags);
> > +
> > +    // Replenish the vCPU that triggered this, and resort it into runq
> > +    list_for_each_safe(iter, tmp, depletedq)
> > +    {
> > +        svc = __q_elem(iter);
> > +        if ( now >= svc->cur_deadline )
> > +        {
> > +            rt_update_deadline(now, svc);
> > +            __q_remove(svc); /* remove from depleted queue */
> > +            new_position = __runq_insert(ops, svc); /* add to runq */
> > +        }
> > +        else break; // leave list_for_each_safe
> > +    }
> > +
> > +    // If we become one of top [# CPUs] in the runq, tickle it
> > +    // TODO: make this work when multiple tickles are required
> > +    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
> > +        runq_tickle(ops, svc);
> > +
> > +    // Use first item on deletedq to schedule next replenishment.
> > +    // If no replenishments are pending, disable timer for now
> > +    if( !list_empty(depletedq) )
> > +    {
> > +        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
> > +        set_timer( &prv->replenishment_timer,
> > +                   soonest_to_replenish->cur_deadline );
> > +    }
> > +
> > +    spin_unlock_irqrestore(&prv->lock, flags);
> > +}
> > +
> >  /*
> >   * Init/Free related code
> >   */
> > @@ -427,6 +500,7 @@ static int
> >  rt_init(struct scheduler *ops)
> >  {
> >      struct rt_private *prv = xzalloc(struct rt_private);
> > +    int cpu = 0;
> >
> >      printk("Initializing RTDS scheduler\n"
> >             "WARNING: This is experimental software in development.\n"
> > @@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
> >      INIT_LIST_HEAD(&prv->runq);
> >      INIT_LIST_HEAD(&prv->depletedq);
> >
> > +    // Replenishment timer, for now always on CPU 0
> > +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
> > +    // Calculate number of pCPUs present.
> > +    // Note: assumes does not change online.
> > +    for_each_present_cpu(cpu)
> > +    {
> > +        ++(prv->NUM_CPUS);
> > +    }
> > +
> >      cpumask_clear(&prv->tickled);
> >
> >      ops->sched_data = prv;
> > @@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
> >  {
> >      struct rt_private *prv = rt_priv(ops);
> >
> > +    kill_timer(&prv->replenishment_timer);
> > +
> >      ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
> >
> >      if ( (--nr_rt_ops) == 0 )
> > @@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
> >      BUG_ON( sdom == NULL );
> >
> >      lock = vcpu_schedule_lock_irq(vc);
> > -    if ( __vcpu_on_q(svc) )
> > -        __q_remove(svc);
> > +    __q_remove(svc);
> >      vcpu_schedule_unlock_irq(lock, vc);
> >
> >      if ( !is_idle_vcpu(vc) )
> > @@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >  {
> >      const int cpu = smp_processor_id();
> >      struct rt_private *prv = rt_priv(ops);
> > +    struct list_head *runq = rt_runq(ops);
> >      struct rt_vcpu *const scurr = rt_vcpu(current);
> >      struct rt_vcpu *snext = NULL;
> >      struct task_slice ret = { .migrated = 0 };
> > @@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >      /* burn_budget would return for IDLE VCPU */
> >      burn_budget(ops, scurr, now);
> >
> > -    __repl_update(ops, now);
> > -
> >      if ( tasklet_work_scheduled )
> >      {
> >          snext = rt_vcpu(idle_vcpu[cpu]);
> > @@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
> >          }
> >      }
> >
> > -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> > +    /* Use minimum from runq */
> > +    ret.time = MAX_SCHEDULE;
> > +    if( !list_empty(runq) )
> > +        ret.time = __q_elem(runq->next)->cur_deadline;
> >      ret.task = snext->vcpu;
> >
> >      /* TRACE */
> > --
> > 1.7.9.5
> >
> 
> 
> 
> Thanks,
> 
> Meng
> 
> 
> -----------
> Meng Xu
> PhD Student in Computer and Information Science
> University of Pennsylvania
> http://www.cis.upenn.edu/~mengxu/
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
@ 2015-07-03  5:12 Meng Xu
  2015-07-06 11:51 ` Dario Faggioli
  0 siblings, 1 reply; 25+ messages in thread
From: Meng Xu @ 2015-07-03  5:12 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Chong Li, Sisu Xi, George Dunlap, xen-devel, mengxu, Jan Beulich,
	Chong Li, Dagaen Golomb

Hi Dario,

[Since I have commented on this thread in previous email, I just
top-post it for reminder.]

Just in case, this email is out of your radar... :-)

I had discussed this patch with Dagaen as shown in
http://www.gossamer-threads.com/lists/xen/devel/386651

You don't need any detailed comment for this patch. We only need your
confirmation that this is in the correct direction and there is no
"serious" design issue in this implementation.

Once we are confirmed with that, we can send the next version with the
minor stuff (such as coding style) done soon.


2015-06-27 12:46 GMT-07:00 Dagaen Golomb <dgolomb@seas.upenn.edu>:
> To do this, we create a new list that holds, for each
> vcpu, the time least into the future that it may need to be
> rescheduled. The scheduler chooses the lowest time off of this
> list and waits until the specified time instead of running every
> 1 ms as it did before.
>
> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> ---
>  xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 93 insertions(+), 7 deletions(-)
>
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 4372486..cce3446 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
> @@ -4,6 +4,7 @@
>   *
>   * by Sisu Xi, 2013, Washington University in Saint Louis
>   * and Meng Xu, 2014, University of Pennsylvania
> + * and Dagaen Golomb, 2015, University of Pennsylvania
>   *
>   * based on the code of credit Scheduler
>   */
> @@ -152,6 +153,8 @@ struct rt_private {
>      struct list_head sdom;      /* list of availalbe domains, used for dump */
>      struct list_head runq;      /* ordered list of runnable vcpus */
>      struct list_head depletedq; /* unordered list of depleted vcpus */
> +    struct timer replenishment_timer;
> +    unsigned NUM_CPUS;
>      cpumask_t tickled;          /* cpus been tickled */
>  };
>
> @@ -178,6 +181,8 @@ struct rt_vcpu {
>      unsigned flags;             /* mark __RTDS_scheduled, etc.. */
>  };
>
> +static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
> +
>  /*
>   * Domain
>   */
> @@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
>   * Insert svc with budget in RunQ according to EDF:
>   * vcpus with smaller deadlines go first.
>   * Insert svc without budget in DepletedQ unsorted;
> + *
> + * Returns the position, 1-indexed, of where the item was inserted.
> + * Returns a positive index if placed on runq, and a negative index (the index
> + *   in the depletedq * -1) if placed on the depletedq
>   */
> -static void
> +static int
>  __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  {
>      struct rt_private *prv = rt_priv(ops);
>      struct list_head *runq = rt_runq(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
>      struct list_head *iter;
> +    unsigned inserted_index = 1;
>
>      ASSERT( spin_is_locked(&prv->lock) );
>
> @@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>              struct rt_vcpu * iter_svc = __q_elem(iter);
>              if ( svc->cur_deadline <= iter_svc->cur_deadline )
>                      break;
> +            else
> +                ++inserted_index;
>           }
>          list_add_tail(&svc->q_elem, iter);
> +        return inserted_index;
>      }
>      else
>      {
> -        list_add(&svc->q_elem, &prv->depletedq);
> +        // If we are inserting into previously empty depletedq, rearm
> +        //   rearm replenish timer. It will handle further scheduling until
> +        //   the depletedq is empty again (if ever)
> +        if( list_empty(depletedq) )
> +            set_timer( &prv->replenishment_timer, svc->cur_deadline );
> +
> +        list_for_each(iter, depletedq)
> +        {
> +            struct rt_vcpu * iter_svc = __q_elem(iter);
> +            if ( svc->cur_deadline <= iter_svc->cur_deadline )
> +                break;
> +            else
> +                ++inserted_index;
> +         }
> +        list_add_tail(&svc->q_elem, iter);
> +        return -inserted_index;
>      }
>  }
>
> +static void replenishment_handler(void* data)
> +{
> +    const struct scheduler *ops = data;
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +    s_time_t now = NOW();
> +    int new_position = 0;
> +    unsigned long flags;
> +
> +    spin_lock_irqsave(&prv->lock, flags);
> +
> +    // Replenish the vCPU that triggered this, and resort it into runq
> +    list_for_each_safe(iter, tmp, depletedq)
> +    {
> +        svc = __q_elem(iter);
> +        if ( now >= svc->cur_deadline )
> +        {
> +            rt_update_deadline(now, svc);
> +            __q_remove(svc); /* remove from depleted queue */
> +            new_position = __runq_insert(ops, svc); /* add to runq */
> +        }
> +        else break; // leave list_for_each_safe
> +    }
> +
> +    // If we become one of top [# CPUs] in the runq, tickle it
> +    // TODO: make this work when multiple tickles are required
> +    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
> +        runq_tickle(ops, svc);
> +
> +    // Use first item on deletedq to schedule next replenishment.
> +    // If no replenishments are pending, disable timer for now
> +    if( !list_empty(depletedq) )
> +    {
> +        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
> +        set_timer( &prv->replenishment_timer,
> +                   soonest_to_replenish->cur_deadline );
> +    }
> +
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +}
> +
>  /*
>   * Init/Free related code
>   */
> @@ -427,6 +500,7 @@ static int
>  rt_init(struct scheduler *ops)
>  {
>      struct rt_private *prv = xzalloc(struct rt_private);
> +    int cpu = 0;
>
>      printk("Initializing RTDS scheduler\n"
>             "WARNING: This is experimental software in development.\n"
> @@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
>      INIT_LIST_HEAD(&prv->runq);
>      INIT_LIST_HEAD(&prv->depletedq);
>
> +    // Replenishment timer, for now always on CPU 0
> +    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
> +    // Calculate number of pCPUs present.
> +    // Note: assumes does not change online.
> +    for_each_present_cpu(cpu)
> +    {
> +        ++(prv->NUM_CPUS);
> +    }
> +
>      cpumask_clear(&prv->tickled);
>
>      ops->sched_data = prv;
> @@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
>  {
>      struct rt_private *prv = rt_priv(ops);
>
> +    kill_timer(&prv->replenishment_timer);
> +
>      ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
>
>      if ( (--nr_rt_ops) == 0 )
> @@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
>      BUG_ON( sdom == NULL );
>
>      lock = vcpu_schedule_lock_irq(vc);
> -    if ( __vcpu_on_q(svc) )
> -        __q_remove(svc);
> +    __q_remove(svc);
>      vcpu_schedule_unlock_irq(lock, vc);
>
>      if ( !is_idle_vcpu(vc) )
> @@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>  {
>      const int cpu = smp_processor_id();
>      struct rt_private *prv = rt_priv(ops);
> +    struct list_head *runq = rt_runq(ops);
>      struct rt_vcpu *const scurr = rt_vcpu(current);
>      struct rt_vcpu *snext = NULL;
>      struct task_slice ret = { .migrated = 0 };
> @@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>      /* burn_budget would return for IDLE VCPU */
>      burn_budget(ops, scurr, now);
>
> -    __repl_update(ops, now);
> -
>      if ( tasklet_work_scheduled )
>      {
>          snext = rt_vcpu(idle_vcpu[cpu]);
> @@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>          }
>      }
>
> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> +    /* Use minimum from runq */
> +    ret.time = MAX_SCHEDULE;
> +    if( !list_empty(runq) )
> +        ret.time = __q_elem(runq->next)->cur_deadline;
>      ret.task = snext->vcpu;
>
>      /* TRACE */
> --
> 1.7.9.5
>



Thanks,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling.
@ 2015-06-27 19:35 Dagaen Golomb
  0 siblings, 0 replies; 25+ messages in thread
From: Dagaen Golomb @ 2015-06-27 19:35 UTC (permalink / raw)
  To: xen-devel; +Cc: Dagaen Golomb

To do this, we create a new list that holds, for each
vcpu, the time least into the future that it may need to be
rescheduled. The scheduler chooses the lowest time off of this
list and waits until the specified time instead of running every
1 ms as it did before.

Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
---
 xen/common/sched_rt.c |  100 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 7 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 4372486..cce3446 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -4,6 +4,7 @@
  *
  * by Sisu Xi, 2013, Washington University in Saint Louis
  * and Meng Xu, 2014, University of Pennsylvania
+ * and Dagaen Golomb, 2015, University of Pennsylvania
  *
  * based on the code of credit Scheduler
  */
@@ -152,6 +153,8 @@ struct rt_private {
     struct list_head sdom;      /* list of availalbe domains, used for dump */
     struct list_head runq;      /* ordered list of runnable vcpus */
     struct list_head depletedq; /* unordered list of depleted vcpus */
+    struct timer replenishment_timer;
+    unsigned NUM_CPUS;
     cpumask_t tickled;          /* cpus been tickled */
 };
 
@@ -178,6 +181,8 @@ struct rt_vcpu {
     unsigned flags;             /* mark __RTDS_scheduled, etc.. */
 };
 
+static void runq_tickle(const struct scheduler *, struct rt_vcpu *);
+
 /*
  * Domain
  */
@@ -391,13 +396,19 @@ __q_remove(struct rt_vcpu *svc)
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
  * Insert svc without budget in DepletedQ unsorted;
+ *
+ * Returns the position, 1-indexed, of where the item was inserted.
+ * Returns a positive index if placed on runq, and a negative index (the index
+ *   in the depletedq * -1) if placed on the depletedq
  */
-static void
+static int
 __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 {
     struct rt_private *prv = rt_priv(ops);
     struct list_head *runq = rt_runq(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
     struct list_head *iter;
+    unsigned inserted_index = 1;
 
     ASSERT( spin_is_locked(&prv->lock) );
 
@@ -411,15 +422,77 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
             struct rt_vcpu * iter_svc = __q_elem(iter);
             if ( svc->cur_deadline <= iter_svc->cur_deadline )
                     break;
+            else
+                ++inserted_index;
          }
         list_add_tail(&svc->q_elem, iter);
+        return inserted_index;
     }
     else
     {
-        list_add(&svc->q_elem, &prv->depletedq);
+        // If we are inserting into previously empty depletedq, rearm
+        //   rearm replenish timer. It will handle further scheduling until
+        //   the depletedq is empty again (if ever)
+        if( list_empty(depletedq) )
+            set_timer( &prv->replenishment_timer, svc->cur_deadline );
+
+        list_for_each(iter, depletedq)
+        {
+            struct rt_vcpu * iter_svc = __q_elem(iter);
+            if ( svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+            else
+                ++inserted_index;
+         }
+        list_add_tail(&svc->q_elem, iter);
+        return -inserted_index;
     }
 }
 
+static void replenishment_handler(void* data)
+{
+    const struct scheduler *ops = data;
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+    s_time_t now = NOW();
+    int new_position = 0;
+    unsigned long flags;
+
+    spin_lock_irqsave(&prv->lock, flags);
+
+    // Replenish the vCPU that triggered this, and resort it into runq
+    list_for_each_safe(iter, tmp, depletedq)
+    {
+        svc = __q_elem(iter);
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+            __q_remove(svc); /* remove from depleted queue */
+            new_position = __runq_insert(ops, svc); /* add to runq */
+        }
+        else break; // leave list_for_each_safe
+    }
+
+    // If we become one of top [# CPUs] in the runq, tickle it
+    // TODO: make this work when multiple tickles are required
+    if ( new_position > 0 && new_position <= prv->NUM_CPUS )
+        runq_tickle(ops, svc);
+
+    // Use first item on deletedq to schedule next replenishment.
+    // If no replenishments are pending, disable timer for now
+    if( !list_empty(depletedq) )
+    {
+        struct rt_vcpu * soonest_to_replenish = __q_elem(depletedq->next);
+        set_timer( &prv->replenishment_timer,
+                   soonest_to_replenish->cur_deadline );
+    }
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+}
+
 /*
  * Init/Free related code
  */
@@ -427,6 +500,7 @@ static int
 rt_init(struct scheduler *ops)
 {
     struct rt_private *prv = xzalloc(struct rt_private);
+    int cpu = 0;
 
     printk("Initializing RTDS scheduler\n"
            "WARNING: This is experimental software in development.\n"
@@ -450,6 +524,15 @@ rt_init(struct scheduler *ops)
     INIT_LIST_HEAD(&prv->runq);
     INIT_LIST_HEAD(&prv->depletedq);
 
+    // Replenishment timer, for now always on CPU 0
+    init_timer(&prv->replenishment_timer, replenishment_handler, ops, 0);
+    // Calculate number of pCPUs present.
+    // Note: assumes does not change online.
+    for_each_present_cpu(cpu)
+    {
+        ++(prv->NUM_CPUS);
+    }
+    
     cpumask_clear(&prv->tickled);
 
     ops->sched_data = prv;
@@ -466,6 +549,8 @@ rt_deinit(const struct scheduler *ops)
 {
     struct rt_private *prv = rt_priv(ops);
 
+    kill_timer(&prv->replenishment_timer);
+
     ASSERT( _cpumask_scratch && nr_rt_ops > 0 );
 
     if ( (--nr_rt_ops) == 0 )
@@ -653,8 +738,7 @@ rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
     BUG_ON( sdom == NULL );
 
     lock = vcpu_schedule_lock_irq(vc);
-    if ( __vcpu_on_q(svc) )
-        __q_remove(svc);
+    __q_remove(svc);
     vcpu_schedule_unlock_irq(lock, vc);
 
     if ( !is_idle_vcpu(vc) )
@@ -838,6 +922,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
 {
     const int cpu = smp_processor_id();
     struct rt_private *prv = rt_priv(ops);
+    struct list_head *runq = rt_runq(ops);
     struct rt_vcpu *const scurr = rt_vcpu(current);
     struct rt_vcpu *snext = NULL;
     struct task_slice ret = { .migrated = 0 };
@@ -848,8 +933,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
 
-    __repl_update(ops, now);
-
     if ( tasklet_work_scheduled )
     {
         snext = rt_vcpu(idle_vcpu[cpu]);
@@ -889,7 +972,10 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         }
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    /* Use minimum from runq */
+    ret.time = MAX_SCHEDULE;
+    if( !list_empty(runq) )
+        ret.time = __q_elem(runq->next)->cur_deadline;
     ret.task = snext->vcpu;
 
     /* TRACE */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2015-07-10 10:07 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-27 19:46 [PATCH v2] Modified RTDS scheduler to use an event-driven model instead of polling Dagaen Golomb
2015-06-27 19:53 ` Dagaen Golomb
2015-06-28  7:10   ` Meng Xu
2015-06-28 17:49     ` Dagaen Golomb
2015-06-28 19:11       ` Meng Xu
2015-06-28 20:17         ` Dagaen Golomb
2015-06-28 20:45           ` Meng Xu
2015-07-06 17:37             ` Dario Faggioli
2015-07-06 17:41           ` Dario Faggioli
2015-07-09 16:19             ` Dagaen Golomb
2015-06-28  7:06 ` Meng Xu
2015-07-06 17:30   ` Dario Faggioli
2015-07-07  5:51     ` Meng Xu
2015-07-07 14:03       ` Dario Faggioli
2015-07-08  5:56         ` Meng Xu
2015-07-08  8:01           ` Dario Faggioli
2015-07-09  3:46             ` Meng Xu
2015-07-09 16:39               ` Dagaen Golomb
2015-07-10 10:07               ` Dario Faggioli
2015-07-09 16:35             ` Dagaen Golomb
2015-07-09 16:31           ` Dagaen Golomb
2015-07-09 16:17     ` Dagaen Golomb
  -- strict thread matches above, loose matches on Subject: below --
2015-07-03  5:12 Meng Xu
2015-07-06 11:51 ` Dario Faggioli
2015-06-27 19:35 Dagaen Golomb

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.