[PATCH] Scheduling groups, credit scheduler support

* [PATCH] Scheduling groups, credit scheduler support
@ 2007-11-29 20:19 Mike D. Day
  2007-11-29 22:36 ` Chris B
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Mike D. Day @ 2007-11-29 20:19 UTC (permalink / raw)
  To: xen-devel

The credit implementation is limited to the sharing time slices among
group members. All members of a group must share the master's time
slices with the master. If the group master is capped at 20%, the
cumulative total of the master and all members will be 20%. This is
specifically to support stub domains, which will host the device model
for hvm domains. The proper way to schedule a stub domain is for it to
share the times slices allocated the the stub's hvm domain.

The credit scheduler is driven by its accounting function. Each domain
is given a credit amount sufficient to run each of its vcpus for one
scheduling cycle. The scheduler divides the total domain credit by the
number of vcpus and allocates each vcpu its share of the domain's
credits. A domain with two vcpus has each of its vcpus given 1/2 of
the domain's credits.

The credit scheduler subtracts credits from each vcpu for every time
slice that vcpu runs. When a vcpu has consumed its credit or exceeded
its cap the credit scheduler puts that vcpu to sleep. At the beginning
of each new scheduling cycle sleeping vcpus that have work are
awakened and given a new share of credits.

The credit scheduler runs vcpus, not domains. However, a domain's
vcpus are given time slices according to the credits available to the
domain and any caps placed on the domain. Therefore, the simplest way
to group domains together in the credit scheduler is to assign the
member domain's vcpus to the master domain. Each vcpu assigned to the
master domain receives a credit equal to the master domain's total
credit divided by the number of assigned vcpus. This forces all the
member domains to share the master domain's credits with the master,
which achieves the desired behavior. 

The primary accounting function in the credit scheduler is unmodified,
save for the removal of one debugging line. All of the group
processing is handled off the fast path. There are no additional locks
and the only new locked section is the grouping/ungrouping of domains,
which happens infrequently. Although I have yet to run any micro
benchmarks I anticipate no difference in the performance of the credit
scheduler with these patches applied.

Each struct csched_vcpu receives five new members: A list_head to hold
grouped domains (for the master), and another list head to place
member domains on the master's list; a pointer to the master domain
(for members), and two bool_t members to hold the domain's grouping state.

Domains are added to a group by the function
add_member_to_master. This routine moves the member domain's vcpus to
the master by calling delegate_active_vcpu.

delegate_active_vcpu migrates all the member domain's active vcpus to
the new master. If necessary it then removes the member domain from
the credit scheduler's list of active domains.

When a new vcpu is made active by csched_vcpu_acct_start, that vcpu is
always added to the domain master if the vcpu belongs to a domain
member. This and an equivalent line in __csched_vcpu_acct_stop
comprise the only new code that executes on the fast path:

static inline struct csched_dom *master_dom(struct csched_dom *d)
{
    if ( d->is_member )
        return d->master;
    return d;
}

When a domain is removed from a group, the inverse occurs. First the
former member domain's vcpus are returned by a call to
reclaim_active_vcpus. In addition to reclaiming the vcpus, the
(former) member domain is removed from the master's list. If it has
any active vcpus, the former member is placed on the credit
scheduler's list of active domains.

The remainder of the code handles the sched-group sub op and ensures
that a destroyed domain's grouping properties are properly handled and
that vcpus end up in the right place: either destroyed with their
domain or moved back to the (former) group member which owns them.

Signed-off-by: Mike D. Day <ncmike@us.ibm.com>

--
sched_credit.c |  267 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 259 insertions(+), 8 deletions(-)

-- 

diff -r 0bff1fad920a xen/common/sched_credit.c

--- a/xen/common/sched_credit.c	Wed May 09 16:41:28 2007 -0400
+++ b/xen/common/sched_credit.c	Thu May 10 16:45:21 2007 -0400
@@ -219,10 +219,15 @@ struct csched_dom {
 struct csched_dom {
     struct list_head active_vcpu;
     struct list_head active_sdom_elem;
+    struct list_head group;
+    struct list_head group_elem;
+    struct csched_dom *master;
     struct domain *dom;
     uint16_t active_vcpu_count;
     uint16_t weight;
     uint16_t cap;
+    bool_t is_master;
+    bool_t is_member;
 };
 
 /*
@@ -344,6 +349,118 @@ __runq_tickle(unsigned int cpu, struct c
         cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
 }
 
+static inline struct csched_dom *csched_dom(struct domain *d)
+{
+    return (struct csched_dom *)d->sched_priv;
+}
+
+static inline struct csched_dom *get_master_dom(struct csched_dom *d)
+{
+    if ( d->is_member )
+    {
+        if ( get_domain(d->master->dom) )
+            return d->master;
+        BUG();
+    }
+    return NULL;
+}
+
+static inline struct csched_dom *master_dom(struct csched_dom *d)
+{
+    if ( d->is_member )
+        return d->master;
+    return d;
+}
+
+static inline void delegate_active_vcpus(struct csched_dom *member,
+                                         struct csched_dom *master)
+{
+    BUG_ON( ! ( member->is_member ) );
+    BUG_ON( member->master != master );
+    if ( member->is_member && member->master == master )
+    {
+        struct list_head *elem;
+
+        while ( !list_empty(&member->active_vcpu) )
+        {
+            elem = member->active_vcpu.next;
+            list_del(elem);
+            list_add(elem, &master->active_vcpu);
+            member->active_vcpu_count--;
+            master->active_vcpu_count++;
+        }
+
+        if ( !list_empty(&member->active_sdom_elem) )
+        {
+            list_del_init(&member->active_sdom_elem);
+            csched_priv.weight -= member->weight;
+        }
+
+        if ( list_empty(&master->active_sdom_elem) )
+        {
+            list_add(&master->active_sdom_elem, &csched_priv.active_sdom);
+            csched_priv.weight += master->weight;
+        }
+    }
+}
+
+static inline void reclaim_active_vcpus(struct csched_dom *master,
+                                        struct csched_dom *member)
+{
+    BUG_ON( !master->is_master );
+    BUG_ON( member->master != master );
+    if ( master->is_master && member->master == master )
+    {
+        struct csched_vcpu *iter, *n;
+
+        list_for_each_entry_safe( iter, n, &master->active_vcpu,
+                                  active_vcpu_elem )
+        {
+            if ( iter->sdom == member )
+            {
+                list_del(&iter->active_vcpu_elem);
+                list_add(&iter->active_vcpu_elem, &member->active_vcpu);
+                master->active_vcpu_count--;
+                member->active_vcpu_count++;
+            }
+        }
+
+        if ( list_empty(&master->active_vcpu) &&
+            !list_empty(&master->active_sdom_elem) )
+        {
+            list_del_init(&master->active_sdom_elem);
+            csched_priv.weight -= master->weight;
+        }
+        if ( !list_empty(&member->active_vcpu) &&
+            list_empty(&member->active_sdom_elem) )
+        {
+            list_add(&member->active_sdom_elem, &csched_priv.active_sdom);
+            csched_priv.weight += member->weight;
+        }
+    }
+}
+
+static inline void add_member_to_master(struct csched_dom *member,
+                                        struct csched_dom *master)
+{
+    list_add(&member->group_elem, &master->group);
+    member->master = master;
+    member->is_member = 1;
+    master->is_master = 1;
+    delegate_active_vcpus(member, master);
+}
+
+static inline void rem_member_from_master(struct csched_dom *member,
+                                          struct csched_dom *master)
+{
+    reclaim_active_vcpus(master, member);
+    member->is_member = 0;
+    member->master = NULL;
+    list_del(&member->group_elem);
+    if (list_empty(&master->group))
+        master->is_master = 0;
+}
+
 static int
 csched_pcpu_init(int cpu)
 {
@@ -395,6 +512,17 @@ __csched_vcpu_check(struct vcpu *vc)
     else
     {
         BUG_ON( !is_idle_vcpu(vc) );
+    }
+
+    if ( sdom->is_master )
+    {
+        BUG_ON( list_empty(&sdom->group) );
+        BUG_ON( sdom->is_member );
+    }
+    if ( sdom->is_member )
+    {
+        BUG_ON( list_empty(&sdom->group_elem) );
+        BUG_ON( sdom->is_master );
     }
 
     CSCHED_STAT_CRANK(vcpu_check);
@@ -486,11 +614,11 @@ static inline void
 static inline void
 __csched_vcpu_acct_start(struct csched_vcpu *svc)
 {
-    struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
-
+    struct csched_dom * sdom;
     spin_lock_irqsave(&csched_priv.lock, flags);
 
+    sdom = master_dom(svc->sdom);
     if ( list_empty(&svc->active_vcpu_elem) )
     {
         CSCHED_VCPU_STAT_CRANK(svc, state_active);
@@ -504,14 +632,13 @@ __csched_vcpu_acct_start(struct csched_v
             csched_priv.weight += sdom->weight;
         }
     }
-
     spin_unlock_irqrestore(&csched_priv.lock, flags);
 }
 
 static inline void
 __csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
 {
-    struct csched_dom * const sdom = svc->sdom;
+    struct csched_dom * const sdom = master_dom(svc->sdom);
 
     BUG_ON( list_empty(&svc->active_vcpu_elem) );
 
@@ -605,20 +732,34 @@ csched_vcpu_init(struct vcpu *vc)
     return 0;
 }
 
+static void group_cleanup(struct csched_vcpu *svc)
+{
+    if ( svc->sdom->is_member )
+        rem_member_from_master(svc->sdom, master_dom(svc->sdom));
+    if ( svc->sdom->is_master )
+    {
+        struct csched_dom *iter, *n;
+        list_for_each_entry_safe( iter, n, &svc->sdom->group, group_elem )
+        {
+            rem_member_from_master(iter, svc->sdom);
+        }
+    }
+}
+
+
 static void
 csched_vcpu_destroy(struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
-    struct csched_dom * const sdom = svc->sdom;
     unsigned long flags;
 
     CSCHED_STAT_CRANK(vcpu_destroy);
 
-    BUG_ON( sdom == NULL );
     BUG_ON( !list_empty(&svc->runq_elem) );
 
     spin_lock_irqsave(&csched_priv.lock, flags);
 
+    group_cleanup(svc);
     if ( !list_empty(&svc->active_vcpu_elem) )
         __csched_vcpu_acct_stop_locked(svc);
 
@@ -697,6 +838,112 @@ csched_vcpu_wake(struct vcpu *vc)
     __runq_tickle(cpu, svc);
 }
 
+static inline int
+_sanity_check(struct csched_dom *member, struct csched_dom *master)
+{
+    if ( member->dom->domain_id == master->dom->domain_id )
+        return SGRP_err_same_id;
+    if ( member->is_master )
+        return SGRP_err_already_master;
+    if ( master->is_member )
+        return SGRP_err_already_member;
+    return 0;
+}
+
+static inline int
+add_sanity_check(struct csched_dom *member, struct csched_dom *master)
+{
+    if ( member->master )
+        return SGRP_err_inval;
+    return _sanity_check(member, master);
+}
+
+static inline int
+rem_sanity_check(struct csched_dom *member, struct csched_dom *master)
+{
+    if ( member->is_member && member->master && member->master == master )
+        return _sanity_check(member, master);
+    return SGRP_err_inval;
+}
+
+static int csched_group_op(struct xen_domctl_group * op)
+{
+    int ret = -EINVAL;
+
+    switch(op->op)
+    {
+    case SGRP_get_status:
+    case SGRP_get_master:
+    {
+        struct domain *dom = get_domain_by_id(op->id_member);
+        if ( dom )
+        {
+            struct csched_dom *cdom = csched_dom(dom);
+            if ( op->op == SGRP_get_status )
+            {
+                op->is_master = cdom->is_master;
+                op->is_member = cdom->is_member;
+            }
+            else
+            {
+                struct csched_dom *master = get_master_dom(cdom);
+                if ( master )
+                {
+                    op->id_master = master->dom->domain_id;
+                    put_domain(master->dom);
+                }
+                else
+                    op->reason = SGRP_err_not_member;
+            }
+            put_domain(dom);
+            ret = 0;
+        }
+        break;
+    }
+
+    case SGRP_add_member:
+    case SGRP_del_member:
+    {
+        struct domain *member, *master;
+        unsigned long flags;
+
+        master  = get_domain_by_id(op->id_master);
+        if ( !master )
+            break;
+        member = get_domain_by_id(op->id_member);
+        if ( !member )
+            goto release_master;
+        ret = 0;
+        if ( op->op == SGRP_add_member )
+            op->reason =
+                add_sanity_check(csched_dom(member), csched_dom(master));
+        else
+            op->reason =
+                rem_sanity_check(csched_dom(member), csched_dom(master));
+        if ( op->reason )
+            goto release_member;
+
+        spin_lock_irqsave(&csched_priv.lock, flags);
+        if ( op->op == SGRP_add_member )
+            add_member_to_master(csched_dom(member), csched_dom(master));
+        else
+            rem_member_from_master(csched_dom(member), csched_dom(master));
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+release_member:
+        put_domain(member);
+release_master:
+        put_domain(master);
+
+        break;
+    }
+    default:
+        break;
+    }
+
+    return ret;
+}
+
 static int
 csched_dom_cntl(
     struct domain *d,
@@ -754,10 +1001,14 @@ csched_dom_init(struct domain *dom)
     sdom->active_vcpu_count = 0;
     INIT_LIST_HEAD(&sdom->active_sdom_elem);
     sdom->dom = dom;
+    sdom->master = NULL;
     sdom->weight = CSCHED_DEFAULT_WEIGHT;
     sdom->cap = 0U;
     dom->sched_priv = sdom;
-
+    INIT_LIST_HEAD(&sdom->group);
+    INIT_LIST_HEAD(&sdom->group_elem);
+    sdom->is_master = 0;
+    sdom->is_member = 0;
     return 0;
 }
 
@@ -942,7 +1193,6 @@ csched_acct(void)
         list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
         {
             svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
-            BUG_ON( sdom != svc->sdom );
 
             /* Increment credit */
             atomic_add(credit_fair, &svc->credit);
@@ -1384,6 +1634,7 @@ struct scheduler sched_credit_def = {
     .sleep          = csched_vcpu_sleep,
     .wake           = csched_vcpu_wake,
 
+    .group_op        = csched_group_op,
     .adjust         = csched_dom_cntl,
 
     .pick_cpu       = csched_cpu_pick,

-- 
Mike D. Day
Virtualization Architect and Sr. Technical Staff Member, IBM LTC
Cell: 919 412-3900
ST: mdday@us.ibm.com | AIM: ncmikeday | Yahoo IM: ultra.runner
PGP key: http://www.ncultra.org/ncmike/pubkey.asc

^ permalink raw reply	[flat|nested] 24+ messages in thread