From mboxrd@z Thu Jan 1 00:00:00 1970 From: Meng Xu Subject: [PATCH v2 1/4] xen: add real time scheduler rt Date: Sun, 7 Sep 2014 15:40:58 -0400 Message-ID: <1410118861-2671-2-git-send-email-mengxu@cis.upenn.edu> References: <1410118861-2671-1-git-send-email-mengxu@cis.upenn.edu> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1410118861-2671-1-git-send-email-mengxu@cis.upenn.edu> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: xen-devel@lists.xen.org Cc: ian.campbell@citrix.com, xisisu@gmail.com, stefano.stabellini@eu.citrix.com, george.dunlap@eu.citrix.com, lu@cse.wustl.edu, dario.faggioli@citrix.com, ian.jackson@eu.citrix.com, ptxlinh@gmail.com, xumengpanda@gmail.com, Meng Xu , JBeulich@suse.com, chaowang@wustl.edu, lichong659@gmail.com, dgolomb@seas.upenn.edu List-Id: xen-devel@lists.xenproject.org This scheduler follows the Preemptive Global Earliest Deadline First (EDF) theory in real-time field. At any scheduling point, the VCPU with earlier deadline has higher priority. The scheduler always picks the highest priority VCPU to run on a feasible PCPU. A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle or has a lower-priority VCPU running on it.) Each VCPU has a dedicated period and budget. The deadline of a VCPU is at the end of each of its periods; A VCPU has its budget replenished at the beginning of each of its periods; While scheduled, a VCPU burns its budget. The VCPU needs to finish its budget before its deadline in each period; The VCPU discards its unused budget at the end of each of its periods. If a VCPU runs out of budget in a period, it has to wait until next period. Each VCPU is implemented as a deferable server. When a VCPU has a task running on it, its budget is continuously burned; When a VCPU has no task but with budget left, its budget is preserved. Queue scheme: A global runqueue for each CPU pool. The runqueue holds all runnable VCPUs. VCPUs in the runqueue are divided into two parts: with and without budget. At the first part, VCPUs are sorted based on EDF priority scheme. Note: cpumask and cpupool is supported. This is an experimental scheduler. Signed-off-by: Meng Xu Signed-off-by: Sisu Xi --- xen/common/Makefile | 1 + xen/common/sched_rt.c | 1146 +++++++++++++++++++++++++++++++++++++++++++ xen/common/schedule.c | 1 + xen/include/public/domctl.h | 6 + xen/include/public/trace.h | 1 + xen/include/xen/sched-if.h | 1 + 6 files changed, 1156 insertions(+) create mode 100644 xen/common/sched_rt.c diff --git a/xen/common/Makefile b/xen/common/Makefile index 3683ae3..5a23aa4 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -26,6 +26,7 @@ obj-y += sched_credit.o obj-y += sched_credit2.o obj-y += sched_sedf.o obj-y += sched_arinc653.o +obj-y += sched_rt.o obj-y += schedule.o obj-y += shutdown.o obj-y += softirq.o diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c new file mode 100644 index 0000000..412f8b1 --- /dev/null +++ b/xen/common/sched_rt.c @@ -0,0 +1,1146 @@ +/****************************************************************************** + * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen + * EDF scheduling is a real-time scheduling algorithm used in embedded field. + * + * by Sisu Xi, 2013, Washington University in Saint Louis + * and Meng Xu, 2014, University of Pennsylvania + * + * based on the code of credit Scheduler + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TODO: + * + * Migration compensation and resist like credit2 to better use cache; + * Lock Holder Problem, using yield? + * Self switch problem: VCPUs of the same domain may preempt each other; + */ + +/* + * Design: + * + * This scheduler follows the Preemptive Global Earliest Deadline First (EDF) + * theory in real-time field. + * At any scheduling point, the VCPU with earlier deadline has higher priority. + * The scheduler always picks highest priority VCPU to run on a feasible PCPU. + * A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle or + * has a lower-priority VCPU running on it.) + * + * Each VCPU has a dedicated period and budget. + * The deadline of a VCPU is at the end of each of its periods; + * A VCPU has its budget replenished at the beginning of each of its periods; + * While scheduled, a VCPU burns its budget. + * The VCPU needs to finish its budget before its deadline in each period; + * The VCPU discards its unused budget at the end of each of its periods. + * If a VCPU runs out of budget in a period, it has to wait until next period. + * + * Each VCPU is implemented as a deferable server. + * When a VCPU has a task running on it, its budget is continuously burned; + * When a VCPU has no task but with budget left, its budget is preserved. + * + * Queue scheme: A global runqueue for each CPU pool. + * The runqueue holds all runnable VCPUs. + * VCPUs in the runqueue are divided into two parts: + * with and without remaining budget. + * At the first part, VCPUs are sorted based on EDF priority scheme. + * + * Note: cpumask and cpupool is supported. + */ + +/* + * Locking: + * A global system lock is used to protect the RunQ. + * The global lock is referenced by schedule_data.schedule_lock + * from all physical cpus. + * + * The lock is already grabbed when calling wake/sleep/schedule/ functions + * in schedule.c + * + * The functions involes RunQ and needs to grab locks are: + * vcpu_insert, vcpu_remove, context_saved, __runq_insert + */ + + +/* + * Default parameters: + * Period and budget in default is 10 and 4 ms, respectively + */ +#define RT_DS_DEFAULT_PERIOD (MICROSECS(10000)) +#define RT_DS_DEFAULT_BUDGET (MICROSECS(4000)) + +/* + * Flags + */ +/* + * RT_scheduled: Is this vcpu either running on, or context-switching off, + * a phyiscal cpu? + * + Accessed only with Runqueue lock held. + * + Set when chosen as next in rt_schedule(). + * + Cleared after context switch has been saved in rt_context_saved() + * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should + * set RT_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __RT_scheduled 1 +#define RT_scheduled (1<<__RT_scheduled) +/* + * RT_delayed_runq_add: Do we need to add this to the Runqueueu once it'd done + * being context switching out? + * + Set when scheduling out in rt_schedule() if prev is runable + * + Set in rt_vcpu_wake if it finds RT_scheduled set + * + Read in rt_context_saved(). If set, it adds prev to the Runqueue and + * clears the bit. + */ +#define __RT_delayed_runq_add 2 +#define RT_delayed_runq_add (1<<__RT_delayed_runq_add) + +/* + * Debug only. Used to printout debug information + */ +#define printtime()\ + ({s_time_t now = NOW(); \ + printk("%u : %3ld.%3ldus : %-19s\n",smp_processor_id(),\ + now/MICROSECS(1), now%MICROSECS(1)/1000, __func__);} ) + +/* + * rt tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. + */ +#define TRC_RT_TICKLE TRC_SCHED_CLASS_EVT(RT, 1) +#define TRC_RT_RUNQ_PICK TRC_SCHED_CLASS_EVT(RT, 2) +#define TRC_RT_BUDGET_BURN TRC_SCHED_CLASS_EVT(RT, 3) +#define TRC_RT_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RT, 4) +#define TRC_RT_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RT, 5) +#define TRC_RT_VCPU_DUMP TRC_SCHED_CLASS_EVT(RT, 6) + +/* + * Systme-wide private data, include a global RunQueue + * Global lock is referenced by schedule_data.schedule_lock from all + * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() + */ +struct rt_private { + spinlock_t lock; /* The global coarse grand lock */ + struct list_head sdom; /* list of availalbe domains, used for dump */ + struct list_head runq; /* Ordered list of runnable VMs */ + struct rt_vcpu *flag_vcpu; /* position of the first depleted vcpu */ + cpumask_t cpus; /* cpumask_t of available physical cpus */ + cpumask_t tickled; /* cpus been tickled */ +}; + +/* + * Virtual CPU + */ +struct rt_vcpu { + struct list_head runq_elem; /* On the runqueue list */ + struct list_head sdom_elem; /* On the domain VCPU list */ + + /* Up-pointers */ + struct rt_dom *sdom; + struct vcpu *vcpu; + + /* VCPU parameters, in nanoseconds */ + s_time_t period; + s_time_t budget; + + /* VCPU current infomation in nanosecond */ + s_time_t cur_budget; /* current budget */ + s_time_t last_start; /* last start time */ + s_time_t cur_deadline; /* current deadline for EDF */ + + unsigned flags; /* mark __RT_scheduled, etc.. */ +}; + +/* + * Domain + */ +struct rt_dom { + struct list_head vcpu; /* link its VCPUs */ + struct list_head sdom_elem; /* link list on rt_priv */ + struct domain *dom; /* pointer to upper domain */ +}; + +/* + * Useful inline functions + */ +static inline struct rt_private *RT_PRIV(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct rt_vcpu *RT_VCPU(const struct vcpu *vcpu) +{ + return vcpu->sched_priv; +} + +static inline struct rt_dom *RT_DOM(const struct domain *dom) +{ + return dom->sched_priv; +} + +static inline struct list_head *RUNQ(const struct scheduler *ops) +{ + return &RT_PRIV(ops)->runq; +} + +/* + * RunQueue helper functions + */ +static int +__vcpu_on_runq(const struct rt_vcpu *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static struct rt_vcpu * +__runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_vcpu, runq_elem); +} + +/* + * Debug related code, dump vcpu/cpu information + */ +static void +rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) +{ + struct rt_private *prv = RT_PRIV(ops); + char cpustr[1024]; + cpumask_t *cpupool_mask; + + ASSERT(svc != NULL); + /* flag vcpu */ + if( svc->sdom == NULL ) + return; + + cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity); + printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," + " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime + " onR=%d runnable=%d cpu_hard_affinity=%s ", + svc->vcpu->domain->domain_id, + svc->vcpu->vcpu_id, + svc->vcpu->processor, + svc->period, + svc->budget, + svc->cur_budget, + svc->cur_deadline, + svc->last_start, + __vcpu_on_runq(svc), + vcpu_runnable(svc->vcpu), + cpustr); + memset(cpustr, 0, sizeof(cpustr)); + cpupool_mask = cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool); + cpumask_scnprintf(cpustr, sizeof(cpustr), cpupool_mask); + printk("cpupool=%s ", cpustr); + memset(cpustr, 0, sizeof(cpustr)); + cpumask_scnprintf(cpustr, sizeof(cpustr), &prv->cpus); + printk("prv->cpus=%s\n", cpustr); + + /* TRACE */ + { + struct { + unsigned dom:16,vcpu:16; + unsigned processor; + unsigned cur_budget_lo, cur_budget_hi; + unsigned cur_deadline_lo, cur_deadline_hi; + unsigned is_vcpu_on_runq:16,is_vcpu_runnable:16; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.processor = svc->vcpu->processor; + d.cur_budget_lo = (unsigned) svc->cur_budget; + d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32); + d.cur_deadline_lo = (unsigned) svc->cur_deadline; + d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32); + d.is_vcpu_on_runq = __vcpu_on_runq(svc); + d.is_vcpu_runnable = vcpu_runnable(svc->vcpu); + trace_var(TRC_RT_VCPU_DUMP, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static void +rt_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct rt_vcpu *svc = RT_VCPU(curr_on_cpu(cpu)); + + printtime(); + rt_dump_vcpu(ops, svc); +} + +/* + * should not need lock here. only showing stuff + */ +static void +rt_dump(const struct scheduler *ops) +{ + struct list_head *iter_sdom, *iter_svc, *runq, *iter; + struct rt_private *prv = RT_PRIV(ops); + struct rt_vcpu *svc; + unsigned int cpu = 0; + + printtime(); + + printk("PCPU info:\n"); + for_each_cpu(cpu, &prv->cpus) + rt_dump_pcpu(ops, cpu); + + printk("Global RunQueue info:\n"); + runq = RUNQ(ops); + list_for_each( iter, runq ) + { + svc = __runq_elem(iter); + rt_dump_vcpu(ops, svc); + } + + printk("Domain info:\n"); + list_for_each( iter_sdom, &prv->sdom ) + { + struct rt_dom *sdom; + sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem); + printk("\tdomain: %d\n", sdom->dom->domain_id); + + list_for_each( iter_svc, &sdom->vcpu ) + { + svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem); + rt_dump_vcpu(ops, svc); + } + } + + printk("\n"); +} + +/* + * update deadline and budget when deadline is in the past, + * it need to be updated to the deadline of the current period + */ +static void +rt_update_helper(s_time_t now, struct rt_vcpu *svc) +{ + s_time_t diff = now - svc->cur_deadline; + + if ( diff >= 0 ) + { + /* now can be later for several periods */ + long count = ( diff/svc->period ) + 1; + svc->cur_deadline += count * svc->period; + svc->cur_budget = svc->budget; + + /* TRACE */ + { + struct { + unsigned dom:16,vcpu:16; + unsigned cur_budget_lo, cur_budget_hi; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.cur_budget_lo = (unsigned) svc->cur_budget; + d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32); + trace_var(TRC_RT_BUDGET_REPLENISH, 1, + sizeof(d), + (unsigned char *) &d); + } + + return; + } +} + +static inline void +__runq_remove(struct rt_vcpu *svc) +{ + if ( __vcpu_on_runq(svc) ) + list_del_init(&svc->runq_elem); +} + +/* + * Insert svc in the RunQ according to EDF: vcpus with smaller deadlines + * goes first. + */ +static void +__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) +{ + struct rt_private *prv = RT_PRIV(ops); + struct list_head *runq = RUNQ(ops); + struct list_head *iter; + spinlock_t *schedule_lock; + + schedule_lock = per_cpu(schedule_data, svc->vcpu->processor).schedule_lock; + ASSERT( spin_is_locked(schedule_lock) ); + + ASSERT( !__vcpu_on_runq(svc) ); + + /* svc still has budget */ + if ( svc->cur_budget > 0 ) + { + list_for_each(iter, runq) + { + struct rt_vcpu * iter_svc = __runq_elem(iter); + if ( iter_svc->cur_budget == 0 || + svc->cur_deadline <= iter_svc->cur_deadline ) + break; + } + list_add_tail(&svc->runq_elem, iter); + } + else + { + list_add(&svc->runq_elem, &prv->flag_vcpu->runq_elem); + } +} + +/* + * Init/Free related code + */ +static int +rt_init(struct scheduler *ops) +{ + struct rt_private *prv = xzalloc(struct rt_private); + + printk("Initializing RT scheduler\n" \ + " WARNING: This is experimental software in development.\n" \ + " Use at your own risk.\n"); + + if ( prv == NULL ) + return -ENOMEM; + + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); + INIT_LIST_HEAD(&prv->runq); + + prv->flag_vcpu = xzalloc(struct rt_vcpu); + prv->flag_vcpu->cur_budget = 0; + prv->flag_vcpu->sdom = NULL; /* distinguish this vcpu with others */ + list_add(&prv->flag_vcpu->runq_elem, &prv->runq); + + cpumask_clear(&prv->cpus); + cpumask_clear(&prv->tickled); + + ops->sched_data = prv; + + printtime(); + printk("\n"); + + return 0; +} + +static void +rt_deinit(const struct scheduler *ops) +{ + struct rt_private *prv = RT_PRIV(ops); + + printtime(); + printk("\n"); + xfree(prv->flag_vcpu); + xfree(prv); +} + +/* + * Point per_cpu spinlock to the global system lock; + * All cpu have same global system lock + */ +static void * +rt_alloc_pdata(const struct scheduler *ops, int cpu) +{ + struct rt_private *prv = RT_PRIV(ops); + + cpumask_set_cpu(cpu, &prv->cpus); + + per_cpu(schedule_data, cpu).schedule_lock = &prv->lock; + + printtime(); + printk("%s total cpus: %d", __func__, cpumask_weight(&prv->cpus)); + /* 1 indicates alloc. succeed in schedule.c */ + return (void *)1; +} + +static void +rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct rt_private * prv = RT_PRIV(ops); + cpumask_clear_cpu(cpu, &prv->cpus); +} + +static void * +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + unsigned long flags; + struct rt_dom *sdom; + struct rt_private * prv = RT_PRIV(ops); + + sdom = xzalloc(struct rt_dom); + if ( sdom == NULL ) + { + printk("%s, xzalloc failed\n", __func__); + return NULL; + } + + INIT_LIST_HEAD(&sdom->vcpu); + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + + /* spinlock here to insert the dom */ + spin_lock_irqsave(&prv->lock, flags); + list_add_tail(&sdom->sdom_elem, &(prv->sdom)); + spin_unlock_irqrestore(&prv->lock, flags); + + return sdom; +} + +static void +rt_free_domdata(const struct scheduler *ops, void *data) +{ + unsigned long flags; + struct rt_dom *sdom = data; + struct rt_private *prv = RT_PRIV(ops); + + spin_lock_irqsave(&prv->lock, flags); + list_del_init(&sdom->sdom_elem); + spin_unlock_irqrestore(&prv->lock, flags); + xfree(data); +} + +static int +rt_dom_init(const struct scheduler *ops, struct domain *dom) +{ + struct rt_dom *sdom; + + /* IDLE Domain does not link on rt_private */ + if ( is_idle_domain(dom) ) + return 0; + + sdom = rt_alloc_domdata(ops, dom); + if ( sdom == NULL ) + { + printk("%s, failed\n", __func__); + return -ENOMEM; + } + dom->sched_priv = sdom; + + return 0; +} + +static void +rt_dom_destroy(const struct scheduler *ops, struct domain *dom) +{ + rt_free_domdata(ops, RT_DOM(dom)); +} + +static void * +rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) +{ + struct rt_vcpu *svc; + s_time_t now = NOW(); + + /* Allocate per-VCPU info */ + svc = xzalloc(struct rt_vcpu); + if ( svc == NULL ) + { + printk("%s, xzalloc failed\n", __func__); + return NULL; + } + + INIT_LIST_HEAD(&svc->runq_elem); + INIT_LIST_HEAD(&svc->sdom_elem); + svc->flags = 0U; + svc->sdom = dd; + svc->vcpu = vc; + svc->last_start = 0; + + svc->period = RT_DS_DEFAULT_PERIOD; + if ( !is_idle_vcpu(vc) ) + svc->budget = RT_DS_DEFAULT_BUDGET; + + rt_update_helper(now, svc); + + /* Debug only: dump new vcpu's info */ + rt_dump_vcpu(ops, svc); + + return svc; +} + +static void +rt_free_vdata(const struct scheduler *ops, void *priv) +{ + struct rt_vcpu *svc = priv; + + /* Debug only: dump freed vcpu's info */ + rt_dump_vcpu(ops, svc); + xfree(svc); +} + +/* + * This function is called in sched_move_domain() in schedule.c + * When move a domain to a new cpupool. + * It inserts vcpus of moving domain to the scheduler's RunQ in + * dest. cpupool; and insert rt_vcpu svc to scheduler-specific + * vcpu list of the dom + */ +static void +rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) +{ + struct rt_vcpu *svc = RT_VCPU(vc); + + /* Debug only: dump info of vcpu to insert */ + rt_dump_vcpu(ops, svc); + + /* not addlocate idle vcpu to dom vcpu list */ + if ( is_idle_vcpu(vc) ) + return; + + if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) + __runq_insert(ops, svc); + + /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ + list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); +} + +/* + * Remove rt_vcpu svc from the old scheduler in source cpupool; and + * Remove rt_vcpu svc from scheduler-specific vcpu list of the dom + */ +static void +rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) +{ + struct rt_vcpu * const svc = RT_VCPU(vc); + struct rt_dom * const sdom = svc->sdom; + + rt_dump_vcpu(ops, svc); + + BUG_ON( sdom == NULL ); + BUG_ON( __vcpu_on_runq(svc) ); + + if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); + + if ( !is_idle_vcpu(vc) ) + list_del_init(&svc->sdom_elem); +} + +/* + * Pick a valid CPU for the vcpu vc + * Valid CPU of a vcpu is intesection of vcpu's affinity + * and available cpus + */ +static int +rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc) +{ + cpumask_t cpus; + cpumask_t *online; + int cpu; + struct rt_private * prv = RT_PRIV(ops); + + online = cpupool_scheduler_cpumask(vc->domain->cpupool); + cpumask_and(&cpus, &prv->cpus, online); + cpumask_and(&cpus, &cpus, vc->cpu_hard_affinity); + + cpu = cpumask_test_cpu(vc->processor, &cpus) + ? vc->processor + : cpumask_cycle(vc->processor, &cpus); + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); + + return cpu; +} + +/* + * Burn budget in nanosecond granularity + */ +static void +burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) +{ + s_time_t delta; + + /* don't burn budget for idle VCPU */ + if ( is_idle_vcpu(svc->vcpu) ) + return; + + rt_update_helper(now, svc); + + /* not burn budget when vcpu miss deadline */ + if ( now >= svc->cur_deadline ) + return; + + /* burn at nanoseconds level */ + delta = now - svc->last_start; + /* + * delta < 0 only happens in nested virtualization; + * TODO: how should we handle delta < 0 in a better way? + */ + if ( delta < 0 ) + { + printk("%s, ATTENTION: now is behind last_start! delta = %ld", + __func__, delta); + rt_dump_vcpu(ops, svc); + svc->last_start = now; + svc->cur_budget = 0; + return; + } + + if ( svc->cur_budget == 0 ) + return; + + svc->cur_budget -= delta; + if ( svc->cur_budget < 0 ) + svc->cur_budget = 0; + + /* TRACE */ + { + struct { + unsigned dom:16, vcpu:16; + unsigned cur_budget_lo; + unsigned cur_budget_hi; + int delta; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.cur_budget_lo = (unsigned) svc->cur_budget; + d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32); + d.delta = delta; + trace_var(TRC_RT_BUDGET_BURN, 1, + sizeof(d), + (unsigned char *) &d); + } +} + +/* + * RunQ is sorted. Pick first one within cpumask. If no one, return NULL + * lock is grabbed before calling this function + */ +static struct rt_vcpu * +__runq_pick(const struct scheduler *ops, cpumask_t mask) +{ + struct list_head *runq = RUNQ(ops); + struct list_head *iter; + struct rt_vcpu *svc = NULL; + struct rt_vcpu *iter_svc = NULL; + cpumask_t cpu_common; + cpumask_t *online; + struct rt_private * prv = RT_PRIV(ops); + + list_for_each(iter, runq) + { + iter_svc = __runq_elem(iter); + + /* flag vcpu */ + if(iter_svc->sdom == NULL) + break; + + /* mask cpu_hard_affinity & cpupool & priv->cpus */ + online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool); + cpumask_and(&cpu_common, online, &prv->cpus); + cpumask_and(&cpu_common, &cpu_common, iter_svc->vcpu->cpu_hard_affinity); + cpumask_and(&cpu_common, &mask, &cpu_common); + if ( cpumask_empty(&cpu_common) ) + continue; + + ASSERT( iter_svc->cur_budget > 0 ); + + svc = iter_svc; + break; + } + + /* TRACE */ + { + if( svc != NULL ) + { + struct { + unsigned dom:16, vcpu:16; + unsigned cur_deadline_lo, cur_deadline_hi; + unsigned cur_budget_lo, cur_budget_hi; + } d; + d.dom = svc->vcpu->domain->domain_id; + d.vcpu = svc->vcpu->vcpu_id; + d.cur_deadline_lo = (unsigned) svc->cur_deadline; + d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32); + d.cur_budget_lo = (unsigned) svc->cur_budget; + d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32); + trace_var(TRC_RT_RUNQ_PICK, 1, + sizeof(d), + (unsigned char *) &d); + } + else + trace_var(TRC_RT_RUNQ_PICK, 1, 0, NULL); + } + + return svc; +} + +/* + * Update vcpu's budget and sort runq by insert the modifed vcpu back to runq + * lock is grabbed before calling this function + */ +static void +__repl_update(const struct scheduler *ops, s_time_t now) +{ + struct list_head *runq = RUNQ(ops); + struct list_head *iter; + struct list_head *tmp; + struct rt_vcpu *svc = NULL; + + list_for_each_safe(iter, tmp, runq) + { + svc = __runq_elem(iter); + + /* not update flag_vcpu's budget */ + if(svc->sdom == NULL) + continue; + + rt_update_helper(now, svc); + /* reinsert the vcpu if its deadline is updated */ + if ( now >= 0 ) + { + __runq_remove(svc); + __runq_insert(ops, svc); + } + } +} + +/* + * schedule function for rt scheduler. + * The lock is already grabbed in schedule.c, no need to lock here + */ +static struct task_slice +rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) +{ + const int cpu = smp_processor_id(); + struct rt_private * prv = RT_PRIV(ops); + struct rt_vcpu * const scurr = RT_VCPU(current); + struct rt_vcpu * snext = NULL; + struct task_slice ret = { .migrated = 0 }; + + /* clear ticked bit now that we've been scheduled */ + if ( cpumask_test_cpu(cpu, &prv->tickled) ) + cpumask_clear_cpu(cpu, &prv->tickled); + + /* burn_budget would return for IDLE VCPU */ + burn_budgets(ops, scurr, now); + + __repl_update(ops, now); + + if ( tasklet_work_scheduled ) + { + snext = RT_VCPU(idle_vcpu[cpu]); + } + else + { + cpumask_t cur_cpu; + cpumask_clear(&cur_cpu); + cpumask_set_cpu(cpu, &cur_cpu); + snext = __runq_pick(ops, cur_cpu); + if ( snext == NULL ) + snext = RT_VCPU(idle_vcpu[cpu]); + + /* if scurr has higher priority and budget, still pick scurr */ + if ( !is_idle_vcpu(current) && + vcpu_runnable(current) && + scurr->cur_budget > 0 && + ( is_idle_vcpu(snext->vcpu) || + scurr->cur_deadline <= snext->cur_deadline ) ) + snext = scurr; + } + + if ( snext != scurr && + !is_idle_vcpu(current) && + vcpu_runnable(current) ) + set_bit(__RT_delayed_runq_add, &scurr->flags); + + + snext->last_start = now; + if ( !is_idle_vcpu(snext->vcpu) ) + { + if ( snext != scurr ) + { + __runq_remove(snext); + set_bit(__RT_scheduled, &snext->flags); + } + if ( snext->vcpu->processor != cpu ) + { + snext->vcpu->processor = cpu; + ret.migrated = 1; + } + } + + ret.time = MILLISECS(1); /* sched quantum */ + ret.task = snext->vcpu; + + /* TRACE */ + { + struct { + unsigned dom:16,vcpu:16; + unsigned cur_deadline_lo, cur_deadline_hi; + unsigned cur_budget_lo, cur_budget_hi; + } d; + d.dom = snext->vcpu->domain->domain_id; + d.vcpu = snext->vcpu->vcpu_id; + d.cur_deadline_lo = (unsigned) snext->cur_deadline; + d.cur_deadline_hi = (unsigned) (snext->cur_deadline >> 32); + d.cur_budget_lo = (unsigned) snext->cur_budget; + d.cur_budget_hi = (unsigned) (snext->cur_budget >> 32); + trace_var(TRC_RT_SCHED_TASKLET, 1, + sizeof(d), + (unsigned char *)&d); + } + + return ret; +} + +/* + * Remove VCPU from RunQ + * The lock is already grabbed in schedule.c, no need to lock here + */ +static void +rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) +{ + struct rt_vcpu * const svc = RT_VCPU(vc); + + BUG_ON( is_idle_vcpu(vc) ); + + if ( curr_on_cpu(vc->processor) == vc ) + cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); + else if ( __vcpu_on_runq(svc) ) + __runq_remove(svc); + else if ( test_bit(__RT_delayed_runq_add, &svc->flags) ) + clear_bit(__RT_delayed_runq_add, &svc->flags); +} + +/* + * Pick a vcpu on a cpu to kick out to place the running candidate + * Called by wake() and context_saved() + * We have a running candidate here, the kick logic is: + * Among all the cpus that are within the cpu affinity + * 1) if the new->cpu is idle, kick it. This could benefit cache hit + * 2) if there are any idle vcpu, kick it. + * 3) now all pcpus are busy, among all the running vcpus, pick lowest priority one + * if snext has higher priority, kick it. + * + * TODO: + * 1) what if these two vcpus belongs to the same domain? + * replace a vcpu belonging to the same domain introduces more overhead + * + * lock is grabbed before calling this function + */ +static void +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new) +{ + struct rt_private * prv = RT_PRIV(ops); + struct rt_vcpu * latest_deadline_vcpu = NULL; /* lowest priority scheduled */ + struct rt_vcpu * iter_svc; + struct vcpu * iter_vc; + int cpu = 0, cpu_to_tickle = 0; + cpumask_t not_tickled; + cpumask_t *online; + + if ( new == NULL || is_idle_vcpu(new->vcpu) ) + return; + + online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool); + cpumask_and(¬_tickled, online, &prv->cpus); + cpumask_and(¬_tickled, ¬_tickled, new->vcpu->cpu_hard_affinity); + cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); + + /* 1) if new's previous cpu is idle, kick it for cache benefit */ + if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) + { + cpu_to_tickle = new->vcpu->processor; + goto out; + } + + /* 2) if there are any idle pcpu, kick it */ + /* The same loop also find the one with lowest priority */ + for_each_cpu(cpu, ¬_tickled) + { + iter_vc = curr_on_cpu(cpu); + if ( is_idle_vcpu(iter_vc) ) + { + cpu_to_tickle = cpu; + goto out; + } + iter_svc = RT_VCPU(iter_vc); + if ( latest_deadline_vcpu == NULL || + iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline ) + latest_deadline_vcpu = iter_svc; + } + + /* 3) candicate has higher priority, kick out the lowest priority vcpu */ + if ( latest_deadline_vcpu != NULL && new->cur_deadline < latest_deadline_vcpu->cur_deadline ) + { + cpu_to_tickle = latest_deadline_vcpu->vcpu->processor; + goto out; + } + +out: + /* TRACE */ + { + struct { + unsigned cpu:8, pad:24; + } d; + d.cpu = cpu_to_tickle; + d.pad = 0; + trace_var(TRC_RT_TICKLE, 0, + sizeof(d), + (unsigned char *)&d); + } + + cpumask_set_cpu(cpu_to_tickle, &prv->tickled); + cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); + return; +} + +/* + * Should always wake up runnable vcpu, put it back to RunQ. + * Check priority to raise interrupt + * The lock is already grabbed in schedule.c, no need to lock here + * TODO: what if these two vcpus belongs to the same domain? + */ +static void +rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) +{ + struct rt_vcpu * const svc = RT_VCPU(vc); + s_time_t now = NOW(); + struct rt_private * prv = RT_PRIV(ops); + struct rt_vcpu * snext = NULL; /* highest priority on RunQ */ + + BUG_ON( is_idle_vcpu(vc) ); + + if ( unlikely(curr_on_cpu(vc->processor) == vc) ) + return; + + /* on RunQ, just update info is ok */ + if ( unlikely(__vcpu_on_runq(svc)) ) + return; + + /* If context hasn't been saved for this vcpu yet, we can't put it on + * the Runqueue. Instead, we set a flag so that it will be put on the Runqueue + * After the context has been saved. + */ + if ( unlikely(test_bit(__RT_scheduled, &svc->flags)) ) + { + set_bit(__RT_delayed_runq_add, &svc->flags); + return; + } + + rt_update_helper(now, svc); + + __runq_insert(ops, svc); + __repl_update(ops, now); + snext = __runq_pick(ops, prv->cpus); /* pick snext from ALL valid cpus */ + runq_tickle(ops, snext); + + return; +} + +/* + * scurr has finished context switch, insert it back to the RunQ, + * and then pick the highest priority vcpu from runq to run + */ +static void +rt_context_saved(const struct scheduler *ops, struct vcpu *vc) +{ + struct rt_vcpu * svc = RT_VCPU(vc); + struct rt_vcpu * snext = NULL; + struct rt_private * prv = RT_PRIV(ops); + spinlock_t *lock = vcpu_schedule_lock_irq(vc); + + clear_bit(__RT_scheduled, &svc->flags); + /* not insert idle vcpu to runq */ + if ( is_idle_vcpu(vc) ) + goto out; + + if ( test_and_clear_bit(__RT_delayed_runq_add, &svc->flags) && + likely(vcpu_runnable(vc)) ) + { + __runq_insert(ops, svc); + __repl_update(ops, NOW()); + snext = __runq_pick(ops, prv->cpus); /* pick snext from ALL cpus */ + runq_tickle(ops, snext); + } +out: + vcpu_schedule_unlock_irq(lock, vc); +} + +/* + * set/get each vcpu info of each domain + */ +static int +rt_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct rt_dom * const sdom = RT_DOM(d); + struct rt_vcpu * svc; + struct list_head *iter; + int rc = 0; + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + /* for debug use, whenever adjust Dom0 parameter, do global dump */ + if ( d->domain_id == 0 ) + rt_dump(ops); + + svc = list_entry(sdom->vcpu.next, struct rt_vcpu, sdom_elem); + op->u.rt.period = svc->period / MICROSECS(1); /* transfer to us */ + op->u.rt.budget = svc->budget / MICROSECS(1); + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + list_for_each( iter, &sdom->vcpu ) + { + struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem); + svc->period = MICROSECS(op->u.rt.period); /* transfer to nanosec */ + svc->budget = MICROSECS(op->u.rt.budget); + } + break; + } + + return rc; +} + +static struct rt_private _rt_priv; + +const struct scheduler sched_rt_ds_def = { + .name = "SMP RT DS Scheduler", + .opt_name = "rt_ds", + .sched_id = XEN_SCHEDULER_RT_DS, + .sched_data = &_rt_priv, + + .dump_cpu_state = rt_dump_pcpu, + .dump_settings = rt_dump, + .init = rt_init, + .deinit = rt_deinit, + .alloc_pdata = rt_alloc_pdata, + .free_pdata = rt_free_pdata, + .alloc_domdata = rt_alloc_domdata, + .free_domdata = rt_free_domdata, + .init_domain = rt_dom_init, + .destroy_domain = rt_dom_destroy, + .alloc_vdata = rt_alloc_vdata, + .free_vdata = rt_free_vdata, + .insert_vcpu = rt_vcpu_insert, + .remove_vcpu = rt_vcpu_remove, + + .adjust = rt_dom_cntl, + + .pick_cpu = rt_cpu_pick, + .do_schedule = rt_schedule, + .sleep = rt_vcpu_sleep, + .wake = rt_vcpu_wake, + .context_saved = rt_context_saved, +}; diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 73cc2ea..dc4f749 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -69,6 +69,7 @@ static const struct scheduler *schedulers[] = { &sched_credit_def, &sched_credit2_def, &sched_arinc653_def, + &sched_rt_ds_def, }; static struct scheduler __read_mostly ops; diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index 69a8b44..11654d0 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -347,6 +347,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t); #define XEN_SCHEDULER_CREDIT 5 #define XEN_SCHEDULER_CREDIT2 6 #define XEN_SCHEDULER_ARINC653 7 +#define XEN_SCHEDULER_RT_DS 8 + /* Set or get info? */ #define XEN_DOMCTL_SCHEDOP_putinfo 0 #define XEN_DOMCTL_SCHEDOP_getinfo 1 @@ -368,6 +370,10 @@ struct xen_domctl_scheduler_op { struct xen_domctl_sched_credit2 { uint16_t weight; } credit2; + struct xen_domctl_sched_rt{ + uint32_t period; + uint32_t budget; + } rt; } u; }; typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t; diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h index cfcf4aa..87340c4 100644 --- a/xen/include/public/trace.h +++ b/xen/include/public/trace.h @@ -77,6 +77,7 @@ #define TRC_SCHED_CSCHED2 1 #define TRC_SCHED_SEDF 2 #define TRC_SCHED_ARINC653 3 +#define TRC_SCHED_RT 4 /* Per-scheduler tracing */ #define TRC_SCHED_CLASS_EVT(_c, _e) \ diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h index 4164dff..04d81dc 100644 --- a/xen/include/xen/sched-if.h +++ b/xen/include/xen/sched-if.h @@ -169,6 +169,7 @@ extern const struct scheduler sched_sedf_def; extern const struct scheduler sched_credit_def; extern const struct scheduler sched_credit2_def; extern const struct scheduler sched_arinc653_def; +extern const struct scheduler sched_rt_ds_def; struct cpupool -- 1.7.9.5