All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrii Anisov <andrii.anisov@gmail.com>
To: xen-devel@lists.xenproject.org
Cc: Stefano Stabellini <sstabellini@kernel.org>,
	Andrii Anisov <andrii_anisov@epam.com>, Wei Liu <wl@xen.org>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
	George Dunlap <george.dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Ian Jackson <ian.jackson@eu.citrix.com>, Tim Deegan <tim@xen.org>,
	Julien Grall <julien.grall@arm.com>,
	Meng Xu <mengxu@cis.upenn.edu>, Jan Beulich <jbeulich@suse.com>,
	Dario Faggioli <dfaggioli@suse.com>,
	Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
Subject: [Xen-devel] [RFC 6/6] schedule: account all the hypervisor time to the idle vcpu
Date: Fri, 26 Jul 2019 13:37:40 +0300	[thread overview]
Message-ID: <1564137460-25629-8-git-send-email-andrii.anisov@gmail.com> (raw)
In-Reply-To: <1564137460-25629-1-git-send-email-andrii.anisov@gmail.com>

From: Andrii Anisov <andrii_anisov@epam.com>

Account to a guest (its vcpu):
 - guest running time
 - guest sync trap serving time (hypercalls, trapped emulated iomem accesses, etc.)
 - vcpu jobs in leave_hypervisor_tail

Account to the hypervisor (the idle vcpu):
 - IRQ processing
 - softirq processing

Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
---
 xen/arch/arm/traps.c       | 49 ++++++++++++++++++++++++++----
 xen/common/sched_credit.c  |  2 +-
 xen/common/sched_credit2.c |  4 +--
 xen/common/sched_rt.c      |  2 +-
 xen/common/schedule.c      | 74 +++++++++++++++++++++++++++++++++++++++-------
 xen/include/xen/sched.h    |  5 ++++
 6 files changed, 116 insertions(+), 20 deletions(-)

diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index 13726db..f978b94 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -2064,7 +2064,7 @@ void do_trap_guest_sync(struct cpu_user_regs *regs)
         if ( !check_conditional_instr(regs, hsr) )
         {
             advance_pc(regs, hsr);
-            return;
+            break;
         }
         if ( hsr.wfi_wfe.ti ) {
             /* Yield the VCPU for WFE */
@@ -2126,10 +2126,16 @@ void do_trap_guest_sync(struct cpu_user_regs *regs)
         perfc_incr(trap_hvc32);
 #ifndef NDEBUG
         if ( (hsr.iss & 0xff00) == 0xff00 )
-            return do_debug_trap(regs, hsr.iss & 0x00ff);
+        {
+            do_debug_trap(regs, hsr.iss & 0x00ff);
+            break;
+        }
 #endif
         if ( hsr.iss == 0 )
-            return do_trap_hvc_smccc(regs);
+        {
+            do_trap_hvc_smccc(regs);
+            break;
+        }
         nr = regs->r12;
         do_trap_hypercall(regs, &nr, hsr);
         regs->r12 = (uint32_t)nr;
@@ -2141,10 +2147,16 @@ void do_trap_guest_sync(struct cpu_user_regs *regs)
         perfc_incr(trap_hvc64);
 #ifndef NDEBUG
         if ( (hsr.iss & 0xff00) == 0xff00 )
-            return do_debug_trap(regs, hsr.iss & 0x00ff);
+        {
+            do_debug_trap(regs, hsr.iss & 0x00ff);
+            break;
+        }
 #endif
         if ( hsr.iss == 0 )
-            return do_trap_hvc_smccc(regs);
+        {
+            do_trap_hvc_smccc(regs);
+            break;
+        }
         do_trap_hypercall(regs, &regs->x16, hsr);
         break;
     case HSR_EC_SMC64:
@@ -2179,6 +2191,11 @@ void do_trap_guest_sync(struct cpu_user_regs *regs)
                 hsr.bits, hsr.ec, hsr.len, hsr.iss);
         inject_undef_exception(regs, hsr);
     }
+
+    local_irq_disable();
+    hyp_tacc_head(1);
+
+    /*we will call tacc tail from the leave_hypervisor_tail*/
 }
 
 void do_trap_hyp_sync(struct cpu_user_regs *regs)
@@ -2219,6 +2236,7 @@ void do_trap_hyp_sync(struct cpu_user_regs *regs)
                hsr.bits, hsr.ec, hsr.len, hsr.iss);
         do_unexpected_trap("Hypervisor", regs);
     }
+
 }
 
 void do_trap_hyp_serror(struct cpu_user_regs *regs)
@@ -2234,28 +2252,47 @@ void do_trap_guest_serror(struct cpu_user_regs *regs)
     local_irq_enable();
 
     __do_trap_serror(regs, true);
+
+    local_irq_disable();
+    hyp_tacc_head(2);
 }
 
 void do_trap_guest_irq(struct cpu_user_regs *regs)
 {
+    hyp_tacc_head(3);
+
     enter_hypervisor_head();
     gic_interrupt(regs, 0);
+
+    /*we will call tacc tail from the leave_hypervisor_tail*/
 }
 
 void do_trap_guest_fiq(struct cpu_user_regs *regs)
 {
+    hyp_tacc_head(4);
+
     enter_hypervisor_head();
     gic_interrupt(regs, 1);
+
+    /*we will call tacc tail from the leave_hypervisor_tail*/
 }
 
 void do_trap_hyp_irq(struct cpu_user_regs *regs)
 {
+    hyp_tacc_head(5);
+
     gic_interrupt(regs, 0);
+
+    hyp_tacc_tail(5);
 }
 
 void do_trap_hyp_fiq(struct cpu_user_regs *regs)
 {
+    hyp_tacc_head(6);
+
     gic_interrupt(regs, 1);
+
+    hyp_tacc_tail(6);
 }
 
 static void check_for_pcpu_work(void)
@@ -2318,6 +2355,8 @@ void leave_hypervisor_tail(void)
      */
     SYNCHRONIZE_SERROR(SKIP_SYNCHRONIZE_SERROR_ENTRY_EXIT);
 
+    hyp_tacc_tail(1234);
+
     /*
      * The hypervisor runs with the workaround always present.
      * If the guest wants it disabled, so be it...
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 3c0d7c7..b8d866b 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -1856,7 +1856,7 @@ csched_schedule(
                     (unsigned char *)&d);
     }
 
-    runtime = now - current->runstate.state_entry_time;
+    runtime = current->runtime;
     if ( runtime < 0 ) /* Does this ever happen? */
         runtime = 0;
 
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 8e4381d..2d11a5f 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -3285,7 +3285,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
      * no point forcing it to do so until rate limiting expires.
      */
     if ( !yield && prv->ratelimit_us && vcpu_runnable(scurr->vcpu) &&
-         (now - scurr->vcpu->runstate.state_entry_time) <
+          scurr->vcpu->runtime <
           MICROSECS(prv->ratelimit_us) )
     {
         if ( unlikely(tb_init_done) )
@@ -3296,7 +3296,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
             } d;
             d.dom = scurr->vcpu->domain->domain_id;
             d.vcpu = scurr->vcpu->vcpu_id;
-            d.runtime = now - scurr->vcpu->runstate.state_entry_time;
+            d.runtime = scurr->vcpu->runtime;
             __trace_var(TRC_CSCHED2_RATELIMIT, 1,
                         sizeof(d),
                         (unsigned char *)&d);
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 0acfc3d..f1de511 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -947,7 +947,7 @@ burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
         return;
 
     /* burn at nanoseconds level */
-    delta = now - svc->last_start;
+    delta = svc->vcpu->runtime;
     /*
      * delta < 0 only happens in nested virtualization;
      * TODO: how should we handle delta < 0 in a better way?
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 9e8805d..d3246f9 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1504,20 +1504,16 @@ static void schedule(void)
              (now - next->runstate.state_entry_time) : 0,
              next_slice.time);
 
-    ASSERT(prev->runstate.state == RUNSTATE_running);
-
     TRACE_4D(TRC_SCHED_SWITCH,
              prev->domain->domain_id, prev->vcpu_id,
              next->domain->domain_id, next->vcpu_id);
 
-    vcpu_runstate_change(
-        prev,
-        ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
-         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
-        now);
-
-    ASSERT(next->runstate.state != RUNSTATE_running);
-    vcpu_runstate_change(next, RUNSTATE_running, now);
+    if ( !vcpu_runnable(prev) )
+        vcpu_runstate_change(
+            prev,
+            ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
+             RUNSTATE_offline),
+            now);
 
     /*
      * NB. Don't add any trace records from here until the actual context
@@ -1526,6 +1522,7 @@ static void schedule(void)
 
     ASSERT(!next->is_running);
     next->is_running = 1;
+    next->runtime = 0;
 
     pcpu_schedule_unlock_irq(lock, cpu);
 
@@ -1541,6 +1538,58 @@ static void schedule(void)
     context_switch(prev, next);
 }
 
+DEFINE_PER_CPU(int, hyp_tacc_cnt);
+
+void hyp_tacc_head(int place)
+{
+    //printk("\thead cpu %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(hyp_tacc_cnt));
+
+    ASSERT(this_cpu(hyp_tacc_cnt) >= 0);
+
+    if ( this_cpu(hyp_tacc_cnt) == 0 )
+    {
+        s_time_t now = NOW();
+        spin_lock(per_cpu(schedule_data,smp_processor_id()).schedule_lock);
+        /*
+         * Stop time accounting for guest (guest vcpu)
+         */
+        ASSERT( (current->runstate.state_entry_time & XEN_RUNSTATE_UPDATE) == 0);
+        current->runtime += now - current->runstate.state_entry_time;
+        vcpu_runstate_change(current, RUNSTATE_runnable, now);
+        /*
+         * Start time accounting for hyp (idle vcpu)
+         */
+        vcpu_runstate_change(idle_vcpu[smp_processor_id()], RUNSTATE_running, now);
+        spin_unlock(per_cpu(schedule_data,smp_processor_id()).schedule_lock);
+    }
+
+    this_cpu(hyp_tacc_cnt)++;
+}
+
+void hyp_tacc_tail(int place)
+{
+    //printk("\t\t\t\ttail cpu %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(hyp_tacc_cnt));
+
+    ASSERT(this_cpu(hyp_tacc_cnt) > 0);
+
+    if (this_cpu(hyp_tacc_cnt) == 1)
+    {
+        s_time_t now = NOW();
+        spin_lock(per_cpu(schedule_data,smp_processor_id()).schedule_lock);
+        /*
+         * Stop time accounting for hyp (idle vcpu)
+         */
+        vcpu_runstate_change(idle_vcpu[smp_processor_id()], RUNSTATE_runnable, now);
+        /*
+         * Start time accounting for guest (guest vcpu)
+         */
+        vcpu_runstate_change(current, RUNSTATE_running, now);
+        spin_unlock(per_cpu(schedule_data,smp_processor_id()).schedule_lock);
+    }
+
+    this_cpu(hyp_tacc_cnt)--;
+}
+
 void context_saved(struct vcpu *prev)
 {
     /* Clear running flag /after/ writing context to memory. */
@@ -1597,8 +1646,9 @@ static int cpu_schedule_up(unsigned int cpu)
     sd->curr = idle_vcpu[cpu];
     init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
     atomic_set(&sd->urgent_count, 0);
+    per_cpu(hyp_tacc_cnt, cpu) = 1;
 
-    /* Boot CPU is dealt with later in schedule_init(). */
+    /* Boot CPU is dealt with later in scheduler_init(). */
     if ( cpu == 0 )
         return 0;
 
@@ -1654,6 +1704,8 @@ static void cpu_schedule_down(unsigned int cpu)
     sd->sched_priv = NULL;
 
     kill_timer(&sd->s_timer);
+
+    per_cpu(hyp_tacc_cnt, cpu) = 0;
 }
 
 static int cpu_schedule_callback(
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 5e28797..9391318 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -174,6 +174,8 @@ struct vcpu
     } runstate_guest; /* guest address */
 #endif
 
+    s_time_t runtime;
+
     /* Has the FPU been initialised? */
     bool             fpu_initialised;
     /* Has the FPU been used since it was last saved? */
@@ -998,6 +1000,9 @@ extern void dump_runq(unsigned char key);
 
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
 
+void hyp_tacc_head(int place);
+void hyp_tacc_tail(int place);
+
 #endif /* __SCHED_H__ */
 
 /*
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

  parent reply	other threads:[~2019-07-26 10:38 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-07-26 10:37 [Xen-devel] [RFC 0/6] XEN scheduling hardening Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 1/6] xen/arm: Re-enable interrupt later in the trap path Andrii Anisov
2019-07-26 10:48   ` Julien Grall
2019-07-30 17:35     ` Andrii Anisov
2019-07-30 20:10       ` Julien Grall
2019-08-01  6:45         ` Andrii Anisov
2019-08-01  9:37           ` Julien Grall
2019-08-02  8:28             ` Andrii Anisov
2019-08-02  9:03               ` Julien Grall
2019-08-02 12:24                 ` Andrii Anisov
2019-08-02 13:22                   ` Julien Grall
2019-08-01 11:19           ` Dario Faggioli
2019-08-02  7:50             ` Andrii Anisov
2019-08-02  9:15               ` Julien Grall
2019-08-02 13:07                 ` Andrii Anisov
2019-08-02 13:49                   ` Julien Grall
2019-08-03  1:39                     ` Dario Faggioli
2019-08-03  0:55                   ` Dario Faggioli
2019-08-06 13:09                     ` Andrii Anisov
2019-08-08 14:07                       ` Andrii Anisov
2019-08-13 14:45                         ` Dario Faggioli
2019-08-15 18:25                           ` Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 2/6] schedule: account true system idle time Andrii Anisov
2019-07-26 12:00   ` Dario Faggioli
2019-07-26 12:42     ` Andrii Anisov
2019-07-29 11:40       ` Dario Faggioli
2019-08-01  8:23         ` Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 3/6] sysctl: extend XEN_SYSCTL_getcpuinfo interface Andrii Anisov
2019-07-26 12:15   ` Dario Faggioli
2019-07-26 13:06     ` Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 4/6] xentop: show CPU load information Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 5/6] arm64: сall enter_hypervisor_head only when it is needed Andrii Anisov
2019-07-26 10:44   ` Andrii Anisov
2019-07-26 10:37 ` [Xen-devel] [RFC 5/6] arm64: call " Andrii Anisov
2019-07-26 10:59   ` Julien Grall
2019-07-30 17:35     ` Andrii Anisov
2019-07-31 11:02       ` Julien Grall
2019-07-31 11:33         ` Andre Przywara
2019-08-01  7:33         ` Andrii Anisov
2019-08-01 10:17           ` Julien Grall
2019-08-02 13:50             ` Andrii Anisov
2019-07-26 10:37 ` Andrii Anisov [this message]
2019-07-26 11:56 ` [Xen-devel] [RFC 0/6] XEN scheduling hardening Dario Faggioli
2019-07-26 12:14   ` Juergen Gross
2019-07-29 11:53     ` Dario Faggioli
2019-07-29 12:13       ` Juergen Gross
2019-07-29 14:47     ` Andrii Anisov
2019-07-29 18:46       ` Dario Faggioli
2019-07-29 14:28   ` Andrii Anisov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1564137460-25629-8-git-send-email-andrii.anisov@gmail.com \
    --to=andrii.anisov@gmail.com \
    --cc=Volodymyr_Babchuk@epam.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=andrii_anisov@epam.com \
    --cc=dfaggioli@suse.com \
    --cc=george.dunlap@eu.citrix.com \
    --cc=ian.jackson@eu.citrix.com \
    --cc=jbeulich@suse.com \
    --cc=julien.grall@arm.com \
    --cc=konrad.wilk@oracle.com \
    --cc=mengxu@cis.upenn.edu \
    --cc=sstabellini@kernel.org \
    --cc=tim@xen.org \
    --cc=wl@xen.org \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.