From: Chao Gao <chao.gao@intel.com>
To: George Dunlap <george.dunlap@citrix.com>,
	Jan Beulich <jbeulich@suse.com>,
	Kevin Tian <kevin.tian@intel.com>
Cc: Wei Liu <wei.liu2@citrix.com>,
	George Dunlap <George.Dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Ian Jackson <ian.jackson@eu.citrix.com>,
	xen-devel@lists.xen.org, Jun Nakajima <jun.nakajima@intel.com>
Subject: Re: [PATCH 0/4] mitigate the per-pCPU blocking list may be too long
Date: Tue, 2 May 2017 13:45:01 +0800	[thread overview]
Message-ID: <20170502054459.GA13105@skl-2s3.sh.intel.com> (raw)
In-Reply-To: <15f405cc-04aa-ac3d-8ae2-17f684b21d36@citrix.com>

On Wed, Apr 26, 2017 at 05:39:57PM +0100, George Dunlap wrote:
>On 26/04/17 01:52, Chao Gao wrote:
>> I compared the maximum of #entry in one list and #event (adding entry to
>> PI blocking list) with and without the three latter patches. Here
>> is the result:
>> -------------------------------------------------------------
>> |               |                      |                    |
>> |    Items      |   Maximum of #entry  |      #event        |
>> |               |                      |                    |
>> -------------------------------------------------------------
>> |               |                      |                    |
>> |W/ the patches |         6            |       22740        |
>> |               |                      |                    |
>> -------------------------------------------------------------
>> |               |                      |                    |
>> |W/O the patches|        128           |       46481        |
>> |               |                      |                    |
>> -------------------------------------------------------------
>
>Any chance you could trace how long the list traversal took?  It would
>be good for future reference to have an idea what kinds of timescales
>we're talking about.

Hi.

I ran a simple test to measure the time consumed by the list traversal.
I applied the patch below and created one HVM guest with 128 vCPUs and a
passthrough 40 NIC. All guest vCPUs were pinned to one pCPU. I collected
data with 'xentrace -D -e 0x82000 -T 300 trace.bin' and decoded it with
xentrace_format. When the list length is about 128, the traversal time
is in the range of 1750 to 39330 cycles. The physical CPU's frequency is
1795.788MHz, so the time consumed is in the range of about 1us to 22us.
If 0.5ms is the upper bound the system can tolerate, at most about 2900
vCPUs can be added to the list.
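
For reference, here is a minimal standalone sketch of the arithmetic
above (the TSC frequency, cycle counts and list length are the measured
values from this test; the 0.5ms budget is the assumed tolerance):

/*
 * Back-of-the-envelope check of the numbers above (not part of the
 * patch): convert the measured cycle counts to time at the reported
 * TSC frequency and estimate how many blocked vCPUs fit in a 0.5ms
 * budget.
 */
#include <stdio.h>

int main(void)
{
    const double tsc_mhz    = 1795.788;  /* cycles per microsecond   */
    const double min_cycles = 1750.0;    /* shortest traversal seen  */
    const double max_cycles = 39330.0;   /* longest traversal seen   */
    const double list_len   = 128.0;     /* entries during the test  */
    const double budget_us  = 500.0;     /* 0.5ms tolerated latency  */

    double min_us = min_cycles / tsc_mhz;        /* ~1us  */
    double max_us = max_cycles / tsc_mhz;        /* ~22us */
    double us_per_entry = max_us / list_len;     /* worst case per vCPU */

    printf("traversal: %.1fus .. %.1fus\n", min_us, max_us);
    printf("max vCPUs within %.0fus: ~%.0f\n",
           budget_us, budget_us / us_per_entry); /* ~2900 */
    return 0;
}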

I hope there is no error in the test and analysis.

Thanks
Chao

---8<---
From 504fd32bc042670812daad41efcd982434b98cd4 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Wed, 26 Apr 2017 03:39:06 +0800
Subject: [PATCH] xentrace: trace PI-related events.

This patch adds TRC_HVM_VT_D_PI_BLOCK, TRC_HVM_PI_WAKEUP_START and
TRC_HVM_PI_WAKEUP_END to track PI-related events. Specifically,
TRC_HVM_VT_D_PI_BLOCK tracks adding one entry to the per-pCPU
blocking list. TRC_HVM_PI_WAKEUP_{START, END} mark the start and end
of the PI blocking list traversal. Also introduce a 'counter' to track
the number of entries in the list.

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
 tools/xentrace/formats          |  3 +++
 xen/arch/x86/hvm/vmx/vmx.c      | 13 ++++++++++++-
 xen/include/asm-x86/hvm/trace.h |  3 +++
 xen/include/public/trace.h      |  3 +++
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tools/xentrace/formats b/tools/xentrace/formats
index 8b31780..34ed9e4 100644
--- a/tools/xentrace/formats
+++ b/tools/xentrace/formats
@@ -125,6 +125,9 @@
 0x00082020  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INTR_WINDOW [ value = 0x%(1)08x ]
 0x00082021  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  NPF         [ gpa = 0x%(2)08x%(1)08x mfn = 0x%(4)08x%(3)08x qual = 0x%(5)04x p2mt = 0x%(6)04x ]
 0x00082023  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  TRAP        [ vector = 0x%(1)02x ]
+0x00082026  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PI_BLOCK_LIST  [ domid = 0x%(1)04x vcpu = 0x%(2)04x, pcpu = 0x%(3)04x, #entry = 0x%(4)04x ]
+0x00082027  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PI_WAKEUP_START [ list_len = 0x%(1)04x ]
+0x00082028  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PI_WAKEUP_END
 
 0x0010f001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_grant_map      [ domid = %(1)d ]
 0x0010f002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_grant_unmap    [ domid = %(1)d ]
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index c8ef18a..3a6640b 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -82,6 +82,7 @@ static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
 struct vmx_pi_blocking_vcpu {
     struct list_head     list;
     spinlock_t           lock;
+    atomic_t             counter;
 };
 
 /*
@@ -119,6 +120,9 @@ static void vmx_vcpu_block(struct vcpu *v)
      */
     ASSERT(old_lock == NULL);
 
+    atomic_inc(&per_cpu(vmx_pi_blocking, v->processor).counter);
+    HVMTRACE_4D(VT_D_PI_BLOCK, v->domain->domain_id, v->vcpu_id, v->processor,
+                atomic_read(&per_cpu(vmx_pi_blocking, v->processor).counter));
     list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
                   &per_cpu(vmx_pi_blocking, v->processor).list);
     spin_unlock_irqrestore(pi_blocking_list_lock, flags);
@@ -186,6 +190,8 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
     {
         ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
         list_del(&v->arch.hvm_vmx.pi_blocking.list);
+        atomic_dec(&container_of(pi_blocking_list_lock,
+                                 struct vmx_pi_blocking_vcpu, lock)->counter);
         v->arch.hvm_vmx.pi_blocking.lock = NULL;
     }
 
@@ -234,6 +240,7 @@ void vmx_pi_desc_fixup(unsigned int cpu)
         if ( pi_test_on(&vmx->pi_desc) )
         {
             list_del(&vmx->pi_blocking.list);
+            atomic_dec(&per_cpu(vmx_pi_blocking, cpu).counter);
             vmx->pi_blocking.lock = NULL;
             vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
         }
@@ -2360,13 +2367,15 @@ static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
     struct arch_vmx_struct *vmx, *tmp;
     spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
     struct list_head *blocked_vcpus =
-		&per_cpu(vmx_pi_blocking, smp_processor_id()).list;
+               &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
 
     ack_APIC_irq();
     this_cpu(irq_count)++;
 
     spin_lock(lock);
 
+    TRACE_1D(TRC_HVM_PI_WAKEUP_START,
+        atomic_read(&per_cpu(vmx_pi_blocking, smp_processor_id()).counter));
     /*
      * XXX: The length of the list depends on how many vCPU is current
      * blocked on this specific pCPU. This may hurt the interrupt latency
@@ -2377,11 +2386,13 @@ static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
         if ( pi_test_on(&vmx->pi_desc) )
         {
             list_del(&vmx->pi_blocking.list);
+            atomic_dec(&per_cpu(vmx_pi_blocking, smp_processor_id()).counter);
             ASSERT(vmx->pi_blocking.lock == lock);
             vmx->pi_blocking.lock = NULL;
             vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
         }
     }
+    TRACE_0D(TRC_HVM_PI_WAKEUP_END);
 
     spin_unlock(lock);
 }
diff --git a/xen/include/asm-x86/hvm/trace.h b/xen/include/asm-x86/hvm/trace.h
index de802a6..afe8b75 100644
--- a/xen/include/asm-x86/hvm/trace.h
+++ b/xen/include/asm-x86/hvm/trace.h
@@ -54,6 +54,9 @@
 #define DO_TRC_HVM_TRAP             DEFAULT_HVM_MISC
 #define DO_TRC_HVM_TRAP_DEBUG       DEFAULT_HVM_MISC
 #define DO_TRC_HVM_VLAPIC           DEFAULT_HVM_MISC
+#define DO_TRC_HVM_VT_D_PI_BLOCK    DEFAULT_HVM_MISC
+#define DO_TRC_HVM_PI_WAKEUP_START  DEFAULT_HVM_MISC
+#define DO_TRC_HVM_PI_WAKEUP_END    DEFAULT_HVM_MISC
 
 
 #define TRC_PAR_LONG(par) ((par)&0xFFFFFFFF),((par)>>32)
diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h
index 7f2e891..c5b95ee 100644
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -234,6 +234,9 @@
 #define TRC_HVM_TRAP             (TRC_HVM_HANDLER + 0x23)
 #define TRC_HVM_TRAP_DEBUG       (TRC_HVM_HANDLER + 0x24)
 #define TRC_HVM_VLAPIC           (TRC_HVM_HANDLER + 0x25)
+#define TRC_HVM_VT_D_PI_BLOCK    (TRC_HVM_HANDLER + 0x26)
+#define TRC_HVM_PI_WAKEUP_START  (TRC_HVM_HANDLER + 0x27)
+#define TRC_HVM_PI_WAKEUP_END    (TRC_HVM_HANDLER + 0x28)
 
 #define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
 #define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)
-- 
1.8.3.1
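
Roughly, the traversal times can be extracted from the decoded trace by
pairing each PI_WAKEUP_START with the next PI_WAKEUP_END on the same
CPU and taking the TSC delta, e.g. with something like the sketch
below. It assumes xentrace_format emits lines shaped like the format
strings added above ("CPU<n>  <tsc> (+<reltsc>)  PI_WAKEUP_..."); the
parsing would need adjusting if the decoded output differs.

/*
 * Rough post-processing sketch (not part of the patch): read
 * xentrace_format output on stdin, pair each PI_WAKEUP_START with the
 * next PI_WAKEUP_END on the same CPU, and print the TSC delta of the
 * list traversal.
 */
#include <stdio.h>
#include <string.h>

#define MAX_CPUS 256

int main(void)
{
    char line[512];
    unsigned long long start_tsc[MAX_CPUS] = { 0 };

    while ( fgets(line, sizeof(line), stdin) )
    {
        unsigned int cpu;
        unsigned long long tsc;

        /* Expect lines like: "CPU12  123456789 (+    1000)  PI_WAKEUP_START ..." */
        if ( sscanf(line, "CPU%u %llu", &cpu, &tsc) != 2 || cpu >= MAX_CPUS )
            continue;

        if ( strstr(line, "PI_WAKEUP_START") )
            start_tsc[cpu] = tsc;
        else if ( strstr(line, "PI_WAKEUP_END") && start_tsc[cpu] )
        {
            printf("cpu%u traversal: %llu cycles\n",
                   cpu, tsc - start_tsc[cpu]);
            start_tsc[cpu] = 0;
        }
    }

    return 0;
}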


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
