From: Chao Gao <chao.gao@intel.com>
To: xen-devel@lists.xen.org
Cc: Andrew Cooper <andrew.cooper3@citrix.com>,
	Kevin Tian <kevin.tian@intel.com>,
	Jun Nakajima <jun.nakajima@intel.com>,
	Jan Beulich <jbeulich@suse.com>, Chao Gao <chao.gao@intel.com>
Subject: [PATCH v5 3/4] VT-d PI: restrict the number of vcpus in a given pcpu's PI blocking list
Date: Wed, 16 Aug 2017 13:14:37 +0800	[thread overview]
Message-ID: <1502860478-84512-4-git-send-email-chao.gao@intel.com> (raw)
In-Reply-To: <1502860478-84512-1-git-send-email-chao.gao@intel.com>

Currently, a blocked vCPU is put on its pCPU's PI blocking list. If
too many vCPUs are blocked on a given pCPU, the list can grow very
long. A simple worst-case analysis: with 32k domains and 128 vCPUs
per domain, about 4M vCPUs (32768 * 128 = 4194304) may be blocked on
a single pCPU's PI blocking list. When a wakeup interrupt arrives,
the list is traversed to wake up the vCPUs that have pending events,
and a traversal of that length would consume considerable time.

To mitigate this issue, this patch limits the number of vCPUs tracked
by a given pCPU's blocking list, taking into account factors such as
the performance of the common case, the current HVM vCPU count and
the current pCPU count. With this method, the common case stays fast
and, in extreme cases, the list length is kept under control.

With this patch, when a vCPU is about to block, we check whether the
length of the PI blocking list of the pCPU the vCPU is running on
exceeds a limit, defined as the average number of vCPUs per pCPU plus
a constant. If not, the vCPU is added to this pCPU's PI blocking
list. Otherwise, another online pCPU is chosen to accept the vCPU.
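
As an illustration of the check (a minimal standalone sketch, not the
patch code: the helper name and plain parameters below are stand-ins
for the per-cpu counter read with read_atomic() and for Xen's
num_hvm_vcpus / num_online_cpus()):

    #include <stdbool.h>

    #define PI_LIST_FIXED_LIMIT 128

    /* Stand-in for the patch's pi_over_limit(); illustrative only. */
    static bool pi_list_over_limit(unsigned int list_len,
                                   unsigned int total_hvm_vcpus,
                                   unsigned int online_pcpus)
    {
        /* Compare against the constant first; the division only runs
         * on the unlikely long-list path. */
        if ( list_len <= PI_LIST_FIXED_LIMIT )
            return false;

        return list_len >= total_hvm_vcpus / online_pcpus +
                           PI_LIST_FIXED_LIMIT;
    }

For example, with 4096 HVM vCPUs on 112 pCPUs, a pCPU stops accepting
blocked vCPUs once its list holds 4096 / 112 + 128 = 164 entries.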

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
v5:
 - Introduce a function to choose a suitable pcpu to accept the blocked
 vcpu.
v4:
 - Use a new lock to avoid adding a blocked vcpu to an offline pcpu's
 blocking list.

---
 xen/arch/x86/hvm/vmx/vmx.c | 109 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index bf17988..646f409 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -119,16 +119,85 @@ static void vmx_pi_del_vcpu(struct pi_blocking_vcpu *pbv,
     add_sized(&vpbv->counter, -1);
 }
 
+/*
+ * By default, the local pcpu (i.e. the one the vcpu is currently running
+ * on) is chosen as the destination of the wakeup interrupt. But if the
+ * number of vcpus in the default pcpu's PI blocking list exceeds a limit,
+ * another suitable pcpu is chosen by iterating over all online pcpus.
+ *
+ * Currently, (v_tot/p_tot) + K is chosen as the limit, where v_tot is the
+ * total number of hvm vcpus on the system, p_tot is the total number of
+ * pcpus in the system, and K is a fixed number. An experiment on a Skylake
+ * server with 112 cpus and 64G memory shows that the maximum time to wake
+ * up a vcpu from a 128-entry blocking list is about 22us, which is
+ * tolerable. So 128 is chosen as the fixed number K.
+ *
+ * This policy makes sure that:
+ * 1) for common cases, the limit won't be reached and the local pcpu is
+ * used, which benefits performance (at least, it avoids an IPI when
+ * unblocking a vcpu);
+ * 2) for the worst case, the blocking list length scales with the vcpu
+ * count divided by the pcpu count.
+ */
+#define PI_LIST_FIXED_LIMIT 128
+
+static inline bool pi_over_limit(unsigned int cpu)
+{
+    /* Compare against the constant first to save a division and an add. */
+    if ( likely(read_atomic(&per_cpu(vmx_pi_blocking, cpu).counter) <=
+                PI_LIST_FIXED_LIMIT) )
+        return false;
+
+    return read_atomic(&per_cpu(vmx_pi_blocking, cpu).counter) >=
+           (atomic_read(&num_hvm_vcpus) / num_online_cpus()) +
+           PI_LIST_FIXED_LIMIT;
+}
+
+/*
+ * Start from @cpu and iterate over cpu_online_map, looking for a cpu whose
+ * blocking list length is under the limit. Return with the chosen cpu's
+ * list lock held, to prevent others from adding entries to that cpu.
+ * At least one suitable cpu must exist, since the limit is greater than
+ * the average length of all cpus' blocking lists.
+ */
+static unsigned int pi_get_blocking_cpu(unsigned int cpu, unsigned long *flags)
+{
+    spinlock_t *pi_blocking_list_lock;
+
+    for ( ; ; )
+    {
+        while ( unlikely(pi_over_limit(cpu)) )
+            cpu = cpumask_cycle(cpu, &cpu_online_map);
+
+        pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, cpu).lock;
+        if ( flags )
+            spin_lock_irqsave(pi_blocking_list_lock, *flags);
+        else
+            spin_lock(pi_blocking_list_lock);
+        /*
+         * Check again, in case the list length came to exceed the limit
+         * while the lock was being taken.
+         */
+        if ( !pi_over_limit(cpu) )
+            break;
+        else if ( flags )
+            spin_unlock_irqrestore(pi_blocking_list_lock, *flags);
+        else
+            spin_unlock(pi_blocking_list_lock);
+    }
+
+    return cpu;
+}
+
 static void vmx_vcpu_block(struct vcpu *v)
 {
     unsigned long flags;
-    unsigned int dest;
-    spinlock_t *old_lock;
-    spinlock_t *pi_blocking_list_lock =
-		&per_cpu(vmx_pi_blocking, v->processor).lock;
+    unsigned int dest, pi_cpu;
+    spinlock_t *old_lock, *pi_blocking_list_lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
 
-    spin_lock_irqsave(pi_blocking_list_lock, flags);
+    pi_cpu = pi_get_blocking_cpu(v->processor, &flags);
+    pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, pi_cpu).lock;
     old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
                        pi_blocking_list_lock);
 
@@ -140,7 +209,7 @@ static void vmx_vcpu_block(struct vcpu *v)
     ASSERT(old_lock == NULL);
 
     vmx_pi_add_vcpu(&v->arch.hvm_vmx.pi_blocking,
-                    &per_cpu(vmx_pi_blocking, v->processor));
+                    &per_cpu(vmx_pi_blocking, pi_cpu));
     spin_unlock_irqrestore(pi_blocking_list_lock, flags);
 
     ASSERT(!pi_test_sn(pi_desc));
@@ -149,6 +218,19 @@ static void vmx_vcpu_block(struct vcpu *v)
 
     ASSERT(pi_desc->ndst ==
            (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
+    if ( unlikely(pi_cpu != v->processor) )
+    {
+        /*
+         * The vcpu is put on another pCPU's blocking list. Change the
+         * 'NDST' field to that pCPU to make sure it can wake up the vcpu
+         * when an interrupt arrives. The 'NDST' field will be set back to
+         * the pCPU the vcpu is running on at VM entry; see
+         * vmx_pi_unblock_vcpu().
+         */
+        dest = cpu_physical_id(pi_cpu);
+        write_atomic(&pi_desc->ndst,
+                  x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+    }
 
     write_atomic(&pi_desc->nv, pi_wakeup_vector);
 }
@@ -179,13 +261,17 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
     unsigned long flags;
     spinlock_t *pi_blocking_list_lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    unsigned int dest = cpu_physical_id(v->processor);
 
     /*
-     * Set 'NV' field back to posted_intr_vector, so the
-     * Posted-Interrupts can be delivered to the vCPU when
-     * it is running in non-root mode.
+     * Set the 'NV' field back to posted_intr_vector and the 'NDST' field to
+     * the pCPU the vcpu is running on (as this field may now point to another
+     * pCPU), so that posted interrupts can be delivered to the vCPU when it
+     * is running in non-root mode.
      */
     write_atomic(&pi_desc->nv, posted_intr_vector);
+    write_atomic(&pi_desc->ndst,
+                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
 
     pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
 
@@ -261,17 +347,16 @@ void vmx_pi_desc_fixup(unsigned int cpu)
         }
         else
         {
+            new_cpu = cpumask_cycle(cpu, &cpu_online_map);
             /*
              * We need to find an online cpu as the NDST of the PI descriptor, it
              * doesn't matter whether it is within the cpupool of the domain or
              * not. As long as it is online, the vCPU will be woken up once the
              * notification event arrives.
              */
-            new_cpu = cpumask_any(&cpu_online_map);
+            new_cpu = pi_get_blocking_cpu(new_cpu, NULL);
             new_lock = &per_cpu(vmx_pi_blocking, new_cpu).lock;
 
-            spin_lock(new_lock);
-
             ASSERT(vmx->pi_blocking.lock == old_lock);
 
             dest = cpu_physical_id(new_cpu);
-- 
1.8.3.1

