From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
	"Jan Beulich" <JBeulich@suse.com>,
	"Roger Pau Monné" <roger.pau@citrix.com>, "Wei Liu" <wl@xen.org>
Subject: [PATCH 3/5] x86/pv: Optimise prefetching in svm_load_segs()
Date: Wed, 9 Sep 2020 10:59:18 +0100	[thread overview]
Message-ID: <20200909095920.25495-4-andrew.cooper3@citrix.com> (raw)
In-Reply-To: <20200909095920.25495-1-andrew.cooper3@citrix.com>

Split the prefetch-only case out of svm_load_segs() into a separate function.
Passing a load of zeros in just to request a prefetch results in somewhat poor
register scheduling in __context_switch().

Update the prefetching comment to note that the main point is the TLB fill.

Reorder the writes in svm_load_segs() to access the VMCB fields in ascending
order, which gets better next-line prefetch behaviour out of hardware.  Update
the prefetch instruction to match.

The net delta is:

  add/remove: 1/0 grow/shrink: 0/2 up/down: 38/-39 (-1)
  Function                                     old     new   delta
  svm_load_segs_prefetch                         -      38     +38
  __context_switch                             967     951     -16
  svm_load_segs                                291     268     -23
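
As an illustration of the pattern described above, here is a minimal
standalone sketch (hypothetical struct and function names, not the Xen code
itself): prefetch the lowest address that will be written, so the TLB entry
and the first cache line are pulled in early, then perform the stores in
ascending address order.

  #include <stdint.h>

  /* Hypothetical stand-in for the VMCB layout; field names are illustrative. */
  struct seg { uint16_t sel, attr; uint32_t limit; uint64_t base; };
  struct vmcb_like { struct seg fs, gs, ldtr; uint64_t kerngsbase; };

  static inline void prefetch_for_write(void *addr)
  {
      __builtin_prefetch(addr, 1, 3);     /* write intent, high locality */
  }

  void segs_prefetch(struct vmcb_like *vmcb)
  {
      /* Lowest address used below; the main benefit is the TLB fill. */
      prefetch_for_write(&vmcb->fs);
  }

  void segs_load(struct vmcb_like *vmcb, uint64_t fs_base, uint64_t gs_base)
  {
      /* Stores in ascending address order, matching the prefetch above. */
      vmcb->fs.base = fs_base;
      vmcb->gs.base = gs_base;
      vmcb->kerngsbase = 0;
  }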

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 xen/arch/x86/domain.c             |  2 +-
 xen/arch/x86/hvm/svm/svm.c        | 43 ++++++++++++++++++++-------------------
 xen/include/asm-x86/hvm/svm/svm.h |  5 +++--
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 2271bee36a..0b0e3f8294 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1928,7 +1928,7 @@ static void __context_switch(void)
     /* Prefetch the VMCB if we expect to use it later in the context switch */
     if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
          !is_idle_domain(nd) )
-        svm_load_segs(0, 0, 0, 0, 0);
+        svm_load_segs_prefetch();
 #endif
 
     if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 23b2a2aa17..9a2aca7770 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1520,6 +1520,19 @@ static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
 }
 
 #ifdef CONFIG_PV
+void svm_load_segs_prefetch(void)
+{
+    struct vmcb_struct *vmcb = this_cpu(host_vmcb_va);
+
+    if ( vmcb )
+        /*
+         * The main reason for this prefetch is the TLB fill.  Use the
+         * opportunity to fetch the lowest address used, to get the best
+         * behaviour out of hardware's next-line prefetcher.
+         */
+        prefetchw(&vmcb->fs);
+}
+
 bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                    unsigned long fs_base, unsigned long gs_base,
                    unsigned long gs_shadow)
@@ -1530,17 +1543,15 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
     if ( unlikely(!vmcb) )
         return false;
 
-    if ( !ldt_base )
-    {
-        /*
-         * The actual structure field used here was arbitrarily chosen.
-         * Empirically it doesn't seem to matter much which element is used,
-         * and a clear explanation of the otherwise poor performance has not
-         * been found/provided so far.
-         */
-        prefetchw(&vmcb->ldtr);
-        return true;
-    }
+    vmcb->fs.sel = 0;
+    vmcb->fs.attr = 0;
+    vmcb->fs.limit = 0;
+    vmcb->fs.base = fs_base;
+
+    vmcb->gs.sel = 0;
+    vmcb->gs.attr = 0;
+    vmcb->gs.limit = 0;
+    vmcb->gs.base = gs_base;
 
     if ( likely(!ldt_ents) )
         memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
@@ -1558,16 +1569,6 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
         vmcb->ldtr.base = ldt_base;
     }
 
-    vmcb->fs.sel = 0;
-    vmcb->fs.attr = 0;
-    vmcb->fs.limit = 0;
-    vmcb->fs.base = fs_base;
-
-    vmcb->gs.sel = 0;
-    vmcb->gs.attr = 0;
-    vmcb->gs.limit = 0;
-    vmcb->gs.base = gs_base;
-
     vmcb->kerngsbase = gs_shadow;
 
     svm_vmload_pa(per_cpu(host_vmcb, cpu));
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 2310878e41..faeca40174 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -50,12 +50,13 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
 /*
- * PV context switch helper. Calls with zero ldt_base request a prefetch of
- * the VMCB area to be loaded from, instead of an actual load of state.
+ * PV context switch helpers.  Prefetching the VMCB area itself has been shown
+ * to be useful for performance.
  *
  * Must only be used for NUL FS/GS, as the segment attributes/limits are not
  * read from the GDT/LDT.
  */
+void svm_load_segs_prefetch(void);
 bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                    unsigned long fs_base, unsigned long gs_base,
                    unsigned long gs_shadow);
-- 
2.11.0
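
Editorial usage note: a simplified, hypothetical outline of the intended
calling pattern (not the actual Xen control flow), assuming only the
declarations added to svm.h by this patch.  The cheap prefetch is issued
early in __context_switch(), and the real VMLOAD-based load happens later in
the PV segment switching path once the incoming vCPU's bases are known.

  #include <stdbool.h>

  /* Declarations as added to xen/include/asm-x86/hvm/svm/svm.h by this patch. */
  void svm_load_segs_prefetch(void);
  bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                     unsigned long fs_base, unsigned long gs_base,
                     unsigned long gs_shadow);

  static void context_switch_sketch(void)
  {
      /* Early: warm the TLB/cache covering the host VMCB segment fields. */
      svm_load_segs_prefetch();

      /* ... other context switch work (GDT, CR3, ...) ... */

      /* Later, with the incoming vCPU's state to hand (values illustrative). */
      if ( !svm_load_segs(0, 0, 0x7f0000000000UL, 0x7f8000000000UL, 0) )
      {
          /* Fall back to the slower MSR/descriptor based path. */
      }
  }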



Thread overview: 18+ messages
2020-09-09  9:59 [PATCH 0/5] x86/pv: Minor perf improvements in segment handling Andrew Cooper
2020-09-09  9:59 ` [PATCH 1/5] x86/asm: Rename FS/GS base helpers Andrew Cooper
2020-09-10 14:45   ` Jan Beulich
2020-09-15  2:50   ` Tian, Kevin
2020-09-09  9:59 ` [PATCH 2/5] x86/asm: Split __wr{fs, gs}base() out of write_{fs, gs}_base() Andrew Cooper
2020-09-10 14:47   ` [PATCH 2/5] x86/asm: Split __wr{fs,gs}base() out of write_{fs,gs}_base() Jan Beulich
2020-09-14 14:34     ` Andrew Cooper
2020-09-09  9:59 ` Andrew Cooper [this message]
2020-09-10 14:57   ` [PATCH 3/5] x86/pv: Optimise prefetching in svm_load_segs() Jan Beulich
2020-09-10 20:30     ` Andrew Cooper
2020-09-11  6:31       ` Jan Beulich
2020-09-09  9:59 ` [PATCH 4/5] x86/pv: Optimise to the segment context switching paths Andrew Cooper
2020-09-11  9:49   ` Jan Beulich
2020-09-11 12:53     ` Andrew Cooper
2020-09-11 13:28       ` Jan Beulich
2020-09-09  9:59 ` [PATCH 5/5] x86/pv: Simplify emulation for the 64bit base MSRs Andrew Cooper
2020-09-11 10:01   ` Jan Beulich
2020-09-11 16:10     ` Andrew Cooper
