From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
	"Jan Beulich" <JBeulich@suse.com>,
	"Roger Pau Monné" <roger.pau@citrix.com>, "Wei Liu" <wl@xen.org>
Subject: [PATCH 3/5] x86/pv: Optimise prefetching in svm_load_segs()
Date: Wed, 9 Sep 2020 10:59:18 +0100	[thread overview]
Message-ID: <20200909095920.25495-4-andrew.cooper3@citrix.com> (raw)
In-Reply-To: <20200909095920.25495-1-andrew.cooper3@citrix.com>

Split the prefetch-only case out of svm_load_segs() into a separate function.
Passing a load of zeros in just to request a prefetch results in somewhat poor
register scheduling in __context_switch().

Update the prefetching comment to note that the main point is the TLB fill.

Reorder the writes in svm_load_segs() to access the VMCB fields in ascending
order, which gets better next-line prefetch behaviour out of hardware.  Update
the prefetch instruction to match.

The net delta is:

  add/remove: 1/0 grow/shrink: 0/2 up/down: 38/-39 (-1)
  Function                                     old     new   delta
  svm_load_segs_prefetch                         -      38     +38
  __context_switch                             967     951     -16
  svm_load_segs                                291     268     -23
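
As an illustration of the pattern described above, here is a minimal
standalone sketch (hypothetical struct and function names, not the Xen code
itself): prefetch the lowest address that will be written, so the TLB entry
and the first cache line are pulled in early, then perform the stores in
ascending address order.

  #include <stdint.h>

  /* Hypothetical stand-in for the VMCB layout; field names are illustrative. */
  struct seg { uint16_t sel, attr; uint32_t limit; uint64_t base; };
  struct vmcb_like { struct seg fs, gs, ldtr; uint64_t kerngsbase; };

  static inline void prefetch_for_write(void *addr)
  {
      __builtin_prefetch(addr, 1, 3);     /* write intent, high locality */
  }

  void segs_prefetch(struct vmcb_like *vmcb)
  {
      /* Lowest address used below; the main benefit is the TLB fill. */
      prefetch_for_write(&vmcb->fs);
  }

  void segs_load(struct vmcb_like *vmcb, uint64_t fs_base, uint64_t gs_base)
  {
      /* Stores in ascending address order, matching the prefetch above. */
      vmcb->fs.base = fs_base;
      vmcb->gs.base = gs_base;
      vmcb->kerngsbase = 0;
  }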

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
 xen/arch/x86/domain.c             |  2 +-
 xen/arch/x86/hvm/svm/svm.c        | 43 ++++++++++++++++++++-------------------
 xen/include/asm-x86/hvm/svm/svm.h |  5 +++--
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 2271bee36a..0b0e3f8294 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1928,7 +1928,7 @@ static void __context_switch(void)
     /* Prefetch the VMCB if we expect to use it later in the context switch */
     if ( cpu_has_svm && is_pv_domain(nd) && !is_pv_32bit_domain(nd) &&
          !is_idle_domain(nd) )
-        svm_load_segs(0, 0, 0, 0, 0);
+        svm_load_segs_prefetch();
 #endif
 
     if ( need_full_gdt(nd) && !per_cpu(full_gdt_loaded, cpu) )
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 23b2a2aa17..9a2aca7770 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1520,6 +1520,19 @@ static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
 }
 
 #ifdef CONFIG_PV
+void svm_load_segs_prefetch(void)
+{
+    struct vmcb_struct *vmcb = this_cpu(host_vmcb_va);
+
+    if ( vmcb )
+        /*
+         * The main reason for this prefetch is the TLB fill.  Use the
+         * opportunity to fetch the lowest address used, to get the best
+         * behaviour out of hardware's next-line prefetcher.
+         */
+        prefetchw(&vmcb->fs);
+}
+
 bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                    unsigned long fs_base, unsigned long gs_base,
                    unsigned long gs_shadow)
@@ -1530,17 +1543,15 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
     if ( unlikely(!vmcb) )
         return false;
 
-    if ( !ldt_base )
-    {
-        /*
-         * The actual structure field used here was arbitrarily chosen.
-         * Empirically it doesn't seem to matter much which element is used,
-         * and a clear explanation of the otherwise poor performance has not
-         * been found/provided so far.
-         */
-        prefetchw(&vmcb->ldtr);
-        return true;
-    }
+    vmcb->fs.sel = 0;
+    vmcb->fs.attr = 0;
+    vmcb->fs.limit = 0;
+    vmcb->fs.base = fs_base;
+
+    vmcb->gs.sel = 0;
+    vmcb->gs.attr = 0;
+    vmcb->gs.limit = 0;
+    vmcb->gs.base = gs_base;
 
     if ( likely(!ldt_ents) )
         memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
@@ -1558,16 +1569,6 @@ bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
         vmcb->ldtr.base = ldt_base;
     }
 
-    vmcb->fs.sel = 0;
-    vmcb->fs.attr = 0;
-    vmcb->fs.limit = 0;
-    vmcb->fs.base = fs_base;
-
-    vmcb->gs.sel = 0;
-    vmcb->gs.attr = 0;
-    vmcb->gs.limit = 0;
-    vmcb->gs.base = gs_base;
-
     vmcb->kerngsbase = gs_shadow;
 
     svm_vmload_pa(per_cpu(host_vmcb, cpu));
diff --git a/xen/include/asm-x86/hvm/svm/svm.h b/xen/include/asm-x86/hvm/svm/svm.h
index 2310878e41..faeca40174 100644
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -50,12 +50,13 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
 /*
- * PV context switch helper. Calls with zero ldt_base request a prefetch of
- * the VMCB area to be loaded from, instead of an actual load of state.
+ * PV context switch helpers.  Prefetching the VMCB area itself has been shown
+ * to be useful for performance.
  *
  * Must only be used for NUL FS/GS, as the segment attributes/limits are not
  * read from the GDT/LDT.
  */
+void svm_load_segs_prefetch(void);
 bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                    unsigned long fs_base, unsigned long gs_base,
                    unsigned long gs_shadow);
-- 
2.11.0
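
Editorial usage note: a simplified, hypothetical outline of the intended
calling pattern (not the actual Xen control flow), assuming only the
declarations added to svm.h by this patch.  The cheap prefetch is issued
early in __context_switch(), and the real VMLOAD-based load happens later in
the PV segment switching path once the incoming vCPU's bases are known.

  #include <stdbool.h>

  /* Declarations as added to xen/include/asm-x86/hvm/svm/svm.h by this patch. */
  void svm_load_segs_prefetch(void);
  bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
                     unsigned long fs_base, unsigned long gs_base,
                     unsigned long gs_shadow);

  static void context_switch_sketch(void)
  {
      /* Early: warm the TLB/cache covering the host VMCB segment fields. */
      svm_load_segs_prefetch();

      /* ... other context switch work (GDT, CR3, ...) ... */

      /* Later, with the incoming vCPU's state to hand (values illustrative). */
      if ( !svm_load_segs(0, 0, 0x7f0000000000UL, 0x7f8000000000UL, 0) )
      {
          /* Fall back to the slower MSR/descriptor based path. */
      }
  }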



Thread overview: 18+ messages
2020-09-09  9:59 [PATCH 0/5] x86/pv: Minor perf improvements in segment handling Andrew Cooper
2020-09-09  9:59 ` [PATCH 1/5] x86/asm: Rename FS/GS base helpers Andrew Cooper
2020-09-10 14:45   ` Jan Beulich
2020-09-15  2:50   ` Tian, Kevin
2020-09-09  9:59 ` [PATCH 2/5] x86/asm: Split __wr{fs, gs}base() out of write_{fs, gs}_base() Andrew Cooper
2020-09-10 14:47   ` [PATCH 2/5] x86/asm: Split __wr{fs,gs}base() out of write_{fs,gs}_base() Jan Beulich
2020-09-14 14:34     ` Andrew Cooper
2020-09-09  9:59 ` Andrew Cooper [this message]
2020-09-10 14:57   ` [PATCH 3/5] x86/pv: Optimise prefetching in svm_load_segs() Jan Beulich
2020-09-10 20:30     ` Andrew Cooper
2020-09-11  6:31       ` Jan Beulich
2020-09-09  9:59 ` [PATCH 4/5] x86/pv: Optimise to the segment context switching paths Andrew Cooper
2020-09-11  9:49   ` Jan Beulich
2020-09-11 12:53     ` Andrew Cooper
2020-09-11 13:28       ` Jan Beulich
2020-09-09  9:59 ` [PATCH 5/5] x86/pv: Simplify emulation for the 64bit base MSRs Andrew Cooper
2020-09-11 10:01   ` Jan Beulich
2020-09-11 16:10     ` Andrew Cooper
