* [PATCH 0/2] x86: further GS base handling changes
@ 2017-12-06 16:32 Jan Beulich
  2017-12-06 16:37 ` [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses Jan Beulich
  2017-12-06 16:38 ` [PATCH 2/2] x86: rename DIRTY_GS_BASE_USER Jan Beulich
  0 siblings, 2 replies; 9+ messages in thread
From: Jan Beulich @ 2017-12-06 16:32 UTC
  To: xen-devel; +Cc: Andrew Cooper

1: improve MSR_SHADOW_GS accesses
2: rename DIRTY_GS_BASE_USER

Signed-off-by: Jan Beulich <jbeulich@suse.com>



* [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2017-12-06 16:32 [PATCH 0/2] x86: further GS base handling changes Jan Beulich
@ 2017-12-06 16:37 ` Jan Beulich
  2017-12-06 19:34   ` Andrew Cooper
  2018-02-28 14:46   ` Ping: " Jan Beulich
  2017-12-06 16:38 ` [PATCH 2/2] x86: rename DIRTY_GS_BASE_USER Jan Beulich
  1 sibling, 2 replies; 9+ messages in thread
From: Jan Beulich @ 2017-12-06 16:37 UTC
  To: xen-devel; +Cc: Andrew Cooper

Instead of using RDMSR/WRMSR, on fsgsbase-capable systems use a double
SWAPGS combined with RDGSBASE/WRGSBASE. This halves execution time for
a shadow GS update alone on my Haswell (and we have indications of
good performance improvements on Skylake too), while the win is even
bigger when updating more than one base at a time (as may, and
commonly will, happen in load_segments()).
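
For reference, a sketch of the underlying idea: SWAPGS exchanges the
active GS base with MSR_SHADOW_GS_BASE, so bracketing RDGSBASE/WRGSBASE
in a pair of SWAPGS instructions makes them operate on the shadow value:

    swapgs            /* GS base <-> MSR_SHADOW_GS_BASE */
    rdgsbase %rax     /* reads what was the shadow GS base */
    swapgs            /* restore the original arrangement */

(and analogously with wrgsbase for the write side).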

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1326,9 +1326,12 @@ static void load_segments(struct vcpu *n
         if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
             wrfsbase(n->arch.pv_vcpu.fs_base);
 
-        /* Most kernels have non-zero GS base, so don't bother testing. */
-        /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
-        wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel);
+        /*
+         * Most kernels have non-zero GS base, so don't bother testing.
+         * (For old AMD hardware this is also a serialising instruction,
+         * avoiding erratum #88.)
+         */
+        wrgsshadow(n->arch.pv_vcpu.gs_base_kernel);
 
         /* This can only be non-zero if selector is NULL. */
         if ( n->arch.pv_vcpu.gs_base_user |
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -501,7 +501,7 @@ long_mode_do_msr_read(unsigned int msr,
         break;
 
     case MSR_SHADOW_GS_BASE:
-        rdmsrl(MSR_SHADOW_GS_BASE, *msr_content);
+        *msr_content = rdgsshadow();
         break;
 
     case MSR_STAR:
@@ -549,7 +549,7 @@ long_mode_do_msr_write(unsigned int msr,
         else if ( msr == MSR_GS_BASE )
             __vmwrite(GUEST_GS_BASE, msr_content);
         else
-            wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
+            wrgsshadow(msr_content);
 
         break;
 
@@ -608,12 +608,12 @@ static void vmx_save_guest_msrs(struct v
      * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
      * be updated at any time via SWAPGS, which we cannot trap.
      */
-    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
+    v->arch.hvm_vmx.shadow_gs = rdgsshadow();
 }
 
 static void vmx_restore_guest_msrs(struct vcpu *v)
 {
-    wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
+    wrgsshadow(v->arch.hvm_vmx.shadow_gs);
     wrmsrl(MSR_STAR,           v->arch.hvm_vmx.star);
     wrmsrl(MSR_LSTAR,          v->arch.hvm_vmx.lstar);
     wrmsrl(MSR_SYSCALL_MASK,   v->arch.hvm_vmx.sfmask);
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -1027,7 +1027,7 @@ static int write_msr(unsigned int reg, u
     case MSR_SHADOW_GS_BASE:
         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
             break;
-        wrmsrl(MSR_SHADOW_GS_BASE, val);
+        wrgsshadow(val);
         curr->arch.pv_vcpu.gs_base_user = val;
         return X86EMUL_OKAY;
 
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1034,7 +1034,7 @@ long do_set_segment_base(unsigned int wh
     case SEGBASE_GS_USER:
         if ( is_canonical_address(base) )
         {
-            wrmsrl(MSR_SHADOW_GS_BASE, base);
+            wrgsshadow(base);
             v->arch.pv_vcpu.gs_base_user = base;
         }
         else
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -49,7 +49,7 @@ static void read_registers(struct cpu_us
     regs->gs = read_sreg(gs);
     crs[5] = rdfsbase();
     crs[6] = rdgsbase();
-    rdmsrl(MSR_SHADOW_GS_BASE, crs[7]);
+    crs[7] = rdgsshadow();
 }
 
 static void _show_registers(
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -8,6 +8,7 @@
 #include <xen/types.h>
 #include <xen/percpu.h>
 #include <xen/errno.h>
+#include <asm/alternative.h>
 #include <asm/asm_defns.h>
 #include <asm/cpufeature.h>
 
@@ -172,6 +173,24 @@ static inline unsigned long rdgsbase(voi
     return base;
 }
 
+static inline unsigned long rdgsshadow(void)
+{
+    unsigned long base;
+
+    alternative_io("mov %[msr], %%ecx\n\t"
+                   "rdmsr\n\t"
+                   "shl $32, %%rdx\n\t"
+                   "or %%rdx, %[res]",
+                   "swapgs\n\t"
+                   ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t" /* rdgsbase rax */
+                   "swapgs",
+                   X86_FEATURE_FSGSBASE,
+                   [res] "=&a" (base),
+                   [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
+
+    return base;
+}
+
 static inline void wrfsbase(unsigned long base)
 {
     if ( cpu_has_fsgsbase )
@@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
         wrmsrl(MSR_GS_BASE, base);
 }
 
+static inline void wrgsshadow(unsigned long base)
+{
+    alternative_input("mov %[msr], %%ecx\n\t"
+                      "shld $32, %[val], %%rdx\n\t"
+                      "wrmsr",
+                      "swapgs\n\t"
+                      ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8\n\t" /* wrgsbase rax */
+                      "swapgs",
+                      X86_FEATURE_FSGSBASE,
+                      [val] "a" (base),
+                      [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
+}
+
 DECLARE_PER_CPU(u64, efer);
 u64 read_efer(void);
 void write_efer(u64 val);




* [PATCH 2/2] x86: rename DIRTY_GS_BASE_USER
  2017-12-06 16:32 [PATCH 0/2] x86: further GS base handling changes Jan Beulich
  2017-12-06 16:37 ` [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses Jan Beulich
@ 2017-12-06 16:38 ` Jan Beulich
  2017-12-06 19:34   ` Andrew Cooper
  1 sibling, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2017-12-06 16:38 UTC
  To: xen-devel; +Cc: Andrew Cooper

As of commit 91f85280b9 ("x86: fix GS-base-dirty determination") the
USER part of the name isn't really appropriate anymore.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1276,7 +1276,7 @@ static DEFINE_PER_CPU(unsigned int, dirt
 #define DIRTY_FS           0x04
 #define DIRTY_GS           0x08
 #define DIRTY_FS_BASE      0x10
-#define DIRTY_GS_BASE_USER 0x20
+#define DIRTY_GS_BASE      0x20
 
 static void load_segments(struct vcpu *n)
 {
@@ -1317,7 +1317,7 @@ static void load_segments(struct vcpu *n
         all_segs_okay &= loadsegment(gs, uregs->gs);
         /* non-nul selector updates gs_base_user */
         if ( uregs->gs & ~3 )
-            dirty_segment_mask &= ~DIRTY_GS_BASE_USER;
+            dirty_segment_mask &= ~DIRTY_GS_BASE;
     }
 
     if ( !is_pv_32bit_vcpu(n) )
@@ -1335,7 +1335,7 @@ static void load_segments(struct vcpu *n
 
         /* This can only be non-zero if selector is NULL. */
         if ( n->arch.pv_vcpu.gs_base_user |
-             (dirty_segment_mask & DIRTY_GS_BASE_USER) )
+             (dirty_segment_mask & DIRTY_GS_BASE) )
             wrgsbase(n->arch.pv_vcpu.gs_base_user);
 
         /* If in kernel mode then switch the GS bases around. */
@@ -1486,7 +1486,7 @@ static void save_segments(struct vcpu *v
     }
     if ( v->arch.flags & TF_kernel_mode ? v->arch.pv_vcpu.gs_base_kernel
                                         : v->arch.pv_vcpu.gs_base_user )
-        dirty_segment_mask |= DIRTY_GS_BASE_USER;
+        dirty_segment_mask |= DIRTY_GS_BASE;
 
     this_cpu(dirty_segment_mask) = dirty_segment_mask;
 }





* Re: [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2017-12-06 16:37 ` [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses Jan Beulich
@ 2017-12-06 19:34   ` Andrew Cooper
  2017-12-07 10:45     ` Jan Beulich
  2018-02-28 14:46   ` Ping: " Jan Beulich
  1 sibling, 1 reply; 9+ messages in thread
From: Andrew Cooper @ 2017-12-06 19:34 UTC
  To: Jan Beulich, xen-devel

On 06/12/17 16:37, Jan Beulich wrote:
> --- a/xen/include/asm-x86/msr.h
> +++ b/xen/include/asm-x86/msr.h
> @@ -8,6 +8,7 @@
>  #include <xen/types.h>
>  #include <xen/percpu.h>
>  #include <xen/errno.h>
> +#include <asm/alternative.h>
>  #include <asm/asm_defns.h>
>  #include <asm/cpufeature.h>
>  
> @@ -172,6 +173,24 @@ static inline unsigned long rdgsbase(voi
>      return base;
>  }
>  
> +static inline unsigned long rdgsshadow(void)
> +{
> +    unsigned long base;
> +
> +    alternative_io("mov %[msr], %%ecx\n\t"
> +                   "rdmsr\n\t"
> +                   "shl $32, %%rdx\n\t"
> +                   "or %%rdx, %[res]",

There needs to be some clearer distinction between the two
alternatives.  It took a while for me to spot this comma.

> +                   "swapgs\n\t"
> +                   ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t" /* rdgsbase rax */
> +                   "swapgs",
> +                   X86_FEATURE_FSGSBASE,
> +                   [res] "=&a" (base),
> +                   [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");

The MSR should be a "c" constraint, with the clobber dropped.  That
will result in better code at most of the call sites, by avoiding a
reload of %ecx and merging of the constant with the other MSR accesses.
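
(Presumably something along these lines, as an untested sketch:)

    alternative_io("rdmsr\n\t"
                   "shl $32, %%rdx\n\t"
                   "or %%rdx, %[res]",
                   "swapgs\n\t"
                   ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t" /* rdgsbase rax */
                   "swapgs",
                   X86_FEATURE_FSGSBASE,
                   [res] "=&a" (base),
                   "c" (MSR_SHADOW_GS_BASE) : "rdx");

This leaves the compiler free to materialise the MSR index in %ecx in
whatever way suits the surrounding code.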

I'm not entirely sure the alternative is justified here.  For
consistency alone, these helpers should match their companions, and in
the unlikely case that the runtime feature test does make a measurable
difference, wouldn't a static key be a better option anyway?

> +
> +    return base;
> +}
> +
>  static inline void wrfsbase(unsigned long base)
>  {
>      if ( cpu_has_fsgsbase )
> @@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
>          wrmsrl(MSR_GS_BASE, base);
>  }
>  
> +static inline void wrgsshadow(unsigned long base)
> +{
> +    alternative_input("mov %[msr], %%ecx\n\t"
> +                      "shld $32, %[val], %%rdx\n\t"

This is a vector-path instruction, specifically called out to be
avoided in the AMD optimisation guide.  On all hardware (according to
Agner's latency measurements) it alone has a longer latency to execute
than the mov/shl pair you've replaced, and that is before accounting for
mov elimination.
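
(I.e. a plain register-to-register move plus shift, e.g. the following
sketch, using shr since the write path wants the high half in %edx:)

    "mov %[val], %%rdx\n\t"  /* candidate for mov elimination */
    "shr $32, %%rdx\n\t"     /* high half into %edx for wrmsr */
    "wrmsr"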

~Andrew

> +                      "wrmsr",
> +                      "swapgs\n\t"
> +                      ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8\n\t" /* wrgsbase rax */
> +                      "swapgs",
> +                      X86_FEATURE_FSGSBASE,
> +                      [val] "a" (base),
> +                      [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
> +}
> +
>  DECLARE_PER_CPU(u64, efer);
>  u64 read_efer(void);
>  void write_efer(u64 val);

* Re: [PATCH 2/2] x86: rename DIRTY_GS_BASE_USER
  2017-12-06 16:38 ` [PATCH 2/2] x86: rename DIRTY_GS_BASE_USER Jan Beulich
@ 2017-12-06 19:34   ` Andrew Cooper
  0 siblings, 0 replies; 9+ messages in thread
From: Andrew Cooper @ 2017-12-06 19:34 UTC
  To: Jan Beulich, xen-devel

On 06/12/17 16:38, Jan Beulich wrote:
> As of commit 91f85280b9 ("x86: fix GS-base-dirty determination") the
> USER part of the name isn't really appropriate anymore.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2017-12-06 19:34   ` Andrew Cooper
@ 2017-12-07 10:45     ` Jan Beulich
  2018-03-01 15:47       ` Andrew Cooper
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2017-12-07 10:45 UTC
  To: Andrew Cooper; +Cc: xen-devel

>>> On 06.12.17 at 20:34, <andrew.cooper3@citrix.com> wrote:
> On 06/12/17 16:37, Jan Beulich wrote:
>> @@ -172,6 +173,24 @@ static inline unsigned long rdgsbase(voi
>>      return base;
>>  }
>>  
>> +static inline unsigned long rdgsshadow(void)
>> +{
>> +    unsigned long base;
>> +
>> +    alternative_io("mov %[msr], %%ecx\n\t"
>> +                   "rdmsr\n\t"
>> +                   "shl $32, %%rdx\n\t"
>> +                   "or %%rdx, %[res]",
> 
> There needs to be some clearer distinction between the two
> alternatives.  It took a while for me to spot this comma.

Any suggestion? I've been noticing the issue of the split being
hard to spot in other places as well, so I'd like to do this in a
generally applicable and sufficiently uniform way.

>> +                   "swapgs\n\t"
>> +                   ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t" /* rdgsbase rax */
>> +                   "swapgs",
>> +                   X86_FEATURE_FSGSBASE,
>> +                   [res] "=&a" (base),
>> +                   [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
> 
> The MSR should be a "c" constraint, with the clobber dropped.  That
> will result in better code at most of the call sites, by avoiding a
> reload of %ecx and merging of the constant with the other MSR accesses.

That's one view to take. The other is that the alternative doesn't
require this input register at all. Furthermore, if my counting is
correct, I'd need to add NOPs to the original insn sequence in
order to fit the replacement one. The goal, however, is to keep
the two as similar in size as possible.
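
(For concreteness, a rough byte count, assuming the standard encodings:

    mov $imm32, %ecx   5 bytes      swapgs           3 bytes
    rdmsr              2 bytes      rdgsbase %rax    5 bytes
    shl $32, %rdx      4 bytes      swapgs           3 bytes
    or %rdx, %rax      3 bytes
    total             14 bytes      total           11 bytes

i.e. as posted the 11-byte replacement fits into the original with 3
bytes of padding, whereas dropping the mov shrinks the original to 9
bytes, so the fast path would need 2 padding bytes just to hold the
replacement.)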

I also don't buy the "avoid reload of ecx" argument: afaics there
are no other uses of MSR_SHADOW_GS_BASE without other MSR accesses
in between.

> I'm not entirely sure the alternative is justified here.  For
> consistency alone, these helpers should match their companions, and in
> the unlikely case that the runtime feature test does make a measurable
> difference, wouldn't a static key be a better option anyway?

Static key? The main reason I dislike making {rd,wr}{fs,gs}base
use alternatives is that the original code would be much larger
than the replacement code. IOW, of the options to make things
consistent, I'd prefer using alternatives for the other inline
functions too. But I think the choice here should be whatever fits
best.
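
(To illustrate, a sketch of what such a conversion might look like for
wrfsbase(), modeled on wrgsshadow() above; the opcode bytes given for
wrfsbase are a reconstruction, not taken from the patch:

static inline void wrfsbase(unsigned long base)
{
    alternative_input("mov %[msr], %%ecx\n\t"
                      "mov %[val], %%rdx\n\t"
                      "shr $32, %%rdx\n\t"
                      "wrmsr",
                      ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd0", /* wrfsbase rax */
                      X86_FEATURE_FSGSBASE,
                      [val] "a" (base),
                      [msr] "i" (MSR_FS_BASE) : "rcx", "rdx");
}

Here a 14-byte original gets patched down to a 5-byte wrfsbase plus 9
bytes of NOPs, which is exactly the size disparity at issue.)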

>> @@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
>>          wrmsrl(MSR_GS_BASE, base);
>>  }
>>  
>> +static inline void wrgsshadow(unsigned long base)
>> +{
>> +    alternative_input("mov %[msr], %%ecx\n\t"
>> +                      "shld $32, %[val], %%rdx\n\t"
> 
> This is a vector-path instruction, specifically called out to be
> avoided in the AMD optimisation guide.  On all hardware (according to
> Agner's latency measurements) it alone has a longer latency to execute
> than the mov/shl pair you've replaced, and that is before accounting for
> mov elimination.

For one, I doubt the latency of the SHLD will be noticeable next to
the latency of the following WRMSR. And then I think the main goal
should be to have optimal performance on modern CPUs. That means
size-optimizing the original code, to reduce the amount of NOP
padding needed when the alternative is being installed.

Jan



* Ping: [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2017-12-06 16:37 ` [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses Jan Beulich
  2017-12-06 19:34   ` Andrew Cooper
@ 2018-02-28 14:46   ` Jan Beulich
  1 sibling, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2018-02-28 14:46 UTC
  To: Andrew Cooper; +Cc: xen-devel

>>> On 06.12.17 at 17:37,  wrote:
> Instead of using RDMSR/WRMSR, on fsgsbase-capable systems use a double
> SWAPGS combined with RDGSBASE/WRGSBASE. This halves execution time for
> a shadow GS update alone on my Haswell (and we have indications of
> good performance improvements on Skylake too), while the win is even
> bigger when updating more than one base at a time (as may, and
> commonly will, happen in load_segments()).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

The previous (short) discussion has stalled, hence the ping.

Jan

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1326,9 +1326,12 @@ static void load_segments(struct vcpu *n
>          if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
>              wrfsbase(n->arch.pv_vcpu.fs_base);
>  
> -        /* Most kernels have non-zero GS base, so don't bother testing. */
> -        /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
> -        wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel);
> +        /*
> +         * Most kernels have non-zero GS base, so don't bother testing.
> +         * (For old AMD hardware this is also a serialising instruction,
> +         * avoiding erratum #88.)
> +         */
> +        wrgsshadow(n->arch.pv_vcpu.gs_base_kernel);
>  
>          /* This can only be non-zero if selector is NULL. */
>          if ( n->arch.pv_vcpu.gs_base_user |
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -501,7 +501,7 @@ long_mode_do_msr_read(unsigned int msr,
>          break;
>  
>      case MSR_SHADOW_GS_BASE:
> -        rdmsrl(MSR_SHADOW_GS_BASE, *msr_content);
> +        *msr_content = rdgsshadow();
>          break;
>  
>      case MSR_STAR:
> @@ -549,7 +549,7 @@ long_mode_do_msr_write(unsigned int msr,
>          else if ( msr == MSR_GS_BASE )
>              __vmwrite(GUEST_GS_BASE, msr_content);
>          else
> -            wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
> +            wrgsshadow(msr_content);
>  
>          break;
>  
> @@ -608,12 +608,12 @@ static void vmx_save_guest_msrs(struct v
>       * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
>       * be updated at any time via SWAPGS, which we cannot trap.
>       */
> -    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
> +    v->arch.hvm_vmx.shadow_gs = rdgsshadow();
>  }
>  
>  static void vmx_restore_guest_msrs(struct vcpu *v)
>  {
> -    wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs);
> +    wrgsshadow(v->arch.hvm_vmx.shadow_gs);
>      wrmsrl(MSR_STAR,           v->arch.hvm_vmx.star);
>      wrmsrl(MSR_LSTAR,          v->arch.hvm_vmx.lstar);
>      wrmsrl(MSR_SYSCALL_MASK,   v->arch.hvm_vmx.sfmask);
> --- a/xen/arch/x86/pv/emul-priv-op.c
> +++ b/xen/arch/x86/pv/emul-priv-op.c
> @@ -1027,7 +1027,7 @@ static int write_msr(unsigned int reg, u
>      case MSR_SHADOW_GS_BASE:
>          if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
>              break;
> -        wrmsrl(MSR_SHADOW_GS_BASE, val);
> +        wrgsshadow(val);
>          curr->arch.pv_vcpu.gs_base_user = val;
>          return X86EMUL_OKAY;
>  
> --- a/xen/arch/x86/x86_64/mm.c
> +++ b/xen/arch/x86/x86_64/mm.c
> @@ -1034,7 +1034,7 @@ long do_set_segment_base(unsigned int wh
>      case SEGBASE_GS_USER:
>          if ( is_canonical_address(base) )
>          {
> -            wrmsrl(MSR_SHADOW_GS_BASE, base);
> +            wrgsshadow(base);
>              v->arch.pv_vcpu.gs_base_user = base;
>          }
>          else
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
> @@ -49,7 +49,7 @@ static void read_registers(struct cpu_us
>      regs->gs = read_sreg(gs);
>      crs[5] = rdfsbase();
>      crs[6] = rdgsbase();
> -    rdmsrl(MSR_SHADOW_GS_BASE, crs[7]);
> +    crs[7] = rdgsshadow();
>  }
>  
>  static void _show_registers(
> --- a/xen/include/asm-x86/msr.h
> +++ b/xen/include/asm-x86/msr.h
> @@ -8,6 +8,7 @@
>  #include <xen/types.h>
>  #include <xen/percpu.h>
>  #include <xen/errno.h>
> +#include <asm/alternative.h>
>  #include <asm/asm_defns.h>
>  #include <asm/cpufeature.h>
>  
> @@ -172,6 +173,24 @@ static inline unsigned long rdgsbase(voi
>      return base;
>  }
>  
> +static inline unsigned long rdgsshadow(void)
> +{
> +    unsigned long base;
> +
> +    alternative_io("mov %[msr], %%ecx\n\t"
> +                   "rdmsr\n\t"
> +                   "shl $32, %%rdx\n\t"
> +                   "or %%rdx, %[res]",
> +                   "swapgs\n\t"
> +                   ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t" /* rdgsbase rax */
> +                   "swapgs",
> +                   X86_FEATURE_FSGSBASE,
> +                   [res] "=&a" (base),
> +                   [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
> +
> +    return base;
> +}
> +
>  static inline void wrfsbase(unsigned long base)
>  {
>      if ( cpu_has_fsgsbase )
> @@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
>          wrmsrl(MSR_GS_BASE, base);
>  }
>  
> +static inline void wrgsshadow(unsigned long base)
> +{
> +    alternative_input("mov %[msr], %%ecx\n\t"
> +                      "shld $32, %[val], %%rdx\n\t"
> +                      "wrmsr",
> +                      "swapgs\n\t"
> +                      ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8\n\t" /* wrgsbase rax */
> +                      "swapgs",
> +                      X86_FEATURE_FSGSBASE,
> +                      [val] "a" (base),
> +                      [msr] "i" (MSR_SHADOW_GS_BASE) : "rcx", "rdx");
> +}
> +
>  DECLARE_PER_CPU(u64, efer);
>  u64 read_efer(void);
>  void write_efer(u64 val);

* Re: [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2017-12-07 10:45     ` Jan Beulich
@ 2018-03-01 15:47       ` Andrew Cooper
  2018-03-01 17:00         ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Cooper @ 2018-03-01 15:47 UTC
  To: Jan Beulich; +Cc: xen-devel

On 07/12/17 10:45, Jan Beulich wrote:
>>>> On 06.12.17 at 20:34, <andrew.cooper3@citrix.com> wrote:
>> On 06/12/17 16:37, Jan Beulich wrote:
>>> @@ -172,6 +173,24 @@ static inline unsigned long rdgsbase(voi
>>>      return base;
>>>  }
>>>  
>>> +static inline unsigned long rdgsshadow(void)
>>> +{
>>> +    unsigned long base;
>>> +
>>> +    alternative_io("mov %[msr], %%ecx\n\t"
>>> +                   "rdmsr\n\t"
>>> +                   "shl $32, %%rdx\n\t"
>>> +                   "or %%rdx, %[res]",
>> There needs to be some clearer distinction between the two
>> alternatives.  It took a while for me to spot this comma.
> Any suggestion? I've been noticing the issue of the split being
> hard to spot in other places as well, so I'd like to do this in a
> generally applicable and sufficiently uniform way.

The best I can think of is something like this:

"or %%rdx, %[res]",
/* Use a double swapgs and rdgsbase if available. */
"swapgs\n\t"


>
>> I'm not entirely sure the alternative is justified here.  For
>> consistency alone, these helpers should match their companions, and in
>> the unlikely case that the runtime feature test does make a measurable
>> difference, wouldn't a static key be a better option anyway?
> Static key? The main reason I dislike making {rd,wr}{fs,gs}base
> use alternatives is that the original code would be much larger
> than the replacement code. IOW, of the options to make things
> consistent, I'd prefer using alternatives for the other inline
> functions too. But I think the choice here should be whatever fits
> best.
>
>>> @@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
>>>          wrmsrl(MSR_GS_BASE, base);
>>>  }
>>>  
>>> +static inline void wrgsshadow(unsigned long base)
>>> +{
>>> +    alternative_input("mov %[msr], %%ecx\n\t"
>>> +                      "shld $32, %[val], %%rdx\n\t"
>> This is a vector-path instruction, specifically called out to be
>> avoided in the AMD optimisation guide.  On all hardware (according to
>> Agner's latency measurements) it alone has a longer latency to execute
>> than the mov/shl pair you've replaced, and that is before accounting for
>> mov elimination.
> For one, I doubt the latency of the SHLD will be noticeable next to
> the latency of the following WRMSR. And then I think the main goal
> should be to have optimal performance on modern CPUs. That means
> size-optimizing the original code, to reduce the amount of NOP
> padding needed when the alternative is being installed.

I'm sorry if this comes across as blunt, but you are too focused on the
wrong details.

I agree that `swap;{rd,wr}gsbase;swap` is better than wrmsr, and we
should be using it when available.

However, it is simply not the case that squeezing every possible byte
out of .text yields the best result.  Just as with inc and dec, there is
a very good reason why compilers don't emit shld, and that is because
the ORMs recommend against its use.  In this specific case, a mov/shl
pair is better than shld on all hardware, even on older pipelines which
don't do mov elimination.

You are going to need a more convincing argument than currently provided
as to why the helpers aren't implemented like this:

static inline unsigned long rdgsshadow(void)
{
    unsigned long base;

    if ( cpu_has_fsgsbase )
        asm volatile ( "swapgs\n\t"
#ifdef HAVE_GAS_FSGSBASE
                       "rdgsbase %0\n\t"
#else
                    ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t"
#endif
                        "swapgs\n\t"
                       :
#ifdef HAVE_GAS_FSGSBASE
                       "=r" (base)
#else
                       "=a" (base)
#endif
            );
    else
        rdmsrl(MSR_SHADOW_GS_BASE, base);

    return base;
}

static inline void wrgsshadow(unsigned long base)
{
    if ( cpu_has_fsgsbase )
        asm volatile ( "swapgs\n\t"
#ifdef HAVE_GAS_FSGSBASE
                       "wrgsbase %0\n\t"
#else
                       ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8\n\t"
#endif
                        "swapgs\n\t"
                       ::
#ifdef HAVE_GAS_FSGSBASE
                       "r" (base)
#else
                       "a" (base)
#endif
            );
    else
        wrmsrl(MSR_SHADOW_GS_BASE, base);
}

In particular:
1) This is consistent with existing {rd,wr}{fs,gs}base helpers.
2) cpu_has_fsgsbase is constant after boot, so the branch will be
predicted correctly.
3) It doesn't use instructions recommended against by the ORMs.

I don't wish to suggest that the above code is better than any and all
possible other combinations of performing the same operation, but the
more you get into the minutiae of micro-optimisation, the higher the
barrier for inclusion gets, because it is important to demonstrate that
the benefits genuinely outweigh the associated costs.

~Andrew


* Re: [PATCH 1/2] x86: improve MSR_SHADOW_GS accesses
  2018-03-01 15:47       ` Andrew Cooper
@ 2018-03-01 17:00         ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2018-03-01 17:00 UTC
  To: Andrew Cooper; +Cc: xen-devel

>>> On 01.03.18 at 16:47, <andrew.cooper3@citrix.com> wrote:
> On 07/12/17 10:45, Jan Beulich wrote:
>>>>> On 06.12.17 at 20:34, <andrew.cooper3@citrix.com> wrote:
>>> On 06/12/17 16:37, Jan Beulich wrote:
>>>> @@ -196,6 +215,19 @@ static inline void wrgsbase(unsigned lon
>>>>          wrmsrl(MSR_GS_BASE, base);
>>>>  }
>>>>  
>>>> +static inline void wrgsshadow(unsigned long base)
>>>> +{
>>>> +    alternative_input("mov %[msr], %%ecx\n\t"
>>>> +                      "shld $32, %[val], %%rdx\n\t"
>>> This is a vector-path instruction, specifically called out to be
>>> avoided in the AMD optimisation guide.  On all hardware (according to
>>> Agner's latency measurements) it alone has a longer latency to execute
>>> than the mov/shl pair you've replaced, and that is before accounting for
>>> mov elimination.
>> For one, I doubt the latency of the SHLD will be noticeable next to
>> the latency of the following WRMSR. And then I think the main goal
>> should be to have optimal performance on modern CPUs. That means
>> size-optimizing the original code, to reduce the amount of NOP
>> padding needed when the alternative is being installed.
> 
> I'm sorry if this comes across as blunt, but you are too focused on the
> wrong details.
> 
> I agree that `swap;{rd,wr}gsbase;swap` is better than wrmsr, and we
> should be using it when available.
> 
> However, it is simply not the case that squeezing every possible byte
> out of .text yields the best result.  Just as with inc and dec, there is
> a very good reason why compilers don't emit shld, and that is because
> the ORMs recommend against its use.  In this specific case, a mov/shl
> pair is better than shld on all hardware, even on older pipelines which
> don't do mov elimination.

Well, okay, I'll switch away from using SHLD. It'll only be a single
padding NOP anyway, just a slightly larger one. And note that this
wasn't about size optimization, but about reducing the NOP padding.
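
(Byte counts again, assuming the standard encodings: mov $imm32, %ecx
(5) plus shld (5) plus wrmsr (2) is 12 bytes, versus 14 bytes with a
mov (3) / shr (4) pair in place of the shld, against the 11-byte
swapgs/wrgsbase/swapgs replacement. So the padding grows from 1 byte
to 3, still a single multi-byte NOP.)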

> You are going to need a more convincing argument than currently provided
> as to why the helpers aren't implemented like this:
> 
> static inline unsigned long rdgsshadow(void)
> {
>     unsigned long base;
> 
>     if ( cpu_has_fsgsbase )
>         asm volatile ( "swapgs\n\t"
> #ifdef HAVE_GAS_FSGSBASE
>                        "rdgsbase %0\n\t"
> #else
>                     ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8\n\t"
> #endif
>                         "swapgs\n\t"
>                        :
> #ifdef HAVE_GAS_FSGSBASE
>                        "=r" (base)
> #else
>                        "=a" (base)
> #endif
>             );
>     else
>         rdmsrl(MSR_SHADOW_GS_BASE, base);
> 
>     return base;
> }
> 
> static inline void wrgsshadow(unsigned long base)
> {
>     if ( cpu_has_fsgsbase )
>         asm volatile ( "swapgs\n\t"
> #ifdef HAVE_GAS_FSGSBASE
>                        "wrgsbase %0\n\t"
> #else
>                        ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8\n\t"
> #endif
>                         "swapgs\n\t"
>                        ::
> #ifdef HAVE_GAS_FSGSBASE
>                        "r" (base)
> #else
>                        "a" (base)
> #endif
>             );
>     else
>         wrmsrl(MSR_SHADOW_GS_BASE, base);
> }
> 
> In particular:
> 1) This is consistent with existing {rd,wr}{fs,gs}base helpers.

Which, in due course, I had been meaning to convert to alternatives
as well.

> 2) cpu_has_fsgsbase is constant after boot, so the branch will be
> predicted correctly.

If it has an entry in the BTB in the first place.

I can certainly also switch away from using alternatives here, but
especially considering my response to 1) I would do so quite
hesitantly.

Jan

