* [PATCH v2] x86: use VMLOAD for PV context switch
@ 2018-09-10 14:03 Jan Beulich
  2018-09-10 21:56 ` Boris Ostrovsky
  2018-09-18 12:28 ` [PATCH v3] " Jan Beulich
  0 siblings, 2 replies; 12+ messages in thread
From: Jan Beulich @ 2018-09-10 14:03 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

Having noticed that VMLOAD alone is about as fast as a single one of the
involved WRMSRs, I thought it might be a reasonable idea to also use it
for PV. Measurements, however, have shown that an actual improvement can
be achieved only with an early prefetch of the VMCB (thanks to Andrew
for suggesting to try this), which I have to admit I can't really
explain. This way, on my Fam15 box, a context switch takes over 100 clocks
less on average (the measured values vary heavily in all cases, though).

This is intentionally not using a new hvm_funcs hook: For one, this is
all about PV, and something similar can hardly be done for VMX.
Furthermore the indirect to direct call patching that is meant to be
applied to most hvm_funcs hooks would be ugly to make work with
functions having more than 6 parameters.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Brian Woods <brian.woods@amd.com>
---
v2: Re-base.
---
Besides the mentioned oddity with measured performance, I've also
noticed a significant difference (of at least 150 clocks) between
measuring immediately around the calls to svm_load_segs() and measuring
immediately inside the function.
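
For illustration only: clock counts like the ones quoted above can be
obtained from simple TSC deltas taken around the code of interest,
roughly along the lines of the sketch below. This is not the actual
instrumentation used for the measurements; the tsc_read() helper is made
up for the example.

    #include <stdint.h>   /* or the hypervisor's own type definitions */

    static inline uint64_t tsc_read(void)
    {
        uint32_t lo, hi;

        /* LFENCE keeps RDTSC from drifting into/out of the measured region. */
        asm volatile ( "lfence; rdtsc" : "=a" (lo), "=d" (hi) :: "memory" );
        return ((uint64_t)hi << 32) | lo;
    }

    uint64_t t0 = tsc_read();
    /* measured region, e.g. the call to svm_load_segs() */
    uint64_t delta = tsc_read() - t0;   /* accumulate and average over many runs */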

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -52,6 +52,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/support.h>
+#include <asm/hvm/svm/svm.h>
 #include <asm/hvm/viridian.h>
 #include <asm/debugreg.h>
 #include <asm/msr.h>
@@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
     struct cpu_user_regs *uregs = &n->arch.user_regs;
     int all_segs_okay = 1;
     unsigned int dirty_segment_mask, cpu = smp_processor_id();
+    bool fs_gs_done = false;
 
     /* Load and clear the dirty segment mask. */
     dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
     per_cpu(dirty_segment_mask, cpu) = 0;
 
+#ifdef CONFIG_HVM
+    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
+         !((uregs->fs | uregs->gs) & ~3) &&
+         /*
+          * The remaining part is just for optimization: If only shadow GS
+          * needs loading, there's nothing to be gained here.
+          */
+         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
+    {
+        fs_gs_done = n->arch.flags & TF_kernel_mode
+            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
+                            uregs->fs, n->arch.pv.fs_base,
+                            uregs->gs, n->arch.pv.gs_base_kernel,
+                            n->arch.pv.gs_base_user)
+            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
+                            uregs->fs, n->arch.pv.fs_base,
+                            uregs->gs, n->arch.pv.gs_base_user,
+                            n->arch.pv.gs_base_kernel);
+    }
+#endif
+    if ( !fs_gs_done )
+        load_LDT(n);
+
     /* Either selector != 0 ==> reload. */
     if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
     {
@@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done )
     {
         all_segs_okay &= loadsegment(fs, uregs->fs);
         /* non-nul selector updates fs_base */
@@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done  )
     {
         all_segs_okay &= loadsegment(gs, uregs->gs);
         /* non-nul selector updates gs_base_user */
@@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
             dirty_segment_mask &= ~DIRTY_GS_BASE;
     }
 
-    if ( !is_pv_32bit_vcpu(n) )
+    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
     {
         /* This can only be non-zero if selector is NULL. */
         if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
@@ -1653,6 +1678,12 @@ static void __context_switch(void)
 
     write_ptbase(n);
 
+#if defined(CONFIG_PV) && defined(CONFIG_HVM)
+    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
+         !cpu_has_fsgsbase && cpu_has_svm )
+        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
+#endif
+
     if ( need_full_gdt(nd) &&
          ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
     {
@@ -1714,10 +1745,7 @@ void context_switch(struct vcpu *prev, s
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
-        {
-            load_LDT(next);
             load_segments(next);
-        }
 
         ctxt_switch_levelling(next);
 
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -78,6 +78,9 @@ static struct hvm_function_table svm_fun
  */
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);
+#ifdef CONFIG_PV
+static DEFINE_PER_CPU(struct vmcb_struct *, host_vmcb_va);
+#endif
 
 static bool_t amd_erratum383_found __read_mostly;
 
@@ -1567,6 +1570,14 @@ static void svm_cpu_dead(unsigned int cp
         *this_hsa = 0;
     }
 
+#ifdef CONFIG_PV
+    if ( per_cpu(host_vmcb_va, cpu) )
+    {
+        unmap_domain_page_global(per_cpu(host_vmcb_va, cpu));
+        per_cpu(host_vmcb_va, cpu) = NULL;
+    }
+#endif
+
     if ( *this_vmcb )
     {
         free_domheap_page(maddr_to_page(*this_vmcb));
@@ -1601,6 +1612,11 @@ static int svm_cpu_up_prepare(unsigned i
         if ( !pg )
             goto err;
 
+#ifdef CONFIG_PV
+        if ( !cpu_has_fsgsbase )
+            per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
+#endif
+
         clear_domain_page(page_to_mfn(pg));
         *this_vmcb = page_to_maddr(pg);
     }
@@ -1630,6 +1646,60 @@ static void svm_init_erratum_383(const s
     }
 }
 
+#ifdef CONFIG_PV
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow)
+{
+    unsigned int cpu = smp_processor_id();
+    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
+
+    if ( unlikely(!vmcb) )
+        return false;
+
+    if ( !ldt_base )
+    {
+        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
+        return true;
+    }
+
+    if ( likely(!ldt_ents) )
+        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
+    else
+    {
+        /* Keep GDT in sync. */
+        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
+                                   FIRST_RESERVED_GDT_ENTRY;
+
+        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
+
+        vmcb->ldtr.sel = LDT_ENTRY << 3;
+        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
+        vmcb->ldtr.limit = ldt_ents * 8 - 1;
+        vmcb->ldtr.base = ldt_base;
+    }
+
+    ASSERT(!(fs_sel & ~3));
+    vmcb->fs.sel = fs_sel;
+    vmcb->fs.attr = 0;
+    vmcb->fs.limit = 0;
+    vmcb->fs.base = fs_base;
+
+    ASSERT(!(gs_sel & ~3));
+    vmcb->gs.sel = gs_sel;
+    vmcb->gs.attr = 0;
+    vmcb->gs.limit = 0;
+    vmcb->gs.base = gs_base;
+
+    vmcb->kerngsbase = gs_shadow;
+
+    svm_vmload_pa(per_cpu(host_vmcb, cpu));
+
+    return true;
+}
+#endif
+
 static int _svm_cpu_up(bool bsp)
 {
     uint64_t msr_content;
@@ -1662,6 +1732,8 @@ static int _svm_cpu_up(bool bsp)
     /* Initialize OSVW bits to be used by guests */
     svm_host_osvw_init();
 
+    svm_vmsave_pa(per_cpu(host_vmcb, cpu));
+
     return 0;
 }
 
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -53,6 +53,12 @@ unsigned long *svm_msrbit(unsigned long
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
+/* PV context switch helper */
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow);
+
 extern u32 svm_feature_flags;
 
 #define SVM_FEATURE_NPT            0 /* Nested page table support */





* Re: [PATCH v2] x86: use VMLOAD for PV context switch
  2018-09-10 14:03 [PATCH v2] x86: use VMLOAD for PV context switch Jan Beulich
@ 2018-09-10 21:56 ` Boris Ostrovsky
  2018-09-11  7:54   ` Jan Beulich
  2018-09-18 12:28 ` [PATCH v3] " Jan Beulich
  1 sibling, 1 reply; 12+ messages in thread
From: Boris Ostrovsky @ 2018-09-10 21:56 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit

On 09/10/2018 10:03 AM, Jan Beulich wrote:
> Having noticed that VMLOAD alone is about as fast as a single of the
> involved WRMSRs, I thought it might be a reasonable idea to also use it
> for PV. Measurements, however, have shown that an actual improvement can
> be achieved only with an early prefetch of the VMCB (thanks to Andrew
> for suggesting to try this), which I have to admit I can't really
> explain. This way on my Fam15 box context switch takes over 100 clocks
> less on average (the measured values are heavily varying in all cases,
> though).
>
> This is intentionally not using a new hvm_funcs hook: For one, this is
> all about PV, and something similar can hardly be done for VMX.
> Furthermore the indirect to direct call patching that is meant to be
> applied to most hvm_funcs hooks would be ugly to make work with
> functions having more than 6 parameters.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Brian Woods <brian.woods@amd.com>
> ---
> v2: Re-base.
> ---
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.
>


>  
> +#ifdef CONFIG_PV
> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
> +                   unsigned int fs_sel, unsigned long fs_base,
> +                   unsigned int gs_sel, unsigned long gs_base,
> +                   unsigned long gs_shadow)
> +{
> +    unsigned int cpu = smp_processor_id();
> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
> +
> +    if ( unlikely(!vmcb) )
> +        return false;
> +
> +    if ( !ldt_base )
> +    {
> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
> +        return true;


Could you explain why this is true? We haven't loaded FS/GS here.

I also couldn't find discussion about prefetch --- why is prefetching
ldtr expected to help?

-boris


* Re: [PATCH v2] x86: use VMLOAD for PV context switch
  2018-09-10 21:56 ` Boris Ostrovsky
@ 2018-09-11  7:54   ` Jan Beulich
  2018-09-11 14:17     ` Boris Ostrovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-09-11  7:54 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit, xen-devel

>>> On 10.09.18 at 23:56, <boris.ostrovsky@oracle.com> wrote:
> On 09/10/2018 10:03 AM, Jan Beulich wrote:
>> Having noticed that VMLOAD alone is about as fast as a single of the
>> involved WRMSRs, I thought it might be a reasonable idea to also use it
>> for PV. Measurements, however, have shown that an actual improvement can
>> be achieved only with an early prefetch of the VMCB (thanks to Andrew
>> for suggesting to try this), which I have to admit I can't really
>> explain. This way on my Fam15 box context switch takes over 100 clocks
>> less on average (the measured values are heavily varying in all cases,
>> though).
>>
>> This is intentionally not using a new hvm_funcs hook: For one, this is
>> all about PV, and something similar can hardly be done for VMX.
>> Furthermore the indirect to direct call patching that is meant to be
>> applied to most hvm_funcs hooks would be ugly to make work with
>> functions having more than 6 parameters.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Acked-by: Brian Woods <brian.woods@amd.com>
>> ---
>> v2: Re-base.
>> ---
>> Besides the mentioned oddity with measured performance, I've also
>> noticed a significant difference (of at least 150 clocks) between
>> measuring immediately around the calls to svm_load_segs() and measuring
>> immediately inside the function.
>>
> 
> 
>>  
>> +#ifdef CONFIG_PV
>> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
>> +                   unsigned int fs_sel, unsigned long fs_base,
>> +                   unsigned int gs_sel, unsigned long gs_base,
>> +                   unsigned long gs_shadow)
>> +{
>> +    unsigned int cpu = smp_processor_id();
>> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
>> +
>> +    if ( unlikely(!vmcb) )
>> +        return false;
>> +
>> +    if ( !ldt_base )
>> +    {
>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>> +        return true;
> 
> 
> Could you explain why this is true? We haven't loaded FS/GS here.

A zero ldt_base argument indicates a prefetch request. This is an
agreement between callers of the function and its implementation.
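
In the patch this corresponds to the two kinds of call sites (repeated
here only for illustration, with gs_base/gs_shadow standing for the
kernel/user GS bases picked according to TF_kernel_mode):

    /* __context_switch(): prefetch only, no state gets loaded. */
    svm_load_segs(0, 0, 0, 0, 0, 0, 0);

    /* load_segments(): actual load of LDT/FS/GS state via VMLOAD. */
    svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                  uregs->fs, n->arch.pv.fs_base,
                  uregs->gs, gs_base, gs_shadow);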

> I also couldn't find discussion about prefetch --- why is prefetching
> ldtr expected to help?

See the patch description. ldtr as the element is a pretty random
choice between the various fields VMLOAD touches. It's (presumably)
more the page walk than the actual cache line(s) that we want to
be pulled in ahead of time. I can only guess that VMLOAD execution
is more "synchronous" wrt its memory accesses and/or latency to
completion than other (simpler) instructions.

Jan




* Re: [PATCH v2] x86: use VMLOAD for PV context switch
  2018-09-11  7:54   ` Jan Beulich
@ 2018-09-11 14:17     ` Boris Ostrovsky
  2018-09-11 14:38       ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Boris Ostrovsky @ 2018-09-11 14:17 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit, xen-devel

On 9/11/18 3:54 AM, Jan Beulich wrote:
>>>> On 10.09.18 at 23:56, <boris.ostrovsky@oracle.com> wrote:
>> On 09/10/2018 10:03 AM, Jan Beulich wrote:
>>> Having noticed that VMLOAD alone is about as fast as a single of the
>>> involved WRMSRs, I thought it might be a reasonable idea to also use it
>>> for PV. Measurements, however, have shown that an actual improvement can
>>> be achieved only with an early prefetch of the VMCB (thanks to Andrew
>>> for suggesting to try this), which I have to admit I can't really
>>> explain. This way on my Fam15 box context switch takes over 100 clocks
>>> less on average (the measured values are heavily varying in all cases,
>>> though).
>>>
>>> This is intentionally not using a new hvm_funcs hook: For one, this is
>>> all about PV, and something similar can hardly be done for VMX.
>>> Furthermore the indirect to direct call patching that is meant to be
>>> applied to most hvm_funcs hooks would be ugly to make work with
>>> functions having more than 6 parameters.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> Acked-by: Brian Woods <brian.woods@amd.com>
>>> ---
>>> v2: Re-base.
>>> ---
>>> Besides the mentioned oddity with measured performance, I've also
>>> noticed a significant difference (of at least 150 clocks) between
>>> measuring immediately around the calls to svm_load_segs() and measuring
>>> immediately inside the function.
>>>
>>
>>
>>>  
>>> +#ifdef CONFIG_PV
>>> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
>>> +                   unsigned int fs_sel, unsigned long fs_base,
>>> +                   unsigned int gs_sel, unsigned long gs_base,
>>> +                   unsigned long gs_shadow)
>>> +{
>>> +    unsigned int cpu = smp_processor_id();
>>> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
>>> +
>>> +    if ( unlikely(!vmcb) )
>>> +        return false;
>>> +
>>> +    if ( !ldt_base )
>>> +    {
>>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>>> +        return true;
>>
>>
>> Could you explain why this is true? We haven't loaded FS/GS here.
> 
> A zero ldt_base argument indicates a prefetch request. This is an
> agreement between callers of the function and its implementation.


Oh, so this is what svm_load_segs(0, 0, 0, 0, 0, 0, 0) is for?

If yes then IMO a separate call would make things a bit clearer,
especially since the return value is ignored.
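
Something along these lines, say (hypothetical helper, name purely
illustrative and not part of the patch):

    #ifdef CONFIG_PV
    /* Prefetch-only entry point: pull the per-CPU host VMCB in ahead of time. */
    void svm_load_segs_prefetch(void)
    {
        const struct vmcb_struct *vmcb = this_cpu(host_vmcb_va);

        if ( vmcb )
            asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
    }
    #endif

with __context_switch() then calling this instead of the
svm_load_segs(0, 0, 0, 0, 0, 0, 0) form, and svm_load_segs() itself no
longer needing the !ldt_base special case.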


> 
>> I also couldn't find discussion about prefetch --- why is prefetching
>> ldtr expected to help?
> 
> See the patch description. ldtr as the element is a pretty random
> choice between the various fields VMLOAD touches. It's (presumably)
> more the page walk than the actual cache line(s) that we want to
> be pulled in ahead of time. I can only guess that VMLOAD execution
> is more "synchronous" wrt its memory accesses and/or latency to
> completion than other (simpler) instructions.

I think a code comment would be very helpful (including the fact that
ldtr is an arbitrary field), even if this is mentioned in the commit
message.

-boris


* Re: [PATCH v2] x86: use VMLOAD for PV context switch
  2018-09-11 14:17     ` Boris Ostrovsky
@ 2018-09-11 14:38       ` Jan Beulich
  2018-09-11 15:10         ` Boris Ostrovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-09-11 14:38 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit, xen-devel

>>> On 11.09.18 at 16:17, <boris.ostrovsky@oracle.com> wrote:
> On 9/11/18 3:54 AM, Jan Beulich wrote:
>>>>> On 10.09.18 at 23:56, <boris.ostrovsky@oracle.com> wrote:
>>> On 09/10/2018 10:03 AM, Jan Beulich wrote:
>>>> Having noticed that VMLOAD alone is about as fast as a single of the
>>>> involved WRMSRs, I thought it might be a reasonable idea to also use it
>>>> for PV. Measurements, however, have shown that an actual improvement can
>>>> be achieved only with an early prefetch of the VMCB (thanks to Andrew
>>>> for suggesting to try this), which I have to admit I can't really
>>>> explain. This way on my Fam15 box context switch takes over 100 clocks
>>>> less on average (the measured values are heavily varying in all cases,
>>>> though).
>>>>
>>>> This is intentionally not using a new hvm_funcs hook: For one, this is
>>>> all about PV, and something similar can hardly be done for VMX.
>>>> Furthermore the indirect to direct call patching that is meant to be
>>>> applied to most hvm_funcs hooks would be ugly to make work with
>>>> functions having more than 6 parameters.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> Acked-by: Brian Woods <brian.woods@amd.com>
>>>> ---
>>>> v2: Re-base.
>>>> ---
>>>> Besides the mentioned oddity with measured performance, I've also
>>>> noticed a significant difference (of at least 150 clocks) between
>>>> measuring immediately around the calls to svm_load_segs() and measuring
>>>> immediately inside the function.
>>>>
>>>
>>>
>>>>  
>>>> +#ifdef CONFIG_PV
>>>> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
>>>> +                   unsigned int fs_sel, unsigned long fs_base,
>>>> +                   unsigned int gs_sel, unsigned long gs_base,
>>>> +                   unsigned long gs_shadow)
>>>> +{
>>>> +    unsigned int cpu = smp_processor_id();
>>>> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
>>>> +
>>>> +    if ( unlikely(!vmcb) )
>>>> +        return false;
>>>> +
>>>> +    if ( !ldt_base )
>>>> +    {
>>>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>>>> +        return true;
>>>
>>>
>>> Could you explain why this is true? We haven't loaded FS/GS here.
>> 
>> A zero ldt_base argument indicates a prefetch request. This is an
>> agreement between callers of the function and its implementation.
> 
> 
> Oh, so this is what svm_load_segs(0, 0, 0, 0, 0, 0, 0) is for?
> 
> If yes then IMO a separate call would make things a bit clearer,
> especially since the return value is ignored.

Well, to me having a single central place where everything gets done
seemed better. And it looks as if Brian agreed, considering I already
have his ack for the patch. Let me know if you feel strongly.

>>> I also couldn't find discussion about prefetch --- why is prefetching
>>> ldtr expected to help?
>> 
>> See the patch description. ldtr as the element is a pretty random
>> choice between the various fields VMLOAD touches. It's (presumably)
>> more the page walk than the actual cache line(s) that we want to
>> be pulled in ahead of time. I can only guess that VMLOAD execution
>> is more "synchronous" wrt its memory accesses and/or latency to
>> completion than other (simpler) instructions.
> 
> I think a code comment would be very helpful (including the fact that
> ldtr is an arbitrary field), even if this is mentioned in the commit
> message.

I would likely have added a comment if I could firmly state what's
going on. But this is derived from experiments only - I'd require
AMD to fill in the holes before I could write a (useful) comment.

Jan




* Re: [PATCH v2] x86: use VMLOAD for PV context switch
  2018-09-11 14:38       ` Jan Beulich
@ 2018-09-11 15:10         ` Boris Ostrovsky
  0 siblings, 0 replies; 12+ messages in thread
From: Boris Ostrovsky @ 2018-09-11 15:10 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit, xen-devel

On 9/11/18 10:38 AM, Jan Beulich wrote:
>>>> On 11.09.18 at 16:17, <boris.ostrovsky@oracle.com> wrote:
>> On 9/11/18 3:54 AM, Jan Beulich wrote:
>>>>>> On 10.09.18 at 23:56, <boris.ostrovsky@oracle.com> wrote:
>>>> On 09/10/2018 10:03 AM, Jan Beulich wrote:
>>>>> Having noticed that VMLOAD alone is about as fast as a single of the
>>>>> involved WRMSRs, I thought it might be a reasonable idea to also use it
>>>>> for PV. Measurements, however, have shown that an actual improvement can
>>>>> be achieved only with an early prefetch of the VMCB (thanks to Andrew
>>>>> for suggesting to try this), which I have to admit I can't really
>>>>> explain. This way on my Fam15 box context switch takes over 100 clocks
>>>>> less on average (the measured values are heavily varying in all cases,
>>>>> though).
>>>>>
>>>>> This is intentionally not using a new hvm_funcs hook: For one, this is
>>>>> all about PV, and something similar can hardly be done for VMX.
>>>>> Furthermore the indirect to direct call patching that is meant to be
>>>>> applied to most hvm_funcs hooks would be ugly to make work with
>>>>> functions having more than 6 parameters.
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>>> Acked-by: Brian Woods <brian.woods@amd.com>
>>>>> ---
>>>>> v2: Re-base.
>>>>> ---
>>>>> Besides the mentioned oddity with measured performance, I've also
>>>>> noticed a significant difference (of at least 150 clocks) between
>>>>> measuring immediately around the calls to svm_load_segs() and measuring
>>>>> immediately inside the function.
>>>>>
>>>>
>>>>
>>>>>  
>>>>> +#ifdef CONFIG_PV
>>>>> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
>>>>> +                   unsigned int fs_sel, unsigned long fs_base,
>>>>> +                   unsigned int gs_sel, unsigned long gs_base,
>>>>> +                   unsigned long gs_shadow)
>>>>> +{
>>>>> +    unsigned int cpu = smp_processor_id();
>>>>> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
>>>>> +
>>>>> +    if ( unlikely(!vmcb) )
>>>>> +        return false;
>>>>> +
>>>>> +    if ( !ldt_base )
>>>>> +    {
>>>>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>>>>> +        return true;
>>>>
>>>>
>>>> Could you explain why this is true? We haven't loaded FS/GS here.
>>>
>>> A zero ldt_base argument indicates a prefetch request. This is an
>>> agreement between callers of the function and its implementation.
>>
>>
>> Oh, so this is what svm_load_segs(0, 0, 0, 0, 0, 0, 0) is for?
>>
>> If yes then IMO a separate call would make things a bit clearer,
>> especially since the return value is ignored.
> 
> Well, to me having a single central place where everything gets done
> seemed better. And it looks as if Brian agreed, considering I already
> have his ack for the patch. Let me know if you feel strongly.


I would at least like to have a comment explaining the calling convention.


> 
>>>> I also couldn't find discussion about prefetch --- why is prefetching
>>>> ldtr expected to help?
>>>
>>> See the patch description. ldtr as the element is a pretty random
>>> choice between the various fields VMLOAD touches. It's (presumably)
>>> more the page walk than the actual cache line(s) that we want to
>>> be pulled in ahead of time. I can only guess that VMLOAD execution
>>> is more "synchronous" wrt its memory accesses and/or latency to
>>> completion than other (simpler) instructions.
>>
>> I think a code comment would be very helpful (including the fact that
>> ldtr is an arbitrary field), even if this is mentioned in the commit
>> message.
> 
> I would likely have added a comment if I could firmly state what's
> going on. But this is derived from experiments only - I'd require
> AMD to fill in the holes before I could write a (useful) comment.


Well, since we have actual code we should be able to explain why we have
it ;-). Even if this is speculation on your part.

Otherwise someone looking at this will (likely?) have no idea about
what's going on, and doing git blame doesn't always work.

-boris



* [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-10 14:03 [PATCH v2] x86: use VMLOAD for PV context switch Jan Beulich
  2018-09-10 21:56 ` Boris Ostrovsky
@ 2018-09-18 12:28 ` Jan Beulich
  2018-09-18 16:11   ` Boris Ostrovsky
  2018-09-25 16:14   ` Andrew Cooper
  1 sibling, 2 replies; 12+ messages in thread
From: Jan Beulich @ 2018-09-18 12:28 UTC (permalink / raw)
  To: xen-devel
  Cc: Andrew Cooper, Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

Having noticed that VMLOAD alone is about as fast as a single one of the
involved WRMSRs, I thought it might be a reasonable idea to also use it
for PV. Measurements, however, have shown that an actual improvement can
be achieved only with an early prefetch of the VMCB (thanks to Andrew
for suggesting to try this), which I have to admit I can't really
explain. This way, on my Fam15 box, a context switch takes over 100 clocks
less on average (the measured values vary heavily in all cases, though).

This is intentionally not using a new hvm_funcs hook: For one, this is
all about PV, and something similar can hardly be done for VMX.
Furthermore the indirect to direct call patching that is meant to be
applied to most hvm_funcs hooks would be ugly to make work with
functions having more than 6 parameters.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Brian Woods <brian.woods@amd.com>
---
v3: Add/extend comments.
v2: Re-base.
---
Besides the mentioned oddity with measured performance, I've also
noticed a significant difference (of at least 150 clocks) between
measuring immediately around the calls to svm_load_segs() and measuring
immediately inside the function.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -52,6 +52,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/support.h>
+#include <asm/hvm/svm/svm.h>
 #include <asm/hvm/viridian.h>
 #include <asm/debugreg.h>
 #include <asm/msr.h>
@@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
     struct cpu_user_regs *uregs = &n->arch.user_regs;
     int all_segs_okay = 1;
     unsigned int dirty_segment_mask, cpu = smp_processor_id();
+    bool fs_gs_done = false;
 
     /* Load and clear the dirty segment mask. */
     dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
     per_cpu(dirty_segment_mask, cpu) = 0;
 
+#ifdef CONFIG_HVM
+    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
+         !((uregs->fs | uregs->gs) & ~3) &&
+         /*
+          * The remaining part is just for optimization: If only shadow GS
+          * needs loading, there's nothing to be gained here.
+          */
+         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
+    {
+        fs_gs_done = n->arch.flags & TF_kernel_mode
+            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
+                            uregs->fs, n->arch.pv.fs_base,
+                            uregs->gs, n->arch.pv.gs_base_kernel,
+                            n->arch.pv.gs_base_user)
+            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
+                            uregs->fs, n->arch.pv.fs_base,
+                            uregs->gs, n->arch.pv.gs_base_user,
+                            n->arch.pv.gs_base_kernel);
+    }
+#endif
+    if ( !fs_gs_done )
+        load_LDT(n);
+
     /* Either selector != 0 ==> reload. */
     if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
     {
@@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done )
     {
         all_segs_okay &= loadsegment(fs, uregs->fs);
         /* non-nul selector updates fs_base */
@@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
     }
 
     /* Either selector != 0 ==> reload. */
-    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
+    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done  )
     {
         all_segs_okay &= loadsegment(gs, uregs->gs);
         /* non-nul selector updates gs_base_user */
@@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
             dirty_segment_mask &= ~DIRTY_GS_BASE;
     }
 
-    if ( !is_pv_32bit_vcpu(n) )
+    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
     {
         /* This can only be non-zero if selector is NULL. */
         if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
@@ -1653,6 +1678,12 @@ static void __context_switch(void)
 
     write_ptbase(n);
 
+#if defined(CONFIG_PV) && defined(CONFIG_HVM)
+    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
+         !cpu_has_fsgsbase && cpu_has_svm )
+        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
+#endif
+
     if ( need_full_gdt(nd) &&
          ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
     {
@@ -1714,10 +1745,7 @@ void context_switch(struct vcpu *prev, s
         local_irq_enable();
 
         if ( is_pv_domain(nextd) )
-        {
-            load_LDT(next);
             load_segments(next);
-        }
 
         ctxt_switch_levelling(next);
 
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -78,6 +78,9 @@ static struct hvm_function_table svm_fun
  */
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);
+#ifdef CONFIG_PV
+static DEFINE_PER_CPU(struct vmcb_struct *, host_vmcb_va);
+#endif
 
 static bool_t amd_erratum383_found __read_mostly;
 
@@ -1567,6 +1570,14 @@ static void svm_cpu_dead(unsigned int cp
         *this_hsa = 0;
     }
 
+#ifdef CONFIG_PV
+    if ( per_cpu(host_vmcb_va, cpu) )
+    {
+        unmap_domain_page_global(per_cpu(host_vmcb_va, cpu));
+        per_cpu(host_vmcb_va, cpu) = NULL;
+    }
+#endif
+
     if ( *this_vmcb )
     {
         free_domheap_page(maddr_to_page(*this_vmcb));
@@ -1601,6 +1612,11 @@ static int svm_cpu_up_prepare(unsigned i
         if ( !pg )
             goto err;
 
+#ifdef CONFIG_PV
+        if ( !cpu_has_fsgsbase )
+            per_cpu(host_vmcb_va, cpu) = __map_domain_page_global(pg);
+#endif
+
         clear_domain_page(page_to_mfn(pg));
         *this_vmcb = page_to_maddr(pg);
     }
@@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
     }
 }
 
+#ifdef CONFIG_PV
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow)
+{
+    unsigned int cpu = smp_processor_id();
+    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
+
+    if ( unlikely(!vmcb) )
+        return false;
+
+    if ( !ldt_base )
+    {
+        /*
+         * The actual structure field used here was arbitrarily chosen.
+         * Empirically it doesn't seem to matter much which element is used,
+         * and a clear explanation of the otherwise poor performance has not
+         * been found/provided so far.
+         */
+        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
+        return true;
+    }
+
+    if ( likely(!ldt_ents) )
+        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
+    else
+    {
+        /* Keep GDT in sync. */
+        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
+                                   FIRST_RESERVED_GDT_ENTRY;
+
+        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
+
+        vmcb->ldtr.sel = LDT_ENTRY << 3;
+        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
+        vmcb->ldtr.limit = ldt_ents * 8 - 1;
+        vmcb->ldtr.base = ldt_base;
+    }
+
+    ASSERT(!(fs_sel & ~3));
+    vmcb->fs.sel = fs_sel;
+    vmcb->fs.attr = 0;
+    vmcb->fs.limit = 0;
+    vmcb->fs.base = fs_base;
+
+    ASSERT(!(gs_sel & ~3));
+    vmcb->gs.sel = gs_sel;
+    vmcb->gs.attr = 0;
+    vmcb->gs.limit = 0;
+    vmcb->gs.base = gs_base;
+
+    vmcb->kerngsbase = gs_shadow;
+
+    svm_vmload_pa(per_cpu(host_vmcb, cpu));
+
+    return true;
+}
+#endif
+
 static int _svm_cpu_up(bool bsp)
 {
     uint64_t msr_content;
@@ -1662,6 +1738,8 @@ static int _svm_cpu_up(bool bsp)
     /* Initialize OSVW bits to be used by guests */
     svm_host_osvw_init();
 
+    svm_vmsave_pa(per_cpu(host_vmcb, cpu));
+
     return 0;
 }
 
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -53,6 +53,15 @@ unsigned long *svm_msrbit(unsigned long
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len);
 void svm_update_guest_cr(struct vcpu *, unsigned int cr, unsigned int flags);
 
+/*
+ * PV context switch helper. Calls with zero ldt_base request a prefetch of
+ * the VMCB area to be loaded from, instead of an actual load of state.
+ */
+bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
+                   unsigned int fs_sel, unsigned long fs_base,
+                   unsigned int gs_sel, unsigned long gs_base,
+                   unsigned long gs_shadow);
+
 extern u32 svm_feature_flags;
 
 #define SVM_FEATURE_NPT            0 /* Nested page table support */





* Re: [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-18 12:28 ` [PATCH v3] " Jan Beulich
@ 2018-09-18 16:11   ` Boris Ostrovsky
  2018-09-25 16:14   ` Andrew Cooper
  1 sibling, 0 replies; 12+ messages in thread
From: Boris Ostrovsky @ 2018-09-18 16:11 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Brian Woods, Suravee Suthikulpanit

On 9/18/18 8:28 AM, Jan Beulich wrote:
> Having noticed that VMLOAD alone is about as fast as a single of the
> involved WRMSRs, I thought it might be a reasonable idea to also use it
> for PV. Measurements, however, have shown that an actual improvement can
> be achieved only with an early prefetch of the VMCB (thanks to Andrew
> for suggesting to try this), which I have to admit I can't really
> explain. This way on my Fam15 box context switch takes over 100 clocks
> less on average (the measured values are heavily varying in all cases,
> though).
>
> This is intentionally not using a new hvm_funcs hook: For one, this is
> all about PV, and something similar can hardly be done for VMX.
> Furthermore the indirect to direct call patching that is meant to be
> applied to most hvm_funcs hooks would be ugly to make work with
> functions having more than 6 parameters.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Brian Woods <brian.woods@amd.com>
> ---
> v3: Add/extend comments.
> v2: Re-base.
> ---
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>




* Re: [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-18 12:28 ` [PATCH v3] " Jan Beulich
  2018-09-18 16:11   ` Boris Ostrovsky
@ 2018-09-25 16:14   ` Andrew Cooper
  2018-09-26  6:33     ` Jan Beulich
  1 sibling, 1 reply; 12+ messages in thread
From: Andrew Cooper @ 2018-09-25 16:14 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

On 18/09/18 13:28, Jan Beulich wrote:
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.

This is a little concerning.  It either means that there is a bug with
your timing, or (along the same line as why I think the prefetch makes a
difference), the mapping of svm_load_segs() is reliably TLB-cold in
the context switch path.

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -52,6 +52,7 @@
>  #include <asm/hvm/hvm.h>
>  #include <asm/hvm/nestedhvm.h>
>  #include <asm/hvm/support.h>
> +#include <asm/hvm/svm/svm.h>
>  #include <asm/hvm/viridian.h>
>  #include <asm/debugreg.h>
>  #include <asm/msr.h>
> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>      int all_segs_okay = 1;
>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
> +    bool fs_gs_done = false;
>  
>      /* Load and clear the dirty segment mask. */
>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>      per_cpu(dirty_segment_mask, cpu) = 0;
>  
> +#ifdef CONFIG_HVM
> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
> +         !((uregs->fs | uregs->gs) & ~3) &&
> +         /*
> +          * The remaining part is just for optimization: If only shadow GS
> +          * needs loading, there's nothing to be gained here.

VMLOAD also loads LDT, and LLDT is fully serialising, so an even heavier
perf hit than wrmsr.

> +          */
> +         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
> +    {
> +        fs_gs_done = n->arch.flags & TF_kernel_mode
> +            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_kernel,
> +                            n->arch.pv.gs_base_user)
> +            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> +                            uregs->fs, n->arch.pv.fs_base,
> +                            uregs->gs, n->arch.pv.gs_base_user,
> +                            n->arch.pv.gs_base_kernel);

This looks like a confusing way of writing:

    {
        unsigned long gsb = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
        unsigned long gss = n->arch.flags & TF_kernel_mode
            ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;

        fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                                   uregs->fs, n->arch.pv.fs_base,
                                   uregs->gs, gsb, gss);
    }


AFAICT?

> +    }
> +#endif
> +    if ( !fs_gs_done )
> +        load_LDT(n);
> +
>      /* Either selector != 0 ==> reload. */
>      if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
>      {
> @@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
>      }
>  
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) && !fs_gs_done )
>      {
>          all_segs_okay &= loadsegment(fs, uregs->fs);
>          /* non-nul selector updates fs_base */
> @@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
>      }
>  
>      /* Either selector != 0 ==> reload. */
> -    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
> +    if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) && !fs_gs_done  )

One too many spaces.

>      {
>          all_segs_okay &= loadsegment(gs, uregs->gs);
>          /* non-nul selector updates gs_base_user */
> @@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
>              dirty_segment_mask &= ~DIRTY_GS_BASE;
>      }
>  
> -    if ( !is_pv_32bit_vcpu(n) )
> +    if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
>      {
>          /* This can only be non-zero if selector is NULL. */
>          if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
> @@ -1653,6 +1678,12 @@ static void __context_switch(void)
>  
>      write_ptbase(n);
>  
> +#if defined(CONFIG_PV) && defined(CONFIG_HVM)

From a comments in code point of view, this is the most useful location
to have something along the lines of:

/* Prefetch the VMCB if we expect to use it later in the context switch */

> +    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
> +         !cpu_has_fsgsbase && cpu_has_svm )
> +        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
> +#endif
> +
>      if ( need_full_gdt(nd) &&
>           ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
>      {
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
>      }
>  }
>  
> +#ifdef CONFIG_PV
> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
> +                   unsigned int fs_sel, unsigned long fs_base,
> +                   unsigned int gs_sel, unsigned long gs_base,
> +                   unsigned long gs_shadow)
> +{
> +    unsigned int cpu = smp_processor_id();
> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
> +
> +    if ( unlikely(!vmcb) )
> +        return false;

When can this error path ever be taken?

> +
> +    if ( !ldt_base )
> +    {
> +        /*
> +         * The actual structure field used here was arbitrarily chosen.
> +         * Empirically it doesn't seem to matter much which element is used,
> +         * and a clear explanation of the otherwise poor performance has not
> +         * been found/provided so far.
> +         */
> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );

prefetchw(), which already exists and is used.

~Andrew

> +        return true;
> +    }
> +
> +    if ( likely(!ldt_ents) )
> +        memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
> +    else
> +    {
> +        /* Keep GDT in sync. */
> +        struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
> +                                   FIRST_RESERVED_GDT_ENTRY;
> +
> +        _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
> +
> +        vmcb->ldtr.sel = LDT_ENTRY << 3;
> +        vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
> +        vmcb->ldtr.limit = ldt_ents * 8 - 1;
> +        vmcb->ldtr.base = ldt_base;
> +    }
> +
> +    ASSERT(!(fs_sel & ~3));
> +    vmcb->fs.sel = fs_sel;
> +    vmcb->fs.attr = 0;
> +    vmcb->fs.limit = 0;
> +    vmcb->fs.base = fs_base;
> +
> +    ASSERT(!(gs_sel & ~3));
> +    vmcb->gs.sel = gs_sel;
> +    vmcb->gs.attr = 0;
> +    vmcb->gs.limit = 0;
> +    vmcb->gs.base = gs_base;
> +
> +    vmcb->kerngsbase = gs_shadow;
> +
> +    svm_vmload_pa(per_cpu(host_vmcb, cpu));
> +
> +    return true;
> +}
> +#endif
> +
>  static int _svm_cpu_up(bool bsp)
>  {
>      uint64_t msr_content;



* Re: [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-25 16:14   ` Andrew Cooper
@ 2018-09-26  6:33     ` Jan Beulich
  2018-09-26 12:47       ` Andrew Cooper
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2018-09-26  6:33 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

>>> On 25.09.18 at 18:14, <andrew.cooper3@citrix.com> wrote:
> On 18/09/18 13:28, Jan Beulich wrote:
>> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>>      int all_segs_okay = 1;
>>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
>> +    bool fs_gs_done = false;
>>  
>>      /* Load and clear the dirty segment mask. */
>>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>>      per_cpu(dirty_segment_mask, cpu) = 0;
>>  
>> +#ifdef CONFIG_HVM
>> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
>> +         !((uregs->fs | uregs->gs) & ~3) &&
>> +         /*
>> +          * The remaining part is just for optimization: If only shadow GS
>> +          * needs loading, there's nothing to be gained here.
> 
> VMLOAD also loads LDT, and LLDT is fully serialising, so an even heavier
> perf hit than wrmsr.

I don't understand how your remark relates to the comment, or ...

>> +          */
>> +         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )

... the commented code. There's nothing LDT-ish here. Or are you
meaning to suggest something LDT-ish should be added? I'd rather not,
as the common case (afaict) is for there to be no LDT.

I also don't understand the "even heavier" part - WRMSR (for the MSRs
in question) is as serializing as is LLDT, isn't it?

>> +    {
>> +        fs_gs_done = n->arch.flags & TF_kernel_mode
>> +            ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
>> +                            uregs->fs, n->arch.pv.fs_base,
>> +                            uregs->gs, n->arch.pv.gs_base_kernel,
>> +                            n->arch.pv.gs_base_user)
>> +            : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
>> +                            uregs->fs, n->arch.pv.fs_base,
>> +                            uregs->gs, n->arch.pv.gs_base_user,
>> +                            n->arch.pv.gs_base_kernel);
> 
> This looks like a confusing way of writing:
> 
>     {
>         unsigned long gsb = n->arch.flags & TF_kernel_mode
>             ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
>         unsigned long gss = n->arch.flags & TF_kernel_mode
>             ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;
> 
>         fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
>                                    uregs->fs, n->arch.pv.fs_base,
>                                    uregs->gs, gsb, gss);
>     }
> 
> 
> AFAICT?

"Confusing" is a very subjective term. I specifically wanted to avoid
the double identical conditional. But I don't mind re-writing it as you
suggest.

>> @@ -1653,6 +1678,12 @@ static void __context_switch(void)
>>  
>>      write_ptbase(n);
>>  
>> +#if defined(CONFIG_PV) && defined(CONFIG_HVM)
> 
> From a comments in code point of view, this is the most useful location
> to have something along the lines of:
> 
> /* Prefetch the VMCB if we expect to use it later in the context switch */

Added.

>> +    if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
>> +         !cpu_has_fsgsbase && cpu_has_svm )
>> +        svm_load_segs(0, 0, 0, 0, 0, 0, 0);
>> +#endif
>> +
>>      if ( need_full_gdt(nd) &&
>>           ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
>>      {
>> --- a/xen/arch/x86/hvm/svm/svm.c
>> +++ b/xen/arch/x86/hvm/svm/svm.c
>> @@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
>>      }
>>  }
>>  
>> +#ifdef CONFIG_PV
>> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
>> +                   unsigned int fs_sel, unsigned long fs_base,
>> +                   unsigned int gs_sel, unsigned long gs_base,
>> +                   unsigned long gs_shadow)
>> +{
>> +    unsigned int cpu = smp_processor_id();
>> +    struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
>> +
>> +    if ( unlikely(!vmcb) )
>> +        return false;
> 
> When can this error path ever be taken?

__map_domain_page_global() may fail during initialization, which is
non-fatal (and may even have worked for some CPUs, but not for
others).

>> +
>> +    if ( !ldt_base )
>> +    {
>> +        /*
>> +         * The actual structure field used here was arbitrarily chosen.
>> +         * Empirically it doesn't seem to matter much which element is used,
>> +         * and a clear explanation of the otherwise poor performance has not
>> +         * been found/provided so far.
>> +         */
>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
> 
> prefetchw(), which already exists and is used.

I've specifically decided against using it, as we don't mean to write any
part of the VMCB.

Jan



* Re: [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-26  6:33     ` Jan Beulich
@ 2018-09-26 12:47       ` Andrew Cooper
  2018-09-26 13:55         ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Cooper @ 2018-09-26 12:47 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

On 26/09/18 07:33, Jan Beulich wrote:
>>>> On 25.09.18 at 18:14, <andrew.cooper3@citrix.com> wrote:
>> On 18/09/18 13:28, Jan Beulich wrote:
>>> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>>>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>>>      int all_segs_okay = 1;
>>>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
>>> +    bool fs_gs_done = false;
>>>  
>>>      /* Load and clear the dirty segment mask. */
>>>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>>>      per_cpu(dirty_segment_mask, cpu) = 0;
>>>  
>>> +#ifdef CONFIG_HVM
>>> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
>>> +         !((uregs->fs | uregs->gs) & ~3) &&
>>> +         /*
>>> +          * The remaining part is just for optimization: If only shadow GS
>>> +          * needs loading, there's nothing to be gained here.
>> VMLOAD also loads LDT, and LLDT is fully serialising, so an even heavier
>> perf hit than wrmsr.
> I don't understand how your remark relates to the comment

Because the comment is false in the case that the LDT also needs loading.

> , or ...
>
>>> +          */
>>> +         (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
> ... the commented code. There's nothing LDT-ish here. Or are you
> meaning to suggest something LDT-ish should be added? I'd rather not,
> as the common case (afaict) is for there to be no LDT.
>
> I also don't understand the "even heavier" part - WRMSR (for the MSRs
> in question) is as serializing as is LLDT, isn't it?

No - I'd mixed up which MSRs weren't serialising.

>>> +
>>> +    if ( !ldt_base )
>>> +    {
>>> +        /*
>>> +         * The actual structure field used here was arbitrarily chosen.
>>> +         * Empirically it doesn't seem to matter much which element is used,
>>> +         * and a clear explanation of the otherwise poor performance has not
>>> +         * been found/provided so far.
>>> +         */
>>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>> prefetchw(), which already exists and is used.
> I've specifically decided against using it, as we don't mean to write any
> part of the VMCB.

I think you need to double check your reasoning here.  There is a reason
why this function won't compile if you made vmcb a const pointer.

Irrespective of the writeable aspect, we also have prefetch() which
should be used in preference to inline assembly.
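
For illustration, the distinction is read prefetch vs. prefetch with
intent to write; a common shape for such helpers is sketched below
(illustrative only, not necessarily Xen's exact definitions):

    /* Illustrative only -- not necessarily Xen's exact implementation. */
    static inline void prefetch(const void *p)
    {
        __builtin_prefetch(p, 0);   /* read prefetch */
    }

    static inline void prefetchw(const void *p)
    {
        __builtin_prefetch(p, 1);   /* prefetch with intent to write */
    }

The write-intent form is the relevant one here precisely because
svm_load_segs() goes on to write several VMCB fields.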

~Andrew


* Re: [PATCH v3] x86: use VMLOAD for PV context switch
  2018-09-26 12:47       ` Andrew Cooper
@ 2018-09-26 13:55         ` Jan Beulich
  0 siblings, 0 replies; 12+ messages in thread
From: Jan Beulich @ 2018-09-26 13:55 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, Boris Ostrovsky, Brian Woods, Suravee Suthikulpanit

>>> On 26.09.18 at 14:47, <andrew.cooper3@citrix.com> wrote:
> On 26/09/18 07:33, Jan Beulich wrote:
>>>>> On 25.09.18 at 18:14, <andrew.cooper3@citrix.com> wrote:
>>> On 18/09/18 13:28, Jan Beulich wrote:
>>>> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
>>>>      struct cpu_user_regs *uregs = &n->arch.user_regs;
>>>>      int all_segs_okay = 1;
>>>>      unsigned int dirty_segment_mask, cpu = smp_processor_id();
>>>> +    bool fs_gs_done = false;
>>>>  
>>>>      /* Load and clear the dirty segment mask. */
>>>>      dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
>>>>      per_cpu(dirty_segment_mask, cpu) = 0;
>>>>  
>>>> +#ifdef CONFIG_HVM
>>>> +    if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
>>>> +         !((uregs->fs | uregs->gs) & ~3) &&
>>>> +         /*
>>>> +          * The remaining part is just for optimization: If only shadow GS
>>>> +          * needs loading, there's nothing to be gained here.
>>> VMLOAD also loads LDT, and LLDT is fully serialising, so an even heavier
>>> perf hit than wrmsr.
>> I don't understand how your remark relates to the comment
> 
> Because the comment is false in the case that the LDT also needs loading.

True (and the comment is a result of me having written it before paying
attention to the fact that the LDT can also be loaded this way); I'll OR
n->arch.pv.ldt_ents into that extra (optimization) condition, which
I think will then render the comment correct again.
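
I.e. the optimization part of the condition would then become something
like (sketch of the intended adjustment only, not necessarily the final
patch text):

         (n->arch.pv.ldt_ents | n->arch.pv.fs_base | n->arch.pv.gs_base_user) )

so that a non-empty LDT alone is also reason enough to take the VMLOAD
path.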

>>>> +
>>>> +    if ( !ldt_base )
>>>> +    {
>>>> +        /*
>>>> +         * The actual structure field used here was arbitrarily chosen.
>>>> +         * Empirically it doesn't seem to matter much which element is used,
>>>> +         * and a clear explanation of the otherwise poor performance has not
>>>> +         * been found/provided so far.
>>>> +         */
>>>> +        asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
>>> prefetchw(), which already exists and is used.
>> I've specifically decided against using it, as we don't mean to write any
>> part of the VMCB.
> 
> I think you need to double check your reasoning here.  There is a reason
> why this function won't compile if you made vmcb a const pointer.

Oh, right you are. It's been way too long since I wrote this code,
and hence I should have looked back at it before replying rather
than just going from the function's name.

Jan



