* [PATCH] x86: slightly reduce Meltdown band-aid overhead
@ 2018-01-18 15:39 Jan Beulich
  2018-01-23 17:39 ` George Dunlap
  2018-01-29 18:55 ` Andrew Cooper
  0 siblings, 2 replies; 5+ messages in thread
From: Jan Beulich @ 2018-01-18 15:39 UTC
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper

I'm not sure why I didn't do this right away: By avoiding to make any
of the cloned directmap PTEs global, there's no need to fiddle with
CR4.PGE on any of the entry paths. Only the exit paths need to flush
global mappings.

The reduced flushing, however, implies that we now need to have
interrupts off on all entry paths until after the page table switch, so
that flush IPIs can't arrive with the restricted page tables still
active, but only a non-global flush happening with the CR3 loads. Along
those lines the "sync" IPI after L4 entry updates now needs to become a
real (and global) flush IPI, so that inside Xen we'll also pick up such
changes.

Take the opportunity and also do a GET_CURRENT() -> __GET_CURRENT()
transition the original patch missed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
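
To illustrate the mechanism, a sketch with hypothetical names rather
than the actual Xen code follows; the real change is the one-line
_PAGE_GLOBAL mask in clone_mapping() below. A MOV to CR3 flushes all
TLB entries except those marked global, so once the cloned directmap
PTEs lack the G bit, the entry-path CR3 write is itself a sufficient
flush; only the exit path, returning to tables that still contain
global mappings, needs a full (global) flush.

    /* Sketch only: hypothetical helper, not Xen's clone_mapping(). */
    #include <stdint.h>

    #define _PAGE_GLOBAL (UINT64_C(1) << 8)  /* architectural G bit */

    /*
     * Copy a PTE into the restricted per-CPU tables with the G bit
     * stripped, so any later CR3 write evicts its TLB entry without
     * toggling CR4.PGE.
     */
    static uint64_t clone_pte_nonglobal(uint64_t pte)
    {
        return pte & ~_PAGE_GLOBAL;
    }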

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3804,18 +3804,14 @@ long do_mmu_update(
     {
         /*
          * Force other vCPU-s of the affected guest to pick up L4 entry
-         * changes (if any). Issue a flush IPI with empty operation mask to
-         * facilitate this (including ourselves waiting for the IPI to
-         * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
-         * meaningless without FLUSH_CACHE, but will allow to pass the no-op
-         * check in flush_area_mask().
+         * changes (if any).
          */
         unsigned int cpu = smp_processor_id();
         cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
 
         cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu));
         if ( !cpumask_empty(mask) )
-            flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
+            flush_mask(mask, FLUSH_TLB_GLOBAL);
     }
 
     perfc_add(num_page_updates, i);
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -726,6 +726,7 @@ static int clone_mapping(const void *ptr
     }
 
     pl1e += l1_table_offset(linear);
+    flags &= ~_PAGE_GLOBAL;
 
     if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT )
     {
@@ -1009,8 +1010,17 @@ void __init smp_prepare_cpus(unsigned in
     if ( rc )
         panic("Error %d setting up PV root page table\n", rc);
     if ( per_cpu(root_pgt, 0) )
+    {
         get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
 
+        /*
+         * All entry points which may need to switch page tables have to start
+         * with interrupts off. Re-write what pv_trap_init() has put there.
+         */
+        _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
+                  &int80_direct_trap);
+    }
+
     set_nr_sockets();
 
     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -189,7 +189,7 @@ ENTRY(compat_post_handle_exception)
 
 /* See lstar_enter for entry register state. */
 ENTRY(cstar_enter)
-        sti
+        /* sti could live here when we don't switch page tables below. */
         CR4_PV32_RESTORE
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
@@ -206,11 +206,12 @@ ENTRY(cstar_enter)
         jz    .Lcstar_cr3_okay
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
         neg   %rcx
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lcstar_cr3_okay:
+        sti
 
-        GET_CURRENT(bx)
+        __GET_CURRENT(bx)
         movq  VCPU_domain(%rbx),%rcx
         cmpb  $0,DOMAIN_is_32bit_pv(%rcx)
         je    switch_to_kernel
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -135,7 +135,7 @@ UNLIKELY_END(exit_cr3)
  * %ss must be saved into the space left by the trampoline.
  */
 ENTRY(lstar_enter)
-        sti
+        /* sti could live here when we don't switch page tables below. */
         movq  8(%rsp),%rax /* Restore %rax. */
         movq  $FLAT_KERNEL_SS,8(%rsp)
         pushq %r11
@@ -151,9 +151,10 @@ ENTRY(lstar_enter)
         jz    .Llstar_cr3_okay
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
         neg   %rcx
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Llstar_cr3_okay:
+        sti
 
         __GET_CURRENT(bx)
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
@@ -236,7 +237,7 @@ process_trap:
         jmp  test_all_events
 
 ENTRY(sysenter_entry)
-        sti
+        /* sti could live here when we don't switch page tables below. */
         pushq $FLAT_USER_SS
         pushq $0
         pushfq
@@ -254,9 +255,10 @@ GLOBAL(sysenter_eflags_saved)
         jz    .Lsyse_cr3_okay
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
         neg   %rcx
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lsyse_cr3_okay:
+        sti
 
         __GET_CURRENT(bx)
         cmpb  $0,VCPU_sysenter_disables_events(%rbx)
@@ -300,9 +302,10 @@ ENTRY(int80_direct_trap)
         jz    .Lint80_cr3_okay
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
         neg   %rcx
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
 .Lint80_cr3_okay:
+        sti
 
         cmpb  $0,untrusted_msi(%rip)
 UNLIKELY_START(ne, msi_check)
@@ -477,7 +480,7 @@ ENTRY(common_interrupt)
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         neg   %rcx
 .Lintr_cr3_load:
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         xor   %ecx, %ecx
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         testb $3, UREGS_cs(%rsp)
@@ -515,7 +518,7 @@ GLOBAL(handle_exception)
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         neg   %rcx
 .Lxcpt_cr3_load:
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         xor   %ecx, %ecx
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         testb $3, UREGS_cs(%rsp)
@@ -707,7 +710,7 @@ ENTRY(double_fault)
         jns   .Ldblf_cr3_load
         neg   %rbx
 .Ldblf_cr3_load:
-        write_cr3 rbx, rdi, rsi
+        mov   %rbx, %cr3
 .Ldblf_cr3_okay:
 
         movq  %rsp,%rdi
@@ -738,7 +741,7 @@ handle_ist_exception:
         mov   %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
         neg   %rcx
 .List_cr3_load:
-        write_cr3 rcx, rdi, rsi
+        mov   %rcx, %cr3
         movq  $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
 .List_cr3_okay:
 




* Re: [PATCH] x86: slightly reduce Meltdown band-aid overhead
  2018-01-18 15:39 [PATCH] x86: slightly reduce Meltdown band-aid overhead Jan Beulich
@ 2018-01-23 17:39 ` George Dunlap
  2018-01-29 18:55 ` Andrew Cooper
  1 sibling, 0 replies; 5+ messages in thread
From: George Dunlap @ 2018-01-23 17:39 UTC
  To: Jan Beulich, xen-devel; +Cc: George Dunlap, Andrew Cooper

On 01/18/2018 03:39 PM, Jan Beulich wrote:
> I'm not sure why I didn't do this right away: By avoiding to make any
> of the cloned directmap PTEs global, there's no need to fiddle with
> CR4.PGE on any of the entry paths. Only the exit paths need to flush
> global mappings.
> 
> The reduced flushing, however, implies that we now need to have
> interrupts off on all entry paths until after the page table switch, so
> that flush IPIs can't arrive with the restricted page tables still
> active, but only a non-global flush happening with the CR3 loads. Along
> those lines the "sync" IPI after L4 entry updates now needs to become a
> real (and global) flush IPI, so that inside Xen we'll also pick up such
> changes.
> 
> Take the opportunity and also do a GET_CURRENT() -> __GET_CURRENT()
> transition the original patch missed.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

FWIW, for me this patch reduces the overhead from 38% to 28%.  Below are
times from a "time make -j 4 xen"

* xpti=off: 63s
* [xpti default]: 87s (+38%)
* + this patch: 81s (+28%)

 -George


* Re: [PATCH] x86: slightly reduce Meltdown band-aid overhead
  2018-01-18 15:39 [PATCH] x86: slightly reduce Meltdown band-aid overhead Jan Beulich
  2018-01-23 17:39 ` George Dunlap
@ 2018-01-29 18:55 ` Andrew Cooper
  2018-01-29 18:56   ` Andrew Cooper
  2018-01-30  7:50   ` Jan Beulich
  1 sibling, 2 replies; 5+ messages in thread
From: Andrew Cooper @ 2018-01-29 18:55 UTC
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 18/01/18 15:39, Jan Beulich wrote:
> I'm not sure why I didn't do this right away: By avoiding to make any
> of the cloned directmap PTEs global, there's no need to fiddle with

"avoiding to make" is a very odd way of phrasing this.  Something like
"By avoiding the use of global PTEs in the cloned directmap, " perhaps?

> CR4.PGE on any of the entry paths. Only the exit paths need to flush
> global mappings.
>
> The reduced flushing, however, implies that we now need to have
> interrupts off on all entry paths until after the page table switch, so
> that flush IPIs can't arrive with the restricted page tables still
> active, but only a non-global flush happening with the CR3 loads.

I can't parse this last clause in the context of the sentence, which
looks to finish after "active".

> Along those lines the "sync" IPI after L4 entry updates now needs to become a
> real (and global) flush IPI, so that inside Xen we'll also pick up such
> changes.

The entry paths don't tick the TLB clock, so we are in no worse of a
position than before.  IOW, I don't see why this needs to change to
being a FLUSH_GLOBAL.

>
> Take the opportunity and also do a GET_CURRENT() -> __GET_CURRENT()
> transition the original patch missed.

I've just submitted an alternative to this.

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -726,6 +726,7 @@ static int clone_mapping(const void *ptr
>      }
>  
>      pl1e += l1_table_offset(linear);
> +    flags &= ~_PAGE_GLOBAL;
>  
>      if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT )
>      {
> @@ -1009,8 +1010,17 @@ void __init smp_prepare_cpus(unsigned in
>      if ( rc )
>          panic("Error %d setting up PV root page table\n", rc);
>      if ( per_cpu(root_pgt, 0) )
> +    {
>          get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
>  
> +        /*
> +         * All entry points which may need to switch page tables have to start
> +         * with interrupts off. Re-write what pv_trap_init() has put there.
> +         */
> +        _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
> +                  &int80_direct_trap);

The int82 path is also a trap gate.  Given how subtle this is, and how
hard a resulting crash would be to debug, I think it would be better to
unilaterally switch both to being interrupt gates.

Neither are fastpaths, so the single extra sti in their execution paths
will be negligible.
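
For background (architectural behaviour, not Xen code): in an IDT
descriptor the type field distinguishes the two kinds of gate. An
interrupt gate (type 0xE) makes the CPU clear EFLAGS.IF on delivery,
while a trap gate (type 0xF) leaves IF unchanged, which is why only an
interrupt gate guarantees interrupts stay off until the handler's
explicit sti. A minimal sketch of the 64-bit descriptor layout, with
illustrative field names:

    /* Sketch of an x86-64 IDT entry; field names are ours. */
    #include <stdint.h>

    #define GATE_INTERRUPT 0xE  /* CPU clears EFLAGS.IF on entry */
    #define GATE_TRAP      0xF  /* EFLAGS.IF left unchanged      */

    struct idt64_entry {
        uint16_t offset_lo;   /* handler address bits 0..15   */
        uint16_t selector;    /* code segment selector        */
        uint8_t  ist;         /* bits 0..2: IST stack index   */
        uint8_t  type_attr;   /* type | (dpl << 5) | 0x80 (P) */
        uint16_t offset_mid;  /* handler address bits 16..31  */
        uint32_t offset_hi;   /* handler address bits 32..63  */
        uint32_t reserved;
    };

    /* A present, DPL-3 interrupt gate: 0x80 | (3 << 5) | 0xE == 0xEE. */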

~Andrew


* Re: [PATCH] x86: slightly reduce Meltdown band-aid overhead
  2018-01-29 18:55 ` Andrew Cooper
@ 2018-01-29 18:56   ` Andrew Cooper
  2018-01-30  7:50   ` Jan Beulich
  1 sibling, 0 replies; 5+ messages in thread
From: Andrew Cooper @ 2018-01-29 18:56 UTC
  To: Jan Beulich, xen-devel; +Cc: George Dunlap

On 29/01/18 18:55, Andrew Cooper wrote:
> On 18/01/18 15:39, Jan Beulich wrote:
>> Along those lines the "sync" IPI after L4 entry updates now needs to become a
>> real (and global) flush IPI, so that inside Xen we'll also pick up such
>> changes.
> The entry paths don't tick the TLB clock, so we are in no worse of a
> position than before.  IOW, I don't see why this needs to change to
> being a FLUSH_GLOBAL.

Actually, I've just worked out why.  Guest user mappings are still
global, and we do need to flush those.  I think it would be worth
identifying this in the comment.

~Andrew


* Re: [PATCH] x86: slightly reduce Meltdown band-aid overhead
  2018-01-29 18:55 ` Andrew Cooper
  2018-01-29 18:56   ` Andrew Cooper
@ 2018-01-30  7:50   ` Jan Beulich
  1 sibling, 0 replies; 5+ messages in thread
From: Jan Beulich @ 2018-01-30  7:50 UTC
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel

>>> On 29.01.18 at 19:55, <andrew.cooper3@citrix.com> wrote:
> On 18/01/18 15:39, Jan Beulich wrote:
>> I'm not sure why I didn't do this right away: By avoiding to make any
>> of the cloned directmap PTEs global, there's no need to fiddle with
> 
> "avoiding to make" is a very odd way of phrasing this.  Something like
> "By avoiding the use of global PTEs in the cloned directmap, " perhaps?

Changed.

>> CR4.PGE on any of the entry paths. Only the exit paths need to flush
>> global mappings.
>>
>> The reduced flushing, however, implies that we now need to have
>> interrupts off on all entry paths until after the page table switch, so
>> that flush IPIs can't arrive with the restricted page tables still
>> active, but only a non-global flush happening with the CR3 loads.
> 
> I can't parse this last clause in the context of the sentence, which
> looks to finish after "active".

There are two different aspects making it necessary to keep interrupts
off, both of which I try to name in each half sentence. I would
rephrase this to

"The reduced flushing, however, implies that we now need to have
 interrupts off on all entry paths until after the page table switch:
 With the CR3 loads being only non-global flushes, flush IPIs mustn't
 arrive with the restricted page tables still active."

Is that any better?

>> Along those lines the "sync" IPI after L4 entry updates now needs to become a
>> real (and global) flush IPI, so that inside Xen we'll also pick up such
>> changes.
> 
> The entry paths don't tick the TLB clock, so we are in no worse of a
> position than before.  IOW, I don't see why this needs to change to
> being a FLUSH_GLOBAL.

In your later reply you talk about flushing (global) user mappings -
that's another way to put it, just that I didn't want to special case
user mappings here. From the perspective of this change, it doesn't
matter what global mappings there might be. All we care about is
that despite the now non-global flushes we still get rid of global
TLB entries hanging off of changed L4 entries.
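
For reference, a "global" flush on x86 is conventionally implemented
by toggling CR4.PGE, which invalidates global TLB entries as well; a
plain CR3 load spares them. A sketch under that assumption, with
illustrative accessor names rather than the exact Xen helpers:

    /* Sketch: what a FLUSH_TLB_GLOBAL-style flush does that a bare
     * CR3 load does not; write_cr4() is a hypothetical accessor. */
    #define X86_CR4_PGE (1UL << 7)

    extern void write_cr4(unsigned long val);

    static inline void flush_tlb_global_sketch(unsigned long cr4)
    {
        write_cr4(cr4 & ~X86_CR4_PGE);  /* dropping PGE invalidates all */
        write_cr4(cr4);                 /* TLB entries, global included */
    }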

>> @@ -1009,8 +1010,17 @@ void __init smp_prepare_cpus(unsigned in
>>      if ( rc )
>>          panic("Error %d setting up PV root page table\n", rc);
>>      if ( per_cpu(root_pgt, 0) )
>> +    {
>>          get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
>>  
>> +        /*
>> +         * All entry points which may need to switch page tables have to start
>> +         * with interrupts off. Re-write what pv_trap_init() has put there.
>> +         */
>> +        _set_gate(idt_table + LEGACY_SYSCALL_VECTOR, SYS_DESC_irq_gate, 3,
>> +                  &int80_direct_trap);
> 
> The int82 path is also a trap gate.  Given how subtle this is, and how
> hard a resulting crash would be to debug, I think it would be better to
> unilaterally switch both to being interrupt gates.

The int82 path is of no interest: As the comment says, we care about
interrupts off only on entry paths where we switch page tables (back
from the restricted ones).

> Neither are fastpaths, so the single extra sti in their execution paths
> will be negligible.

How is a system call path not performance relevant? Intel's ORM
doesn't say anything about STI afaics, but I assume it's
microcoded as much as it is for AMD (who explicitly say so).

Jan


