All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@kernel.org>
To: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: linux-kernel@vger.kernel.org, x86@kernel.org,
	dan.j.williams@intel.com, tglx@linutronix.de, ak@linux.intel.com,
	torvalds@linux-foundation.org, luto@kernel.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Denys Vlasenko <dvlasenk@redhat.com>,
	Josh Poimboeuf <jpoimboe@redhat.com>,
	Brian Gerst <brgerst@gmail.com>, Borislav Petkov <bp@alien8.de>
Subject: Re: [PATCH v3 8/7] TESTING_ONLY x86/entry: reduce static footprint of idtentry
Date: Mon, 12 Feb 2018 10:37:42 +0100	[thread overview]
Message-ID: <20180212093742.6tj57fh7zagio3yj@gmail.com> (raw)
In-Reply-To: <20180211104949.12992-9-linux@dominikbrodowski.net>


* Dominik Brodowski <linux@dominikbrodowski.net> wrote:

> Play a little trick in the generic PUSH_AND_CLEAR_REGS macro
> to insert the GP registers "above" the original return address.
> This allows us to (re-)insert the macro in error_entry() and
> paranoid_entry() and to remove it from the idtentry macro. This
> reduces the static footprint significantly.
> 
> NOTE: It still needs to be evaluated whether the reduced static
> footprint at the cost of code readability is really needed here.
> 
> Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
> ---
>  arch/x86/entry/calling.h  | 11 ++++++++++-
>  arch/x86/entry/entry_64.S | 21 ++++++++++-----------
>  2 files changed, 20 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 6985440c68fa..2d4523ba04a8 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -97,7 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
>  
>  #define SIZEOF_PTREGS	21*8
>  
> -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
> +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
>  	/*
>  	 * Push registers and sanitize registers of values that a
>  	 * speculation attack might otherwise want to exploit. The
> @@ -105,8 +105,14 @@ For 32-bit we have the following conventions - kernel is built with
>  	 * could be put to use in a speculative execution gadget.
>  	 * Interleave XOR with PUSH for better uop scheduling:
>  	 */
> +	.if \save_ret
> +	pushq	%rsi		/* pt_regs->si */
> +	movq	8(%rsp), %rsi	/* temporarily store ret address in %rsi */
> +	movq	%rdi, 8(%rsp)	/* pt_regs->di (overwriting original ret) */
> +	.else
>  	pushq   %rdi		/* pt_regs->di */
>  	pushq   %rsi		/* pt_regs->si */
> +	.endif
>  	pushq	\rdx		/* pt_regs->dx */
>  	pushq   %rcx		/* pt_regs->cx */
>  	pushq   \rax		/* pt_regs->ax */
> @@ -131,6 +137,9 @@ For 32-bit we have the following conventions - kernel is built with
>  	pushq	%r15		/* pt_regs->r15 */
>  	xorq    %r15, %r15	/* nospec   r15*/
>  	UNWIND_HINT_REGS
> +	.if \save_ret
> +	pushq	%rsi		/* return address on top of stack */
> +	.endif
>  .endm
>  
>  .macro POP_REGS pop_rdi=1 skip_r11rcx=0
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index b868433d7e68..a2e41177e390 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -871,12 +871,8 @@ ENTRY(\sym)
>  	pushq	$-1				/* ORIG_RAX: no syscall to restart */
>  	.endif
>  
> -	/* Save all registers in pt_regs */
> -	PUSH_AND_CLEAR_REGS
> -	ENCODE_FRAME_POINTER
> -
>  	.if \paranoid < 2
> -	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
> +	testb	$3, CS-ORIG_RAX(%rsp)		/* If coming from userspace, switch stacks */
>  	jnz	.Lfrom_usermode_switch_stack_\@
>  	.endif
>  
> @@ -1123,12 +1119,15 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
>  #endif
>  
>  /*
> - * Switch gs if needed.
> + * Save all registers in pt_regs, and switch gs if needed.
>   * Use slow, but surefire "are we in kernel?" check.
>   * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
>   */
>  ENTRY(paranoid_entry)
> +	UNWIND_HINT_FUNC
>  	cld
> +	PUSH_AND_CLEAR_REGS save_ret=1
> +	ENCODE_FRAME_POINTER 8
>  	movl	$1, %ebx
>  	movl	$MSR_GS_BASE, %ecx
>  	rdmsr
> @@ -1141,7 +1140,7 @@ ENTRY(paranoid_entry)
>  	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
>  
>  	ret
> -ENDPROC(paranoid_entry)
> +END(paranoid_entry)
>  
>  /*
>   * "Paranoid" exit path from exception stack.  This is invoked
> @@ -1172,12 +1171,14 @@ ENTRY(paranoid_exit)
>  END(paranoid_exit)
>  
>  /*
> - * Switch gs if needed.
> + * Save all registers in pt_regs, and switch gs if needed.
>   * Return: EBX=0: came from user mode; EBX=1: otherwise
>   */
>  ENTRY(error_entry)
> -	UNWIND_HINT_REGS offset=8
> +	UNWIND_HINT_FUNC
>  	cld
> +	PUSH_AND_CLEAR_REGS save_ret=1
> +	ENCODE_FRAME_POINTER 8
>  	testb	$3, CS+8(%rsp)
>  	jz	.Lerror_kernelspace
>  
> @@ -1568,8 +1569,6 @@ end_repeat_nmi:
>  	 * frame to point back to repeat_nmi.
>  	 */
>  	pushq	$-1				/* ORIG_RAX: no syscall to restart */
> -	PUSH_AND_CLEAR_REGS
> -	ENCODE_FRAME_POINTER
>  
>  	/*
>  	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit

Ok, so this does not look _that_ complicated, and the .text savings are 
significant:

   text    data     bss     dec     hex filename
  24182       0       0   24182    5e76 entry_64.o.before
  20878       0       0   20878    518e entry_64.o.after

But, let's try to estimate the dynamic I$ impact through the static entry points. 

I'm using the following list of 'static' IDT entry points:

    divide_error
    overflow
    bounds
    invalid_op
    device_not_available
    double_fault
    coprocessor_segment_overrun
    invalid_TSS
    segment_not_present
    spurious_interrupt_bug
    coprocessor_error
    alignment_check
    simd_coprocessor_error
    debug
    int3
    stack_segment
    general_protection
    page_fault
    machine_check
    nmi
    apic_timer_interrupt
    x86_platform_ipi
    threshold_interrupt
    deferred_error_interrupt
    thermal_interrupt
    call_function_single_interrupt
    call_function_interrupt
    reschedule_interrupt
    error_interrupt
    spurious_interrupt
    irq_work_interrupt

Which are present in the vmlinux with well-known addresses - this makes I$ 
estimates easier.

But let's first filter these entries - a big chunk of them are legacy vectors, 
never used on a modern x86 CPU built in the last decade or so. Another chunk are 
infrequent entries like <machine_check>.

The entry points that are relevant to an I$ analysis are just 11 entry points:

    divide_error
                                       # overflow
                                       # bounds
                                       # invalid_op
                                       # device_not_available
                                       # double_fault
                                       # coprocessor_segment_overrun
                                       # invalid_TSS
                                       # segment_not_present
                                       # spurious_interrupt_bug
                                       # coprocessor_error
                                       # alignment_check
                                       # simd_coprocessor_error
    debug
    int3
                                       # stack_segment
    general_protection
    page_fault
                                       # machine_check
    nmi
    apic_timer_interrupt
                                       # x86_platform_ipi
                                       # threshold_interrupt
                                       # deferred_error_interrupt
                                       # thermal_interrupt
    call_function_single_interrupt
    call_function_interrupt
    reschedule_interrupt
                                       # error_interrupt
                                       # spurious_interrupt
    irq_work_interrupt


( Also note that even this list is probably generous: for example 
  <apic_timer_interrupt> will only arrive once per timer IRQ, by which time it's 
  probably already out of the L1 cache - and <debug> only matters to a small 
  minority of users. )

Here's the x86-64 defconfig addresses of these entry points, on a 'vanilla' kernel 
that does not have any entry point bloating (and debloating) patches applied, sha1 
b7a60525e83a:

      address           symbol                               # size # offs
      ----------------  ------------------------------------ -------------
      ffffffff81a00e70: <divide_error>                       #   40 #    0
      ffffffff81a01250: <debug>                              #   60 #  3e0
      ffffffff81a012b0: <int3>                               #   60 #   60
      ffffffff81a01370: <general_protection>                 #   60 #   c0
      ffffffff81a013d0: <page_fault>                         #   60 #   60
      ffffffff81a01680: <nmi>                                #   cc #  2b0
      ffffffff81a01a40: <apic_timer_interrupt>               #   c0 #  3c0
      ffffffff81a01e00: <call_function_single_interrupt>     #   c0 #  3c0
      ffffffff81a01ec0: <call_function_interrupt>            #   c0 #   c0
      ffffffff81a01f80: <reschedule_interrupt>               #   c0 #   c0
      ffffffff81a021c0: <irq_work_interrupt>                 #   c0 #  240

The 'size' column is the size of the entry function, the 'offs' column shows the 
symbol's offset from the previous entry.

As can be seen in the table:

- even in the 'compressed' layout the entries are already far enough from each 
  other to make the dynamic I$ footprint 27 cachelines, assuming 64 byte 
  cachelines:

    27 == 1 + 2 + 2 + 2 + 2 + 3 + 3 + 3 + 3 + 3

After applying the simplification patches (sha: 5a10e729bc0a) the static footprint 
of the entry code becomes more bloated:

   text    data     bss     dec     hex filename
  19693       0       0   19693    4ced arch/x86/entry/entry_64.o.before
  24212       0       0   24212    5e94 arch/x86/entry/entry_64.o.after    (+4519 bytes)

But the dynamic I$ footprint does not change nearly as much:

      address           symbol                               # size # offs
      ----------------  ------------------------------------ -------------
      ffffffff81a00d00: <divide_error>                       #   70 #    0
      ffffffff81a01320: <debug>                              #   90 #  620
      ffffffff81a013b0: <int3>                               #   90 #   90
      ffffffff81a014c0: <general_protection>                 #   80 #  110
      ffffffff81a01540: <page_fault>                         #   80 #   80
      ffffffff81a01780: <nmi>                                #   cc #  240
      ffffffff81a01b70: <apic_timer_interrupt>               #   80 #  3f0
      ffffffff81a01df0: <call_function_single_interrupt>     #   80 #  280
      ffffffff81a01e70: <call_function_interrupt>            #   80 #   80
      ffffffff81a01ef0: <reschedule_interrupt>               #   80 #   80
      ffffffff81a02070: <irq_work_interrupt>                 #   80 #  180

The footprint is in fact 25 cache lines:

    25 == 2 + 3 + 3 + 2 + 2 + 3 + 2 + 2 + 2 + 2 + 2

The reason is that while exception entry size increased, the IRQ entry size 
actually shrunk, because the PUSH based sequences are more compact than the MOVs 
they used before:

# BEFORE:

ffffffff81a01f80 <reschedule_interrupt>:
[...]
ffffffff81a01f98:       48 83 c4 88             add    $0xffffffffffffff88,%rsp
ffffffff81a01f9c:       4c 89 5c 24 30          mov    %r11,0x30(%rsp)
ffffffff81a01fa1:       4c 89 54 24 38          mov    %r10,0x38(%rsp)
ffffffff81a01fa6:       4c 89 4c 24 40          mov    %r9,0x40(%rsp)
ffffffff81a01fab:       4c 89 44 24 48          mov    %r8,0x48(%rsp)
ffffffff81a01fb0:       48 89 44 24 50          mov    %rax,0x50(%rsp)
ffffffff81a01fb5:       48 89 4c 24 58          mov    %rcx,0x58(%rsp)
ffffffff81a01fba:       48 89 54 24 60          mov    %rdx,0x60(%rsp)
ffffffff81a01fbf:       48 89 74 24 68          mov    %rsi,0x68(%rsp)
ffffffff81a01fc4:       48 89 7c 24 70          mov    %rdi,0x70(%rsp)
ffffffff81a01fc9:       4c 89 3c 24             mov    %r15,(%rsp)
ffffffff81a01fcd:       4c 89 74 24 08          mov    %r14,0x8(%rsp)
ffffffff81a01fd2:       4c 89 6c 24 10          mov    %r13,0x10(%rsp)
ffffffff81a01fd7:       4c 89 64 24 18          mov    %r12,0x18(%rsp)
ffffffff81a01fdc:       48 89 6c 24 20          mov    %rbp,0x20(%rsp)
ffffffff81a01fe1:       48 89 5c 24 28          mov    %rbx,0x28(%rsp)
ffffffff81a01fe6:       31 ed                   xor    %ebp,%ebp
ffffffff81a01fe8:       31 db                   xor    %ebx,%ebx
ffffffff81a01fea:       4d 31 c0                xor    %r8,%r8
ffffffff81a01fed:       4d 31 c9                xor    %r9,%r9
ffffffff81a01ff0:       4d 31 d2                xor    %r10,%r10
ffffffff81a01ff3:       4d 31 db                xor    %r11,%r11
ffffffff81a01ff6:       4d 31 e4                xor    %r12,%r12
ffffffff81a01ff9:       4d 31 ed                xor    %r13,%r13
ffffffff81a01ffc:       4d 31 f6                xor    %r14,%r14
ffffffff81a01fff:       4d 31 ff                xor    %r15,%r15

# AFTER:

ffffffff81a01ef0 <reschedule_interrupt>:
[...]
ffffffff81a01f08:       57                      push   %rdi
ffffffff81a01f09:       56                      push   %rsi
ffffffff81a01f0a:       52                      push   %rdx
ffffffff81a01f0b:       51                      push   %rcx
ffffffff81a01f0c:       50                      push   %rax
ffffffff81a01f0d:       41 50                   push   %r8
ffffffff81a01f0f:       4d 31 c0                xor    %r8,%r8
ffffffff81a01f12:       41 51                   push   %r9
ffffffff81a01f14:       4d 31 c9                xor    %r9,%r9
ffffffff81a01f17:       41 52                   push   %r10
ffffffff81a01f19:       4d 31 d2                xor    %r10,%r10
ffffffff81a01f1c:       41 53                   push   %r11
ffffffff81a01f1e:       4d 31 db                xor    %r11,%r11
ffffffff81a01f21:       53                      push   %rbx
ffffffff81a01f22:       31 db                   xor    %ebx,%ebx
ffffffff81a01f24:       55                      push   %rbp
ffffffff81a01f25:       31 ed                   xor    %ebp,%ebp
ffffffff81a01f27:       41 54                   push   %r12
ffffffff81a01f29:       4d 31 e4                xor    %r12,%r12
ffffffff81a01f2c:       41 55                   push   %r13
ffffffff81a01f2e:       4d 31 ed                xor    %r13,%r13
ffffffff81a01f31:       41 56                   push   %r14
ffffffff81a01f33:       4d 31 f6                xor    %r14,%r14
ffffffff81a01f36:       41 57                   push   %r15
ffffffff81a01f38:       4d 31 ff                xor    %r15,%r15

Finally, applying the 8/7 debloating patch gives the following result:

      address           symbol                               # size # offs
      ----------------  ------------------------------------ -------------
      ffffffff81a00d00: <divide_error>                       #   40 #    0
      ffffffff81a010b0: <debug>                              #   50 #  3b0
      ffffffff81a01100: <int3>                               #   50 #   50
      ffffffff81a011a0: <general_protection>                 #   50 #   a0
      ffffffff81a011f0: <page_fault>                         #   50 #   50
      ffffffff81a01450: <nmi>                                #   cc #  260
      ffffffff81a01810: <apic_timer_interrupt>               #   80 #  3c0
      ffffffff81a01a90: <call_function_single_interrupt>     #   80 #  280
      ffffffff81a01b10: <call_function_interrupt>            #   80 #   80
      ffffffff81a01b90: <reschedule_interrupt>               #   80 #   80
      ffffffff81a01d10: <irq_work_interrupt>                 #   80 #  180

That's 22 cachelines:

    22 == 1 + 2 + 2 + 2 + 2 + 3 + 2 + 2 + 2 + 2 + 2
    25 == 2 + 3 + 3 + 2 + 2 + 3 + 2 + 2 + 2 + 2 + 2

So it's a 3 cachelines improvements, coming from the smaller size of these three 
entry points:

      ffffffff81a00d00: <divide_error>                       #   40 #    0
      ffffffff81a010b0: <debug>                              #   50 #  3b0
      ffffffff81a01100: <int3>                               #   50 #   50

... which are all debugging/instrumentation related and are thus performance 
critical only for a small minority of our users.

So I'm torn whether to apply this last patch, but the interesting, unintuitive 
result is that the existing MOV->PUSH simplification patches already appear to 
have improved the dynamic I$ footprint from 27 cachelines to 22 cachelines!

Thanks,

	Ingo

  reply	other threads:[~2018-02-12  9:37 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-11 10:49 [PATCH v3 0/7] x86/entry: simplify and unify SAVE/POP_REGS Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 1/7] x86/entry: merge SAVE_C_REGS and SAVE_EXTRA_REGS, remove unused extensions Dominik Brodowski
2018-02-12 10:15   ` [tip:x86/pti] x86/entry/64: Merge " tip-bot for Dominik Brodowski
2018-02-13  9:00   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 2/7] x86/entry: merge POP_C_REGS and POP_EXTRA_REGS Dominik Brodowski
2018-02-12 10:16   ` [tip:x86/pti] x86/entry/64: Merge the POP_C_REGS and POP_EXTRA_REGS macros into a single POP_REGS macro tip-bot for Dominik Brodowski
2018-02-13  9:01   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 3/7] x86/entry: interleave XOR register clearing with PUSH instructions Dominik Brodowski
2018-02-12 10:16   ` [tip:x86/pti] x86/entry/64: Interleave " tip-bot for Dominik Brodowski
2018-02-13  9:01   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 4/7] x86/entry: introduce PUSH_AND_CLEAN_REGS Dominik Brodowski
2018-02-12 10:17   ` [tip:x86/pti] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro tip-bot for Dominik Brodowski
2018-02-12 13:29     ` Denys Vlasenko
2018-02-12 13:36       ` David Laight
2018-02-12 13:43         ` Denys Vlasenko
2018-02-12 16:51       ` Linus Torvalds
2018-02-13  9:01   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 5/7] x86/entry: use PUSH_AND_CLEAN_REGS in more cases Dominik Brodowski
2018-02-12 10:17   ` [tip:x86/pti] x86/entry/64: Use " tip-bot for Dominik Brodowski
2018-02-13  9:02   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 6/7] x86/entry: get rid of ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS Dominik Brodowski
2018-02-12 10:18   ` [tip:x86/pti] x86/entry/64: Get rid of the ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS macros tip-bot for Dominik Brodowski
2018-02-13  9:02   ` tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 7/7] x86/entry: indent PUSH_AND_CLEAR_REGS and POP_REGS properly Dominik Brodowski
2018-02-12 10:18   ` [tip:x86/pti] x86/entry/64: Indent " tip-bot for Dominik Brodowski
2018-02-12 20:13     ` [PATCH] x86/entry/64: Remove unused icebp macro Borislav Petkov
2018-02-13  7:36       ` Ingo Molnar
2018-02-13  9:04       ` [tip:x86/pti] x86/entry/64: Remove the unused 'icebp' macro tip-bot for Borislav Petkov
2018-02-13  9:03   ` [tip:x86/pti] x86/entry/64: Indent PUSH_AND_CLEAR_REGS and POP_REGS properly tip-bot for Dominik Brodowski
2018-02-11 10:49 ` [PATCH v3 8/7] TESTING_ONLY x86/entry: reduce static footprint of idtentry Dominik Brodowski
2018-02-12  9:37   ` Ingo Molnar [this message]
2018-02-12 19:17     ` Linus Torvalds
2018-02-13 12:41       ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180212093742.6tj57fh7zagio3yj@gmail.com \
    --to=mingo@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=ak@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=dan.j.williams@intel.com \
    --cc=dvlasenk@redhat.com \
    --cc=jpoimboe@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@dominikbrodowski.net \
    --cc=luto@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.