linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra()
@ 2017-02-14  8:11 Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra() Kyle Huey
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Kyle Huey @ 2017-02-14  8:11 UTC (permalink / raw)
  To: LKML; +Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski, x86

GCC generates lousy code in __switch_to_xtra.  This patch series is an
updated version of tglx's patches from last year
(https://lkml.org/lkml/2016/12/15/432) that address review comments.

Since v1:
Part 1 - x86/process: Optimize TIF checks in __switch_to_xtra()
- READ_ONCE annotations added as requested by Andy Lutomirski

Part 2 - x86/process: Correct and optimize TIF_BLOCKSTEP switch
- DEBUGCTLMSR_BTF is now modified when either the previous or
  next or both tasks use it, because the MSR is "highly magical".

Part 3 - x86/process: Optimize TIF_NOTSC switch
- Unchanged

I didn't introduce a cpufeature for blockstep because that would
add additional overhead compared to the existing code, where it's
generally known at compile time that blockstep is supported. Perhaps
we should just BUG_ON(!arch_has_block_step()) here if we really
care to check anything.

arch/x86/include/asm/msr-index.h |  1 +
arch/x86/include/asm/tlbflush.h  | 10 ++++++++++
arch/x86/kernel/process.c        | 76 +++++++++++++++++++++++++++++++++++-----------------------------------------
3 files changed, 46 insertions(+), 41 deletions(-)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra()
  2017-02-14  8:11 [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
@ 2017-02-14  8:11 ` Kyle Huey
  2017-03-11 11:49   ` [tip:x86/process] " tip-bot for Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 2/3] x86/process: Correct and optimize TIF_BLOCKSTEP switch Kyle Huey
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Kyle Huey @ 2017-02-14  8:11 UTC (permalink / raw)
  To: LKML; +Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski, x86

Help the compiler to avoid reevaluating the thread flags for each checked
bit by reordering the bit checks and providing an explicit xor for
evaluation.

With default defconfigs for each arch,

x86_64: arch/x86/kernel/process.o
text       data     bss     dec     hex
3056       8577      16   11649    2d81	Before
3024	   8577      16	  11617	   2d61	After

i386: arch/x86/kernel/process.o
text       data     bss     dec     hex
2957	   8673	      8	  11638	   2d76	Before
2925	   8673       8	  11606	   2d56	After

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kyle Huey <khuey@kylehuey.com>
---
 arch/x86/kernel/process.c | 54 +++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b615a1113f58..01ef6f63d5fb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -174,48 +174,56 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
+static inline void switch_to_bitmap(struct tss_struct *tss,
+				    struct thread_struct *prev,
+				    struct thread_struct *next,
+				    unsigned long tifp, unsigned long tifn)
+{
+	if (tifn & _TIF_IO_BITMAP) {
+		/*
+		 * Copy the relevant range of the IO bitmap.
+		 * Normally this is 128 bytes or less:
+		 */
+		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+		       max(prev->io_bitmap_max, next->io_bitmap_max));
+	} else if (tifp & _TIF_IO_BITMAP) {
+		/*
+		 * Clear any possible leftover bits:
+		 */
+		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+	}
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
+	unsigned long tifp, tifn;
 
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+	tifn = READ_ONCE(task_thread_info(next_p)->flags);
+	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+	switch_to_bitmap(tss, prev, next, tifp, tifn);
+
+	propagate_user_return_notify(prev_p, next_p);
+
+	if ((tifp ^ tifn) & _TIF_BLOCKSTEP) {
 		unsigned long debugctl = get_debugctlmsr();
 
 		debugctl &= ~DEBUGCTLMSR_BTF;
-		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+		if (tifn & _TIF_BLOCKSTEP)
 			debugctl |= DEBUGCTLMSR_BTF;
-
 		update_debugctlmsr(debugctl);
 	}
 
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-		/* prev and next are different */
-		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+	if ((tifp ^ tifn) & _TIF_NOTSC) {
+		if (tifn & _TIF_NOTSC)
 			hard_disable_TSC();
 		else
 			hard_enable_TSC();
 	}
-
-	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
-	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
-	}
-	propagate_user_return_notify(prev_p, next_p);
 }
 
 /*
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 2/3] x86/process: Correct and optimize TIF_BLOCKSTEP switch
  2017-02-14  8:11 [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra() Kyle Huey
@ 2017-02-14  8:11 ` Kyle Huey
  2017-03-11 11:49   ` [tip:x86/process] " tip-bot for Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 3/3] x86/process: Optimize TIF_NOTSC switch Kyle Huey
  2017-02-28 18:33 ` [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
  3 siblings, 1 reply; 10+ messages in thread
From: Kyle Huey @ 2017-02-14  8:11 UTC (permalink / raw)
  To: LKML; +Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski, x86

Because the MSR is "highly magical", reset DEBUGCTLMSR_BTF on every task
switch in or out of a task that uses TIF_BLOCKSTEP, even if both tasks
have TIF_BLOCKSTEP set.  Avoid branching within the TIF_BLOCKSTEP case
and evaluating boot_cpu_data twice in kernels without
CONFIG_X86_DEBUGCTLMSR.

x86_64: arch/x86/kernel/process.o
text	data	bss	dec	 hex
3024    8577    16      11617    2d61	Before
3008	8577	16	11601	 2d51	After

i386: No change

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kyle Huey <khuey@kylehuey.com>
---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kernel/process.c        | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 710273c617b8..a8997a7b4e74 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -127,6 +127,7 @@
 
 /* DEBUGCTLMSR bits (others vary by model): */
 #define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define _DEBUGCTLMSR_BTF		1
 #define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
 #define DEBUGCTLMSR_TR			(1UL <<  6)
 #define DEBUGCTLMSR_BTS			(1UL <<  7)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 01ef6f63d5fb..5dd08328108e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -209,13 +209,13 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 
 	propagate_user_return_notify(prev_p, next_p);
 
-	if ((tifp ^ tifn) & _TIF_BLOCKSTEP) {
-		unsigned long debugctl = get_debugctlmsr();
-
+	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+	    arch_has_block_step()) {
+		unsigned long debugctl;
+		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 		debugctl &= ~DEBUGCTLMSR_BTF;
-		if (tifn & _TIF_BLOCKSTEP)
-			debugctl |= DEBUGCTLMSR_BTF;
-		update_debugctlmsr(debugctl);
+		debugctl |= (tifn & _TIF_BLOCKSTEP) >> TIF_BLOCKSTEP << _DEBUGCTLMSR_BTF;
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	}
 
 	if ((tifp ^ tifn) & _TIF_NOTSC) {
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 3/3] x86/process: Optimize TIF_NOTSC switch
  2017-02-14  8:11 [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra() Kyle Huey
  2017-02-14  8:11 ` [PATCH v2 2/3] x86/process: Correct and optimize TIF_BLOCKSTEP switch Kyle Huey
@ 2017-02-14  8:11 ` Kyle Huey
  2017-03-11 11:50   ` [tip:x86/process] " tip-bot for Thomas Gleixner
  2017-02-28 18:33 ` [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
  3 siblings, 1 reply; 10+ messages in thread
From: Kyle Huey @ 2017-02-14  8:11 UTC (permalink / raw)
  To: LKML; +Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski, x86

From: Thomas Gleixner <tglx@linutronix.de>

Provide and use a toggle helper instead of doing it with a branch.

x86_64: arch/x86/kernel/process.o
text	   data	    bss	    dec	    hex
3008	   8577	     16	  11601	   2d51 Before
2976       8577      16	  11569	   2d31 After

i386: arch/x86/kernel/process.o
text	   data	    bss	    dec	    hex
2925	   8673	      8	  11606	   2d56 Before
2893	   8673       8	  11574	   2d36 After

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/tlbflush.h | 10 ++++++++++
 arch/x86/kernel/process.c       | 22 ++++------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6fa85944af83..ff4923a19f79 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask)
 	}
 }
 
+static inline void cr4_toggle_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	cr4 ^= mask;
+	this_cpu_write(cpu_tlbstate.cr4, cr4);
+	__write_cr4(cr4);
+}
+
 /* Read the CR4 shadow. */
 static inline unsigned long cr4_read_shadow(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5dd08328108e..ebf2c06839bd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -116,11 +116,6 @@ void flush_thread(void)
 	fpu__clear(&tsk->thread.fpu);
 }
 
-static void hard_disable_TSC(void)
-{
-	cr4_set_bits(X86_CR4_TSD);
-}
-
 void disable_TSC(void)
 {
 	preempt_disable();
@@ -129,15 +124,10 @@ void disable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_disable_TSC();
+		cr4_set_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
-static void hard_enable_TSC(void)
-{
-	cr4_clear_bits(X86_CR4_TSD);
-}
-
 static void enable_TSC(void)
 {
 	preempt_disable();
@@ -146,7 +136,7 @@ static void enable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_enable_TSC();
+		cr4_clear_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
@@ -218,12 +208,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	}
 
-	if ((tifp ^ tifn) & _TIF_NOTSC) {
-		if (tifn & _TIF_NOTSC)
-			hard_disable_TSC();
-		else
-			hard_enable_TSC();
-	}
+	if ((tifp ^ tifn) & _TIF_NOTSC)
+		cr4_toggle_bits(X86_CR4_TSD);
 }
 
 /*
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra()
  2017-02-14  8:11 [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
                   ` (2 preceding siblings ...)
  2017-02-14  8:11 ` [PATCH v2 3/3] x86/process: Optimize TIF_NOTSC switch Kyle Huey
@ 2017-02-28 18:33 ` Kyle Huey
  2017-03-08 22:07   ` Kyle Huey
  3 siblings, 1 reply; 10+ messages in thread
From: Kyle Huey @ 2017-02-28 18:33 UTC (permalink / raw)
  To: LKML
  Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Tue, Feb 14, 2017 at 12:11 AM, Kyle Huey <me@kylehuey.com> wrote:
> GCC generates lousy code in __switch_to_xtra.  This patch series is an
> updated version of tglx's patches from last year
> (https://lkml.org/lkml/2016/12/15/432) that address review comments.
>
> Since v1:
> Part 1 - x86/process: Optimize TIF checks in __switch_to_xtra()
> - READ_ONCE annotations added as requested by Andy Lutomirski
>
> Part 2 - x86/process: Correct and optimize TIF_BLOCKSTEP switch
> - DEBUGCTLMSR_BTF is now modified when either the previous or
>   next or both tasks use it, because the MSR is "highly magical".
>
> Part 3 - x86/process: Optimize TIF_NOTSC switch
> - Unchanged
>
> I didn't introduce a cpufeature for blockstep because that would
> add additional overhead compared to the existing code, where it's
> generally known at compile time that blockstep is supported. Perhaps
> we should just BUG_ON(!arch_has_block_step()) here if we really
> care to check anything.
>
> arch/x86/include/asm/msr-index.h |  1 +
> arch/x86/include/asm/tlbflush.h  | 10 ++++++++++
> arch/x86/kernel/process.c        | 76 +++++++++++++++++++++++++++++++++++-----------------------------------------
> 3 files changed, 46 insertions(+), 41 deletions(-)
>

Has anyone had a chance to look at these?

- Kyle

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra()
  2017-02-28 18:33 ` [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
@ 2017-03-08 22:07   ` Kyle Huey
  2017-03-10 20:23     ` Thomas Gleixner
  0 siblings, 1 reply; 10+ messages in thread
From: Kyle Huey @ 2017-03-08 22:07 UTC (permalink / raw)
  To: LKML
  Cc: Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Tue, Feb 28, 2017 at 10:33 AM, Kyle Huey <me@kylehuey.com> wrote:
> On Tue, Feb 14, 2017 at 12:11 AM, Kyle Huey <me@kylehuey.com> wrote:
>> GCC generates lousy code in __switch_to_xtra.  This patch series is an
>> updated version of tglx's patches from last year
>> (https://lkml.org/lkml/2016/12/15/432) that address review comments.
>>
>> Since v1:
>> Part 1 - x86/process: Optimize TIF checks in __switch_to_xtra()
>> - READ_ONCE annotations added as requested by Andy Lutomirski
>>
>> Part 2 - x86/process: Correct and optimize TIF_BLOCKSTEP switch
>> - DEBUGCTLMSR_BTF is now modified when either the previous or
>>   next or both tasks use it, because the MSR is "highly magical".
>>
>> Part 3 - x86/process: Optimize TIF_NOTSC switch
>> - Unchanged
>>
>> I didn't introduce a cpufeature for blockstep because that would
>> add additional overhead compared to the existing code, where it's
>> generally known at compile time that blockstep is supported. Perhaps
>> we should just BUG_ON(!arch_has_block_step()) here if we really
>> care to check anything.
>>
>> arch/x86/include/asm/msr-index.h |  1 +
>> arch/x86/include/asm/tlbflush.h  | 10 ++++++++++
>> arch/x86/kernel/process.c        | 76 +++++++++++++++++++++++++++++++++++-----------------------------------------
>> 3 files changed, 46 insertions(+), 41 deletions(-)
>>
>
> Has anyone had a chance to look at these?

Maybe now that the 4.11 merge window is closed? :)

- Kyle

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra()
  2017-03-08 22:07   ` Kyle Huey
@ 2017-03-10 20:23     ` Thomas Gleixner
  0 siblings, 0 replies; 10+ messages in thread
From: Thomas Gleixner @ 2017-03-10 20:23 UTC (permalink / raw)
  To: Kyle Huey
  Cc: LKML, Peter Zijlstra, Andy Lutomirski,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)

On Wed, 8 Mar 2017, Kyle Huey wrote:

> On Tue, Feb 28, 2017 at 10:33 AM, Kyle Huey <me@kylehuey.com> wrote:
> > On Tue, Feb 14, 2017 at 12:11 AM, Kyle Huey <me@kylehuey.com> wrote:
> >> GCC generates lousy code in __switch_to_xtra.  This patch series is an
> >> updated version of tglx's patches from last year
> >> (https://lkml.org/lkml/2016/12/15/432) that address review comments.
> >>
> >> Since v1:
> >> Part 1 - x86/process: Optimize TIF checks in __switch_to_xtra()
> >> - READ_ONCE annotations added as requested by Andy Lutomirski
> >>
> >> Part 2 - x86/process: Correct and optimize TIF_BLOCKSTEP switch
> >> - DEBUGCTLMSR_BTF is now modified when either the previous or
> >>   next or both tasks use it, because the MSR is "highly magical".
> >>
> >> Part 3 - x86/process: Optimize TIF_NOTSC switch
> >> - Unchanged
> >>
> >> I didn't introduce a cpufeature for blockstep because that would
> >> add additional overhead compared to the existing code, where it's
> >> generally known at compile time that blockstep is supported. Perhaps
> >> we should just BUG_ON(!arch_has_block_step()) here if we really
> >> care to check anything.
> >>
> >> arch/x86/include/asm/msr-index.h |  1 +
> >> arch/x86/include/asm/tlbflush.h  | 10 ++++++++++
> >> arch/x86/kernel/process.c        | 76 +++++++++++++++++++++++++++++++++++-----------------------------------------
> >> 3 files changed, 46 insertions(+), 41 deletions(-)
> >>
> >
> > Has anyone had a chance to look at these?
> 
> Maybe now that the 4.11 merge window is closed? :)

Yes. It's on my radar, but I'm swamped with regressions. Next week should
be more time for that.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [tip:x86/process] x86/process: Optimize TIF checks in __switch_to_xtra()
  2017-02-14  8:11 ` [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra() Kyle Huey
@ 2017-03-11 11:49   ` tip-bot for Kyle Huey
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Kyle Huey @ 2017-03-11 11:49 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, luto, peterz, me, khuey, mingo, hpa, tglx

Commit-ID:  af8b3cd3934ec60f4c2a420d19a9d416554f140b
Gitweb:     http://git.kernel.org/tip/af8b3cd3934ec60f4c2a420d19a9d416554f140b
Author:     Kyle Huey <me@kylehuey.com>
AuthorDate: Tue, 14 Feb 2017 00:11:02 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Sat, 11 Mar 2017 12:45:17 +0100

x86/process: Optimize TIF checks in __switch_to_xtra()

Help the compiler to avoid reevaluating the thread flags for each checked
bit by reordering the bit checks and providing an explicit xor for
evaluation.

With default defconfigs for each arch,

x86_64: arch/x86/kernel/process.o
text       data     bss     dec     hex
3056       8577      16   11649    2d81	Before
3024	   8577      16	  11617	   2d61	After

i386: arch/x86/kernel/process.o
text       data     bss     dec     hex
2957	   8673	      8	  11638	   2d76	Before
2925	   8673       8	  11606	   2d56	After

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kyle Huey <khuey@kylehuey.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/20170214081104.9244-2-khuey@kylehuey.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 arch/x86/kernel/process.c | 65 ++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f675915..ea9ea25 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -182,54 +182,61 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
+static inline void switch_to_bitmap(struct tss_struct *tss,
+				    struct thread_struct *prev,
+				    struct thread_struct *next,
+				    unsigned long tifp, unsigned long tifn)
+{
+	if (tifn & _TIF_IO_BITMAP) {
+		/*
+		 * Copy the relevant range of the IO bitmap.
+		 * Normally this is 128 bytes or less:
+		 */
+		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+		       max(prev->io_bitmap_max, next->io_bitmap_max));
+		/*
+		 * Make sure that the TSS limit is correct for the CPU
+		 * to notice the IO bitmap.
+		 */
+		refresh_tss_limit();
+	} else if (tifp & _TIF_IO_BITMAP) {
+		/*
+		 * Clear any possible leftover bits:
+		 */
+		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+	}
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
+	unsigned long tifp, tifn;
 
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+	tifn = READ_ONCE(task_thread_info(next_p)->flags);
+	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+	switch_to_bitmap(tss, prev, next, tifp, tifn);
+
+	propagate_user_return_notify(prev_p, next_p);
+
+	if ((tifp ^ tifn) & _TIF_BLOCKSTEP) {
 		unsigned long debugctl = get_debugctlmsr();
 
 		debugctl &= ~DEBUGCTLMSR_BTF;
-		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+		if (tifn & _TIF_BLOCKSTEP)
 			debugctl |= DEBUGCTLMSR_BTF;
-
 		update_debugctlmsr(debugctl);
 	}
 
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-		/* prev and next are different */
-		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+	if ((tifp ^ tifn) & _TIF_NOTSC) {
+		if (tifn & _TIF_NOTSC)
 			hard_disable_TSC();
 		else
 			hard_enable_TSC();
 	}
-
-	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
-
-		/*
-		 * Make sure that the TSS limit is correct for the CPU
-		 * to notice the IO bitmap.
-		 */
-		refresh_tss_limit();
-	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
-	}
-	propagate_user_return_notify(prev_p, next_p);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [tip:x86/process] x86/process: Correct and optimize TIF_BLOCKSTEP switch
  2017-02-14  8:11 ` [PATCH v2 2/3] x86/process: Correct and optimize TIF_BLOCKSTEP switch Kyle Huey
@ 2017-03-11 11:49   ` tip-bot for Kyle Huey
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Kyle Huey @ 2017-03-11 11:49 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: me, tglx, khuey, linux-kernel, peterz, luto, hpa, mingo

Commit-ID:  b9894a2f5bd18b1691cb6872c9afe32b148d0132
Gitweb:     http://git.kernel.org/tip/b9894a2f5bd18b1691cb6872c9afe32b148d0132
Author:     Kyle Huey <me@kylehuey.com>
AuthorDate: Tue, 14 Feb 2017 00:11:03 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Sat, 11 Mar 2017 12:45:18 +0100

x86/process: Correct and optimize TIF_BLOCKSTEP switch

The debug control MSR is "highly magical" as the blockstep bit can be
cleared by hardware under not well documented circumstances.

So a task switch relying on the bit set by the previous task (according to
the previous tasks thread flags) can trip over this and not update the flag
for the next task.

To fix this its required to handle DEBUGCTLMSR_BTF when either the previous
or the next or both tasks have the TIF_BLOCKSTEP flag set.

While at it avoid branching within the TIF_BLOCKSTEP case and evaluating
boot_cpu_data twice in kernels without CONFIG_X86_DEBUGCTLMSR.

x86_64: arch/x86/kernel/process.o
text	data	bss	dec	 hex
3024    8577    16      11617    2d61	Before
3008	8577	16	11601	 2d51	After

i386: No change

[ tglx: Made the shift value explicit, use a local variable to make the
code readable and massaged changelog]

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kyle Huey <khuey@kylehuey.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/20170214081104.9244-3-khuey@kylehuey.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kernel/process.c        | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8b5f8a..4c928f3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -127,6 +127,7 @@
 
 /* DEBUGCTLMSR bits (others vary by model): */
 #define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT		1
 #define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
 #define DEBUGCTLMSR_TR			(1UL <<  6)
 #define DEBUGCTLMSR_BTS			(1UL <<  7)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ea9ea25..83fa3cb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -222,13 +222,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 
 	propagate_user_return_notify(prev_p, next_p);
 
-	if ((tifp ^ tifn) & _TIF_BLOCKSTEP) {
-		unsigned long debugctl = get_debugctlmsr();
+	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+	    arch_has_block_step()) {
+		unsigned long debugctl, msk;
 
+		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 		debugctl &= ~DEBUGCTLMSR_BTF;
-		if (tifn & _TIF_BLOCKSTEP)
-			debugctl |= DEBUGCTLMSR_BTF;
-		update_debugctlmsr(debugctl);
+		msk = tifn & _TIF_BLOCKSTEP;
+		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	}
 
 	if ((tifp ^ tifn) & _TIF_NOTSC) {

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [tip:x86/process] x86/process: Optimize TIF_NOTSC switch
  2017-02-14  8:11 ` [PATCH v2 3/3] x86/process: Optimize TIF_NOTSC switch Kyle Huey
@ 2017-03-11 11:50   ` tip-bot for Thomas Gleixner
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Thomas Gleixner @ 2017-03-11 11:50 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: luto, hpa, peterz, mingo, tglx, linux-kernel

Commit-ID:  5a920155e388ec22a22e0532fb695b9215c9b34d
Gitweb:     http://git.kernel.org/tip/5a920155e388ec22a22e0532fb695b9215c9b34d
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Tue, 14 Feb 2017 00:11:04 -0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Sat, 11 Mar 2017 12:45:18 +0100

x86/process: Optimize TIF_NOTSC switch

Provide and use a toggle helper instead of doing it with a branch.

x86_64: arch/x86/kernel/process.o
text	   data	    bss	    dec	    hex
3008	   8577	     16	  11601	   2d51 Before
2976       8577      16	  11569	   2d31 After

i386: arch/x86/kernel/process.o
text	   data	    bss	    dec	    hex
2925	   8673	      8	  11606	   2d56 Before
2893	   8673       8	  11574	   2d36 After

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/20170214081104.9244-4-khuey@kylehuey.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 arch/x86/include/asm/tlbflush.h | 10 ++++++++++
 arch/x86/kernel/process.c       | 22 ++++------------------
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6fa8594..ff4923a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask)
 	}
 }
 
+static inline void cr4_toggle_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	cr4 ^= mask;
+	this_cpu_write(cpu_tlbstate.cr4, cr4);
+	__write_cr4(cr4);
+}
+
 /* Read the CR4 shadow. */
 static inline unsigned long cr4_read_shadow(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 83fa3cb..366db77 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -124,11 +124,6 @@ void flush_thread(void)
 	fpu__clear(&tsk->thread.fpu);
 }
 
-static void hard_disable_TSC(void)
-{
-	cr4_set_bits(X86_CR4_TSD);
-}
-
 void disable_TSC(void)
 {
 	preempt_disable();
@@ -137,15 +132,10 @@ void disable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_disable_TSC();
+		cr4_set_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
-static void hard_enable_TSC(void)
-{
-	cr4_clear_bits(X86_CR4_TSD);
-}
-
 static void enable_TSC(void)
 {
 	preempt_disable();
@@ -154,7 +144,7 @@ static void enable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_enable_TSC();
+		cr4_clear_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
@@ -233,12 +223,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	}
 
-	if ((tifp ^ tifn) & _TIF_NOTSC) {
-		if (tifn & _TIF_NOTSC)
-			hard_disable_TSC();
-		else
-			hard_enable_TSC();
-	}
+	if ((tifp ^ tifn) & _TIF_NOTSC)
+		cr4_toggle_bits(X86_CR4_TSD);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-03-11 11:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-14  8:11 [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
2017-02-14  8:11 ` [PATCH v2 1/3] x86/process: Optimize TIF checks in __switch_to_xtra() Kyle Huey
2017-03-11 11:49   ` [tip:x86/process] " tip-bot for Kyle Huey
2017-02-14  8:11 ` [PATCH v2 2/3] x86/process: Correct and optimize TIF_BLOCKSTEP switch Kyle Huey
2017-03-11 11:49   ` [tip:x86/process] " tip-bot for Kyle Huey
2017-02-14  8:11 ` [PATCH v2 3/3] x86/process: Optimize TIF_NOTSC switch Kyle Huey
2017-03-11 11:50   ` [tip:x86/process] " tip-bot for Thomas Gleixner
2017-02-28 18:33 ` [PATCH v2 0/3] x86/process: Optimize __switch_to_xtra() Kyle Huey
2017-03-08 22:07   ` Kyle Huey
2017-03-10 20:23     ` Thomas Gleixner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).