LKML Archive on lore.kernel.org
 help / color / Atom feed
* [RFC][PATCH 0/4] x86/entry: disallow #DB more
@ 2020-05-22 20:47 Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 1/4] x86/entry: Introduce local_db_{rave,restore}() Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-22 20:47 UTC (permalink / raw)
  To: tglx, luto, peterz; +Cc: linux-kernel, x86

Hai, this kills #DB during NMI/#MC and with that allows removing all the nasty
IST rewrite crud.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC][PATCH 1/4] x86/entry: Introduce local_db_{rave,restore}()
  2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
@ 2020-05-22 20:47 ` Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 2/4] x86/entry, nmi: Disable #DB Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-22 20:47 UTC (permalink / raw)
  To: tglx, luto, peterz; +Cc: linux-kernel, x86

In order to allow other exceptions than #DB to disable breakpoints,
provide a common helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/debugreg.h |   23 +++++++++++++++++++++++
 arch/x86/kernel/traps.c         |   18 ++----------------
 2 files changed, 25 insertions(+), 16 deletions(-)

--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -113,6 +113,29 @@ static inline void debug_stack_usage_inc
 static inline void debug_stack_usage_dec(void) { }
 #endif /* X86_64 */
 
+static __always_inline void local_db_save(unsigned long *dr7)
+{
+	get_debugreg(*dr7, 7);
+	set_debugreg(0, 7);
+	/*
+	 * Ensure the compiler doesn't lower the above statements into
+	 * the critical section; disabling breakpoints late would not
+	 * be good.
+	 */
+	barrier();
+}
+
+static __always_inline void local_db_restore(unsigned long dr7)
+{
+	/*
+	 * Ensure the compiler doesn't raise this statement into
+	 * the critical section; enabling breakpoints early would
+	 * not be good.
+	 */
+	barrier();
+	set_debugreg(dr7, 7);
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 extern void set_dr_addr_mask(unsigned long mask, int dr);
 #else
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -727,15 +727,7 @@ static __always_inline void debug_enter(
 	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
 	 * includes the entry stack is excluded for everything.
 	 */
-	get_debugreg(*dr7, 7);
-	set_debugreg(0, 7);
-
-	/*
-	 * Ensure the compiler doesn't lower the above statements into
-	 * the critical section; disabling breakpoints late would not
-	 * be good.
-	 */
-	barrier();
+	local_db_save(dr7);
 
 	/*
 	 * The Intel SDM says:
@@ -756,13 +748,7 @@ static __always_inline void debug_enter(
 
 static __always_inline void debug_exit(unsigned long dr7)
 {
-	/*
-	 * Ensure the compiler doesn't raise this statement into
-	 * the critical section; enabling breakpoints early would
-	 * not be good.
-	 */
-	barrier();
-	set_debugreg(dr7, 7);
+	local_db_restore(dr7);
 }
 
 /*



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC][PATCH 2/4] x86/entry, nmi: Disable #DB
  2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 1/4] x86/entry: Introduce local_db_{rave,restore}() Peter Zijlstra
@ 2020-05-22 20:47 ` Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 3/4] x86/entry: Remove debug IST frobbing Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-22 20:47 UTC (permalink / raw)
  To: tglx, luto, peterz; +Cc: linux-kernel, x86

Instead of playing stupid games with IST stacks, fully disallow #DB
during NMIs. There is absolutely no reason to allow them, and killing
this saves a heap of trouble.

We already disallow #DB on noinstr and CEA, so we can't get #DB before
this, and this ensures we can't get it after this either.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/nmi.c |   55 ++------------------------------------------------
 1 file changed, 3 insertions(+), 52 deletions(-)

--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -478,40 +478,7 @@ enum nmi_states {
 };
 static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-
-#ifdef CONFIG_X86_64
-/*
- * In x86_64, we need to handle breakpoint -> NMI -> breakpoint.  Without
- * some care, the inner breakpoint will clobber the outer breakpoint's
- * stack.
- *
- * If a breakpoint is being processed, and the debug stack is being
- * used, if an NMI comes in and also hits a breakpoint, the stack
- * pointer will be set to the same fixed address as the breakpoint that
- * was interrupted, causing that stack to be corrupted. To handle this
- * case, check if the stack that was interrupted is the debug stack, and
- * if so, change the IDT so that new breakpoints will use the current
- * stack and not switch to the fixed address. On return of the NMI,
- * switch back to the original IDT.
- */
-static DEFINE_PER_CPU(int, update_debug_stack);
-
-static noinstr bool is_debug_stack(unsigned long addr)
-{
-	struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
-	unsigned long top = CEA_ESTACK_TOP(cs, DB);
-	unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
-
-	if (__this_cpu_read(debug_stack_usage))
-		return true;
-	/*
-	 * Note, this covers the guard page between DB and DB1 as well to
-	 * avoid two checks. But by all means @addr can never point into
-	 * the guard page.
-	 */
-	return addr >= bot && addr < top;
-}
-#endif
+static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 
 DEFINE_IDTENTRY_NMI(exc_nmi)
 {
@@ -526,18 +493,7 @@ DEFINE_IDTENTRY_NMI(exc_nmi)
 	this_cpu_write(nmi_cr2, read_cr2());
 nmi_restart:
 
-#ifdef CONFIG_X86_64
-	/*
-	 * If we interrupted a breakpoint, it is possible that
-	 * the nmi handler will have breakpoints too. We need to
-	 * change the IDT such that breakpoints that happen here
-	 * continue to use the NMI stack.
-	 */
-	if (unlikely(is_debug_stack(regs->sp))) {
-		debug_stack_set_zero();
-		this_cpu_write(update_debug_stack, 1);
-	}
-#endif
+	local_db_save(this_cpu_ptr(&nmi_dr7));
 
 	nmi_enter();
 
@@ -548,12 +504,7 @@ DEFINE_IDTENTRY_NMI(exc_nmi)
 
 	nmi_exit();
 
-#ifdef CONFIG_X86_64
-	if (unlikely(this_cpu_read(update_debug_stack))) {
-		debug_stack_reset();
-		this_cpu_write(update_debug_stack, 0);
-	}
-#endif
+	local_db_restore(*this_cpu_ptr(&nmi_dr7));
 
 	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
 		write_cr2(this_cpu_read(nmi_cr2));



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC][PATCH 3/4] x86/entry: Remove debug IST frobbing
  2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 1/4] x86/entry: Introduce local_db_{rave,restore}() Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 2/4] x86/entry, nmi: Disable #DB Peter Zijlstra
@ 2020-05-22 20:47 ` Peter Zijlstra
  2020-05-22 20:47 ` [RFC][PATCH 4/4] x86/entry, mce: Disallow #DB during #MC Peter Zijlstra
  2020-05-22 22:13 ` [RFC][PATCH 0/4] x86/entry: disallow #DB more Andy Lutomirski
  4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-22 20:47 UTC (permalink / raw)
  To: tglx, luto, peterz; +Cc: linux-kernel, x86

This is all unused now.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/debugreg.h |   19 -------------------
 arch/x86/include/asm/desc.h     |   32 +-------------------------------
 arch/x86/kernel/cpu/common.c    |   17 -----------------
 arch/x86/kernel/idt.c           |   22 ----------------------
 arch/x86/kernel/traps.c         |    9 ---------
 5 files changed, 1 insertion(+), 98 deletions(-)

--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,25 +94,6 @@ extern void aout_dump_debugregs(struct u
 
 extern void hw_breakpoint_restore(void);
 
-#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(int, debug_stack_usage);
-static inline void debug_stack_usage_inc(void)
-{
-	__this_cpu_inc(debug_stack_usage);
-}
-static inline void debug_stack_usage_dec(void)
-{
-	__this_cpu_dec(debug_stack_usage);
-}
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
-#else /* !X86_64 */
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
-static inline void debug_stack_usage_inc(void) { }
-static inline void debug_stack_usage_dec(void) { }
-#endif /* X86_64 */
-
 static __always_inline void local_db_save(unsigned long *dr7)
 {
 	get_debugreg(*dr7, 7);
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -390,31 +390,6 @@ void alloc_intr_gate(unsigned int n, con
 
 extern unsigned long system_vectors[];
 
-#ifdef CONFIG_X86_64
-DECLARE_PER_CPU(u32, debug_idt_ctr);
-static __always_inline bool is_debug_idt_enabled(void)
-{
-	if (this_cpu_read(debug_idt_ctr))
-		return true;
-
-	return false;
-}
-
-static __always_inline void load_debug_idt(void)
-{
-	load_idt((const struct desc_ptr *)&debug_idt_descr);
-}
-#else
-static inline bool is_debug_idt_enabled(void)
-{
-	return false;
-}
-
-static inline void load_debug_idt(void)
-{
-}
-#endif
-
 /*
  * The load_current_idt() must be called with interrupts disabled
  * to avoid races. That way the IDT will always be set back to the expected
@@ -424,10 +399,7 @@ static inline void load_debug_idt(void)
  */
 static __always_inline void load_current_idt(void)
 {
-	if (is_debug_idt_enabled())
-		load_debug_idt();
-	else
-		load_idt((const struct desc_ptr *)&idt_descr);
+	load_idt((const struct desc_ptr *)&idt_descr);
 }
 
 extern void idt_setup_early_handler(void);
@@ -438,11 +410,9 @@ extern void idt_setup_apic_and_irq_gates
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
 extern void idt_setup_ist_traps(void);
-extern void idt_setup_debugidt_traps(void);
 #else
 static inline void idt_setup_early_pf(void) { }
 static inline void idt_setup_ist_traps(void) { }
-static inline void idt_setup_debugidt_traps(void) { }
 #endif
 
 extern void idt_invalidate(void *addr);
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1689,23 +1689,6 @@ void syscall_init(void)
 	       X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
 }
 
-DEFINE_PER_CPU(int, debug_stack_usage);
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-noinstr void debug_stack_set_zero(void)
-{
-	this_cpu_inc(debug_idt_ctr);
-	load_current_idt();
-}
-
-noinstr void debug_stack_reset(void)
-{
-	if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
-		return;
-	if (this_cpu_dec_return(debug_idt_ctr) == 0)
-		load_current_idt();
-}
-
 #else	/* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -177,9 +177,6 @@ struct desc_ptr idt_descr __ro_after_ini
 };
 
 #ifdef CONFIG_X86_64
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
-
 /*
  * The exceptions which use Interrupt stacks. They are setup after
  * cpu_init() when the TSS has been initialized.
@@ -192,15 +189,6 @@ static const __initconst struct idt_data
 	ISTG(X86_TRAP_MC,	asm_exc_machine_check,	IST_INDEX_MCE),
 #endif
 };
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-const struct desc_ptr debug_idt_descr = {
-	.size		= IDT_ENTRIES * 16 - 1,
-	.address	= (unsigned long) debug_idt_table,
-};
 #endif
 
 static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
@@ -292,16 +280,6 @@ void __init idt_setup_ist_traps(void)
 {
 	idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
 }
-
-/**
- * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
- */
-void __init idt_setup_debugidt_traps(void)
-{
-	memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-
-	idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
-}
 #endif
 
 /**
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -798,12 +798,6 @@ static void noinstr handle_debug(struct
 		return;
 	}
 
-	/*
-	 * Let others (NMI) know that the debug stack is in use
-	 * as we may switch to the interrupt stack.
-	 */
-	debug_stack_usage_inc();
-
 	/* It's safe to allow irq's after DR6 has been saved */
 	cond_local_irq_enable(regs);
 
@@ -831,7 +825,6 @@ static void noinstr handle_debug(struct
 
 out:
 	cond_local_irq_disable(regs);
-	debug_stack_usage_dec();
 	instrumentation_end();
 }
 
@@ -1077,6 +1070,4 @@ void __init trap_init(void)
 	cpu_init();
 
 	idt_setup_ist_traps();
-
-	idt_setup_debugidt_traps();
 }



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC][PATCH 4/4] x86/entry, mce: Disallow #DB during #MC
  2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-05-22 20:47 ` [RFC][PATCH 3/4] x86/entry: Remove debug IST frobbing Peter Zijlstra
@ 2020-05-22 20:47 ` Peter Zijlstra
  2020-05-22 22:13 ` [RFC][PATCH 0/4] x86/entry: disallow #DB more Andy Lutomirski
  4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-22 20:47 UTC (permalink / raw)
  To: tglx, luto, peterz; +Cc: linux-kernel, x86

#MC is fragile as heck, don't tempt fate.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/cpu/mce/core.c |   12 ++++++++++++
 1 file changed, 12 insertions(+)

--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1943,22 +1943,34 @@ static __always_inline void exc_machine_
 /* MCE hit kernel mode */
 DEFINE_IDTENTRY_MCE(exc_machine_check)
 {
+	unsigned long dr7;
+
+	local_db_save(&dr7);
 	exc_machine_check_kernel(regs);
+	local_db_restore(dr7);
 }
 
 /* The user mode variant. */
 DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
 {
+	unsigned long dr7;
+
+	local_db_save(&dr7);
 	exc_machine_check_user(regs);
+	local_db_restore(dr7);
 }
 #else
 /* 32bit unified entry point */
 DEFINE_IDTENTRY_MCE(exc_machine_check)
 {
+	unsigned long dr7;
+
+	local_db_save(&dr7);
 	if (user_mode(regs))
 		exc_machine_check_user(regs);
 	else
 		exc_machine_check_kernel(regs);
+	local_db_restore(dr7);
 }
 #endif
 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-05-22 20:47 ` [RFC][PATCH 4/4] x86/entry, mce: Disallow #DB during #MC Peter Zijlstra
@ 2020-05-22 22:13 ` Andy Lutomirski
  2020-05-22 22:20   ` Sean Christopherson
  2020-05-23 12:59   ` Peter Zijlstra
  4 siblings, 2 replies; 15+ messages in thread
From: Andy Lutomirski @ 2020-05-22 22:13 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Thomas Gleixner, LKML, X86 ML

On Fri, May 22, 2020 at 1:49 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Hai, this kills #DB during NMI/#MC and with that allows removing all the nasty
> IST rewrite crud.
>

This is great, except that the unconditional DR7 write is going to
seriously hurt perf performance.  Fortunately, no one cares about
perf, right? :)  Even just reading first won't help enough because DR7
reads are likely to be VM exits.  Can we have a percpu dr7 shadow
(with careful ordering) or even just a percpu count of dr7 users so we
can skip this if there are no breakpoints?  We have cpu_dr7, and some
minor changes would make this work.  Maybe replace all the direct
cpu_dr7 access with helpers like dr7_set_bits() and dr7_clear_bits()?

Also, I like raving at DR7 :)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-22 22:13 ` [RFC][PATCH 0/4] x86/entry: disallow #DB more Andy Lutomirski
@ 2020-05-22 22:20   ` Sean Christopherson
  2020-05-22 22:43     ` Andy Lutomirski
  2020-05-23 12:59   ` Peter Zijlstra
  1 sibling, 1 reply; 15+ messages in thread
From: Sean Christopherson @ 2020-05-22 22:20 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Peter Zijlstra, Thomas Gleixner, LKML, X86 ML

On Fri, May 22, 2020 at 03:13:57PM -0700, Andy Lutomirski wrote:
> On Fri, May 22, 2020 at 1:49 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Hai, this kills #DB during NMI/#MC and with that allows removing all the nasty
> > IST rewrite crud.
> >
> 
> This is great, except that the unconditional DR7 write is going to
> seriously hurt perf performance.  Fortunately, no one cares about
> perf, right? :)  Even just reading first won't help enough because DR7
> reads are likely to be VM exits.  Can we have a percpu dr7 shadow
> (with careful ordering) or even just a percpu count of dr7 users so we
> can skip this if there are no breakpoints?

Hmm, I believe hw_breakpoint_active() is what you're looking for, KVM uses
it to avoid unnecessary restoration of host DR7 after VM-Exit.

Amusingly, checking that in the NMI handler could give a false positive if
an NMI occurs in guest as DR7 is cleared on exit and KVM invokes the NMI
handler prior to restoring host DR7.  I doubt that's common enough to care
about though.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-22 22:20   ` Sean Christopherson
@ 2020-05-22 22:43     ` Andy Lutomirski
  0 siblings, 0 replies; 15+ messages in thread
From: Andy Lutomirski @ 2020-05-22 22:43 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Andy Lutomirski, Peter Zijlstra, Thomas Gleixner, LKML, X86 ML


> On May 22, 2020, at 3:20 PM, Sean Christopherson <sean.j.christopherson@intel.com> wrote:
> 
> On Fri, May 22, 2020 at 03:13:57PM -0700, Andy Lutomirski wrote:
>>> On Fri, May 22, 2020 at 1:49 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> Hai, this kills #DB during NMI/#MC and with that allows removing all the nasty
>>> IST rewrite crud.
>>> 
>> 
>> This is great, except that the unconditional DR7 write is going to
>> seriously hurt perf performance.  Fortunately, no one cares about
>> perf, right? :)  Even just reading first won't help enough because DR7
>> reads are likely to be VM exits.  Can we have a percpu dr7 shadow
>> (with careful ordering) or even just a percpu count of dr7 users so we
>> can skip this if there are no breakpoints?
> 
> Hmm, I believe hw_breakpoint_active() is what you're looking for, KVM uses
> it to avoid unnecessary restoration of host DR7 after VM-Exit.
> 
> Amusingly, checking that in the NMI handler could give a false positive if
> an NMI occurs in guest as DR7 is cleared on exit and KVM invokes the NMI
> handler prior to restoring host DR7.  I doubt that's common enough to care
> about though.

False positives are unavoidable: there’s no way we can set a percpu variable and set DR7 without risking an NMI in between.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-22 22:13 ` [RFC][PATCH 0/4] x86/entry: disallow #DB more Andy Lutomirski
  2020-05-22 22:20   ` Sean Christopherson
@ 2020-05-23 12:59   ` Peter Zijlstra
  2020-05-23 21:32     ` Peter Zijlstra
  1 sibling, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-23 12:59 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Thomas Gleixner, LKML, X86 ML

On Fri, May 22, 2020 at 03:13:57PM -0700, Andy Lutomirski wrote:
> On Fri, May 22, 2020 at 1:49 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Hai, this kills #DB during NMI/#MC and with that allows removing all the nasty
> > IST rewrite crud.
> >
> 
> This is great, except that the unconditional DR7 write is going to
> seriously hurt perf performance.  Fortunately, no one cares about
> perf, right? :)

Good point, so the trivial optimization is below. I couldn't find
instruction latency numbers for DRn load/stores anywhere. I'm hoping
loads are cheap.

> Even just reading first won't help enough because DR7
> reads are likely to be VM exits.

WTF, why is virt always such a horrible piece of crap?

> Can we have a percpu dr7 shadow
> (with careful ordering) or even just a percpu count of dr7 users so we
> can skip this if there are no breakpoints?  We have cpu_dr7, and some
> minor changes would make this work.  Maybe replace all the direct
> cpu_dr7 access with helpers like dr7_set_bits() and dr7_clear_bits()?

I'll try and sort through that on Monday or so.


---
 arch/x86/include/asm/debugreg.h |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -97,7 +97,8 @@ extern void hw_breakpoint_restore(void);
 static __always_inline void local_db_save(unsigned long *dr7)
 {
 	get_debugreg(*dr7, 7);
-	set_debugreg(0, 7);
+	if (*dr7)
+		set_debugreg(0, 7);
 	/*
 	 * Ensure the compiler doesn't lower the above statements into
 	 * the critical section; disabling breakpoints late would not
@@ -114,7 +115,8 @@ static __always_inline void local_db_res
 	 * not be good.
 	 */
 	barrier();
-	set_debugreg(dr7, 7);
+	if (dr7)
+		set_debugreg(dr7, 7);
 }
 
 #ifdef CONFIG_CPU_SUP_AMD


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-23 12:59   ` Peter Zijlstra
@ 2020-05-23 21:32     ` Peter Zijlstra
  2020-05-25 10:02       ` Rasmus Villemoes
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-23 21:32 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Thomas Gleixner, LKML, X86 ML

On Sat, May 23, 2020 at 02:59:40PM +0200, Peter Zijlstra wrote:
> On Fri, May 22, 2020 at 03:13:57PM -0700, Andy Lutomirski wrote:

> > This is great, except that the unconditional DR7 write is going to
> > seriously hurt perf performance.  Fortunately, no one cares about
> > perf, right? :)
> 
> Good point, so the trivial optimization is below. I couldn't find
> instruction latency numbers for DRn load/stores anywhere. I'm hoping
> loads are cheap.

+	u64 empty = 0, read = 0, write = 0;
+	unsigned long dr7;
+
+	for (i=0; i<100; i++) {
+		u64 s;
+
+		s = rdtsc();
+		barrier_nospec();
+		barrier_nospec();
+		empty += rdtsc() - s;
+
+		s = rdtsc();
+		barrier_nospec();
+		dr7 = native_get_debugreg(7);
+		barrier_nospec();
+		read += rdtsc() - s;
+
+		s = rdtsc();
+		barrier_nospec();
+		native_set_debugreg(7, 0);
+		barrier_nospec();
+		write += rdtsc() - s;
+	}
+
+	printk("XXX: %ld %ld %ld\n", empty, read, write);


[    1.628125] XXX: 2800 2404 19600

IOW, reading DR7 is basically free, and certainly cheaper than looking
at cpu_dr7 which would probably be an insta cache miss.

Which seems to suggest KVM can go pound sand. Maybe they can fix
themselves with some paravirt love.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-23 21:32     ` Peter Zijlstra
@ 2020-05-25 10:02       ` Rasmus Villemoes
  2020-05-25 10:40         ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Rasmus Villemoes @ 2020-05-25 10:02 UTC (permalink / raw)
  To: Peter Zijlstra, Andy Lutomirski; +Cc: Thomas Gleixner, LKML, X86 ML

On 23/05/2020 23.32, Peter Zijlstra wrote:
> On Sat, May 23, 2020 at 02:59:40PM +0200, Peter Zijlstra wrote:
>> On Fri, May 22, 2020 at 03:13:57PM -0700, Andy Lutomirski wrote:
> 
>> Good point, so the trivial optimization is below. I couldn't find
>> instruction latency numbers for DRn load/stores anywhere. I'm hoping
>> loads are cheap.
> 
> +	u64 empty = 0, read = 0, write = 0;
> +	unsigned long dr7;
> +
> +	for (i=0; i<100; i++) {
> +		u64 s;
> +
> +		s = rdtsc();
> +		barrier_nospec();
> +		barrier_nospec();
> +		empty += rdtsc() - s;
> +
> +		s = rdtsc();
> +		barrier_nospec();
> +		dr7 = native_get_debugreg(7);
> +		barrier_nospec();
> +		read += rdtsc() - s;
> +
> +		s = rdtsc();
> +		barrier_nospec();
> +		native_set_debugreg(7, 0);
> +		barrier_nospec();
> +		write += rdtsc() - s;
> +	}
> +
> +	printk("XXX: %ld %ld %ld\n", empty, read, write);
> 
> 
> [    1.628125] XXX: 2800 2404 19600
> 
> IOW, reading DR7 is basically free, and certainly cheaper than looking
> at cpu_dr7 which would probably be an insta cache miss.
> 

Naive question: did you check disassembly to see whether gcc threw your
native_get_debugreg() away, given that the asm isn't volatile and the
result is not used for anything? Testing here only shows a "mov
%r9,%db7", but the read did seem to get thrown away.

Rasmus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-25 10:02       ` Rasmus Villemoes
@ 2020-05-25 10:40         ` Peter Zijlstra
  2020-05-25 11:01           ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-25 10:40 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Andy Lutomirski, Thomas Gleixner, LKML, X86 ML

On Mon, May 25, 2020 at 12:02:48PM +0200, Rasmus Villemoes wrote:

> Naive question: did you check disassembly to see whether gcc threw your
> native_get_debugreg() away, given that the asm isn't volatile and the
> result is not used for anything? Testing here only shows a "mov
> %r9,%db7", but the read did seem to get thrown away.

Argh.. no I did not. Writing it all in asm gets me:

[    1.627405] XXX: 3900 8304 22632

which is a lot worse...

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-25 10:40         ` Peter Zijlstra
@ 2020-05-25 11:01           ` Peter Zijlstra
  2020-05-25 17:19             ` Andy Lutomirski
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-25 11:01 UTC (permalink / raw)
  To: Rasmus Villemoes; +Cc: Andy Lutomirski, Thomas Gleixner, LKML, X86 ML

On Mon, May 25, 2020 at 12:40:38PM +0200, Peter Zijlstra wrote:
> On Mon, May 25, 2020 at 12:02:48PM +0200, Rasmus Villemoes wrote:
> 
> > Naive question: did you check disassembly to see whether gcc threw your
> > native_get_debugreg() away, given that the asm isn't volatile and the
> > result is not used for anything? Testing here only shows a "mov
> > %r9,%db7", but the read did seem to get thrown away.
> 
> Argh.. no I did not. Writing it all in asm gets me:
> 
> [    1.627405] XXX: 3900 8304 22632
> 
> which is a lot worse...

+	u64 empty = 0, read = 0, write = 0, cpu = 0, cpu1 = 0;
+	unsigned long dr7;
+
+	for (i=0; i<100; i++) {
+		u64 s;
+
+		s = rdtsc();
+		asm volatile ("lfence; lfence;");
+		empty += rdtsc() - s;
+
+		s = rdtsc();
+		asm volatile ("lfence; mov %%db7, %0; lfence;" : "=r" (dr7));
+		read += rdtsc() - s;
+
+		s = rdtsc();
+		asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
+		write += rdtsc() - s;
+
+		s = rdtsc();
+		asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
+		write += rdtsc() - s;
+
+		clflush(this_cpu_ptr(&cpu_dr7));
+
+		s = rdtsc();
+		asm volatile ("lfence;");
+		dr7 = this_cpu_read(cpu_dr7);
+		asm volatile ("lfence;");
+		cpu += rdtsc() - s;
+
+		s = rdtsc();
+		asm volatile ("lfence;");
+		dr7 = this_cpu_read(cpu_dr7);
+		asm volatile ("lfence;");
+		cpu1 += rdtsc() - s;
+	}
+
+	printk("XXX: %ld %ld %ld %ld %ld\n", empty, read, write, cpu, cpu1);

[    1.628252] XXX: 3820 8224 45516 35560 4800

Which still seems to suggest using DR7 directly is probably a good
thing. It's slower than a L1 hit, but massively faster than a full miss.

---

11f:   0f 31                   rdtsc
121:   48 89 d1                mov    %rdx,%rcx
124:   48 89 c6                mov    %rax,%rsi
127:   0f ae e8                lfence
12a:   0f ae e8                lfence
12d:   0f 31                   rdtsc
12f:   48 c1 e2 20             shl    $0x20,%rdx
133:   48 c1 e1 20             shl    $0x20,%rcx
137:   48 09 c2                or     %rax,%rdx
13a:   48 09 f1                or     %rsi,%rcx
13d:   48 29 ca                sub    %rcx,%rdx
140:   48 01 d3                add    %rdx,%rbx
143:   0f 31                   rdtsc
145:   48 89 d1                mov    %rdx,%rcx
148:   48 89 c6                mov    %rax,%rsi
14b:   0f ae e8                lfence
14e:   41 0f 21 fb             mov    %db7,%r11
152:   0f ae e8                lfence
155:   0f 31                   rdtsc
157:   48 c1 e2 20             shl    $0x20,%rdx
15b:   48 c1 e1 20             shl    $0x20,%rcx
15f:   48 09 c2                or     %rax,%rdx
162:   48 09 f1                or     %rsi,%rcx
165:   48 29 ca                sub    %rcx,%rdx
168:   48 01 d5                add    %rdx,%rbp
16b:   0f 31                   rdtsc
16d:   48 89 d6                mov    %rdx,%rsi
170:   49 89 c1                mov    %rax,%r9
173:   0f ae e8                lfence
176:   41 0f 23 fb             mov    %r11,%db7
17a:   0f ae e8                lfence
17d:   0f 31                   rdtsc
17f:   48 89 d7                mov    %rdx,%rdi
182:   49 89 c2                mov    %rax,%r10
185:   0f 31                   rdtsc
187:   48 89 d1                mov    %rdx,%rcx
18a:   49 89 c0                mov    %rax,%r8
18d:   0f ae e8                lfence
190:   41 0f 23 fb             mov    %r11,%db7
194:   0f ae e8                lfence
197:   0f 31                   rdtsc
199:   48 c1 e2 20             shl    $0x20,%rdx
19d:   48 c1 e6 20             shl    $0x20,%rsi
1a1:   48 09 c2                or     %rax,%rdx
1a4:   48 89 f8                mov    %rdi,%rax
1a7:   48 c1 e1 20             shl    $0x20,%rcx
1ab:   48 c1 e0 20             shl    $0x20,%rax
1af:   49 09 f1                or     %rsi,%r9
1b2:   49 09 c8                or     %rcx,%r8
1b5:   49 09 c2                or     %rax,%r10
1b8:   4a 8d 04 12             lea    (%rdx,%r10,1),%rax
1bc:   48 c7 c2 00 00 00 00    mov    $0x0,%rdx
			1bf: R_X86_64_32S       cpu_dr7
1c3:   4c 29 c8                sub    %r9,%rax
1c6:   4c 29 c0                sub    %r8,%rax
1c9:   49 01 c4                add    %rax,%r12
1cc:   48 89 14 24             mov    %rdx,(%rsp)
1d0:   48 89 54 24 08          mov    %rdx,0x8(%rsp)
1d5:   e8 00 00 00 00          callq  1da <sched_init+0xe1>
			1d6: R_X86_64_PLT32     debug_smp_processor_id-0x4
1da:   48 c7 c1 00 00 00 00    mov    $0x0,%rcx
			1dd: R_X86_64_32S       __per_cpu_offset
1e1:   48 8b 14 24             mov    (%rsp),%rdx
1e5:   89 c0                   mov    %eax,%eax
1e7:   48 03 14 c1             add    (%rcx,%rax,8),%rdx
1eb:   0f ae 3a                clflush (%rdx)
1ee:   0f 31                   rdtsc
1f0:   48 89 d1                mov    %rdx,%rcx
1f3:   48 89 c6                mov    %rax,%rsi
1f6:   0f ae e8                lfence
1f9:   65 48 8b 05 00 00 00    mov    %gs:0x0(%rip),%rax        # 201 <sched_init+0x108>
200:   00
			1fd: R_X86_64_PC32      cpu_dr7-0x4
201:   0f ae e8                lfence
204:   0f 31                   rdtsc
206:   48 c1 e2 20             shl    $0x20,%rdx
20a:   48 c1 e1 20             shl    $0x20,%rcx
20e:   48 09 c2                or     %rax,%rdx
211:   48 09 f1                or     %rsi,%rcx
214:   48 29 ca                sub    %rcx,%rdx
217:   49 01 d5                add    %rdx,%r13
21a:   0f 31                   rdtsc
21c:   48 89 d1                mov    %rdx,%rcx
21f:   48 89 c6                mov    %rax,%rsi
222:   0f ae e8                lfence
225:   65 48 8b 05 00 00 00    mov    %gs:0x0(%rip),%rax        # 22d <sched_init+0x134>
22c:   00
			229: R_X86_64_PC32      cpu_dr7-0x4
22d:   0f ae e8                lfence
230:   0f 31                   rdtsc
232:   48 c1 e2 20             shl    $0x20,%rdx
236:   48 c1 e1 20             shl    $0x20,%rcx
23a:   48 09 c2                or     %rax,%rdx
23d:   48 09 f1                or     %rsi,%rcx
240:   48 29 ca                sub    %rcx,%rdx
243:   49 01 d6                add    %rdx,%r14
246:   41 ff cf                dec    %r15d
249:   0f 85 d0 fe ff ff       jne    11f <sched_init+0x26>



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-25 11:01           ` Peter Zijlstra
@ 2020-05-25 17:19             ` Andy Lutomirski
  2020-05-25 18:08               ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: Andy Lutomirski @ 2020-05-25 17:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rasmus Villemoes, Andy Lutomirski, Thomas Gleixner, LKML, X86 ML



> On May 25, 2020, at 4:01 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Mon, May 25, 2020 at 12:40:38PM +0200, Peter Zijlstra wrote:
>>> On Mon, May 25, 2020 at 12:02:48PM +0200, Rasmus Villemoes wrote:
>>> 
>>> Naive question: did you check disassembly to see whether gcc threw your
>>> native_get_debugreg() away, given that the asm isn't volatile and the
>>> result is not used for anything? Testing here only shows a "mov
>>> %r9,%db7", but the read did seem to get thrown away.
>> 
>> Argh.. no I did not. Writing it all in asm gets me:
>> 
>> [    1.627405] XXX: 3900 8304 22632
>> 
>> which is a lot worse...
> 
> +    u64 empty = 0, read = 0, write = 0, cpu = 0, cpu1 = 0;
> +    unsigned long dr7;
> +
> +    for (i=0; i<100; i++) {
> +        u64 s;
> +
> +        s = rdtsc();
> +        asm volatile ("lfence; lfence;");
> +        empty += rdtsc() - s;
> +
> +        s = rdtsc();
> +        asm volatile ("lfence; mov %%db7, %0; lfence;" : "=r" (dr7));
> +        read += rdtsc() - s;
> +
> +        s = rdtsc();
> +        asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
> +        write += rdtsc() - s;
> +
> +        s = rdtsc();
> +        asm volatile ("lfence; mov %0, %%db7; lfence;" :: "r" (dr7));
> +        write += rdtsc() - s;
> +
> +        clflush(this_cpu_ptr(&cpu_dr7));
> +
> +        s = rdtsc();
> +        asm volatile ("lfence;");
> +        dr7 = this_cpu_read(cpu_dr7);
> +        asm volatile ("lfence;");
> +        cpu += rdtsc() - s;
> +
> +        s = rdtsc();
> +        asm volatile ("lfence;");
> +        dr7 = this_cpu_read(cpu_dr7);
> +        asm volatile ("lfence;");
> +        cpu1 += rdtsc() - s;
> +    }
> +
> +    printk("XXX: %ld %ld %ld %ld %ld\n", empty, read, write, cpu, cpu1);
> 
> [    1.628252] XXX: 3820 8224 45516 35560 4800
> 
> Which still seems to suggest using DR7 directly is probably a good
> thing. It's slower than a L1 hit, but massively faster than a full miss.
> 

How about adding it to cpu_tlbstate?  A lot of NMIs are going to read that anyway to check CR3.

And blaming KVM is a bit misplaced. This isn’t KVM’s fault — it’s Intel’s. VT-x has two modes: DR access exits and DR access doesn’t exit. There’s no shadow mode.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC][PATCH 0/4] x86/entry: disallow #DB more
  2020-05-25 17:19             ` Andy Lutomirski
@ 2020-05-25 18:08               ` Peter Zijlstra
  0 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-05-25 18:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Rasmus Villemoes, Andy Lutomirski, Thomas Gleixner, LKML, X86 ML

On Mon, May 25, 2020 at 10:19:08AM -0700, Andy Lutomirski wrote:

> How about adding it to cpu_tlbstate?  A lot of NMIs are going to read
> that anyway to check CR3.

That might work I suppose; we're really pushing the name of it though.
Also, that's PTI specific IIRC, and we're getting to the point where a
significant number of CPUs no longer need that, right?

> And blaming KVM is a bit misplaced. This isn’t KVM’s fault — it’s
> Intel’s. VT-x has two modes: DR access exits and DR access doesn’t
> exit. There’s no shadow mode.

It's virt, I can't be arsed to care, whoever misdesigned it.
We already have debugreg pvops, they can do shadow there.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, back to index

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-22 20:47 [RFC][PATCH 0/4] x86/entry: disallow #DB more Peter Zijlstra
2020-05-22 20:47 ` [RFC][PATCH 1/4] x86/entry: Introduce local_db_{save,restore}() Peter Zijlstra
2020-05-22 20:47 ` [RFC][PATCH 2/4] x86/entry, nmi: Disable #DB Peter Zijlstra
2020-05-22 20:47 ` [RFC][PATCH 3/4] x86/entry: Remove debug IST frobbing Peter Zijlstra
2020-05-22 20:47 ` [RFC][PATCH 4/4] x86/entry, mce: Disallow #DB during #MC Peter Zijlstra
2020-05-22 22:13 ` [RFC][PATCH 0/4] x86/entry: disallow #DB more Andy Lutomirski
2020-05-22 22:20   ` Sean Christopherson
2020-05-22 22:43     ` Andy Lutomirski
2020-05-23 12:59   ` Peter Zijlstra
2020-05-23 21:32     ` Peter Zijlstra
2020-05-25 10:02       ` Rasmus Villemoes
2020-05-25 10:40         ` Peter Zijlstra
2020-05-25 11:01           ` Peter Zijlstra
2020-05-25 17:19             ` Andy Lutomirski
2020-05-25 18:08               ` Peter Zijlstra

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git