linux-kernel.vger.kernel.org archive mirror
* [PATCH] [LBR] Dump LBRs on Oops
@ 2014-11-21 17:03 Emmanuel Berthier
  2014-11-22  0:50 ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Emmanuel Berthier @ 2014-11-21 17:03 UTC (permalink / raw)
  To: tglx, mingo, hpa, x86; +Cc: robert.jarzmik, emmanuel.berthier, linux-kernel

The purpose of this patch is to stop LBR at the early stage of
Exception Handling, and dump its content later in the dumpstack
process.
X86_64 only, for now.

Signed-off-by: Emmanuel Berthier <emmanuel.berthier@intel.com>
---
 arch/x86/Kconfig.debug                     |    9 ++++++
 arch/x86/include/asm/processor.h           |    1 +
 arch/x86/kernel/cpu/common.c               |    4 +++
 arch/x86/kernel/cpu/perf_event.h           |    2 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |   12 +++++--
 arch/x86/kernel/dumpstack_64.c             |   47 ++++++++++++++++++++++++++--
 arch/x86/kernel/entry_64.S                 |   40 +++++++++++++++++++++++
 7 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 61bd2ad..7a998b2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -323,4 +323,13 @@ config X86_DEBUG_STATIC_CPU_HAS
 
 	  If unsure, say N.
 
+config LBR_DUMP_ON_EXCEPTION
+	bool "Dump Last Branch Records on Oops"
+	depends on DEBUG_KERNEL && X86_64
+	---help---
+	  Enabling this option turns on LBR recording and dumps it on Oops.
+
+	  This might help diagnose exceptions where the faulting code
+	  is not easy to determine from the call stack.
+
 endmenu
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec7..0c3ed67 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -462,6 +462,7 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
 extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
+extern unsigned int lbr_dump_on_exception;
 
 struct perf_event;
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b4f78c..f49a26c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1238,6 +1238,10 @@ DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 
 #endif	/* CONFIG_X86_64 */
 
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+unsigned int lbr_dump_on_exception = 1;
+#endif
+
 /*
  * Clear all 6 debug registers:
  */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fc5eb39..ed9de7f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -731,6 +731,8 @@ void intel_pmu_lbr_enable_all(void);
 
 void intel_pmu_lbr_disable_all(void);
 
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+
 void intel_pmu_lbr_read(void);
 
 void intel_pmu_lbr_init_core(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 45fa730..baa840c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -4,7 +4,9 @@
 #include <asm/perf_event.h>
 #include <asm/msr.h>
 #include <asm/insn.h>
-
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+#include <asm/processor.h>
+#endif
 #include "perf_event.h"
 
 enum {
@@ -135,6 +137,9 @@ static void __intel_pmu_lbr_enable(void)
 	u64 debugctl;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
+	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
+		lbr_dump_on_exception = 0;
+
 	if (cpuc->lbr_sel)
 		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
 
@@ -147,6 +152,9 @@ static void __intel_pmu_lbr_disable(void)
 {
 	u64 debugctl;
 
+	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
+		lbr_dump_on_exception = 1;
+
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -278,7 +286,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	int lbr_format = x86_pmu.intel_cap.lbr_format;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1abcb50..fd78477 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -15,7 +15,10 @@
 #include <linux/nmi.h>
 
 #include <asm/stacktrace.h>
-
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+#include <asm/processor.h>
+#include "cpu/perf_event.h"
+#endif
 
 #define N_EXCEPTION_STACKS_END \
 		(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
@@ -295,6 +298,41 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
+void show_lbrs(void)
+{
+	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION)) {
+		u64 debugctl;
+		int i, lbr_on;
+
+		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+		lbr_on = debugctl & DEBUGCTLMSR_LBR;
+
+		pr_info("Last Branch Records:");
+		if (!lbr_dump_on_exception) {
+			pr_cont(" (Disabled by perf_event)\n");
+		} else if (x86_pmu.lbr_nr == 0) {
+			pr_cont(" (x86_model unknown, check intel_pmu_init())\n");
+		} else if (lbr_on) {
+			pr_cont(" (not halted)\n");
+		} else {
+			struct cpu_hw_events *cpuc =
+						this_cpu_ptr(&cpu_hw_events);
+
+			intel_pmu_lbr_read_64(cpuc);
+
+			pr_cont("\n");
+			for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+				pr_info("   to: [<%016llx>] ",
+						cpuc->lbr_entries[i].to);
+				print_symbol("%s\n", cpuc->lbr_entries[i].to);
+				pr_info(" from: [<%016llx>] ",
+						cpuc->lbr_entries[i].from);
+				print_symbol("%s\n", cpuc->lbr_entries[i].from);
+			}
+		}
+	}
+}
+
 void show_regs(struct pt_regs *regs)
 {
 	int i;
@@ -314,10 +352,15 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
+		/*
+		 * Called before show_stack_log_lvl(), which could trigger
+		 * a page fault and re-enable LBR
+		 */
+		show_lbrs();
+
 		printk(KERN_DEFAULT "Stack:\n");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
 				   0, KERN_DEFAULT);
-
 		printk(KERN_DEFAULT "Code: ");
 
 		ip = (u8 *)regs->ip - code_prologue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb..120e989 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1035,6 +1035,42 @@ apicinterrupt IRQ_WORK_VECTOR \
 	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
+.macro STOP_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	and $~1, %eax	/* Disable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
+.macro START_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	or $1, %eax		/* Enable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
 /*
  * Exception entry points.
  */
@@ -1063,6 +1099,8 @@ ENTRY(\sym)
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 
+	STOP_LBR
+
 	.if \paranoid
 	call save_paranoid
 	.else
@@ -1094,6 +1132,8 @@ ENTRY(\sym)
 
 	call \do_sym
 
+	START_LBR
+
 	.if \shift_ist != -1
 	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
 	.endif
-- 
1.7.9.5



* Re: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-21 17:03 [PATCH] [LBR] Dump LBRs on Oops Emmanuel Berthier
@ 2014-11-22  0:50 ` Thomas Gleixner
  2014-11-26 10:56   ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-11-22  0:50 UTC (permalink / raw)
  To: Emmanuel Berthier; +Cc: mingo, hpa, x86, robert.jarzmik, linux-kernel

On Fri, 21 Nov 2014, Emmanuel Berthier wrote:

> The purpose of this patch is to stop LBR at the early stage of
> Exception Handling, and dump its content later in the dumpstack
> process.

And that's useful in what way? The changelog should not tell WHAT the
patch does. It should tell WHY it is useful and what are the
usecases/benefits. Neither does it tell how that feature can be
used/enabled/disabled and how it provides useful information.

Where is that sample output which demonstrates that this is something
which adds debugging value rather than another level of pointless
featuritis?

> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -4,7 +4,9 @@
>  #include <asm/perf_event.h>
>  #include <asm/msr.h>
>  #include <asm/insn.h>
> -
> +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> +#include <asm/processor.h>
> +#endif

We can just include that file unconditionally.

>  #include "perf_event.h"
>  
>  enum {
> @@ -135,6 +137,9 @@ static void __intel_pmu_lbr_enable(void)
>  	u64 debugctl;
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  
> +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
> +		lbr_dump_on_exception = 0;

With certain compilers you might get a surprise here, because they are
too stupid to remove that 'lbr_dump_on_exception = 0;' right
away. They kill it in the optimization phase. So they complain about
lbr_dump_on_exception not being defined.

So you need something like this:

static inline void lbr_set_dump_on_oops(bool enable)
{
#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
       ....
#endif
}

and make that

     if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
             lbr_set_dump_on_oops(false);

which is completely pointless as you can just call

      lbr_set_dump_on_oops(false);

unconditionally and be done with it.

IS_ENABLED(CONFIG_XXX) is not a proper solution for all problems. It
can avoid #ifdefs, but it also can introduce interesting nonsense.
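
A minimal sketch of that failure mode, with hypothetical names CONFIG_FOO
and foo_flag:

/* foo_flag is only defined when CONFIG_FOO=y (hypothetical) */
extern unsigned int foo_flag;

void bar(void)
{
	/*
	 * With CONFIG_FOO=n this is dead code, but the reference still
	 * has to parse and resolve; a compiler that only eliminates it
	 * late in the optimization phase (or not at all, e.g. at -O0)
	 * leaves an undefined reference to foo_flag behind.
	 */
	if (IS_ENABLED(CONFIG_FOO))
		foo_flag = 0;
}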

>  	if (cpuc->lbr_sel)
>  		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
>  
> @@ -147,6 +152,9 @@ static void __intel_pmu_lbr_disable(void)
>  {
>  	u64 debugctl;
>  
> +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
> +		lbr_dump_on_exception = 1;

Now the even more interesting question is, WHY is
lbr_dump_on_exception enabled in __intel_pmu_lbr_disable and disabled
in __intel_pmu_lbr_enable?

This obviously lacks an understandable comment, but before you
elaborate on this see the next comment.

> +void show_lbrs(void)
> +{
> +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION)) {
> +		u64 debugctl;
> +		int i, lbr_on;
> +
> +		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> +		lbr_on = debugctl & DEBUGCTLMSR_LBR;
> +
> +		pr_info("Last Branch Records:");
> +		if (!lbr_dump_on_exception) {
> +			pr_cont(" (Disabled by perf_event)\n");

So, if perf uses LBR we do not print it? What a weird design
decision. If the machine crashes, we want that information no matter
whether perf is active or not. What kind of twisted logic is that?

> +		} else if (x86_pmu.lbr_nr == 0) {
> +			pr_cont(" (x86_model unknown, check intel_pmu_init())\n");

Huch? Why do we get here if the pmu does not support it at all? Why
should we bother to print it? If it's not printed it's not
available. It's that simple.

> +		} else if (lbr_on) {
> +			pr_cont(" (not halted)\n");

Why would it not be halted? Code comments are optional, right?

> +		} else {
> +			struct cpu_hw_events *cpuc =
> +						this_cpu_ptr(&cpu_hw_events);

A simple #ifdef would have saved you an indentation level and actually
made that code readable. IS_ENABLED() is a proper hammer for some
things but not everything is a nail.

> +			intel_pmu_lbr_read_64(cpuc);
> +
> +			pr_cont("\n");
> +			for (i = 0; i < cpuc->lbr_stack.nr; i++) {
> +				pr_info("   to: [<%016llx>] ",
> +						cpuc->lbr_entries[i].to);
> +				print_symbol("%s\n", cpuc->lbr_entries[i].to);
> +				pr_info(" from: [<%016llx>] ",
> +						cpuc->lbr_entries[i].from);
> +				print_symbol("%s\n", cpuc->lbr_entries[i].from);
> +			}
> +		}
> +	}
> +}
> +
>  void show_regs(struct pt_regs *regs)
>  {
>  	int i;
> @@ -314,10 +352,15 @@ void show_regs(struct pt_regs *regs)
>  		unsigned char c;
>  		u8 *ip;
>  
> +		/*
> +		 * Called before show_stack_log_lvl() as it could trig
> +		 * page_fault and reenable LBR

Huch? The kernel stack dump is going to page fault? If that happens
then you are in deep shit anyway. I doubt that anything useful gets
out of the machine at this point, LBR or not.

Aside of that if we want to debug with the LBR then we better freeze
that whole thing across a dump and be done with it.

> +		 */
> +		show_lbrs();

> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index df088bb..120e989 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1035,6 +1035,42 @@ apicinterrupt IRQ_WORK_VECTOR \
>  	irq_work_interrupt smp_irq_work_interrupt
>  #endif
>  
> +.macro STOP_LBR
> +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> +	testl $1, lbr_dump_on_exception
> +	jz 1f
> +	push %rax
> +	push %rcx
> +	push %rdx
> +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> +	rdmsr
> +	and $~1, %eax	/* Disable LBR recording */
> +	wrmsr
> +	pop %rdx
> +	pop %rcx
> +	pop %rax
> +1:
> +#endif
> +.endm
> +
> +.macro START_LBR
> +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> +	testl $1, lbr_dump_on_exception
> +	jz 1f
> +	push %rax
> +	push %rcx
> +	push %rdx
> +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> +	rdmsr
> +	or $1, %eax		/* Enable LBR recording */

This is really brilliant. So the logic here is:

Perf uses LBR  	LBR state at 	 LBR state at	Dump
     	  	exception entry	 exception exit	  

Yes		No change	 No change	No

No		off -> off or	 off -> on	empty
		on  -> off 	 off -> on    	perhaps useful

So how does LBR get magically enabled?

   By producing fault #1 and then on the exception exit enabling
   LBR so that fault #2 can provide data?
   
   That's the only way I can see how to do that if you are not
   magically tweaking MSR_IA32_DEBUGCTLMSR from user space.

   Now that magically works, because you add that stuff into all
   exception entry/exit stubs.

   So it will be enabled magically no matter what and it will also be
   enabled unconditionally on any machine whether it supports that
   feature or not. That's the whole reason why you have no 32bit
   support for this yet.

How is that thing useful when perf uses LBR?

   Not at all. We do not gain anything. We explode and have no value
   at all.

Aside of that what is setting the proper options for LBR recording in
MSR_LBR_SELECT, if available?

   Nothing, which is useless as well, as the dump might just contain
   completely useless crap. There is a reason why haswell introduced
   LBR filtering.

So what's the point of this?

Thanks,

	tglx


* RE: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-22  0:50 ` Thomas Gleixner
@ 2014-11-26 10:56   ` Berthier, Emmanuel
  2014-11-26 13:08     ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-11-26 10:56 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: mingo, hpa, x86, Jarzmik, Robert, linux-kernel

> From: Thomas Gleixner [mailto:tglx@linutronix.de]
> Sent: Saturday, November 22, 2014 1:51 AM
> 
> On Fri, 21 Nov 2014, Emmanuel Berthier wrote:
> 
> > The purpose of this patch is to stop LBR at the early stage of 
> > Exception Handling, and dump its content later in the dumpstack 
> > process.
> 
> And that's useful in what way? The changelog should not tell WHAT the 
> patch does. It should tell WHY it is useful and what are the usecases/benefits.
> Neither does it tell how that feature can be used/enabled/disabled and 
> how it provides useful information.
> Where is that sample output which demonstrates that this is something 
> which adds debugging value rather than another level of pointless featuritis?

Oops, right. 

Let's take the case of a stack corruption:

static int corrupt_stack(void *data, u64 val)
{
	long long ptr[1];

	asm ("");
	ptr[0]=0;
	ptr[1]=0;
	ptr[2]=0;
	ptr[3]=0;

	return -1;
 }

The standard Panic will report:

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: [<          (null)>]           (null)
 PGD 48605067 PUD 0
 Oops: 0010 [#1] PREEMPT SMP
 task: ffff8800384f6300 ti: ffff880035c70000 task.ti: ffff880035c70000
 RIP: 0010:[<0000000000000000>]  [<          (null)>]           (null)
 RSP: 0018:ffff880035c71ec8  EFLAGS: 00010246
 RAX: 00000000ffffffff RBX: fffffffffffffff2 RCX: 000000000000002a
 RDX: ffff880035c71e90 RSI: 0000000000000001 RDI: 0000000000000000
 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
 R10: 000000000000000a R11: f000000000000000 R12: ffff880033be0e50
 R13: 0000000000000002 R14: 0000000000000002 R15: ffff880033be0e00
 FS:  0000000000000000(0000) GS:ffff88007ea80000(0063) knlGS:00000000f76cd280
 CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 000000003871b000 CR4: 00000000001007e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
 Stack:
  0000000000000000 00000000f802bb54 ffff880075e85680 0000000000000002
  00000000f802bb54 ffff880035c71f50 0000000000000000 ffff880035c71f38
  ffffffff821b8266 ffff880075e85680 00000000f802bb54 0000000000000002
 Call Trace:
  [<ffffffff821b8266>] ? vfs_write+0xb6/0x1c0
  [<ffffffff821b86fd>] ? SyS_write+0x4d/0x90
  [<ffffffff82817c65>] ? sysenter_dispatch+0x7/0x23
 Code:  Bad RIP value.
 RIP  [<          (null)>]           (null)
  RSP <ffff880035c71ec8>
 CR2: 0000000000000000

The purpose of this patch is to use the LBR as a small instruction trace.
The result will be:

 Last Branch Records:
  _to: [<ffffffff82810980>] page_fault+0x0/0x70
 from: [<0000000000000000>] 0x0
  _to: [<0000000000000000>] 0x0
 from: [<ffffffff8263693c>] corrupt_stack+0x3c/0x40
  _to: [<ffffffff82636900>] corrupt_stack+0x0/0x40
 from: [<ffffffff821dde6a>] simple_attr_write+0xca/0xf0
  _to: [<ffffffff821dde63>] simple_attr_write+0xc3/0xf0
 from: [<ffffffff8235387f>] simple_strtoll+0xf/0x20
  _to: [<ffffffff8235387e>] simple_strtoll+0xe/0x20
 from: [<ffffffff82351d5b>] simple_strtoull+0x4b/0x50
  _to: [<ffffffff82351d4e>] simple_strtoull+0x3e/0x50
 from: [<ffffffff82351d48>] simple_strtoull+0x38/0x50
  _to: [<ffffffff82351d3d>] simple_strtoull+0x2d/0x50
 from: [<ffffffff8235b4cb>] _parse_integer+0x9b/0xc0
  _to: [<ffffffff8235b4b0>] _parse_integer+0x80/0xc0
 from: [<ffffffff8235b497>] _parse_integer+0x67/0xc0
 
> > --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > @@ -4,7 +4,9 @@
> >  #include <asm/perf_event.h>
> >  #include <asm/msr.h>
> >  #include <asm/insn.h>
> > -
> > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > +#include <asm/processor.h>
> > +#endif
> 
> We can just include that file unconditionally.

Ok for patch 2.

> >  #include "perf_event.h"
> >
> >  enum {
> > @@ -135,6 +137,9 @@ static void __intel_pmu_lbr_enable(void)
> >  	u64 debugctl;
> >  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >
> > +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
> > +		lbr_dump_on_exception = 0;
> 
> With certain compilers you might get a surprise here, because they are 
> too stupid to remove that 'lbr_dump_on_exception = 0;' right away. 
> They kill it in the optimization phase. So they complain about 
> lbr_dump_on_exception not being defined.
> 
> So you need something like this:
> 
> static inline void lbr_set_dump_on_oops(bool enable)
> {
> #ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
>        ....
> #endif
> }
> 
> and make that
> 
>      if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
>              lbr_set_dump_on_oops(false);
> 
> which is completely pointless as you can just call
> 
>       lbr_set_dump_on_oops(false);
> 
> unconditionally and be done with it.
> 
> IS_ENABLED(CONFIG_XXX) is not a proper solution for all problems. It 
> can avoid #ifdefs, but it also can introduce interesting nonsense.

Ok for patch 2.

> >  	if (cpuc->lbr_sel)
> >  		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
> >
> > @@ -147,6 +152,9 @@ static void __intel_pmu_lbr_disable(void)
> >  {
> >  	u64 debugctl;
> >
> > +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION))
> > +		lbr_dump_on_exception = 1;
> 
> Now the even more interesting question is, WHY is 
> lbr_dump_on_exception enabled in __intel_pmu_lbr_disable and disabled 
> in __intel_pmu_lbr_enable?
> 
> This obviously lacks an understandable comment, but before you
> elaborate on this see the next comment.

Oh no, I would have liked to explain it right now ;-)

> > +void show_lbrs(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION)) {
> > +		u64 debugctl;
> > +		int i, lbr_on;
> > +
> > +		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> > +		lbr_on = debugctl & DEBUGCTLMSR_LBR;
> > +
> > +		pr_info("Last Branch Records:");
> > +		if (!lbr_dump_on_exception) {
> > +			pr_cont(" (Disabled by perf_event)\n");
> 
> So, if perf uses LBR we do not print it? What a weird design decision. 
> If the machine crashes, we want that information no matter whether 
> perf is active or not. What kind of twisted logic is that?

Ok, let me explain.
LBR usage is exclusive. Perf uses LBR to calculate some CPU statistics. I use LBR to track code execution before exceptions.
So, as soon as we enable perf, I disable LBR dump and vice versa.

> > +		} else if (x86_pmu.lbr_nr == 0) {
> > +			pr_cont(" (x86_model unknown, check intel_pmu_init())\n");
> 
> Huch? Why do we get here if the pmu does not support it at all? Why 
> should we bother to print it? If it's not printed it's not available. It's that simple.

That's a warning to point out that the current core is not supported. New cores have to be declared in
intel_pmu_init() after:

	switch (boot_cpu_data.x86_model) {
	. . . 
	case 28: /* Atom */
	case 38: /* Lincroft */
	case 39: /* Penwell */
	case 53: /* Cloverview */
	case 54: /* Cedarview */

I work on new cores and their names will not be revealed for a while.
So, the next time this feature is used on a new core, it's important to understand why it is not supported and where to make the simple update.

> > +		} else if (lbr_on) {
> > +			pr_cont(" (not halted)\n");
> 
> Why would it not be halted? Code comments are optional, right?

The LBR recording is not halted if we call panic() directly without coming from an exception.
In this case, the LBR dump is irrelevant.
I will add a comment in patch 2.

> > +		} else {
> > +			struct cpu_hw_events *cpuc =
> > +						this_cpu_ptr(&cpu_hw_events);
> 
> A simple #ifdef would have saved you an indentation level and actually 
> made that code readable. IS_ENABLED() is a proper hammer for some 
> things but not everything is a nail.

Ok, will correct that in patch 2.

> > +			intel_pmu_lbr_read_64(cpuc);
> > +
> > +			pr_cont("\n");
> > +			for (i = 0; i < cpuc->lbr_stack.nr; i++) {
> > +				pr_info("   to: [<%016llx>] ",
> > +						cpuc->lbr_entries[i].to);
> > +				print_symbol("%s\n", cpuc->lbr_entries[i].to);
> > +				pr_info(" from: [<%016llx>] ",
> > +						cpuc->lbr_entries[i].from);
> > +				print_symbol("%s\n", cpuc->lbr_entries[i].from);
> > +			}
> > +		}
> > +	}
> > +}
> > +
> >  void show_regs(struct pt_regs *regs)
> >  {
> >  	int i;
> > @@ -314,10 +352,15 @@ void show_regs(struct pt_regs *regs)
> >  		unsigned char c;
> >  		u8 *ip;
> >
> > +		/*
> > +		 * Called before show_stack_log_lvl(), which could trigger
> > +		 * a page fault and re-enable LBR
> 
> Huch? The kernel stack dump is going to page fault? If that happens 
> then you are in deep shit anyway. I doubt that anything useful gets 
> out of the machine at this point, LBR or not.
> 
> Aside of that if we want to debug with the LBR then we better freeze 
> that whole thing across a dump and be done with it.

I met that case but did not dig deeply into it...

> > +		 */
> > +		show_lbrs();
> 
> > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S 
> > index df088bb..120e989 100644
> > --- a/arch/x86/kernel/entry_64.S
> > +++ b/arch/x86/kernel/entry_64.S
> > @@ -1035,6 +1035,42 @@ apicinterrupt IRQ_WORK_VECTOR \
> >  	irq_work_interrupt smp_irq_work_interrupt
> >  #endif
> >
> > +.macro STOP_LBR
> > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > +	testl $1, lbr_dump_on_exception
> > +	jz 1f
> > +	push %rax
> > +	push %rcx
> > +	push %rdx
> > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > +	rdmsr
> > +	and $~1, %eax	/* Disable LBR recording */
> > +	wrmsr
> > +	pop %rdx
> > +	pop %rcx
> > +	pop %rax
> > +1:
> > +#endif
> > +.endm
> > +
> > +.macro START_LBR
> > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > +	testl $1, lbr_dump_on_exception
> > +	jz 1f
> > +	push %rax
> > +	push %rcx
> > +	push %rdx
> > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > +	rdmsr
> > +	or $1, %eax		/* Enable LBR recording */
> 
> This is really brilliant. So the logic here is:
> 
> Perf uses LBR  	LBR state at 	 LBR state at	Dump
>      	  	exception entry	 exception exit
> 
> Yes		No change	 No change	No
> 
> No		off -> off or	 off -> on	empty
> 		on  -> off 	 off -> on    	perhaps useful
> 
> So how does LBR get magically enabled?
> 
>    By producing fault #1 and then on the exception exit enabling
>    LBR so that fault #2 can provide data?
> 
>    That's the only way I can see how to do that if you are not
>    magically tweaking MSR_IA32_DEBUGCTLMSR from user space.
> 
>    Now that magically works, because you add that stuff into all
>    exception entry/exit stubs.
> 
>    So it will be enabled magically no matter what and it will also be
>    enabled unconditionally on any machine whether it supports that
>    feature or not. That's the whole reason why you have no 32bit
>    support for this yet.
> 
> How is that thing useful when perf uses LBR?
> 
>    Not at all. We do not gain anything. We explode and have no value
>    at all.
> 
> Aside of that what is setting the proper options for LBR recording in 
> MSR_LBR_SELECT, if available?
> 
>    Nothing, which is useless as well, as the dump might just contain
>    completely useless crap. There is a reason why haswell introduced
>    LBR filtering.
> 
> So what's the point of this?

Yes, that's a kind of magic ;-)
I've stated that LBR is incompatible with Perf.
So, by default, when Perf is not used, LBR will be enabled at the first exception (usually a simple page fault) with default filtering options, i.e. no filtering.
As soon as we start perf, the lbr_dump_on_exception global is unset and LBR start/stop are bypassed.
LBR filtering is reset during perf stop.

> Thanks,
> 
> 	tglx

Thanks a lot for your time!
Emmanuel.


* RE: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-26 10:56   ` Berthier, Emmanuel
@ 2014-11-26 13:08     ` Thomas Gleixner
  2014-11-26 14:17       ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-11-26 13:08 UTC (permalink / raw)
  To: Berthier, Emmanuel; +Cc: mingo, hpa, x86, Jarzmik, Robert, linux-kernel

On Wed, 26 Nov 2014, Berthier, Emmanuel wrote:
> > From: Thomas Gleixner [mailto:tglx@linutronix.de]
> The purpose of this patch is to use the LBR as a small instruction trace.
> The result will be:
> 
>  Last Branch Records:
>   _to: [<ffffffff82810980>] page_fault+0x0/0x70
>  from: [<0000000000000000>] 0x0
>   _to: [<0000000000000000>] 0x0
>  from: [<ffffffff8263693c>] corrupt_stack+0x3c/0x40
>   _to: [<ffffffff82636900>] corrupt_stack+0x0/0x40
>  from: [<ffffffff821dde6a>] simple_attr_write+0xca/0xf0
>   _to: [<ffffffff821dde63>] simple_attr_write+0xc3/0xf0
>  from: [<ffffffff8235387f>] simple_strtoll+0xf/0x20
>   _to: [<ffffffff8235387e>] simple_strtoll+0xe/0x20
>  from: [<ffffffff82351d5b>] simple_strtoull+0x4b/0x50
>   _to: [<ffffffff82351d4e>] simple_strtoull+0x3e/0x50
>  from: [<ffffffff82351d48>] simple_strtoull+0x38/0x50
>   _to: [<ffffffff82351d3d>] simple_strtoull+0x2d/0x50
>  from: [<ffffffff8235b4cb>] _parse_integer+0x9b/0xc0
>   _to: [<ffffffff8235b4b0>] _parse_integer+0x80/0xc0
>  from: [<ffffffff8235b497>] _parse_integer+0x67/0xc0

Ok. That could be useful indeed.
  
> > So, if perf uses LBR we do not print it? What a weird design decision. 
> > If the machine crashes, we want that information no matter whether 
> > perf is active or not. What kind of twisted logic is that?
> 
> Ok, let me explain.
> LBR usage is exclusive. Perf uses LBR to calculate some CPU
> statistics. I use LBR to track code execution before exceptions.
> So, as soon as we enable perf, I disable LBR dump and vice versa.

That wants to be documented in the code.

> > > +		} else if (x86_pmu.lbr_nr == 0) {
> > > +			pr_cont(" (x86_model unknown, check intel_pmu_init())\n");
> > 
> > Huch? Why do we get here if the pmu does not support it at all? Why 
> > should we bother to print it? If it's not printed it's not available. It's that simple.
> 
> That's a warning to point out that the current core is not
> supported. New cores have to be declared in
> 
> intel_pmu_init() after:
> 
> 	switch (boot_cpu_data.x86_model) {
> 	. . . 
> 	case 28: /* Atom */
> 	case 38: /* Lincroft */
> 	case 39: /* Penwell */
> 	case 53: /* Cloverview */
> 	case 54: /* Cedarview */
> 
> I work on new cores and their names will not be revealed for a while.
> So, the next time this feature is used on a new core, it's
> important to understand why it is not supported and where to make the
> simple update.

We add printks not for people who work on the support of unreleased
hardware. They should better know what they are doing. If they can't
figure that out they should not touch the kernel in the first place.

> > >  void show_regs(struct pt_regs *regs)
> > >  {
> > >  	int i;
> > > @@ -314,10 +352,15 @@ void show_regs(struct pt_regs *regs)
> > >  		unsigned char c;
> > >  		u8 *ip;
> > >
> > > +		/*
> > > +		 * Called before show_stack_log_lvl(), which could trigger
> > > +		 * a page fault and re-enable LBR
> > 
> > Huch? The kernel stack dump is going to page fault? If that happens 
> > then you are in deep shit anyway. I doubt that anything useful gets 
> > out of the machine at this point, LBR or not.
> > 
> > Aside of that if we want to debug with the LBR then we better freeze 
> > that whole thing across a dump and be done with it.
> 
> > I met that case but did not dig deeply into it...

Hmm, a corrupted stack might trigger this together with some of the
other debug options enabled. So we really might want to put it in front.

> > > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S 
> > > index df088bb..120e989 100644
> > > --- a/arch/x86/kernel/entry_64.S
> > > +++ b/arch/x86/kernel/entry_64.S
> > > @@ -1035,6 +1035,42 @@ apicinterrupt IRQ_WORK_VECTOR \
> > >  	irq_work_interrupt smp_irq_work_interrupt
> > >  #endif
> > >
> > > +.macro STOP_LBR
> > > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > > +	testl $1, lbr_dump_on_exception
> > > +	jz 1f
> > > +	push %rax
> > > +	push %rcx
> > > +	push %rdx
> > > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > > +	rdmsr
> > > +	and $~1, %eax	/* Disable LBR recording */
> > > +	wrmsr
> > > +	pop %rdx
> > > +	pop %rcx
> > > +	pop %rax
> > > +1:
> > > +#endif
> > > +.endm
> > > +
> > > +.macro START_LBR
> > > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > > +	testl $1, lbr_dump_on_exception
> > > +	jz 1f
> > > +	push %rax
> > > +	push %rcx
> > > +	push %rdx
> > > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > > +	rdmsr
> > > +	or $1, %eax		/* Enable LBR recording */

> So, by default, when Perf is not used, LBR will be enabled at the
> first exception (usually a simple page fault) with default filtering
>    options, i.e. no filtering.  As soon as we start perf, the
> lbr_dump_on_exception global is unset and LBR start/stop are
> bypassed.
> 
> LBR filtering is reset during perf stop.

So while I can see how this could be useful there are a few things
which need more thought:

1) We want to enable/disable this at boot time.

   In the disabled case we might also stub out the test/jz and replace
   it by an unconditional jump, but that needs more thought.

2) Right now you stop the trace on every exception no matter whether
   it comes from user or kernel space.

   Stopping the trace when we handle a user space fault does not make
   any sense and inflicts just pointless overhead.

   Aside of that if the fault handler then crashes we do not have the
   LBR information because we froze it when entering from user space
   in the first place.

Thanks,

	tglx




* RE: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-26 13:08     ` Thomas Gleixner
@ 2014-11-26 14:17       ` Berthier, Emmanuel
  2014-11-26 14:46         ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-11-26 14:17 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: mingo, hpa, x86, Jarzmik, Robert, linux-kernel

> -----Original Message-----
> From: Thomas Gleixner [mailto:tglx@linutronix.de]
> Sent: Wednesday, November 26, 2014 2:08 PM
> To: Berthier, Emmanuel
> Cc: mingo@redhat.com; hpa@zytor.com; x86@kernel.org; Jarzmik, Robert;
> linux-kernel@vger.kernel.org
> Subject: RE: [PATCH] [LBR] Dump LBRs on Oops
> 
> On Wed, 26 Nov 2014, Berthier, Emmanuel wrote:
> > > So, if perf uses LBR we do not print it? What a weird design decision.
> > > If the machine crashes, we want that information no matter whether
> > > perf is active or not. What kind of twisted logic is that?
> >
> > Ok, let me explain.
> > LBR usages are exclusive. Perf uses LBR to calculate some CPU
> > statistics. I use LBR to track code execution before Exceptions.
> > So, as soon as we enable perf, I disable LBR dump and vice versa.
> 
> That wants to be documented in the code.

Ok, will be in patch 2

> > > > +		} else if (x86_pmu.lbr_nr == 0) {
> > > > +			pr_cont(" (x86_model unknown, check intel_pmu_init())\n");
> > >
> > > Huch? Why do we get here if the pmu does not support it at all? Why
> > > should we bother to print it? If it's not printed it's not available.
> > > It's that simple.
> >
> > That's a warning to point out that the current core is not supported. New
> > cores have to be declared in
> >
> > intel_pmu_init() after:
> >
> > 	switch (boot_cpu_data.x86_model) {
> > 	. . .
> > 	case 28: /* Atom */
> > 	case 38: /* Lincroft */
> > 	case 39: /* Penwell */
> > 	case 53: /* Cloverview */
> > 	case 54: /* Cedarview */
> >
> > I work on new cores and their names will not be revealed for a while.
> > So, the next time this feature is used on a new core, it's important
> > to understand why it is not supported and where to make the simple
> > update.
> 
> We add printks not for people who work on the support of unreleased
> hardware. They should better know what they are doing. If they can't figure
> that out they should not touch the kernel in the first place.

LoL
I'm part of those people, I've touched the kernel and I've figured out what was wrong.
And I would like to be helped next year for the next Core: I'm an old man and I need
to leave a white stone trail  ;-)
Could we agree on that one?

> > > >  void show_regs(struct pt_regs *regs)
> > > >  {
> > > >  	int i;
> > > > @@ -314,10 +352,15 @@ void show_regs(struct pt_regs *regs)
> > > >  		unsigned char c;
> > > >  		u8 *ip;
> > > >
> > > > +		/*
> > > > +		 * Called before show_stack_log_lvl(), which could trigger
> > > > +		 * a page fault and re-enable LBR
> > >
> > > Huch? The kernel stack dump is going to page fault? If that happens
> > > then you are in deep shit anyway. I doubt that anything useful gets
> > > out of the machine at this point, LBR or not.
> > >
> > > Aside of that if we want to debug with the LBR then we better freeze
> > > that whole thing across a dump and be done with it.
> >
> > I met that case but did not dig deeply into it...
> 
> Hmm, a corrupted stack might trigger this together with some of the other
> debug options enabled. So we really might want to put it in front.

I didn't catch that. Could you elaborate?

> > > > diff --git a/arch/x86/kernel/entry_64.S
> > > > b/arch/x86/kernel/entry_64.S index df088bb..120e989 100644
> > > > --- a/arch/x86/kernel/entry_64.S
> > > > +++ b/arch/x86/kernel/entry_64.S
> > > > @@ -1035,6 +1035,42 @@ apicinterrupt IRQ_WORK_VECTOR \
> > > >  	irq_work_interrupt smp_irq_work_interrupt
> > > >  #endif
> > > >
> > > > +.macro STOP_LBR
> > > > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > > > +	testl $1, lbr_dump_on_exception
> > > > +	jz 1f
> > > > +	push %rax
> > > > +	push %rcx
> > > > +	push %rdx
> > > > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > > > +	rdmsr
> > > > +	and $~1, %eax	/* Disable LBR recording */
> > > > +	wrmsr
> > > > +	pop %rdx
> > > > +	pop %rcx
> > > > +	pop %rax
> > > > +1:
> > > > +#endif
> > > > +.endm
> > > > +
> > > > +.macro START_LBR
> > > > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > > > +	testl $1, lbr_dump_on_exception
> > > > +	jz 1f
> > > > +	push %rax
> > > > +	push %rcx
> > > > +	push %rdx
> > > > +	movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > > > +	rdmsr
> > > > +	or $1, %eax		/* Enable LBR recording */
> 
> > So, by default, when Perf is not used, LBR will be enabled at the
> > first exception (usually a simple page fault) with default filtering
> > options, i.e no filtering.  As soon as we start perf, the
> > lbr_dump_on_exception global is unset and LBR start/stop are bypassed.
> >
> > LBR filtering is reset during perf stop.
> 
> So while I can see how this could be useful there are a few things which need
> more thought:
> 
> 1) We want to enable/disable this at boot time.
> 
>    In the disabled case we might also stub out the test/jz and replace
>    it by an unconditional jump, but that needs more thought.

I can add a cmdline option to disable it at boot time.
Do you also propose to use code instruction patching (same as ftrace)?
Is it really worth bypassing the test/jz, as page fault handling is much more than a few instructions?
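
For reference, the kernel already has a code-patching primitive for exactly
this on/off pattern: static keys (jump labels). A rough, illustrative sketch
of the C side (not the posted patch; whether early_param runs early enough
for jump-label patching is one of the details to check, and the entry_64.S
side would still need its own patching):

#include <linux/init.h>
#include <linux/jump_label.h>

/* off by default; flipped once from the cmdline */
static struct static_key lbr_dump_key = STATIC_KEY_INIT_FALSE;

static int __init lbr_dump_setup(char *str)
{
	static_key_slow_inc(&lbr_dump_key);	/* patches all branch sites */
	return 0;
}
early_param("lbr_dump_on_exception", lbr_dump_setup);

void show_lbrs_if_enabled(void)
{
	/* compiled as a nop when the key is off, so disabled cost is ~zero */
	if (static_key_false(&lbr_dump_key))
		show_lbrs();
}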

> 2) Right now you stop the trace on every exception no matter whether
>    it comes from user or kernel space.
> 
>    Stopping the trace when we handle a user space fault does not make
>    any sense and inflicts just pointless overhead.
>
>    Aside of that if the fault handler then crashes we do not have the
>    LBR information because we froze it when entering from user space
>    in the first place.

Agree, but the LBR buffer contains only 8 records: we have to stop it as soon as possible.
If we add some test/jump/call before stopping it, relevant info will be flushed out.

Thanks.

> Thanks,
> 
> 	tglx
> 



* RE: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-26 14:17       ` Berthier, Emmanuel
@ 2014-11-26 14:46         ` Thomas Gleixner
  2014-11-26 15:43           ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-11-26 14:46 UTC (permalink / raw)
  To: Berthier, Emmanuel; +Cc: mingo, hpa, x86, Jarzmik, Robert, linux-kernel

On Wed, 26 Nov 2014, Berthier, Emmanuel wrote:
> > We add printks not for people who work on the support of unreleased
> > hardware. They should better know what they are doing. If they can't figure
> > that out they should not touch the kernel in the first place.
> 
> LoL
> I'm part of those people, I've touched the kernel and I've figured out what was wrong.
> And I would like to be helped next year for the next Core: I'm an old man and I need
> to leave a white stone trail  ;-)
> Could we agree on that one?

We already have a printk in init_intel_pmu() where we tell about the
'unidentified cpu', so we better extend that instead of having
something dependent on an OOPS.
 
> > > > Aside of that if we want to debug with the LBR then we better freeze
> > > > that whole thing across a dump and be done with it.
> > >
> > > I met that case but did not dig deeply into it...
> > 
> > Hmm, a corrupted stack might trigger this together with some of the other
> > debug options enabled. So we really might want to put it in front.
> 
> Didn't catch you. Could you elaborate on that?

Assume a stack corruption, so the stack dumper follows it w/o noticing
and hits an unmapped page. So that would be an argument to move the
LBR print out ahead of the stack dump.
 
> > 1) We want to enable/disable this at boot time.
> > 
> >    In the disabled case we might also stub out the test/jz and replace
> >    it by an unconditional jump, but that needs more thought.
> 
> I can add a cmdline option to disable it at boot time.

Enable. Should be disabled by default I think.

> > Do you also propose to use code instruction patching (same as
> > ftrace)?  Is it really worth bypassing the test/jz, as page fault
> > handling is much more than a few instructions?

That's why I said: but that needs more thought.

Though OTOH we keep adding stuff there and if we want to enable that
LBR feature more widely we should think about keeping the overhead low
if it is disabled.

We can discuss this after we have an agreed-on patch for that feature.

> > 2) Right now you stop the trace on every exception no matter whether
> >    it comes from user or kernel space.
> > 
> >    Stopping the trace when we handle a user space fault does not make
> >    any sense and inflicts just pointless overhead.
> >
> >    Aside of that if the fault handler then crashes we do not have the
> >    LBR information because we froze it when entering from user space
> >    in the first place.
> 
> Agree, but the LBR buffer contains only 8 records: we have to stop
> it as soon as possible.  If we add some test/jump/call before
> stopping it, relevant info will be flushed out.

Well, you can certainly test that w/o a jump. Hint:

      if (enabled && is_kernel)
      	 goto x;

can be written in ASM with a single branch as well :)

That adds more instructions before the jz, which might in fact make
the code patching for the disabled case more interesting.
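
One way to read that hint (an illustrative sketch only: lbr_dump_disabled is
a hypothetical inverse-sense flag that is 0 when dumping is enabled, so that
OR-ing it with the CPL bits of the saved CS folds both tests into one jump;
the MSR sequence is the one from the posted macro):

	push	%rax
	movl	CS+16(%rsp), %eax	/* CS+8, plus 8 for the push above */
	andl	$3, %eax		/* CPL bits: 0 iff we came from the kernel */
	orl	lbr_dump_disabled, %eax	/* 0 iff the dump is enabled */
	jnz	1f			/* single conditional branch */
	push	%rcx
	push	%rdx
	movl	$MSR_IA32_DEBUGCTLMSR, %ecx
	rdmsr
	and	$~1, %eax		/* disable LBR recording */
	wrmsr
	pop	%rdx
	pop	%rcx
1:	pop	%rax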

Thanks,

	tglx


* RE: [PATCH] [LBR] Dump LBRs on Oops
  2014-11-26 14:46         ` Thomas Gleixner
@ 2014-11-26 15:43           ` Berthier, Emmanuel
  2014-11-27 14:40             ` [PATCH v2] [LBR] Dump LBRs on Exception Emmanuel Berthier
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-11-26 15:43 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: mingo, hpa, x86, Jarzmik, Robert, linux-kernel

> -----Original Message-----
> From: Thomas Gleixner [mailto:tglx@linutronix.de]
> Sent: Wednesday, November 26, 2014 3:47 PM
> To: Berthier, Emmanuel
> Cc: mingo@redhat.com; hpa@zytor.com; x86@kernel.org; Jarzmik, Robert;
> linux-kernel@vger.kernel.org
> Subject: RE: [PATCH] [LBR] Dump LBRs on Oops
> 
> On Wed, 26 Nov 2014, Berthier, Emmanuel wrote:
> > > We add printks not for people who work on the support of unreleased
> > > hardware. They should better know what they are doing. If they can't
> > > figure that out they should not touch the kernel in the first place.
> >
> > LoL
> > I'm part of those people, I've touched the kernel and I've figured out what
> was wrong.
> > And I would like to be helped next year for the next Core: I'm an old
> > man and I need to leave a white stone trail  ;-) Could we agree on
> > that one?
> 
> We already have a printk in init_intel_pmu() where we tell about the
> 'unidentified cpu', so we better extend that instead of having something
> dependent on an OOPS.

That's right, I have a patch for that also.

> > > > > Aside of that if we want to debug with the LBR then we better
> > > > > freeze that whole thing across a dump and be done with it.
> > > >
> > > > I met that case but did not dig deeply into it...
> > >
> > > Hmm, a corrupted stack might trigger this together with some of the
> > > other debug options enabled. So we really might to put it in front.
> >
> > I didn't catch that. Could you elaborate?
> 
> Assume a stack corruption, so the stack dumper follows it w/o noticing and
> hits an unmapped page. So that would be an argument to move the LBR print
> out ahead of the stack dump.

Ok, so no need to change anything here?

> > > 1) We want to enable/disable this at boot time.
> > >
> > >    In the disabled case we might also stub out the test/jz and replace
> > >    it by an unconditional jump, but that needs more thought.
> >
> > I can add a cmdline option to disable it at boot time.
> 
> Enable. Should be disabled by default I think.

ok

> > Do you also propose to use code instruction patching (same as ftrace)?
> > Is it really worth bypassing the test/jz, as page fault handling is much
> > more than a few instructions?
> 
> That's why I said: but that needs more thought.
> 
> Though OTOH we keep adding stuff there and if we want to enable that LBR
> feature more widely we should think about keeping the overhead low if it is
> disabled.
> 
> We can discuss this after we have an agreed-on patch for that feature.

Ok

> > > 2) Right now you stop the trace on every exception no matter whether
> > >    it comes from user or kernel space.
> > >
> > >    Stopping the trace when we handle a user space fault does not make
> > >    any sense and inflicts just pointless overhead.
> > >
> > >    Aside of that if the fault handler then crashes we do not have the
> > >    LBR information because we froze it when entering from user space
> > >    in the first place.
> >
> > Agree, but the LBR buffer contains only 8 records: we have to stop it
> > as soon as possible.  If we add some test/jump/call before stopping
> > it, relevant info will be flushed out.
> 
> Well, you can certainly test that w/o a jump. Hint:
> 
>       if (enabled && is_kernel)
>       	 goto x;
> 
> can be written in ASM with a single branch as well :)
> 
> That adds more instructions before the jz, which might in fact make the code
> patching for the disabled case more interesting.

I will keep it small, trust me! ;-)


* [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-26 15:43           ` Berthier, Emmanuel
@ 2014-11-27 14:40             ` Emmanuel Berthier
  2014-11-27 21:22               ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Emmanuel Berthier @ 2014-11-27 14:40 UTC (permalink / raw)
  To: tglx, mingo, hpa, x86; +Cc: robert.jarzmik, emmanuel.berthier, linux-kernel

There are some cases where the call stack and register dump are not enough
to debug a panic.
Let's take the case of a stack corruption:

 static int corrupt_stack(void *data, u64 val)
 {
 long long ptr[1];

	asm ("");
	ptr[0]=0;
	ptr[1]=0;
	ptr[2]=0;
	ptr[3]=0;

	return -1;
 }

The standard Panic will report:

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: [<          (null)>]           (null)
 PGD 48605067 PUD 0
 Oops: 0010 [#1] PREEMPT SMP
 task: ffff8800384f6300 ti: ffff880035c70000 task.ti: ffff880035c70000
 RIP: 0010:[<0000000000000000>]  [<          (null)>]           (null)
 RSP: 0018:ffff880035c71ec8  EFLAGS: 00010246
 RAX: 00000000ffffffff RBX: fffffffffffffff2 RCX: 000000000000002a
 RDX: ffff880035c71e90 RSI: 0000000000000001 RDI: 0000000000000000
 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
 R10: 000000000000000a R11: f000000000000000 R12: ffff880033be0e50
 R13: 0000000000000002 R14: 0000000000000002 R15: ffff880033be0e00
 FS:  0000000000000000(0000) GS:ffff88007ea80000(0063) knlGS:00000000f76cd280
 CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 000000003871b000 CR4: 00000000001007e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
 Stack:
  0000000000000000 00000000f802bb54 ffff880075e85680 0000000000000002
  00000000f802bb54 ffff880035c71f50 0000000000000000 ffff880035c71f38
  ffffffff821b8266 ffff880075e85680 00000000f802bb54 0000000000000002
 Call Trace:
  [<ffffffff821b8266>] ? vfs_write+0xb6/0x1c0
  [<ffffffff821b86fd>] ? SyS_write+0x4d/0x90
  [<ffffffff82817c65>] ? sysenter_dispatch+0x7/0x23
 Code:  Bad RIP value.
 RIP  [<          (null)>]           (null)
  RSP <ffff880035c71ec8>
 CR2: 0000000000000000

The purpose of this patch is to use the LBR as a small instruction trace.
The result will be:

 Last Branch Records:
  _to: [<ffffffff82810980>] page_fault+0x0/0x70
 from: [<0000000000000000>] 0x0
  _to: [<0000000000000000>] 0x0
 from: [<ffffffff8263693c>] corrupt_stack+0x3c/0x40
  _to: [<ffffffff82636900>] corrupt_stack+0x0/0x40
 from: [<ffffffff821dde6a>] simple_attr_write+0xca/0xf0
  _to: [<ffffffff821dde63>] simple_attr_write+0xc3/0xf0
 from: [<ffffffff8235387f>] simple_strtoll+0xf/0x20
  _to: [<ffffffff8235387e>] simple_strtoll+0xe/0x20
 from: [<ffffffff82351d5b>] simple_strtoull+0x4b/0x50
  _to: [<ffffffff82351d4e>] simple_strtoull+0x3e/0x50
 from: [<ffffffff82351d48>] simple_strtoull+0x38/0x50
  _to: [<ffffffff82351d3d>] simple_strtoull+0x2d/0x50
 from: [<ffffffff8235b4cb>] _parse_integer+0x9b/0xc0
  _to: [<ffffffff8235b4b0>] _parse_integer+0x80/0xc0
 from: [<ffffffff8235b497>] _parse_integer+0x67/0xc0

Signed-off-by: Emmanuel Berthier <emmanuel.berthier@intel.com>
---
since v1: took into account Thomas's comments.
          ready for the next round of review.
---
 arch/x86/Kconfig.debug                     |   11 ++++++
 arch/x86/include/asm/processor.h           |    1 +
 arch/x86/kernel/cpu/perf_event.h           |    2 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |   41 ++++++++++++++++++++--
 arch/x86/kernel/dumpstack_64.c             |   52 ++++++++++++++++++++++++++--
 arch/x86/kernel/entry_64.S                 |   44 +++++++++++++++++++++++
 6 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 61bd2ad..a571d40 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -323,4 +323,15 @@ config X86_DEBUG_STATIC_CPU_HAS
 
 	  If unsure, say N.
 
+config LBR_DUMP_ON_EXCEPTION
+	bool "Dump Last Branch Records on Exception"
+	depends on DEBUG_KERNEL && X86_64
+	---help---
+	  Enabling this option turns on LBR dump during exceptions.
+	  This provides a small "last instructions before exception" trace.
+
+	  Add the 'lbr_dump_on_exception' cmdline option to actually enable it.
+
+	  This might help diagnose exceptions generated by stack corruption.
+
 endmenu
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eb71ec7..0c3ed67 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -462,6 +462,7 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
 extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
+extern unsigned int lbr_dump_on_exception;
 
 struct perf_event;
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fc5eb39..ed9de7f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -731,6 +731,8 @@ void intel_pmu_lbr_enable_all(void);
 
 void intel_pmu_lbr_disable_all(void);
 
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+
 void intel_pmu_lbr_read(void);
 
 void intel_pmu_lbr_init_core(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 45fa730..0a69365 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -4,7 +4,7 @@
 #include <asm/perf_event.h>
 #include <asm/msr.h>
 #include <asm/insn.h>
-
+#include <asm/processor.h>
 #include "perf_event.h"
 
 enum {
@@ -130,11 +130,46 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+/*
+ * LBR usage is exclusive, so we need to disable the "LBR dump on exception"
+ * feature while Perf is using it
+ */
+unsigned int lbr_dump_on_exception;
+static bool lbr_used_by_perf;
+static bool lbr_dump_enabled_by_cmdline;
+
+static inline void lbr_update_dump_on_exception(void)
+{
+	lbr_dump_on_exception = !lbr_used_by_perf &&
+				lbr_dump_enabled_by_cmdline;
+}
+
+static int __init lbr_dump_on_exception_setup(char *str)
+{
+	lbr_dump_enabled_by_cmdline = true;
+	lbr_update_dump_on_exception();
+
+	return 0;
+}
+early_param("lbr_dump_on_exception", lbr_dump_on_exception_setup);
+#endif
+
+static inline void lbr_set_used_by_perf(bool used)
+{
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	lbr_used_by_perf = used;
+	lbr_update_dump_on_exception();
+#endif
+}
+
 static void __intel_pmu_lbr_enable(void)
 {
 	u64 debugctl;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
+	lbr_set_used_by_perf(true);
+
 	if (cpuc->lbr_sel)
 		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
 
@@ -147,6 +182,8 @@ static void __intel_pmu_lbr_disable(void)
 {
 	u64 debugctl;
 
+	lbr_set_used_by_perf(false);
+
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -278,7 +315,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	int lbr_format = x86_pmu.intel_cap.lbr_format;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1abcb50..9ff358b 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -15,7 +15,10 @@
 #include <linux/nmi.h>
 
 #include <asm/stacktrace.h>
-
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+#include <asm/processor.h>
+#include "cpu/perf_event.h"
+#endif
 
 #define N_EXCEPTION_STACKS_END \
 		(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
@@ -295,6 +298,46 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
+void show_lbrs(void)
+{
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	u64 debugctl;
+	int i, lbr_on;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	lbr_on = debugctl & DEBUGCTLMSR_LBR;
+
+	pr_info("Last Branch Records:");
+	if (!lbr_dump_on_exception) {
+		/*
+		 * Not enabled on the cmdline,
+		 * or in use by Perf (usage is exclusive)
+		 */
+		pr_cont(" (disabled)\n");
+	} else if (x86_pmu.lbr_nr == 0) {
+		/* new core: need to declare it in intel_pmu_init() */
+		pr_cont(" (x86_model unknown)\n");
+	} else if (lbr_on) {
+		/* LBR is irrelevant in case of a simple panic */
+		pr_cont(" (no exception)\n");
+	} else {
+		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+		intel_pmu_lbr_read_64(cpuc);
+
+		pr_cont("\n");
+		for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+			pr_info("   to: [<%016llx>] ",
+				cpuc->lbr_entries[i].to);
+			print_symbol("%s\n", cpuc->lbr_entries[i].to);
+			pr_info(" from: [<%016llx>] ",
+				cpuc->lbr_entries[i].from);
+			print_symbol("%s\n", cpuc->lbr_entries[i].from);
+		}
+	}
+#endif
+}
+
 void show_regs(struct pt_regs *regs)
 {
 	int i;
@@ -314,10 +357,15 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
+		/*
+		 * Called before show_stack_log_lvl(), which could trigger
+		 * a page fault and re-enable LBR
+		 */
+		show_lbrs();
+
 		printk(KERN_DEFAULT "Stack:\n");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
 				   0, KERN_DEFAULT);
-
 		printk(KERN_DEFAULT "Code: ");
 
 		ip = (u8 *)regs->ip - code_prologue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb..f39cded 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1035,6 +1035,46 @@ apicinterrupt IRQ_WORK_VECTOR \
 	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
+.macro STOP_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $3,CS+8(%rsp)		/* Kernel Space? */
+	jz 1f
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	and $~1, %eax			/* Disable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
+.macro START_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $3,CS+8(%rsp)		/* Kernel Space? */
+	jz 1f
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	or $1, %eax			/* Enable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
 /*
  * Exception entry points.
  */
@@ -1063,6 +1103,8 @@ ENTRY(\sym)
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 
+	STOP_LBR
+
 	.if \paranoid
 	call save_paranoid
 	.else
@@ -1094,6 +1136,8 @@ ENTRY(\sym)
 
 	call \do_sym
 
+	START_LBR
+
 	.if \shift_ist != -1
 	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
 	.endif
-- 
1.7.9.5



* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-27 14:40             ` [PATCH v2] [LBR] Dump LBRs on Exception Emmanuel Berthier
@ 2014-11-27 21:22               ` Thomas Gleixner
  2014-11-27 21:56                 ` Andy Lutomirski
  2014-11-28 10:28                 ` Berthier, Emmanuel
  0 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-11-27 21:22 UTC (permalink / raw)
  To: Emmanuel Berthier
  Cc: H. Peter Anvin, x86, robert.jarzmik, LKML, Andy Lutomirski

On Thu, 27 Nov 2014, Emmanuel Berthier wrote:
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 45fa730..0a69365 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -4,7 +4,7 @@
>  #include <asm/perf_event.h>
>  #include <asm/msr.h>
>  #include <asm/insn.h>
> -

This newline is intentional, to separate the asm includes from the
local one.

>  static void __intel_pmu_lbr_enable(void)
>  {
>  	u64 debugctl;
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  
> +	lbr_set_used_by_perf(true);

This cannot work.

CPU0					CPU1

__intel_pmu_lbr_enable()
   lbr_set_used_by_perf(true);
					__intel_pmu_lbr_disable()
					  lbr_set_used_by_perf(false);

This is a per cpu property.

And there is more to that. Let's look at a single CPU.

lbr for oops is enabled

context switch()
   __intel_pmu_lbr_enable()	-> LBR used by perf, oops dumper disabled

context switch()
   __intel_pmu_lbr_disable()	-> LBR no longer used by perf, oops
   				   dumper enabled

So after that context switch we crash in the kernel and the LBR is
empty because we disabled it at the context switch.

So you need per cpu state, which handles the LBR dumper state:

#define LBR_OOPS_DISABLED 0x01
#define LBR_PERF_USAGE	  0x02

DEFINE_PER_CPU(unsigned long, lbr_dump_state) = LBR_OOPS_DISABLED;

lbr_perf_enable()
	this_cpu_add(lbr_dump_state, LBR_PERF_USAGE);

lbr_perf_disable()
	if (!this_cpu_sub_return(lbr_dump_state, LBR_PERF_USAGE))
	   	enable_lbr_oops();

Now of course you need to handle this in the exception path per cpu as
well.
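
The dumper side then checks the same per cpu state before touching the
MSR, i.e. something like this (untested sketch, lbr_oops_stop() is a
made up name):

static inline void lbr_oops_stop(void)
{
	/* 0 means: no perf user and the oops dumper armed on this cpu */
	if (!this_cpu_read(lbr_dump_state))
		__intel_pmu_lbr_disable();
}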

>  /*
>   * Exception entry points.
>   */
> @@ -1063,6 +1103,8 @@ ENTRY(\sym)
>  	subq $ORIG_RAX-R15, %rsp
>  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>  
> +	STOP_LBR

We really cannot do this unconditionally for every exception. This
wants to be conditional, i.e.

       .if \stop_lbr
       cond_stop_lbr
       .endif

So we can select which exceptions actually get that treatment.
do_page_fault is probably the only one which is interesting
here.
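
E.g. (untested, on top of the existing idtentry macro):

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 stop_lbr=0
	...
	.if \stop_lbr
	STOP_LBR
	.endif
	...
.endm

idtentry page_fault do_page_fault has_error_code=1 stop_lbr=1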

Now looking at your macro maze, I really wonder whether we can do it a
little bit less convoluted. We need to push/pop registers. error_entry
saves the registers already and has a (admitedly convoluted)
kernel/user space check. But we might be able to do something sane
there. Cc'ing Andy as he is the master of that universe.

Thanks,

	tglx




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-27 21:22               ` Thomas Gleixner
@ 2014-11-27 21:56                 ` Andy Lutomirski
  2014-11-28  8:44                   ` Berthier, Emmanuel
  2014-11-28 10:28                 ` Berthier, Emmanuel
  1 sibling, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-11-27 21:56 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Emmanuel Berthier, H. Peter Anvin, X86 ML, robert.jarzmik, LKML

On Thu, Nov 27, 2014 at 1:22 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Thu, 27 Nov 2014, Emmanuel Berthier wrote:
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> index 45fa730..0a69365 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
>> @@ -4,7 +4,7 @@
>>  #include <asm/perf_event.h>
>>  #include <asm/msr.h>
>>  #include <asm/insn.h>
>> -
>
> This newline is intentional, to separate the asm includes from the
> local one.
>
>>  static void __intel_pmu_lbr_enable(void)
>>  {
>>       u64 debugctl;
>>       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>>
>> +     lbr_set_used_by_perf(true);
>
> This cannot work.
>
> CPU0                                    CPU1
>
> __intel_pmu_lbr_enable()
>    lbr_set_used_by_perf(true);
>                                         __intel_pmu_lbr_disable()
>                                           lbr_set_used_by_perf(false);
>
> This is a per cpu property.
>
> And there is more to that. Let's look at a single CPU.
>
> lbr for oops is enabled
>
> context switch()
>    __intel_pmu_lbr_enable()     -> LBR used by perf, oops dumper disabled
>
> context switch()
> __intel_pmu_lbr_disable()    -> LBR no longer used by perf, oops
>                                    dumper enabled
>
> So after that context switch we crash in the kernel and the LBR is
> empty because we disabled it at the context switch.
>
> So you need per cpu state, which handles the LBR dumper state:
>
> #define LBR_OOPS_DISABLED 0x01
> #define LBR_PERF_USAGE    0x02
>
> DEFINE_PER_CPU(unsigned long, lbr_dump_state) = LBR_OOPS_DISABLED;
>
> lbr_perf_enable()
>         this_cpu_add(lbr_dump_state, LBR_PERF_USAGE);
>
> lbr_perf_disable()
>         if (!this_cpu_sub_return(lbr_dump_state, LBR_PERF_USAGE))
>                 enable_lbr_oops();
>
> Now of course you need to handle this in the exception path per cpu as
> well.
>
>>  /*
>>   * Exception entry points.
>>   */
>> @@ -1063,6 +1103,8 @@ ENTRY(\sym)
>>       subq $ORIG_RAX-R15, %rsp
>>       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>>
>> +     STOP_LBR
>
> We really cannot do this unconditionally for every exception. This
> wants to be conditional, i.e.
>
>        .if \stop_lbr
>        cond_stop_lbr
>        .endif
>
> So we can select which exceptions actually get that treatment.
> do_page_fault is probably the only one which is interesting
> here.
>
> Now looking at your macro maze, I really wonder whether we can do it a
> little bit less convoluted. We need to push/pop registers. error_entry
> saves the registers already and has a (admitedly convoluted)
> kernel/user space check. But we might be able to do something sane
> there. Cc'ing Andy as he is the master of that universe.
>

Can one of you give me some context as to what this code is intended
to do?  I haven't followed the thread.

In particular, knowing why this needs to be in asm instead of in C
would be nice, because asm in entry_64.S has an amazing ability to
have little bugs hiding for years.

There's also the caveat that, especially for the IST exceptions, you're
running in a weird context in which lots of things that are usually
safe are verboten.  Page faults can be tricky too, though.

--Andy

> Thanks,
>
>         tglx
>
>
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-27 21:56                 ` Andy Lutomirski
@ 2014-11-28  8:44                   ` Berthier, Emmanuel
  2014-11-28 15:15                     ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-11-28  8:44 UTC (permalink / raw)
  To: Andy Lutomirski, Thomas Gleixner
  Cc: H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

> -----Original Message-----
> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Thursday, November 27, 2014 10:56 PM
> To: Thomas Gleixner
> Cc: Berthier, Emmanuel; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> 
> On Thu, Nov 27, 2014 at 1:22 PM, Thomas Gleixner <tglx@linutronix.de>
> wrote:
> >>  /*
> >>   * Exception entry points.
> >>   */
> >> @@ -1063,6 +1103,8 @@ ENTRY(\sym)
> >>       subq $ORIG_RAX-R15, %rsp
> >>       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
> >>
> >> +     STOP_LBR
> >
> > We really cannot do this unconditionally for every exception. This
> > wants to be conditional, i.e.
> >
> >        .if \stop_lbr
> >        cond_stop_lbr
> >        .endif
> >
> > So we can select which exceptions actually get that treatment.
> > do_page_fault is probably the only one which is interesting here.
> >
> > Now looking at your macro maze, I really wonder whether we can do it a
> > little bit less convoluted. We need to push/pop registers. error_entry
> > saves the registers already and has a (admittedly convoluted)
> > kernel/user space check. But we might be able to do something sane
> > there. Cc'ing Andy as he is the master of that universe.
> >
> 
> Can one of you give me some context as to what this code is intended to do?
> I haven't followed the thread.
> 
> In particular, knowing why this needs to be in asm instead of in C would be
> nice, because asm in entry_64.S has an amazing ability to have little bugs
> hiding for years.
> 
> There's also the caveat that, especially for the IST exceptions, you're running
> in a weird context in which lots of things that are usually safe are verboten.
> Page faults can be tricky too, though.
> 
> --Andy

Welcome Andy.
The global purpose of this patch is to disable/enable the LBR during exception handling and dump its content later in the panic process, in order to get a small instruction trace which can help in case of stack corruption, for example.
This has to be done at the very early stage of exception handling, as the LBR holds only a few records (8 or 16, so even a handful of taken branches in the entry path would overwrite a sizeable part of the trace) and we do not want to flush the useful ones (those from before the exception); the entry code should therefore avoid executing any jump/branch/call before stopping the LBR.

The proposed patch regarding asm code is as follow:

 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb..f39cded 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1035,6 +1035,46 @@ apicinterrupt IRQ_WORK_VECTOR \
 	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
+.macro STOP_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $3,CS+8(%rsp)		/* Kernel Space? */
+	jz 1f
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	and $~1, %eax			/* Disable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
+.macro START_LBR
+#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
+	testl $3,CS+8(%rsp)		/* Kernel Space? */
+	jz 1f
+	testl $1, lbr_dump_on_exception
+	jz 1f
+	push %rax
+	push %rcx
+	push %rdx
+	movl $MSR_IA32_DEBUGCTLMSR, %ecx
+	rdmsr
+	or $1, %eax			/* Enable LBR recording */
+	wrmsr
+	pop %rdx
+	pop %rcx
+	pop %rax
+1:
+#endif
+.endm
+
 /*
  * Exception entry points.
  */
@@ -1063,6 +1103,8 @@ ENTRY(\sym)
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 
+	STOP_LBR
+
 	.if \paranoid
 	call save_paranoid
 	.else
@@ -1094,6 +1136,8 @@ ENTRY(\sym)
 
 	call \do_sym
 
+	START_LBR
+
 	.if \shift_ist != -1
 	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
 	.endif
--
1.7.9.5

Thanks,
Emmanuel.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-27 21:22               ` Thomas Gleixner
  2014-11-27 21:56                 ` Andy Lutomirski
@ 2014-11-28 10:28                 ` Berthier, Emmanuel
  1 sibling, 0 replies; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-11-28 10:28 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: H. Peter Anvin, x86, Jarzmik, Robert, LKML, Andy Lutomirski

> From: Thomas Gleixner [mailto:tglx@linutronix.de]
> Sent: Thursday, November 27, 2014 10:23 PM
> To: Berthier, Emmanuel
> Cc: H. Peter Anvin; x86@kernel.org; Jarzmik, Robert; LKML; Andy Lutomirski
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> 
> On Thu, 27 Nov 2014, Emmanuel Berthier wrote:
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > index 45fa730..0a69365 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> > @@ -4,7 +4,7 @@
> >  #include <asm/perf_event.h>
> >  #include <asm/msr.h>
> >  #include <asm/insn.h>
> > -
> 
> This newline is intentional, to separate the asm includes from the local one.

Got it.

>  static void __intel_pmu_lbr_enable(void)
>  {
> >  	u64 debugctl;
> >  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> >
> > +	lbr_set_used_by_perf(true);
> 
> This cannot work.
> 
> CPU0					CPU1
> 
> __intel_pmu_lbr_enable()
>    lbr_set_used_by_perf(true);
> 					__intel_pmu_lbr_disable()
> 					  lbr_set_used_by_perf(false);
> 
> This is a per cpu property.
> 
> And there is more to that. Let's look at a single CPU.
> 
> lbr for oops is enabled
> 
> context switch()
>    __intel_pmu_lbr_enable()	-> LBR used by perf, oops dumper disabled
> 
> context switch()
>    __intel_pmu_lbr_disable()	-> LBR no longer used by perf, oops
>    				   dumper enabled
> 
> So after that context switch we crash in the kernel and the LBR is empty
> because we disabled it at the context switch.
> 
> So you need per cpu state, which handles the LBR dumper state:
> 
> #define LBR_OOPS_DISABLED 0x01
> #define LBR_PERF_USAGE	  0x02
> 
> DEFINE_PER_CPU(unsigned long, lbr_dump_state) = LBR_OOPS_DISABLED;
> 
> lbr_perf_enable()
> 	this_cpu_add(lbr_dump_state, LBR_PERF_USAGE);
> 
> lbr_perf_disable()
> 	if (!this_cpu_sub_return(lbr_dump_state, LBR_PERF_USAGE))
> 	   	enable_lbr_oops();
> 
> Now of course you need to handle this in the exception path per cpu as well.

Agree, I will do that.

Thx.
Emmanuel.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-28  8:44                   ` Berthier, Emmanuel
@ 2014-11-28 15:15                     ` Andy Lutomirski
  2014-12-02 19:09                       ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-11-28 15:15 UTC (permalink / raw)
  To: Berthier, Emmanuel
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Fri, Nov 28, 2014 at 12:44 AM, Berthier, Emmanuel
<emmanuel.berthier@intel.com> wrote:
>> -----Original Message-----
>> From: Andy Lutomirski [mailto:luto@amacapital.net]
>> Sent: Thursday, November 27, 2014 10:56 PM
>> To: Thomas Gleixner
>> Cc: Berthier, Emmanuel; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
>> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
>>
>> On Thu, Nov 27, 2014 at 1:22 PM, Thomas Gleixner <tglx@linutronix.de>
>> wrote:
>> >>  /*
>> >>   * Exception entry points.
>> >>   */
>> >> @@ -1063,6 +1103,8 @@ ENTRY(\sym)
>> >>       subq $ORIG_RAX-R15, %rsp
>> >>       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>> >>
>> >> +     STOP_LBR
>> >
>> > We really cannot do this unconditionally for every exception. This
>> > wants to be conditional, i.e.
>> >
>> >        .if \stop_lbr
>> >        cond_stop_lbr
>> >        .endif
>> >
>> > So we can select which exceptions actually get that treatment.
>> > do_page_fault is probably the only one which is interesting here.
>> >
>> > Now looking at your macro maze, I really wonder whether we can do it a
>> > little bit less convoluted. We need to push/pop registers. error_entry
> > saves the registers already and has a (admittedly convoluted)
>> > kernel/user space check. But we might be able to do something sane
>> > there. Cc'ing Andy as he is the master of that universe.
>> >
>>
>> Can one of you give me some context as to what this code is intended to do?
>> I haven't followed the thread.
>>
>> In particular, knowing why this needs to be in asm instead of in C would be
>> nice, because asm in entry_64.S has an amazing ability to have little bugs
>> hiding for years.
>>
>> There's also the caveat that, especially for the IST exceptions, you're running
>> in a weird context in which lots of things that are usually safe are verboten.
>> Page faults can be tricky too, though.
>>
>> --Andy
>
> Welcome Andy.
> The global purpose of this patch is to disable/enable the LBR during exception handling and dump its content later in the panic process, in order to get a small instruction trace which can help in case of stack corruption, for example.
> This has to be done at the very early stage of exception handling, as the LBR holds only a few records (8 or 16, so even a handful of taken branches in the entry path would overwrite a sizeable part of the trace) and we do not want to flush the useful ones (those from before the exception); the entry code should therefore avoid executing any jump/branch/call before stopping the LBR.
>
> The proposed patch regarding asm code is as follow:
>
>  diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index df088bb..f39cded 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1035,6 +1035,46 @@ apicinterrupt IRQ_WORK_VECTOR \
>         irq_work_interrupt smp_irq_work_interrupt
>  #endif
>
> +.macro STOP_LBR
> +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> +       testl $3,CS+8(%rsp)             /* Kernel Space? */
> +       jz 1f
> +       testl $1, lbr_dump_on_exception

Is there a guarantee that, if lbr_dump_on_exception is true, then LBR is on?

What happens if you schedule between stopping and resuming LBR?

> +       jz 1f
> +       push %rax
> +       push %rcx
> +       push %rdx
> +       movl $MSR_IA32_DEBUGCTLMSR, %ecx
> +       rdmsr
> +       and $~1, %eax                   /* Disable LBR recording */
> +       wrmsr

wrmsr is rather slow.  Have you checked whether this is faster than
just saving the LBR trace on exception entry?

--Andy

> +       pop %rdx
> +       pop %rcx
> +       pop %rax
> +1:
> +#endif
> +.endm
> +
> +.macro START_LBR
> +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> +       testl $3,CS+8(%rsp)             /* Kernel Space? */
> +       jz 1f
> +       testl $1, lbr_dump_on_exception
> +       jz 1f
> +       push %rax
> +       push %rcx
> +       push %rdx
> +       movl $MSR_IA32_DEBUGCTLMSR, %ecx
> +       rdmsr
> +       or $1, %eax                     /* Enable LBR recording */
> +       wrmsr
> +       pop %rdx
> +       pop %rcx
> +       pop %rax
> +1:
> +#endif
> +.endm
> +
>  /*
>   * Exception entry points.
>   */
> @@ -1063,6 +1103,8 @@ ENTRY(\sym)
>         subq $ORIG_RAX-R15, %rsp
>         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
>
> +       STOP_LBR
> +
>         .if \paranoid
>         call save_paranoid
>         .else
> @@ -1094,6 +1136,8 @@ ENTRY(\sym)
>
>         call \do_sym
>
> +       START_LBR
> +
>         .if \shift_ist != -1
>         addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
>         .endif
> --
> 1.7.9.5
>
> Thanks,
> Emmanuel.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-11-28 15:15                     ` Andy Lutomirski
@ 2014-12-02 19:09                       ` Berthier, Emmanuel
  2014-12-02 19:33                         ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-12-02 19:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Friday, November 28, 2014 4:15 PM
> To: Berthier, Emmanuel
> Cc: Thomas Gleixner; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> 
> On Fri, Nov 28, 2014 at 12:44 AM, Berthier, Emmanuel
> <emmanuel.berthier@intel.com> wrote:
> >  diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> > index df088bb..f39cded 100644
> > --- a/arch/x86/kernel/entry_64.S
> > +++ b/arch/x86/kernel/entry_64.S
> > @@ -1035,6 +1035,46 @@ apicinterrupt IRQ_WORK_VECTOR \
> >         irq_work_interrupt smp_irq_work_interrupt
> >  #endif
> >
> > +.macro STOP_LBR
> > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> > +       testl $3,CS+8(%rsp)             /* Kernel Space? */
> > +       jz 1f
> > +       testl $1, lbr_dump_on_exception
> 
> Is there a guarantee that, if lbr_dump_on_exception is true, then LBR is on?
> What happens if you schedule between stopping and resuming LBR?

Good point. The current assumption is to rely on the numerous exceptions to "re-arm" the LBR recording.
Even if we bypass user-space page faults, we can still rely on kernel vmalloc page faults to re-arm the recording.

> > +       jz 1f
> > +       push %rax
> > +       push %rcx
> > +       push %rdx
> > +       movl $MSR_IA32_DEBUGCTLMSR, %ecx
> > +       rdmsr
> > +       and $~1, %eax                   /* Disable LBR recording */
> > +       wrmsr
> 
> wrmsr is rather slow.  Have you checked whether this is faster than just
> saving the LBR trace on exception entry?

The figures I have show that for common MSRs, rdmsr and wrmsr have roughly the same cost, around 100 cycles (it greatly depends on the arch).
The cost of stop/start is: 2 rdmsr + 2 wrmsr = 4 MSR accesses.
The cost of reading the LBR is: 1 rdmsr for the TOS + 2 rdmsr per record, with 8 to 32 records (arch specific) = between 17 and 65 MSR accesses.
I've measured on the Atom arch (8 records): an LBR read takes around 3x more time than a stop/start.
As the LBR size is arch dependent, it's not easy to implement the record reading in asm without any branch, and it would create a maintenance dependency.
I prefer to let perf_event_lbr deal with all that stuff.
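
For instance, with 8 records at ~100 cycles per MSR access:
  stop + start  = 2 rdmsr + 2 wrmsr =  4 accesses, ~400 cycles
  full LBR read = 1 + 2 * 8         = 17 accesses, ~1700 cycles
which is in the same ballpark as the ~3x I measured.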

Thx,

Emmanuel.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-02 19:09                       ` Berthier, Emmanuel
@ 2014-12-02 19:33                         ` Andy Lutomirski
  2014-12-02 19:56                           ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-02 19:33 UTC (permalink / raw)
  To: Berthier, Emmanuel
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Tue, Dec 2, 2014 at 11:09 AM, Berthier, Emmanuel
<emmanuel.berthier@intel.com> wrote:
>> From: Andy Lutomirski [mailto:luto@amacapital.net]
>> Sent: Friday, November 28, 2014 4:15 PM
>> To: Berthier, Emmanuel
>> Cc: Thomas Gleixner; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
>> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
>>
>> On Fri, Nov 28, 2014 at 12:44 AM, Berthier, Emmanuel
>> <emmanuel.berthier@intel.com> wrote:
>> >  diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> > index df088bb..f39cded 100644
>> > --- a/arch/x86/kernel/entry_64.S
>> > +++ b/arch/x86/kernel/entry_64.S
>> > @@ -1035,6 +1035,46 @@ apicinterrupt IRQ_WORK_VECTOR \
>> >         irq_work_interrupt smp_irq_work_interrupt  #endif
>> >
>> > +.macro STOP_LBR
>> > +#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
>> > +       testl $3,CS+8(%rsp)             /* Kernel Space? */
>> > +       jz 1f
>> > +       testl $1, lbr_dump_on_exception
>>
>> Is there a guarantee that, if lbr_dump_on_exception is true, then LBR is on?
>> What happens if you schedule between stopping and resuming LBR?
>
> Good point. The current assumption is to rely on the numerous exceptions to "re-arm" the LBR recording.
> Even if we bypass user-space page faults, we can still rely on kernel vmalloc page faults to re-arm the recording.

I don't really understand this.  Presumably page_fault should leave
the LBR setting exactly the way it found it, because otherwise it'll
need all kinds of fancy coordination with perf.  And vmalloc faults
are very rare.

You should also make sure that the perf code is okay with a PMI
nesting *inside* a fault that has disabled LBR.  Also, page faults are
rather performance sensitive, so the performance hit from this isn't
so great.

And keep in mind that we can context switch both inside an exception
handler and on the way out, so that all needs to work, too.

TBH, I'm wondering whether this is actually a good idea.  It might be
more valuable and less scary to try to make this work for BUG instead.
To get the most impact, it might be worth allocating a new exception
vector for BUG and using 'int 0xwhatever', and the prologue to that
could read out all the MSRs without any branches.

--Andy

>
>> > +       jz 1f
>> > +       push %rax
>> > +       push %rcx
>> > +       push %rdx
>> > +       movl $MSR_IA32_DEBUGCTLMSR, %ecx
>> > +       rdmsr
>> > +       and $~1, %eax                   /* Disable LBR recording */
>> > +       wrmsr
>>
>> wrmsr is rather slow.  Have you checked whether this is faster than just
>> saving the LBR trace on exception entry?
>
> The figures I have show that for common MSRs, rdmsr and wrmsr have roughly the same cost, around 100 cycles (it greatly depends on the arch).
> The cost of stop/start is: 2 rdmsr + 2 wrmsr = 4 MSR accesses.
> The cost of reading the LBR is: 1 rdmsr for the TOS + 2 rdmsr per record, with 8 to 32 records (arch specific) = between 17 and 65 MSR accesses.
> I've measured on the Atom arch (8 records): an LBR read takes around 3x more time than a stop/start.
> As the LBR size is arch dependent, it's not easy to implement the record reading in asm without any branch, and it would create a maintenance dependency.
> I prefer to let perf_event_lbr deal with all that stuff.
>
> Thx,
>
> Emmanuel.
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-02 19:33                         ` Andy Lutomirski
@ 2014-12-02 19:56                           ` Thomas Gleixner
  2014-12-02 20:12                             ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-12-02 19:56 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Berthier, Emmanuel, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Tue, 2 Dec 2014, Andy Lutomirski wrote:
> TBH, I'm wondering whether this is actually a good idea.  It might be
> more valuable and less scary to try to make this work for BUG instead.
> To get the most impact, it might be worth allocating a new exception
> vector for BUG and using 'int 0xwhatever', and the prologue to that
> could read out all the MSRs without any branches.

BUG is pretty uninteresting. We usually know how we got there. Now
where LBR might be useful is if you have stack corruption and branch
into nirvana or access random crap via a few hoops. There the LBR data
might help, because the corrupted stack does not tell anything.

So yes, this is a corner case debugging scenario, but given the
complexity of coordination with perf and the possible intrusiveness in
the low level entry path, we really need to see a few real world
examples where this helps, aside from the constructed case.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-02 19:56                           ` Thomas Gleixner
@ 2014-12-02 20:12                             ` Andy Lutomirski
  2014-12-03 18:25                               ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-02 20:12 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Berthier, Emmanuel, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Tue, Dec 2, 2014 at 11:56 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 2 Dec 2014, Andy Lutomirski wrote:
>> TBH, I'm wondering whether this is actually a good idea.  It might be
>> more valuable and less scary to try to make this work for BUG instead.
>> To get the most impact, it might be worth allocating a new exception
>> vector for BUG and using 'int 0xwhatever', and the prologue to that
>> could read out all the MSRs without any branches.
>
> BUG is pretty uninteresting. We usually know how we got there. Now
> where LBR might be useful is if you have stack corruption and branch
> into nirvana or access random crap via a few hoops. There the LBR data
> might help, because the corrupted stack does not tell anything.
>
> So yes, this is a corner case debugging scenario, but given the
> complexity of coordination with perf and the possible intrusiveness in
> the low level entry path, we really need to see a few real world
> examples where this helps, aside from the constructed case.
>

One option would be to treat this as the debugging feature that it is.

Don't change the exception entry code at all.  Instead add a run-time
switch that will ask perf to set up the LBR stuff appropriately and
will change the IDT entries to stubs that first use rdmsr to copy all
the LBRs to some per-cpu, per-vector area and then jump to the real
entry point.

Yes, performance will suck, and there will be some artifacts if the
exception nests inside itself before reporting the error (which can
happen for page faults, I think), but this isn't the end of the world
if it's an opt-in feature.

And the whole pile of rdmsrs can be generated at run-time to match the
running CPU.
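
Roughly (untested sketch; lbr_dump_page_fault stands for the generated
stub that copies the LBR MSRs into a per-cpu buffer and then jumps to
the real page_fault entry):

extern char lbr_dump_page_fault[];

static void lbr_dump_idt_enable(void)
{
	/* no change at all to the normal entry code */
	set_intr_gate(X86_TRAP_PF, lbr_dump_page_fault);
}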

--Andy

> Thanks,
>
>         tglx



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-02 20:12                             ` Andy Lutomirski
@ 2014-12-03 18:25                               ` Berthier, Emmanuel
  2014-12-03 19:29                                 ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-12-03 18:25 UTC (permalink / raw)
  To: Andy Lutomirski, Thomas Gleixner
  Cc: H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Tuesday, December 2, 2014 9:12 PM
> To: Thomas Gleixner
> Cc: Berthier, Emmanuel; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> 
> On Tue, Dec 2, 2014 at 11:56 AM, Thomas Gleixner <tglx@linutronix.de>
> wrote:
> > On Tue, 2 Dec 2014, Andy Lutomirski wrote:
> >> TBH, I'm wondering whether this is actually a good idea.  It might be
> >> more valuable and less scary to try to make this work for BUG instead.
> >> To get the most impact, it might be worth allocating a new exception
> >> vector for BUG and using 'int 0xwhatever', and the prologue to that
> >> could read out all the MSRs without any branches.
> >
> > BUG is pretty uninteresting. We usually know how we got there. Now
> > where LBR might be useful is if you have stack corruption and branch
> > into nirvana or access random crap via a few hoops. There the LBR data
> > might help, because the corrupted stack does not tell anything.

Right, I remember some strange panics where the register content was not consistent with the EIP.
That's what I call a "jump into space".

> > So yes, this is a corner case debugging scenario, but given the
> > complexity of coordination with perf and the possible intrusiveness in
> > the low level entry path, we really need to see a few real world
> > examples where this helps, aside from the constructed case.

Here is such a "jump into space":

<1>[  533.394885] BUG: unable to handle kernel paging request at c12329c0
<1>[  533.394930] IP: [<c12329ed>] s0ix_complete+0x2d/0x30
<4>[  533.394973] *pde = 362c8063 *pte = 01232161 
<4>[  533.395001] Oops: 0003 [#1] PREEMPT SMP 
<4>[  533.395030] Modules linked in: atomisp lm3554 mt9m114 ov8830 tty_hci wl12xx_sdio(O) wl12xx(O) mac80211(O) cfg80211(O) compat(O) rmi4(C) st_drv videobuf_vmalloc videobuf_core matrix(C) sgx(C)
<4>[  533.395120] 
<4>[  533.395135] Pid: 0, comm: swapper/1 Tainted: G         C O 3.4.21-183401-g34a7d6b #1 Intel Corporation CloverTrail/FFRD
<4>[  533.395165] EIP: 0060:[<c12329ed>] EFLAGS: 00011ed6 CPU: 1
<4>[  533.395184] EIP is at s0ix_complete+0x2d/0x30
<4>[  533.395198] EAX: c12329c0 EBX: c1d120a0 ECX: 00000001 EDX: 00000000
<4>[  533.395211] ESI: e0d26b2b EDI: 12e6492c EBP: f60a9eec ESP: f60a9ef0
<4>[  533.395225]  DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
<4>[  533.395239] CR0: 8005003b CR2: c12329c0 CR3: 01dfe000 CR4: 000007d0
<4>[  533.395253] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
<4>[  533.395265] DR6: ffff0ff0 DR7: 00000400
<0>[  533.395279] Process swapper/1 (pid: 0, ti=f60a8000 task=f607b520 task.ti=f60a8000)
<0>[  533.395290] Stack:
<4>[  533.395299]  c1230b36 f60a9f40 c153d73a 00000767 f60a9f10 c123b908 f60a9f0c 00000000
<4>[  533.395342]  00000052 00000006 f67eb45c 00000001 00000007 00000000 c1d120a0 00000052
<4>[  533.395384]  00000001 c1d120a0 00000214 f67eb45c f60a9f50 c174ccd5 fffffff0 00000001
<0>[  533.395427] Call Trace:
<4>[  533.395445]  [<c1230b36>] pmu_set_s0ix_complete+0x16/0x20
<4>[  533.395468]  [<c153d73a>] ? soc_s0ix_idle+0x13a/0x4b0
<4>[  533.395488]  [<c123b908>] ? ns_to_timespec+0x28/0x40
<4>[  533.395510]  [<c174ccd5>] ? cpuidle_enter+0x15/0x20
<4>[  533.395529]  [<c174d2ef>] ? cpuidle_idle_call+0x9f/0x320
<4>[  533.395549]  [<c1209795>] ? cpu_idle+0x75/0xd0
<4>[  533.395569]  [<c19955bd>] ? start_secondary+0x23c/0x23e
<0>[  533.395583] Code: e5 3e 8d 74 26 00 a1 44 91 e0 c1 8b 90 38 0a 00 00 85 d2 75 02 5d c3 8b 90 34 1d 00 00 31 c0 89 42 58 5d c3 8d 76 00 8d bc 27 00 <00> 00 00 55 89 e5 53 3e 8d 74 26 00 89 c1 31 c0 83 f9 3f 77 24 
<0>[  533.395862] EIP: [<c12329ed>] s0ix_complete+0x2d/0x30 SS:ESP 0068:f60a9ef0
<4>[  533.395888] CR2: 00000000c12329c0

Analysis:

After replaying the game with T32 in a normal situation and comparing the snapshots, it appears that, to get the same call stack, we came from pmu_set_s0ix_complete():

                    |void pmu_set_s0ix_complete(void)
                437 |{
                438 |        if (pmu_ops->set_s0ix_complete)
   NP:0000:C1230B28 |A14071E0C1                mov        eax,[0xC1E07140]   ; eax,pmu_ops
   NP:0000:C1230B2D |8B401C                    mov        eax,[eax+0x1C]
   NP:0000:C1230B30 |85C0                      test       eax,eax
   NP:0000:C1230B32 |7402                      je         0xC1230B36
                439 |                pmu_ops->set_s0ix_complete();
   NP:0000:C1230B34 |FFD0                      call       eax
                440 |}
   NP:0000:C1230B36 |5D                        pop        ebp
   NP:0000:C1230B37 |C3                        ret

and we have not yet executed the beginning of s0ix_complete(), as EBP is not yet stored on the stack and EAX has not been modified:

                   |void s0ix_complete(void)
               257 |{
  NP:0000:C12329C0 |55                s0ix_complete: push       ebp
  NP:0000:C12329C1 |89E5                      mov        ebp,esp
  NP:0000:C12329C3 |3E8D742600                lea        esi,ds:[esi+0x0]
                   |
                   |void s0ix_complete(void)
               257 |{
               258 |        if (unlikely(mid_pmu_cxt->s0ix_entered))
  NP:0000:C12329C8 |A14471E0C1                mov        eax,[0xC1E07144]   ; eax,mid_pmu_cxt
  NP:0000:C12329CD |8B90380A0000              mov        edx,[eax+0xA38]
  NP:0000:C12329D3 |85D2                      test       edx,edx
  NP:0000:C12329D5 |7502                      jne        0xC12329D9
               260 |}
  NP:0000:C12329D7 |5D                        pop        ebp
  NP:0000:C12329D8 |C3                        ret
               259 |                writel(0, &mid_pmu_cxt->pmu_reg->pm_msic);
  NP:0000:C12329D9 |8B90341D0000              mov        edx,[eax+0x1D34]
                63 |build_mmio_write(writel, "l", unsigned int, "r", :"memory")
  NP:0000:C12329DF |31C0                      xor        eax,eax
  NP:0000:C12329E1 |894258                    mov        [edx+0x58],eax
               260 |}
  NP:0000:C12329E4 |5D                        pop        ebp
  NP:0000:C12329E5 |C3                        ret
  NP:0000:C12329E6 |8D7600                    lea        esi,[esi+0x0]
  NP:0000:C12329E9 |8DBC2700000000            lea        edi,[edi+0x0]

So, the execution of "call eax" has jumped directly to eax+0x2d.

> One option would be to treat this as the debugging feature that it is.

Agree.

> Don't change the exception entry code at all.  Instead add a run-time switch
> that will ask perf to set up the LBR stuff appropriately and will change the IDT
> entries to stubs that first use rdmsr to copy all the LBRs to some per-cpu, per-
> vector area and then jump to the real entry point.
> 
> Yes, performance will suck, and there will be some artifacts if the exception
> nests inside itself before reporting the error (which can happen for page
> faults, I think), but this isn't the end of the world if it's an opt-in feature.
> 
> And the whole pile of rdmsrs can be generated at run-time to match the
> running CPU.

The final patch will bypass the new code in case of a user-space page fault, so the performance impact will be very low.
Copying the LBRs takes much more time than stopping/starting them.

The simpler, the better:

.macro STOP_LBR
#ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
	testl $3,CS(%rsp)		/* Kernel Space? */
	jnz 1f
	testl $3, PER_CPU_VAR(lbr_dump_state) /* Disabled? */
	jnz 1f
	push %rax
	push %rcx
	push %rdx
	movl $MSR_IA32_DEBUGCTLMSR, %ecx
	rdmsr
	and $~1, %eax	/* Disable LBR recording */
	wrmsr
	pop %rdx
	pop %rcx
	pop %rax
1:
#endif
.endm

Thx,

Emmanuel.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-03 18:25                               ` Berthier, Emmanuel
@ 2014-12-03 19:29                                 ` Andy Lutomirski
  2014-12-04 16:01                                   ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-03 19:29 UTC (permalink / raw)
  To: Berthier, Emmanuel
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Wed, Dec 3, 2014 at 10:25 AM, Berthier, Emmanuel
<emmanuel.berthier@intel.com> wrote:
>> From: Andy Lutomirski [mailto:luto@amacapital.net]
>> Sent: Tuesday, December 2, 2014 9:12 PM
>> To: Thomas Gleixner
>> Cc: Berthier, Emmanuel; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
>> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
>>
>> On Tue, Dec 2, 2014 at 11:56 AM, Thomas Gleixner <tglx@linutronix.de>
>> wrote:
>> > On Tue, 2 Dec 2014, Andy Lutomirski wrote:
>> >> TBH, I'm wondering whether this is actually a good idea.  It might be
>> >> more valuable and less scary to try to make this work for BUG instead.
>> >> To get the most impact, it might be worth allocating a new exception
>> >> vector for BUG and using 'int 0xwhatever', and the prologue to that
>> >> could read out all the MSRs without any branches.
>> >
>> > BUG is pretty uninteresting. We usually know how we got there. Now
>> > where LBR might be useful is if you have stack corruption and branch
>> > into nirvana or access random crap via a few hoops. There the LBR data
>> > might help, because the corrupted stack does not tell anything.
>
> Right, I remember some strange panics where the register content was not consistent with the EIP.
> That's what I call a "jump into space".
>
>> > So yes, this is a corner case debugging scenario, but given the
>> > complexity of coordination with perf and the possible intrusiveness in
>> > the low level entry path, we really need to see a few real world
>> > examples where this helps, aside from the constructed case.
>
> Here is such a "jump into space":
>
> <1>[  533.394885] BUG: unable to handle kernel paging request at c12329c0
> <1>[  533.394930] IP: [<c12329ed>] s0ix_complete+0x2d/0x30
> <4>[  533.394973] *pde = 362c8063 *pte = 01232161
> <4>[  533.395001] Oops: 0003 [#1] PREEMPT SMP
> <4>[  533.395030] Modules linked in: atomisp lm3554 mt9m114 ov8830 tty_hci wl12xx_sdio(O) wl12xx(O) mac80211(O) cfg80211(O) compat(O) rmi4(C) st_drv videobuf_vmalloc videobuf_core matrix(C) sgx(C)
> <4>[  533.395120]
> <4>[  533.395135] Pid: 0, comm: swapper/1 Tainted: G         C O 3.4.21-183401-g34a7d6b #1 Intel Corporation CloverTrail/FFRD
> <4>[  533.395165] EIP: 0060:[<c12329ed>] EFLAGS: 00011ed6 CPU: 1
> <4>[  533.395184] EIP is at s0ix_complete+0x2d/0x30
> <4>[  533.395198] EAX: c12329c0 EBX: c1d120a0 ECX: 00000001 EDX: 00000000
> <4>[  533.395211] ESI: e0d26b2b EDI: 12e6492c EBP: f60a9eec ESP: f60a9ef0
> <4>[  533.395225]  DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
> <4>[  533.395239] CR0: 8005003b CR2: c12329c0 CR3: 01dfe000 CR4: 000007d0
> <4>[  533.395253] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
> <4>[  533.395265] DR6: ffff0ff0 DR7: 00000400
> <0>[  533.395279] Process swapper/1 (pid: 0, ti=f60a8000 task=f607b520 task.ti=f60a8000)
> <0>[  533.395290] Stack:
> <4>[  533.395299]  c1230b36 f60a9f40 c153d73a 00000767 f60a9f10 c123b908 f60a9f0c 00000000
> <4>[  533.395342]  00000052 00000006 f67eb45c 00000001 00000007 00000000 c1d120a0 00000052
> <4>[  533.395384]  00000001 c1d120a0 00000214 f67eb45c f60a9f50 c174ccd5 fffffff0 00000001
> <0>[  533.395427] Call Trace:
> <4>[  533.395445]  [<c1230b36>] pmu_set_s0ix_complete+0x16/0x20
> <4>[  533.395468]  [<c153d73a>] ? soc_s0ix_idle+0x13a/0x4b0
> <4>[  533.395488]  [<c123b908>] ? ns_to_timespec+0x28/0x40
> <4>[  533.395510]  [<c174ccd5>] ? cpuidle_enter+0x15/0x20
> <4>[  533.395529]  [<c174d2ef>] ? cpuidle_idle_call+0x9f/0x320
> <4>[  533.395549]  [<c1209795>] ? cpu_idle+0x75/0xd0
> <4>[  533.395569]  [<c19955bd>] ? start_secondary+0x23c/0x23e
> <0>[  533.395583] Code: e5 3e 8d 74 26 00 a1 44 91 e0 c1 8b 90 38 0a 00 00 85 d2 75 02 5d c3 8b 90 34 1d 00 00 31 c0 89 42 58 5d c3 8d 76 00 8d bc 27 00 <00> 00 00 55 89 e5 53 3e 8d 74 26 00 89 c1 31 c0 83 f9 3f 77 24
> <0>[  533.395862] EIP: [<c12329ed>] s0ix_complete+0x2d/0x30 SS:ESP 0068:f60a9ef0
> <4>[  533.395888] CR2: 00000000c12329c0
>
> Analysis:
>
> After replaying the game with T32 in a normal situation and comparing the snapshots, it appears that, to get the same call stack, we came from pmu_set_s0ix_complete():
>
>                     |void pmu_set_s0ix_complete(void)
>                 437 |{
>                 438 |        if (pmu_ops->set_s0ix_complete)
>    NP:0000:C1230B28 |A14071E0C1                mov        eax,[0xC1E07140]   ; eax,pmu_ops
>    NP:0000:C1230B2D |8B401C                    mov        eax,[eax+0x1C]
>    NP:0000:C1230B30 |85C0                      test       eax,eax
>    NP:0000:C1230B32 |7402                      je         0xC1230B36
>                 439 |                pmu_ops->set_s0ix_complete();
>    NP:0000:C1230B34 |FFD0                      call       eax
>                 440 |}
>    NP:0000:C1230B36 |5D                        pop        ebp
>    NP:0000:C1230B37 |C3                        ret
>
> and we have not yet executed the beginning of s0ix_complete(), as EBP is not yet stored on the stack and EAX has not been modified:
>
>                    |void s0ix_complete(void)
>                257 |{
>   NP:0000:C12329C0 |55                s0ix_complete: push       ebp
>   NP:0000:C12329C1 |89E5                      mov        ebp,esp
>   NP:0000:C12329C3 |3E8D742600                lea        esi,ds:[esi+0x0]
>                    |
>                    |void s0ix_complete(void)
>                257 |{
>                258 |        if (unlikely(mid_pmu_cxt->s0ix_entered))
>   NP:0000:C12329C8 |A14471E0C1                mov        eax,[0xC1E07144]   ; eax,mid_pmu_cxt
>   NP:0000:C12329CD |8B90380A0000              mov        edx,[eax+0xA38]
>   NP:0000:C12329D3 |85D2                      test       edx,edx
>   NP:0000:C12329D5 |7502                      jne        0xC12329D9
>                260 |}
>   NP:0000:C12329D7 |5D                        pop        ebp
>   NP:0000:C12329D8 |C3                        ret
>                259 |                writel(0, &mid_pmu_cxt->pmu_reg->pm_msic);
>   NP:0000:C12329D9 |8B90341D0000              mov        edx,[eax+0x1D34]
>                 63 |build_mmio_write(writel, "l", unsigned int, "r", :"memory")
>   NP:0000:C12329DF |31C0                      xor        eax,eax
>   NP:0000:C12329E1 |894258                    mov        [edx+0x58],eax
>                260 |}
>   NP:0000:C12329E4 |5D                        pop        ebp
>   NP:0000:C12329E5 |C3                        ret
>   NP:0000:C12329E6 |8D7600                    lea        esi,[esi+0x0]
>   NP:0000:C12329E9 |8DBC2700000000            lea        edi,[edi+0x0]
>
> So, the execution of "call eax" has jumped directly to eax+0x2d.
>
>> One option would be to treat this as the debugging feature that it is.
>
> Agree.
>
>> Don't change the exception entry code at all.  Instead add a run-time switch
>> that will ask perf to set up the LBR stuff appropriately and will change the IDT
>> entries to stubs that first use rdmsr to copy all the LBRs to some per-cpu, per-
>> vector area and then jump to the real entry point.
>>
>> Yes, performance will suck, and there will be some artifacts if the exception
>> nests inside itself before reporting the error (which can happen for page
>> faults, I think), but this isn't the end of the world if it's an opt-in feature.
>>
>> And the whole pile of rdmsrs can be generated at run-time to match the
>> running CPU.
>
> The final patch will bypass the new code in case of a user-space page fault, so the performance impact will be very low.
> Copying the LBRs takes much more time than stopping/starting them.
>
> The simpler, the better:
>
> .macro STOP_LBR
> #ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
>         testl $3,CS(%rsp)               /* Kernel Space? */
>         jnz 1f
>         testl $3, PER_CPU_VAR(lbr_dump_state) /* Disabled? */
>         jnz 1f

But that just wasted two of your LBR slots.

>         push %rax
>         push %rcx
>         push %rdx
>         movl $MSR_IA32_DEBUGCTLMSR, %ecx
>         rdmsr
>         and $~1, %eax   /* Disable LBR recording */
>         wrmsr
>         pop %rdx
>         pop %rcx
>         pop %rax

And the general problem with this approach (even ignoring the
performance hit, and kernel faults on user addresses really do happen
in real workloads) is that you're not saving and restoring
MSR_IA32_DEBUGCTL.  It may be that the rest of your patch does
whatever magic is needed to make this work, but from just this code
it's not at all obvious that this is correct.

Hence my suggestion for rdmsr -- if you're willing to enable this and
take the performance hit, you can simplify it a lot and save some
branch slots by unconditionally doing the rdmsrs if you've enabled the
LBR tracing IDT entry.  The simplification from using rdmsr isn't that
the save code is simplified -- it's that there's no state change on
exception entry, so you don't need to worry about restoring state
correctly on the way out or during a context switch.  And you can
enable/disable the whole thing just by writing to the IDT, so there's
no performance hit at all in the disabled case.

--Andy

> 1:
> #endif
> .endm
>
> Thx,
>
> Emmanuel.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-03 19:29                                 ` Andy Lutomirski
@ 2014-12-04 16:01                                   ` Berthier, Emmanuel
  2014-12-04 18:09                                     ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-12-04 16:01 UTC (permalink / raw)
  To: 'Andy Lutomirski'
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Wednesday, December 3, 2014 8:30 PM
> To: Berthier, Emmanuel
> Cc: Thomas Gleixner; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> > The final patch will bypass the new code in case of a user-space page fault,
> > so the performance impact will be very low.
> > Copying the LBRs takes much more time than stopping/starting them.
> >
> > The simpler, the better:
> >
> > .macro STOP_LBR
> > #ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
> >         testl $3,CS(%rsp)               /* Kernel Space? */
> >         jnz 1f
> >         testl $3, PER_CPU_VAR(lbr_dump_state) /* Disabled? */
> >         jnz 1f
> 
> But that just wasted two of your LBR slots.

No: a branch that is not taken does not generate an LBR record, e.g.:

  Last Branch Records:
    to: [<ffffffff828122a0>] page_fault+0x0/0x90
  from: [<ffffffff823c0e06>] sysrq_handle_crash+0x16/0x20
    to: [<ffffffff823c0df0>] sysrq_handle_crash+0x0/0x20
  from: [<ffffffff823c156c>] __handle_sysrq+0x9c/0x170
    to: [<ffffffff823c1562>] __handle_sysrq+0x92/0x170

> >         push %rax
> >         push %rcx
> >         push %rdx
> >         movl $MSR_IA32_DEBUGCTLMSR, %ecx
> >         rdmsr
> >         and $~1, %eax   /* Disable LBR recording */
> >         wrmsr
> >         pop %rdx
> >         pop %rcx
> >         pop %rax
> 
> And the general problem with this approach (even ignoring the performance
> hit, and kernel faults on user addresses really do happen in real workloads) is
> that you're not saving and restoring MSR_IA32_DEBUGCTL. 
> It may be that
> the rest of your patch does whatever magic is needed to make this work, but
> from just this code it's not at all obvious that this is correct.

The algorithm is quite simple:
When I enter the exception handler, I stop LBR recording, and dump its content later if needed.
When I leave the exception handler, I restart LBR recording.
So, after the first exception, the LBR is on.
In case of nested exceptions and a crash, you're right, the LBR will probably not be relevant.
But your proposal does not solve this issue: if we save the registers during the first exception, and then overwrite them during the second level,
we will lose the relevant info if the crash is due to the first exception.

> Hence my suggestion for rdmsr -- if you're willing to enable this and take the
> performance hit, you can simplify it a lot and save some branch slots by
> unconditionally doing the rdmsrs if you've enabled the LBR tracing IDT entry.
> The simplification from using rdmsr isn't that the save code is simplified -- it's
> that there's no state change on exception entry, so you don't need to worry
> about restoring state correctly on the way out or during a context switch.
> And you can enable/disable the whole thing just by writing to the IDT, so
> there's no performance hit at all in the disabled case.

Concerning performance: if it really matters, the best is to disable the CONFIG switch.
But if we enable it, it's because we intend to use it I guess, and in that case, bypassing user-space page faults is better.
Your proposal of "unconditionally doing the rdmsrs" is not good in that case.
The only small gain is when the CONFIG is enabled and the feature is disabled on the cmdline:
- with my proposal, we get 1 test and 1 jmp more (if I swap the kernel test with the LBR state test): for an exception treatment, does it really matter?

We can mix our proposals: keep my STOP/START code, and replace the dynamic disabling test with an IDT change.
I hope the code will stay readable.
Do we really want to save 2 instructions?

Thanks,

Emmanuel.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-04 16:01                                   ` Berthier, Emmanuel
@ 2014-12-04 18:09                                     ` Andy Lutomirski
  2014-12-05 13:14                                       ` Berthier, Emmanuel
  2014-12-06 10:31                                       ` Robert Jarzmik
  0 siblings, 2 replies; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-04 18:09 UTC (permalink / raw)
  To: Berthier, Emmanuel
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

On Thu, Dec 4, 2014 at 8:01 AM, Berthier, Emmanuel
<emmanuel.berthier@intel.com> wrote:
>> From: Andy Lutomirski [mailto:luto@amacapital.net]
>> Sent: Wednesday, December 3, 2014 8:30 PM
>> To: Berthier, Emmanuel
>> Cc: Thomas Gleixner; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
>> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
>> > The final patch will bypass the new code in case of a user-space page
>> > fault, so the performance impact will be very low.
>> > Copying the LBRs takes much more time than stopping/starting them.
>> >
>> > The simpler, the better:
>> >
>> > .macro STOP_LBR
>> > #ifdef CONFIG_LBR_DUMP_ON_EXCEPTION
>> >         testl $3,CS(%rsp)               /* Kernel Space? */
>> >         jnz 1f
>> >         testl $3, PER_CPU_VAR(lbr_dump_state) /* Disabled? */
>> >         jnz 1f
>>
>> But that just wasted two of your LBR slots.
>
> No: a branch that is not taken does not generate an LBR record, e.g.:
>
>   Last Branch Records:
>     to: [<ffffffff828122a0>] page_fault+0x0/0x90
>   from: [<ffffffff823c0e06>] sysrq_handle_crash+0x16/0x20
>     to: [<ffffffff823c0df0>] sysrq_handle_crash+0x0/0x20
>   from: [<ffffffff823c156c>] __handle_sysrq+0x9c/0x170
>     to: [<ffffffff823c1562>] __handle_sysrq+0x92/0x170
>
>> >         push %rax
>> >         push %rcx
>> >         push %rdx
>> >         movl $MSR_IA32_DEBUGCTLMSR, %ecx
>> >         rdmsr
>> >         and $~1, %eax   /* Disable LBR recording */
>> >         wrmsr
>> >         pop %rdx
>> >         pop %rcx
>> >         pop %rax
>>
>> And the general problem with this approach (even ignoring the performance
>> hit, and kernel faults on user addresses really do happen in real workloads) is
>> that you're not saving and restoring MSR_IA32_DEBUGCTL.
>> It may be that
>> the rest of your patch does whatever magic is needed to make this work, but
>> from just this code it's not at all obvious that this is correct.
>
> The algorithm is quite simple:
> When I enter the exception handler, I stop LBR recording, and dump its content later if needed.
> When I leave the exception handler, I restart LBR recording.
> So, after the first exception, the LBR is on.
> In case of nested exceptions and a crash, you're right, the LBR will probably not be relevant.
> But your proposal does not solve this issue: if we save the registers during the first exception, and then overwrite them during the second level,
> we will lose the relevant info if the crash is due to the first exception.
>
>> Hence my suggestion for rdmsr -- if you're willing to enable this and take the
>> performance hit, you can simplify it a lot and save some branch slots by
>> unconditionally doing the rdmsrs if you've enabled the LBR tracing IDT entry.
>> The simplification from using rdmsr isn't that the save code is simplified -- it's
>> that there's no state change on exception entry, so you don't need to worry
>> about restoring state correctly on the way out or during a context switch.
>> And you can enable/disable the whole thing just by writing to the IDT, so
>> there's no performance hit at all in the disabled case.
>
> Concerning performance: if it really matters, the best is to disable the CONFIG switch.
> But if we enable it, it's because we intend to use it I guess, and in that case, bypassing user-space page faults is better.
> Your proposal of "unconditionally doing the rdmsrs" is not good in that case.
> The only small gain is when the CONFIG is enabled and the feature is disabled on the cmdline:
> - with my proposal, we get 1 test and 1 jmp more (if I swap the kernel test with the LBR state test): for an exception treatment, does it really matter?
>
> We can mix our proposals: keep my STOP/START code, and replace the dynamic disabling test with an IDT change.
> I hope the code will stay readable.
> Do we really want to save 2 instructions?

I don't really care about the number of instructions.  But there are
still all the nasty cases:

 - Context switch during exception processing (both in the C handler
and in the retint code).
 - PMI during exception processing.
 - Exception while perf is poking at LBR msrs.

Where are you planning on saving the start/stop previous state?

--Andy

>
> Thanks,
>
> Emmanuel.



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-04 18:09                                     ` Andy Lutomirski
@ 2014-12-05 13:14                                       ` Berthier, Emmanuel
  2014-12-06 10:31                                       ` Robert Jarzmik
  1 sibling, 0 replies; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-12-05 13:14 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Thomas Gleixner, H. Peter Anvin, X86 ML, Jarzmik, Robert, LKML

> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Thursday, December 4, 2014 7:10 PM
> To: Berthier, Emmanuel
> Cc: Thomas Gleixner; H. Peter Anvin; X86 ML; Jarzmik, Robert; LKML
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> >> And the general problem with this approach (even ignoring the
> >> performance hit, and kernel faults on user addresses really do happen
> >> in real workloads) is that you're not saving and restoring
> MSR_IA32_DEBUGCTL.
> >> It may be that
> >> the rest of your patch does whatever magic is needed to make this
> >> work, but from just this code it's not at all obvious that this is correct.
> >
> > The algorithm is quite simple:
> > When I enter the exception handler, I stop LBR recording, and dump its
> content later if needed.
> > When I leave the exception handler, I restart LBR recording.
> > So, after the first exception, LBR is on.
> > In case of nested exceptions and a crash, you're right, LBR will probably not
> be relevant.
> > But your proposal does not solve this issue: if we save the registers
> > during the 1st exception, and then overwrite them during the 2nd level, we will
> lose the relevant info if the crash is due to the 1st exception.
> >
> >> Hence my suggestion for rdmsr -- if you're willing to enable this and
> >> take the performance hit, you can simplify it a lot and save some
> >> branch slots by unconditionally doing the rdmsrs if you've enabled the LBR
> tracing IDT entry.
> >> The simplification from using rdmsr isn't that the save code is
> >> simplified -- it's that there's no state change on exception entry,
> >> so you don't need to worry about restoring state correctly on the way out
> or during a context switch.
> >> And you can enable/disable the whole thing just by writing to the
> >> IDT, so there's no performance hit at all in the disabled case.
> >
> > Concerning performance: if it really matters, the best option is to disable the
> CONFIG switch.
> > But if we enable it, it is presumably to use it, and in that case, bypassing
> userspace page faults is better.
> > Your proposal of "unconditionally doing the rdmsrs" is not good in that
> case.
> > The only small gain is when CONFIG is enabled and the feature is disabled by
> cmdline:
> > - with my proposal, we get 1 extra test and 1 extra jmp (if I swap the kernel test
> with the LBR state test): for exception handling, does it really matter?
> >
> > We can mix our proposals: keep my STOP/START code, and replace the
> dynamic disabling test with an IDT change.
> > I hope the code will stay readable.
> > Do we really want to save 2 instructions?
> 
> I don't really care about the number of instructions. 

Ok.

> But there are still all the nasty cases:
> 
>  - Context switch during exception processing (both in the C handler and in
> the retint code).
>  - PMI during exception processing.
>  - Exception while perf is poking at LBR msrs.

For perf compatibility: we disable the LBR dump feature when perf is using the LBR.
The other cases are rare, but real. The current implementation does not
break anything; only the LBR dump content will be irrelevant.
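
A minimal sketch of what that gate could look like on the dump side (the
flag and helper names here are assumptions, not the posted code):

        #include <linux/kconfig.h>

        /* assumed flag, cleared whenever perf starts using the LBR */
        extern unsigned int lbr_dump_enabled;

        /* assumed helper called from the dumpstack path */
        static bool lbr_dump_wanted(void)
        {
                return IS_ENABLED(CONFIG_LBR_DUMP_ON_EXCEPTION) &&
                       lbr_dump_enabled;
        }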

> Where are you planning on saving the start/stop previous state?

To handle nested exceptions or a context switch, we would need to store the LBRs on the stack.
And as the number of LBRs depends on the architecture, that gets messy.
In fact, I evaluated this solution during the study, and gave it up after looking at the impact.

Maybe someone has a better proposal?


Thanks,

Emmanuel.






^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-04 18:09                                     ` Andy Lutomirski
  2014-12-05 13:14                                       ` Berthier, Emmanuel
@ 2014-12-06 10:31                                       ` Robert Jarzmik
       [not found]                                         ` <CALCETrXhfzd9Fkikvm5qj0LWgWtDzgdpY_0EC3ChwyyGZksTMw@mail.gmail.com>
  1 sibling, 1 reply; 27+ messages in thread
From: Robert Jarzmik @ 2014-12-06 10:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Berthier, Emmanuel, Thomas Gleixner, H. Peter Anvin, X86 ML,
	Jarzmik, Robert, LKML

Andy Lutomirski <luto@amacapital.net> writes:

> I don't really care about the number of instructions.
Right, a couple of test/jz/jnz instructions is negligible in the exception path;
that's what I think as well.

>  But there are still all the nasty cases:
>
>  - Context switch during exception processing (both in the C handler
> and in the retint code).
>  - PMI during exception processing.
>  - Exception while perf is poking at LBR msrs.

Yes.
Wasn't that what Thomas's suggestion of a per-cpu variable was solving?
I.e.:
        DEFINE_PER_CPU(unsigned long, lbr_dump_state) = LBR_OOPS_DISABLED;
        ...

We would have an "LBR resource" variable to track who owns the LBR (see the sketch below):
 - nobody: LBR_UNCLAIMED
 - the exception handler: LBR_EXCEPTION_DEBUG_USAGE
   - activated with a runtime variable or config
   - impossible to activate if perf has hold of it
 - the perf code: LBR_PERF_USAGE
   - activated through the perf infrastructure
   - impossible to activate if the exception handler has hold of it
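
A minimal sketch of that ownership variable (the enum values follow the
list above; the claim/release helpers are assumptions, not posted code):

        #include <linux/percpu.h>

        enum lbr_owner {
                LBR_UNCLAIMED,
                LBR_EXCEPTION_DEBUG_USAGE,
                LBR_PERF_USAGE,
        };

        static DEFINE_PER_CPU(enum lbr_owner, lbr_dump_state) = LBR_UNCLAIMED;

        /* perf side: take the LBR only if the exception path does not own it */
        static bool perf_claim_lbr(void)
        {
                return this_cpu_cmpxchg(lbr_dump_state, LBR_UNCLAIMED,
                                        LBR_PERF_USAGE) == LBR_UNCLAIMED;
        }

        static void perf_release_lbr(void)
        {
                this_cpu_write(lbr_dump_state, LBR_UNCLAIMED);
        }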

Now this solves the perf/exception concurrency on the LBR registers. If there is
a rescheduling during the exception, or a PMI, can that have an impact?
(A pseudo-C sketch of the entry-side test follows the cases below.)
 - case 1: nobody is handling the LBR
   => no impact, exception handlers won't touch the LBR
 - case 2: perf is handling the LBR
   => no impact, exception handlers won't touch the LBR

 - case 3: exception handlers are handling LBR

   - case 3a: simple user exception
       -> exception entry
       -> is kernel exception == false => bypass LBR handling
       -> exception handling

   - case 3b: simple kernel exception
       -> exception entry
       -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
       -> no reschedule, no PMI
       -> exception handling
       -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR

   - case 3c: kernel exception with PMI
       -> exception entry
       -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
       -> PMI
          can't touch LBR, as lbr_dump_state == EXCEPTION_OWNED
       -> exception handling
       -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR

   - case 3d: kernel exception with a reschedule inside
       -> exception entry
       -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
       -> exception handling
       -> context_switch()
          -> perf cannot touch LBR, nobody can
       -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR
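
In pseudo-C, the entry-side test exercised in the cases above would look
roughly like this, reusing the lbr_dump_state sketched earlier (the real
patch does this in assembly; LBR_EXCEPTION_DEBUG_USAGE here plays the role
of EXCEPTION_OWNED above, and stop_lbr() is an assumed helper):

        #include <linux/percpu.h>
        #include <asm/ptrace.h>

        extern void stop_lbr(void);     /* assumed: clears DEBUGCTLMSR_LBR */

        static void lbr_exception_entry(struct pt_regs *regs)
        {
                /* case 3a: user exception, bypass LBR handling entirely */
                if (user_mode(regs))
                        return;
                /* cases 3b-3d: stop recording only if we own the LBR */
                if (__this_cpu_read(lbr_dump_state) == LBR_EXCEPTION_DEBUG_USAGE)
                        stop_lbr();
        }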

I might be very wrong in the description as I'm not that sharp on x86, but is
there a flaw in the above cases?

If not, a couple of tests and Thomas's per-cpu variable can solve the issue,
while keeping the exception handler code simple, as Emmanuel has proposed (given
the additional test inclusion - which will be designed not to pollute the LBR),
and having a small impact on perf to solve the resource acquisition issue.

Cheers.

--
Robert

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
       [not found]                                         ` <CALCETrXhfzd9Fkikvm5qj0LWgWtDzgdpY_0EC3ChwyyGZksTMw@mail.gmail.com>
@ 2014-12-07 18:40                                           ` Robert Jarzmik
  2014-12-07 19:10                                             ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Robert Jarzmik @ 2014-12-07 18:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Robert Jarzmik, linux-kernel, H. Peter Anvin, X86 ML,
	Thomas Gleixner, Berthier, Emmanuel

Hi Andy,

Andy Lutomirski <luto@amacapital.net> writes:
> On Dec 6, 2014 2:31 AM, "Robert Jarzmik" <robert.jarzmik@intel.com> wrote:
>> We would have a "LBR resource" variable to track who owns the LBR :
>> - nobody : LBR_UNCLAIMED
>> - the exception handler : LBR_EXCEPTION_DEBUG_USAGE
>
> Which exception handler? There can be several on the stack.
All of them, i.e. the LBR is used by exception handlers, so perf cannot use it, just
as Emmanuel's patch is doing, I think. Or said differently, the LBRs are reserved
for exception handlers only, whichever of them have the implementation to use them.

>> - case 3d: kernel exception with a reschedule inside
>> -> exception entry
>> -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
>> -> exception handling
>> -> context_switch()
>> -> perf cannot touch LBR, nobody can
>> -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR
>
> Careful. This is still the nested exception, and it just did the wrong thing.
Can you be more explicit about the "wrong" thing? And would that wrong thing be
solved by a per-cpu reference counter?

>> I might be very wrong in the description as I'm not that sharp on x86, but is
>> there a flaw in the above cases ?
>>
>> If not, a couple of tests and Thomas's per-cpu variable can solve the issue,
>> while keeping the exception handler code simple as Emmanuel has proposed
> (given
>> the additional test inclusion - which will be designed to not pollute the
> LBR),
>> and having a small impact on perf to solve the resource acquire issue.
>
> On current kernels, percpu memory is vmalloced, so accessing it can fault, so
> you can't touch percpu memory at all from page_fault until the vmalloc fixup
> runs. Sorry :(
What about INIT_PER_CPU_VAR (as in gdt_page)? Won't that be mapped all the time
without needing to fault in pages?

> This is a problem with rdmsr, too.
You mean rdmsr can fault in a non-hypervisor environment? Because that
definitely opens a new range of corner cases.

> It may be worth fixing that. In fact, it may be worth getting rid of lazy vmap
> entirely.
Your battle? ;)

Anyway, would a static per-cpu variable (or variables: one for resource
usage, one reference counter) solve our cases (i.e. 3d)?

-- 
Robert

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-07 18:40                                           ` Robert Jarzmik
@ 2014-12-07 19:10                                             ` Andy Lutomirski
  2014-12-12 17:30                                               ` Berthier, Emmanuel
  0 siblings, 1 reply; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-07 19:10 UTC (permalink / raw)
  To: Robert Jarzmik
  Cc: linux-kernel, H. Peter Anvin, X86 ML, Thomas Gleixner, Berthier,
	Emmanuel, Tejun Heo

On Sun, Dec 7, 2014 at 10:40 AM, Robert Jarzmik
<robert.jarzmik@intel.com> wrote:
> Hi Andy,
>
> Andy Lutomirski <luto@amacapital.net> writes:
>> On Dec 6, 2014 2:31 AM, "Robert Jarzmik" <robert.jarzmik@intel.com> wrote:
>>> We would have a "LBR resource" variable to track who owns the LBR :
>>> - nobody : LBR_UNCLAIMED
>>> - the exception handler : LBR_EXCEPTION_DEBUG_USAGE
>>
>> Which exception handler? There can be several on the stack.
> All of them, ie. LBR is used by exception handlers, ie. perf cannot use it, just
> as what Emmanuel's patch is doing I think. Or said differently LBR are reserved
> for exception handlers only, whichever have the implementation to use them.
>
>>> - case 3d: kernel exception with a reschedule inside
>>> -> exception entry
>>> -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
>>> -> exception handling
>>> -> context_switch()
>>> -> perf cannot touch LBR, nobody can
>>> -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR
>>
>> Careful. This is still the nested exception, and it just did the wrong thing.
> Can you be more explicit about the "wrong" thing ? And would that wrong thing be
> solved by a per-cpu reference counter ?

Suppose you have an int3 with a page fault inside.  If the int3
disabled LBR, then the int3 should re-enable it, and the page fault
should not.  This means that, if the inner page fault is, in fact, an
OOPS, then you don't get the LBR trace.

A per-cpu reference counter would solve it.  So would using rdmsr
instead of wrmsr, because there would be nothing to re-enable.  (The
latter also means that both exceptions get the LBR trace if they turn
out to be OOPSes.)

But a per-cpu reference counter still has the per-cpu issue below.
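
A sketch of that reference-counter variant (the names are hypothetical,
and it is still subject to the per-cpu caveat just mentioned):

        #include <linux/percpu.h>

        extern void stop_lbr(void);     /* assumed helpers */
        extern void start_lbr(void);

        static DEFINE_PER_CPU(int, lbr_stop_depth);

        static void lbr_exception_enter(void)
        {
                /* only the outermost exception stops recording */
                if (this_cpu_inc_return(lbr_stop_depth) == 1)
                        stop_lbr();
        }

        static void lbr_exception_exit(void)
        {
                /* only the outermost exit re-enables it */
                if (this_cpu_dec_return(lbr_stop_depth) == 0)
                        start_lbr();
        }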

>
>>> I might be very wrong in the description as I'm not that sharp on x86, but is
>>> there a flaw in the above cases ?
>>>
>>> If not, a couple of tests and Thomas's per-cpu variable can solve the issue,
>>> while keeping the exception handler code simple as Emmanuel has proposed
>> (given
>>> the additional test inclusion - which will be designed to not pollute the
>> LBR),
>>> and having a small impact on perf to solve the resource acquire issue.
>>
>> On current kernels, percpu memory is vmalloced, so accessing it can fault, so
>> you can't touch percpu memory at all from page_fault until the vmalloc fixup
>> runs. Sorry :(
> What about INIT_PER_CPU_VAR (as in gdt_page) ? Won't that be mapped all the time
> without need for faulting in pages ?

I'm not sure.  It may not be if CPUs are hotplugged.

>
>> This is a problem with rdmsr, too.
> You mean rdmsr can fault in a non-hypervisor environment ? Because that
> definitely opens a new range of corner cases.
>
>> It may be worth fixing that. In fact, it may be worth getting rid of lazy vmap
>> entirely.
> Your battle ? ;)
>
> Anyway, would a static per-cpu variable (or variables, one about resources
> usage, one reference counter) solve our cases (ie. 3d) ?
>

Possibly, but only if static per-cpu reference counters are safe to
touch in the exception entry code.

Tejun?

--Andy

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-07 19:10                                             ` Andy Lutomirski
@ 2014-12-12 17:30                                               ` Berthier, Emmanuel
  2014-12-12 17:54                                                 ` Andy Lutomirski
  0 siblings, 1 reply; 27+ messages in thread
From: Berthier, Emmanuel @ 2014-12-12 17:30 UTC (permalink / raw)
  To: Andy Lutomirski, Jarzmik, Robert, Thomas Gleixner
  Cc: linux-kernel, H. Peter Anvin, X86 ML, Tejun Heo

> From: Andy Lutomirski [mailto:luto@amacapital.net]
> Sent: Sunday, December 7, 2014 8:10 PM
> To: Jarzmik, Robert
> Cc: linux-kernel@vger.kernel.org; H. Peter Anvin; X86 ML; Thomas Gleixner;
> Berthier, Emmanuel; Tejun Heo
> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
> 
> On Sun, Dec 7, 2014 at 10:40 AM, Robert Jarzmik <robert.jarzmik@intel.com>
> wrote:
> > Hi Andy,
> >
> > Andy Lutomirski <luto@amacapital.net> writes:
> >> On Dec 6, 2014 2:31 AM, "Robert Jarzmik" <robert.jarzmik@intel.com>
> wrote:
> >>> We would have a "LBR resource" variable to track who owns the LBR :
> >>> - nobody : LBR_UNCLAIMED
> >>> - the exception handler : LBR_EXCEPTION_DEBUG_USAGE
> >>
> >> Which exception handler? There can be several on the stack.
> > All of them, ie. LBR is used by exception handlers, ie. perf cannot
> > use it, just as what Emmanuel's patch is doing I think. Or said
> > differently LBR are reserved for exception handlers only, whichever have
> the implementation to use them.
> >
> >>> - case 3d: kernel exception with a reschedule inside
> >>> -> exception entry
> >>> -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
> >>> -> exception handling
> >>> -> context_switch()
> >>> -> perf cannot touch LBR, nobody can
> >>> -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR
> >>
> >> Careful. This is still the nested exception, and it just did the wrong thing.
> > Can you be more explicit about the "wrong" thing ? And would that
> > wrong thing be solved by a per-cpu reference counter ?
> 
> Suppose you have an int3 with a page fault inside.  If the int3 disabled LBR,
> then the int3 should re-enable it, and the page fault should not.  This means
> that, if the inner page fault is, in fact, an OOPS, then you don't get the LBR
> trace.

Please keep in mind that LBR STOP/START is implemented in the exception handlers only, not in the interrupt handlers.

> A per-cpu reference counter would solve it.  So would using rdmsr instead of
> wrmsr, because there would be nothing to re-enable.  (The latter also means
> that both exceptions get the LBR trace if they turn out to be OOPSes.)
> 
> But a per-cpu reference counter still has the per-cpu issue below.
> 
> >
> >>> I might be very wrong in the description as I'm not that sharp on
> >>> x86, but is there a flaw in the above cases ?
> >>>
> >>> If not, a couple of tests and Thomas's per-cpu variable can solve
> >>> the issue, while keeping the exception handler code simple as
> >>> Emmanuel has proposed
> >> (given
> >>> the additional test inclusion - which will be designed to not
> >>> pollute the
> >> LBR),
> >>> and having a small impact on perf to solve the resource acquire issue.
> >>
> >> On current kernels, percpu memory is vmalloced, so accessing it can
> >> fault, so you can't touch percpu memory at all from page_fault until
> >> the vmalloc fixup runs. Sorry :(
> > What about INIT_PER_CPU_VAR (as in gdt_page) ? Won't that be mapped
> > all the time without need for faulting in pages ?
> 
> I'm not sure.  It may not if CPUs are hotplugged.

I propose to replace the per-CPU variable with an atomic usage counter.
The behavior will be all or nothing: as soon as perf_event uses the LBR on any core, the LBR dump feature will be disabled.
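
A sketch of that all-or-nothing arbitration (all names here are
assumptions, not posted code):

        #include <linux/atomic.h>

        static atomic_t lbr_perf_users = ATOMIC_INIT(0);

        /* called from the perf LBR enable/disable paths */
        static void perf_lbr_get(void)
        {
                atomic_inc(&lbr_perf_users);
        }

        static void perf_lbr_put(void)
        {
                atomic_dec(&lbr_perf_users);
        }

        /* the exception path dumps only while perf has no LBR user anywhere */
        static bool lbr_dump_allowed(void)
        {
                return atomic_read(&lbr_perf_users) == 0;
        }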

> >
> >> This is a problem with rdmsr, too.
> > You mean rdmsr can fault in a non-hypervisor environment ? Because
> > that definitely opens a new range of corner cases.
> >
> >> It may be worth fixing that. In fact, it may be worth getting rid of
> >> lazy vmap entirely.
> > Your battle ? ;)
> >
> > Anyway, would a static per-cpu variable (or variables, one about
> > resources usage, one reference counter) solve our cases (ie. 3d) ?
> >
> 
> Possibly, but only if static per-cpu reference counters are safe to touch in the
> exception entry code.
> 
> Tejun?
> 
> --Andy

For the nested exception/rescheduling topic, the only solution I see would be to store the LBRs on the stack.
But as the purpose of this feature is to catch stack corruption issues, this is not an option.
So, we have to accept that the nested case is not supported, and in case of a crash inside the nested handling, the LBR dump will be irrelevant.
I will try to add a test to report this case in the dumpstack.
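
Such a report could be as simple as the following (assuming a per-CPU
nesting counter along the lines discussed earlier; all names are
hypothetical):

        #include <linux/percpu.h>
        #include <linux/printk.h>

        /* assumed per-CPU exception-nesting counter */
        static DEFINE_PER_CPU(int, lbr_stop_depth);

        static void lbr_warn_if_nested(void)
        {
                if (__this_cpu_read(lbr_stop_depth) > 1)
                        pr_warn("LBR: nested exception, dump may be stale\n");
        }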

Thanks,

Emmanuel.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] [LBR] Dump LBRs on Exception
  2014-12-12 17:30                                               ` Berthier, Emmanuel
@ 2014-12-12 17:54                                                 ` Andy Lutomirski
  0 siblings, 0 replies; 27+ messages in thread
From: Andy Lutomirski @ 2014-12-12 17:54 UTC (permalink / raw)
  To: Berthier, Emmanuel
  Cc: Jarzmik, Robert, Thomas Gleixner, linux-kernel, H. Peter Anvin,
	X86 ML, Tejun Heo

On Fri, Dec 12, 2014 at 9:30 AM, Berthier, Emmanuel
<emmanuel.berthier@intel.com> wrote:
>> From: Andy Lutomirski [mailto:luto@amacapital.net]
>> Sent: Sunday, December 7, 2014 8:10 PM
>> To: Jarzmik, Robert
>> Cc: linux-kernel@vger.kernel.org; H. Peter Anvin; X86 ML; Thomas Gleixner;
>> Berthier, Emmanuel; Tejun Heo
>> Subject: Re: [PATCH v2] [LBR] Dump LBRs on Exception
>>
>> On Sun, Dec 7, 2014 at 10:40 AM, Robert Jarzmik <robert.jarzmik@intel.com>
>> wrote:
>> > Hi Andy,
>> >
>> > Andy Lutomirski <luto@amacapital.net> writes:
>> >> On Dec 6, 2014 2:31 AM, "Robert Jarzmik" <robert.jarzmik@intel.com>
>> wrote:
>> >>> We would have a "LBR resource" variable to track who owns the LBR :
>> >>> - nobody : LBR_UNCLAIMED
>> >>> - the exception handler : LBR_EXCEPTION_DEBUG_USAGE
>> >>
>> >> Which exception handler? There can be several on the stack.
>> > All of them, ie. LBR is used by exception handlers, ie. perf cannot
>> > use it, just as what Emmanuel's patch is doing I think. Or said
>> > differently LBR are reserved for exception handlers only, whichever have
>> the implementation to use them.
>> >
>> >>> - case 3d: kernel exception with a reschedule inside
>> >>> -> exception entry
>> >>> -> test lbr_dump_state == EXCEPTION_OWNED => true => STOP LBR
>> >>> -> exception handling
>> >>> -> context_switch()
>> >>> -> perf cannot touch LBR, nobody can
>> >>> -> test lbr_dump_state == EXCEPTION_OWNED => true => START LBR
>> >>
>> >> Careful. This is still the nested exception, and it just did the wrong thing.
>> > Can you be more explicit about the "wrong" thing ? And would that
>> > wrong thing be solved by a per-cpu reference counter ?
>>
>> Suppose you have an int3 with a page fault inside.  If the int3 disabled LBR,
>> then the int3 should re-enable it, and the page fault should not.  This means
>> that, if the inner page fault is, in fact, an OOPS, then you don't get the LBR
>> trace.
>
> Please keep in mind that LBR STOP/START is implemented in Exception handlers only, not in Interruption ones.
>
>> A per-cpu reference counter would solve it.  So would using rdmsr instead of
>> wrmsr, because there would be nothing to re-enable.  (The latter also means
>> that both exceptions get the LBR trace if they turn out to be OOPSes.)
>>
>> But a per-cpu reference counter still has the per-cpu issue below.
>>
>> >
>> >>> I might be very wrong in the description as I'm not that sharp on
>> >>> x86, but is there a flaw in the above cases ?
>> >>>
>> >>> If not, a couple of tests and Thomas's per-cpu variable can solve
>> >>> the issue, while keeping the exception handler code simple as
>> >>> Emmanuel has proposed
>> >> (given
>> >>> the additional test inclusion - which will be designed to not
>> >>> pollute the
>> >> LBR),
>> >>> and having a small impact on perf to solve the resource acquire issue.
>> >>
>> >> On current kernels, percpu memory is vmalloced, so accessing it can
>> >> fault, so you can't touch percpu memory at all from page_fault until
>> >> the vmalloc fixup runs. Sorry :(
>> > What about INIT_PER_CPU_VAR (as in gdt_page) ? Won't that be mapped
>> > all the time without need for faulting in pages ?
>>
>> I'm not sure.  It may not if CPUs are hotplugged.
>
> I propose to replace PER_CPU variable by an atomic usage counter.
> The behavior will be all or nothing: as soon as perf_event uses LBR on any Core, the lbr dump feature will be disabled.
>
>> >
>> >> This is a problem with rdmsr, too.
>> > You mean rdmsr can fault in a non-hypervisor environment ? Because
>> > that definitely opens a new range of corner cases.
>> >
>> >> It may be worth fixing that. In fact, it may be worth getting rid of
>> >> lazy vmap entirely.
>> > Your battle ? ;)
>> >
>> > Anyway, would a static per-cpu variable (or variables, one about
>> > resources usage, one reference counter) solve our cases (ie. 3d) ?
>> >
>>
>> Possibly, but only if static per-cpu reference counters are safe to touch in the
>> exception entry code.
>>
>> Tejun?
>>
>> --Andy
>
> For the nested exception/rescheduling topic, the only solution I see would be to store LBRs into the stack.
> But as the purpose of this feature is to catch stack corruption issues, this is not an option.

I don't understand the problem here.

If the stack is corrupt, causing you to follow a bad pointer and get a
page fault, the new frame written by the page fault will be fine.

If the stack *pointer* is bad, then you won't enter page_fault at all
-- you'll enter double_fault instead.  double_fault will be called
with a valid stack pointer from the IST table.

If you were to put LBRs on the stack, you'd probably want to put them
below pt_regs, which might require a bit of gymnastics, but it would
certainly be doable.  You wouldn't be able to do that by tweaking the
IDT entry, though.

You could also put them above pt_regs, which could be done by tweaking
the IDT entry (no exit change would be needed whatsoever), but you'd
have to move the hardware frame down by the size of the LBR record in
your stub.
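
As an illustration (not from any posted patch), the "above pt_regs"
variant would leave the stack looking roughly like this:

        /*
         *  higher addresses
         *  +---------------------+
         *  | saved LBR records   |  <- stored by the IDT stub in the
         *  +---------------------+     space freed below
         *  | hardware frame      |  <- SS, RSP, RFLAGS, CS, RIP, error
         *  | (moved down by the  |     code, copied down by the stub
         *  |  stub)              |
         *  +---------------------+
         *  | pt_regs             |  <- saved by the usual entry code
         *  +---------------------+
         *  lower addresses
         */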

--Andy

> So, we have to accept that the nested case is not supported, and in case of a crash inside the nested handling, the LBR dump will be irrelevant.
> I will try to add a test to report this case in the dumpstack.
>
> Thanks,
>
> Emmanuel.
>



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2014-12-12 17:55 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-21 17:03 [PATCH] [LBR] Dump LBRs on Oops Emmanuel Berthier
2014-11-22  0:50 ` Thomas Gleixner
2014-11-26 10:56   ` Berthier, Emmanuel
2014-11-26 13:08     ` Thomas Gleixner
2014-11-26 14:17       ` Berthier, Emmanuel
2014-11-26 14:46         ` Thomas Gleixner
2014-11-26 15:43           ` Berthier, Emmanuel
2014-11-27 14:40             ` [PATCH v2] [LBR] Dump LBRs on Exception Emmanuel Berthier
2014-11-27 21:22               ` Thomas Gleixner
2014-11-27 21:56                 ` Andy Lutomirski
2014-11-28  8:44                   ` Berthier, Emmanuel
2014-11-28 15:15                     ` Andy Lutomirski
2014-12-02 19:09                       ` Berthier, Emmanuel
2014-12-02 19:33                         ` Andy Lutomirski
2014-12-02 19:56                           ` Thomas Gleixner
2014-12-02 20:12                             ` Andy Lutomirski
2014-12-03 18:25                               ` Berthier, Emmanuel
2014-12-03 19:29                                 ` Andy Lutomirski
2014-12-04 16:01                                   ` Berthier, Emmanuel
2014-12-04 18:09                                     ` Andy Lutomirski
2014-12-05 13:14                                       ` Berthier, Emmanuel
2014-12-06 10:31                                       ` Robert Jarzmik
     [not found]                                         ` <CALCETrXhfzd9Fkikvm5qj0LWgWtDzgdpY_0EC3ChwyyGZksTMw@mail.gmail.com>
2014-12-07 18:40                                           ` Robert Jarzmik
2014-12-07 19:10                                             ` Andy Lutomirski
2014-12-12 17:30                                               ` Berthier, Emmanuel
2014-12-12 17:54                                                 ` Andy Lutomirski
2014-11-28 10:28                 ` Berthier, Emmanuel
