All of lore.kernel.org
 help / color / mirror / Atom feed
* [bug] kpti, perf_event, bts: sporadic truncated trace
@ 2018-07-12 10:15 Metzger, Markus T
  2018-07-12 14:01 ` Andi Kleen
  2018-07-13  1:22 ` Hugh Dickins
  0 siblings, 2 replies; 6+ messages in thread
From: Metzger, Markus T @ 2018-07-12 10:15 UTC (permalink / raw)
  To: Hugh Dickins, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo
  Cc: Shishkin, Alexander, Kleen, Andi, Linux Kernel Mailing List

Hello,

Starting with 4.15 I noticed that BTS is sporadically missing the tail
of the trace in the perf_event data buffer.  It shows as

    [decode error (1): instruction overflow]

in GDB.  Chances to see this are higher the longer the debuggee is
running.  With this [1] tiny patch to one of GDB's tests, I am able to
reproduce it reliably on my box.  To run the test, use:

    $ make -s check RUNTESTFLAGS="gdb.btrace/exception.exp"

from the gdb/ sub-directory in the GDB build directory.

The issue remains when I use 'nopti' on the kernel command-line.


Bisecting yielded commit

    c1961a4 x86/events/intel/ds: Map debug buffers in cpu_entry_area

I reverted the commit on top of v4.17 [2] and the issue disappears
when I use 'nopti' on the kernel command-line.

regards,
markus.


[1]
diff --git a/gdb/testsuite/gdb.btrace/exception.exp b/gdb/testsuite/gdb.btrace/exception.exp
index 9408d61..a24ddd3 100755
--- a/gdb/testsuite/gdb.btrace/exception.exp
+++ b/gdb/testsuite/gdb.btrace/exception.exp
@@ -36,16 +36,12 @@ if ![runto_main] {
 gdb_test_no_output "set record function-call-history-size 0"
 
 # set bp
-set bp_1 [gdb_get_line_number "bp.1" $srcfile]
 set bp_2 [gdb_get_line_number "bp.2" $srcfile]
-gdb_breakpoint $bp_1
 gdb_breakpoint $bp_2
 
-# trace the code between the two breakpoints
-gdb_continue_to_breakpoint "cont to bp.1" ".*$srcfile:$bp_1\r\n.*"
 # increase the BTS buffer size - the trace can be quite big
-gdb_test_no_output "set record btrace bts buffer-size 128000"
-gdb_test_no_output "record btrace"
+gdb_test_no_output "set record btrace bts buffer-size 1024000"
+gdb_test_no_output "record btrace bts"
 gdb_continue_to_breakpoint "cont to bp.2" ".*$srcfile:$bp_2\r\n.*"
 
 # show the flat branch trace


[2]
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8a10a045b57b..46381f73c595 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,7 +3,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
-#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/insn.h>
@@ -282,67 +281,17 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
-{
-	unsigned long start = (unsigned long)cea;
-	phys_addr_t pa;
-	size_t msz = 0;
-
-	pa = virt_to_phys(addr);
-
-	preempt_disable();
-	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
-		cea_set_pte(cea, pa, prot);
-
-	/*
-	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
-	 * all TLB entries for it.
-	 */
-	flush_tlb_kernel_range(start, start + size);
-	preempt_enable();
-}
-
-static void ds_clear_cea(void *cea, size_t size)
-{
-	unsigned long start = (unsigned long)cea;
-	size_t msz = 0;
-
-	preempt_disable();
-	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
-		cea_set_pte(cea, 0, PAGE_NONE);
-
-	flush_tlb_kernel_range(start, start + size);
-	preempt_enable();
-}
-
-static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
-{
-	unsigned int order = get_order(size);
-	int node = cpu_to_node(cpu);
-	struct page *page;
-
-	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
-	return page ? page_address(page) : NULL;
-}
-
-static void dsfree_pages(const void *buffer, size_t size)
-{
-	if (buffer)
-		free_pages((unsigned long)buffer, get_order(size));
-}
-
 static int alloc_pebs_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
-	size_t bsiz = x86_pmu.pebs_buffer_size;
-	int max, node = cpu_to_node(cpu);
-	void *buffer, *ibuffer, *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	int node = cpu_to_node(cpu);
+	int max;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -353,94 +302,99 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			dsfree_pages(buffer, bsiz);
+			kfree(buffer);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
-	hwev->ds_pebs_vaddr = buffer;
-	/* Update the cpu entry area mapping */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
-	ds->pebs_buffer_base = (unsigned long) cea;
-	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
+
+	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+
+	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
 	ds->pebs_index = ds->pebs_buffer_base;
-	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+		max * x86_pmu.pebs_record_size;
+
 	return 0;
 }
 
 static void release_pebs_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	void *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
-	if (!x86_pmu.pebs)
+	if (!ds || !x86_pmu.pebs)
 		return;
 
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	/* Clear the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
-	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
-	hwev->ds_pebs_vaddr = NULL;
+	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	ds->pebs_buffer_base = 0;
 }
 
 static int alloc_bts_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
-	void *buffer, *cea;
-	int max;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	int node = cpu_to_node(cpu);
+	int max, thresh;
+	void *buffer;
 
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
+	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 	}
-	hwev->ds_bts_vaddr = buffer;
-	/* Update the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
-	ds->bts_buffer_base = (unsigned long) cea;
-	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
+
+	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+	thresh = max / 16;
+
+	ds->bts_buffer_base = (u64)(unsigned long)buffer;
 	ds->bts_index = ds->bts_buffer_base;
-	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
-	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
+	ds->bts_absolute_maximum = ds->bts_buffer_base +
+		max * BTS_RECORD_SIZE;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+		thresh * BTS_RECORD_SIZE;
+
 	return 0;
 }
 
 static void release_bts_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	void *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
-	if (!x86_pmu.bts)
+	if (!ds || !x86_pmu.bts)
 		return;
 
-	/* Clear the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
-	ds_clear_cea(cea, BTS_BUFFER_SIZE);
-	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
-	hwev->ds_bts_vaddr = NULL;
+	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	int node = cpu_to_node(cpu);
+	struct debug_store *ds;
+
+	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+	if (unlikely(!ds))
+		return -ENOMEM;
 
-	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
+
 	return 0;
 }
 
 static void release_ds_buffer(int cpu)
 {
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
+	kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9f3711470ec1..4eb9c12915e4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -200,8 +200,6 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 */
 	struct debug_store	*ds;
-	void			*ds_pebs_vaddr;
-	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_large_pebs;

Intel Deutschland GmbH
Registered Address: Am Campeon 10-12, 85579 Neubiberg, Germany
Tel: +49 89 99 8853-0, www.intel.de
Managing Directors: Christin Eisenschmid, Christian Lamprechter
Chairperson of the Supervisory Board: Nicole Lau
Registered Office: Munich
Commercial Register: Amtsgericht Muenchen HRB 186928


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [bug] kpti, perf_event, bts: sporadic truncated trace
  2018-07-12 10:15 [bug] kpti, perf_event, bts: sporadic truncated trace Metzger, Markus T
@ 2018-07-12 14:01 ` Andi Kleen
  2018-07-13  1:22 ` Hugh Dickins
  1 sibling, 0 replies; 6+ messages in thread
From: Andi Kleen @ 2018-07-12 14:01 UTC (permalink / raw)
  To: Metzger, Markus T
  Cc: Hugh Dickins, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Shishkin, Alexander, Kleen, Andi,
	Linux Kernel Mailing List, dave.hansen

"Metzger, Markus T" <markus.t.metzger@intel.com> writes:

Adding Dave Hansen

> Hello,
>
> Starting with 4.15 I noticed that BTS is sporadically missing the tail
> of the trace in the perf_event data buffer.  It shows as
>
>     [decode error (1): instruction overflow]
>
> in GDB.  Chances to see this are higher the longer the debuggee is
> running.  With this [1] tiny patch to one of GDB's tests, I am able to
> reproduce it reliably on my box.  To run the test, use:
>
>     $ make -s check RUNTESTFLAGS="gdb.btrace/exception.exp"
>
> from the gdb/ sub-directory in the GDB build directory.
>
> The issue remains when I use 'nopti' on the kernel command-line.
>
>
> Bisecting yielded commit
>
>     c1961a4 x86/events/intel/ds: Map debug buffers in cpu_entry_area
>
> I reverted the commit on top of v4.17 [2] and the issue disappears
> when I use 'nopti' on the kernel command-line.
>
> regards,
> markus.
>
>
> [1]
> diff --git a/gdb/testsuite/gdb.btrace/exception.exp b/gdb/testsuite/gdb.btrace/exception.exp
> index 9408d61..a24ddd3 100755
> --- a/gdb/testsuite/gdb.btrace/exception.exp
> +++ b/gdb/testsuite/gdb.btrace/exception.exp
> @@ -36,16 +36,12 @@ if ![runto_main] {
>  gdb_test_no_output "set record function-call-history-size 0"
>  
>  # set bp
> -set bp_1 [gdb_get_line_number "bp.1" $srcfile]
>  set bp_2 [gdb_get_line_number "bp.2" $srcfile]
> -gdb_breakpoint $bp_1
>  gdb_breakpoint $bp_2
>  
> -# trace the code between the two breakpoints
> -gdb_continue_to_breakpoint "cont to bp.1" ".*$srcfile:$bp_1\r\n.*"
>  # increase the BTS buffer size - the trace can be quite big
> -gdb_test_no_output "set record btrace bts buffer-size 128000"
> -gdb_test_no_output "record btrace"
> +gdb_test_no_output "set record btrace bts buffer-size 1024000"
> +gdb_test_no_output "record btrace bts"
>  gdb_continue_to_breakpoint "cont to bp.2" ".*$srcfile:$bp_2\r\n.*"
>  
>  # show the flat branch trace
>
>
> [2]
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 8a10a045b57b..46381f73c595 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -3,7 +3,6 @@
>  #include <linux/types.h>
>  #include <linux/slab.h>
>  
> -#include <asm/cpu_entry_area.h>
>  #include <asm/perf_event.h>
>  #include <asm/tlbflush.h>
>  #include <asm/insn.h>
> @@ -282,67 +281,17 @@ void fini_debug_store_on_cpu(int cpu)
>  
>  static DEFINE_PER_CPU(void *, insn_buffer);
>  
> -static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
> -{
> -	unsigned long start = (unsigned long)cea;
> -	phys_addr_t pa;
> -	size_t msz = 0;
> -
> -	pa = virt_to_phys(addr);
> -
> -	preempt_disable();
> -	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
> -		cea_set_pte(cea, pa, prot);
> -
> -	/*
> -	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
> -	 * all TLB entries for it.
> -	 */
> -	flush_tlb_kernel_range(start, start + size);
> -	preempt_enable();
> -}
> -
> -static void ds_clear_cea(void *cea, size_t size)
> -{
> -	unsigned long start = (unsigned long)cea;
> -	size_t msz = 0;
> -
> -	preempt_disable();
> -	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
> -		cea_set_pte(cea, 0, PAGE_NONE);
> -
> -	flush_tlb_kernel_range(start, start + size);
> -	preempt_enable();
> -}
> -
> -static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
> -{
> -	unsigned int order = get_order(size);
> -	int node = cpu_to_node(cpu);
> -	struct page *page;
> -
> -	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
> -	return page ? page_address(page) : NULL;
> -}
> -
> -static void dsfree_pages(const void *buffer, size_t size)
> -{
> -	if (buffer)
> -		free_pages((unsigned long)buffer, get_order(size));
> -}
> -
>  static int alloc_pebs_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	struct debug_store *ds = hwev->ds;
> -	size_t bsiz = x86_pmu.pebs_buffer_size;
> -	int max, node = cpu_to_node(cpu);
> -	void *buffer, *ibuffer, *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +	int node = cpu_to_node(cpu);
> +	int max;
> +	void *buffer, *ibuffer;
>  
>  	if (!x86_pmu.pebs)
>  		return 0;
>  
> -	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
> +	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
>  	if (unlikely(!buffer))
>  		return -ENOMEM;
>  
> @@ -353,94 +302,99 @@ static int alloc_pebs_buffer(int cpu)
>  	if (x86_pmu.intel_cap.pebs_format < 2) {
>  		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
>  		if (!ibuffer) {
> -			dsfree_pages(buffer, bsiz);
> +			kfree(buffer);
>  			return -ENOMEM;
>  		}
>  		per_cpu(insn_buffer, cpu) = ibuffer;
>  	}
> -	hwev->ds_pebs_vaddr = buffer;
> -	/* Update the cpu entry area mapping */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
> -	ds->pebs_buffer_base = (unsigned long) cea;
> -	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
> +
> +	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
> +
> +	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
>  	ds->pebs_index = ds->pebs_buffer_base;
> -	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
> -	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
> +	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
> +		max * x86_pmu.pebs_record_size;
> +
>  	return 0;
>  }
>  
>  static void release_pebs_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	void *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  
> -	if (!x86_pmu.pebs)
> +	if (!ds || !x86_pmu.pebs)
>  		return;
>  
>  	kfree(per_cpu(insn_buffer, cpu));
>  	per_cpu(insn_buffer, cpu) = NULL;
>  
> -	/* Clear the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
> -	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
> -	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
> -	hwev->ds_pebs_vaddr = NULL;
> +	kfree((void *)(unsigned long)ds->pebs_buffer_base);
> +	ds->pebs_buffer_base = 0;
>  }
>  
>  static int alloc_bts_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	struct debug_store *ds = hwev->ds;
> -	void *buffer, *cea;
> -	int max;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +	int node = cpu_to_node(cpu);
> +	int max, thresh;
> +	void *buffer;
>  
>  	if (!x86_pmu.bts)
>  		return 0;
>  
> -	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
> +	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
>  	if (unlikely(!buffer)) {
>  		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
>  		return -ENOMEM;
>  	}
> -	hwev->ds_bts_vaddr = buffer;
> -	/* Update the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
> -	ds->bts_buffer_base = (unsigned long) cea;
> -	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
> +
> +	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
> +	thresh = max / 16;
> +
> +	ds->bts_buffer_base = (u64)(unsigned long)buffer;
>  	ds->bts_index = ds->bts_buffer_base;
> -	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
> -	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
> -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
> +	ds->bts_absolute_maximum = ds->bts_buffer_base +
> +		max * BTS_RECORD_SIZE;
> +	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
> +		thresh * BTS_RECORD_SIZE;
> +
>  	return 0;
>  }
>  
>  static void release_bts_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	void *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  
> -	if (!x86_pmu.bts)
> +	if (!ds || !x86_pmu.bts)
>  		return;
>  
> -	/* Clear the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
> -	ds_clear_cea(cea, BTS_BUFFER_SIZE);
> -	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
> -	hwev->ds_bts_vaddr = NULL;
> +	kfree((void *)(unsigned long)ds->bts_buffer_base);
> +	ds->bts_buffer_base = 0;
>  }
>  
>  static int alloc_ds_buffer(int cpu)
>  {
> -	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
> +	int node = cpu_to_node(cpu);
> +	struct debug_store *ds;
> +
> +	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
> +	if (unlikely(!ds))
> +		return -ENOMEM;
>  
> -	memset(ds, 0, sizeof(*ds));
>  	per_cpu(cpu_hw_events, cpu).ds = ds;
> +
>  	return 0;
>  }
>  
>  static void release_ds_buffer(int cpu)
>  {
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +
> +	if (!ds)
> +		return;
> +
>  	per_cpu(cpu_hw_events, cpu).ds = NULL;
> +	kfree(ds);
>  }
>  
>  void release_ds_buffers(void)
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 9f3711470ec1..4eb9c12915e4 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -200,8 +200,6 @@ struct cpu_hw_events {
>  	 * Intel DebugStore bits
>  	 */
>  	struct debug_store	*ds;
> -	void			*ds_pebs_vaddr;
> -	void			*ds_bts_vaddr;
>  	u64			pebs_enabled;
>  	int			n_pebs;
>  	int			n_large_pebs;
>
> Intel Deutschland GmbH
> Registered Address: Am Campeon 10-12, 85579 Neubiberg, Germany
> Tel: +49 89 99 8853-0, www.intel.de
> Managing Directors: Christin Eisenschmid, Christian Lamprechter
> Chairperson of the Supervisory Board: Nicole Lau
> Registered Office: Munich
> Commercial Register: Amtsgericht Muenchen HRB 186928

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [bug] kpti, perf_event, bts: sporadic truncated trace
  2018-07-12 10:15 [bug] kpti, perf_event, bts: sporadic truncated trace Metzger, Markus T
  2018-07-12 14:01 ` Andi Kleen
@ 2018-07-13  1:22 ` Hugh Dickins
  2018-07-13 10:06   ` Metzger, Markus T
  2018-07-13 10:53   ` Peter Zijlstra
  1 sibling, 2 replies; 6+ messages in thread
From: Hugh Dickins @ 2018-07-13  1:22 UTC (permalink / raw)
  To: Metzger, Markus T
  Cc: Hugh Dickins, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Shishkin, Alexander, Kleen, Andi,
	Dave Hansen, Thomas Gleixner, Stephane Eranian,
	Linux Kernel Mailing List

On Thu, 12 Jul 2018, Metzger, Markus T wrote:

> Hello,
> 
> Starting with 4.15 I noticed that BTS is sporadically missing the tail
> of the trace in the perf_event data buffer.  It shows as
> 
>     [decode error (1): instruction overflow]
> 
> in GDB.  Chances to see this are higher the longer the debuggee is
> running.  With this [1] tiny patch to one of GDB's tests, I am able to
> reproduce it reliably on my box.  To run the test, use:
> 
>     $ make -s check RUNTESTFLAGS="gdb.btrace/exception.exp"
> 
> from the gdb/ sub-directory in the GDB build directory.
> 
> The issue remains when I use 'nopti' on the kernel command-line.
> 
> 
> Bisecting yielded commit
> 
>     c1961a4 x86/events/intel/ds: Map debug buffers in cpu_entry_area
> 
> I reverted the commit on top of v4.17 [2] and the issue disappears
> when I use 'nopti' on the kernel command-line.
> 
> regards,
> markus.
> 
> 
> [1]
> diff --git a/gdb/testsuite/gdb.btrace/exception.exp b/gdb/testsuite/gdb.btrace/exception.exp
> index 9408d61..a24ddd3 100755
> --- a/gdb/testsuite/gdb.btrace/exception.exp
> +++ b/gdb/testsuite/gdb.btrace/exception.exp
> @@ -36,16 +36,12 @@ if ![runto_main] {
>  gdb_test_no_output "set record function-call-history-size 0"
>  
>  # set bp
> -set bp_1 [gdb_get_line_number "bp.1" $srcfile]
>  set bp_2 [gdb_get_line_number "bp.2" $srcfile]
> -gdb_breakpoint $bp_1
>  gdb_breakpoint $bp_2
>  
> -# trace the code between the two breakpoints
> -gdb_continue_to_breakpoint "cont to bp.1" ".*$srcfile:$bp_1\r\n.*"
>  # increase the BTS buffer size - the trace can be quite big
> -gdb_test_no_output "set record btrace bts buffer-size 128000"
> -gdb_test_no_output "record btrace"
> +gdb_test_no_output "set record btrace bts buffer-size 1024000"
> +gdb_test_no_output "record btrace bts"
>  gdb_continue_to_breakpoint "cont to bp.2" ".*$srcfile:$bp_2\r\n.*"
>  
>  # show the flat branch trace
> 
> 
> [2]
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
[ snipped the revert ]

Although my name was kept on that commit as a generous courtesy, it
did change a lot after leaving my fingers - and I was never the best
person to be making perf changes in the first place!

I'm sorry to hear that it's breaking you, I've spent a little while
looking through its final state, most of it looks fine to me, but I
notice one discrepancy: whose effect I cannot predict at all, but
there's a chance that it has something to do with what you're seeing.

A little "optimization" crept into alloc_bts_buffer() along the way,
which now places bts_interrupt_threshold not on a record boundary.
And Stephane has shown me the sentence in Vol 3B, 17.4.9, which says
"This address must point to an offset from the BTS buffer base that
is a multiple of the BTS record size."

Please give the patch below a try, and let us know if it helps (if it
does not, then I think we'll need perfier expertise than I can give).

Hugh

--- 4.18-rc4/arch/x86/events/intel/ds.c	2018-06-03 14:15:21.000000000 -0700
+++ linux/arch/x86/events/intel/ds.c	2018-07-12 17:38:28.471378616 -0700
@@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu)
 	ds->bts_buffer_base = (unsigned long) cea;
 	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
-	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
-	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
+	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+	ds->bts_absolute_maximum = ds->bts_buffer_base +
+					max * BTS_RECORD_SIZE;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+					(max / 16) * BTS_RECORD_SIZE;
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [bug] kpti, perf_event, bts: sporadic truncated trace
  2018-07-13  1:22 ` Hugh Dickins
@ 2018-07-13 10:06   ` Metzger, Markus T
  2018-07-13 18:31     ` Hugh Dickins
  2018-07-13 10:53   ` Peter Zijlstra
  1 sibling, 1 reply; 6+ messages in thread
From: Metzger, Markus T @ 2018-07-13 10:06 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Shishkin,
	Alexander, Kleen, Andi, Hansen, Dave, Thomas Gleixner,
	Stephane Eranian, Linux Kernel Mailing List

Hello Hugh,

> A little "optimization" crept into alloc_bts_buffer() along the way, which now
> places bts_interrupt_threshold not on a record boundary.
> And Stephane has shown me the sentence in Vol 3B, 17.4.9, which says "This
> address must point to an offset from the BTS buffer base that is a multiple of the
> BTS record size."
> 
> Please give the patch below a try, and let us know if it helps (if it does not, then I
> think we'll need perfier expertise than I can give).
> 
> Hugh
> 
> --- 4.18-rc4/arch/x86/events/intel/ds.c	2018-06-03 14:15:21.000000000 -0700
> +++ linux/arch/x86/events/intel/ds.c	2018-07-12 17:38:28.471378616 -0700
> @@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu)
>  	ds->bts_buffer_base = (unsigned long) cea;
>  	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
>  	ds->bts_index = ds->bts_buffer_base;
> -	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
> -	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
> -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
> +	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
> +	ds->bts_absolute_maximum = ds->bts_buffer_base +
> +					max * BTS_RECORD_SIZE;
> +	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
> +					(max / 16) * BTS_RECORD_SIZE;
>  	return 0;
>  }
> 

Your patch fixes it.

Will you send it to the list for inclusion?  I'd appreciate if it could also be backported
to 4.15, 4.16, and 4.17.

Thanks,
Markus.

Intel Deutschland GmbH
Registered Address: Am Campeon 10-12, 85579 Neubiberg, Germany
Tel: +49 89 99 8853-0, www.intel.de
Managing Directors: Christin Eisenschmid, Christian Lamprechter
Chairperson of the Supervisory Board: Nicole Lau
Registered Office: Munich
Commercial Register: Amtsgericht Muenchen HRB 186928


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [bug] kpti, perf_event, bts: sporadic truncated trace
  2018-07-13  1:22 ` Hugh Dickins
  2018-07-13 10:06   ` Metzger, Markus T
@ 2018-07-13 10:53   ` Peter Zijlstra
  1 sibling, 0 replies; 6+ messages in thread
From: Peter Zijlstra @ 2018-07-13 10:53 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Metzger, Markus T, Ingo Molnar, Arnaldo Carvalho de Melo,
	Shishkin, Alexander, Kleen, Andi, Dave Hansen, Thomas Gleixner,
	Stephane Eranian, Linux Kernel Mailing List

On Thu, Jul 12, 2018 at 06:22:13PM -0700, Hugh Dickins wrote:
> A little "optimization" crept into alloc_bts_buffer() along the way,
> which now places bts_interrupt_threshold not on a record boundary.
> And Stephane has shown me the sentence in Vol 3B, 17.4.9, which says
> "This address must point to an offset from the BTS buffer base that
> is a multiple of the BTS record size."
> 
> Please give the patch below a try, and let us know if it helps (if it
> does not, then I think we'll need perfier expertise than I can give).

Ooh, good find!

> --- 4.18-rc4/arch/x86/events/intel/ds.c	2018-06-03 14:15:21.000000000 -0700
> +++ linux/arch/x86/events/intel/ds.c	2018-07-12 17:38:28.471378616 -0700
> @@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu)
>  	ds->bts_buffer_base = (unsigned long) cea;
>  	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
>  	ds->bts_index = ds->bts_buffer_base;
> -	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
> -	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
> -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
> +	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
> +	ds->bts_absolute_maximum = ds->bts_buffer_base +
> +					max * BTS_RECORD_SIZE;
> +	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
> +					(max / 16) * BTS_RECORD_SIZE;
>  	return 0;
>  }
>  

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [bug] kpti, perf_event, bts: sporadic truncated trace
  2018-07-13 10:06   ` Metzger, Markus T
@ 2018-07-13 18:31     ` Hugh Dickins
  0 siblings, 0 replies; 6+ messages in thread
From: Hugh Dickins @ 2018-07-13 18:31 UTC (permalink / raw)
  To: Metzger, Markus T
  Cc: Hugh Dickins, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Shishkin, Alexander, Kleen, Andi,
	Hansen, Dave, Thomas Gleixner, Stephane Eranian,
	Linux Kernel Mailing List

On Fri, 13 Jul 2018, Metzger, Markus T wrote:
> Hello Hugh,
> 
> > A little "optimization" crept into alloc_bts_buffer() along the way, which now
> > places bts_interrupt_threshold not on a record boundary.
> > And Stephane has shown me the sentence in Vol 3B, 17.4.9, which says "This
> > address must point to an offset from the BTS buffer base that is a multiple of the
> > BTS record size."
> > 
> > Please give the patch below a try, and let us know if it helps (if it does not, then I
> > think we'll need perfier expertise than I can give).
> > 
> > Hugh
> > 
> > --- 4.18-rc4/arch/x86/events/intel/ds.c	2018-06-03 14:15:21.000000000 -0700
> > +++ linux/arch/x86/events/intel/ds.c	2018-07-12 17:38:28.471378616 -0700
> > @@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu)
> >  	ds->bts_buffer_base = (unsigned long) cea;
> >  	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
> >  	ds->bts_index = ds->bts_buffer_base;
> > -	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
> > -	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
> > -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
> > +	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
> > +	ds->bts_absolute_maximum = ds->bts_buffer_base +
> > +					max * BTS_RECORD_SIZE;
> > +	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
> > +					(max / 16) * BTS_RECORD_SIZE;
> >  	return 0;
> >  }
> > 
> 
> Your patch fixes it.

Oh, very welcome news! Thanks for putting in all the effort you did to
track down from your end: I'm relieved it was easier to fix than that.

> 
> Will you send it to the list for inclusion?

Sure, I'll send to x86 guys and lkml later today.  I can't promise that
it will make 4.18, since the regression it fixes pre-dates 4.18-rc; but
they don't usually insist on that rule for something as small and benign
as this, so I expect it'll make it.

> I'd appreciate if it could also be backported
> to 4.15, 4.16, and 4.17.

I'll mark it Cc stable, so that it will soon percolate through to the
4.14 longterm and 4.17 stable trees.  But 4.15 and 4.16 stable trees are
now EOL, so you'll have to apply the patch where you need it yourself -
it has no release-dependent subtlety, it should apply cleanly at offset.

Hugh

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-07-13 18:31 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-12 10:15 [bug] kpti, perf_event, bts: sporadic truncated trace Metzger, Markus T
2018-07-12 14:01 ` Andi Kleen
2018-07-13  1:22 ` Hugh Dickins
2018-07-13 10:06   ` Metzger, Markus T
2018-07-13 18:31     ` Hugh Dickins
2018-07-13 10:53   ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.