* [bug] kpti, perf_event, bts: sporadic truncated trace
@ 2018-07-12 10:15 Metzger, Markus T
From: Metzger, Markus T @ 2018-07-12 10:15 UTC
  To: Hugh Dickins, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo
  Cc: Shishkin, Alexander, Kleen, Andi, Linux Kernel Mailing List

Hello,

Starting with 4.15, I noticed that BTS sporadically misses the tail
of the trace in the perf_event data buffer.  It shows up as

    [decode error (1): instruction overflow]

in GDB.  The chance of seeing it increases the longer the debuggee
runs.  With the tiny patch [1] to one of GDB's tests, I am able to
reproduce it reliably on my box.  To run the test, use:

    $ make -s check RUNTESTFLAGS="gdb.btrace/exception.exp"

from the gdb/ sub-directory in the GDB build directory.
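
For completeness, my understanding is that GDB's BTS recording boils
down to a branch-sampling perf event (period 1 on
PERF_COUNT_HW_BRANCH_INSTRUCTIONS, which the kernel services via the
BTS facility).  A minimal standalone sketch along those lines - the
exact attr setup below is my assumption, not copied from GDB's
sources - exercises the same kernel path:

    /*
     * Hedged sketch, not GDB's actual code: open a branch-sampling
     * perf event and map its sample buffer.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <linux/perf_event.h>

    int main(int argc, char **argv)
    {
        struct perf_event_attr attr;
        struct perf_event_mmap_page *header;
        size_t pages = 1024;    /* data pages; must be a power of two */
        size_t size;
        void *base;
        pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
        attr.sample_period = 1;    /* one sample per branch -> BTS */
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR;
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;

        fd = syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        /* One metadata page plus the sample ring buffer. */
        size = (pages + 1) * (size_t)sysconf(_SC_PAGESIZE);
        base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        header = base;

        /*
         * Let the target run, then walk PERF_RECORD_SAMPLE records up
         * to header->data_head; each carries the branch source (IP)
         * and destination (ADDR).  With this bug, records near the
         * tail of the buffer sporadically come back truncated.
         */
        printf("data_head = %llu\n", (unsigned long long)header->data_head);

        munmap(base, size);
        close(fd);
        return 0;
    }

In principle, attaching this (pid as argv[1]) to the stopped debuggee
before resuming it samples the same BTS buffers the GDB test stresses.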

The issue remains even when I use 'nopti' on the kernel command-line.


Bisecting yielded commit

    c1961a4 ("x86/events/intel/ds: Map debug buffers in cpu_entry_area")

which maps the DS, BTS, and PEBS buffers into the per-CPU
cpu_entry_area.  I reverted that commit on top of v4.17 [2], and with
the revert the issue disappears when I use 'nopti' on the kernel
command-line.
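
As a side note on verifying the 'nopti' runs: assuming the sysfs file
below is present (it was added in 4.15), a quick check like this
tells whether PTI is actually disabled ("Vulnerable") or still active
("Mitigation: PTI"):

    /* Hedged sketch: report the kernel's PTI/meltdown mitigation
     * state via sysfs (file assumed present on 4.15+). */
    #include <stdio.h>

    int main(void)
    {
        char line[128];
        FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/meltdown", "r");

        if (!f) {
            perror("fopen");
            return 1;
        }
        if (fgets(line, sizeof(line), f))
            printf("meltdown: %s", line);    /* e.g. "Mitigation: PTI" */
        fclose(f);
        return 0;
    }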

regards,
markus.


[1]
diff --git a/gdb/testsuite/gdb.btrace/exception.exp b/gdb/testsuite/gdb.btrace/exception.exp
index 9408d61..a24ddd3 100755
--- a/gdb/testsuite/gdb.btrace/exception.exp
+++ b/gdb/testsuite/gdb.btrace/exception.exp
@@ -36,16 +36,12 @@ if ![runto_main] {
 gdb_test_no_output "set record function-call-history-size 0"
 
 # set bp
-set bp_1 [gdb_get_line_number "bp.1" $srcfile]
 set bp_2 [gdb_get_line_number "bp.2" $srcfile]
-gdb_breakpoint $bp_1
 gdb_breakpoint $bp_2
 
-# trace the code between the two breakpoints
-gdb_continue_to_breakpoint "cont to bp.1" ".*$srcfile:$bp_1\r\n.*"
 # increase the BTS buffer size - the trace can be quite big
-gdb_test_no_output "set record btrace bts buffer-size 128000"
-gdb_test_no_output "record btrace"
+gdb_test_no_output "set record btrace bts buffer-size 1024000"
+gdb_test_no_output "record btrace bts"
 gdb_continue_to_breakpoint "cont to bp.2" ".*$srcfile:$bp_2\r\n.*"
 
 # show the flat branch trace


[2]
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8a10a045b57b..46381f73c595 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,7 +3,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
-#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/tlbflush.h>
 #include <asm/insn.h>
@@ -282,67 +281,17 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
-{
-	unsigned long start = (unsigned long)cea;
-	phys_addr_t pa;
-	size_t msz = 0;
-
-	pa = virt_to_phys(addr);
-
-	preempt_disable();
-	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
-		cea_set_pte(cea, pa, prot);
-
-	/*
-	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
-	 * all TLB entries for it.
-	 */
-	flush_tlb_kernel_range(start, start + size);
-	preempt_enable();
-}
-
-static void ds_clear_cea(void *cea, size_t size)
-{
-	unsigned long start = (unsigned long)cea;
-	size_t msz = 0;
-
-	preempt_disable();
-	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
-		cea_set_pte(cea, 0, PAGE_NONE);
-
-	flush_tlb_kernel_range(start, start + size);
-	preempt_enable();
-}
-
-static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
-{
-	unsigned int order = get_order(size);
-	int node = cpu_to_node(cpu);
-	struct page *page;
-
-	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
-	return page ? page_address(page) : NULL;
-}
-
-static void dsfree_pages(const void *buffer, size_t size)
-{
-	if (buffer)
-		free_pages((unsigned long)buffer, get_order(size));
-}
-
 static int alloc_pebs_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
-	size_t bsiz = x86_pmu.pebs_buffer_size;
-	int max, node = cpu_to_node(cpu);
-	void *buffer, *ibuffer, *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	int node = cpu_to_node(cpu);
+	int max;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
 
-	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
@@ -353,94 +302,99 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
-			dsfree_pages(buffer, bsiz);
+			kfree(buffer);
 			return -ENOMEM;
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
-	hwev->ds_pebs_vaddr = buffer;
-	/* Update the cpu entry area mapping */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
-	ds->pebs_buffer_base = (unsigned long) cea;
-	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
+
+	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
+
+	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
 	ds->pebs_index = ds->pebs_buffer_base;
-	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+		max * x86_pmu.pebs_record_size;
+
 	return 0;
 }
 
 static void release_pebs_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	void *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
-	if (!x86_pmu.pebs)
+	if (!ds || !x86_pmu.pebs)
 		return;
 
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 
-	/* Clear the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
-	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
-	hwev->ds_pebs_vaddr = NULL;
+	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	ds->pebs_buffer_base = 0;
 }
 
 static int alloc_bts_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	struct debug_store *ds = hwev->ds;
-	void *buffer, *cea;
-	int max;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	int node = cpu_to_node(cpu);
+	int max, thresh;
+	void *buffer;
 
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
+	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 	}
-	hwev->ds_bts_vaddr = buffer;
-	/* Update the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
-	ds->bts_buffer_base = (unsigned long) cea;
-	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
+
+	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+	thresh = max / 16;
+
+	ds->bts_buffer_base = (u64)(unsigned long)buffer;
 	ds->bts_index = ds->bts_buffer_base;
-	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
-	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
+	ds->bts_absolute_maximum = ds->bts_buffer_base +
+		max * BTS_RECORD_SIZE;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+		thresh * BTS_RECORD_SIZE;
+
 	return 0;
 }
 
 static void release_bts_buffer(int cpu)
 {
-	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-	void *cea;
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
-	if (!x86_pmu.bts)
+	if (!ds || !x86_pmu.bts)
 		return;
 
-	/* Clear the fixmap */
-	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
-	ds_clear_cea(cea, BTS_BUFFER_SIZE);
-	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
-	hwev->ds_bts_vaddr = NULL;
+	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	int node = cpu_to_node(cpu);
+	struct debug_store *ds;
+
+	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+	if (unlikely(!ds))
+		return -ENOMEM;
 
-	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
+
 	return 0;
 }
 
 static void release_ds_buffer(int cpu)
 {
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
+	kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9f3711470ec1..4eb9c12915e4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -200,8 +200,6 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 */
 	struct debug_store	*ds;
-	void			*ds_pebs_vaddr;
-	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_large_pebs;


