From: Andi Kleen <ak@linux.intel.com>
To: "Metzger\, Markus T" <markus.t.metzger@intel.com>
Cc: Hugh Dickins <hughd@google.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	"Shishkin, Alexander" <alexander.shishkin@intel.com>,
	"Kleen, Andi" <andi.kleen@intel.com>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	dave.hansen@intel.com
Subject: Re: [bug] kpti, perf_event, bts: sporadic truncated trace
Date: Thu, 12 Jul 2018 07:01:35 -0700
Message-ID: <87h8l4h7sw.fsf@linux.intel.com>
In-Reply-To: <A78C989F6D9628469189715575E55B236B330102@IRSMSX104.ger.corp.intel.com> (Markus T. Metzger's message of "Thu, 12 Jul 2018 10:15:32 +0000")

"Metzger, Markus T" <markus.t.metzger@intel.com> writes:

Adding Dave Hansen

> Hello,
>
> Starting with 4.15, I noticed that BTS is sporadically missing the tail
> of the trace in the perf_event data buffer.  It shows up as
>
>     [decode error (1): instruction overflow]
>
> in GDB.  The chances of seeing this increase the longer the debuggee
> runs.  With the tiny patch [1] to one of GDB's tests, I can reproduce
> it reliably on my box.  To run the test, use:
>
>     $ make -s check RUNTESTFLAGS="gdb.btrace/exception.exp"
>
> from the gdb/ sub-directory in the GDB build directory.
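
For context, GDB requests BTS through the perf_event interface.  Below is
a minimal sketch of such a request; it approximates what GDB's btrace
support does, but the attribute choices are an assumption on my part, not
GDB's verbatim code.  On Intel hardware, a branch-instructions event with
sample_period == 1 is typically serviced by BTS:

    /* Sketch: request per-branch tracing via perf_event_open(2).
     * Attribute settings are illustrative, not copied from GDB. */
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    static int open_branch_trace(pid_t pid)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
            attr.sample_period = 1;        /* every branch -> BTS */
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR;
            attr.exclude_kernel = 1;       /* user-space branches only */
            attr.exclude_hv = 1;

            /* perf_event_open() has no glibc wrapper. */
            return syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
    }

The trace is then read from the event's mmap'ed data buffer, which is
where the truncation is observed.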
>
> The issue remains when I use 'nopti' on the kernel command-line.
>
>
> Bisecting yielded commit
>
>     c1961a4 x86/events/intel/ds: Map debug buffers in cpu_entry_area
>
> I reverted the commit on top of v4.17 [2], and the issue disappears
> when I use 'nopti' on the kernel command-line.
>
> regards,
> markus.
>
>
> [1]
> diff --git a/gdb/testsuite/gdb.btrace/exception.exp b/gdb/testsuite/gdb.btrace/exception.exp
> index 9408d61..a24ddd3 100755
> --- a/gdb/testsuite/gdb.btrace/exception.exp
> +++ b/gdb/testsuite/gdb.btrace/exception.exp
> @@ -36,16 +36,12 @@ if ![runto_main] {
>  gdb_test_no_output "set record function-call-history-size 0"
>  
>  # set bp
> -set bp_1 [gdb_get_line_number "bp.1" $srcfile]
>  set bp_2 [gdb_get_line_number "bp.2" $srcfile]
> -gdb_breakpoint $bp_1
>  gdb_breakpoint $bp_2
>  
> -# trace the code between the two breakpoints
> -gdb_continue_to_breakpoint "cont to bp.1" ".*$srcfile:$bp_1\r\n.*"
>  # increase the BTS buffer size - the trace can be quite big
> -gdb_test_no_output "set record btrace bts buffer-size 128000"
> -gdb_test_no_output "record btrace"
> +gdb_test_no_output "set record btrace bts buffer-size 1024000"
> +gdb_test_no_output "record btrace bts"
>  gdb_continue_to_breakpoint "cont to bp.2" ".*$srcfile:$bp_2\r\n.*"
>  
>  # show the flat branch trace
>
>
> [2]
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 8a10a045b57b..46381f73c595 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -3,7 +3,6 @@
>  #include <linux/types.h>
>  #include <linux/slab.h>
>  
> -#include <asm/cpu_entry_area.h>
>  #include <asm/perf_event.h>
>  #include <asm/tlbflush.h>
>  #include <asm/insn.h>
> @@ -282,67 +281,17 @@ void fini_debug_store_on_cpu(int cpu)
>  
>  static DEFINE_PER_CPU(void *, insn_buffer);
>  
> -static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
> -{
> -	unsigned long start = (unsigned long)cea;
> -	phys_addr_t pa;
> -	size_t msz = 0;
> -
> -	pa = virt_to_phys(addr);
> -
> -	preempt_disable();
> -	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
> -		cea_set_pte(cea, pa, prot);
> -
> -	/*
> -	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down
> -	 * all TLB entries for it.
> -	 */
> -	flush_tlb_kernel_range(start, start + size);
> -	preempt_enable();
> -}
> -
> -static void ds_clear_cea(void *cea, size_t size)
> -{
> -	unsigned long start = (unsigned long)cea;
> -	size_t msz = 0;
> -
> -	preempt_disable();
> -	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
> -		cea_set_pte(cea, 0, PAGE_NONE);
> -
> -	flush_tlb_kernel_range(start, start + size);
> -	preempt_enable();
> -}
> -
> -static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
> -{
> -	unsigned int order = get_order(size);
> -	int node = cpu_to_node(cpu);
> -	struct page *page;
> -
> -	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
> -	return page ? page_address(page) : NULL;
> -}
> -
> -static void dsfree_pages(const void *buffer, size_t size)
> -{
> -	if (buffer)
> -		free_pages((unsigned long)buffer, get_order(size));
> -}
> -
>  static int alloc_pebs_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	struct debug_store *ds = hwev->ds;
> -	size_t bsiz = x86_pmu.pebs_buffer_size;
> -	int max, node = cpu_to_node(cpu);
> -	void *buffer, *ibuffer, *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +	int node = cpu_to_node(cpu);
> +	int max;
> +	void *buffer, *ibuffer;
>  
>  	if (!x86_pmu.pebs)
>  		return 0;
>  
> -	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
> +	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
>  	if (unlikely(!buffer))
>  		return -ENOMEM;
>  
> @@ -353,94 +302,99 @@ static int alloc_pebs_buffer(int cpu)
>  	if (x86_pmu.intel_cap.pebs_format < 2) {
>  		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
>  		if (!ibuffer) {
> -			dsfree_pages(buffer, bsiz);
> +			kfree(buffer);
>  			return -ENOMEM;
>  		}
>  		per_cpu(insn_buffer, cpu) = ibuffer;
>  	}
> -	hwev->ds_pebs_vaddr = buffer;
> -	/* Update the cpu entry area mapping */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
> -	ds->pebs_buffer_base = (unsigned long) cea;
> -	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
> +
> +	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
> +
> +	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
>  	ds->pebs_index = ds->pebs_buffer_base;
> -	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
> -	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
> +	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
> +		max * x86_pmu.pebs_record_size;
> +
>  	return 0;
>  }
>  
>  static void release_pebs_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	void *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  
> -	if (!x86_pmu.pebs)
> +	if (!ds || !x86_pmu.pebs)
>  		return;
>  
>  	kfree(per_cpu(insn_buffer, cpu));
>  	per_cpu(insn_buffer, cpu) = NULL;
>  
> -	/* Clear the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
> -	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
> -	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
> -	hwev->ds_pebs_vaddr = NULL;
> +	kfree((void *)(unsigned long)ds->pebs_buffer_base);
> +	ds->pebs_buffer_base = 0;
>  }
>  
>  static int alloc_bts_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	struct debug_store *ds = hwev->ds;
> -	void *buffer, *cea;
> -	int max;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +	int node = cpu_to_node(cpu);
> +	int max, thresh;
> +	void *buffer;
>  
>  	if (!x86_pmu.bts)
>  		return 0;
>  
> -	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
> +	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
>  	if (unlikely(!buffer)) {
>  		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
>  		return -ENOMEM;
>  	}
> -	hwev->ds_bts_vaddr = buffer;
> -	/* Update the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
> -	ds->bts_buffer_base = (unsigned long) cea;
> -	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
> +
> +	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
> +	thresh = max / 16;
> +
> +	ds->bts_buffer_base = (u64)(unsigned long)buffer;
>  	ds->bts_index = ds->bts_buffer_base;
> -	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
> -	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
> -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
> +	ds->bts_absolute_maximum = ds->bts_buffer_base +
> +		max * BTS_RECORD_SIZE;
> +	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
> +		thresh * BTS_RECORD_SIZE;
> +
>  	return 0;
>  }
>  
>  static void release_bts_buffer(int cpu)
>  {
> -	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
> -	void *cea;
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  
> -	if (!x86_pmu.bts)
> +	if (!ds || !x86_pmu.bts)
>  		return;
>  
> -	/* Clear the fixmap */
> -	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
> -	ds_clear_cea(cea, BTS_BUFFER_SIZE);
> -	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
> -	hwev->ds_bts_vaddr = NULL;
> +	kfree((void *)(unsigned long)ds->bts_buffer_base);
> +	ds->bts_buffer_base = 0;
>  }
>  
>  static int alloc_ds_buffer(int cpu)
>  {
> -	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
> +	int node = cpu_to_node(cpu);
> +	struct debug_store *ds;
> +
> +	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
> +	if (unlikely(!ds))
> +		return -ENOMEM;
>  
> -	memset(ds, 0, sizeof(*ds));
>  	per_cpu(cpu_hw_events, cpu).ds = ds;
> +
>  	return 0;
>  }
>  
>  static void release_ds_buffer(int cpu)
>  {
> +	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
> +
> +	if (!ds)
> +		return;
> +
>  	per_cpu(cpu_hw_events, cpu).ds = NULL;
> +	kfree(ds);
>  }
>  
>  void release_ds_buffers(void)
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index 9f3711470ec1..4eb9c12915e4 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -200,8 +200,6 @@ struct cpu_hw_events {
>  	 * Intel DebugStore bits
>  	 */
>  	struct debug_store	*ds;
> -	void			*ds_pebs_vaddr;
> -	void			*ds_bts_vaddr;
>  	u64			pebs_enabled;
>  	int			n_pebs;
>  	int			n_large_pebs;
>

Thread overview: 6+ messages
2018-07-12 10:15 [bug] kpti, perf_event, bts: sporadic truncated trace Metzger, Markus T
2018-07-12 14:01 ` Andi Kleen [this message]
2018-07-13  1:22 ` Hugh Dickins
2018-07-13 10:06   ` Metzger, Markus T
2018-07-13 18:31     ` Hugh Dickins
2018-07-13 10:53   ` Peter Zijlstra
