LKML Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 0/6] perf: Reduce stack usage (and misc bits)
@ 2020-10-30 15:13 Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 1/6] perf: Reduce stack usage of perf_output_begin() Peter Zijlstra
                   ` (5 more replies)
  0 siblings, 6 replies; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

Hi,

Steve reported an NMI stack overflow when running perf and function tracing
together. Thomas found that we had 4 copies of struct perf_sample_data
on-stack.

These here patches reduce that to 2 copies and half the size of it.

So just for perf_sample_data we go from 4*384=1536 to 2*192=384 bytes of stack.
Also remove one struct pt_regs instance from __intel_pmu_pebs_event(); it has
another instance in struct x86_perf_regs which I haven't yet managed to
offload.

Perf seems to still work... :-)


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/6] perf: Reduce stack usage of perf_output_begin()
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 2/6] perf/x86: Reduce stack usage for x86_pmu::drain_pebs() Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

__perf_output_begin() has an on-stack struct perf_sample_data in the
unlikely case it needs to generate a LOST record. However, every call
to perf_output_begin() must already have a perf_sample_data on-stack.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/powerpc/perf/imc-pmu.c     |    2 +-
 arch/s390/kernel/perf_cpum_sf.c |    2 +-
 arch/x86/events/intel/ds.c      |    4 ++--
 include/linux/perf_event.h      |    7 +++++--
 kernel/events/core.c            |   32 +++++++++++++++++---------------
 kernel/events/ring_buffer.c     |   20 +++++++++++---------
 6 files changed, 37 insertions(+), 30 deletions(-)

--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1336,7 +1336,7 @@ static void dump_trace_imc_data(struct p
 			/* If this is a valid record, create the sample */
 			struct perf_output_handle handle;
 
-			if (perf_output_begin(&handle, event, header.size))
+			if (perf_output_begin(&handle, &data, event, header.size))
 				return;
 
 			perf_output_sample(&handle, &header, &data, event);
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -672,7 +672,7 @@ static void cpumsf_output_event_pid(stru
 	rcu_read_lock();
 
 	perf_prepare_sample(&header, data, event, regs);
-	if (perf_output_begin(&handle, event, header.size))
+	if (perf_output_begin(&handle, data, event, header.size))
 		goto out;
 
 	/* Update the process ID (see also kernel/events/core.c) */
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void)
 	rcu_read_lock();
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size *
-			      (top - base - skip)))
+	if (perf_output_begin(&handle, &data, event,
+			      header.size * (top - base - skip)))
 		goto unlock;
 
 	for (at = base; at < top; at++) {
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_even
 extern void perf_event_addr_filters_sync(struct perf_event *event);
 
 extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_sample_data *data,
 			     struct perf_event *event, unsigned int size);
 extern int perf_output_begin_forward(struct perf_output_handle *handle,
-				    struct perf_event *event,
-				    unsigned int size);
+				     struct perf_sample_data *data,
+				     struct perf_event *event,
+				     unsigned int size);
 extern int perf_output_begin_backward(struct perf_output_handle *handle,
+				      struct perf_sample_data *data,
 				      struct perf_event *event,
 				      unsigned int size);
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7186,6 +7186,7 @@ __perf_event_output(struct perf_event *e
 		    struct perf_sample_data *data,
 		    struct pt_regs *regs,
 		    int (*output_begin)(struct perf_output_handle *,
+					struct perf_sample_data *,
 					struct perf_event *,
 					unsigned int))
 {
@@ -7198,7 +7199,7 @@ __perf_event_output(struct perf_event *e
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	err = output_begin(&handle, event, header.size);
+	err = output_begin(&handle, data, event, header.size);
 	if (err)
 		goto exit;
 
@@ -7264,7 +7265,7 @@ perf_event_read_event(struct perf_event
 	int ret;
 
 	perf_event_header__init_id(&read_event.header, &sample, event);
-	ret = perf_output_begin(&handle, event, read_event.header.size);
+	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
 	if (ret)
 		return;
 
@@ -7533,7 +7534,7 @@ static void perf_event_task_output(struc
 
 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				task_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7636,7 +7637,7 @@ static void perf_event_comm_output(struc
 		return;
 
 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				comm_event->event_id.header.size);
 
 	if (ret)
@@ -7736,7 +7737,7 @@ static void perf_event_namespaces_output
 
 	perf_event_header__init_id(&namespaces_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				namespaces_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7863,7 +7864,7 @@ static void perf_event_cgroup_output(str
 
 	perf_event_header__init_id(&cgroup_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				cgroup_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7989,7 +7990,7 @@ static void perf_event_mmap_output(struc
 	}
 
 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				mmap_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -8299,7 +8300,7 @@ void perf_event_aux_event(struct perf_ev
 	int ret;
 
 	perf_event_header__init_id(&rec.header, &sample, event);
-	ret = perf_output_begin(&handle, event, rec.header.size);
+	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
 
 	if (ret)
 		return;
@@ -8333,7 +8334,7 @@ void perf_log_lost_samples(struct perf_e
 
 	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				lost_samples_event.header.size);
 	if (ret)
 		return;
@@ -8388,7 +8389,7 @@ static void perf_event_switch_output(str
 
 	perf_event_header__init_id(&se->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event, se->event_id.header.size);
+	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
 	if (ret)
 		return;
 
@@ -8463,7 +8464,7 @@ static void perf_log_throttle(struct per
 
 	perf_event_header__init_id(&throttle_event.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				throttle_event.header.size);
 	if (ret)
 		return;
@@ -8506,7 +8507,7 @@ static void perf_event_ksymbol_output(st
 
 	perf_event_header__init_id(&ksymbol_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				ksymbol_event->event_id.header.size);
 	if (ret)
 		return;
@@ -8596,7 +8597,7 @@ static void perf_event_bpf_output(struct
 
 	perf_event_header__init_id(&bpf_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, data, event,
 				bpf_event->event_id.header.size);
 	if (ret)
 		return;
@@ -8705,7 +8706,8 @@ static void perf_event_text_poke_output(
 
 	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+	ret = perf_output_begin(&handle, &sample, event,
+				text_poke_event->event_id.header.size);
 	if (ret)
 		return;
 
@@ -8786,7 +8788,7 @@ static void perf_log_itrace_start(struct
 	rec.tid	= perf_event_tid(event, current);
 
 	perf_event_header__init_id(&rec.header, &sample, event);
-	ret = perf_output_begin(&handle, event, rec.header.size);
+	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
 
 	if (ret)
 		return;
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head
 
 static __always_inline int
 __perf_output_begin(struct perf_output_handle *handle,
+		    struct perf_sample_data *data,
 		    struct perf_event *event, unsigned int size,
 		    bool backward)
 {
@@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_h
 	handle->size = (1UL << page_shift) - offset;
 
 	if (unlikely(have_lost)) {
-		struct perf_sample_data sample_data;
-
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.id          = event->id;
 		lost_event.lost        = local_xchg(&rb->lost, 0);
 
-		perf_event_header__init_id(&lost_event.header,
-					   &sample_data, event);
+		/* XXX mostly redundant; @data is already fully initializes */
+		perf_event_header__init_id(&lost_event.header, data, event);
 		perf_output_put(handle, lost_event);
-		perf_event__output_id_sample(event, handle, &sample_data);
+		perf_event__output_id_sample(event, handle, data);
 	}
 
 	return 0;
@@ -263,22 +262,25 @@ __perf_output_begin(struct perf_output_h
 }
 
 int perf_output_begin_forward(struct perf_output_handle *handle,
-			     struct perf_event *event, unsigned int size)
+			      struct perf_sample_data *data,
+			      struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, false);
+	return __perf_output_begin(handle, data, event, size, false);
 }
 
 int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_sample_data *data,
 			       struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, true);
+	return __perf_output_begin(handle, data, event, size, true);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_sample_data *data,
 		      struct perf_event *event, unsigned int size)
 {
 
-	return __perf_output_begin(handle, event, size,
+	return __perf_output_begin(handle, data, event, size,
 				   unlikely(is_write_backward(event)));
 }
 



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/6] perf/x86: Reduce stack usage for x86_pmu::drain_pebs()
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 1/6] perf: Reduce stack usage of perf_output_begin() Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 3/6] perf: Fix get_recursion_context() Peter Zijlstra
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

intel_pmu_drain_pebs_*() is typically called from handle_pmi_common(),
both have an on-stack struct perf_sample_data, which is *big*. Rewire
things so that drain_pebs() can use the one handle_pmi_common() has.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/events/intel/core.c |    2 -
 arch/x86/events/intel/ds.c   |   47 ++++++++++++++++++++++---------------------
 arch/x86/events/perf_event.h |    2 -
 3 files changed, 27 insertions(+), 24 deletions(-)

--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2630,7 +2630,7 @@ static int handle_pmi_common(struct pt_r
 		u64 pebs_enabled = cpuc->pebs_enabled;
 
 		handled++;
-		x86_pmu.drain_pebs(regs);
+		x86_pmu.drain_pebs(regs, &data);
 		status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
 
 		/*
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -670,7 +670,9 @@ int intel_pmu_drain_bts_buffer(void)
 
 static inline void intel_pmu_drain_pebs_buffer(void)
 {
-	x86_pmu.drain_pebs(NULL);
+	struct perf_sample_data data;
+
+	x86_pmu.drain_pebs(NULL, &data);
 }
 
 /*
@@ -1719,19 +1721,20 @@ intel_pmu_save_and_restart_reload(struct
 	return 0;
 }
 
-static void __intel_pmu_pebs_event(struct perf_event *event,
-				   struct pt_regs *iregs,
-				   void *base, void *top,
-				   int bit, int count,
-				   void (*setup_sample)(struct perf_event *,
-						struct pt_regs *,
-						void *,
-						struct perf_sample_data *,
-						struct pt_regs *))
+static __always_inline void
+__intel_pmu_pebs_event(struct perf_event *event,
+		       struct pt_regs *iregs,
+		       struct perf_sample_data *data,
+		       void *base, void *top,
+		       int bit, int count,
+		       void (*setup_sample)(struct perf_event *,
+					    struct pt_regs *,
+					    void *,
+					    struct perf_sample_data *,
+					    struct pt_regs *))
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_sample_data data;
 	struct x86_perf_regs perf_regs;
 	struct pt_regs *regs = &perf_regs.regs;
 	void *at = get_next_pebs_record_by_bit(base, top, bit);
@@ -1752,14 +1755,14 @@ static void __intel_pmu_pebs_event(struc
 		iregs = &dummy_iregs;
 
 	while (count > 1) {
-		setup_sample(event, iregs, at, &data, regs);
-		perf_event_output(event, &data, regs);
+		setup_sample(event, iregs, at, data, regs);
+		perf_event_output(event, data, regs);
 		at += cpuc->pebs_record_size;
 		at = get_next_pebs_record_by_bit(at, top, bit);
 		count--;
 	}
 
-	setup_sample(event, iregs, at, &data, regs);
+	setup_sample(event, iregs, at, data, regs);
 	if (iregs == &dummy_iregs) {
 		/*
 		 * The PEBS records may be drained in the non-overflow context,
@@ -1767,18 +1770,18 @@ static void __intel_pmu_pebs_event(struc
 		 * last record the same as other PEBS records, and doesn't
 		 * invoke the generic overflow handler.
 		 */
-		perf_event_output(event, &data, regs);
+		perf_event_output(event, data, regs);
 	} else {
 		/*
 		 * All but the last records are processed.
 		 * The last one is left to be able to call the overflow handler.
 		 */
-		if (perf_event_overflow(event, &data, regs))
+		if (perf_event_overflow(event, data, regs))
 			x86_pmu_stop(event, 0);
 	}
 }
 
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -1812,7 +1815,7 @@ static void intel_pmu_drain_pebs_core(st
 		return;
 	}
 
-	__intel_pmu_pebs_event(event, iregs, at, top, 0, n,
+	__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
 			       setup_pebs_fixed_sample_data);
 }
 
@@ -1835,7 +1838,7 @@ static void intel_pmu_pebs_event_update_
 	}
 }
 
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -1942,14 +1945,14 @@ static void intel_pmu_drain_pebs_nhm(str
 		}
 
 		if (counts[bit]) {
-			__intel_pmu_pebs_event(event, iregs, base,
+			__intel_pmu_pebs_event(event, iregs, data, base,
 					       top, bit, counts[bit],
 					       setup_pebs_fixed_sample_data);
 		}
 	}
 }
 
-static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1997,7 +2000,7 @@ static void intel_pmu_drain_pebs_icl(str
 		if (WARN_ON_ONCE(!event->attr.precise_ip))
 			continue;
 
-		__intel_pmu_pebs_event(event, iregs, base,
+		__intel_pmu_pebs_event(event, iregs, data, base,
 				       top, bit, counts[bit],
 				       setup_pebs_adaptive_sample_data);
 	}
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -727,7 +727,7 @@ struct x86_pmu {
 	int		pebs_record_size;
 	int		pebs_buffer_size;
 	int		max_pebs_events;
-	void		(*drain_pebs)(struct pt_regs *regs);
+	void		(*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
 	unsigned long	large_pebs_flags;



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 3/6] perf: Fix get_recursion_context()
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 1/6] perf: Reduce stack usage of perf_output_begin() Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 2/6] perf/x86: Reduce stack usage for x86_pmu::drain_pebs() Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 4/6] perf: Optimize get_recursion_context() Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

One should use in_server_softirq() to detector SoftIRQ context.

Fixes: 96f6d4444302 ("perf_counter: avoid recursion")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/internal.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -211,7 +211,7 @@ static inline int get_recursion_context(
 		rctx = 3;
 	else if (in_irq())
 		rctx = 2;
-	else if (in_softirq())
+	else if (in_serving_softirq())
 		rctx = 1;
 	else
 		rctx = 0;



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-10-30 15:13 ` [PATCH 3/6] perf: Fix get_recursion_context() Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-10-30 17:11   ` Jesper Dangaard Brouer
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 5/6] perf/arch: Remove perf_sample_data::regs_user_copy Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 6/6] perf/x86: Make dummy_iregs static Peter Zijlstra
  5 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz, Jesper Dangaard Brouer

  "Look ma, no branches!"

Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/internal.h |   17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -205,16 +205,15 @@ DEFINE_OUTPUT_COPY(__output_copy_user, a
 
 static inline int get_recursion_context(int *recursion)
 {
-	int rctx;
+	unsigned int pc = preempt_count();
+	unsigned int rctx = 0;
 
-	if (unlikely(in_nmi()))
-		rctx = 3;
-	else if (in_irq())
-		rctx = 2;
-	else if (in_serving_softirq())
-		rctx = 1;
-	else
-		rctx = 0;
+	if (pc & (NMI_MASK))
+		rctx++;
+	if (pc & (NMI_MASK | HARDIRQ_MASK))
+		rctx++;
+	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
+		rctx++;
 
 	if (recursion[rctx])
 		return -1;



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 5/6] perf/arch: Remove perf_sample_data::regs_user_copy
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-10-30 15:13 ` [PATCH 4/6] perf: Optimize get_recursion_context() Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  2020-10-30 15:13 ` [PATCH 6/6] perf/x86: Make dummy_iregs static Peter Zijlstra
  5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

struct perf_sample_data lives on-stack, we should be careful about it's
size. Furthermore, the pt_regs copy in there is only because x86_64 is a
trainwreck, solve it differently.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/arm/kernel/perf_regs.c   |    3 +--
 arch/arm64/kernel/perf_regs.c |    3 +--
 arch/csky/kernel/perf_regs.c  |    3 +--
 arch/powerpc/perf/perf_regs.c |    3 +--
 arch/riscv/kernel/perf_regs.c |    3 +--
 arch/s390/kernel/perf_regs.c  |    3 +--
 arch/x86/kernel/perf_regs.c   |   15 +++++++++++----
 include/linux/perf_event.h    |    6 ------
 include/linux/perf_regs.h     |    6 ++----
 kernel/events/core.c          |    8 +++-----
 10 files changed, 22 insertions(+), 31 deletions(-)

--- a/arch/arm/kernel/perf_regs.c
+++ b/arch/arm/kernel/perf_regs.c
@@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -73,8 +73,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
--- a/arch/csky/kernel/perf_regs.c
+++ b/arch/csky/kernel/perf_regs.c
@@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -144,8 +144,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) :
--- a/arch/riscv/kernel/perf_regs.c
+++ b/arch/riscv/kernel/perf_regs.c
@@ -36,8 +36,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	/*
 	 * Use the regs from the first interruption and let
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *tas
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
@@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *tas
 		return PERF_SAMPLE_REGS_ABI_64;
 }
 
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
+	struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
 	struct pt_regs *user_regs = task_pt_regs(current);
 
+	if (!in_nmi()) {
+		regs_user->regs = user_regs;
+		regs_user->abi = perf_reg_abi(current);
+		return;
+	}
+
 	/*
 	 * If we're in an NMI that interrupted task_pt_regs setup, then
 	 * we can't sample user regs at all.  This check isn't really
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1022,13 +1022,7 @@ struct perf_sample_data {
 	struct perf_callchain_entry	*callchain;
 	u64				aux_size;
 
-	/*
-	 * regs_user may point to task_pt_regs or to regs_user_copy, depending
-	 * on arch details.
-	 */
 	struct perf_regs		regs_user;
-	struct pt_regs			regs_user_copy;
-
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
 
--- a/include/linux/perf_regs.h
+++ b/include/linux/perf_regs.h
@@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs,
 int perf_reg_validate(u64 mask);
 u64 perf_reg_abi(struct task_struct *task);
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy);
+			struct pt_regs *regs);
 #else
 
 #define PERF_REG_EXTENDED_MASK	0
@@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct ta
 }
 
 static inline void perf_get_regs_user(struct perf_regs *regs_user,
-				      struct pt_regs *regs,
-				      struct pt_regs *regs_user_copy)
+				      struct pt_regs *regs);
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6374,14 +6374,13 @@ perf_output_sample_regs(struct perf_outp
 }
 
 static void perf_sample_regs_user(struct perf_regs *regs_user,
-				  struct pt_regs *regs,
-				  struct pt_regs *regs_user_copy)
+				  struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
 		regs_user->abi = perf_reg_abi(current);
 		regs_user->regs = regs;
 	} else if (!(current->flags & PF_KTHREAD)) {
-		perf_get_regs_user(regs_user, regs, regs_user_copy);
+		perf_get_regs_user(regs_user, regs);
 	} else {
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
 		regs_user->regs = NULL;
@@ -7083,8 +7082,7 @@ void perf_prepare_sample(struct perf_eve
 	}
 
 	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
-		perf_sample_regs_user(&data->regs_user, regs,
-				      &data->regs_user_copy);
+		perf_sample_regs_user(&data->regs_user, regs);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER) {
 		/* regs dump ABI info */



^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 6/6] perf/x86: Make dummy_iregs static
  2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-10-30 15:13 ` [PATCH 5/6] perf/arch: Remove perf_sample_data::regs_user_copy Peter Zijlstra
@ 2020-10-30 15:13 ` Peter Zijlstra
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  5 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 15:13 UTC (permalink / raw)
  To: mingo, tglx, rostedt
  Cc: linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian, peterz

Having pt_regs on-stack is unfortunate, it's 168 bytes. Since it isn't
actually used, make it a static variable. This both gets if off the
stack and ensures it gets 0 initialized, just in case someone does
look at it.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/events/intel/ds.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1735,7 +1735,7 @@ static void __intel_pmu_pebs_event(struc
 	struct x86_perf_regs perf_regs;
 	struct pt_regs *regs = &perf_regs.regs;
 	void *at = get_next_pebs_record_by_bit(base, top, bit);
-	struct pt_regs dummy_iregs;
+	static struct pt_regs dummy_iregs;
 
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		/*



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 15:13 ` [PATCH 4/6] perf: Optimize get_recursion_context() Peter Zijlstra
@ 2020-10-30 17:11   ` Jesper Dangaard Brouer
  2020-10-30 20:22     ` Steven Rostedt
  2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
  1 sibling, 1 reply; 23+ messages in thread
From: Jesper Dangaard Brouer @ 2020-10-30 17:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, tglx, rostedt, linux-kernel, kan.liang, acme,
	mark.rutland, alexander.shishkin, jolsa, namhyung, ak, eranian,
	brouer

On Fri, 30 Oct 2020 16:13:49 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

>   "Look ma, no branches!"
> 
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---

Cool trick! :-)

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>

>  kernel/events/internal.h |   17 ++++++++---------
>  1 file changed, 8 insertions(+), 9 deletions(-)
> 
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -205,16 +205,15 @@ DEFINE_OUTPUT_COPY(__output_copy_user, a
>  
>  static inline int get_recursion_context(int *recursion)
>  {
> -	int rctx;
> +	unsigned int pc = preempt_count();
> +	unsigned int rctx = 0;
>  
> -	if (unlikely(in_nmi()))
> -		rctx = 3;
> -	else if (in_irq())
> -		rctx = 2;
> -	else if (in_serving_softirq())
> -		rctx = 1;
> -	else
> -		rctx = 0;
> +	if (pc & (NMI_MASK))
> +		rctx++;
> +	if (pc & (NMI_MASK | HARDIRQ_MASK))
> +		rctx++;
> +	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
> +		rctx++;
>  
>  	if (recursion[rctx])
>  		return -1;
> 
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 17:11   ` Jesper Dangaard Brouer
@ 2020-10-30 20:22     ` Steven Rostedt
  2020-10-30 22:14       ` Thomas Gleixner
  2020-10-30 23:01       ` Peter Zijlstra
  0 siblings, 2 replies; 23+ messages in thread
From: Steven Rostedt @ 2020-10-30 20:22 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Peter Zijlstra, mingo, tglx, linux-kernel, kan.liang, acme,
	mark.rutland, alexander.shishkin, jolsa, namhyung, ak, eranian

On Fri, 30 Oct 2020 18:11:38 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Fri, 30 Oct 2020 16:13:49 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> >   "Look ma, no branches!"
> > 
> > Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---  
> 
> Cool trick! :-)
> 
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> >  kernel/events/internal.h |   17 ++++++++---------
> >  1 file changed, 8 insertions(+), 9 deletions(-)
> > 
> > --- a/kernel/events/internal.h
> > +++ b/kernel/events/internal.h
> > @@ -205,16 +205,15 @@ DEFINE_OUTPUT_COPY(__output_copy_user, a
> >  
> >  static inline int get_recursion_context(int *recursion)
> >  {
> > -	int rctx;
> > +	unsigned int pc = preempt_count();
> > +	unsigned int rctx = 0;
> >  
> > -	if (unlikely(in_nmi()))
> > -		rctx = 3;
> > -	else if (in_irq())
> > -		rctx = 2;
> > -	else if (in_serving_softirq())
> > -		rctx = 1;
> > -	else
> > -		rctx = 0;
> > +	if (pc & (NMI_MASK))
> > +		rctx++;
> > +	if (pc & (NMI_MASK | HARDIRQ_MASK))
> > +		rctx++;
> > +	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
> > +		rctx++;
> >  
> >  	if (recursion[rctx])
> >  		return -1;
> > 
> > 

As this is something that ftrace recursion also does, perhaps we should
move this into interrupt.h so that anyone that needs a counter can get
it quickly, and not keep re-implementing it.

/*
 * Quickly find what context you are in.
 * 0 - normal
 * 1 - softirq
 * 2 - hard interrupt
 * 3 - NMI
 */
static inline int irq_context()
{
	unsigned int pc = preempt_count();
	int rctx = 0;

	if (pc & (NMI_MASK))
		rctx++;
	if (pc & (NMI_MASK | HARDIRQ_MASK))
		rctx++;
	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
		rctx++;

	return rctx;
}

-- Steve

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 20:22     ` Steven Rostedt
@ 2020-10-30 22:14       ` Thomas Gleixner
  2020-10-30 23:31         ` Steven Rostedt
  2020-10-30 23:01       ` Peter Zijlstra
  1 sibling, 1 reply; 23+ messages in thread
From: Thomas Gleixner @ 2020-10-30 22:14 UTC (permalink / raw)
  To: Steven Rostedt, Jesper Dangaard Brouer
  Cc: Peter Zijlstra, mingo, linux-kernel, kan.liang, acme,
	mark.rutland, alexander.shishkin, jolsa, namhyung, ak, eranian

On Fri, Oct 30 2020 at 16:22, Steven Rostedt wrote:
> As this is something that ftrace recursion also does, perhaps we should
> move this into interrupt.h so that anyone that needs a counter can get

Not in interrupt.h please. We should create kernel/include/ for stuff
which really should only be available in the core kernel code.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 20:22     ` Steven Rostedt
  2020-10-30 22:14       ` Thomas Gleixner
@ 2020-10-30 23:01       ` Peter Zijlstra
  2020-10-31 12:11         ` David Laight
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-30 23:01 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Jesper Dangaard Brouer, mingo, tglx, linux-kernel, kan.liang,
	acme, mark.rutland, alexander.shishkin, jolsa, namhyung, ak,
	eranian

On Fri, Oct 30, 2020 at 04:22:48PM -0400, Steven Rostedt wrote:
> As this is something that ftrace recursion also does, perhaps we should
> move this into interrupt.h so that anyone that needs a counter can get
> it quickly, and not keep re-implementing it.

Works for me, however:

> /*
>  * Quickly find what context you are in.
>  * 0 - normal
>  * 1 - softirq
>  * 2 - hard interrupt
>  * 3 - NMI
>  */
> static inline int irq_context()
> {
> 	unsigned int pc = preempt_count();
> 	int rctx = 0;

unsigned

> 
> 	if (pc & (NMI_MASK))
> 		rctx++;
> 	if (pc & (NMI_MASK | HARDIRQ_MASK))
> 		rctx++;
> 	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
> 		rctx++;
> 
> 	return rctx;
> }

otherwise you'll get an extra instruction to sign extend it, which is
daft (yes, i've been staring at GCC output far too much).

Also, gcc-9 does worse (like 1 byte iirc) with:

	rctx += !!(pc & (NMI_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

but gcc-10 doesn't seem to care.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 22:14       ` Thomas Gleixner
@ 2020-10-30 23:31         ` Steven Rostedt
  2020-10-31 11:23           ` Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Steven Rostedt @ 2020-10-30 23:31 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jesper Dangaard Brouer, Peter Zijlstra, mingo, linux-kernel,
	kan.liang, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, ak, eranian

On Fri, 30 Oct 2020 23:14:18 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> On Fri, Oct 30 2020 at 16:22, Steven Rostedt wrote:
> > As this is something that ftrace recursion also does, perhaps we should
> > move this into interrupt.h so that anyone that needs a counter can get  
> 
> Not in interrupt.h please. We should create kernel/include/ for stuff
> which really should only be available in the core kernel code.
> 

The recursion protection is needed for anything that registers a
callback to ftrace. I have patches that already basically do the same
thing (although, with branches) that I'm going to place in
include/linux/trace_recursion.h, so that there are helper functions
that ftrace callbacks can use, instead of having to implement their own
recursion protection. After fixing two bugs in the recursion protection
code, it's something that should be reused, instead of everyone making
similar mistakes.

Note, I thought that in_nmi() and friends was in interrupt.h, but is
really in preempt.h. All the values used in Peter's code is also
defined in preempt.h, so why not have something like that there?

I take back adding it to interrupt.h but have it in preempt.h, as it's
not defining anything new there.

-- Steve

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 23:31         ` Steven Rostedt
@ 2020-10-31 11:23           ` Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2020-10-31 11:23 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Thomas Gleixner, Jesper Dangaard Brouer, mingo, linux-kernel,
	kan.liang, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, ak, eranian

On Fri, Oct 30, 2020 at 07:31:24PM -0400, Steven Rostedt wrote:

> Note, I thought that in_nmi() and friends was in interrupt.h, but is
> really in preempt.h. All the values used in Peter's code is also
> defined in preempt.h, so why not have something like that there?
> 
> I take back adding it to interrupt.h but have it in preempt.h, as it's
> not defining anything new there.

Yeah, preempt.h is the right place for it.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-30 23:01       ` Peter Zijlstra
@ 2020-10-31 12:11         ` David Laight
  2020-10-31 13:18           ` David Laight
  2020-11-09 12:12           ` Peter Zijlstra
  0 siblings, 2 replies; 23+ messages in thread
From: David Laight @ 2020-10-31 12:11 UTC (permalink / raw)
  To: 'Peter Zijlstra', Steven Rostedt
  Cc: Jesper Dangaard Brouer, mingo, tglx, linux-kernel, kan.liang,
	acme, mark.rutland, alexander.shishkin, jolsa, namhyung, ak,
	eranian

From: Peter Zijlstra
> Sent: 30 October 2020 23:02
> 
> On Fri, Oct 30, 2020 at 04:22:48PM -0400, Steven Rostedt wrote:
> > As this is something that ftrace recursion also does, perhaps we should
> > move this into interrupt.h so that anyone that needs a counter can get
> > it quickly, and not keep re-implementing it.
> 
> Works for me, however:
> 
> > /*
> >  * Quickly find what context you are in.
> >  * 0 - normal
> >  * 1 - softirq
> >  * 2 - hard interrupt
> >  * 3 - NMI
> >  */
> > static inline int irq_context()
> > {
> > 	unsigned int pc = preempt_count();
> > 	int rctx = 0;
> 
> unsigned
> 
> >
> > 	if (pc & (NMI_MASK))
> > 		rctx++;
> > 	if (pc & (NMI_MASK | HARDIRQ_MASK))
> > 		rctx++;
> > 	if (pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
> > 		rctx++;
> >
> > 	return rctx;
> > }
> 
> otherwise you'll get an extra instruction to sign extend it, which is
> daft (yes, i've been staring at GCC output far too much).
> 
> Also, gcc-9 does worse (like 1 byte iirc) with:
> 
> 	rctx += !!(pc & (NMI_MASK));
> 	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
> 	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> 
> but gcc-10 doesn't seem to care.

You've made be look at some gcc output (it's raining).

The gcc 7.5.0 I have handy probably generates the best code for:

unsigned char q_2(unsigned int pc)
{
        unsigned char rctx = 0;

        rctx += !!(pc & (NMI_MASK));
        rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
        rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

        return rctx;
}

0000000000000000 <q_2>:
   0:   f7 c7 00 00 f0 00       test   $0xf00000,%edi     # clock 0
   6:   0f 95 c0                setne  %al                # clock 1
   9:   f7 c7 00 00 ff 00       test   $0xff0000,%edi     # clock 0
   f:   0f 95 c2                setne  %dl                # clock 1
  12:   01 c2                   add    %eax,%edx          # clock 2
  14:   81 e7 00 01 ff 00       and    $0xff0100,%edi
  1a:   0f 95 c0                setne  %al
  1d:   01 d0                   add    %edx,%eax          # clock 3
  1f:   c3                      retq

I doubt that is beatable.

I've annotated the register dependency chain.
Likely to be 3 (or maybe 4) clocks.
The other versions are a lot worse (7 or 8) without allowing
for 'sbb' taking 2 clocks on a lot of Intel cpus.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-31 12:11         ` David Laight
@ 2020-10-31 13:18           ` David Laight
  2020-11-09 12:12           ` Peter Zijlstra
  1 sibling, 0 replies; 23+ messages in thread
From: David Laight @ 2020-10-31 13:18 UTC (permalink / raw)
  To: David Laight, 'Peter Zijlstra', Steven Rostedt
  Cc: Jesper Dangaard Brouer, mingo, tglx, linux-kernel, kan.liang,
	acme, mark.rutland, alexander.shishkin, jolsa, namhyung, ak,
	eranian

From: David Laight
> Sent: 31 October 2020 12:12
> 
...
> The gcc 7.5.0 I have handy probably generates the best code for:
> 
> unsigned char q_2(unsigned int pc)
> {
>         unsigned char rctx = 0;
> 
>         rctx += !!(pc & (NMI_MASK));
>         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
>         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> 
>         return rctx;
> }
> 
> 0000000000000000 <q_2>:
>    0:   f7 c7 00 00 f0 00       test   $0xf00000,%edi     # clock 0
>    6:   0f 95 c0                setne  %al                # clock 1
>    9:   f7 c7 00 00 ff 00       test   $0xff0000,%edi     # clock 0
>    f:   0f 95 c2                setne  %dl                # clock 1
>   12:   01 c2                   add    %eax,%edx          # clock 2
>   14:   81 e7 00 01 ff 00       and    $0xff0100,%edi
>   1a:   0f 95 c0                setne  %al
>   1d:   01 d0                   add    %edx,%eax          # clock 3
>   1f:   c3                      retq
> 
> I doubt that is beatable.

I lied, you should be able to get:
	test   $0xff0000,%edi     # clock 0
	setne  %al                # clock 1
	test   $0xff0100,%edi     # clock 0
	setne  %dl                # clock 1
	add    $fffff000,%edi
	adc    %dl, %al           # clock 2

But I suspect getting it from the compiler might be hard!

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-10-31 12:11         ` David Laight
  2020-10-31 13:18           ` David Laight
@ 2020-11-09 12:12           ` Peter Zijlstra
  2020-11-09 14:14             ` David Laight
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2020-11-09 12:12 UTC (permalink / raw)
  To: David Laight
  Cc: Steven Rostedt, Jesper Dangaard Brouer, mingo, tglx,
	linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian

On Sat, Oct 31, 2020 at 12:11:42PM +0000, David Laight wrote:
> The gcc 7.5.0 I have handy probably generates the best code for:
> 
> unsigned char q_2(unsigned int pc)
> {
>         unsigned char rctx = 0;
> 
>         rctx += !!(pc & (NMI_MASK));
>         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
>         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> 
>         return rctx;
> }
> 
> 0000000000000000 <q_2>:
>    0:   f7 c7 00 00 f0 00       test   $0xf00000,%edi     # clock 0
>    6:   0f 95 c0                setne  %al                # clock 1
>    9:   f7 c7 00 00 ff 00       test   $0xff0000,%edi     # clock 0
>    f:   0f 95 c2                setne  %dl                # clock 1
>   12:   01 c2                   add    %eax,%edx          # clock 2
>   14:   81 e7 00 01 ff 00       and    $0xff0100,%edi
>   1a:   0f 95 c0                setne  %al
>   1d:   01 d0                   add    %edx,%eax          # clock 3
>   1f:   c3                      retq
> 
> I doubt that is beatable.
> 
> I've annotated the register dependency chain.
> Likely to be 3 (or maybe 4) clocks.
> The other versions are a lot worse (7 or 8) without allowing
> for 'sbb' taking 2 clocks on a lot of Intel cpus.

https://godbolt.org/z/EfnG8E

Recent GCC just doesn't want to do that. Still, using u8 makes sense, so
I've kept that.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH 4/6] perf: Optimize get_recursion_context()
  2020-11-09 12:12           ` Peter Zijlstra
@ 2020-11-09 14:14             ` David Laight
  0 siblings, 0 replies; 23+ messages in thread
From: David Laight @ 2020-11-09 14:14 UTC (permalink / raw)
  To: 'Peter Zijlstra'
  Cc: Steven Rostedt, Jesper Dangaard Brouer, mingo, tglx,
	linux-kernel, kan.liang, acme, mark.rutland, alexander.shishkin,
	jolsa, namhyung, ak, eranian



> -----Original Message-----
> From: Peter Zijlstra <peterz@infradead.org>
> Sent: 09 November 2020 12:13
> To: David Laight <David.Laight@ACULAB.COM>
> Cc: Steven Rostedt <rostedt@goodmis.org>; Jesper Dangaard Brouer <brouer@redhat.com>;
> mingo@kernel.org; tglx@linutronix.de; linux-kernel@vger.kernel.org; kan.liang@linux.intel.com;
> acme@kernel.org; mark.rutland@arm.com; alexander.shishkin@linux.intel.com; jolsa@redhat.com;
> namhyung@kernel.org; ak@linux.intel.com; eranian@google.com
> Subject: Re: [PATCH 4/6] perf: Optimize get_recursion_context()
> 
> On Sat, Oct 31, 2020 at 12:11:42PM +0000, David Laight wrote:
> > The gcc 7.5.0 I have handy probably generates the best code for:
> >
> > unsigned char q_2(unsigned int pc)
> > {
> >         unsigned char rctx = 0;
> >
> >         rctx += !!(pc & (NMI_MASK));
> >         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
> >         rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> >
> >         return rctx;
> > }
> >
> > 0000000000000000 <q_2>:
> >    0:   f7 c7 00 00 f0 00       test   $0xf00000,%edi     # clock 0
> >    6:   0f 95 c0                setne  %al                # clock 1
> >    9:   f7 c7 00 00 ff 00       test   $0xff0000,%edi     # clock 0
> >    f:   0f 95 c2                setne  %dl                # clock 1
> >   12:   01 c2                   add    %eax,%edx          # clock 2
> >   14:   81 e7 00 01 ff 00       and    $0xff0100,%edi
> >   1a:   0f 95 c0                setne  %al
> >   1d:   01 d0                   add    %edx,%eax          # clock 3
> >   1f:   c3                      retq
> >
> > I doubt that is beatable.
> >
> > I've annotated the register dependency chain.
> > Likely to be 3 (or maybe 4) clocks.
> > The other versions are a lot worse (7 or 8) without allowing
> > for 'sbb' taking 2 clocks on a lot of Intel cpus.
> 
> https://godbolt.org/z/EfnG8E
> 
> Recent GCC just doesn't want to do that. Still, using u8 makes sense, so
> I've kept that.

u8 helps x86 because its 'setne' only affects the low 8 bits.
I guess that seemed a good idea when it was added (386).
It doesn't seem to make the other architectures much worse.

gcc 10.x can be persuaded to generate the above code.

https://godbolt.org/z/6GoT94

It sometimes seems to me that every new version of gcc is
larger, slower and generates worse code than the previous one.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf/x86: Make dummy_iregs static
  2020-10-30 15:13 ` [PATCH 6/6] perf/x86: Make dummy_iregs static Peter Zijlstra
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Steven Rostedt, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     e506d1dac0edb2df82f2aa0582e814f9cd9aa07d
Gitweb:        https://git.kernel.org/tip/e506d1dac0edb2df82f2aa0582e814f9cd9aa07d
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 12:15:06 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:35 +01:00

perf/x86: Make dummy_iregs static

Having pt_regs on-stack is unfortunate, it's 168 bytes. Since it isn't
actually used, make it a static variable. This both gets if off the
stack and ensures it gets 0 initialized, just in case someone does
look at it.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201030151955.324273677@infradead.org
---
 arch/x86/events/intel/ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 276b29d..b47cc42 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1738,7 +1738,7 @@ __intel_pmu_pebs_event(struct perf_event *event,
 	struct x86_perf_regs perf_regs;
 	struct pt_regs *regs = &perf_regs.regs;
 	void *at = get_next_pebs_record_by_bit(base, top, bit);
-	struct pt_regs dummy_iregs;
+	static struct pt_regs dummy_iregs;
 
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		/*

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf/arch: Remove perf_sample_data::regs_user_copy
  2020-10-30 15:13 ` [PATCH 5/6] perf/arch: Remove perf_sample_data::regs_user_copy Peter Zijlstra
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel),
	Steven Rostedt, x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     76a4efa80900fc40e0fdf243b42aec9fb8c35d24
Gitweb:        https://git.kernel.org/tip/76a4efa80900fc40e0fdf243b42aec9fb8c35d24
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 12:14:21 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:34 +01:00

perf/arch: Remove perf_sample_data::regs_user_copy

struct perf_sample_data lives on-stack, we should be careful about it's
size. Furthermore, the pt_regs copy in there is only because x86_64 is a
trainwreck, solve it differently.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org
---
 arch/arm/kernel/perf_regs.c   |  3 +--
 arch/arm64/kernel/perf_regs.c |  3 +--
 arch/csky/kernel/perf_regs.c  |  3 +--
 arch/powerpc/perf/perf_regs.c |  3 +--
 arch/riscv/kernel/perf_regs.c |  3 +--
 arch/s390/kernel/perf_regs.c  |  3 +--
 arch/x86/kernel/perf_regs.c   | 15 +++++++++++----
 include/linux/perf_event.h    |  6 ------
 include/linux/perf_regs.h     |  6 ++----
 kernel/events/core.c          |  8 +++-----
 10 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
index 05fe92a..0529f90 100644
--- a/arch/arm/kernel/perf_regs.c
+++ b/arch/arm/kernel/perf_regs.c
@@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 94e8718..f6f58e6 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -73,8 +73,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c
index eb32838..09b7f88 100644
--- a/arch/csky/kernel/perf_regs.c
+++ b/arch/csky/kernel/perf_regs.c
@@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index 8e53f2f..6f681b1 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -144,8 +144,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) :
diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c
index 04a38fb..fd304a2 100644
--- a/arch/riscv/kernel/perf_regs.c
+++ b/arch/riscv/kernel/perf_regs.c
@@ -36,8 +36,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index 4352a50..6e9e5d5 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	/*
 	 * Use the regs from the first interruption and let
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index bb7e113..f9e5352 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task)
 }
 
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
@@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task)
 		return PERF_SAMPLE_REGS_ABI_64;
 }
 
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy)
+			struct pt_regs *regs)
 {
+	struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
 	struct pt_regs *user_regs = task_pt_regs(current);
 
+	if (!in_nmi()) {
+		regs_user->regs = user_regs;
+		regs_user->abi = perf_reg_abi(current);
+		return;
+	}
+
 	/*
 	 * If we're in an NMI that interrupted task_pt_regs setup, then
 	 * we can't sample user regs at all.  This check isn't really
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b775ae0..96450f6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1022,13 +1022,7 @@ struct perf_sample_data {
 	struct perf_callchain_entry	*callchain;
 	u64				aux_size;
 
-	/*
-	 * regs_user may point to task_pt_regs or to regs_user_copy, depending
-	 * on arch details.
-	 */
 	struct perf_regs		regs_user;
-	struct pt_regs			regs_user_copy;
-
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
 
diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
index 2d12e97..f632c57 100644
--- a/include/linux/perf_regs.h
+++ b/include/linux/perf_regs.h
@@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx);
 int perf_reg_validate(u64 mask);
 u64 perf_reg_abi(struct task_struct *task);
 void perf_get_regs_user(struct perf_regs *regs_user,
-			struct pt_regs *regs,
-			struct pt_regs *regs_user_copy);
+			struct pt_regs *regs);
 #else
 
 #define PERF_REG_EXTENDED_MASK	0
@@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task)
 }
 
 static inline void perf_get_regs_user(struct perf_regs *regs_user,
-				      struct pt_regs *regs,
-				      struct pt_regs *regs_user_copy)
+				      struct pt_regs *regs)
 {
 	regs_user->regs = task_pt_regs(current);
 	regs_user->abi = perf_reg_abi(current);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc681c7..d67c9cb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6374,14 +6374,13 @@ perf_output_sample_regs(struct perf_output_handle *handle,
 }
 
 static void perf_sample_regs_user(struct perf_regs *regs_user,
-				  struct pt_regs *regs,
-				  struct pt_regs *regs_user_copy)
+				  struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
 		regs_user->abi = perf_reg_abi(current);
 		regs_user->regs = regs;
 	} else if (!(current->flags & PF_KTHREAD)) {
-		perf_get_regs_user(regs_user, regs, regs_user_copy);
+		perf_get_regs_user(regs_user, regs);
 	} else {
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
 		regs_user->regs = NULL;
@@ -7083,8 +7082,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 
 	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
-		perf_sample_regs_user(&data->regs_user, regs,
-				      &data->regs_user_copy);
+		perf_sample_regs_user(&data->regs_user, regs);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER) {
 		/* regs dump ABI info */

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf: Fix get_recursion_context()
  2020-10-30 15:13 ` [PATCH 3/6] perf: Fix get_recursion_context() Peter Zijlstra
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     ce0f17fc93f63ee91428af10b7b2ddef38cd19e5
Gitweb:        https://git.kernel.org/tip/ce0f17fc93f63ee91428af10b7b2ddef38cd19e5
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 12:49:45 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:34 +01:00

perf: Fix get_recursion_context()

One should use in_serving_softirq() to detect SoftIRQ context.

Fixes: 96f6d4444302 ("perf_counter: avoid recursion")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201030151955.120572175@infradead.org
---
 kernel/events/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index fcbf561..402054e 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -211,7 +211,7 @@ static inline int get_recursion_context(int *recursion)
 		rctx = 3;
 	else if (in_irq())
 		rctx = 2;
-	else if (in_softirq())
+	else if (in_serving_softirq())
 		rctx = 1;
 	else
 		rctx = 0;

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf: Optimize get_recursion_context()
  2020-10-30 15:13 ` [PATCH 4/6] perf: Optimize get_recursion_context() Peter Zijlstra
  2020-10-30 17:11   ` Jesper Dangaard Brouer
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  1 sibling, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Peter Zijlstra (Intel), Jesper Dangaard Brouer, x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     09da9c81253dd8e43e0d2d7cea02de6f9f19499d
Gitweb:        https://git.kernel.org/tip/09da9c81253dd8e43e0d2d7cea02de6f9f19499d
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 13:43:16 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:34 +01:00

perf: Optimize get_recursion_context()

  "Look ma, no branches!"

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lkml.kernel.org/r/20201030151955.187580298@infradead.org
---
 kernel/events/internal.h | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 402054e..228801e 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 
 static inline int get_recursion_context(int *recursion)
 {
-	int rctx;
-
-	if (unlikely(in_nmi()))
-		rctx = 3;
-	else if (in_irq())
-		rctx = 2;
-	else if (in_serving_softirq())
-		rctx = 1;
-	else
-		rctx = 0;
+	unsigned int pc = preempt_count();
+	unsigned char rctx = 0;
+
+	rctx += !!(pc & (NMI_MASK));
+	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
 
 	if (recursion[rctx])
 		return -1;

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf/x86: Reduce stack usage for x86_pmu::drain_pebs()
  2020-10-30 15:13 ` [PATCH 2/6] perf/x86: Reduce stack usage for x86_pmu::drain_pebs() Peter Zijlstra
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     9dfa9a5c9bae3417b87824e7ac73b00c10b6a874
Gitweb:        https://git.kernel.org/tip/9dfa9a5c9bae3417b87824e7ac73b00c10b6a874
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 14:58:48 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:33 +01:00

perf/x86: Reduce stack usage for x86_pmu::drain_pebs()

intel_pmu_drain_pebs_*() is typically called from handle_pmi_common(),
both have an on-stack struct perf_sample_data, which is *big*. Rewire
things so that drain_pebs() can use the one handle_pmi_common() has.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201030151955.054099690@infradead.org
---
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/intel/ds.c   | 47 ++++++++++++++++++-----------------
 arch/x86/events/perf_event.h |  2 +-
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f1926e9..c37387c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2630,7 +2630,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		u64 pebs_enabled = cpuc->pebs_enabled;
 
 		handled++;
-		x86_pmu.drain_pebs(regs);
+		x86_pmu.drain_pebs(regs, &data);
 		status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
 
 		/*
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index cd2ae14..276b29d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -670,7 +670,9 @@ unlock:
 
 static inline void intel_pmu_drain_pebs_buffer(void)
 {
-	x86_pmu.drain_pebs(NULL);
+	struct perf_sample_data data;
+
+	x86_pmu.drain_pebs(NULL, &data);
 }
 
 /*
@@ -1719,19 +1721,20 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
 	return 0;
 }
 
-static void __intel_pmu_pebs_event(struct perf_event *event,
-				   struct pt_regs *iregs,
-				   void *base, void *top,
-				   int bit, int count,
-				   void (*setup_sample)(struct perf_event *,
-						struct pt_regs *,
-						void *,
-						struct perf_sample_data *,
-						struct pt_regs *))
+static __always_inline void
+__intel_pmu_pebs_event(struct perf_event *event,
+		       struct pt_regs *iregs,
+		       struct perf_sample_data *data,
+		       void *base, void *top,
+		       int bit, int count,
+		       void (*setup_sample)(struct perf_event *,
+					    struct pt_regs *,
+					    void *,
+					    struct perf_sample_data *,
+					    struct pt_regs *))
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_sample_data data;
 	struct x86_perf_regs perf_regs;
 	struct pt_regs *regs = &perf_regs.regs;
 	void *at = get_next_pebs_record_by_bit(base, top, bit);
@@ -1752,14 +1755,14 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		iregs = &dummy_iregs;
 
 	while (count > 1) {
-		setup_sample(event, iregs, at, &data, regs);
-		perf_event_output(event, &data, regs);
+		setup_sample(event, iregs, at, data, regs);
+		perf_event_output(event, data, regs);
 		at += cpuc->pebs_record_size;
 		at = get_next_pebs_record_by_bit(at, top, bit);
 		count--;
 	}
 
-	setup_sample(event, iregs, at, &data, regs);
+	setup_sample(event, iregs, at, data, regs);
 	if (iregs == &dummy_iregs) {
 		/*
 		 * The PEBS records may be drained in the non-overflow context,
@@ -1767,18 +1770,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		 * last record the same as other PEBS records, and doesn't
 		 * invoke the generic overflow handler.
 		 */
-		perf_event_output(event, &data, regs);
+		perf_event_output(event, data, regs);
 	} else {
 		/*
 		 * All but the last records are processed.
 		 * The last one is left to be able to call the overflow handler.
 		 */
-		if (perf_event_overflow(event, &data, regs))
+		if (perf_event_overflow(event, data, regs))
 			x86_pmu_stop(event, 0);
 	}
 }
 
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -1812,7 +1815,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		return;
 	}
 
-	__intel_pmu_pebs_event(event, iregs, at, top, 0, n,
+	__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
 			       setup_pebs_fixed_sample_data);
 }
 
@@ -1835,7 +1838,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
 	}
 }
 
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -1942,14 +1945,14 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		}
 
 		if (counts[bit]) {
-			__intel_pmu_pebs_event(event, iregs, base,
+			__intel_pmu_pebs_event(event, iregs, data, base,
 					       top, bit, counts[bit],
 					       setup_pebs_fixed_sample_data);
 		}
 	}
 }
 
-static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1997,7 +2000,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
 		if (WARN_ON_ONCE(!event->attr.precise_ip))
 			continue;
 
-		__intel_pmu_pebs_event(event, iregs, base,
+		__intel_pmu_pebs_event(event, iregs, data, base,
 				       top, bit, counts[bit],
 				       setup_pebs_adaptive_sample_data);
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ee2b9b9..1d1fe46 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -727,7 +727,7 @@ struct x86_pmu {
 	int		pebs_record_size;
 	int		pebs_buffer_size;
 	int		max_pebs_events;
-	void		(*drain_pebs)(struct pt_regs *regs);
+	void		(*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
 	unsigned long	large_pebs_flags;

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [tip: perf/urgent] perf: Reduce stack usage of perf_output_begin()
  2020-10-30 15:13 ` [PATCH 1/6] perf: Reduce stack usage of perf_output_begin() Peter Zijlstra
@ 2020-11-10 12:45   ` tip-bot2 for Peter Zijlstra
  0 siblings, 0 replies; 23+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2020-11-10 12:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Thomas Gleixner, Peter Zijlstra (Intel), x86, linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     267fb27352b6fc9fdbad753127a239f75618ecbc
Gitweb:        https://git.kernel.org/tip/267fb27352b6fc9fdbad753127a239f75618ecbc
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Fri, 30 Oct 2020 15:50:32 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 09 Nov 2020 18:12:33 +01:00

perf: Reduce stack usage of perf_output_begin()

__perf_output_begin() has an on-stack struct perf_sample_data in the
unlikely case it needs to generate a LOST record. However, every call
to perf_output_begin() must already have a perf_sample_data on-stack.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org
---
 arch/powerpc/perf/imc-pmu.c     |  2 +-
 arch/s390/kernel/perf_cpum_sf.c |  2 +-
 arch/x86/events/intel/ds.c      |  4 ++--
 include/linux/perf_event.h      |  7 +++++--
 kernel/events/core.c            | 32 +++++++++++++++++---------------
 kernel/events/ring_buffer.c     | 20 +++++++++++---------
 6 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 9ed4fcc..7b25548 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1336,7 +1336,7 @@ static void dump_trace_imc_data(struct perf_event *event)
 			/* If this is a valid record, create the sample */
 			struct perf_output_handle handle;
 
-			if (perf_output_begin(&handle, event, header.size))
+			if (perf_output_begin(&handle, &data, event, header.size))
 				return;
 
 			perf_output_sample(&handle, &header, &data, event);
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 4f9e462..00255ae 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -672,7 +672,7 @@ static void cpumsf_output_event_pid(struct perf_event *event,
 	rcu_read_lock();
 
 	perf_prepare_sample(&header, data, event, regs);
-	if (perf_output_begin(&handle, event, header.size))
+	if (perf_output_begin(&handle, data, event, header.size))
 		goto out;
 
 	/* Update the process ID (see also kernel/events/core.c) */
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 404315d..cd2ae14 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void)
 	rcu_read_lock();
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size *
-			      (top - base - skip)))
+	if (perf_output_begin(&handle, &data, event,
+			      header.size * (top - base - skip)))
 		goto unlock;
 
 	for (at = base; at < top; at++) {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0c19d27..b775ae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event)
 extern void perf_event_addr_filters_sync(struct perf_event *event);
 
 extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_sample_data *data,
 			     struct perf_event *event, unsigned int size);
 extern int perf_output_begin_forward(struct perf_output_handle *handle,
-				    struct perf_event *event,
-				    unsigned int size);
+				     struct perf_sample_data *data,
+				     struct perf_event *event,
+				     unsigned int size);
 extern int perf_output_begin_backward(struct perf_output_handle *handle,
+				      struct perf_sample_data *data,
 				      struct perf_event *event,
 				      unsigned int size);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a29ab0..fc681c7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7186,6 +7186,7 @@ __perf_event_output(struct perf_event *event,
 		    struct perf_sample_data *data,
 		    struct pt_regs *regs,
 		    int (*output_begin)(struct perf_output_handle *,
+					struct perf_sample_data *,
 					struct perf_event *,
 					unsigned int))
 {
@@ -7198,7 +7199,7 @@ __perf_event_output(struct perf_event *event,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	err = output_begin(&handle, event, header.size);
+	err = output_begin(&handle, data, event, header.size);
 	if (err)
 		goto exit;
 
@@ -7264,7 +7265,7 @@ perf_event_read_event(struct perf_event *event,
 	int ret;
 
 	perf_event_header__init_id(&read_event.header, &sample, event);
-	ret = perf_output_begin(&handle, event, read_event.header.size);
+	ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
 	if (ret)
 		return;
 
@@ -7533,7 +7534,7 @@ static void perf_event_task_output(struct perf_event *event,
 
 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				task_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7636,7 +7637,7 @@ static void perf_event_comm_output(struct perf_event *event,
 		return;
 
 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				comm_event->event_id.header.size);
 
 	if (ret)
@@ -7736,7 +7737,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
 
 	perf_event_header__init_id(&namespaces_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				namespaces_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7863,7 +7864,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data)
 
 	perf_event_header__init_id(&cgroup_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				cgroup_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -7989,7 +7990,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 	}
 
 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				mmap_event->event_id.header.size);
 	if (ret)
 		goto out;
@@ -8299,7 +8300,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
 	int ret;
 
 	perf_event_header__init_id(&rec.header, &sample, event);
-	ret = perf_output_begin(&handle, event, rec.header.size);
+	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
 
 	if (ret)
 		return;
@@ -8333,7 +8334,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
 
 	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				lost_samples_event.header.size);
 	if (ret)
 		return;
@@ -8388,7 +8389,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data)
 
 	perf_event_header__init_id(&se->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event, se->event_id.header.size);
+	ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
 	if (ret)
 		return;
 
@@ -8463,7 +8464,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 
 	perf_event_header__init_id(&throttle_event.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				throttle_event.header.size);
 	if (ret)
 		return;
@@ -8506,7 +8507,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data)
 
 	perf_event_header__init_id(&ksymbol_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, &sample, event,
 				ksymbol_event->event_id.header.size);
 	if (ret)
 		return;
@@ -8596,7 +8597,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
 
 	perf_event_header__init_id(&bpf_event->event_id.header,
 				   &sample, event);
-	ret = perf_output_begin(&handle, event,
+	ret = perf_output_begin(&handle, data, event,
 				bpf_event->event_id.header.size);
 	if (ret)
 		return;
@@ -8705,7 +8706,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data)
 
 	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
 
-	ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+	ret = perf_output_begin(&handle, &sample, event,
+				text_poke_event->event_id.header.size);
 	if (ret)
 		return;
 
@@ -8786,7 +8788,7 @@ static void perf_log_itrace_start(struct perf_event *event)
 	rec.tid	= perf_event_tid(event, current);
 
 	perf_event_header__init_id(&rec.header, &sample, event);
-	ret = perf_output_begin(&handle, event, rec.header.size);
+	ret = perf_output_begin(&handle, &sample, event, rec.header.size);
 
 	if (ret)
 		return;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 192b8ab..ef91ae7 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,
 
 static __always_inline int
 __perf_output_begin(struct perf_output_handle *handle,
+		    struct perf_sample_data *data,
 		    struct perf_event *event, unsigned int size,
 		    bool backward)
 {
@@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle,
 	handle->size = (1UL << page_shift) - offset;
 
 	if (unlikely(have_lost)) {
-		struct perf_sample_data sample_data;
-
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.id          = event->id;
 		lost_event.lost        = local_xchg(&rb->lost, 0);
 
-		perf_event_header__init_id(&lost_event.header,
-					   &sample_data, event);
+		/* XXX mostly redundant; @data is already fully initializes */
+		perf_event_header__init_id(&lost_event.header, data, event);
 		perf_output_put(handle, lost_event);
-		perf_event__output_id_sample(event, handle, &sample_data);
+		perf_event__output_id_sample(event, handle, data);
 	}
 
 	return 0;
@@ -263,22 +262,25 @@ out:
 }
 
 int perf_output_begin_forward(struct perf_output_handle *handle,
-			     struct perf_event *event, unsigned int size)
+			      struct perf_sample_data *data,
+			      struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, false);
+	return __perf_output_begin(handle, data, event, size, false);
 }
 
 int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_sample_data *data,
 			       struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, true);
+	return __perf_output_begin(handle, data, event, size, true);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_sample_data *data,
 		      struct perf_event *event, unsigned int size)
 {
 
-	return __perf_output_begin(handle, event, size,
+	return __perf_output_begin(handle, data, event, size,
 				   unlikely(is_write_backward(event)));
 }
 

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, back to index

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-30 15:13 [PATCH 0/6] perf: Reduce stack usage (and misc bits) Peter Zijlstra
2020-10-30 15:13 ` [PATCH 1/6] perf: Reduce stack usage of perf_output_begin() Peter Zijlstra
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
2020-10-30 15:13 ` [PATCH 2/6] perf/x86: Reduce stack usage for x86_pmu::drain_pebs() Peter Zijlstra
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
2020-10-30 15:13 ` [PATCH 3/6] perf: Fix get_recursion_context() Peter Zijlstra
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
2020-10-30 15:13 ` [PATCH 4/6] perf: Optimize get_recursion_context() Peter Zijlstra
2020-10-30 17:11   ` Jesper Dangaard Brouer
2020-10-30 20:22     ` Steven Rostedt
2020-10-30 22:14       ` Thomas Gleixner
2020-10-30 23:31         ` Steven Rostedt
2020-10-31 11:23           ` Peter Zijlstra
2020-10-30 23:01       ` Peter Zijlstra
2020-10-31 12:11         ` David Laight
2020-10-31 13:18           ` David Laight
2020-11-09 12:12           ` Peter Zijlstra
2020-11-09 14:14             ` David Laight
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
2020-10-30 15:13 ` [PATCH 5/6] perf/arch: Remove perf_sample_data::regs_user_copy Peter Zijlstra
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra
2020-10-30 15:13 ` [PATCH 6/6] perf/x86: Make dummy_iregs static Peter Zijlstra
2020-11-10 12:45   ` [tip: perf/urgent] " tip-bot2 for Peter Zijlstra

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git