linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
@ 2016-04-20 22:47 Arnaldo Carvalho de Melo
  2016-04-20 23:04 ` Alexei Starovoitov
                   ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-20 22:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, David Ahern, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.
    
And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place tho.
    
The new file is:

  # cat /proc/sys/kernel/perf_event_max_stack
  127

Chaging it:

  # echo 256 > /proc/sys/kernel/perf_event_max_stack
  # cat /proc/sys/kernel/perf_event_max_stack
  256

But as soon as there is some event using callchains we get:

  # echo 512 > /proc/sys/kernel/perf_event_max_stack
  -bash: echo: write error: Device or resource busy
  #

Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, its just a matter
of having no callchain users at that point.

Reported-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8b195fbe787..d599ed32af92 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -983,9 +983,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1007,6 +1009,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..1390747b2195 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static size_t perf_callchain_entry__sizeof(void)
+{
+	return sizeof(struct perf_callchain_entry) +
+	       sizeof(__u64) * sysctl_perf_event_max_stack;
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -215,3 +223,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+	
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-20 22:47 [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl Arnaldo Carvalho de Melo
@ 2016-04-20 23:04 ` Alexei Starovoitov
  2016-04-22 20:52   ` [PATCH/RFC v2] " Arnaldo Carvalho de Melo
  2016-04-20 23:10 ` [PATCH/RFC] " David Ahern
  2016-04-21 10:48 ` Peter Zijlstra
  2 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-20 23:04 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Brendan Gregg, Alexander Shishkin, Alexei Starovoitov,
	David Ahern, He Kuang, Jiri Olsa, Masami Hiramatsu, Milian Wolff,
	Namhyung Kim, Stephane Eranian, Thomas Gleixner, Vince Weaver,
	Wang Nan, Zefan Li, Linux Kernel Mailing List

On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.
>     
> And in some workloads putting a _lower_ cap on this may make sense. One
> that is per event still needs to be put in place tho.
>     
> The new file is:
> 
>   # cat /proc/sys/kernel/perf_event_max_stack
>   127
> 
> Chaging it:
> 
>   # echo 256 > /proc/sys/kernel/perf_event_max_stack
>   # cat /proc/sys/kernel/perf_event_max_stack
>   256
> 
> But as soon as there is some event using callchains we get:
> 
>   # echo 512 > /proc/sys/kernel/perf_event_max_stack
>   -bash: echo: write error: Device or resource busy
>   #
> 
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily, its just a matter
> of having no callchain users at that point.
> 
> Reported-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Nice. I like it. That's a great approach to hard problem.
Java guys will be happy too.
Please also adjust two places in kernel/bpf/stackmap.c

> +	{
> +		.procname	= "perf_event_max_stack",
> +		.data		= NULL, /* filled in by handler */
> +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> +		.mode		= 0644,
> +		.proc_handler	= perf_event_max_stack_handler,
> +		.extra1		= &zero,

zero seems to be the wrong minimum. I think it should be at least 2 to
to fit user/kernel tags ?
Probably needs to define max as well.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-20 22:47 [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl Arnaldo Carvalho de Melo
  2016-04-20 23:04 ` Alexei Starovoitov
@ 2016-04-20 23:10 ` David Ahern
  2016-04-21  0:18   ` Arnaldo Carvalho de Melo
  2016-04-21 10:48 ` Peter Zijlstra
  2 siblings, 1 reply; 38+ messages in thread
From: David Ahern @ 2016-04-20 23:10 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Peter Zijlstra
  Cc: Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On 4/20/16 4:47 PM, Arnaldo Carvalho de Melo wrote:
> The new file is:
>
>    # cat /proc/sys/kernel/perf_event_max_stack
>    127
>
> Chaging it:
>
>    # echo 256 > /proc/sys/kernel/perf_event_max_stack
>    # cat /proc/sys/kernel/perf_event_max_stack
>    256
>
> But as soon as there is some event using callchains we get:
>
>    # echo 512 > /proc/sys/kernel/perf_event_max_stack
>    -bash: echo: write error: Device or resource busy
>    #
>
> Because we only allocate the callchain percpu data structures when there
> is a user, which allows for changing the max easily, its just a matter
> of having no callchain users at that point.
>
> Reported-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
>

I would love to see something like this go in. Right now I have to 
recompile the kernel because I want a lower max count.

In the past we talked about about making this part of the attribute with 
separate controls for both kernel stack and userspace stack. Have you 
given up on that option?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-20 23:10 ` [PATCH/RFC] " David Ahern
@ 2016-04-21  0:18   ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-21  0:18 UTC (permalink / raw)
  To: David Ahern
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Frederic Weisbecker,
	Ingo Molnar, Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

Em Wed, Apr 20, 2016 at 05:10:17PM -0600, David Ahern escreveu:
> On 4/20/16 4:47 PM, Arnaldo Carvalho de Melo wrote:
> >   # echo 256 > /proc/sys/kernel/perf_event_max_stack
> >   # cat /proc/sys/kernel/perf_event_max_stack
> >   256
 
> I would love to see something like this go in. Right now I have to recompile
> the kernel because I want a lower max count.
 
> In the past we talked about about making this part of the attribute with
> separate controls for both kernel stack and userspace stack. Have you given
> up on that option?

Nope, I'll try to work on that too, I even mentioned it above, the part
"one per event remains to be done".

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-20 22:47 [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl Arnaldo Carvalho de Melo
  2016-04-20 23:04 ` Alexei Starovoitov
  2016-04-20 23:10 ` [PATCH/RFC] " David Ahern
@ 2016-04-21 10:48 ` Peter Zijlstra
  2016-04-21 15:17   ` Arnaldo Carvalho de Melo
  2 siblings, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2016-04-21 10:48 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, David Ahern, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> The default remains 127, which is good for most cases, and not even hit
> most of the time, but then for some cases, as reported by Brendan, 1024+
> deep frames are appearing on the radar for things like groovy, ruby.

yea gawds ;-)



> index 343c22f5e867..1390747b2195 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -18,6 +18,14 @@ struct callchain_cpus_entries {
>  	struct perf_callchain_entry	*cpu_entries[0];
>  };
>  
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static size_t perf_callchain_entry__sizeof(void)
> +{
> +	return sizeof(struct perf_callchain_entry) +
> +	       sizeof(__u64) * sysctl_perf_event_max_stack;
> +}
> +
>  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
>  static atomic_t nr_callchain_events;
>  static DEFINE_MUTEX(callchain_mutex);
> @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
>  	if (!entries)
>  		return -ENOMEM;
>  
> -	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> +	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
>  
>  	for_each_possible_cpu(cpu) {
>  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,

And this alloc _will_ fail if you put in a decent sized value..

Should we put in a dmesg WARN if this alloc fails and
perf_event_max_stack is 'large' ?

> @@ -215,3 +223,25 @@ exit_put:
>  
>  	return entry;
>  }
> +
> +int perf_event_max_stack_handler(struct ctl_table *table, int write,
> +				 void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	int new_value = sysctl_perf_event_max_stack, ret;
> +	struct ctl_table new_table = *table;
> +
> +	new_table.data = &new_value;

cute :-)

> +	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
> +	if (ret || !write)
> +		return ret;
> +
> +	mutex_lock(&callchain_mutex);
> +	if (atomic_read(&nr_callchain_events))
> +		ret = -EBUSY;
> +	else
> +		sysctl_perf_event_max_stack = new_value;
> +	
> +	mutex_unlock(&callchain_mutex);
> +
> +	return ret;
> +}

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-21 10:48 ` Peter Zijlstra
@ 2016-04-21 15:17   ` Arnaldo Carvalho de Melo
  2016-04-21 15:42     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-21 15:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, David Ahern, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

Em Thu, Apr 21, 2016 at 12:48:58PM +0200, Peter Zijlstra escreveu:
> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> > The default remains 127, which is good for most cases, and not even hit
> > most of the time, but then for some cases, as reported by Brendan, 1024+
> > deep frames are appearing on the radar for things like groovy, ruby.
 
> yea gawds ;-)
 
> > +++ b/kernel/events/callchain.c

> > @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> >  	if (!entries)
> >  		return -ENOMEM;
> >  
> > -	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > +	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> >  
> >  	for_each_possible_cpu(cpu) {
> >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> 
> And this alloc _will_ fail if you put in a decent sized value..
> 
> Should we put in a dmesg WARN if this alloc fails and
> perf_event_max_stack is 'large' ?

Unsure, it already returns -ENOMEM, see, some lines above, i.e. it
better have error handling up this, ho-hum, call chain, I'm checking...
 
> > @@ -215,3 +223,25 @@ exit_put:
> >  
> >  	return entry;
> >  }
> > +
> > +int perf_event_max_stack_handler(struct ctl_table *table, int write,
> > +				 void __user *buffer, size_t *lenp, loff_t *ppos)
> > +{
> > +	int new_value = sysctl_perf_event_max_stack, ret;
> > +	struct ctl_table new_table = *table;
> > +
> > +	new_table.data = &new_value;
 
> cute :-)

Hey, I found it on sysctl_schedstats() and sysctl_numa_balancing(), as a
way to read that value but only make it take effect if some condition
was true (nr_callchain_events == 0 in this case), granted, could be
better, less clever, but I leave this for later ;-)
 
> > +	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
> > +	if (ret || !write)
> > +		return ret;
> > +
> > +	mutex_lock(&callchain_mutex);
> > +	if (atomic_read(&nr_callchain_events))
> > +		ret = -EBUSY;
> > +	else
> > +		sysctl_perf_event_max_stack = new_value;
> > +	
> > +	mutex_unlock(&callchain_mutex);
> > +
> > +	return ret;
> > +}

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-21 15:17   ` Arnaldo Carvalho de Melo
@ 2016-04-21 15:42     ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-21 15:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, David Ahern, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

Em Thu, Apr 21, 2016 at 12:17:07PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Thu, Apr 21, 2016 at 12:48:58PM +0200, Peter Zijlstra escreveu:
> > On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> > > +++ b/kernel/events/callchain.c
> 
> > > @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > >  	if (!entries)
> > >  		return -ENOMEM;

> > > -	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > > +	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;

> > >  	for_each_possible_cpu(cpu) {
> > >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,

> > And this alloc _will_ fail if you put in a decent sized value..

> > Should we put in a dmesg WARN if this alloc fails and
> > perf_event_max_stack is 'large' ?
> 
> Unsure, it already returns -ENOMEM, see, some lines above, i.e. it
> better have error handling up this, ho-hum, call chain, I'm checking...
  
So, it is, using the tools, since it fits this bill:

[root@jouet acme]# perf probe get_callchain_buffers
Added new event:
  probe:get_callchain_buffers (on get_callchain_buffers)

You can now use it in all perf tools, such as:

	perf record -e probe:get_callchain_buffers -aR sleep 1

[root@jouet acme]# perf trace -e perf_event_open --event probe:get_callchain_buffers/call-graph=fp/ -- perf record -g usleep 1
[ perf record: Woken up 1 times to write data ]
    26.740 ( 0.008 ms): perf/1264 perf_event_open(attr_uptr: 0x1e6b618, pid: 1265, group_fd: -1, flags: FD_CLOEXEC) ...
    26.740 (         ): probe:get_callchain_buffers:(ffffffff811a9480))
                                       get_callchain_buffers+0xfe200001 ([kernel.kallsyms])
                                       SYSC_perf_event_open+0xfe2003bc ([kernel.kallsyms])
                                       sys_perf_event_open+0xfe200009 ([kernel.kallsyms])
                                       do_syscall_64+0xfe200062 ([kernel.kallsyms])
                                       return_from_SYSCALL_64+0xfe200000 ([kernel.kallsyms])
                                       syscall+0xffff0179d3c22019 (/usr/lib64/libc-2.22.so)
                                       cmd_record+0xffffffffff8005fc (/home/acme/bin/perf)
                                       run_builtin+0xffffffffff800061 (/home/acme/bin/perf)
                                       main+0xffffffffff800672 (/home/acme/bin/perf)
                                       __libc_start_main+0xffff0179d3c220f0 (/usr/lib64/libc-2.22.so)
    26.749 ( 0.016 ms): perf/1264  ... [continued]: perf_event_open()) = 4
<SNIP the perf_event_open syscalls for the other 3 CPUs in this system)

So, if we put some too big value there and the allocation fails, people asking
for callchains will get that -ENOMEM back in their face.

I'll do this test, i.e. put some huge value there and ask for callchains...

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-20 23:04 ` Alexei Starovoitov
@ 2016-04-22 20:52   ` Arnaldo Carvalho de Melo
  2016-04-22 21:37     ` Alexei Starovoitov
  2016-04-22 22:05     ` David Ahern
  0 siblings, 2 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-22 20:52 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Brendan Gregg, Alexander Shishkin, Alexei Starovoitov,
	David Ahern, He Kuang, Jiri Olsa, Masami Hiramatsu, Milian Wolff,
	Namhyung Kim, Stephane Eranian, Thomas Gleixner, Vince Weaver,
	Wang Nan, Zefan Li, Linux Kernel Mailing List

Em Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov escreveu:
> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
 
> Nice. I like it. That's a great approach to hard problem.
> Java guys will be happy too.
> Please also adjust two places in kernel/bpf/stackmap.c
 
> > +	{
> > +		.procname	= "perf_event_max_stack",
> > +		.data		= NULL, /* filled in by handler */
> > +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> > +		.mode		= 0644,
> > +		.proc_handler	= perf_event_max_stack_handler,
> > +		.extra1		= &zero,
 
> zero seems to be the wrong minimum. I think it should be at least 2 to
> to fit user/kernel tags ?  Probably needs to define max as well.

So, if someone asks for zero, it will not copy anything, but then, this
would be what the user had asked for :-)

Ditto for the max, if someone asks for too big a callchain, then when
allocating it it will fail and no callchain will be produced, that or it
will be able to allocate but will take too long copying that many
addresses, and we would be prevented from doing so by some other
protection, iirc there is perf_cpu_time_max_percent, and then buffer
space will run out.

So I think that leaving it as is is enough, no?

Can I keep your Acked-by? David, can I keep yours?

I'll improve tools/perf to look at this file and then push this to Ingo,
if nobody hollers.

Peter, I think my response was enough about reporting allocation
failure, right?

Brendan tested it and it solved the problem he reported, the callchains
he got are really that big and now he has good looking flamegraphs, yay!

- Arnaldo

Here is V2, with stackmap.c adjusted, only build tested.

commit 7c24bb249e392e3dd7dd71a26277c64e313bab4e
Author: Arnaldo Carvalho de Melo <acme@redhat.com>
Date:   Thu Apr 21 12:28:50 2016 -0300

    perf core: Allow setting up max frame stack depth via sysctl
    
    The default remains 127, which is good for most cases, and not even hit
    most of the time, but then for some cases, as reported by Brendan, 1024+
    deep frames are appearing on the radar for things like groovy, ruby.
    
    And in some workloads putting a _lower_ cap on this may make sense. One
    that is per event still needs to be put in place tho.
    
    The new file is:
    
      # cat /proc/sys/kernel/perf_event_max_stack
      127
    
    Chaging it:
    
      # echo 256 > /proc/sys/kernel/perf_event_max_stack
      # cat /proc/sys/kernel/perf_event_max_stack
      256
    
    But as soon as there is some event using callchains we get:
    
      # echo 512 > /proc/sys/kernel/perf_event_max_stack
      -bash: echo: write error: Device or resource busy
      #
    
    Because we only allocate the callchain percpu data structures when there
    is a user, which allows for changing the max easily, its just a matter
    of having no callchain users at that point.
    
    Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
    Acked-by: Alexei Starovoitov <ast@kernel.org>
    Acked-by: David Ahern <dsahern@gmail.com>
    Cc: Adrian Hunter <adrian.hunter@intel.com>
    Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
    Cc: He Kuang <hekuang@huawei.com>
    Cc: Jiri Olsa <jolsa@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Masami Hiramatsu <mhiramat@kernel.org>
    Cc: Milian Wolff <milian.wolff@kdab.com>
    Cc: Namhyung Kim <namhyung@kernel.org>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Stephane Eranian <eranian@google.com>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Cc: Vince Weaver <vincent.weaver@maine.edu>
    Cc: Wang Nan <wangnan0@huawei.com>
    Cc: Zefan Li <lizefan@huawei.com>
    Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
    Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8b195fbe787..d599ed32af92 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -983,9 +983,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1007,6 +1009,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..6fe77349fa9d 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static size_t perf_callchain_entry__sizeof(void)
+{
+	return sizeof(struct perf_callchain_entry) +
+	       sizeof(__u64) * sysctl_perf_event_max_stack;
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -215,3 +223,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-22 20:52   ` [PATCH/RFC v2] " Arnaldo Carvalho de Melo
@ 2016-04-22 21:37     ` Alexei Starovoitov
  2016-04-22 22:05     ` David Ahern
  1 sibling, 0 replies; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-22 21:37 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Brendan Gregg, Alexander Shishkin, Alexei Starovoitov,
	David Ahern, He Kuang, Jiri Olsa, Masami Hiramatsu, Milian Wolff,
	Namhyung Kim, Stephane Eranian, Thomas Gleixner, Vince Weaver,
	Wang Nan, Zefan Li, Linux Kernel Mailing List

On Fri, Apr 22, 2016 at 05:52:32PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov escreveu:
> > On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
>  
> > Nice. I like it. That's a great approach to hard problem.
> > Java guys will be happy too.
> > Please also adjust two places in kernel/bpf/stackmap.c
>  
> > > +	{
> > > +		.procname	= "perf_event_max_stack",
> > > +		.data		= NULL, /* filled in by handler */
> > > +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> > > +		.mode		= 0644,
> > > +		.proc_handler	= perf_event_max_stack_handler,
> > > +		.extra1		= &zero,
>  
> > zero seems to be the wrong minimum. I think it should be at least 2 to
> > to fit user/kernel tags ?  Probably needs to define max as well.
> 
> So, if someone asks for zero, it will not copy anything, but then, this
> would be what the user had asked for :-)
> 
> Ditto for the max, if someone asks for too big a callchain, then when
> allocating it it will fail and no callchain will be produced, that or it
> will be able to allocate but will take too long copying that many
> addresses, and we would be prevented from doing so by some other
> protection, iirc there is perf_cpu_time_max_percent, and then buffer
> space will run out.
> 
> So I think that leaving it as is is enough, no?
> 
> Can I keep your Acked-by? David, can I keep yours?

I'm still a bit concerned with perf_event_max_stack==0, since it
means that alloc_callchain_buffers() will be failing,
so perf_event_open() will also be failing with ENOMEM which
will be hard to understand for any user and not clear whether
perf user space can print any hints, since such errno is ambiguous.
Also I'm concerned with very large perf_event_max_stack that
can cause integer overflow.
So I still think we have to set reasonable min and max.
Other than that it's ack.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-22 20:52   ` [PATCH/RFC v2] " Arnaldo Carvalho de Melo
  2016-04-22 21:37     ` Alexei Starovoitov
@ 2016-04-22 22:05     ` David Ahern
  2016-04-22 22:18       ` Alexei Starovoitov
  1 sibling, 1 reply; 38+ messages in thread
From: David Ahern @ 2016-04-22 22:05 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Alexei Starovoitov
  Cc: Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Brendan Gregg, Alexander Shishkin, Alexei Starovoitov, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> Em Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov escreveu:
>> On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
>
>> Nice. I like it. That's a great approach to hard problem.
>> Java guys will be happy too.
>> Please also adjust two places in kernel/bpf/stackmap.c
>
>>> +	{
>>> +		.procname	= "perf_event_max_stack",
>>> +		.data		= NULL, /* filled in by handler */
>>> +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
>>> +		.mode		= 0644,
>>> +		.proc_handler	= perf_event_max_stack_handler,
>>> +		.extra1		= &zero,
>
>> zero seems to be the wrong minimum. I think it should be at least 2 to
>> to fit user/kernel tags ?  Probably needs to define max as well.
>
> So, if someone asks for zero, it will not copy anything, but then, this
> would be what the user had asked for :-)
>
> Ditto for the max, if someone asks for too big a callchain, then when
> allocating it it will fail and no callchain will be produced, that or it
> will be able to allocate but will take too long copying that many
> addresses, and we would be prevented from doing so by some other
> protection, iirc there is perf_cpu_time_max_percent, and then buffer
> space will run out.
>
> So I think that leaving it as is is enough, no?
>
> Can I keep your Acked-by? David, can I keep yours?

Yes


> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 343c22f5e867..6fe77349fa9d 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -18,6 +18,14 @@ struct callchain_cpus_entries {
>   	struct perf_callchain_entry	*cpu_entries[0];
>   };
>
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static size_t perf_callchain_entry__sizeof(void)
> +{
> +	return sizeof(struct perf_callchain_entry) +
> +	       sizeof(__u64) * sysctl_perf_event_max_stack;
> +}
> +

To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so 
that should be ok.


>   static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
>   static atomic_t nr_callchain_events;
>   static DEFINE_MUTEX(callchain_mutex);
> @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
>   	if (!entries)
>   		return -ENOMEM;
>
> -	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> +	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
>
>   	for_each_possible_cpu(cpu) {
>   		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-22 22:05     ` David Ahern
@ 2016-04-22 22:18       ` Alexei Starovoitov
  2016-04-25 16:14         ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-22 22:18 UTC (permalink / raw)
  To: David Ahern
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Frederic Weisbecker,
	Ingo Molnar, Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> >Em Wed, Apr 20, 2016 at 04:04:12PM -0700, Alexei Starovoitov escreveu:
> >>On Wed, Apr 20, 2016 at 07:47:30PM -0300, Arnaldo Carvalho de Melo wrote:
> >
> >>Nice. I like it. That's a great approach to hard problem.
> >>Java guys will be happy too.
> >>Please also adjust two places in kernel/bpf/stackmap.c
> >
> >>>+	{
> >>>+		.procname	= "perf_event_max_stack",
> >>>+		.data		= NULL, /* filled in by handler */
> >>>+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> >>>+		.mode		= 0644,
> >>>+		.proc_handler	= perf_event_max_stack_handler,
> >>>+		.extra1		= &zero,
> >
> >>zero seems to be the wrong minimum. I think it should be at least 2 to
> >>to fit user/kernel tags ?  Probably needs to define max as well.
> >
> >So, if someone asks for zero, it will not copy anything, but then, this
> >would be what the user had asked for :-)
> >
> >Ditto for the max, if someone asks for too big a callchain, then when
> >allocating it it will fail and no callchain will be produced, that or it
> >will be able to allocate but will take too long copying that many
> >addresses, and we would be prevented from doing so by some other
> >protection, iirc there is perf_cpu_time_max_percent, and then buffer
> >space will run out.
> >
> >So I think that leaving it as is is enough, no?
> >
> >Can I keep your Acked-by? David, can I keep yours?
> 
> Yes
> 
> 
> >diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> >index 343c22f5e867..6fe77349fa9d 100644
> >--- a/kernel/events/callchain.c
> >+++ b/kernel/events/callchain.c
> >@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
> >  	struct perf_callchain_entry	*cpu_entries[0];
> >  };
> >
> >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> >+
> >+static size_t perf_callchain_entry__sizeof(void)
> >+{
> >+	return sizeof(struct perf_callchain_entry) +
> >+	       sizeof(__u64) * sysctl_perf_event_max_stack;
> >+}
> >+
> 
> To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> that should be ok.
> 
> 
> >  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> >  static atomic_t nr_callchain_events;
> >  static DEFINE_MUTEX(callchain_mutex);
> >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> >  	if (!entries)
> >  		return -ENOMEM;
> >
> >-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> >+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> >
> >  	for_each_possible_cpu(cpu) {
> >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,

right... and looking into it further, realized that the patch is broken,
since get_callchain_entry() is doing:
  return &entries->cpu_entries[cpu][*rctx];
whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
So definitely needs another respin.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-22 22:18       ` Alexei Starovoitov
@ 2016-04-25 16:14         ` Arnaldo Carvalho de Melo
  2016-04-25 16:27           ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-25 16:14 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> > On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> > >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > >+
> > >+static size_t perf_callchain_entry__sizeof(void)
> > >+{
> > >+	return sizeof(struct perf_callchain_entry) +
> > >+	       sizeof(__u64) * sysctl_perf_event_max_stack;
> > >+}
> > >+

> > To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> > that should be ok.

> > >  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > >  static atomic_t nr_callchain_events;
> > >  static DEFINE_MUTEX(callchain_mutex);
> > >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > >  	if (!entries)
> > >  		return -ENOMEM;

> > >-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > >+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;

> > >  	for_each_possible_cpu(cpu) {
> > >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
 
> right... and looking into it further, realized that the patch is broken,
> since get_callchain_entry() is doing:
>   return &entries->cpu_entries[cpu][*rctx];
> whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> So definitely needs another respin.

Huh? Can you elaborate a bit more?

Are you saying this is a bug introduced by this patch or something
pre-existing?

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 16:14         ` Arnaldo Carvalho de Melo
@ 2016-04-25 16:27           ` Arnaldo Carvalho de Melo
  2016-04-25 19:22             ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-25 16:27 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> > On Fri, Apr 22, 2016 at 04:05:31PM -0600, David Ahern wrote:
> > > On 4/22/16 2:52 PM, Arnaldo Carvalho de Melo wrote:
> > > >+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > > >+
> > > >+static size_t perf_callchain_entry__sizeof(void)
> > > >+{
> > > >+	return sizeof(struct perf_callchain_entry) +
> > > >+	       sizeof(__u64) * sysctl_perf_event_max_stack;
> > > >+}
> > > >+
> 
> > > To Alexei's comment, a max_stack of 0 still has a non-zero alloc size so
> > > that should be ok.
> 
> > > >  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > > >  static atomic_t nr_callchain_events;
> > > >  static DEFINE_MUTEX(callchain_mutex);
> > > >@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
> > > >  	if (!entries)
> > > >  		return -ENOMEM;
> 
> > > >-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
> > > >+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> 
> > > >  	for_each_possible_cpu(cpu) {
> > > >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
>  
> > right... and looking into it further, realized that the patch is broken,
> > since get_callchain_entry() is doing:
> >   return &entries->cpu_entries[cpu][*rctx];
> > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > So definitely needs another respin.
> 
> Huh? Can you elaborate a bit more?
> 
> Are you saying this is a bug introduced by this patch or something
> pre-existing?

When we alloc we alloc:

/*
 * Number of contexts where an event can trigger:
 *      task, softirq, hardirq, nmi.
 */
#define PERF_NR_CONTEXTS        4

        callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_nmi() (3)]
        callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_irq() (2)]
        callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][in_sortirq() (1)]
        callchain_cpus_entries[0 .. (nr_cpu_ids - 1)][(0)]

And all of these have this type:

struct perf_callchain_entry {
        __u64     nr;
        __u64     ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};

So, it will return a struct_callchain_entry pointer to a 8-byte sized
chunk, with just perf_callchain_entry->nr, and will not try to touch
perf_callchain_entry->ip[] since sysctl_perf_event_max_stack is zero.

But perf_callchain_entry->ip is not a pointer... Got it ;-\

Touché, respinning...

Brendan, this probably affected you results...

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 16:27           ` Arnaldo Carvalho de Melo
@ 2016-04-25 19:22             ` Arnaldo Carvalho de Melo
  2016-04-25 20:06               ` Alexei Starovoitov
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-25 19:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> > > right... and looking into it further, realized that the patch is broken,
> > > since get_callchain_entry() is doing:
> > >   return &entries->cpu_entries[cpu][*rctx];
> > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > So definitely needs another respin.

> struct perf_callchain_entry {
>         __u64     nr;
>         __u64     ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> };
 
> But perf_callchain_entry->ip is not a pointer... Got it ;-\

This is what I am building now, a patch on top of the previous, that
will be combined and sent as v3, if I don't find any more funnies:

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6fe77349fa9d..40657892a7e0 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -20,11 +20,10 @@ struct callchain_cpus_entries {
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
 
-static size_t perf_callchain_entry__sizeof(void)
-{
-	return sizeof(struct perf_callchain_entry) +
-	       sizeof(__u64) * sysctl_perf_event_max_stack;
-}
+#define __perf_callchain_entry_size(entries) \
+	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
+
+static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
@@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)&entries->cpu_entries[cpu][0]) +
+		(*rctx * perf_callchain_entry_size));
 }
 
 static void
@@ -236,10 +236,12 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
 		return ret;
 
 	mutex_lock(&callchain_mutex);
-	if (atomic_read(&nr_callchain_events))
+	if (atomic_read(&nr_callchain_events)) {
 		ret = -EBUSY;
-	else
+	} else {
 		sysctl_perf_event_max_stack = new_value;
+		perf_callchain_entry_size   = __perf_callchain_entry_size(new_value);
+	}
 
 	mutex_unlock(&callchain_mutex);
 

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 19:22             ` Arnaldo Carvalho de Melo
@ 2016-04-25 20:06               ` Alexei Starovoitov
  2016-04-25 20:17                 ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-25 20:06 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> > > > right... and looking into it further, realized that the patch is broken,
> > > > since get_callchain_entry() is doing:
> > > >   return &entries->cpu_entries[cpu][*rctx];
> > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > So definitely needs another respin.
> 
> > struct perf_callchain_entry {
> >         __u64     nr;
> >         __u64     ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > };
>  
> > But perf_callchain_entry->ip is not a pointer... Got it ;-\
> 
> This is what I am building now, a patch on top of the previous, that
> will be combined and sent as v3, if I don't find any more funnies:
> 
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 6fe77349fa9d..40657892a7e0 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
>  
>  int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
>  
> -static size_t perf_callchain_entry__sizeof(void)
> -{
> -	return sizeof(struct perf_callchain_entry) +
> -	       sizeof(__u64) * sysctl_perf_event_max_stack;
> -}
> +#define __perf_callchain_entry_size(entries) \
> +	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> +
> +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
>  
>  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
>  static atomic_t nr_callchain_events;
> @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
>  	if (!entries)
>  		return -ENOMEM;
>  
> -	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> +	size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
>  
>  	for_each_possible_cpu(cpu) {
>  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
>  
>  	cpu = smp_processor_id();
>  
> -	return &entries->cpu_entries[cpu][*rctx];
> +	return (((void *)&entries->cpu_entries[cpu][0]) +
> +		(*rctx * perf_callchain_entry_size));

I think the following would be easier to read:

+	return (void *)entries->cpu_entries[cpu] +
+		*rctx * perf_callchain_entry_size;
?
if didn't mixed up the ordering...

and probably we could do the math on the spot instead of introducing
perf_callchain_entry_size global variable?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 20:06               ` Alexei Starovoitov
@ 2016-04-25 20:17                 ` Arnaldo Carvalho de Melo
  2016-04-25 21:59                   ` Alexei Starovoitov
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-25 20:17 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov escreveu:
> On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > > Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> > > > > right... and looking into it further, realized that the patch is broken,
> > > > > since get_callchain_entry() is doing:
> > > > >   return &entries->cpu_entries[cpu][*rctx];
> > > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > > So definitely needs another respin.
> > 
> > > struct perf_callchain_entry {
> > >         __u64     nr;
> > >         __u64     ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > > };
> >  
> > > But perf_callchain_entry->ip is not a pointer... Got it ;-\
> > 
> > This is what I am building now, a patch on top of the previous, that
> > will be combined and sent as v3, if I don't find any more funnies:
> > 
> > diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> > index 6fe77349fa9d..40657892a7e0 100644
> > --- a/kernel/events/callchain.c
> > +++ b/kernel/events/callchain.c
> > @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
> >  
> >  int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> >  
> > -static size_t perf_callchain_entry__sizeof(void)
> > -{
> > -	return sizeof(struct perf_callchain_entry) +
> > -	       sizeof(__u64) * sysctl_perf_event_max_stack;
> > -}
> > +#define __perf_callchain_entry_size(entries) \
> > +	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> > +
> > +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
> >  
> >  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> >  static atomic_t nr_callchain_events;
> > @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
> >  	if (!entries)
> >  		return -ENOMEM;
> >  
> > -	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > +	size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
> >  
> >  	for_each_possible_cpu(cpu) {
> >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> > @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> >  
> >  	cpu = smp_processor_id();
> >  
> > -	return &entries->cpu_entries[cpu][*rctx];
> > +	return (((void *)&entries->cpu_entries[cpu][0]) +
> > +		(*rctx * perf_callchain_entry_size));
> 
> I think the following would be easier to read:
> 
> +	return (void *)entries->cpu_entries[cpu] +
> +		*rctx * perf_callchain_entry_size;

Well, I thought that multiline expressions required parentheses, to make
them easier to read for someone, maybe Ingo? ;-)

Whatever, both generate the same result, really want me to change this?

> ?
> if didn't mixed up the ordering...

If you are not sure, then its not clearer, huh? ;-P
 
> and probably we could do the math on the spot instead of introducing
> perf_callchain_entry_size global variable?

I was trying to avoid the calc for each alloc, just doing it when we
change that number via the sysctl, probably not that big a deal, do you
really think we should do the math per-alloc instead of
per-sysctl-changing?

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 20:17                 ` Arnaldo Carvalho de Melo
@ 2016-04-25 21:59                   ` Alexei Starovoitov
  2016-04-25 23:41                     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-25 21:59 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 05:17:50PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov escreveu:
> > On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > > Em Mon, Apr 25, 2016 at 01:27:06PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > > Em Mon, Apr 25, 2016 at 01:14:25PM -0300, Arnaldo Carvalho de Melo escreveu:
> > > > > Em Fri, Apr 22, 2016 at 03:18:08PM -0700, Alexei Starovoitov escreveu:
> > > > > > right... and looking into it further, realized that the patch is broken,
> > > > > > since get_callchain_entry() is doing:
> > > > > >   return &entries->cpu_entries[cpu][*rctx];
> > > > > > whereas it should be dynamic offset based on sysctl_perf_event_max_stack*8
> > > > > > So definitely needs another respin.
> > > 
> > > > struct perf_callchain_entry {
> > > >         __u64     nr;
> > > >         __u64     ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
> > > > };
> > >  
> > > > But perf_callchain_entry->ip is not a pointer... Got it ;-\
> > > 
> > > This is what I am building now, a patch on top of the previous, that
> > > will be combined and sent as v3, if I don't find any more funnies:
> > > 
> > > diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> > > index 6fe77349fa9d..40657892a7e0 100644
> > > --- a/kernel/events/callchain.c
> > > +++ b/kernel/events/callchain.c
> > > @@ -20,11 +20,10 @@ struct callchain_cpus_entries {
> > >  
> > >  int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> > >  
> > > -static size_t perf_callchain_entry__sizeof(void)
> > > -{
> > > -	return sizeof(struct perf_callchain_entry) +
> > > -	       sizeof(__u64) * sysctl_perf_event_max_stack;
> > > -}
> > > +#define __perf_callchain_entry_size(entries) \
> > > +	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> > > +
> > > +static size_t perf_callchain_entry_size __read_mostly = __perf_callchain_entry_size(PERF_MAX_STACK_DEPTH);
> > >  
> > >  static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
> > >  static atomic_t nr_callchain_events;
> > > @@ -81,7 +80,7 @@ static int alloc_callchain_buffers(void)
> > >  	if (!entries)
> > >  		return -ENOMEM;
> > >  
> > > -	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
> > > +	size = perf_callchain_entry_size * PERF_NR_CONTEXTS;
> > >  
> > >  	for_each_possible_cpu(cpu) {
> > >  		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
> > > @@ -155,7 +154,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> > >  
> > >  	cpu = smp_processor_id();
> > >  
> > > -	return &entries->cpu_entries[cpu][*rctx];
> > > +	return (((void *)&entries->cpu_entries[cpu][0]) +
> > > +		(*rctx * perf_callchain_entry_size));
> > 
> > I think the following would be easier to read:
> > 
> > +	return (void *)entries->cpu_entries[cpu] +
> > +		*rctx * perf_callchain_entry_size;
> 
> Well, I thought that multiline expressions required parentheses, to make
> them easier to read for someone, maybe Ingo? ;-)
> 
> Whatever, both generate the same result, really want me to change this?

I was mainly complaining about unnecessary &..[0]

> > ?
> > if didn't mixed up the ordering...
> 
> If you are not sure, then its not clearer, huh? ;-P

matter of taste. No strong opinion

> > and probably we could do the math on the spot instead of introducing
> > perf_callchain_entry_size global variable?
> 
> I was trying to avoid the calc for each alloc, just doing it when we
> change that number via the sysctl, probably not that big a deal, do you
> really think we should do the math per-alloc instead of
> per-sysctl-changing?

The math is trivial:
#define __perf_callchain_entry_size(entries) \
	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
shift and add after load is almost the same speed as load, since
integer multiply right after is dominating the cost.
whereas updating two global variables can potentially cause
race conditions that need to be analyzed.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 21:59                   ` Alexei Starovoitov
@ 2016-04-25 23:41                     ` Arnaldo Carvalho de Melo
  2016-04-26  0:07                       ` Alexei Starovoitov
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-25 23:41 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 02:59:49PM -0700, Alexei Starovoitov escreveu:
> On Mon, Apr 25, 2016 at 05:17:50PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Apr 25, 2016 at 01:06:48PM -0700, Alexei Starovoitov escreveu:
> > > On Mon, Apr 25, 2016 at 04:22:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > Well, I thought that multiline expressions required parentheses, to make
> > them easier to read for someone, maybe Ingo? ;-)
> > 
> > Whatever, both generate the same result, really want me to change this?
> 
> I was mainly complaining about unnecessary &..[0]
> 
> > > ?
> > > if didn't mixed up the ordering...
> > 
> > If you are not sure, then its not clearer, huh? ;-P
> 
> matter of taste. No strong opinion
> 
> > > and probably we could do the math on the spot instead of introducing
> > > perf_callchain_entry_size global variable?
> > 
> > I was trying to avoid the calc for each alloc, just doing it when we
> > change that number via the sysctl, probably not that big a deal, do you
> > really think we should do the math per-alloc instead of
> > per-sysctl-changing?
> 
> The math is trivial:
> #define __perf_callchain_entry_size(entries) \
> 	(sizeof(struct perf_callchain_entry) + sizeof(__u64) * entries)
> shift and add after load is almost the same speed as load, since
> integer multiply right after is dominating the cost.
> whereas updating two global variables can potentially cause
> race conditions that need to be analyzed.

Ok, I thought the places where we changed those variables were always
under that callchain_mutex, but nah, too much trouble checking, find v3
below, see if I can have your Acked-by.

Thanks,

- Arnaldo

commit 565e8b138157fbe1c4495f083b88db681f52c6b3
Author: Arnaldo Carvalho de Melo <acme@redhat.com>
Date:   Thu Apr 21 12:28:50 2016 -0300

    perf core: Allow setting up max frame stack depth via sysctl
    
    The default remains 127, which is good for most cases, and not even hit
    most of the time, but then for some cases, as reported by Brendan, 1024+
    deep frames are appearing on the radar for things like groovy, ruby.
    
    And in some workloads putting a _lower_ cap on this may make sense. One
    that is per event still needs to be put in place tho.
    
    The new file is:
    
      # cat /proc/sys/kernel/perf_event_max_stack
      127
    
    Chaging it:
    
      # echo 256 > /proc/sys/kernel/perf_event_max_stack
      # cat /proc/sys/kernel/perf_event_max_stack
      256
    
    But as soon as there is some event using callchains we get:
    
      # echo 512 > /proc/sys/kernel/perf_event_max_stack
      -bash: echo: write error: Device or resource busy
      #
    
    Because we only allocate the callchain percpu data structures when there
    is a user, which allows for changing the max easily, its just a matter
    of having no callchain users at that point.
    
    Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
    Acked-by: Alexei Starovoitov <ast@kernel.org>
    Acked-by: David Ahern <dsahern@gmail.com>
    Cc: Adrian Hunter <adrian.hunter@intel.com>
    Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
    Cc: He Kuang <hekuang@huawei.com>
    Cc: Jiri Olsa <jolsa@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Masami Hiramatsu <mhiramat@kernel.org>
    Cc: Milian Wolff <milian.wolff@kdab.com>
    Cc: Namhyung Kim <namhyung@kernel.org>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Stephane Eranian <eranian@google.com>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Cc: Vince Weaver <vincent.weaver@maine.edu>
    Cc: Wang Nan <wangnan0@huawei.com>
    Cc: Zefan Li <lizefan@huawei.com>
    Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
    Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..afbe9fb32ec5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v2] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-25 23:41                     ` Arnaldo Carvalho de Melo
@ 2016-04-26  0:07                       ` Alexei Starovoitov
  2016-04-26  0:29                         ` [PATCH/RFC v3] " Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-26  0:07 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Ahern, Peter Zijlstra, Frederic Weisbecker, Ingo Molnar,
	Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 08:41:39PM -0300, Arnaldo Carvalho de Melo wrote:
>  
> +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
> +
> +static inline size_t perf_callchain_entry__sizeof(void)
> +{
> +	return (sizeof(struct perf_callchain_entry) +
> +		sizeof(__u64) * sysctl_perf_event_max_stack);
> +}
> @@ -1144,6 +1144,14 @@ static struct ctl_table kern_table[] = {
>  		.extra1		= &zero,
>  		.extra2		= &one_hundred,
>  	},
> +	{
> +		.procname	= "perf_event_max_stack",
> +		.data		= NULL, /* filled in by handler */
> +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> +		.mode		= 0644,
> +		.proc_handler	= perf_event_max_stack_handler,
> +		.extra1		= &zero,
> +	},

you need to define a max value otherwise perf_callchain_entry__sizeof
will overflow. Sure it's root only facility, but still not nice.
1M? Anything above 1M stack frames would be insane anyway.
The rest looks good. Thanks!

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:07                       ` Alexei Starovoitov
@ 2016-04-26  0:29                         ` Arnaldo Carvalho de Melo
  2016-04-26  0:44                           ` Alexei Starovoitov
                                             ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-26  0:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Arnaldo Carvalho de Melo, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > +	{
> > +		.procname	= "perf_event_max_stack",
> > +		.data		= NULL, /* filled in by handler */
> > +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> > +		.mode		= 0644,
> > +		.proc_handler	= perf_event_max_stack_handler,
> > +		.extra1		= &zero,
> > +	},
 
> you need to define a max value otherwise perf_callchain_entry__sizeof
> will overflow. Sure it's root only facility, but still not nice.
> 1M? Anything above 1M stack frames would be insane anyway.
> The rest looks good. Thanks!

Something else? ;-)

:-)

- Arnaldo

commit cd544af4f7fede01cb512d52bb3efe62aa19271d
Author: Arnaldo Carvalho de Melo <acme@redhat.com>
Date:   Thu Apr 21 12:28:50 2016 -0300

    perf core: Allow setting up max frame stack depth via sysctl
    
    The default remains 127, which is good for most cases, and not even hit
    most of the time, but then for some cases, as reported by Brendan, 1024+
    deep frames are appearing on the radar for things like groovy, ruby.
    
    And in some workloads putting a _lower_ cap on this may make sense. One
    that is per event still needs to be put in place tho.
    
    The new file is:
    
      # cat /proc/sys/kernel/perf_event_max_stack
      127
    
    Chaging it:
    
      # echo 256 > /proc/sys/kernel/perf_event_max_stack
      # cat /proc/sys/kernel/perf_event_max_stack
      256
    
    But as soon as there is some event using callchains we get:
    
      # echo 512 > /proc/sys/kernel/perf_event_max_stack
      -bash: echo: write error: Device or resource busy
      #
    
    Because we only allocate the callchain percpu data structures when there
    is a user, which allows for changing the max easily, its just a matter
    of having no callchain users at that point.
    
    Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
    Acked-by: Alexei Starovoitov <ast@kernel.org>
    Acked-by: David Ahern <dsahern@gmail.com>
    Cc: Adrian Hunter <adrian.hunter@intel.com>
    Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
    Cc: He Kuang <hekuang@huawei.com>
    Cc: Jiri Olsa <jolsa@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Masami Hiramatsu <mhiramat@kernel.org>
    Cc: Milian Wolff <milian.wolff@kdab.com>
    Cc: Namhyung Kim <namhyung@kernel.org>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Stephane Eranian <eranian@google.com>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Cc: Vince Weaver <vincent.weaver@maine.edu>
    Cc: Wang Nan <wangnan0@huawei.com>
    Cc: Zefan Li <lizefan@huawei.com>
    Link: http://lkml.kernel.org/n/tip-cgls6uuncwjtq969tys1j6b0@git.kernel.org
    Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &six_hundred_forty_kb,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:29                         ` [PATCH/RFC v3] " Arnaldo Carvalho de Melo
@ 2016-04-26  0:44                           ` Alexei Starovoitov
  2016-04-26  0:47                             ` Arnaldo Carvalho de Melo
  2016-04-26 21:58                           ` Frederic Weisbecker
  2016-04-27 15:39                           ` [tip:perf/core] " tip-bot for Arnaldo Carvalho de Melo
  2 siblings, 1 reply; 38+ messages in thread
From: Alexei Starovoitov @ 2016-04-26  0:44 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: David Ahern, Peter Zijlstra, Frederic Weisbecker, Ingo Molnar,
	Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > > +	{
> > > +		.procname	= "perf_event_max_stack",
> > > +		.data		= NULL, /* filled in by handler */
> > > +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> > > +		.mode		= 0644,
> > > +		.proc_handler	= perf_event_max_stack_handler,
> > > +		.extra1		= &zero,
> > > +	},
>  
> > you need to define a max value otherwise perf_callchain_entry__sizeof
> > will overflow. Sure it's root only facility, but still not nice.
> > 1M? Anything above 1M stack frames would be insane anyway.
> > The rest looks good. Thanks!
> 
> Something else? ;-)

all looks good to me. Thanks a bunch!

> 
> commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> Author: Arnaldo Carvalho de Melo <acme@redhat.com>
> Date:   Thu Apr 21 12:28:50 2016 -0300
> 
>     perf core: Allow setting up max frame stack depth via sysctl
>     
>     The default remains 127, which is good for most cases, and not even hit
>     most of the time, but then for some cases, as reported by Brendan, 1024+
>     deep frames are appearing on the radar for things like groovy, ruby.
>     
>     And in some workloads putting a _lower_ cap on this may make sense. One
>     that is per event still needs to be put in place tho.
>     
>     The new file is:
>     
>       # cat /proc/sys/kernel/perf_event_max_stack
>       127
>     
>     Chaging it:
>     
>       # echo 256 > /proc/sys/kernel/perf_event_max_stack
>       # cat /proc/sys/kernel/perf_event_max_stack
>       256
>     
>     But as soon as there is some event using callchains we get:
>     
>       # echo 512 > /proc/sys/kernel/perf_event_max_stack
>       -bash: echo: write error: Device or resource busy
>       #
>     
>     Because we only allocate the callchain percpu data structures when there
>     is a user, which allows for changing the max easily, its just a matter
>     of having no callchain users at that point.
>     
>     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
>     Acked-by: Alexei Starovoitov <ast@kernel.org>

yep :)
hopefully Brendan can give it another spin.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:44                           ` Alexei Starovoitov
@ 2016-04-26  0:47                             ` Arnaldo Carvalho de Melo
  2016-04-26  0:49                               ` Brendan Gregg
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-26  0:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Arnaldo Carvalho de Melo, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
> > > > +	{
> > > > +		.procname	= "perf_event_max_stack",
> > > > +		.data		= NULL, /* filled in by handler */
> > > > +		.maxlen		= sizeof(sysctl_perf_event_max_stack),
> > > > +		.mode		= 0644,
> > > > +		.proc_handler	= perf_event_max_stack_handler,
> > > > +		.extra1		= &zero,
> > > > +	},
> >  
> > > you need to define a max value otherwise perf_callchain_entry__sizeof
> > > will overflow. Sure it's root only facility, but still not nice.
> > > 1M? Anything above 1M stack frames would be insane anyway.
> > > The rest looks good. Thanks!
> > 
> > Something else? ;-)
> 
> all looks good to me. Thanks a bunch!

Thanks for checking!
 
> >     Because we only allocate the callchain percpu data structures when there
> >     is a user, which allows for changing the max easily, its just a matter
> >     of having no callchain users at that point.
> >     
> >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> >     Acked-by: Alexei Starovoitov <ast@kernel.org>
> 
> yep :)
> hopefully Brendan can give it another spin.

Agreed, and I'm calling it a day anyway, Brendan, please consider
retesting, thanks,

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:47                             ` Arnaldo Carvalho de Melo
@ 2016-04-26  0:49                               ` Brendan Gregg
  2016-04-26 16:33                                 ` Arnaldo Carvalho de Melo
  2016-04-26 20:02                                 ` Brendan Gregg
  0 siblings, 2 replies; 38+ messages in thread
From: Brendan Gregg @ 2016-04-26  0:49 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
<arnaldo.melo@gmail.com> wrote:
> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
>> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
>> > > > +       {
>> > > > +               .procname       = "perf_event_max_stack",
>> > > > +               .data           = NULL, /* filled in by handler */
>> > > > +               .maxlen         = sizeof(sysctl_perf_event_max_stack),
>> > > > +               .mode           = 0644,
>> > > > +               .proc_handler   = perf_event_max_stack_handler,
>> > > > +               .extra1         = &zero,
>> > > > +       },
>> >
>> > > you need to define a max value otherwise perf_callchain_entry__sizeof
>> > > will overflow. Sure it's root only facility, but still not nice.
>> > > 1M? Anything above 1M stack frames would be insane anyway.
>> > > The rest looks good. Thanks!
>> >
>> > Something else? ;-)
>>
>> all looks good to me. Thanks a bunch!
>
> Thanks for checking!
>
>> >     Because we only allocate the callchain percpu data structures when there
>> >     is a user, which allows for changing the max easily, its just a matter
>> >     of having no callchain users at that point.
>> >
>> >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
>> >     Acked-by: Alexei Starovoitov <ast@kernel.org>
>>
>> yep :)
>> hopefully Brendan can give it another spin.
>
> Agreed, and I'm calling it a day anyway, Brendan, please consider
> retesting, thanks,
>

Will do, thanks!

Brendan


> - Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:49                               ` Brendan Gregg
@ 2016-04-26 16:33                                 ` Arnaldo Carvalho de Melo
  2016-04-26 16:45                                   ` David Ahern
  2016-04-26 20:02                                 ` Brendan Gregg
  1 sibling, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-26 16:33 UTC (permalink / raw)
  To: Brendan Gregg
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Mon, Apr 25, 2016 at 05:49:38PM -0700, Brendan Gregg escreveu:
> On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
> <arnaldo.melo@gmail.com> wrote:
> > Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> >> >     Because we only allocate the callchain percpu data structures when there
> >> >     is a user, which allows for changing the max easily, its just a matter
> >> >     of having no callchain users at that point.
> >> >
> >> >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> >> >     Acked-by: Alexei Starovoitov <ast@kernel.org>
> >>
> >> yep :)
> >> hopefully Brendan can give it another spin.
> >
> > Agreed, and I'm calling it a day anyway, Brendan, please consider
> > retesting, thanks,
 
> Will do, thanks!
 
> Brendan

So, for completeness, further testing it to see how far it goes on a 8GB
machine I got:

[root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack 
[root@emilia ~]# perf record -g ls
Error:
The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
/bin/dmesg may provide additional information.
No CONFIG_PERF_EVENTS=y kernel support configured?

[root@emilia ~]#

[root@emilia ~]# echo 131000 > /proc/sys/kernel/perf_event_max_stack 
[root@emilia ~]# perf record -g usleep
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.013 MB perf.data (9 samples) ]
[root@emilia ~]# ls -la perf.data
-rw-------. 1 root root 15736 Apr 26 13:33 perf.data
[root@emilia ~]# 

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 16:33                                 ` Arnaldo Carvalho de Melo
@ 2016-04-26 16:45                                   ` David Ahern
  2016-04-27 13:20                                     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 38+ messages in thread
From: David Ahern @ 2016-04-26 16:45 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Brendan Gregg
  Cc: Alexei Starovoitov, Peter Zijlstra, Frederic Weisbecker,
	Ingo Molnar, Adrian Hunter, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On 4/26/16 10:33 AM, Arnaldo Carvalho de Melo wrote:
> So, for completeness, further testing it to see how far it goes on a 8GB
> machine I got:
>
> [root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack
> [root@emilia ~]# perf record -g ls
> Error:
> The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
> /bin/dmesg may provide additional information.
> No CONFIG_PERF_EVENTS=y kernel support configured?

I thought we fixed up the last line to not display for errors like this.

How about adding ENOMEM case to perf_evsel__open_strerror?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:49                               ` Brendan Gregg
  2016-04-26 16:33                                 ` Arnaldo Carvalho de Melo
@ 2016-04-26 20:02                                 ` Brendan Gregg
  2016-04-26 21:05                                   ` Arnaldo Carvalho de Melo
  1 sibling, 1 reply; 38+ messages in thread
From: Brendan Gregg @ 2016-04-26 20:02 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg
<brendan.d.gregg@gmail.com> wrote:
> On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo
> <arnaldo.melo@gmail.com> wrote:
>> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>>> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
>>> > Em Mon, Apr 25, 2016 at 05:07:26PM -0700, Alexei Starovoitov escreveu:
>>> > > > +       {
>>> > > > +               .procname       = "perf_event_max_stack",
>>> > > > +               .data           = NULL, /* filled in by handler */
>>> > > > +               .maxlen         = sizeof(sysctl_perf_event_max_stack),
>>> > > > +               .mode           = 0644,
>>> > > > +               .proc_handler   = perf_event_max_stack_handler,
>>> > > > +               .extra1         = &zero,
>>> > > > +       },
>>> >
>>> > > you need to define a max value otherwise perf_callchain_entry__sizeof
>>> > > will overflow. Sure it's root only facility, but still not nice.
>>> > > 1M? Anything above 1M stack frames would be insane anyway.
>>> > > The rest looks good. Thanks!
>>> >
>>> > Something else? ;-)
>>>
>>> all looks good to me. Thanks a bunch!
>>
>> Thanks for checking!
>>
>>> >     Because we only allocate the callchain percpu data structures when there
>>> >     is a user, which allows for changing the max easily, its just a matter
>>> >     of having no callchain users at that point.
>>> >
>>> >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
>>> >     Acked-by: Alexei Starovoitov <ast@kernel.org>
>>>
>>> yep :)
>>> hopefully Brendan can give it another spin.
>>
>> Agreed, and I'm calling it a day anyway, Brendan, please consider
>> retesting, thanks,
>>
>
> Will do, thanks!
>

Looks good.

I started with max depth = 512, and even that was still truncated, and
had to profile again at 1024 to capture the full stacks. Seems to
generally match the flame graph I generated with V1, which made me
want to check that I'm running the new patch, and am:

# grep six_hundred_forty_kb /proc/kallsyms
ffffffff81c431e0 d six_hundred_forty_kb

I was mucking around and was able to get "corrupted callchain.
skipping..." errors, but these look to be expected -- that was
profiling a binary (bash) that doesn't have frame pointers. Some perf
script -D output:

16 3204735442777 0x18f0d8 [0x2030]: PERF_RECORD_SAMPLE(IP, 0x1):
18134/18134: 0xffffffff8118b6a4 period: 1001001 addr: 0
... FP chain: nr:1023
.....  0: ffffffffffffff80
.....  1: ffffffff8118b6a4
.....  2: ffffffff8118bc47
.....  3: ffffffff811d8c85
.....  4: ffffffff811b18f8
.....  5: ffffffff811b2a55
.....  6: ffffffff811b5ea0
.....  7: ffffffff810663c0
.....  8: ffffffff810666e0
.....  9: ffffffff817b9d28
..... 10: fffffffffffffe00
..... 11: 00000000004b45e2
..... 12: 000000000000610f
..... 13: 0000000000006110
..... 14: 0000000000006111
..... 15: 0000000000006112
..... 16: 0000000000006113
..... 17: 0000000000006114
..... 18: 0000000000006115
..... 19: 0000000000006116
..... 20: 0000000000006117
[...]
..... 1021: 000000000000650b
..... 1022: 000000000000650c
 ... thread: bash:18134
 ...... dso: /lib/modules/4.6.0-rc5-virtual/build/vmlinux
bash 18134 [016]  3204.735442:    1001001 cpu-clock:

Brendan

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 20:02                                 ` Brendan Gregg
@ 2016-04-26 21:05                                   ` Arnaldo Carvalho de Melo
  2016-04-26 21:55                                     ` Peter Zijlstra
  2016-04-26 21:58                                     ` Brendan Gregg
  0 siblings, 2 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-26 21:05 UTC (permalink / raw)
  To: Brendan Gregg
  Cc: Arnaldo Carvalho de Melo, Alexei Starovoitov, David Ahern,
	Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <brendan.d.gregg@gmail.com> wrote:
> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote:
> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >>> yep :)
> >>> hopefully Brendan can give it another spin.
> >>
> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
> >> retesting, thanks,
> >
> > Will do, thanks!
> 
> Looks good.
> 
> I started with max depth = 512, and even that was still truncated, and
> had to profile again at 1024 to capture the full stacks. Seems to
> generally match the flame graph I generated with V1, which made me
> want to check that I'm running the new patch, and am:
> 
> # grep six_hundred_forty_kb /proc/kallsyms
> ffffffff81c431e0 d six_hundred_forty_kb
> 
> I was mucking around and was able to get "corrupted callchain.
> skipping..." errors, but these look to be expected -- that was

Yeah, thanks for testing!

And since you talked about userspace without frame pointers, have you
played with '--call-graph lbr'?

- Arnaldo

> profiling a binary (bash) that doesn't have frame pointers. Some perf
> script -D output:
> 
> 16 3204735442777 0x18f0d8 [0x2030]: PERF_RECORD_SAMPLE(IP, 0x1):
> 18134/18134: 0xffffffff8118b6a4 period: 1001001 addr: 0
> ... FP chain: nr:1023
> .....  0: ffffffffffffff80
> .....  1: ffffffff8118b6a4
> .....  2: ffffffff8118bc47
> .....  3: ffffffff811d8c85
> .....  4: ffffffff811b18f8
> .....  5: ffffffff811b2a55
> .....  6: ffffffff811b5ea0
> .....  7: ffffffff810663c0
> .....  8: ffffffff810666e0
> .....  9: ffffffff817b9d28
> ..... 10: fffffffffffffe00
> ..... 11: 00000000004b45e2
> ..... 12: 000000000000610f
> ..... 13: 0000000000006110
> ..... 14: 0000000000006111
> ..... 15: 0000000000006112
> ..... 16: 0000000000006113
> ..... 17: 0000000000006114
> ..... 18: 0000000000006115
> ..... 19: 0000000000006116
> ..... 20: 0000000000006117
> [...]
> ..... 1021: 000000000000650b
> ..... 1022: 000000000000650c
>  ... thread: bash:18134
>  ...... dso: /lib/modules/4.6.0-rc5-virtual/build/vmlinux
> bash 18134 [016]  3204.735442:    1001001 cpu-clock:
> 
> Brendan

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:05                                   ` Arnaldo Carvalho de Melo
@ 2016-04-26 21:55                                     ` Peter Zijlstra
  2016-04-27 12:53                                       ` Arnaldo Carvalho de Melo
  2016-04-26 21:58                                     ` Brendan Gregg
  1 sibling, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2016-04-26 21:55 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Brendan Gregg, Arnaldo Carvalho de Melo, Alexei Starovoitov,
	David Ahern, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Tue, Apr 26, 2016 at 06:05:00PM -0300, Arnaldo Carvalho de Melo wrote:
> > I started with max depth = 512, and even that was still truncated, and
> > had to profile again at 1024 to capture the full stacks. Seems to

                           ^^^^^^

> > generally match the flame graph I generated with V1, which made me
> > want to check that I'm running the new patch, and am:
> > 
> > # grep six_hundred_forty_kb /proc/kallsyms
> > ffffffff81c431e0 d six_hundred_forty_kb
> > 
> > I was mucking around and was able to get "corrupted callchain.
> > skipping..." errors, but these look to be expected -- that was
> 
> Yeah, thanks for testing!
> 
> And since you talked about userspace without frame pointers, have you
> played with '--call-graph lbr'?

That seems to be at odds with his requirements; he needs 1024 entries to
capture full stacks, LBR is limited to 16/32 or so entries. That's 2
orders of magnitude difference.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:29                         ` [PATCH/RFC v3] " Arnaldo Carvalho de Melo
  2016-04-26  0:44                           ` Alexei Starovoitov
@ 2016-04-26 21:58                           ` Frederic Weisbecker
  2016-04-27 12:53                             ` Arnaldo Carvalho de Melo
  2016-04-27 15:39                           ` [tip:perf/core] " tip-bot for Arnaldo Carvalho de Melo
  2 siblings, 1 reply; 38+ messages in thread
From: Frederic Weisbecker @ 2016-04-26 21:58 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra, Ingo Molnar,
	Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> Author: Arnaldo Carvalho de Melo <acme@redhat.com>
> Date:   Thu Apr 21 12:28:50 2016 -0300
> 
>     perf core: Allow setting up max frame stack depth via sysctl
>     
>     The default remains 127, which is good for most cases, and not even hit
>     most of the time, but then for some cases, as reported by Brendan, 1024+
>     deep frames are appearing on the radar for things like groovy, ruby.
>     
>     And in some workloads putting a _lower_ cap on this may make sense. One
>     that is per event still needs to be put in place tho.
>     
>     The new file is:
>     
>       # cat /proc/sys/kernel/perf_event_max_stack
>       127
>     
>     Chaging it:
>     
>       # echo 256 > /proc/sys/kernel/perf_event_max_stack
>       # cat /proc/sys/kernel/perf_event_max_stack
>       256
>     
>     But as soon as there is some event using callchains we get:
>     
>       # echo 512 > /proc/sys/kernel/perf_event_max_stack
>       -bash: echo: write error: Device or resource busy
>       #
>     
>     Because we only allocate the callchain percpu data structures when there
>     is a user, which allows for changing the max easily, its just a matter
>     of having no callchain users at that point.
>     
>     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
>     Acked-by: Alexei Starovoitov <ast@kernel.org>
>     Acked-by: David Ahern <dsahern@gmail.com>

Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>

I first thought that this should be a tunable per event instead of a global sysctl
but then I realized that we still need a root-only-tunable maximum limit value to oppose
against any future per event limit and this sysctl value does the job.

Nice patch!

Thanks.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:05                                   ` Arnaldo Carvalho de Melo
  2016-04-26 21:55                                     ` Peter Zijlstra
@ 2016-04-26 21:58                                     ` Brendan Gregg
  2016-04-26 22:10                                       ` Peter Zijlstra
  2016-04-27 12:56                                       ` Arnaldo Carvalho de Melo
  1 sibling, 2 replies; 38+ messages in thread
From: Brendan Gregg @ 2016-04-26 21:58 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Tue, Apr 26, 2016 at 2:05 PM, Arnaldo Carvalho de Melo
<arnaldo.melo@gmail.com> wrote:
> Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
>> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <brendan.d.gregg@gmail.com> wrote:
>> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote:
>> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
>> >>> yep :)
>> >>> hopefully Brendan can give it another spin.
>> >>
>> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
>> >> retesting, thanks,
>> >
>> > Will do, thanks!
>>
>> Looks good.
>>
>> I started with max depth = 512, and even that was still truncated, and
>> had to profile again at 1024 to capture the full stacks. Seems to
>> generally match the flame graph I generated with V1, which made me
>> want to check that I'm running the new patch, and am:
>>
>> # grep six_hundred_forty_kb /proc/kallsyms
>> ffffffff81c431e0 d six_hundred_forty_kb
>>
>> I was mucking around and was able to get "corrupted callchain.
>> skipping..." errors, but these look to be expected -- that was
>
> Yeah, thanks for testing!
>
> And since you talked about userspace without frame pointers, have you
> played with '--call-graph lbr'?

Not really. Isn't it only 16 levels deep max?

Most of our Linux is Xen guests (EC2), and I'd have to check if the
MSRs are available for LBR (perf record --call-graph lbr ... returns
"The sys_perf_event_open() syscall returned with 95 (Operation not
supported) for event (cpu-clock).", so I'd guess not, although many
other MSRs are exposed).

BTS seemed more promising (deeper stacks), and there's already Xen
support for it (need to boot the Xen host with vpmu=bts, preferably
vpmu=bts,arch for some PMCs as well :).

Brendan

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:58                                     ` Brendan Gregg
@ 2016-04-26 22:10                                       ` Peter Zijlstra
  2016-04-27 12:56                                       ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-04-26 22:10 UTC (permalink / raw)
  To: Brendan Gregg
  Cc: Arnaldo Carvalho de Melo, Alexei Starovoitov, David Ahern,
	Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

On Tue, Apr 26, 2016 at 02:58:41PM -0700, Brendan Gregg wrote:
> BTS seemed more promising (deeper stacks), and there's already Xen
> support for it (need to boot the Xen host with vpmu=bts, preferably
> vpmu=bts,arch for some PMCs as well :).

BTS is a branch tracer; it simply traces _all_ branches, including
calls. It has fairly significant overhead.

But yes, by tracing all branches, you should get really shiny call
traces :-)

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:55                                     ` Peter Zijlstra
@ 2016-04-27 12:53                                       ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-27 12:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Brendan Gregg, Arnaldo Carvalho de Melo, Alexei Starovoitov,
	David Ahern, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Tue, Apr 26, 2016 at 11:55:36PM +0200, Peter Zijlstra escreveu:
> On Tue, Apr 26, 2016 at 06:05:00PM -0300, Arnaldo Carvalho de Melo wrote:
> > > I started with max depth = 512, and even that was still truncated, and
> > > had to profile again at 1024 to capture the full stacks. Seems to
 
>                            ^^^^^^
 
> > And since you talked about userspace without frame pointers, have you
> > played with '--call-graph lbr'?
 
> That seems to be at odds with his requirements; he needs 1024 entries to
> capture full stacks, LBR is limited to 16/32 or so entries. That's 2
> orders of magnitude difference.

Duh, you're right, must've confused with that BTS thing, but as you said
in another message, that generates way too much info, have to read more
about it and the PT as well :-\

Anyway, for syscall backtraces, using 16 or 32 as the default in 'perf
trace', seems interesting, automatically switching to DWARF (or FP if
configured by the user in a system built with -fno-omit-frame-pointer)
if, say, --max-stack 38, say, is specified.

Have to take a look at how to do that for raw_syscalls:sys_enter.

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:58                           ` Frederic Weisbecker
@ 2016-04-27 12:53                             ` Arnaldo Carvalho de Melo
  2016-04-27 16:09                               ` Frederic Weisbecker
  0 siblings, 1 reply; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-27 12:53 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Arnaldo Carvalho de Melo, Alexei Starovoitov, David Ahern,
	Peter Zijlstra, Ingo Molnar, Adrian Hunter, Brendan Gregg,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Tue, Apr 26, 2016 at 11:58:10PM +0200, Frederic Weisbecker escreveu:
> On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> > Author: Arnaldo Carvalho de Melo <acme@redhat.com>
> > Date:   Thu Apr 21 12:28:50 2016 -0300
> > 
> >     perf core: Allow setting up max frame stack depth via sysctl
> >     
> >     The default remains 127, which is good for most cases, and not even hit
> >     most of the time, but then for some cases, as reported by Brendan, 1024+
> >     deep frames are appearing on the radar for things like groovy, ruby.
> >     
> >     And in some workloads putting a _lower_ cap on this may make sense. One
> >     that is per event still needs to be put in place tho.
> >     
> >     The new file is:
> >     
> >       # cat /proc/sys/kernel/perf_event_max_stack
> >       127
> >     
> >     Chaging it:
> >     
> >       # echo 256 > /proc/sys/kernel/perf_event_max_stack
> >       # cat /proc/sys/kernel/perf_event_max_stack
> >       256
> >     
> >     But as soon as there is some event using callchains we get:
> >     
> >       # echo 512 > /proc/sys/kernel/perf_event_max_stack
> >       -bash: echo: write error: Device or resource busy
> >       #
> >     
> >     Because we only allocate the callchain percpu data structures when there
> >     is a user, which allows for changing the max easily, its just a matter
> >     of having no callchain users at that point.
> >     
> >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> >     Acked-by: Alexei Starovoitov <ast@kernel.org>
> >     Acked-by: David Ahern <dsahern@gmail.com>
> 
> Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
> 
> I first thought that this should be a tunable per event instead of a global sysctl

Yeah, I'll work on that too.

> but then I realized that we still need a root-only-tunable maximum limit value to oppose
> against any future per event limit and this sysctl value does the job.
 
> Nice patch!

Thanks for reviewieng it!

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 21:58                                     ` Brendan Gregg
  2016-04-26 22:10                                       ` Peter Zijlstra
@ 2016-04-27 12:56                                       ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-27 12:56 UTC (permalink / raw)
  To: Brendan Gregg
  Cc: Arnaldo Carvalho de Melo, Alexei Starovoitov, David Ahern,
	Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Tue, Apr 26, 2016 at 02:58:41PM -0700, Brendan Gregg escreveu:
> On Tue, Apr 26, 2016 at 2:05 PM, Arnaldo Carvalho de Melo
> <arnaldo.melo@gmail.com> wrote:
> > Em Tue, Apr 26, 2016 at 01:02:34PM -0700, Brendan Gregg escreveu:
> >> On Mon, Apr 25, 2016 at 5:49 PM, Brendan Gregg <brendan.d.gregg@gmail.com> wrote:
> >> > On Mon, Apr 25, 2016 at 5:47 PM, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote:
> >> >> Em Mon, Apr 25, 2016 at 05:44:00PM -0700, Alexei Starovoitov escreveu:
> >> >>> yep :)
> >> >>> hopefully Brendan can give it another spin.
> >> >>
> >> >> Agreed, and I'm calling it a day anyway, Brendan, please consider
> >> >> retesting, thanks,
> >> >
> >> > Will do, thanks!
> >>
> >> Looks good.
> >>
> >> I started with max depth = 512, and even that was still truncated, and
> >> had to profile again at 1024 to capture the full stacks. Seems to
> >> generally match the flame graph I generated with V1, which made me
> >> want to check that I'm running the new patch, and am:
> >>
> >> # grep six_hundred_forty_kb /proc/kallsyms
> >> ffffffff81c431e0 d six_hundred_forty_kb
> >>
> >> I was mucking around and was able to get "corrupted callchain.
> >> skipping..." errors, but these look to be expected -- that was
> >
> > Yeah, thanks for testing!
> >
> > And since you talked about userspace without frame pointers, have you
> > played with '--call-graph lbr'?
> 
> Not really. Isn't it only 16 levels deep max?

Yeah, stoopid me :-\
 
> Most of our Linux is Xen guests (EC2), and I'd have to check if the
> MSRs are available for LBR (perf record --call-graph lbr ... returns
> "The sys_perf_event_open() syscall returned with 95 (Operation not

That is because it is only accepted for PERF_TYPE_HARDWARE on x86, i.e.
it retunrs EOPNOTSUPP for all the other cases, like for
PERF_TYPE_SOFTWARE counters, like "cpu-clock"

> supported) for event (cpu-clock).", so I'd guess not, although many
> other MSRs are exposed).
> 
> BTS seemed more promising (deeper stacks), and there's already Xen
> support for it (need to boot the Xen host with vpmu=bts, preferably
> vpmu=bts,arch for some PMCs as well :).

Yeah, worth a look, I guess, but doesn't look like a low hanging fruit
tho.

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26 16:45                                   ` David Ahern
@ 2016-04-27 13:20                                     ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 38+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-04-27 13:20 UTC (permalink / raw)
  To: David Ahern
  Cc: Arnaldo Carvalho de Melo, Brendan Gregg, Alexei Starovoitov,
	Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Adrian Hunter,
	Alexander Shishkin, Alexei Starovoitov, He Kuang, Jiri Olsa,
	Masami Hiramatsu, Milian Wolff, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Wang Nan, Zefan Li,
	Linux Kernel Mailing List

Em Tue, Apr 26, 2016 at 10:45:28AM -0600, David Ahern escreveu:
> On 4/26/16 10:33 AM, Arnaldo Carvalho de Melo wrote:
> >So, for completeness, further testing it to see how far it goes on a 8GB
> >machine I got:
> >
> >[root@emilia ~]# echo 131100 > /proc/sys/kernel/perf_event_max_stack
> >[root@emilia ~]# perf record -g ls
> >Error:
> >The sys_perf_event_open() syscall returned with 12 (Cannot allocate memory) for event (cycles).
> >/bin/dmesg may provide additional information.
> >No CONFIG_PERF_EVENTS=y kernel support configured?
> 
> I thought we fixed up the last line to not display for errors like this.

For most cases, yes.
 
> How about adding ENOMEM case to perf_evsel__open_strerror?

I'll do.

- Arnaldo

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [tip:perf/core] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-26  0:29                         ` [PATCH/RFC v3] " Arnaldo Carvalho de Melo
  2016-04-26  0:44                           ` Alexei Starovoitov
  2016-04-26 21:58                           ` Frederic Weisbecker
@ 2016-04-27 15:39                           ` tip-bot for Arnaldo Carvalho de Melo
  2 siblings, 0 replies; 38+ messages in thread
From: tip-bot for Arnaldo Carvalho de Melo @ 2016-04-27 15:39 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: dsahern, jolsa, eranian, wangnan0, adrian.hunter, hekuang,
	fweisbec, namhyung, tglx, mingo, hpa, linux-kernel, acme,
	lizefan, peterz, milian.wolff, vincent.weaver,
	alexander.shishkin, mhiramat, brendan.d.gregg, torvalds, ast

Commit-ID:  c5dfd78eb79851e278b7973031b9ca363da87a7e
Gitweb:     http://git.kernel.org/tip/c5dfd78eb79851e278b7973031b9ca363da87a7e
Author:     Arnaldo Carvalho de Melo <acme@redhat.com>
AuthorDate: Thu, 21 Apr 2016 12:28:50 -0300
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Wed, 27 Apr 2016 10:20:39 -0300

perf core: Allow setting up max frame stack depth via sysctl

The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.

And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place tho.

The new file is:

  # cat /proc/sys/kernel/perf_event_max_stack
  127

Chaging it:

  # echo 256 > /proc/sys/kernel/perf_event_max_stack
  # cat /proc/sys/kernel/perf_event_max_stack
  256

But as soon as there is some event using callchains we get:

  # echo 512 > /proc/sys/kernel/perf_event_max_stack
  -bash: echo: write error: Device or resource busy
  #

Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, its just a matter
of having no callchain users at that point.

Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David Ahern <dsahern@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/20160426002928.GB16708@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 Documentation/sysctl/kernel.txt    | 14 ++++++++++++++
 arch/arm/kernel/perf_callchain.c   |  2 +-
 arch/arm64/kernel/perf_callchain.c |  4 ++--
 arch/metag/kernel/perf_callchain.c |  2 +-
 arch/mips/kernel/perf_event.c      |  4 ++--
 arch/powerpc/perf/callchain.c      |  4 ++--
 arch/sparc/kernel/perf_event.c     |  6 +++---
 arch/x86/events/core.c             |  4 ++--
 arch/xtensa/kernel/perf_event.c    |  4 ++--
 include/linux/perf_event.h         |  8 ++++++--
 kernel/bpf/stackmap.c              |  8 ++++----
 kernel/events/callchain.c          | 35 +++++++++++++++++++++++++++++++++--
 kernel/sysctl.c                    | 12 ++++++++++++
 13 files changed, 84 insertions(+), 23 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a4..260cde0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5..27563be 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff46654..32c3c6e 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 3156334..252abc1 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6..5021c54 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a675..22d9015 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66..a4b8b5a 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442..41d93d0 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f0118..a6b00b3 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae..a090700 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e9..f5a1954 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f..b9325e7 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f..c8b3186 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &six_hundred_forty_kb,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-27 12:53                             ` Arnaldo Carvalho de Melo
@ 2016-04-27 16:09                               ` Frederic Weisbecker
  2016-04-27 16:27                                 ` David Ahern
  0 siblings, 1 reply; 38+ messages in thread
From: Frederic Weisbecker @ 2016-04-27 16:09 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, David Ahern, Peter Zijlstra, Ingo Molnar,
	Adrian Hunter, Brendan Gregg, Alexander Shishkin,
	Alexei Starovoitov, He Kuang, Jiri Olsa, Masami Hiramatsu,
	Milian Wolff, Namhyung Kim, Stephane Eranian, Thomas Gleixner,
	Vince Weaver, Wang Nan, Zefan Li, Linux Kernel Mailing List

On Wed, Apr 27, 2016 at 09:53:58AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Apr 26, 2016 at 11:58:10PM +0200, Frederic Weisbecker escreveu:
> > On Mon, Apr 25, 2016 at 09:29:28PM -0300, Arnaldo Carvalho de Melo wrote:
> > > commit cd544af4f7fede01cb512d52bb3efe62aa19271d
> > > Author: Arnaldo Carvalho de Melo <acme@redhat.com>
> > > Date:   Thu Apr 21 12:28:50 2016 -0300
> > > 
> > >     perf core: Allow setting up max frame stack depth via sysctl
> > >     
> > >     The default remains 127, which is good for most cases, and not even hit
> > >     most of the time, but then for some cases, as reported by Brendan, 1024+
> > >     deep frames are appearing on the radar for things like groovy, ruby.
> > >     
> > >     And in some workloads putting a _lower_ cap on this may make sense. One
> > >     that is per event still needs to be put in place tho.
> > >     
> > >     The new file is:
> > >     
> > >       # cat /proc/sys/kernel/perf_event_max_stack
> > >       127
> > >     
> > >     Chaging it:
> > >     
> > >       # echo 256 > /proc/sys/kernel/perf_event_max_stack
> > >       # cat /proc/sys/kernel/perf_event_max_stack
> > >       256
> > >     
> > >     But as soon as there is some event using callchains we get:
> > >     
> > >       # echo 512 > /proc/sys/kernel/perf_event_max_stack
> > >       -bash: echo: write error: Device or resource busy
> > >       #
> > >     
> > >     Because we only allocate the callchain percpu data structures when there
> > >     is a user, which allows for changing the max easily, its just a matter
> > >     of having no callchain users at that point.
> > >     
> > >     Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
> > >     Acked-by: Alexei Starovoitov <ast@kernel.org>
> > >     Acked-by: David Ahern <dsahern@gmail.com>
> > 
> > Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
> > 
> > I first thought that this should be a tunable per event instead of a global sysctl
> 
> Yeah, I'll work on that too.

There is no rush though. The sysfs limit will probably be enough for most users. Unless
someone requested it?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH/RFC v3] perf core: Allow setting up max frame stack depth via sysctl
  2016-04-27 16:09                               ` Frederic Weisbecker
@ 2016-04-27 16:27                                 ` David Ahern
  0 siblings, 0 replies; 38+ messages in thread
From: David Ahern @ 2016-04-27 16:27 UTC (permalink / raw)
  To: Frederic Weisbecker, Arnaldo Carvalho de Melo
  Cc: Alexei Starovoitov, Peter Zijlstra, Ingo Molnar, Adrian Hunter,
	Brendan Gregg, Alexander Shishkin, Alexei Starovoitov, He Kuang,
	Jiri Olsa, Masami Hiramatsu, Milian Wolff, Namhyung Kim,
	Stephane Eranian, Thomas Gleixner, Vince Weaver, Wang Nan,
	Zefan Li, Linux Kernel Mailing List

On 4/27/16 10:09 AM, Frederic Weisbecker wrote:
>>> I first thought that this should be a tunable per event instead of a global sysctl
>>
>> Yeah, I'll work on that too.
>
> There is no rush though. The sysfs limit will probably be enough for most users. Unless
> someone requested it?
>

I have. I spent time last winter (2015) looking into how to do it. The 
userspace syntax was more of a pain than passing the parameters to the 
kernel side as the intent is to specify N kernel frames and M user frames.

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2016-04-27 16:27 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-20 22:47 [PATCH/RFC] perf core: Allow setting up max frame stack depth via sysctl Arnaldo Carvalho de Melo
2016-04-20 23:04 ` Alexei Starovoitov
2016-04-22 20:52   ` [PATCH/RFC v2] " Arnaldo Carvalho de Melo
2016-04-22 21:37     ` Alexei Starovoitov
2016-04-22 22:05     ` David Ahern
2016-04-22 22:18       ` Alexei Starovoitov
2016-04-25 16:14         ` Arnaldo Carvalho de Melo
2016-04-25 16:27           ` Arnaldo Carvalho de Melo
2016-04-25 19:22             ` Arnaldo Carvalho de Melo
2016-04-25 20:06               ` Alexei Starovoitov
2016-04-25 20:17                 ` Arnaldo Carvalho de Melo
2016-04-25 21:59                   ` Alexei Starovoitov
2016-04-25 23:41                     ` Arnaldo Carvalho de Melo
2016-04-26  0:07                       ` Alexei Starovoitov
2016-04-26  0:29                         ` [PATCH/RFC v3] " Arnaldo Carvalho de Melo
2016-04-26  0:44                           ` Alexei Starovoitov
2016-04-26  0:47                             ` Arnaldo Carvalho de Melo
2016-04-26  0:49                               ` Brendan Gregg
2016-04-26 16:33                                 ` Arnaldo Carvalho de Melo
2016-04-26 16:45                                   ` David Ahern
2016-04-27 13:20                                     ` Arnaldo Carvalho de Melo
2016-04-26 20:02                                 ` Brendan Gregg
2016-04-26 21:05                                   ` Arnaldo Carvalho de Melo
2016-04-26 21:55                                     ` Peter Zijlstra
2016-04-27 12:53                                       ` Arnaldo Carvalho de Melo
2016-04-26 21:58                                     ` Brendan Gregg
2016-04-26 22:10                                       ` Peter Zijlstra
2016-04-27 12:56                                       ` Arnaldo Carvalho de Melo
2016-04-26 21:58                           ` Frederic Weisbecker
2016-04-27 12:53                             ` Arnaldo Carvalho de Melo
2016-04-27 16:09                               ` Frederic Weisbecker
2016-04-27 16:27                                 ` David Ahern
2016-04-27 15:39                           ` [tip:perf/core] " tip-bot for Arnaldo Carvalho de Melo
2016-04-20 23:10 ` [PATCH/RFC] " David Ahern
2016-04-21  0:18   ` Arnaldo Carvalho de Melo
2016-04-21 10:48 ` Peter Zijlstra
2016-04-21 15:17   ` Arnaldo Carvalho de Melo
2016-04-21 15:42     ` Arnaldo Carvalho de Melo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).