All of lore.kernel.org
 help / color / mirror / Atom feed
* x86, perf: throttling issues with long nmi latencies
@ 2013-10-14 20:35 Don Zickus
  2013-10-14 21:28 ` Andi Kleen
  2013-10-15 10:14 ` Peter Zijlstra
  0 siblings, 2 replies; 47+ messages in thread
From: Don Zickus @ 2013-10-14 20:35 UTC (permalink / raw)
  To: dave.hansen, a.p.zijlstra, eranian, ak; +Cc: jmario, linux-kernel

Hi Folks,

I have been playing with quad socket Ivy Bridges for awhile and have seen
numerous "perf samples too long" messages, to the point, the machine is
unusable for any perf analyzing.

So I tried to investigate the source of the NMI latencies using the
traditional 'rdtscll()' command.  That failed miserably.  Then it was
pointed out to me that rdtscll() is terrible for benchmarking due to
out-of-order execution by the Intel processors.  This Intel whitepaper
describes a better way using cpuid and rdtsc:

http://www.intel.fr/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf

(I attached the patch I used below)

This gave more stable numbers such that I could now narrow things down.
While there are a few places that are causing latencies, for now I focused on
the longest one first.  It seems to be 'copy_from_user_nmi'

intel_pmu_handle_irq ->
	intel_pmu_drain_pebs_nhm ->
		__intel_pmu_drain_pebs_nhm ->
			__intel_pmu_pebs_event ->
				intel_pmu_pebs_fixup_ip ->
					copy_from_user_nmi

In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
(there are some cases where only 10 iterations are needed to go that high
too, but in general over 50 or so).  At this point copy_from_user_nmi
seems to account for over 90% of the nmi latency.

I am not entirely familiar with how copy_from_user_nmi() works, so I am
posting this email to help with suggestions on how to proceed or if this is
even an issue.

The command I most frequently run to gather my data is:

<some linpack benchmark test in background>
perf record  -W -d -a -e cpu/mem-loads,ldlat=30/pp,cpu/mem-stores/pp sleep 50

Help?

Cheers,
Don


diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index cb75028..f957948 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -57,6 +57,39 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
+/* benchmark functions based on the Intel doc .... */
+static __always_inline unsigned long long __benchmark_start(void)
+{
+	DECLARE_ARGS(val, low, high);
+
+	asm volatile("cpuid\n\t"
+		     "rdtsc\n\t"
+		     "mov %%edx, %0\n\t"
+		     "mov %%eax, %1\n\t" : "=r" (high), "=r" (low)
+		     :: "%rax", "%rbx", "%rcx", "%rdx");
+
+	return EAX_EDX_VAL(val, low, high);
+}
+
+static __always_inline unsigned long long __benchmark_stop(void)
+{
+	unsigned low, high;
+
+	asm volatile(".byte 0x0f,0x01,0xf9\n\t"
+		     "mov %%edx, %0\n\t"
+		     "mov %%eax, %1\n\t"
+		     "cpuid\n\t" : "=r" (high), "=r" (low)
+		     :: "%rax", "%rbx", "%rcx", "%rdx");
+
+	return low | ((u64)high << 32);
+}
+
+#define benchmark_start(val)						\
+	((val) = __benchmark_start())
+
+#define benchmark_stop(val)						\
+	((val) = __benchmark_stop())
+
 static inline unsigned long long native_read_msr(unsigned int msr)
 {
 	DECLARE_ARGS(val, low, high);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3e6c653..d6ffea2 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -143,6 +143,8 @@ struct cpu_hw_events {
 	 */
 	struct debug_store	*ds;
 	u64			pebs_enabled;
+	u64			benchmark;
+	int			count;
 
 	/*
 	 * Intel LBR bits
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index d9cb6a7..eb3e5e5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1180,8 +1180,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	int bit, loops;
 	u64 status;
 	int handled;
+	u64 start, finish;
+	u64 finish1=0;
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
+	cpuc->benchmark = 0;
+	cpuc->count = 0;
+	benchmark_start(start);
 
 	/*
 	 * Some chipsets need to unmask the LVTPC in a particular spot
@@ -1223,6 +1228,7 @@ again:
 		x86_pmu.drain_pebs(regs);
 	}
 
+	benchmark_stop(finish1);
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
@@ -1251,6 +1257,11 @@ again:
 		goto again;
 
 done:
+	benchmark_stop(finish);
+	if (((finish - start) > 10000))
+		trace_printk("DON [%d][%d]: %lld/%lld/%lld\n", handled, cpuc->count,
+				(finish - start), (finish1-start),(cpuc->benchmark));
+
 	intel_pmu_enable_all(0);
 	return handled;
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 60250f6..b4ab92d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -617,6 +617,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	u64 start, finish;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -660,9 +661,19 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		if (!kernel_ip(ip)) {
 			int bytes, size = MAX_INSN_SIZE;
 
+			benchmark_start(start);
 			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
+			benchmark_stop(finish);
+			cpuc->benchmark += finish - start;
+
+			//count how many non-kernel_ip paths taken
+			cpuc->count += 10000;
+
+			if (bytes != size) {
+				//mark early exit
+				cpuc->count += 10000000;
 				return 0;
+			}
 
 			kaddr = buf;
 		} else
@@ -674,6 +685,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
 		to += insn.length;
+		cpuc->count++;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-14 20:35 x86, perf: throttling issues with long nmi latencies Don Zickus
@ 2013-10-14 21:28 ` Andi Kleen
  2013-10-15 10:14 ` Peter Zijlstra
  1 sibling, 0 replies; 47+ messages in thread
From: Andi Kleen @ 2013-10-14 21:28 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, a.p.zijlstra, eranian, jmario, linux-kernel

On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:
> I have been playing with quad socket Ivy Bridges for awhile and have seen
> numerous "perf samples too long" messages, to the point, the machine is
> unusable for any perf analyzing.

We've seen the same problem on our large systems. Dave 
did some fixes in mainline, but they only work around the problem.

One main cause, I believe, is the dynamic period, which often 
goes down to insanely low values for cycles.

This also causes a lot of measurement overhead, without really giving better
data.

If you use -c ... with a reasonable period the problem completely
goes away (with pmu-tools ocperf stat -c default sets a reasonable default)

> So I tried to investigate the source of the NMI latencies using the
> traditional 'rdtscll()' command.  That failed miserably.  Then it was
> pointed out to me that rdtscll() is terrible for benchmarking due to
> out-of-order execution by the Intel processors.  This Intel whitepaper
> describes a better way using cpuid and rdtsc:

We just used ftrace function tracer.

> the longest one first.  It seems to be 'copy_user_from_nmi'
> 
> intel_pmu_handle_irq ->
> 	intel_pmu_drain_pebs_nhm ->
> 		__intel_pmu_drain_pebs_nhm ->
> 			__intel_pmu_pebs_event ->
> 				intel_pmu_pebs_fixup_ip ->
> 					copy_from_user_nmi
> 
> In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles

fixup_ip has to decode a whole basic block, to correct off by one.
I'm not sure why the copy dominates though. But copy_from_user_nmi
does a lot of nasty things.

I would just use :p which skips this. The single instruction correction 
is not worth all the overhead, and  there is always more skid anyways
even with the correction.

The good news is that Haswell fixes the overhead, :pp is as fast as :p

> (there are some cases where only 10 iterations are needed to go that high
> too, but in generall over 50 or so).  At this point copy_user_from_nmi
> seems to account for over 90% of the nmi latency.

Yes saw the same. It's unclear why it is that expensive.
I've also seen the copy dominate with -g.

Also for some reason it seems to hurt much more on larger systems
(cache misses?) Unfortunately it's hard to use perf to analyze
perf; that was the road block last time I tried to understand this better.

One guess was that if you profile the same code running on many
cores the copy*user_nmi code will have a very hot cache line
with the page reference count.

Some obvious improvements are likely possible:

The copy function is pretty dumb -- for example it repins the pages
for each access. It would be likely much faster to batch that
and only do it once per backtrace/decode. This would need
a new interface.

I suppose there would be a way to do this access without actually
incrementing the ref count (e.g. with a seqlock like scheme
or just using TSX)

But if you don't do the IP correction and only the stack access
in theory it should be possible to avoid the majority of changes.

First level recommendations:

- Always use -c ... / or -F ..., NEVER dynamic period
- Don't use :pp

-Andi


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-14 20:35 x86, perf: throttling issues with long nmi latencies Don Zickus
  2013-10-14 21:28 ` Andi Kleen
@ 2013-10-15 10:14 ` Peter Zijlstra
  2013-10-15 13:02   ` Peter Zijlstra
  1 sibling, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-15 10:14 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:

> This gave more stable numbers such that I could now narrow things down.
> While there are a few places that are causing latencies, for now I focused on
> the longest one first.  It seems to be 'copy_user_from_nmi'
> 
> intel_pmu_handle_irq ->
> 	intel_pmu_drain_pebs_nhm ->
> 		__intel_pmu_drain_pebs_nhm ->
> 			__intel_pmu_pebs_event ->
> 				intel_pmu_pebs_fixup_ip ->
> 					copy_from_user_nmi
> 
> In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
> (there are some cases where only 10 iterations are needed to go that high
> too, but in generall over 50 or so).  At this point copy_user_from_nmi
> seems to account for over 90% of the nmi latency.

What does the below do? It appears the perf userspace lost the ability
to display the MISC_EXACT_IP percentage so I've no clue if it actually
works or not.

---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 43 ++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 32e9ed81cd00..3978e72a1c9f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -722,6 +722,8 @@ void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
+static DEFINE_PER_CPU(u8 [PAGE_SIZE], insn_page);
+
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -729,6 +731,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	int size, bytes;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -763,29 +767,44 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+refill:
+	if (kernel_ip(ip)) {
+		u8 *buf = &__get_cpu_var(insn_page[0]);
+		size = PAGE_SIZE - ((unsigned long)to & (PAGE_SIZE-1));
+		if (size < MAX_INSN_SIZE) {
+			/*
+			 * If we're going to have to touch two pages; just copy
+			 * as much as we can hold.
+			 */
+			size = PAGE_SIZE;
+		}
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != size)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		size = INT_MAX;
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
-
-		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
 
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
+		if (size < MAX_INSN_SIZE)
+			goto refill;
 
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
+		old_to = to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
 		to += insn.length;
+		kaddr += insn.length;
+		size -= insn.length;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 10:14 ` Peter Zijlstra
@ 2013-10-15 13:02   ` Peter Zijlstra
  2013-10-15 14:32     ` Peter Zijlstra
  2013-10-15 14:36     ` Don Zickus
  0 siblings, 2 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-15 13:02 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 12:14:04PM +0200, Peter Zijlstra wrote:
>  arch/x86/kernel/cpu/perf_event_intel_ds.c | 43 ++++++++++++++++++++++---------
>  1 file changed, 31 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 32e9ed81cd00..3978e72a1c9f 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -722,6 +722,8 @@ void intel_pmu_pebs_disable_all(void)
>  		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
>  }
>  
> +static DEFINE_PER_CPU(u8 [PAGE_SIZE], insn_page);
> +
>  static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  {
>  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> @@ -729,6 +731,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
>  	unsigned long ip = regs->ip;
>  	int is_64bit = 0;
> +	int size, bytes;
> +	void *kaddr;
>  
>  	/*
>  	 * We don't need to fixup if the PEBS assist is fault like
> @@ -763,29 +767,44 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  		return 1;
>  	}
>  
> +refill:
> +	if (kernel_ip(ip)) {
> +		u8 *buf = &__get_cpu_var(insn_page[0]);
> +		size = PAGE_SIZE - ((unsigned long)to & (PAGE_SIZE-1));
> +		if (size < MAX_INSN_SIZE) {
> +			/*
> +			 * If we're going to have to touch two pages; just copy
> +			 * as much as we can hold.
> +			 */
> +			size = PAGE_SIZE;


Arguably we'd want that to be:

			size = min(PAGE_SIZE, ip - to);

As there's no point in copying beyond the basic block.

> +		}
> +		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> +		if (bytes != size)
> +			return 0;
> +
> +		kaddr = buf;

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 13:02   ` Peter Zijlstra
@ 2013-10-15 14:32     ` Peter Zijlstra
  2013-10-15 15:07       ` Peter Zijlstra
  2013-10-15 14:36     ` Don Zickus
  1 sibling, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-15 14:32 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme


This one seems to actually work and is somewhat simpler.

---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 32e9ed81cd00..9c7e043f8514 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -722,6 +722,8 @@ void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
+static DEFINE_PER_CPU(u8 [PAGE_SIZE], insn_page);
+
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -729,6 +731,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -763,29 +766,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	if (!kernel_ip(ip)) {
+		int size, bytes;
+		u8 *buf = &__get_cpu_var(insn_page[0]);
+
+		size = ip - to; /* Must be le than PAGE_SIZE, see above */
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != size)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
 
 		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
 		to += insn.length;
+		kaddr += insn.length;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 13:02   ` Peter Zijlstra
  2013-10-15 14:32     ` Peter Zijlstra
@ 2013-10-15 14:36     ` Don Zickus
  2013-10-15 14:39       ` Peter Zijlstra
  1 sibling, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-15 14:36 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 03:02:26PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2013 at 12:14:04PM +0200, Peter Zijlstra wrote:
> >  arch/x86/kernel/cpu/perf_event_intel_ds.c | 43 ++++++++++++++++++++++---------
> >  1 file changed, 31 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > index 32e9ed81cd00..3978e72a1c9f 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > @@ -722,6 +722,8 @@ void intel_pmu_pebs_disable_all(void)
> >  		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
> >  }
> >  
> > +static DEFINE_PER_CPU(u8 [PAGE_SIZE], insn_page);
> > +
> >  static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  {
> >  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> > @@ -729,6 +731,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
> >  	unsigned long ip = regs->ip;
> >  	int is_64bit = 0;
> > +	int size, bytes;
> > +	void *kaddr;
> >  
> >  	/*
> >  	 * We don't need to fixup if the PEBS assist is fault like
> > @@ -763,29 +767,44 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  		return 1;
> >  	}
> >  
> > +refill:
> > +	if (kernel_ip(ip)) {
> > +		u8 *buf = &__get_cpu_var(insn_page[0]);
> > +		size = PAGE_SIZE - ((unsigned long)to & (PAGE_SIZE-1));
> > +		if (size < MAX_INSN_SIZE) {
> > +			/*
> > +			 * If we're going to have to touch two pages; just copy
> > +			 * as much as we can hold.
> > +			 */
> > +			size = PAGE_SIZE;
> 
> 
> Arguably we'd want that to be:
> 
> 			size = min(PAGE_SIZE, ip - to);
> 
> As there's no point in copying beyond the basic block.

Hey Peter,

I haven't looked to deep yet, but it has panic'd twice with


intel-brickland-03 login: [  385.203323] BUG: unable to handle kernel paging request at 00000000006e39f0
[  385.211128] IP: [<ffffffff812fc419>] insn_get_prefixes.part.2+0x29/0x270
[  385.218635] PGD 1850266067 PUD 1848f21067 PMD 18485aa067 PTE 84aabf025
[  385.225981] Oops: 0000 [#1] SMP
[  385.229609] Modules linked in: nfsv3 nfs_acl nfs lockd sunrpc fscache nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_nat nf_nat_ipv6 ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables sg xfs libcrc32c iTCO_wdt iTCO_vendor_support ixgbe ptp pcspkr pps_core mtip32xx mdio lpc_ich i2c_i801 dca mfd_core wmi acpi_cpufreq mperf binfmt_misc sr_mod sd_mod cdrom crc_t10dif mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ttm drm ahci libahci libata megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod
[  385.303771] CPU: 0 PID: 9545 Comm: xlinpack_xeon64 Not tainted 3.10.0c2c_mmap2+ #37
[  385.312327] Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BIVTSDP1.86B.0038.R02.1307231126 07/23/2013
[  385.323892] task: ffff88203cd9e680 ti: ffff88204e4d8000 task.ti: ffff88204e4d8000
[  385.332253] RIP: 0010:[<ffffffff812fc419>]  [<ffffffff812fc419>] insn_get_prefixes.part.2+0x29/0x270
[  385.342473] RSP: 0000:ffff88085f806a18  EFLAGS: 00010083
[  385.348408] RAX: 0000000000000001 RBX: ffff88085f806b20 RCX: 0000000000000000
[  385.356379] RDX: 00000000006e39f0 RSI: 00000000006e39f0 RDI: ffff88085f806b20
[  385.364350] RBP: ffff88085f806a38 R08: 00000000006e39f0 R09: ffff88085f806b20
[  385.372324] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88085f80c9a0
[  385.380295] R13: ffff88085f806b20 R14: ffff88085f806c08 R15: 000000007fffffff
[  385.388268] FS:  0000000001679680(0063) GS:ffff88085f800000(0000) knlGS:0000000000000000
[  385.397307] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  385.403725] CR2: 00000000006e39f0 CR3: 0000001847c70000 CR4: 00000000001407f0
[  385.411697] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  385.419669] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  385.427640] Stack:
[  385.429885]  ffff88085f806b20 ffff88085f80c9a0 00000000006e39f0 ffff88085f806c08
[  385.438199]  ffff88085f806a58 ffffffff812fc7fd ffff88085f806b20 ffff88085f80c9a0
[  385.446513]  ffff88085f806a78 ffffffff812fc92d ffff88085f806b20 ffff88085f80c9a0
[  385.454830] Call Trace:
[  385.457561]  <NMI>
[  385.459710]  [<ffffffff812fc7fd>] insn_get_opcode+0x9d/0x160
[  385.466254]  [<ffffffff812fc92d>] insn_get_modrm.part.4+0x6d/0xf0
[  385.473065]  [<ffffffff812fca2e>] insn_get_sib+0x1e/0x80
[  385.478991]  [<ffffffff812fcb15>] insn_get_displacement+0x85/0x110
[  385.485898]  [<ffffffff812fccb5>] insn_get_immediate+0x115/0x3d0
[  385.492611]  [<ffffffff812fcfa5>] insn_get_length+0x35/0x40
[  385.498832]  [<ffffffff810254a2>] __intel_pmu_pebs_event+0x2e2/0x550
[  385.505937]  [<ffffffff810df24c>] ? __audit_syscall_exit+0x4c/0x2a0
[  385.512944]  [<ffffffff81018b65>] ? native_sched_clock+0x15/0x80
[  385.519655]  [<ffffffff81018bd9>] ? sched_clock+0x9/0x10
[  385.525591]  [<ffffffff8102585f>] intel_pmu_drain_pebs_nhm+0x14f/0x1c0
[  385.532888]  [<ffffffff81026fb2>] intel_pmu_handle_irq+0x372/0x490
[  385.539795]  [<ffffffff81018b65>] ? native_sched_clock+0x15/0x80
[  385.546507]  [<ffffffff81018bd9>] ? sched_clock+0x9/0x10
[  385.552446]  [<ffffffff810976f5>] ? sched_clock_cpu+0xb5/0x100
[  385.558968]  [<ffffffff8160437b>] perf_event_nmi_handler+0x2b/0x50
[  385.565876]  [<ffffffff81603b39>] nmi_handle.isra.0+0x59/0x90
[  385.572297]  [<ffffffff81603c40>] do_nmi+0xd0/0x310
[  385.577746]  [<ffffffff81603181>] end_repeat_nmi+0x1e/0x2e
[  385.583873]  <<EOE>>
[  385.586217] Code: 90 90 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 49 89 fd 41 54 53 48 8b 57 58 48 8d 42 01 48 2b 47 50 48 83 f8 10 0f 8f 5b 01 00 00 <0f> b6 1a 45 31 e4 0f b6 fb e8 29 fe ff ff 83 e0 0f 31 f6 8d 50
[  385.608244] RIP  [<ffffffff812fc419>] insn_get_prefixes.part.2+0x29/0x270
[  385.615840]  RSP <ffff88085f806a18>
[  385.619736] CR2: 00000000006e39f0
[    0.000000] Initializing cgroup subsys cpuset

Quick thoughts?

Cheers,
Don

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 14:36     ` Don Zickus
@ 2013-10-15 14:39       ` Peter Zijlstra
  0 siblings, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-15 14:39 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 10:36:31AM -0400, Don Zickus wrote:
> > > +refill:
> > > +	if (kernel_ip(ip)) {
> > > +		u8 *buf = &__get_cpu_var(insn_page[0]);
> > > +		size = PAGE_SIZE - ((unsigned long)to & (PAGE_SIZE-1));
> > > +		if (size < MAX_INSN_SIZE) {
> > > +			/*
> > > +			 * If we're going to have to touch two pages; just copy
> > > +			 * as much as we can hold.
> > > +			 */
> > > +			size = PAGE_SIZE;

> Quick thoughts?

Yeah, see the patch I just send; but notably I got the kernel_ip(ip)
case the wrong way about. See how it copies userspace memory for kernel
IPs and vice versa.



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 14:32     ` Peter Zijlstra
@ 2013-10-15 15:07       ` Peter Zijlstra
  2013-10-15 15:41         ` Don Zickus
  2013-10-15 16:22         ` x86, perf: throttling issues with long nmi latencies Don Zickus
  0 siblings, 2 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-15 15:07 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 04:32:27PM +0200, Peter Zijlstra wrote:
> 
> This one seems to actually work and is somewhat simpler.

And here's some hackery to avoid that atomic page inc frobbery.

---
 lib/usercopy.c |   12 ++++++++--
 mm/gup.c       |   63 ++++++++++++++++++++++++++++++++++++---------------------
 2 files changed, 49 insertions(+), 26 deletions(-)

--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -10,6 +10,8 @@
 #include <asm/word-at-a-time.h>
 #include <linux/sched.h>
 
+extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
+			  struct page **pages);
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
  */
@@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void
 {
 	unsigned long offset, addr = (unsigned long)from;
 	unsigned long size, len = 0;
+	unsigned long flags;
 	struct page *page;
 	void *map;
 	int ret;
@@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void
 		return len;
 
 	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
 			break;
+		}
 
 		offset = addr & (PAGE_SIZE - 1);
 		size = min(PAGE_SIZE - offset, n - len);
@@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void
 		map = kmap_atomic(page);
 		memcpy(to, map+offset, size);
 		kunmap_atomic(map);
-		put_page(page);
+		local_irq_restore(flags);
 
 		len  += size;
 		to   += size;
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *p
 #endif
 }
 
+#define GUPF_GET	0x01
+#define GUPF_WRITE	0x02
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
  * register pressure.
  */
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t *ptep;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 
 	ptep = pte_offset_map(&pmd, addr);
@@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
-		get_page(page);
+		if (flags & GUPF_GET)
+			get_page(page);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -109,7 +113,7 @@ static inline void get_head_page_multipl
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pmd;
@@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t p
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t p
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
+		int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
@@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsi
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pmdp++, addr = next, addr != end);
@@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsi
 }
 
 static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pud;
@@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t p
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t p
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
+			int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
@@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsi
 		if (pud_none(pud))
 			return 0;
 		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pudp++, addr = next, addr != end);
@@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsi
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
  */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
 			  struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr, len, end;
 	unsigned long next;
-	unsigned long flags;
 	pgd_t *pgdp;
 	int nr = 0;
 
@@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
@@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
 		pgd_t pgd = *pgdp;
@@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
 
 	return nr;
 }
 
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = ___get_user_pages_fast(start, nr_pages,
+			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 15:07       ` Peter Zijlstra
@ 2013-10-15 15:41         ` Don Zickus
  2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
  2013-10-15 16:22         ` x86, perf: throttling issues with long nmi latencies Don Zickus
  1 sibling, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-15 15:41 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 05:07:36PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2013 at 04:32:27PM +0200, Peter Zijlstra wrote:
> > 
> > This one seems to actually work and is somewhat simpler.

Hey Peter,

Your previous patch made a huge difference in improvement.  The
copy_from_user_nmi() no longer hits the million of cycles.  I still have a
batch of 100,000-300,000 cycles.  My longest NMI paths used to be
dominated by copy_from_user_nmi, now it is not (I have to dig up the new
hot path).

I'll try your patch below to see if it removes the last of the
copy_from_user_nmi issues.

As expected the perf throttling takes longer for it to throttle down to
1000 samples per second (60 seconds vs 10 seconds). :-)

I have to give up the machine for the afternoon, but I will respond with
how the latest patch works and hopefully figure out the new hottest path
(somewhere above the for-loop in handle_irq).

Thanks!

Cheers,
Don

> 
> And here's some hackery to avoid that atomic page inc frobbery.
> 
> ---
>  lib/usercopy.c |   12 ++++++++--
>  mm/gup.c       |   63 ++++++++++++++++++++++++++++++++++++---------------------
>  2 files changed, 49 insertions(+), 26 deletions(-)
> 
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -10,6 +10,8 @@
>  #include <asm/word-at-a-time.h>
>  #include <linux/sched.h>
>  
> +extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
> +			  struct page **pages);
>  /*
>   * best effort, GUP based copy_from_user() that is NMI-safe
>   */
> @@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void
>  {
>  	unsigned long offset, addr = (unsigned long)from;
>  	unsigned long size, len = 0;
> +	unsigned long flags;
>  	struct page *page;
>  	void *map;
>  	int ret;
> @@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void
>  		return len;
>  
>  	do {
> -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> -		if (!ret)
> +		local_irq_save(flags);
> +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> +		if (!ret) {
> +			local_irq_restore(flags);
>  			break;
> +		}
>  
>  		offset = addr & (PAGE_SIZE - 1);
>  		size = min(PAGE_SIZE - offset, n - len);
> @@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void
>  		map = kmap_atomic(page);
>  		memcpy(to, map+offset, size);
>  		kunmap_atomic(map);
> -		put_page(page);
> +		local_irq_restore(flags);
>  
>  		len  += size;
>  		to   += size;
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *p
>  #endif
>  }
>  
> +#define GUPF_GET	0x01
> +#define GUPF_WRITE	0x02
> +
>  /*
>   * The performance critical leaf functions are made noinline otherwise gcc
>   * inlines everything into a single function which results in too much
>   * register pressure.
>   */
>  static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t *ptep;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  
>  	ptep = pte_offset_map(&pmd, addr);
> @@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t
>  		}
>  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
>  		page = pte_page(pte);
> -		get_page(page);
> +		if (flags & GUPF_GET)
> +			get_page(page);
>  		SetPageReferenced(page);
>  		pages[*nr] = page;
>  		(*nr)++;
> @@ -109,7 +113,7 @@ static inline void get_head_page_multipl
>  }
>  
>  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pmd;
> @@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t p
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t p
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> -		int write, struct page **pages, int *nr)
> +		int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pmd_t *pmdp;
> @@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsi
>  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
>  			return 0;
>  		if (unlikely(pmd_large(pmd))) {
> -			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
> +			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> +			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pmdp++, addr = next, addr != end);
> @@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsi
>  }
>  
>  static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pud;
> @@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t p
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t p
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> -			int write, struct page **pages, int *nr)
> +			int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pud_t *pudp;
> @@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsi
>  		if (pud_none(pud))
>  			return 0;
>  		if (unlikely(pud_large(pud))) {
> -			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
> +			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> +			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pudp++, addr = next, addr != end);
> @@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsi
>   * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
>   * back to the regular GUP.
>   */
> -int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
>  			  struct page **pages)
>  {
>  	struct mm_struct *mm = current->mm;
>  	unsigned long addr, len, end;
>  	unsigned long next;
> -	unsigned long flags;
>  	pgd_t *pgdp;
>  	int nr = 0;
>  
> @@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long
>  	addr = start;
>  	len = (unsigned long) nr_pages << PAGE_SHIFT;
>  	end = start + len;
> -	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
> +	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
>  					(void __user *)start, len)))
>  		return 0;
>  
> @@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long
>  	 * (which we do on x86, with the above PAE exception), we can follow the
>  	 * address down to the the page and take a ref on it.
>  	 */
> -	local_irq_save(flags);
>  	pgdp = pgd_offset(mm, addr);
>  	do {
>  		pgd_t pgd = *pgdp;
> @@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long
>  		next = pgd_addr_end(addr, end);
>  		if (pgd_none(pgd))
>  			break;
> -		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> +		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
>  			break;
>  	} while (pgdp++, addr = next, addr != end);
> -	local_irq_restore(flags);
>  
>  	return nr;
>  }
>  
> +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +			  struct page **pages)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	local_irq_save(flags);
> +	ret = ___get_user_pages_fast(start, nr_pages,
> +			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
>  /**
>   * get_user_pages_fast() - pin user pages in memory
>   * @start:	starting user address

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: x86, perf: throttling issues with long nmi latencies
  2013-10-15 15:07       ` Peter Zijlstra
  2013-10-15 15:41         ` Don Zickus
@ 2013-10-15 16:22         ` Don Zickus
  1 sibling, 0 replies; 47+ messages in thread
From: Don Zickus @ 2013-10-15 16:22 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme

On Tue, Oct 15, 2013 at 05:07:36PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2013 at 04:32:27PM +0200, Peter Zijlstra wrote:
> > 
> > This one seems to actually work and is somewhat simpler.
> 
> And here's some hackery to avoid that atomic page inc frobbery.

Hmm, for some reason, I did not see much noticeable improvement (over your
second patch) with this patch. :-(  Not to say it doesn't improve things,
I just don't have hard numbers to see if it is a small percentage better
or not.

Cheers,
Don

> 
> ---
>  lib/usercopy.c |   12 ++++++++--
>  mm/gup.c       |   63 ++++++++++++++++++++++++++++++++++++---------------------
>  2 files changed, 49 insertions(+), 26 deletions(-)
> 
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -10,6 +10,8 @@
>  #include <asm/word-at-a-time.h>
>  #include <linux/sched.h>
>  
> +extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
> +			  struct page **pages);
>  /*
>   * best effort, GUP based copy_from_user() that is NMI-safe
>   */
> @@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void
>  {
>  	unsigned long offset, addr = (unsigned long)from;
>  	unsigned long size, len = 0;
> +	unsigned long flags;
>  	struct page *page;
>  	void *map;
>  	int ret;
> @@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void
>  		return len;
>  
>  	do {
> -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> -		if (!ret)
> +		local_irq_save(flags);
> +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> +		if (!ret) {
> +			local_irq_restore(flags);
>  			break;
> +		}
>  
>  		offset = addr & (PAGE_SIZE - 1);
>  		size = min(PAGE_SIZE - offset, n - len);
> @@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void
>  		map = kmap_atomic(page);
>  		memcpy(to, map+offset, size);
>  		kunmap_atomic(map);
> -		put_page(page);
> +		local_irq_restore(flags);
>  
>  		len  += size;
>  		to   += size;
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *p
>  #endif
>  }
>  
> +#define GUPF_GET	0x01
> +#define GUPF_WRITE	0x02
> +
>  /*
>   * The performance critical leaf functions are made noinline otherwise gcc
>   * inlines everything into a single function which results in too much
>   * register pressure.
>   */
>  static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t *ptep;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  
>  	ptep = pte_offset_map(&pmd, addr);
> @@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t
>  		}
>  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
>  		page = pte_page(pte);
> -		get_page(page);
> +		if (flags & GUPF_GET)
> +			get_page(page);
>  		SetPageReferenced(page);
>  		pages[*nr] = page;
>  		(*nr)++;
> @@ -109,7 +113,7 @@ static inline void get_head_page_multipl
>  }
>  
>  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pmd;
> @@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t p
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t p
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> -		int write, struct page **pages, int *nr)
> +		int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pmd_t *pmdp;
> @@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsi
>  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
>  			return 0;
>  		if (unlikely(pmd_large(pmd))) {
> -			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
> +			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> +			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pmdp++, addr = next, addr != end);
> @@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsi
>  }
>  
>  static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pud;
> @@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t p
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t p
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> -			int write, struct page **pages, int *nr)
> +			int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pud_t *pudp;
> @@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsi
>  		if (pud_none(pud))
>  			return 0;
>  		if (unlikely(pud_large(pud))) {
> -			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
> +			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> +			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pudp++, addr = next, addr != end);
> @@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsi
>   * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
>   * back to the regular GUP.
>   */
> -int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
>  			  struct page **pages)
>  {
>  	struct mm_struct *mm = current->mm;
>  	unsigned long addr, len, end;
>  	unsigned long next;
> -	unsigned long flags;
>  	pgd_t *pgdp;
>  	int nr = 0;
>  
> @@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long
>  	addr = start;
>  	len = (unsigned long) nr_pages << PAGE_SHIFT;
>  	end = start + len;
> -	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
> +	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
>  					(void __user *)start, len)))
>  		return 0;
>  
> @@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long
>  	 * (which we do on x86, with the above PAE exception), we can follow the
>  	 * address down to the the page and take a ref on it.
>  	 */
> -	local_irq_save(flags);
>  	pgdp = pgd_offset(mm, addr);
>  	do {
>  		pgd_t pgd = *pgdp;
> @@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long
>  		next = pgd_addr_end(addr, end);
>  		if (pgd_none(pgd))
>  			break;
> -		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> +		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
>  			break;
>  	} while (pgdp++, addr = next, addr != end);
> -	local_irq_restore(flags);
>  
>  	return nr;
>  }
>  
> +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +			  struct page **pages)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	local_irq_save(flags);
> +	ret = ___get_user_pages_fast(start, nr_pages,
> +			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
>  /**
>   * get_user_pages_fast() - pin user pages in memory
>   * @start:	starting user address

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-15 15:41         ` Don Zickus
@ 2013-10-16 10:57           ` Peter Zijlstra
  2013-10-16 12:46             ` Don Zickus
                               ` (3 more replies)
  0 siblings, 4 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-16 10:57 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

A prettier patch below. The main difference is on-demand allocation of
the scratch buffer.

---
Subject: perf, x86: Optimize intel_pmu_pebs_fixup_ip()
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Oct 2013 12:14:04 +0200

On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:
> While there are a few places that are causing latencies, for now I focused on
> the longest one first.  It seems to be 'copy_user_from_nmi'
>
> intel_pmu_handle_irq ->
> 	intel_pmu_drain_pebs_nhm ->
> 		__intel_pmu_drain_pebs_nhm ->
> 			__intel_pmu_pebs_event ->
> 				intel_pmu_pebs_fixup_ip ->
> 					copy_from_user_nmi
>
> In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
> (there are some cases where only 10 iterations are needed to go that high
> too, but in generall over 50 or so).  At this point copy_user_from_nmi
> seems to account for over 90% of the nmi latency.

So avoid having to call copy_from_user_nmi() for every instruction.
Since we already limit the max basic block size, we can easily
pre-allocate a piece of memory to copy the entire thing into in one
go.

Don reports (for a previous version):
> Your patch made a huge difference in improvement.  The
> copy_from_user_nmi() no longer hits the million of cycles.  I still
> have a batch of 100,000-300,000 cycles.  My longest NMI paths used
> to be dominated by copy_from_user_nmi, now it is not (I have to dig
> up the new hot path).

Cc: eranian@google.com
Cc: ak@linux.intel.com
Cc: jmario@redhat.com
Cc: acme@infradead.org
Cc: dave.hansen@linux.intel.com
Reported-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   48 +++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 14 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -12,6 +12,7 @@
 
 #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
 #define PEBS_BUFFER_SIZE	PAGE_SIZE
+#define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
  * pebs_record_32 for p4 and core not supported
@@ -228,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
 	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static DEFINE_PER_CPU(void *, insn_buffer);
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
 	int max, thresh = 1; /* always use a single PEBS record */
-	void *buffer;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
@@ -242,6 +245,15 @@ static int alloc_pebs_buffer(int cpu)
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
+	if (x86_pmu.intel_cap.pebs_format < 2) {
+		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+		if (!ibuffer) {
+			kfree(buffer);
+			return -ENOMEM;
+		}
+		per_cpu(insn_buffer, cpu) = ibuffer;
+	}
+
 	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
 
 	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -262,6 +274,9 @@ static void release_pebs_buffer(int cpu)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
+	kfree(per_cpu(insn_buffer, cpu));
+	per_cpu(insn_buffer, cpu) = NULL;
+
 	kfree((void *)(unsigned long)ds->pebs_buffer_base);
 	ds->pebs_buffer_base = 0;
 }
@@ -729,6 +744,7 @@ static int intel_pmu_pebs_fixup_ip(struc
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -752,7 +768,7 @@ static int intel_pmu_pebs_fixup_ip(struc
 	 * unsigned math, either ip is before the start (impossible) or
 	 * the basic block is larger than 1 page (sanity)
 	 */
-	if ((ip - to) > PAGE_SIZE)
+	if ((ip - to) > PEBS_FIXUP_SIZE)
 		return 0;
 
 	/*
@@ -763,29 +779,33 @@ static int intel_pmu_pebs_fixup_ip(struc
 		return 1;
 	}
 
+	if (!kernel_ip(ip)) {
+		int size, bytes;
+		u8 *buf = this_cpu_ptr(insn_buffer);
+
+		size = ip - to; /* Must fit our buffer, see above */
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != size)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
 
 		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
 		to += insn.length;
+		kaddr += insn.length;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
@ 2013-10-16 12:46             ` Don Zickus
  2013-10-16 13:31               ` Peter Zijlstra
  2013-10-16 20:52             ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Andi Kleen
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-16 12:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> A prettier patch below. The main difference is on-demand allocation of
> the scratch buffer.

I'll see if I can sanity test this in the next couple hours.

Further testing yesterday showed that intel_pmu_drain_pebs_nhm still
has long latencies somewhere.  With 15 minute reboots, isolation goes
slooow.

Thanks!

Cheers,
Don

> 
> ---
> Subject: perf, x86: Optimize intel_pmu_pebs_fixup_ip()
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Oct 2013 12:14:04 +0200
> 
> On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:
> > While there are a few places that are causing latencies, for now I focused on
> > the longest one first.  It seems to be 'copy_user_from_nmi'
> >
> > intel_pmu_handle_irq ->
> > 	intel_pmu_drain_pebs_nhm ->
> > 		__intel_pmu_drain_pebs_nhm ->
> > 			__intel_pmu_pebs_event ->
> > 				intel_pmu_pebs_fixup_ip ->
> > 					copy_from_user_nmi
> >
> > In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> > all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
> > (there are some cases where only 10 iterations are needed to go that high
> > too, but in generall over 50 or so).  At this point copy_user_from_nmi
> > seems to account for over 90% of the nmi latency.
> 
> So avoid having to call copy_from_user_nmi() for every instruction.
> Since we already limit the max basic block size, we can easily
> pre-allocate a piece of memory to copy the entire thing into in one
> go.
> 
> Don reports (for a previous version):
> > Your patch made a huge difference in improvement.  The
> > copy_from_user_nmi() no longer hits the million of cycles.  I still
> > have a batch of 100,000-300,000 cycles.  My longest NMI paths used
> > to be dominated by copy_from_user_nmi, now it is not (I have to dig
> > up the new hot path).
> 
> Cc: eranian@google.com
> Cc: ak@linux.intel.com
> Cc: jmario@redhat.com
> Cc: acme@infradead.org
> Cc: dave.hansen@linux.intel.com
> Reported-by: Don Zickus <dzickus@redhat.com>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> ---
>  arch/x86/kernel/cpu/perf_event_intel_ds.c |   48 +++++++++++++++++++++---------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -12,6 +12,7 @@
>  
>  #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
>  #define PEBS_BUFFER_SIZE	PAGE_SIZE
> +#define PEBS_FIXUP_SIZE		PAGE_SIZE
>  
>  /*
>   * pebs_record_32 for p4 and core not supported
> @@ -228,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
>  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
>  }
>  
> +static DEFINE_PER_CPU(void *, insn_buffer);
> +
>  static int alloc_pebs_buffer(int cpu)
>  {
>  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  	int node = cpu_to_node(cpu);
>  	int max, thresh = 1; /* always use a single PEBS record */
> -	void *buffer;
> +	void *buffer, *ibuffer;
>  
>  	if (!x86_pmu.pebs)
>  		return 0;
> @@ -242,6 +245,15 @@ static int alloc_pebs_buffer(int cpu)
>  	if (unlikely(!buffer))
>  		return -ENOMEM;
>  
> +	if (x86_pmu.intel_cap.pebs_format < 2) {
> +		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
> +		if (!ibuffer) {
> +			kfree(buffer);
> +			return -ENOMEM;
> +		}
> +		per_cpu(insn_buffer, cpu) = ibuffer;
> +	}
> +
>  	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
>  
>  	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
> @@ -262,6 +274,9 @@ static void release_pebs_buffer(int cpu)
>  	if (!ds || !x86_pmu.pebs)
>  		return;
>  
> +	kfree(per_cpu(insn_buffer, cpu));
> +	per_cpu(insn_buffer, cpu) = NULL;
> +
>  	kfree((void *)(unsigned long)ds->pebs_buffer_base);
>  	ds->pebs_buffer_base = 0;
>  }
> @@ -729,6 +744,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
>  	unsigned long ip = regs->ip;
>  	int is_64bit = 0;
> +	void *kaddr;
>  
>  	/*
>  	 * We don't need to fixup if the PEBS assist is fault like
> @@ -752,7 +768,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	 * unsigned math, either ip is before the start (impossible) or
>  	 * the basic block is larger than 1 page (sanity)
>  	 */
> -	if ((ip - to) > PAGE_SIZE)
> +	if ((ip - to) > PEBS_FIXUP_SIZE)
>  		return 0;
>  
>  	/*
> @@ -763,29 +779,33 @@ static int intel_pmu_pebs_fixup_ip(struc
>  		return 1;
>  	}
>  
> +	if (!kernel_ip(ip)) {
> +		int size, bytes;
> +		u8 *buf = this_cpu_ptr(insn_buffer);
> +
> +		size = ip - to; /* Must fit our buffer, see above */
> +		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> +		if (bytes != size)
> +			return 0;
> +
> +		kaddr = buf;
> +	} else {
> +		kaddr = (void *)to;
> +	}
> +
>  	do {
>  		struct insn insn;
> -		u8 buf[MAX_INSN_SIZE];
> -		void *kaddr;
>  
>  		old_to = to;
> -		if (!kernel_ip(ip)) {
> -			int bytes, size = MAX_INSN_SIZE;
> -
> -			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> -			if (bytes != size)
> -				return 0;
> -
> -			kaddr = buf;
> -		} else
> -			kaddr = (void *)to;
>  
>  #ifdef CONFIG_X86_64
>  		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
>  #endif
>  		insn_init(&insn, kaddr, is_64bit);
>  		insn_get_length(&insn);
> +
>  		to += insn.length;
> +		kaddr += insn.length;
>  	} while (to < ip);
>  
>  	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 12:46             ` Don Zickus
@ 2013-10-16 13:31               ` Peter Zijlstra
  2013-10-16 13:54                 ` Don Zickus
                                   ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-16 13:31 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 08:46:49AM -0400, Don Zickus wrote:
> On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> > A prettier patch below. The main difference is on-demand allocation of
> > the scratch buffer.
> 
> I'll see if I can sanity test this in the next couple hours.
> 
> Further testing yesterday showed that intel_pmu_drain_pebs_nhm still
> has long latencies somewhere.  With 15 minute reboots, isolation goes
> slooow.

Pick a smaller box? I seem to be able to reproduce on my wsm-ep, which
boots inside a minute :-)

root@westmere:~# cd /debug/tracing/
root@westmere:/debug/tracing# echo function > current_tracer
root@westmere:/debug/tracing# cat available_filter_functions | grep ^inat > set_ftrace_notrace
root@westmere:/debug/tracing# cat available_filter_functions | grep ^insn | grep -v get_length >> set_ftrace_notrace

Run: perf top --stdio -e 'cycles:pp' in another window and when the
console output shows:

[  610.319486] perf samples too long (19310 > 19230), lowering kernel.perf_event_max_sample_rate to 7000

quickly press enter here:

root@westmere:/debug/tracing# echo 0 > tracing_on
root@westmere:/debug/tracing# cat trace > ~/trace1
root@westmere:/debug/tracing# cat ~/trace1 | awk '/rcu_nmi_enter/ {
t=gensub(":", "", "g", $4); cpu=gensub("[][]", "", "g", $2);
start[cpu]=t; } /rcu_nmi_exit/ { x=gensub(":", "", "g", $4);
cpu=gensub("[][]", "", "g", $2); t=start[cpu]; printf "%6.6f -- starting
at: %6.6f on cpu: %d\n", x-t, t, cpu } ' | sort -n | tail -10
0.000037 -- starting at: 605.317795 on cpu: 9
0.000039 -- starting at: 602.831019 on cpu: 23
0.000039 -- starting at: 602.831148 on cpu: 6
0.000039 -- starting at: 602.955953 on cpu: 20
0.000040 -- starting at: 602.834012 on cpu: 18
0.000040 -- starting at: 602.956972 on cpu: 21
0.000040 -- starting at: 602.960048 on cpu: 22
0.000040 -- starting at: 609.290776 on cpu: 7
0.000075 -- starting at: 609.773875 on cpu: 0
0.009398 -- starting at: 610.319445 on cpu: 1
root@westmere:/debug/tracing# grep "\[001\]" ~/trace1 | awk 'BEGIN {p=0}
/610.319445/ {p=1} /rcu_nmi_exit/ {p=0} {if (p) print $0}'

Now obviously the whole printk stuff below is insane, but it does show
it's the one that triggered the check. And the trace does give a fair idea of
what it's doing:

          <idle>-0     [001] d.h.   610.319445: rcu_nmi_enter <-do_nmi
          <idle>-0     [001] d.h.   610.319446: nmi_handle.isra.3 <-do_nmi
          <idle>-0     [001] d.h.   610.319447: intel_pmu_handle_irq <-perf_event_nmi_handler
          <idle>-0     [001] d.h.   610.319447: intel_pmu_disable_all <-intel_pmu_handle_irq
          <idle>-0     [001] d.h.   610.319448: intel_pmu_pebs_disable_all <-intel_pmu_disable_all
          <idle>-0     [001] d.h.   610.319448: intel_pmu_lbr_disable_all <-intel_pmu_disable_all
          <idle>-0     [001] d.h.   610.319449: intel_pmu_drain_bts_buffer <-intel_pmu_handle_irq
          <idle>-0     [001] d.h.   610.319449: intel_pmu_lbr_read <-intel_pmu_handle_irq
          <idle>-0     [001] d.h.   610.319453: intel_pmu_drain_pebs_nhm <-intel_pmu_handle_irq
          <idle>-0     [001] d.h.   610.319453: __intel_pmu_pebs_event <-intel_pmu_drain_pebs_nhm
          <idle>-0     [001] d.h.   610.319454: intel_pmu_save_and_restart <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319455: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319456: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319457: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319458: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319458: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319459: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319459: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319460: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319460: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319461: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319461: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319462: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319462: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319463: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319464: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319464: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319465: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319465: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319466: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319466: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319467: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319467: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319468: insn_get_length <-__intel_pmu_pebs_event
          <idle>-0     [001] d.h.   610.319469: task_tgid_nr_ns <-perf_event_pid
          <idle>-0     [001] d.h.   610.319469: __task_pid_nr_ns <-perf_event_tid
          <idle>-0     [001] d.h.   610.319470: perf_output_begin <-perf_log_throttle
          <idle>-0     [001] d.h.   610.319470: perf_output_copy <-perf_log_throttle
          <idle>-0     [001] d.h.   610.319470: perf_output_copy <-perf_event__output_id_sample
          <idle>-0     [001] d.h.   610.319471: perf_output_copy <-perf_event__output_id_sample
          <idle>-0     [001] d.h.   610.319472: perf_output_copy <-perf_event__output_id_sample
          <idle>-0     [001] d.h.   610.319472: perf_output_end <-perf_log_throttle
          <idle>-0     [001] d.h.   610.319472: perf_output_put_handle <-perf_output_end
          <idle>-0     [001] d.h.   610.319473: kvm_is_in_guest <-perf_misc_flags
          <idle>-0     [001] d.h.   610.319473: task_tgid_nr_ns <-perf_event_pid
          <idle>-0     [001] d.h.   610.319474: __task_pid_nr_ns <-perf_event_tid
          <idle>-0     [001] d.h.   610.319474: kvm_is_in_guest <-perf_instruction_pointer
          <idle>-0     [001] d.h.   610.319475: perf_output_begin <-__perf_event_overflow
          <idle>-0     [001] d.h.   610.319475: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319475: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319477: perf_output_copy <-perf_output_sample
          <idle>-0     [001] d.h.   610.319477: perf_output_end <-__perf_event_overflow
          <idle>-0     [001] d.h.   610.319477: perf_output_put_handle <-perf_output_end
          <idle>-0     [001] d.h.   610.319478: intel_pmu_disable_event <-x86_pmu_stop
          <idle>-0     [001] d.h.   610.319478: intel_pmu_lbr_disable <-intel_pmu_disable_event
          <idle>-0     [001] d.h.   610.319479: intel_pmu_pebs_disable <-intel_pmu_disable_event
          <idle>-0     [001] d.h.   610.319480: intel_pmu_enable_all <-intel_pmu_handle_irq
          <idle>-0     [001] d.h.   610.319480: intel_pmu_pebs_enable_all <-intel_pmu_enable_all
          <idle>-0     [001] d.h.   610.319480: intel_pmu_lbr_enable_all <-intel_pmu_enable_all
          <idle>-0     [001] d.h.   610.319481: _raw_spin_trylock <-___ratelimit
          <idle>-0     [001] d.h.   610.319482: _raw_spin_unlock_irqrestore <-___ratelimit
          <idle>-0     [001] d.h.   610.319482: printk <-perf_sample_event_took
          <idle>-0     [001] d.h.   610.319482: vprintk_emit <-printk

< snip ~8000 lines >

          <idle>-0     [001] d.h.   610.328841: wake_up_klogd <-console_unlock
          <idle>-0     [001] d.h.   610.328841: arch_irq_work_raise <-irq_work_queue
          <idle>-0     [001] d.h.   610.328842: apic_send_IPI_self <-arch_irq_work_raise
          <idle>-0     [001] d.h.   610.328842: native_apic_wait_icr_idle <-arch_irq_work_raise
          <idle>-0     [001] d.h.   610.328843: arch_trigger_all_cpu_backtrace_handler <-nmi_handle.isra.3

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 13:31               ` Peter Zijlstra
@ 2013-10-16 13:54                 ` Don Zickus
  2013-10-17 11:21                 ` Peter Zijlstra
  2013-10-17 13:33                 ` Peter Zijlstra
  2 siblings, 0 replies; 47+ messages in thread
From: Don Zickus @ 2013-10-16 13:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 03:31:25PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 16, 2013 at 08:46:49AM -0400, Don Zickus wrote:
> > On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> > > A prettier patch below. The main difference is on-demand allocation of
> > > the scratch buffer.
> > 
> > I'll see if I can sanity test this in the next couple hours.
> > 
> > Further testing yesterday showed that intel_pmu_drain_pebs_nhm still
> > has long latencies somewhere.  With 15 minute reboots, isolation goes
> > slooow.
> 
> Pick a smaller box? I seem to be able to reproduce on my wsm-ep, which
> boots inside a minute :-)

Heh.  It seemed to take longer on those boxes, but maybe I wasn't pushing
it hard enough. :-)

> 
> root@westmere:~# cd /debug/tracing/
> root@westmere:/debug/tracing# echo function > current_tracer
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^inat > set_ftrace_notrace
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^insn | grep -v get_length >> set_ftrace_notrace
> 

I guess now is a good time to learn ftrace.  Seems powerful if you know
how to use awk/sed/grep properly :-)

Thanks!  This might make debugging easier.

Cheers,
Don

> Run: perf top --stdio -e 'cycles:pp' in another window and when the
> console output shows:
> 
> [  610.319486] perf samples too long (19310 > 19230), lowering kernel.perf_event_max_sample_rate to 7000
> 
> quickly press enter here:
> 
> root@westmere:/debug/tracing# echo 0 > tracing_on
> root@westmere:/debug/tracing# cat trace > ~/trace1
> root@westmere:/debug/tracing# cat ~/trace1 | awk '/rcu_nmi_enter/ {
> t=gensub(":", "", "g", $4); cpu=gensub("[][]", "", "g", $2);
> start[cpu]=t; } /rcu_nmi_exit/ { x=gensub(":", "", "g", $4);
> cpu=gensub("[][]", "", "g", $2); t=start[cpu]; printf "%6.6f -- starting
> at: %6.6f on cpu: %d\n", x-t, t, cpu } ' | sort -n | tail -10
> 0.000037 -- starting at: 605.317795 on cpu: 9
> 0.000039 -- starting at: 602.831019 on cpu: 23
> 0.000039 -- starting at: 602.831148 on cpu: 6
> 0.000039 -- starting at: 602.955953 on cpu: 20
> 0.000040 -- starting at: 602.834012 on cpu: 18
> 0.000040 -- starting at: 602.956972 on cpu: 21
> 0.000040 -- starting at: 602.960048 on cpu: 22
> 0.000040 -- starting at: 609.290776 on cpu: 7
> 0.000075 -- starting at: 609.773875 on cpu: 0
> 0.009398 -- starting at: 610.319445 on cpu: 1
> root@westmere:/debug/tracing# grep "\[001\]" ~/trace1 | awk 'BEGIN {p=0}
> /610.319445/ {p=1} /rcu_nmi_exit/ {p=0} {if (p) print $0}'
> 
> Now obviously the whole printk stuff below is insane, but it does show
> its one that triggered the check. And the trace does give a fair idea of
> what its doing:
> 
>           <idle>-0     [001] d.h.   610.319445: rcu_nmi_enter <-do_nmi
>           <idle>-0     [001] d.h.   610.319446: nmi_handle.isra.3 <-do_nmi
>           <idle>-0     [001] d.h.   610.319447: intel_pmu_handle_irq <-perf_event_nmi_handler
>           <idle>-0     [001] d.h.   610.319447: intel_pmu_disable_all <-intel_pmu_handle_irq
>           <idle>-0     [001] d.h.   610.319448: intel_pmu_pebs_disable_all <-intel_pmu_disable_all
>           <idle>-0     [001] d.h.   610.319448: intel_pmu_lbr_disable_all <-intel_pmu_disable_all
>           <idle>-0     [001] d.h.   610.319449: intel_pmu_drain_bts_buffer <-intel_pmu_handle_irq
>           <idle>-0     [001] d.h.   610.319449: intel_pmu_lbr_read <-intel_pmu_handle_irq
>           <idle>-0     [001] d.h.   610.319453: intel_pmu_drain_pebs_nhm <-intel_pmu_handle_irq
>           <idle>-0     [001] d.h.   610.319453: __intel_pmu_pebs_event <-intel_pmu_drain_pebs_nhm
>           <idle>-0     [001] d.h.   610.319454: intel_pmu_save_and_restart <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319455: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319456: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319457: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319458: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319458: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319459: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319459: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319460: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319460: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319461: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319461: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319462: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319462: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319463: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319464: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319464: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319465: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319465: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319466: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319466: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319467: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319467: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319468: insn_get_length <-__intel_pmu_pebs_event
>           <idle>-0     [001] d.h.   610.319469: task_tgid_nr_ns <-perf_event_pid
>           <idle>-0     [001] d.h.   610.319469: __task_pid_nr_ns <-perf_event_tid
>           <idle>-0     [001] d.h.   610.319470: perf_output_begin <-perf_log_throttle
>           <idle>-0     [001] d.h.   610.319470: perf_output_copy <-perf_log_throttle
>           <idle>-0     [001] d.h.   610.319470: perf_output_copy <-perf_event__output_id_sample
>           <idle>-0     [001] d.h.   610.319471: perf_output_copy <-perf_event__output_id_sample
>           <idle>-0     [001] d.h.   610.319472: perf_output_copy <-perf_event__output_id_sample
>           <idle>-0     [001] d.h.   610.319472: perf_output_end <-perf_log_throttle
>           <idle>-0     [001] d.h.   610.319472: perf_output_put_handle <-perf_output_end
>           <idle>-0     [001] d.h.   610.319473: kvm_is_in_guest <-perf_misc_flags
>           <idle>-0     [001] d.h.   610.319473: task_tgid_nr_ns <-perf_event_pid
>           <idle>-0     [001] d.h.   610.319474: __task_pid_nr_ns <-perf_event_tid
>           <idle>-0     [001] d.h.   610.319474: kvm_is_in_guest <-perf_instruction_pointer
>           <idle>-0     [001] d.h.   610.319475: perf_output_begin <-__perf_event_overflow
>           <idle>-0     [001] d.h.   610.319475: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319475: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319476: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319477: perf_output_copy <-perf_output_sample
>           <idle>-0     [001] d.h.   610.319477: perf_output_end <-__perf_event_overflow
>           <idle>-0     [001] d.h.   610.319477: perf_output_put_handle <-perf_output_end
>           <idle>-0     [001] d.h.   610.319478: intel_pmu_disable_event <-x86_pmu_stop
>           <idle>-0     [001] d.h.   610.319478: intel_pmu_lbr_disable <-intel_pmu_disable_event
>           <idle>-0     [001] d.h.   610.319479: intel_pmu_pebs_disable <-intel_pmu_disable_event
>           <idle>-0     [001] d.h.   610.319480: intel_pmu_enable_all <-intel_pmu_handle_irq
>           <idle>-0     [001] d.h.   610.319480: intel_pmu_pebs_enable_all <-intel_pmu_enable_all
>           <idle>-0     [001] d.h.   610.319480: intel_pmu_lbr_enable_all <-intel_pmu_enable_all
>           <idle>-0     [001] d.h.   610.319481: _raw_spin_trylock <-___ratelimit
>           <idle>-0     [001] d.h.   610.319482: _raw_spin_unlock_irqrestore <-___ratelimit
>           <idle>-0     [001] d.h.   610.319482: printk <-perf_sample_event_took
>           <idle>-0     [001] d.h.   610.319482: vprintk_emit <-printk
> 
> < snip ~8000 lines >
> 
>           <idle>-0     [001] d.h.   610.328841: wake_up_klogd <-console_unlock
>           <idle>-0     [001] d.h.   610.328841: arch_irq_work_raise <-irq_work_queue
>           <idle>-0     [001] d.h.   610.328842: apic_send_IPI_self <-arch_irq_work_raise
>           <idle>-0     [001] d.h.   610.328842: native_apic_wait_icr_idle <-arch_irq_work_raise
>           <idle>-0     [001] d.h.   610.328843: arch_trigger_all_cpu_backtrace_handler <-nmi_handle.isra.3

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
  2013-10-16 12:46             ` Don Zickus
@ 2013-10-16 20:52             ` Andi Kleen
  2013-10-16 21:03               ` Peter Zijlstra
  2013-10-17 14:49             ` Don Zickus
  2013-10-17 16:50             ` [tip:perf/core] perf/x86: " tip-bot for Peter Zijlstra
  3 siblings, 1 reply; 47+ messages in thread
From: Andi Kleen @ 2013-10-16 20:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Don Zickus, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

> So avoid having to call copy_from_user_nmi() for every instruction.
> Since we already limit the max basic block size, we can easily
> pre-allocate a piece of memory to copy the entire thing into in one
> go.

It would be better/more generic if you split copy_from_user_nmi() into
init() copy() end() 

(and some state that checks if the underlying page changes)

Then first you don't need the buffer and it could be also
be applied to the other cases, like the stack unwinding,
where copying everything is likely too slow.

-Andi

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 20:52             ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Andi Kleen
@ 2013-10-16 21:03               ` Peter Zijlstra
  2013-10-16 23:07                 ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-16 21:03 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 01:52:27PM -0700, Andi Kleen wrote:
> > So avoid having to call copy_from_user_nmi() for every instruction.
> > Since we already limit the max basic block size, we can easily
> > pre-allocate a piece of memory to copy the entire thing into in one
> > go.
> 
> It would be better/more generic if you split copy_from_user_nmi() into
> init() copy() end() 
> 
> (and some state that checks if the underlying page changes)
> 
> Then first you don't need the buffer and it could be also
> be applied to the other cases, like the stack unwinding,
> where copying everything is likely too slow.

You'd need to make an iterator interface because of the kmap_atomic crap
needed for 32bit.

But yes, something like that might work, it shouldn't be that hard to
cobble on top of that GUP patch I send out the other day.

The only real nasty part is where an instruction straddles a page
boundary, in that case the iterator stuff fails to be fully transparent
and you need a temp copy of sorts.

Anyway; if you want to have a go at this, feel free.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 21:03               ` Peter Zijlstra
@ 2013-10-16 23:07                 ` Peter Zijlstra
  2013-10-17  9:41                   ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-16 23:07 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 11:03:19PM +0200, Peter Zijlstra wrote:
> Anyway; if you want to have a go at this, feel free.

OK, couldn't help myself; completely untested patch below.

I think the full one-shot copy is best for the decode, as even with the below
interface you'd end up doing a lot of duplicate copying due to the
variable-size insn mess.

But it should help lots with the fragmented stack pointer chase, where
hopefully you'd have multiple frames on the same stack page.

---
 arch/x86/include/asm/uaccess.h   | 13 +++++++
 arch/x86/kernel/cpu/perf_event.c |  9 ++++-
 arch/x86/lib/usercopy.c          | 84 ++++++++++++++++++++++++++++++++++++++--
 arch/x86/mm/gup.c                | 63 +++++++++++++++++++-----------
 4 files changed, 141 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5838fa911aa0..06c87fc989bd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -516,6 +516,19 @@ struct __large_struct { unsigned long buf[100]; };
 
 extern unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
+struct copy_from_user_nmi_state {
+	unsigned long address;
+	unsigned long flags;
+	void *map;
+}
+
+extern unsigned long
+copy_from_user_nmi_iter(void *to, const void __user *from,
+			unsigned long n, struct copy_from_user_nmi_state *state);
+extern void
+copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);
+
 extern __must_check long
 strncpy_from_user(char *dst, const char __user *src, long count);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 19c9d86d2f04..7faf12c585d0 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1979,6 +1979,7 @@ static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
+	struct copy_from_user_nmi_state state = { 0, 0, NULL };
 	unsigned long ss_base, cs_base;
 	struct stack_frame_ia32 frame;
 	const void __user *fp;
@@ -1995,7 +1996,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		frame.next_frame     = 0;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		bytes = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
 		if (bytes != sizeof(frame))
 			break;
 
@@ -2005,6 +2006,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		perf_callchain_store(entry, cs_base + frame.return_address);
 		fp = compat_ptr(ss_base + frame.next_frame);
 	}
+	copy_from_user_nmi_end(&state);
 	return 1;
 }
 #else
@@ -2018,6 +2020,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 void
 perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
+	struct copy_from_user_nmi_state state = { 0, 0, NULL };
 	struct stack_frame frame;
 	const void __user *fp;
 
@@ -2044,10 +2047,11 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
+
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		bytes = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
 		if (bytes != sizeof(frame))
 			break;
 
@@ -2057,6 +2061,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		perf_callchain_store(entry, frame.return_address);
 		fp = frame.next_frame;
 	}
+	copy_from_user_nmi_end(&state);
 }
 
 /*
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94c8d97..bce8179227cf 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -10,6 +10,8 @@
 #include <asm/word-at-a-time.h>
 #include <linux/sched.h>
 
+extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
+			  struct page **pages);
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
  */
@@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long offset, addr = (unsigned long)from;
 	unsigned long size, len = 0;
+	unsigned long flags;
 	struct page *page;
 	void *map;
 	int ret;
@@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 		return len;
 
 	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
 			break;
+		}
 
 		offset = addr & (PAGE_SIZE - 1);
 		size = min(PAGE_SIZE - offset, n - len);
@@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 		map = kmap_atomic(page);
 		memcpy(to, map+offset, size);
 		kunmap_atomic(map);
-		put_page(page);
+		local_irq_restore(flags);
 
 		len  += size;
 		to   += size;
@@ -47,3 +53,75 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
+
+unsigned long
+copy_from_user_nmi_iter(void *to, const void __user *from, unsigned long n,
+			struct copy_from_user_nmi_state *state)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	unsigned long size, len = 0;
+	unsigned long flags;
+	struct page *page;
+	void *map;
+	int ret;
+
+	if (__range_not_ok(from, n, TASK_SIZE))
+		return len;
+
+	if (state->map) {
+		if ((state->address >> PAGE_SHIFT) ==
+		    (addr >> PAGE_SHIFT)) {
+			flags = state->flags;
+			map = state->map;
+			goto got_page;
+		}
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+	}
+
+	do {
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
+			break;
+		}
+
+		map = kmap_atomic(page);
+got_page:
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		memcpy(to, map+offset, size);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+		if (len == n && offset + size < PAGE_SIZE) {
+			state->address = addr;
+			state->flags = flags;
+			state->map = map;
+			return len;
+		}
+
+		kunmap_atomic(map);
+		local_irq_restore(flags);
+
+	} while (len < n);
+
+	memset(state, 0, sizeof(*state));
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_iter);
+
+void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state)
+{
+	if (state->map) {
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+		memset(state, 0, sizeof(*state));
+	}
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_end);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c0..e383caf323e4 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 #endif
 }
 
+#define GUPF_GET	0x01
+#define GUPF_WRITE	0x02
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
  * register pressure.
  */
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t *ptep;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 
 	ptep = pte_offset_map(&pmd, addr);
@@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
-		get_page(page);
+		if (flags & GUPF_GET)
+			get_page(page);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -109,7 +113,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pmd;
@@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
+		int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
@@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pmdp++, addr = next, addr != end);
@@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 }
 
 static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pud;
@@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
+			int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
@@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 		if (pud_none(pud))
 			return 0;
 		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pudp++, addr = next, addr != end);
@@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
  */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
 			  struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr, len, end;
 	unsigned long next;
-	unsigned long flags;
 	pgd_t *pgdp;
 	int nr = 0;
 
@@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
@@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
 		pgd_t pgd = *pgdp;
@@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
 
 	return nr;
 }
 
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = ___get_user_pages_fast(start, nr_pages,
+			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 23:07                 ` Peter Zijlstra
@ 2013-10-17  9:41                   ` Peter Zijlstra
  2013-10-17 16:00                     ` Don Zickus
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17  9:41 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Don Zickus, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 01:07:12AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 16, 2013 at 11:03:19PM +0200, Peter Zijlstra wrote:
> > Anyway; if you want to have a go at this, feel free.
> 
> OK, couldn't help myself; completely untested patch below.
> 
> I think the full once copy it best for the decode as even with the below
> interface you'd end up doing a lot of duplicate copying due to the
> variable size insn mess.

Duh, a very small tweak would make it work for that and avoid most of
the memcpy()s.

---
 arch/x86/include/asm/uaccess.h            | 13 +++++
 arch/x86/kernel/cpu/perf_event.c          | 32 +++++------
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 ++++---
 arch/x86/lib/usercopy.c                   | 91 ++++++++++++++++++++++++++++++-
 arch/x86/mm/gup.c                         | 63 +++++++++++++--------
 5 files changed, 165 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5838fa911aa0..a341de0eadd1 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -516,6 +516,19 @@ struct __large_struct { unsigned long buf[100]; };
 
 extern unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
+struct copy_from_user_nmi_state {
+	void *map;
+	unsigned long address;
+	unsigned long flags;
+};
+
+extern void *
+copy_from_user_nmi_iter(void *to, const void __user *from,
+			unsigned long n, struct copy_from_user_nmi_state *state);
+extern void
+copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);
+
 extern __must_check long
 strncpy_from_user(char *dst, const char __user *src, long count);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 19c9d86d2f04..c917fe470861 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1979,8 +1979,9 @@ static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
+	struct copy_from_user_nmi_state state = { NULL };
 	unsigned long ss_base, cs_base;
-	struct stack_frame_ia32 frame;
+	struct stack_frame_ia32 frame, *f;
 	const void __user *fp;
 
 	if (!test_thread_flag(TIF_IA32))
@@ -1991,20 +1992,17 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
-		unsigned long bytes;
-		frame.next_frame     = 0;
-		frame.return_address = 0;
-
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
+		if (!f)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, cs_base + frame.return_address);
-		fp = compat_ptr(ss_base + frame.next_frame);
+		perf_callchain_store(entry, cs_base + f->return_address);
+		fp = compat_ptr(ss_base + f->next_frame);
 	}
+	copy_from_user_nmi_end(&state);
 	return 1;
 }
 #else
@@ -2018,7 +2016,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 void
 perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	struct stack_frame frame;
+	struct copy_from_user_nmi_state state = { NULL };
+	struct stack_frame frame, *f;
 	const void __user *fp;
 
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -2043,20 +2042,17 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
-		unsigned long bytes;
-		frame.next_frame	     = NULL;
-		frame.return_address = 0;
-
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
+		if (!f)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, frame.return_address);
-		fp = frame.next_frame;
+		perf_callchain_store(entry, f->return_address);
+		fp = f->next_frame;
 	}
+	copy_from_user_nmi_end(&state);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 32e9ed81cd00..5bd3f2091da9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -725,10 +725,14 @@ void intel_pmu_pebs_disable_all(void)
 static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct copy_from_user_nmi_state state = { NULL };
 	unsigned long from = cpuc->lbr_entries[0].from;
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
+	u8 buf[MAX_INSN_SIZE];
+	struct insn insn;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -764,19 +768,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	}
 
 	do {
-		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
-
 		old_to = to;
 		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
+			kaddr = copy_from_user_nmi_iter(buf, (void __user *)to,
+							MAX_INSN_SIZE, &state);
+			if (!kaddr)
+				break;
 		} else
 			kaddr = (void *)to;
 
@@ -788,6 +785,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		to += insn.length;
 	} while (to < ip);
 
+	copy_from_user_nmi_end(&state);
+
 	if (to == ip) {
 		set_linear_ip(regs, old_to);
 		return 1;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94c8d97..da6c36a8b842 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -10,6 +10,8 @@
 #include <asm/word-at-a-time.h>
 #include <linux/sched.h>
 
+extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
+			  struct page **pages);
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
  */
@@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long offset, addr = (unsigned long)from;
 	unsigned long size, len = 0;
+	unsigned long flags;
 	struct page *page;
 	void *map;
 	int ret;
@@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 		return len;
 
 	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
 			break;
+		}
 
 		offset = addr & (PAGE_SIZE - 1);
 		size = min(PAGE_SIZE - offset, n - len);
@@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 		map = kmap_atomic(page);
 		memcpy(to, map+offset, size);
 		kunmap_atomic(map);
-		put_page(page);
+		local_irq_restore(flags);
 
 		len  += size;
 		to   += size;
@@ -47,3 +53,82 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
+
+void *copy_from_user_nmi_iter(void *to, const void __user *from,
+		unsigned long n, struct copy_from_user_nmi_state *state)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	unsigned long size, len = 0;
+	unsigned long flags;
+	struct page *page;
+	void *map, *_to = to;
+	int ret;
+
+	if (__range_not_ok(from, n, TASK_SIZE))
+		return NULL;
+
+	if (state->map) {
+		if ((state->address >> PAGE_SHIFT) ==
+		    (addr >> PAGE_SHIFT)) {
+			flags = state->flags;
+			map = state->map;
+			goto got_page;
+		}
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+	}
+
+	for (;;) {
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
+			state->map = NULL;
+			return NULL;
+		}
+
+		map = kmap_atomic(page);
+
+got_page:
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		/*
+		 * If the entire desired range falls within the one page
+		 * avoid the copy and return a pointer into the kmap.
+		 */
+		if (size == n) {
+			_to = map + offset;
+			break;
+		}
+
+		memcpy(to, map+offset, size);
+		len += size;
+
+		if (len == n)
+			break;
+
+		to   += size;
+		addr += size;
+
+		kunmap_atomic(map);
+		local_irq_restore(flags);
+	}
+
+	state->address = addr;
+	state->flags = flags;
+	state->map = map;
+
+	return _to;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_iter);
+
+void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state)
+{
+	if (state->map) {
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+		state->map = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_end);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c0..e383caf323e4 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 #endif
 }
 
+#define GUPF_GET	0x01
+#define GUPF_WRITE	0x02
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
  * register pressure.
  */
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t *ptep;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 
 	ptep = pte_offset_map(&pmd, addr);
@@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
-		get_page(page);
+		if (flags & GUPF_GET)
+			get_page(page);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -109,7 +113,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pmd;
@@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
+		int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
@@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pmdp++, addr = next, addr != end);
@@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 }
 
 static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pud;
@@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
+			int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
@@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 		if (pud_none(pud))
 			return 0;
 		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pudp++, addr = next, addr != end);
@@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
  */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
 			  struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr, len, end;
 	unsigned long next;
-	unsigned long flags;
 	pgd_t *pgdp;
 	int nr = 0;
 
@@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
@@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
 		pgd_t pgd = *pgdp;
@@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
 
 	return nr;
 }
 
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = ___get_user_pages_fast(start, nr_pages,
+			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 13:31               ` Peter Zijlstra
  2013-10-16 13:54                 ` Don Zickus
@ 2013-10-17 11:21                 ` Peter Zijlstra
  2013-10-17 13:33                 ` Peter Zijlstra
  2 siblings, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 11:21 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 03:31:25PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 16, 2013 at 08:46:49AM -0400, Don Zickus wrote:
> > On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> > > A prettier patch below. The main difference is on-demand allocation of
> > > the scratch buffer.
> > 
> > I'll see if I can sanity test this in the next couple hours.
> > 
> > Further testing yesterday showed that intel_pmu_drain_pebs_nhm still
> > has long latencies somewhere.  With 15 minute reboots, isolation goes
> > slooow.
> 
> Pick a smaller box? I seem to be able to reproduce on my wsm-ep, which
> boots inside a minute :-)
> 
> root@westmere:~# cd /debug/tracing/
> root@westmere:/debug/tracing# echo function > current_tracer
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^inat > set_ftrace_notrace
> root@westmere:/debug/tracing# cat available_filter_functions | grep ^insn | grep -v get_length >> set_ftrace_notrace
> 
> Run: perf top --stdio -e 'cycles:pp' in another window and when the
> console output shows:
> 
> [  610.319486] perf samples too long (19310 > 19230), lowering kernel.perf_event_max_sample_rate to 7000
> 
> quickly press enter here:

BTW; you can also replace this bit of manual intervention with something
like:

There's 3 changes:
 - changed atomic_t into regular int; there's nothing atomic about
   atomic_set vs atomic_read, so atomic_t is pointless
 - made perf_proc_update_handler() clear the running_sample_length
   state.
 - added if (avg_local_sample_len > 30000) tracing_off().

Of course you should tweak the 30000 to match whatever value you're
interested in. But tracing_off() does the same as that:

  echo 0 > tracing_on

And avoids being too late and having lost the trace buffer content.

---
 kernel/events/core.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c716385f6483..ea787d0d0e78 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,10 @@ int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +186,7 @@ void update_perf_cpu_limits(void)
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
 	do_div(tmp, 100);
-	atomic_set(&perf_sample_allowed_ns, tmp);
+	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -194,6 +196,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int cpu;
 
 	if (ret || !write)
 		return ret;
@@ -202,6 +205,9 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 	update_perf_cpu_limits();
 
+	for_each_possible_cpu(cpu)
+		per_cpu(running_sample_length, cpu) = 0;
+
 	return 0;
 }
 
@@ -228,14 +234,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
 
-	if (atomic_read(&perf_sample_allowed_ns) == 0)
+	if (ACCESS_ONCE(perf_sample_allowed_ns) == 0)
 		return;
 
 	/* decay the counter by 1 average sample */
@@ -251,12 +256,15 @@ void perf_sample_event_took(u64 sample_len_ns)
 	 */
 	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+	if (avg_local_sample_len <= ACCESS_ONCE(perf_sample_allowed_ns))
 		return;
 
 	if (max_samples_per_tick <= 1)
 		return;
 
+	if (avg_local_sample_len > 30000)
+		tracing_off();
+
 	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
 	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
@@ -265,7 +273,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 			"perf samples too long (%lld > %d), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
 			avg_local_sample_len,
-			atomic_read(&perf_sample_allowed_ns),
+			ACCESS_ONCE(perf_sample_allowed_ns),
 			sysctl_perf_event_sample_rate);
 
 	update_perf_cpu_limits();

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 13:31               ` Peter Zijlstra
  2013-10-16 13:54                 ` Don Zickus
  2013-10-17 11:21                 ` Peter Zijlstra
@ 2013-10-17 13:33                 ` Peter Zijlstra
  2013-10-29 14:07                   ` [tip:perf/urgent] perf/x86: Fix NMI measurements tip-bot for Peter Zijlstra
  2 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 13:33 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 03:31:25PM +0200, Peter Zijlstra wrote:
> Pick a smaller box? I seem to be able to reproduce on my wsm-ep, which
> boots inside a minute :-)

OK, so what I'm actually seeing on my WSM is that sched/clock.c is
'broken' for the purpose we're using it for.

What triggered it is that my WSM-EP is broken :-(

[    0.001000] tsc: Fast TSC calibration using PIT
[    0.002000] tsc: Detected 2533.715 MHz processor
[    0.500180] TSC synchronization [CPU#0 -> CPU#6]:
[    0.505197] Measured 3 cycles TSC warp between CPUs, turning off TSC clock.
[    0.004000] tsc: Marking TSC unstable due to check_tsc_sync_source failed

For some reason it consistently detects TSC skew, even though NHM+
should have a single clock domain for 'reasonable' systems.

This marks sched_clock_stable=0, which means that we do fancy stuff to
try and get a 'sane' clock. Part of this fancy stuff relies on the tick,
clearly that's gone when NOHZ=y. So for an idle cpu time gets stuck, until
it either wakes up or gets kicked by another cpu.

While this is perfectly fine for the scheduler -- it only cares about
actually running stuff, and when we're running stuff we're obviously not
idle. This does somewhat break down for perf which can trigger events
just fine on an otherwise idle cpu.

So I've got NMIs getting 'measured' as taking ~1ms, which actually
don't last nearly that long:

          <idle>-0     [013] d.h.   886.311970: rcu_nmi_enter <-do_nmi
...
          <idle>-0     [013] d.h.   886.311997: perf_sample_event_took: HERE!!! : 1040990

So ftrace (which uses sched_clock(), not the fancy bits) only sees
~27us, but we measure ~1ms !!

Now since all this measurement stuff lives in x86 code, we can actually
fix it:

---
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void)
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int ret;
 	u64 start_clock;
 	u64 finish_clock;
+	int ret;
 
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
-	start_clock = local_clock();
+	start_clock = sched_clock();
 	ret = x86_pmu.handle_irq(regs);
-	finish_clock = local_clock();
+	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ba77ebc2c353..6fcb49ce50a1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 		u64 before, delta, whole_msecs;
 		int remainder_ns, decimal_msecs, thishandled;
 
-		before = local_clock();
+		before = sched_clock();
 		thishandled = a->handler(type, regs);
 		handled += thishandled;
-		delta = local_clock() - before;
+		delta = sched_clock() - before;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
 		if (delta < nmi_longest_ns)



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
  2013-10-16 12:46             ` Don Zickus
  2013-10-16 20:52             ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Andi Kleen
@ 2013-10-17 14:49             ` Don Zickus
  2013-10-17 14:51               ` Peter Zijlstra
  2013-10-17 16:50             ` [tip:perf/core] perf/x86: " tip-bot for Peter Zijlstra
  3 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-17 14:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Wed, Oct 16, 2013 at 12:57:55PM +0200, Peter Zijlstra wrote:
> A prettier patch below. The main difference is on-demand allocation of
> the scratch buffer.
> 
> ---
> Subject: perf, x86: Optimize intel_pmu_pebs_fixup_ip()
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 15 Oct 2013 12:14:04 +0200
> 
> On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:
> > While there are a few places that are causing latencies, for now I focused on
> > the longest one first.  It seems to be 'copy_user_from_nmi'
> >
> > intel_pmu_handle_irq ->
> > 	intel_pmu_drain_pebs_nhm ->
> > 		__intel_pmu_drain_pebs_nhm ->
> > 			__intel_pmu_pebs_event ->
> > 				intel_pmu_pebs_fixup_ip ->
> > 					copy_from_user_nmi
> >
> > In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> > all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
> > (there are some cases where only 10 iterations are needed to go that high
> > too, but in generall over 50 or so).  At this point copy_user_from_nmi
> > seems to account for over 90% of the nmi latency.
> 
> So avoid having to call copy_from_user_nmi() for every instruction.
> Since we already limit the max basic block size, we can easily
> pre-allocate a piece of memory to copy the entire thing into in one
> go.
> 
> Don reports (for a previous version):
> > Your patch made a huge difference in improvement.  The
> > copy_from_user_nmi() no longer hits the million of cycles.  I still
> > have a batch of 100,000-300,000 cycles.  My longest NMI paths used
> > to be dominated by copy_from_user_nmi, now it is not (I have to dig
> > up the new hot path).

Hi Peter,

For some reason this patch is page faulting at an invalid address inside
__intel_pmu_pebs_event().

Cheers,
Don


> 
> Cc: eranian@google.com
> Cc: ak@linux.intel.com
> Cc: jmario@redhat.com
> Cc: acme@infradead.org
> Cc: dave.hansen@linux.intel.com
> Reported-by: Don Zickus <dzickus@redhat.com>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> ---
>  arch/x86/kernel/cpu/perf_event_intel_ds.c |   48 +++++++++++++++++++++---------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -12,6 +12,7 @@
>  
>  #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
>  #define PEBS_BUFFER_SIZE	PAGE_SIZE
> +#define PEBS_FIXUP_SIZE		PAGE_SIZE
>  
>  /*
>   * pebs_record_32 for p4 and core not supported
> @@ -228,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
>  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
>  }
>  
> +static DEFINE_PER_CPU(void *, insn_buffer);
> +
>  static int alloc_pebs_buffer(int cpu)
>  {
>  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
>  	int node = cpu_to_node(cpu);
>  	int max, thresh = 1; /* always use a single PEBS record */
> -	void *buffer;
> +	void *buffer, *ibuffer;
>  
>  	if (!x86_pmu.pebs)
>  		return 0;
> @@ -242,6 +245,15 @@ static int alloc_pebs_buffer(int cpu)
>  	if (unlikely(!buffer))
>  		return -ENOMEM;
>  
> +	if (x86_pmu.intel_cap.pebs_format < 2) {
> +		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
> +		if (!ibuffer) {
> +			kfree(buffer);
> +			return -ENOMEM;
> +		}
> +		per_cpu(insn_buffer, cpu) = ibuffer;
> +	}
> +
>  	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
>  
>  	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
> @@ -262,6 +274,9 @@ static void release_pebs_buffer(int cpu)
>  	if (!ds || !x86_pmu.pebs)
>  		return;
>  
> +	kfree(per_cpu(insn_buffer, cpu));
> +	per_cpu(insn_buffer, cpu) = NULL;
> +
>  	kfree((void *)(unsigned long)ds->pebs_buffer_base);
>  	ds->pebs_buffer_base = 0;
>  }
> @@ -729,6 +744,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
>  	unsigned long ip = regs->ip;
>  	int is_64bit = 0;
> +	void *kaddr;
>  
>  	/*
>  	 * We don't need to fixup if the PEBS assist is fault like
> @@ -752,7 +768,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>  	 * unsigned math, either ip is before the start (impossible) or
>  	 * the basic block is larger than 1 page (sanity)
>  	 */
> -	if ((ip - to) > PAGE_SIZE)
> +	if ((ip - to) > PEBS_FIXUP_SIZE)
>  		return 0;
>  
>  	/*
> @@ -763,29 +779,33 @@ static int intel_pmu_pebs_fixup_ip(struc
>  		return 1;
>  	}
>  
> +	if (!kernel_ip(ip)) {
> +		int size, bytes;
> +		u8 *buf = this_cpu_ptr(insn_buffer);
> +
> +		size = ip - to; /* Must fit our buffer, see above */
> +		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> +		if (bytes != size)
> +			return 0;
> +
> +		kaddr = buf;
> +	} else {
> +		kaddr = (void *)to;
> +	}
> +
>  	do {
>  		struct insn insn;
> -		u8 buf[MAX_INSN_SIZE];
> -		void *kaddr;
>  
>  		old_to = to;
> -		if (!kernel_ip(ip)) {
> -			int bytes, size = MAX_INSN_SIZE;
> -
> -			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> -			if (bytes != size)
> -				return 0;
> -
> -			kaddr = buf;
> -		} else
> -			kaddr = (void *)to;
>  
>  #ifdef CONFIG_X86_64
>  		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
>  #endif
>  		insn_init(&insn, kaddr, is_64bit);
>  		insn_get_length(&insn);
> +
>  		to += insn.length;
> +		kaddr += insn.length;
>  	} while (to < ip);
>  
>  	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 14:49             ` Don Zickus
@ 2013-10-17 14:51               ` Peter Zijlstra
  2013-10-17 15:03                 ` Don Zickus
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 14:51 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 10:49:13AM -0400, Don Zickus wrote:
> For some reason this patch is page faulting at an invalid address inside
> __intel_pmu_pebs_event().

Ah yes, I lost a refresh, but read on; I've sent a gazillion new emails
since ;-)

I think it was something like: s/this_cpu_ptr/this_cpu_read/ to make it
work again.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 14:51               ` Peter Zijlstra
@ 2013-10-17 15:03                 ` Don Zickus
  2013-10-17 15:09                   ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-17 15:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 04:51:31PM +0200, Peter Zijlstra wrote:
> On Thu, Oct 17, 2013 at 10:49:13AM -0400, Don Zickus wrote:
> > For some reason this patch is page faulting at an invalid address inside
> > __intel_pmu_pebs_event().
> 
> Ah yes, I lost a refresh, but read on; I've send a gazillion new emails
> since ;-)

Yes, I have noticed, but I got worried when you labeled some of them with
'untested'. :-)  Thought I would stick to something a little more close to
working.

> 
> I think it was something like: s/this_cpu_ptr/this_cpu_read/ to make it
> work again.

Thanks.  I will try some of the other pieces today (unlike yesterday).

Cheers,
Don


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 15:03                 ` Don Zickus
@ 2013-10-17 15:09                   ` Peter Zijlstra
  2013-10-17 15:11                     ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 15:09 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 11:03:58AM -0400, Don Zickus wrote:
> On Thu, Oct 17, 2013 at 04:51:31PM +0200, Peter Zijlstra wrote:
> > On Thu, Oct 17, 2013 at 10:49:13AM -0400, Don Zickus wrote:
> > > For some reason this patch is page faulting at an invalid address inside
> > > __intel_pmu_pebs_event().
> > 
> > Ah yes, I lost a refresh, but read on; I've send a gazillion new emails
> > since ;-)
> 
> Yes, I have noticed, but I got worried when you labeled some of them with
> 'untested'. :-)  Thought I would stick to something a little more close to
> working.
> 
> > 
> > I think it was something like: s/this_cpu_ptr/this_cpu_read/ to make it
> > work again.
> 
> Thanks.  I will try some of the other pieces today (unlike yesterday).

The patches you find in:

http://programming.kicks-ass.net/sekrit/patches.tar.bz2

are actually running on my machine now.

One of the things I was considering was further shrinking the max basic
block size from 4k to maybe 1k or 512 bytes. Not sure what a sane basic
block length limit would be.

I did try 1k earlier today and I seemed to still get near 100% rewind
success rates.

Stephane, Andi, any clues?

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 15:09                   ` Peter Zijlstra
@ 2013-10-17 15:11                     ` Peter Zijlstra
  0 siblings, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 15:11 UTC (permalink / raw)
  To: Don Zickus; +Cc: dave.hansen, eranian, ak, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 05:09:44PM +0200, Peter Zijlstra wrote:
> The patches you find in:
> 
> http://programming.kicks-ass.net/sekrit/patches.tar.bz2

# sha256sum patches.tar.bz2 
28e26d4a20004eee231a4c0c6067508a322241046b400a226af1cceed8854bfb  patches.tar.bz2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17  9:41                   ` Peter Zijlstra
@ 2013-10-17 16:00                     ` Don Zickus
  2013-10-17 16:04                       ` Don Zickus
  0 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-17 16:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 11:41:45AM +0200, Peter Zijlstra wrote:
> On Thu, Oct 17, 2013 at 01:07:12AM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 16, 2013 at 11:03:19PM +0200, Peter Zijlstra wrote:
> > > Anyway; if you want to have a go at this, feel free.
> > 
> > OK, couldn't help myself; completely untested patch below.
> > 
> > I think the full once copy it best for the decode as even with the below
> > interface you'd end up doing a lot of duplicate copying due to the
> > variable size insn mess.
> 
> Duh, a very small tweak would make it work for that and avoid most of
> the memcpy()s.

Hmm, for some reason, even though copy_from_user_nmi_iter is super fast
now, the while(to < ip) count increased dramatically and so did my
latency. :-(

Not sure what happened between your pretty patch yesterday and this
direction.

Cheers,
Don

> 
> ---
>  arch/x86/include/asm/uaccess.h            | 13 +++++
>  arch/x86/kernel/cpu/perf_event.c          | 32 +++++------
>  arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 ++++---
>  arch/x86/lib/usercopy.c                   | 91 ++++++++++++++++++++++++++++++-
>  arch/x86/mm/gup.c                         | 63 +++++++++++++--------
>  5 files changed, 165 insertions(+), 55 deletions(-)
> 
> diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> index 5838fa911aa0..a341de0eadd1 100644
> --- a/arch/x86/include/asm/uaccess.h
> +++ b/arch/x86/include/asm/uaccess.h
> @@ -516,6 +516,19 @@ struct __large_struct { unsigned long buf[100]; };
>  
>  extern unsigned long
>  copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
> +
> +struct copy_from_user_nmi_state {
> +	void *map;
> +	unsigned long address;
> +	unsigned long flags;
> +};
> +
> +extern void *
> +copy_from_user_nmi_iter(void *to, const void __user *from,
> +			unsigned long n, struct copy_from_user_nmi_state *state);
> +extern void
> +copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);
> +
>  extern __must_check long
>  strncpy_from_user(char *dst, const char __user *src, long count);
>  
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 19c9d86d2f04..c917fe470861 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -1979,8 +1979,9 @@ static inline int
>  perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>  {
>  	/* 32-bit process in 64-bit kernel. */
> +	struct copy_from_user_nmi_state state = { NULL };
>  	unsigned long ss_base, cs_base;
> -	struct stack_frame_ia32 frame;
> +	struct stack_frame_ia32 frame, *f;
>  	const void __user *fp;
>  
>  	if (!test_thread_flag(TIF_IA32))
> @@ -1991,20 +1992,17 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>  
>  	fp = compat_ptr(ss_base + regs->bp);
>  	while (entry->nr < PERF_MAX_STACK_DEPTH) {
> -		unsigned long bytes;
> -		frame.next_frame     = 0;
> -		frame.return_address = 0;
> -
> -		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> -		if (bytes != sizeof(frame))
> +		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
> +		if (!f)
>  			break;
>  
>  		if (!valid_user_frame(fp, sizeof(frame)))
>  			break;
>  
> -		perf_callchain_store(entry, cs_base + frame.return_address);
> -		fp = compat_ptr(ss_base + frame.next_frame);
> +		perf_callchain_store(entry, cs_base + f->return_address);
> +		fp = compat_ptr(ss_base + f->next_frame);
>  	}
> +	copy_from_user_nmi_end(&state);
>  	return 1;
>  }
>  #else
> @@ -2018,7 +2016,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
>  void
>  perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
>  {
> -	struct stack_frame frame;
> +	struct copy_from_user_nmi_state state = { NULL };
> +	struct stack_frame frame, *f;
>  	const void __user *fp;
>  
>  	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
> @@ -2043,20 +2042,17 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
>  		return;
>  
>  	while (entry->nr < PERF_MAX_STACK_DEPTH) {
> -		unsigned long bytes;
> -		frame.next_frame	     = NULL;
> -		frame.return_address = 0;
> -
> -		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> -		if (bytes != sizeof(frame))
> +		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
> +		if (!f)
>  			break;
>  
>  		if (!valid_user_frame(fp, sizeof(frame)))
>  			break;
>  
> -		perf_callchain_store(entry, frame.return_address);
> -		fp = frame.next_frame;
> +		perf_callchain_store(entry, f->return_address);
> +		fp = f->next_frame;
>  	}
> +	copy_from_user_nmi_end(&state);
>  }
>  
>  /*
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 32e9ed81cd00..5bd3f2091da9 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -725,10 +725,14 @@ void intel_pmu_pebs_disable_all(void)
>  static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  {
>  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> +	struct copy_from_user_nmi_state state = { NULL };
>  	unsigned long from = cpuc->lbr_entries[0].from;
>  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
>  	unsigned long ip = regs->ip;
> +	u8 buf[MAX_INSN_SIZE];
> +	struct insn insn;
>  	int is_64bit = 0;
> +	void *kaddr;
>  
>  	/*
>  	 * We don't need to fixup if the PEBS assist is fault like
> @@ -764,19 +768,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  	}
>  
>  	do {
> -		struct insn insn;
> -		u8 buf[MAX_INSN_SIZE];
> -		void *kaddr;
> -
>  		old_to = to;
>  		if (!kernel_ip(ip)) {
> -			int bytes, size = MAX_INSN_SIZE;
> -
> -			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> -			if (bytes != size)
> -				return 0;
> -
> -			kaddr = buf;
> +			kaddr = copy_from_user_nmi_iter(buf, (void __user *)to,
> +							MAX_INSN_SIZE, &state);
> +			if (!kaddr)
> +				break;
>  		} else
>  			kaddr = (void *)to;
>  
> @@ -788,6 +785,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
>  		to += insn.length;
>  	} while (to < ip);
>  
> +	copy_from_user_nmi_end(&state);
> +
>  	if (to == ip) {
>  		set_linear_ip(regs, old_to);
>  		return 1;
> diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
> index 4f74d94c8d97..da6c36a8b842 100644
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -10,6 +10,8 @@
>  #include <asm/word-at-a-time.h>
>  #include <linux/sched.h>
>  
> +extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
> +			  struct page **pages);
>  /*
>   * best effort, GUP based copy_from_user() that is NMI-safe
>   */
> @@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  {
>  	unsigned long offset, addr = (unsigned long)from;
>  	unsigned long size, len = 0;
> +	unsigned long flags;
>  	struct page *page;
>  	void *map;
>  	int ret;
> @@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  		return len;
>  
>  	do {
> -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> -		if (!ret)
> +		local_irq_save(flags);
> +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> +		if (!ret) {
> +			local_irq_restore(flags);
>  			break;
> +		}
>  
>  		offset = addr & (PAGE_SIZE - 1);
>  		size = min(PAGE_SIZE - offset, n - len);
> @@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  		map = kmap_atomic(page);
>  		memcpy(to, map+offset, size);
>  		kunmap_atomic(map);
> -		put_page(page);
> +		local_irq_restore(flags);
>  
>  		len  += size;
>  		to   += size;
> @@ -47,3 +53,82 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  	return len;
>  }
>  EXPORT_SYMBOL_GPL(copy_from_user_nmi);
> +
> +void *copy_from_user_nmi_iter(void *to, const void __user *from,
> +		unsigned long n, struct copy_from_user_nmi_state *state)
> +{
> +	unsigned long offset, addr = (unsigned long)from;
> +	unsigned long size, len = 0;
> +	unsigned long flags;
> +	struct page *page;
> +	void *map, *_to = to;
> +	int ret;
> +
> +	if (__range_not_ok(from, n, TASK_SIZE))
> +		return NULL;
> +
> +	if (state->map) {
> +		if ((state->address >> PAGE_SHIFT) ==
> +		    (addr >> PAGE_SHIFT)) {
> +			flags = state->flags;
> +			map = state->map;
> +			goto got_page;
> +		}
> +		kunmap_atomic(state->map);
> +		local_irq_restore(state->flags);
> +	}
> +
> +	for (;;) {
> +		local_irq_save(flags);
> +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> +		if (!ret) {
> +			local_irq_restore(flags);
> +			state->map = NULL;
> +			return NULL;
> +		}
> +
> +		map = kmap_atomic(page);
> +
> +got_page:
> +		offset = addr & (PAGE_SIZE - 1);
> +		size = min(PAGE_SIZE - offset, n - len);
> +
> +		/*
> +		 * If the entire desired range falls within the one page
> +		 * avoid the copy and return a pointer into the kmap.
> +		 */
> +		if (size == n) {
> +			_to = map + offset;
> +			break;
> +		}
> +
> +		memcpy(to, map+offset, size);
> +		len += size;
> +
> +		if (len == n)
> +			break;
> +
> +		to   += size;
> +		addr += size;
> +
> +		kunmap_atomic(map);
> +		local_irq_restore(flags);
> +	}
> +
> +	state->address = addr;
> +	state->flags = flags;
> +	state->map = map;
> +
> +	return _to;
> +}
> +EXPORT_SYMBOL_GPL(copy_from_user_nmi_iter);
> +
> +void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state)
> +{
> +	if (state->map) {
> +		kunmap_atomic(state->map);
> +		local_irq_restore(state->flags);
> +		state->map = NULL;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(copy_from_user_nmi_end);
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index dd74e46828c0..e383caf323e4 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *ptep)
>  #endif
>  }
>  
> +#define GUPF_GET	0x01
> +#define GUPF_WRITE	0x02
> +
>  /*
>   * The performance critical leaf functions are made noinline otherwise gcc
>   * inlines everything into a single function which results in too much
>   * register pressure.
>   */
>  static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t *ptep;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  
>  	ptep = pte_offset_map(&pmd, addr);
> @@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
>  		}
>  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
>  		page = pte_page(pte);
> -		get_page(page);
> +		if (flags & GUPF_GET)
> +			get_page(page);
>  		SetPageReferenced(page);
>  		pages[*nr] = page;
>  		(*nr)++;
> @@ -109,7 +113,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
>  }
>  
>  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pmd;
> @@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> -		int write, struct page **pages, int *nr)
> +		int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pmd_t *pmdp;
> @@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
>  			return 0;
>  		if (unlikely(pmd_large(pmd))) {
> -			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
> +			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> +			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pmdp++, addr = next, addr != end);
> @@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  }
>  
>  static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +		unsigned long end, int flags, struct page **pages, int *nr)
>  {
>  	unsigned long mask;
>  	pte_t pte = *(pte_t *)&pud;
> @@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
>  	int refs;
>  
>  	mask = _PAGE_PRESENT|_PAGE_USER;
> -	if (write)
> +	if (flags & GUPF_WRITE)
>  		mask |= _PAGE_RW;
>  	if ((pte_flags(pte) & mask) != mask)
>  		return 0;
> @@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
>  	do {
>  		VM_BUG_ON(compound_head(page) != head);
>  		pages[*nr] = page;
> -		if (PageTail(page))
> +		if ((flags & GUPF_GET) && PageTail(page))
>  			get_huge_page_tail(page);
>  		(*nr)++;
>  		page++;
>  		refs++;
>  	} while (addr += PAGE_SIZE, addr != end);
> -	get_head_page_multiple(head, refs);
> +	if (flags & GUPF_GET)
> +		get_head_page_multiple(head, refs);
>  
>  	return 1;
>  }
>  
>  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> -			int write, struct page **pages, int *nr)
> +			int flags, struct page **pages, int *nr)
>  {
>  	unsigned long next;
>  	pud_t *pudp;
> @@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
>  		if (pud_none(pud))
>  			return 0;
>  		if (unlikely(pud_large(pud))) {
> -			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
> +			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		} else {
> -			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> +			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
>  				return 0;
>  		}
>  	} while (pudp++, addr = next, addr != end);
> @@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
>   * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
>   * back to the regular GUP.
>   */
> -int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
>  			  struct page **pages)
>  {
>  	struct mm_struct *mm = current->mm;
>  	unsigned long addr, len, end;
>  	unsigned long next;
> -	unsigned long flags;
>  	pgd_t *pgdp;
>  	int nr = 0;
>  
> @@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  	addr = start;
>  	len = (unsigned long) nr_pages << PAGE_SHIFT;
>  	end = start + len;
> -	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
> +	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
>  					(void __user *)start, len)))
>  		return 0;
>  
> @@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  	 * (which we do on x86, with the above PAE exception), we can follow the
>  	 * address down to the the page and take a ref on it.
>  	 */
> -	local_irq_save(flags);
>  	pgdp = pgd_offset(mm, addr);
>  	do {
>  		pgd_t pgd = *pgdp;
> @@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  		next = pgd_addr_end(addr, end);
>  		if (pgd_none(pgd))
>  			break;
> -		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> +		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
>  			break;
>  	} while (pgdp++, addr = next, addr != end);
> -	local_irq_restore(flags);
>  
>  	return nr;
>  }
>  
> +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> +			  struct page **pages)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	local_irq_save(flags);
> +	ret = ___get_user_pages_fast(start, nr_pages,
> +			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
>  /**
>   * get_user_pages_fast() - pin user pages in memory
>   * @start:	starting user address
> 

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 16:00                     ` Don Zickus
@ 2013-10-17 16:04                       ` Don Zickus
  2013-10-17 16:30                         ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-17 16:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 12:00:34PM -0400, Don Zickus wrote:
> On Thu, Oct 17, 2013 at 11:41:45AM +0200, Peter Zijlstra wrote:
> > On Thu, Oct 17, 2013 at 01:07:12AM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 16, 2013 at 11:03:19PM +0200, Peter Zijlstra wrote:
> > > > Anyway; if you want to have a go at this, feel free.
> > > 
> > > OK, couldn't help myself; completely untested patch below.
> > > 
> > > I think the full once copy it best for the decode as even with the below
> > > interface you'd end up doing a lot of duplicate copying due to the
> > > variable size insn mess.
> > 
> > Duh, a very small tweak would make it work for that and avoid most of
> > the memcpy()s.
> 
> Hmm, for some reason, even though copy_from_user_nmi_iter is super fast
> now, the while(to < ip) count increased dramatically and so did my
> latency. :-(

I take that back: copy_from_user_nmi_iter is not super fast, I just had
a bug in how I accumulated total time.  So somehow this approach is slower
than yesterday's.

Cheers,
Don

> 
> Not sure what happened between your pretty patch yesterday and this
> direction.
> 
> Cheers,
> Don
> 
> > 
> > ---
> >  arch/x86/include/asm/uaccess.h            | 13 +++++
> >  arch/x86/kernel/cpu/perf_event.c          | 32 +++++------
> >  arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 ++++---
> >  arch/x86/lib/usercopy.c                   | 91 ++++++++++++++++++++++++++++++-
> >  arch/x86/mm/gup.c                         | 63 +++++++++++++--------
> >  5 files changed, 165 insertions(+), 55 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
> > index 5838fa911aa0..a341de0eadd1 100644
> > --- a/arch/x86/include/asm/uaccess.h
> > +++ b/arch/x86/include/asm/uaccess.h
> > @@ -516,6 +516,19 @@ struct __large_struct { unsigned long buf[100]; };
> >  
> >  extern unsigned long
> >  copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
> > +
> > +struct copy_from_user_nmi_state {
> > +	void *map;
> > +	unsigned long address;
> > +	unsigned long flags;
> > +};
> > +
> > +extern void *
> > +copy_from_user_nmi_iter(void *to, const void __user *from,
> > +			unsigned long n, struct copy_from_user_nmi_state *state);
> > +extern void
> > +copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);
> > +
> >  extern __must_check long
> >  strncpy_from_user(char *dst, const char __user *src, long count);
> >  
> > diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> > index 19c9d86d2f04..c917fe470861 100644
> > --- a/arch/x86/kernel/cpu/perf_event.c
> > +++ b/arch/x86/kernel/cpu/perf_event.c
> > @@ -1979,8 +1979,9 @@ static inline int
> >  perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> >  {
> >  	/* 32-bit process in 64-bit kernel. */
> > +	struct copy_from_user_nmi_state state = { NULL };
> >  	unsigned long ss_base, cs_base;
> > -	struct stack_frame_ia32 frame;
> > +	struct stack_frame_ia32 frame, *f;
> >  	const void __user *fp;
> >  
> >  	if (!test_thread_flag(TIF_IA32))
> > @@ -1991,20 +1992,17 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> >  
> >  	fp = compat_ptr(ss_base + regs->bp);
> >  	while (entry->nr < PERF_MAX_STACK_DEPTH) {
> > -		unsigned long bytes;
> > -		frame.next_frame     = 0;
> > -		frame.return_address = 0;
> > -
> > -		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> > -		if (bytes != sizeof(frame))
> > +		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
> > +		if (!f)
> >  			break;
> >  
> >  		if (!valid_user_frame(fp, sizeof(frame)))
> >  			break;
> >  
> > -		perf_callchain_store(entry, cs_base + frame.return_address);
> > -		fp = compat_ptr(ss_base + frame.next_frame);
> > +		perf_callchain_store(entry, cs_base + f->return_address);
> > +		fp = compat_ptr(ss_base + f->next_frame);
> >  	}
> > +	copy_from_user_nmi_end(&state);
> >  	return 1;
> >  }
> >  #else
> > @@ -2018,7 +2016,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
> >  void
> >  perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
> >  {
> > -	struct stack_frame frame;
> > +	struct copy_from_user_nmi_state state = { NULL };
> > +	struct stack_frame frame, *f;
> >  	const void __user *fp;
> >  
> >  	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
> > @@ -2043,20 +2042,17 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
> >  		return;
> >  
> >  	while (entry->nr < PERF_MAX_STACK_DEPTH) {
> > -		unsigned long bytes;
> > -		frame.next_frame	     = NULL;
> > -		frame.return_address = 0;
> > -
> > -		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
> > -		if (bytes != sizeof(frame))
> > +		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
> > +		if (!f)
> >  			break;
> >  
> >  		if (!valid_user_frame(fp, sizeof(frame)))
> >  			break;
> >  
> > -		perf_callchain_store(entry, frame.return_address);
> > -		fp = frame.next_frame;
> > +		perf_callchain_store(entry, f->return_address);
> > +		fp = f->next_frame;
> >  	}
> > +	copy_from_user_nmi_end(&state);
> >  }
> >  
> >  /*
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > index 32e9ed81cd00..5bd3f2091da9 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > @@ -725,10 +725,14 @@ void intel_pmu_pebs_disable_all(void)
> >  static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  {
> >  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
> > +	struct copy_from_user_nmi_state state = { NULL };
> >  	unsigned long from = cpuc->lbr_entries[0].from;
> >  	unsigned long old_to, to = cpuc->lbr_entries[0].to;
> >  	unsigned long ip = regs->ip;
> > +	u8 buf[MAX_INSN_SIZE];
> > +	struct insn insn;
> >  	int is_64bit = 0;
> > +	void *kaddr;
> >  
> >  	/*
> >  	 * We don't need to fixup if the PEBS assist is fault like
> > @@ -764,19 +768,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  	}
> >  
> >  	do {
> > -		struct insn insn;
> > -		u8 buf[MAX_INSN_SIZE];
> > -		void *kaddr;
> > -
> >  		old_to = to;
> >  		if (!kernel_ip(ip)) {
> > -			int bytes, size = MAX_INSN_SIZE;
> > -
> > -			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
> > -			if (bytes != size)
> > -				return 0;
> > -
> > -			kaddr = buf;
> > +			kaddr = copy_from_user_nmi_iter(buf, (void __user *)to,
> > +							MAX_INSN_SIZE, &state);
> > +			if (!kaddr)
> > +				break;
> >  		} else
> >  			kaddr = (void *)to;
> >  
> > @@ -788,6 +785,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> >  		to += insn.length;
> >  	} while (to < ip);
> >  
> > +	copy_from_user_nmi_end(&state);
> > +
> >  	if (to == ip) {
> >  		set_linear_ip(regs, old_to);
> >  		return 1;
> > diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
> > index 4f74d94c8d97..da6c36a8b842 100644
> > --- a/arch/x86/lib/usercopy.c
> > +++ b/arch/x86/lib/usercopy.c
> > @@ -10,6 +10,8 @@
> >  #include <asm/word-at-a-time.h>
> >  #include <linux/sched.h>
> >  
> > +extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
> > +			  struct page **pages);
> >  /*
> >   * best effort, GUP based copy_from_user() that is NMI-safe
> >   */
> > @@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
> >  {
> >  	unsigned long offset, addr = (unsigned long)from;
> >  	unsigned long size, len = 0;
> > +	unsigned long flags;
> >  	struct page *page;
> >  	void *map;
> >  	int ret;
> > @@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
> >  		return len;
> >  
> >  	do {
> > -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> > -		if (!ret)
> > +		local_irq_save(flags);
> > +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> > +		if (!ret) {
> > +			local_irq_restore(flags);
> >  			break;
> > +		}
> >  
> >  		offset = addr & (PAGE_SIZE - 1);
> >  		size = min(PAGE_SIZE - offset, n - len);
> > @@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
> >  		map = kmap_atomic(page);
> >  		memcpy(to, map+offset, size);
> >  		kunmap_atomic(map);
> > -		put_page(page);
> > +		local_irq_restore(flags);
> >  
> >  		len  += size;
> >  		to   += size;
> > @@ -47,3 +53,82 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
> >  	return len;
> >  }
> >  EXPORT_SYMBOL_GPL(copy_from_user_nmi);
> > +
> > +void *copy_from_user_nmi_iter(void *to, const void __user *from,
> > +		unsigned long n, struct copy_from_user_nmi_state *state)
> > +{
> > +	unsigned long offset, addr = (unsigned long)from;
> > +	unsigned long size, len = 0;
> > +	unsigned long flags;
> > +	struct page *page;
> > +	void *map, *_to = to;
> > +	int ret;
> > +
> > +	if (__range_not_ok(from, n, TASK_SIZE))
> > +		return NULL;
> > +
> > +	if (state->map) {
> > +		if ((state->address >> PAGE_SHIFT) ==
> > +		    (addr >> PAGE_SHIFT)) {
> > +			flags = state->flags;
> > +			map = state->map;
> > +			goto got_page;
> > +		}
> > +		kunmap_atomic(state->map);
> > +		local_irq_restore(state->flags);
> > +	}
> > +
> > +	for (;;) {
> > +		local_irq_save(flags);
> > +		ret = ___get_user_pages_fast(addr, 1, 0, &page);
> > +		if (!ret) {
> > +			local_irq_restore(flags);
> > +			state->map = NULL;
> > +			return NULL;
> > +		}
> > +
> > +		map = kmap_atomic(page);
> > +
> > +got_page:
> > +		offset = addr & (PAGE_SIZE - 1);
> > +		size = min(PAGE_SIZE - offset, n - len);
> > +
> > +		/*
> > +		 * If the entire desired range falls within the one page
> > +		 * avoid the copy and return a pointer into the kmap.
> > +		 */
> > +		if (size == n) {
> > +			_to = map + offset;
> > +			break;
> > +		}
> > +
> > +		memcpy(to, map+offset, size);
> > +		len += size;
> > +
> > +		if (len == n)
> > +			break;
> > +
> > +		to   += size;
> > +		addr += size;
> > +
> > +		kunmap_atomic(map);
> > +		local_irq_restore(flags);
> > +	}
> > +
> > +	state->address = addr;
> > +	state->flags = flags;
> > +	state->map = map;
> > +
> > +	return _to;
> > +}
> > +EXPORT_SYMBOL_GPL(copy_from_user_nmi_iter);
> > +
> > +void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state)
> > +{
> > +	if (state->map) {
> > +		kunmap_atomic(state->map);
> > +		local_irq_restore(state->flags);
> > +		state->map = NULL;
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(copy_from_user_nmi_end);
> > diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> > index dd74e46828c0..e383caf323e4 100644
> > --- a/arch/x86/mm/gup.c
> > +++ b/arch/x86/mm/gup.c
> > @@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *ptep)
> >  #endif
> >  }
> >  
> > +#define GUPF_GET	0x01
> > +#define GUPF_WRITE	0x02
> > +
> >  /*
> >   * The performance critical leaf functions are made noinline otherwise gcc
> >   * inlines everything into a single function which results in too much
> >   * register pressure.
> >   */
> >  static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> > -		unsigned long end, int write, struct page **pages, int *nr)
> > +		unsigned long end, int flags, struct page **pages, int *nr)
> >  {
> >  	unsigned long mask;
> >  	pte_t *ptep;
> >  
> >  	mask = _PAGE_PRESENT|_PAGE_USER;
> > -	if (write)
> > +	if (flags & GUPF_WRITE)
> >  		mask |= _PAGE_RW;
> >  
> >  	ptep = pte_offset_map(&pmd, addr);
> > @@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
> >  		}
> >  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
> >  		page = pte_page(pte);
> > -		get_page(page);
> > +		if (flags & GUPF_GET)
> > +			get_page(page);
> >  		SetPageReferenced(page);
> >  		pages[*nr] = page;
> >  		(*nr)++;
> > @@ -109,7 +113,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
> >  }
> >  
> >  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> > -		unsigned long end, int write, struct page **pages, int *nr)
> > +		unsigned long end, int flags, struct page **pages, int *nr)
> >  {
> >  	unsigned long mask;
> >  	pte_t pte = *(pte_t *)&pmd;
> > @@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> >  	int refs;
> >  
> >  	mask = _PAGE_PRESENT|_PAGE_USER;
> > -	if (write)
> > +	if (flags & GUPF_WRITE)
> >  		mask |= _PAGE_RW;
> >  	if ((pte_flags(pte) & mask) != mask)
> >  		return 0;
> > @@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
> >  	do {
> >  		VM_BUG_ON(compound_head(page) != head);
> >  		pages[*nr] = page;
> > -		if (PageTail(page))
> > +		if ((flags & GUPF_GET) && PageTail(page))
> >  			get_huge_page_tail(page);
> >  		(*nr)++;
> >  		page++;
> >  		refs++;
> >  	} while (addr += PAGE_SIZE, addr != end);
> > -	get_head_page_multiple(head, refs);
> > +	if (flags & GUPF_GET)
> > +		get_head_page_multiple(head, refs);
> >  
> >  	return 1;
> >  }
> >  
> >  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> > -		int write, struct page **pages, int *nr)
> > +		int flags, struct page **pages, int *nr)
> >  {
> >  	unsigned long next;
> >  	pmd_t *pmdp;
> > @@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> >  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
> >  			return 0;
> >  		if (unlikely(pmd_large(pmd))) {
> > -			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
> > +			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
> >  				return 0;
> >  		} else {
> > -			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
> > +			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
> >  				return 0;
> >  		}
> >  	} while (pmdp++, addr = next, addr != end);
> > @@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
> >  }
> >  
> >  static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> > -		unsigned long end, int write, struct page **pages, int *nr)
> > +		unsigned long end, int flags, struct page **pages, int *nr)
> >  {
> >  	unsigned long mask;
> >  	pte_t pte = *(pte_t *)&pud;
> > @@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> >  	int refs;
> >  
> >  	mask = _PAGE_PRESENT|_PAGE_USER;
> > -	if (write)
> > +	if (flags & GUPF_WRITE)
> >  		mask |= _PAGE_RW;
> >  	if ((pte_flags(pte) & mask) != mask)
> >  		return 0;
> > @@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
> >  	do {
> >  		VM_BUG_ON(compound_head(page) != head);
> >  		pages[*nr] = page;
> > -		if (PageTail(page))
> > +		if ((flags & GUPF_GET) && PageTail(page))
> >  			get_huge_page_tail(page);
> >  		(*nr)++;
> >  		page++;
> >  		refs++;
> >  	} while (addr += PAGE_SIZE, addr != end);
> > -	get_head_page_multiple(head, refs);
> > +	if (flags & GUPF_GET)
> > +		get_head_page_multiple(head, refs);
> >  
> >  	return 1;
> >  }
> >  
> >  static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> > -			int write, struct page **pages, int *nr)
> > +			int flags, struct page **pages, int *nr)
> >  {
> >  	unsigned long next;
> >  	pud_t *pudp;
> > @@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> >  		if (pud_none(pud))
> >  			return 0;
> >  		if (unlikely(pud_large(pud))) {
> > -			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
> > +			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
> >  				return 0;
> >  		} else {
> > -			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
> > +			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
> >  				return 0;
> >  		}
> >  	} while (pudp++, addr = next, addr != end);
> > @@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
> >   * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
> >   * back to the regular GUP.
> >   */
> > -int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> > +int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
> >  			  struct page **pages)
> >  {
> >  	struct mm_struct *mm = current->mm;
> >  	unsigned long addr, len, end;
> >  	unsigned long next;
> > -	unsigned long flags;
> >  	pgd_t *pgdp;
> >  	int nr = 0;
> >  
> > @@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >  	addr = start;
> >  	len = (unsigned long) nr_pages << PAGE_SHIFT;
> >  	end = start + len;
> > -	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
> > +	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
> >  					(void __user *)start, len)))
> >  		return 0;
> >  
> > @@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >  	 * (which we do on x86, with the above PAE exception), we can follow the
> >  	 * address down to the the page and take a ref on it.
> >  	 */
> > -	local_irq_save(flags);
> >  	pgdp = pgd_offset(mm, addr);
> >  	do {
> >  		pgd_t pgd = *pgdp;
> > @@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >  		next = pgd_addr_end(addr, end);
> >  		if (pgd_none(pgd))
> >  			break;
> > -		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
> > +		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
> >  			break;
> >  	} while (pgdp++, addr = next, addr != end);
> > -	local_irq_restore(flags);
> >  
> >  	return nr;
> >  }
> >  
> > +int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> > +			  struct page **pages)
> > +{
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	local_irq_save(flags);
> > +	ret = ___get_user_pages_fast(start, nr_pages,
> > +			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
> > +	local_irq_restore(flags);
> > +
> > +	return ret;
> > +}
> > +
> >  /**
> >   * get_user_pages_fast() - pin user pages in memory
> >   * @start:	starting user address
> > 

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 16:04                       ` Don Zickus
@ 2013-10-17 16:30                         ` Peter Zijlstra
  2013-10-17 18:26                           ` Linus Torvalds
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 16:30 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, dave.hansen, eranian, jmario, linux-kernel, acme, mingo

On Thu, Oct 17, 2013 at 12:04:39PM -0400, Don Zickus wrote:
> I take that back the copy_from_user_nmi_iter is not super fast, I just had
> a bug in how I accumulate total time.  So somehow this approach is slower
> than yesterday's.

Humm interesting..

Slightly weird, because that instruction decoder stuff is a nest of calls
too, I wouldn't have thought the one extra call made such a difference.

I suppose it would still be an improvement for the FP chase.

So I'll stick with the one below for now; this one is actually compile
and runtime tested.

---
Subject: perf, x86: Optimize intel_pmu_pebs_fixup_ip()
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Oct 2013 12:57:55 +0200

On Mon, Oct 14, 2013 at 04:35:49PM -0400, Don Zickus wrote:
> While there are a few places that are causing latencies, for now I focused on
> the longest one first.  It seems to be 'copy_user_from_nmi'
>
> intel_pmu_handle_irq ->
> 	intel_pmu_drain_pebs_nhm ->
> 		__intel_pmu_drain_pebs_nhm ->
> 			__intel_pmu_pebs_event ->
> 				intel_pmu_pebs_fixup_ip ->
> 					copy_from_user_nmi
>
> In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
> all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
> (there are some cases where only 10 iterations are needed to go that high
> too, but in generall over 50 or so).  At this point copy_user_from_nmi
> seems to account for over 90% of the nmi latency.

So avoid having to call copy_from_user_nmi() for every instruction.
Since we already limit the max basic block size, we can easily
pre-allocate a piece of memory to copy the entire thing into in one
go.

Don reports (for a previous version):
> Your patch made a huge difference in improvement.  The
> copy_from_user_nmi() no longer hits the million of cycles.  I still
> have a batch of 100,000-300,000 cycles.  My longest NMI paths used
> to be dominated by copy_from_user_nmi, now it is not (I have to dig
> up the new hot path).

Cc: jmario@redhat.com
Cc: acme@infradead.org
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: ak@linux.intel.com
Reported-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   48 +++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 14 deletions(-)

--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -12,6 +12,7 @@
 
 #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
 #define PEBS_BUFFER_SIZE	PAGE_SIZE
+#define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
  * pebs_record_32 for p4 and core not supported
@@ -228,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
 	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static DEFINE_PER_CPU(void *, insn_buffer);
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
 	int max, thresh = 1; /* always use a single PEBS record */
-	void *buffer;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
@@ -242,6 +245,15 @@ static int alloc_pebs_buffer(int cpu)
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
+	if (x86_pmu.intel_cap.pebs_format < 2) {
+		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+		if (!ibuffer) {
+			kfree(buffer);
+			return -ENOMEM;
+		}
+		per_cpu(insn_buffer, cpu) = ibuffer;
+	}
+
 	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
 
 	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -262,6 +274,9 @@ static void release_pebs_buffer(int cpu)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
+	kfree(per_cpu(insn_buffer, cpu));
+	per_cpu(insn_buffer, cpu) = NULL;
+
 	kfree((void *)(unsigned long)ds->pebs_buffer_base);
 	ds->pebs_buffer_base = 0;
 }
@@ -729,6 +744,7 @@ static int intel_pmu_pebs_fixup_ip(struc
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -752,7 +768,7 @@ static int intel_pmu_pebs_fixup_ip(struc
 	 * unsigned math, either ip is before the start (impossible) or
 	 * the basic block is larger than 1 page (sanity)
 	 */
-	if ((ip - to) > PAGE_SIZE)
+	if ((ip - to) > PEBS_FIXUP_SIZE)
 		return 0;
 
 	/*
@@ -763,29 +779,33 @@ static int intel_pmu_pebs_fixup_ip(struc
 		return 1;
 	}
 
+	if (!kernel_ip(ip)) {
+		int size, bytes;
+		u8 *buf = this_cpu_read(insn_buffer);
+
+		size = ip - to; /* Must fit our buffer, see above */
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != size)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
 
 		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
 		to += insn.length;
+		kaddr += insn.length;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [tip:perf/core] perf/x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
                               ` (2 preceding siblings ...)
  2013-10-17 14:49             ` Don Zickus
@ 2013-10-17 16:50             ` tip-bot for Peter Zijlstra
  3 siblings, 0 replies; 47+ messages in thread
From: tip-bot for Peter Zijlstra @ 2013-10-17 16:50 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, peterz, tglx, dzickus

Commit-ID:  9536c8d2da8059b00775bd9c5a84816b608cf6f4
Gitweb:     http://git.kernel.org/tip/9536c8d2da8059b00775bd9c5a84816b608cf6f4
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 15 Oct 2013 12:14:04 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 16 Oct 2013 15:44:00 +0200

perf/x86: Optimize intel_pmu_pebs_fixup_ip()

There's been reports of high NMI handler overhead, highlighted by
such kernel messages:

  [ 3697.380195] perf samples too long (10009 > 10000), lowering kernel.perf_event_max_sample_rate to 13000
  [ 3697.389509] INFO: NMI handler (perf_event_nmi_handler) took too long to run: 9.331 msecs

Don Zickus analyzed the source of the overhead and reported:

 > While there are a few places that are causing latencies, for now I focused on
 > the longest one first.  It seems to be 'copy_user_from_nmi'
 >
 > intel_pmu_handle_irq ->
 >	intel_pmu_drain_pebs_nhm ->
 >		__intel_pmu_drain_pebs_nhm ->
 >			__intel_pmu_pebs_event ->
 >				intel_pmu_pebs_fixup_ip ->
 >					copy_from_user_nmi
 >
 > In intel_pmu_pebs_fixup_ip(), if the while-loop goes over 50, the sum of
 > all the copy_from_user_nmi latencies seems to go over 1,000,000 cycles
 > (there are some cases where only 10 iterations are needed to go that high
 > too, but in generall over 50 or so).  At this point copy_user_from_nmi
 > seems to account for over 90% of the nmi latency.

The solution to that is to avoid having to call copy_from_user_nmi() for
every instruction.

Since we already limit the max basic block size, we can easily
pre-allocate a piece of memory to copy the entire thing into in one
go.

Don reported this test result:

 > Your patch made a huge difference in improvement.  The
 > copy_from_user_nmi() no longer hits the million of cycles.  I still
 > have a batch of 100,000-300,000 cycles.  My longest NMI paths used
 > to be dominated by copy_from_user_nmi, now it is not (I have to dig
 > up the new hot path).

Reported-and-tested-by: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131016105755.GX10651@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 52 ++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 32e9ed8..c1760ff 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -12,6 +12,7 @@
 
 #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
 #define PEBS_BUFFER_SIZE	PAGE_SIZE
+#define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
  * pebs_record_32 for p4 and core not supported
@@ -228,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
 	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static DEFINE_PER_CPU(void *, insn_buffer);
+
 static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
 	int max, thresh = 1; /* always use a single PEBS record */
-	void *buffer;
+	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
 		return 0;
@@ -242,6 +245,19 @@ static int alloc_pebs_buffer(int cpu)
 	if (unlikely(!buffer))
 		return -ENOMEM;
 
+	/*
+	 * HSW+ already provides us the eventing ip; no need to allocate this
+	 * buffer then.
+	 */
+	if (x86_pmu.intel_cap.pebs_format < 2) {
+		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+		if (!ibuffer) {
+			kfree(buffer);
+			return -ENOMEM;
+		}
+		per_cpu(insn_buffer, cpu) = ibuffer;
+	}
+
 	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
 
 	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -262,6 +278,9 @@ static void release_pebs_buffer(int cpu)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
+	kfree(per_cpu(insn_buffer, cpu));
+	per_cpu(insn_buffer, cpu) = NULL;
+
 	kfree((void *)(unsigned long)ds->pebs_buffer_base);
 	ds->pebs_buffer_base = 0;
 }
@@ -729,6 +748,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 	int is_64bit = 0;
+	void *kaddr;
 
 	/*
 	 * We don't need to fixup if the PEBS assist is fault like
@@ -752,7 +772,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	 * unsigned math, either ip is before the start (impossible) or
 	 * the basic block is larger than 1 page (sanity)
 	 */
-	if ((ip - to) > PAGE_SIZE)
+	if ((ip - to) > PEBS_FIXUP_SIZE)
 		return 0;
 
 	/*
@@ -763,29 +783,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	if (!kernel_ip(ip)) {
+		int size, bytes;
+		u8 *buf = this_cpu_read(insn_buffer);
+
+		size = ip - to; /* Must fit our buffer, see above */
+		bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+		if (bytes != size)
+			return 0;
+
+		kaddr = buf;
+	} else {
+		kaddr = (void *)to;
+	}
+
 	do {
 		struct insn insn;
-		u8 buf[MAX_INSN_SIZE];
-		void *kaddr;
 
 		old_to = to;
-		if (!kernel_ip(ip)) {
-			int bytes, size = MAX_INSN_SIZE;
-
-			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-			if (bytes != size)
-				return 0;
-
-			kaddr = buf;
-		} else
-			kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
 		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
 		insn_init(&insn, kaddr, is_64bit);
 		insn_get_length(&insn);
+
 		to += insn.length;
+		kaddr += insn.length;
 	} while (to < ip);
 
 	if (to == ip) {

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 16:30                         ` Peter Zijlstra
@ 2013-10-17 18:26                           ` Linus Torvalds
  2013-10-17 21:08                             ` Peter Zijlstra
  2013-10-17 22:01                             ` Peter Zijlstra
  0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2013-10-17 18:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 9:30 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> So avoid having to call copy_from_user_nmi() for every instruction.
> Since we already limit the max basic block size, we can easily
> pre-allocate a piece of memory to copy the entire thing into in one
> go.

copy_from_user_nmi() itself is all kinds of nasty.

Using __get_user_pages_fast() for a single page is quite expensive,
and mucks around with the page counts etc.

If copy_from_user_nmi() just did the (simple) page table walk by hand,
it could avoid *all* of that. No page count stuff - just have
interrupts disabled over not just the page walk, but the copy too - to
guarantee that no cross-CPU TLB flush can come in.

So instead of trying to improve __get_user_pages_fast() - which is
impossible because the interface fundamentally means that it has to
iterate over things and check page counts - you could simplify the
caller instead.

That is, if we really care any more. Maybe this "do the
copy_from_user_nmi() just once" is already good enough that nobody
much cares.

                 Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 18:26                           ` Linus Torvalds
@ 2013-10-17 21:08                             ` Peter Zijlstra
  2013-10-17 21:11                               ` Peter Zijlstra
  2013-10-17 22:01                             ` Peter Zijlstra
  1 sibling, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 21:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 11:26:23AM -0700, Linus Torvalds wrote:
> On Thu, Oct 17, 2013 at 9:30 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > So avoid having to call copy_from_user_nmi() for every instruction.
> > Since we already limit the max basic block size, we can easily
> > pre-allocate a piece of memory to copy the entire thing into in one
> > go.
> 
> copy_from_user_nmi() itself is all kinds of nasty.
> 
> Using __get_user_pages_fast() for a single page is quite expensive,
> and mucks around with the page counts etc.
> 
> If copy_from_user_nmi() just did the (simple) page table walk by hand,
> it could avoid *all* of that. No page count stuff - just have
> interrupts disabled over not just the page walk, but the copy too - to
> guarantee that no cross-CPU TLB flush can come in.
> 
> So instead of trying to improve __get_user_pages_fast() - which is
> impossible because the interface fundamentally means that it has to
> iterate over things and check page counts - you could simplify the
> caller instead.
> 
> That is, if we really care any more. Maybe this "do the
> copy_from_user_nmi() just once" is already good enough that nobody
> much cares.

I did a patch that avoids the page count mucking about, Don didn't see
any significant improvements from it.

---
Subject: x86: Optimize copy_from_user_nmi()
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Oct 16 10:55:59 CEST 2013

Since copy_from_user_nmi() is pretty much always called from either
IRQ or NMI context there's no need to take and release a reference on
the page.

Provide yet another __gup_fast() interface: '___gup_fast()' which
assumes the caller has disabled IRQs and which makes the taking of
page count references optional.

Then change copy_from_user_nmi() to use this new variant to avoid
taking and releasing page references, thereby avoiding a number of
atomic ops.

This can be esp. useful when profiling threaded apps that run mostly
the same code; in that case intel_pmu_pebs_fixup_ip() can call
copy_from_user_nmi() a lot on the same few text pages from many CPUs
at the same time.

Cc: eranian@google.com
Cc: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Suggested-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131015150736.GZ26785@twins.programming.kicks-ass.net
---
 arch/x86/lib/usercopy.c |   12 ++++++---
 arch/x86/mm/gup.c       |   63 ++++++++++++++++++++++++++++++------------------
 2 files changed, 49 insertions(+), 26 deletions(-)

--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -10,6 +10,8 @@
 #include <asm/word-at-a-time.h>
 #include <linux/sched.h>
 
+extern int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
+			  struct page **pages);
 /*
  * best effort, GUP based copy_from_user() that is NMI-safe
  */
@@ -18,6 +20,7 @@ copy_from_user_nmi(void *to, const void
 {
 	unsigned long offset, addr = (unsigned long)from;
 	unsigned long size, len = 0;
+	unsigned long flags;
 	struct page *page;
 	void *map;
 	int ret;
@@ -26,9 +29,12 @@ copy_from_user_nmi(void *to, const void
 		return len;
 
 	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
 			break;
+		}
 
 		offset = addr & (PAGE_SIZE - 1);
 		size = min(PAGE_SIZE - offset, n - len);
@@ -36,7 +42,7 @@ copy_from_user_nmi(void *to, const void
 		map = kmap_atomic(page);
 		memcpy(to, map+offset, size);
 		kunmap_atomic(map);
-		put_page(page);
+		local_irq_restore(flags);
 
 		len  += size;
 		to   += size;
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -63,19 +63,22 @@ static inline pte_t gup_get_pte(pte_t *p
 #endif
 }
 
+#define GUPF_GET	0x01
+#define GUPF_WRITE	0x02
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
  * register pressure.
  */
 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t *ptep;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 
 	ptep = pte_offset_map(&pmd, addr);
@@ -89,7 +92,8 @@ static noinline int gup_pte_range(pmd_t
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
-		get_page(page);
+		if (flags & GUPF_GET)
+			get_page(page);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -109,7 +113,7 @@ static inline void get_head_page_multipl
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pmd;
@@ -117,7 +121,7 @@ static noinline int gup_huge_pmd(pmd_t p
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -131,19 +135,20 @@ static noinline int gup_huge_pmd(pmd_t p
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-		int write, struct page **pages, int *nr)
+		int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pmd_t *pmdp;
@@ -167,10 +172,10 @@ static int gup_pmd_range(pud_t pud, unsi
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+			if (!gup_huge_pmd(pmd, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pmdp++, addr = next, addr != end);
@@ -179,7 +184,7 @@ static int gup_pmd_range(pud_t pud, unsi
 }
 
 static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
+		unsigned long end, int flags, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	pte_t pte = *(pte_t *)&pud;
@@ -187,7 +192,7 @@ static noinline int gup_huge_pud(pud_t p
 	int refs;
 
 	mask = _PAGE_PRESENT|_PAGE_USER;
-	if (write)
+	if (flags & GUPF_WRITE)
 		mask |= _PAGE_RW;
 	if ((pte_flags(pte) & mask) != mask)
 		return 0;
@@ -201,19 +206,20 @@ static noinline int gup_huge_pud(pud_t p
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
+		if ((flags & GUPF_GET) && PageTail(page))
 			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
-	get_head_page_multiple(head, refs);
+	if (flags & GUPF_GET)
+		get_head_page_multiple(head, refs);
 
 	return 1;
 }
 
 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
-			int write, struct page **pages, int *nr)
+			int flags, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
@@ -226,10 +232,10 @@ static int gup_pud_range(pgd_t pgd, unsi
 		if (pud_none(pud))
 			return 0;
 		if (unlikely(pud_large(pud))) {
-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+			if (!gup_huge_pud(pud, addr, next, flags, pages, nr))
 				return 0;
 		} else {
-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
 				return 0;
 		}
 	} while (pudp++, addr = next, addr != end);
@@ -241,13 +247,12 @@ static int gup_pud_range(pgd_t pgd, unsi
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
  */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+int ___get_user_pages_fast(unsigned long start, int nr_pages, int flags,
 			  struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr, len, end;
 	unsigned long next;
-	unsigned long flags;
 	pgd_t *pgdp;
 	int nr = 0;
 
@@ -255,7 +260,7 @@ int __get_user_pages_fast(unsigned long
 	addr = start;
 	len = (unsigned long) nr_pages << PAGE_SHIFT;
 	end = start + len;
-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+	if (unlikely(!access_ok((flags & GUPF_WRITE) ? VERIFY_WRITE : VERIFY_READ,
 					(void __user *)start, len)))
 		return 0;
 
@@ -277,7 +282,6 @@ int __get_user_pages_fast(unsigned long
 	 * (which we do on x86, with the above PAE exception), we can follow the
 	 * address down to the the page and take a ref on it.
 	 */
-	local_irq_save(flags);
 	pgdp = pgd_offset(mm, addr);
 	do {
 		pgd_t pgd = *pgdp;
@@ -285,14 +289,27 @@ int __get_user_pages_fast(unsigned long
 		next = pgd_addr_end(addr, end);
 		if (pgd_none(pgd))
 			break;
-		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		if (!gup_pud_range(pgd, addr, next, flags, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
-	local_irq_restore(flags);
 
 	return nr;
 }
 
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = ___get_user_pages_fast(start, nr_pages,
+			GUPF_GET | (write ? GUPF_WRITE : 0), pages);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 21:08                             ` Peter Zijlstra
@ 2013-10-17 21:11                               ` Peter Zijlstra
  0 siblings, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 21:11 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 11:08:16PM +0200, Peter Zijlstra wrote:
> I did a patch that avoids the page count mucking about, Don didn't see
> any significant improvements from it.

On top of which there's another patch -- which could as easily be done
without it, that adds some state to the copy_from_user_nmi() and avoids
re-doing the page walk and memcpy().

Don also tried a variant of the below for the pebs fixup code; that
turned out to be slower than the single copy all at once.

The below -- as stated in the changelog -- might still improve the
frame-pointer chase, but I've no numbers what so ever atm.

---
Subject: perf, x86: Optimize copy_from_user_nmi()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 17 Oct 2013 11:41:45 +0200

In an attempt to speed up the frame-pointer chase we introduce a new
copy_from_user_nmi() interface:

  struct copy_from_user_nmi_state;

  void *copy_from_user_nmi_iter(void *to, void __user *from, int s,
				struct copy_from_user_nmi_state *state);
  void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);

The _iter() method returns a pointer to the memory requested; if this
is entirely contained within one page it simply returns a pointer into
the kmap and avoids the copy. Otherwise it will copy into the buffer
provided in the @to argument and return a pointer thereto.

Because we potentially need to keep the kmap alive, we need the
closing _end() function and the @state variable.

Since we keep state, we also avoid the page-table walk for consecutive
accesses to the same page.

Using this we (hopefully) reduce the number of page-table walks (and
kmap on i386) operations.

Cc: Don Zickus <dzickus@redhat.com>
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: jmario@redhat.com
Cc: acme@infradead.org
Cc: mingo@kernel.org
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131017094145.GE3364@laptop.programming.kicks-ass.net
---
 arch/x86/include/asm/uaccess.h   |   13 ++++++
 arch/x86/kernel/cpu/perf_event.c |   32 ++++++---------
 arch/x86/lib/usercopy.c          |   79 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 18 deletions(-)

--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -516,6 +516,19 @@ struct __large_struct { unsigned long bu
 
 extern unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
+struct copy_from_user_nmi_state {
+	void *map;
+	unsigned long address;
+	unsigned long flags;
+};
+
+extern void *
+copy_from_user_nmi_iter(void *to, const void __user *from,
+			unsigned long n, struct copy_from_user_nmi_state *state);
+extern void
+copy_from_user_nmi_end(struct copy_from_user_nmi_state *state);
+
 extern __must_check long
 strncpy_from_user(char *dst, const char __user *src, long count);
 
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1979,8 +1979,9 @@ static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
+	struct copy_from_user_nmi_state state = { NULL };
 	unsigned long ss_base, cs_base;
-	struct stack_frame_ia32 frame;
+	struct stack_frame_ia32 frame, *f;
 	const void __user *fp;
 
 	if (!test_thread_flag(TIF_IA32))
@@ -1991,20 +1992,17 @@ perf_callchain_user32(struct pt_regs *re
 
 	fp = compat_ptr(ss_base + regs->bp);
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
-		unsigned long bytes;
-		frame.next_frame     = 0;
-		frame.return_address = 0;
-
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
+		if (!f)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, cs_base + frame.return_address);
-		fp = compat_ptr(ss_base + frame.next_frame);
+		perf_callchain_store(entry, cs_base + f->return_address);
+		fp = compat_ptr(ss_base + f->next_frame);
 	}
+	copy_from_user_nmi_end(&state);
 	return 1;
 }
 #else
@@ -2018,7 +2016,8 @@ perf_callchain_user32(struct pt_regs *re
 void
 perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	struct stack_frame frame;
+	struct copy_from_user_nmi_state state = { NULL };
+	struct stack_frame frame, *f;
 	const void __user *fp;
 
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -2043,20 +2042,17 @@ perf_callchain_user(struct perf_callchai
 		return;
 
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
-		unsigned long bytes;
-		frame.next_frame	     = NULL;
-		frame.return_address = 0;
-
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-		if (bytes != sizeof(frame))
+		f = copy_from_user_nmi_iter(&frame, fp, sizeof(frame), &state);
+		if (!f)
 			break;
 
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, frame.return_address);
-		fp = frame.next_frame;
+		perf_callchain_store(entry, f->return_address);
+		fp = f->next_frame;
 	}
+	copy_from_user_nmi_end(&state);
 }
 
 /*
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -53,3 +53,82 @@ copy_from_user_nmi(void *to, const void
 	return len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
+
+void *copy_from_user_nmi_iter(void *to, const void __user *from,
+		unsigned long n, struct copy_from_user_nmi_state *state)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	unsigned long size, len = 0;
+	unsigned long flags;
+	struct page *page;
+	void *map, *_to = to;
+	int ret;
+
+	if (__range_not_ok(from, n, TASK_SIZE))
+		return NULL;
+
+	if (state->map) {
+		if ((state->address >> PAGE_SHIFT) ==
+		    (addr >> PAGE_SHIFT)) {
+			flags = state->flags;
+			map = state->map;
+			goto got_page;
+		}
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+	}
+
+	for (;;) {
+		local_irq_save(flags);
+		ret = ___get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret) {
+			local_irq_restore(flags);
+			state->map = NULL;
+			return NULL;
+		}
+
+		map = kmap_atomic(page);
+
+got_page:
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		/*
+		 * If the entire desired range falls within the one page
+		 * avoid the copy and return a pointer into the kmap.
+		 */
+		if (size == n) {
+			_to = map + offset;
+			break;
+		}
+
+		memcpy(to, map+offset, size);
+		len += size;
+
+		if (len == n)
+			break;
+
+		to   += size;
+		addr += size;
+
+		kunmap_atomic(map);
+		local_irq_restore(flags);
+	}
+
+	state->address = addr;
+	state->flags = flags;
+	state->map = map;
+
+	return _to;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_iter);
+
+void copy_from_user_nmi_end(struct copy_from_user_nmi_state *state)
+{
+	if (state->map) {
+		kunmap_atomic(state->map);
+		local_irq_restore(state->flags);
+		state->map = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi_end);

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 18:26                           ` Linus Torvalds
  2013-10-17 21:08                             ` Peter Zijlstra
@ 2013-10-17 22:01                             ` Peter Zijlstra
  2013-10-17 22:27                               ` Linus Torvalds
  1 sibling, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-17 22:01 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 11:26:23AM -0700, Linus Torvalds wrote:
> On Thu, Oct 17, 2013 at 9:30 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > So avoid having to call copy_from_user_nmi() for every instruction.
> > Since we already limit the max basic block size, we can easily
> > pre-allocate a piece of memory to copy the entire thing into in one
> > go.
> 
> copy_from_user_nmi() itself is all kinds of nasty.

Oh wait,.. now that Steven fixed being able to take faults from NMI
context; we could actually try copy_from_user_inatomic(). Being able to
directly access userspace would make the whole deal a lot easier again.

I'll go try this tomorrow.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 22:01                             ` Peter Zijlstra
@ 2013-10-17 22:27                               ` Linus Torvalds
  2013-10-22 21:12                                 ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2013-10-17 22:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 3:01 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Oh wait,.. now that Steven fixed being able to take faults from NMI
> context; we could actually try copy_from_user_inatomic(). Being able to
> directly access userspace would make the whole deal a lot easier again.

Careful! There is one magic piece of state that you need to
save-and-restore if you do this, namely %cr2. Taking a page fault
always writes to %cr2, and we must *not* corrupt it in the NMI
handler.

Also, right now, it looks like we call notify_page_fault() in the
atomic page fault case, and that would be deadly from within an NMI.

But if you move the "in_atomic()" check earlier in __do_page_fault(),
you can *try* to do something like this:

  unsigned long
  copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
  {
        unsigned long cr2, flags,ret;

        if (__range_not_ok(from, n, TASK_SIZE))
                return 0;
        local_irq_save(flags);
        cr2 = read_cr2();
        ret = __copy_from_user_inatomic(to, from, n);
        /* Reading cr2 is likely much faster than writing it - but go
check this.. */
        if (cr2 != read_cr2())
                write_cr2(cr2);
        local_irq_restore(flags);
        return n - ret;
  }

or something close to that. But you absolutely *have* to save/restore
%cr2 (the above tries to avoid writing it if it didn't change,
somebody should check the timings on that to see whether it makes
sense or not).

             Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-17 22:27                               ` Linus Torvalds
@ 2013-10-22 21:12                                 ` Peter Zijlstra
  2013-10-23  7:09                                   ` Linus Torvalds
  2013-10-23  7:44                                   ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Ingo Molnar
  0 siblings, 2 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-22 21:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Thu, Oct 17, 2013 at 03:27:48PM -0700, Linus Torvalds wrote:
> On Thu, Oct 17, 2013 at 3:01 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Oh wait,.. now that Steven fixed being able to take faults from NMI
> > context; we could actually try copy_from_user_inatomic(). Being able to
> > directly access userspace would make the whole deal a lot easier again.
> 
> Careful! There is one magic piece of state that you need to
> save-and-restore if you do this, namely %cr2. Taking a page fault
> always writes to %cr2, and we must *not* corrupt it in the NMI
> handler.

It looks like this is already dealt with (a similar thing is done for
i386).

---
commit 7fbb98c5cb07563d3ee08714073a8e5452a96be2
Author: Steven Rostedt <srostedt@redhat.com>
Date:   Thu Jun 7 10:21:21 2012 -0400

    x86: Save cr2 in NMI in case NMIs take a page fault
    
    Avi Kivity reported that page faults in NMIs could cause havoc if
    the NMI preempted another page fault handler:
    
       The recent changes to NMI allow exceptions to take place in NMI
       handlers, but I think that a #PF (say, due to access to vmalloc space)
       is still problematic.  Consider the sequence
    
        #PF  (cr2 set by processor)
          NMI
            ...
            #PF (cr2 clobbered)
              do_page_fault()
              IRET
            ...
            IRET
          do_page_fault()
            address = read_cr2()
    
       The last line reads the overwritten cr2 value.
    
    Originally I wrote a patch to solve this by saving the cr2 on the stack.
    Brian Gerst suggested to save it in the r12 register as both r12 and rbx
    are saved by the do_nmi handler as required by the C standard. But rbx
    is already used for saving if swapgs needs to be run on exit of the NMI
    handler.
    
    Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com
    Link: http://lkml.kernel.org/r/1337763411.13348.140.camel@gandalf.stny.rr.com
    
    Reported-by: Avi Kivity <avi@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: H. Peter Anvin <hpa@zytor.com>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Suggested-by: Brian Gerst <brgerst@gmail.com>
    Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7d65133..111f6bb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1758,10 +1758,30 @@ ENTRY(nmi)
 	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
+
+	/*
+	 * Save off the CR2 register. If we take a page fault in the NMI then
+	 * it could corrupt the CR2 value. If the NMI preempts a page fault
+	 * handler before it was able to read the CR2 register, and then the
+	 * NMI itself takes a page fault, the page fault that was preempted
+	 * will read the information from the NMI page fault and not the
+	 * origin fault. Save it off and restore it if it changes.
+	 * Use the r12 callee-saved register.
+	 */
+	movq %cr2, %r12
+
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
+
+	/* Did the NMI take a page fault? Restore cr2 if it did */
+	movq %cr2, %rcx
+	cmpq %rcx, %r12
+	je 1f
+	movq %r12, %cr2
+1:
+	
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
 nmi_swapgs:

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-22 21:12                                 ` Peter Zijlstra
@ 2013-10-23  7:09                                   ` Linus Torvalds
  2013-10-23 20:48                                     ` Peter Zijlstra
  2013-10-23  7:44                                   ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Ingo Molnar
  1 sibling, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2013-10-23  7:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Tue, Oct 22, 2013 at 10:12 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Careful! There is one magic piece of state that you need to
>> save-and-restore if you do this, namely %cr2. Taking a page fault
>> always writes to %cr2, and we must *not* corrupt it in the NMI
>> handler.
>
> It looks like this is already dealt with (a similar thing is done for
> i386).

Oh, ok then, we should be good to go. I wonder why we needed that
special "_nmi()" version, then..

Please do check that NMI increment the irq-counts etc.. Otherwise
you'll need to add the explicit "pagefault_disable/enable()" pair
around the __copy_from_user_inatomic()..

            Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-22 21:12                                 ` Peter Zijlstra
  2013-10-23  7:09                                   ` Linus Torvalds
@ 2013-10-23  7:44                                   ` Ingo Molnar
  1 sibling, 0 replies; 47+ messages in thread
From: Ingo Molnar @ 2013-10-23  7:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Don Zickus, Andi Kleen, dave.hansen,
	Stephane Eranian, jmario, Linux Kernel Mailing List,
	Arnaldo Carvalho de Melo


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Oct 17, 2013 at 03:27:48PM -0700, Linus Torvalds wrote:
> > On Thu, Oct 17, 2013 at 3:01 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Oh wait,.. now that Steven fixed being able to take faults from NMI
> > > context; we could actually try copy_from_user_inatomic(). Being able to
> > > directly access userspace would make the whole deal a lot easier again.
> > 
> > Careful! There is one magic piece of state that you need to
> > save-and-restore if you do this, namely %cr2. Taking a page fault
> > always writes to %cr2, and we must *not* corrupt it in the NMI
> > handler.
> 
> It looks like this is already dealt with (a similar thing is done 
> for i386).

> commit 7fbb98c5cb07563d3ee08714073a8e5452a96be2
> Author: Steven Rostedt <srostedt@redhat.com>
> Date:   Thu Jun 7 10:21:21 2012 -0400
> 
>     x86: Save cr2 in NMI in case NMIs take a page fault
>     
>     Avi Kivity reported that page faults in NMIs could cause havoc if
>     the NMI preempted another page fault handler:
>     
>        The recent changes to NMI allow exceptions to take place in NMI
>        handlers, but I think that a #PF (say, due to access to vmalloc space)
>        is still problematic.  Consider the sequence
>     
>         #PF  (cr2 set by processor)
>           NMI
>             ...
>             #PF (cr2 clobbered)
>               do_page_fault()
>               IRET
>             ...
>             IRET
>           do_page_fault()
>             address = read_cr2()
>     
>        The last line reads the overwritten cr2 value.
>     
>     Originally I wrote a patch to solve this by saving the cr2 on the stack.
>     Brian Gerst suggested to save it in the r12 register as both r12 and rbx
>     are saved by the do_nmi handler as required by the C standard. But rbx
>     is already used for saving if swapgs needs to be run on exit of the NMI
>     handler.
>     
>     Link: http://lkml.kernel.org/r/4FBB8C40.6080304@redhat.com
>     Link: http://lkml.kernel.org/r/1337763411.13348.140.camel@gandalf.stny.rr.com
>     
>     Reported-by: Avi Kivity <avi@redhat.com>
>     Cc: Linus Torvalds <torvalds@linux-foundation.org>
>     Cc: H. Peter Anvin <hpa@zytor.com>
>     Cc: Thomas Gleixner <tglx@linutronix.de>
>     Suggested-by: Brian Gerst <brgerst@gmail.com>
>     Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> 
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 7d65133..111f6bb 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1758,10 +1758,30 @@ ENTRY(nmi)
>  	 */
>  	call save_paranoid
>  	DEFAULT_FRAME 0
> +
> +	/*
> +	 * Save off the CR2 register. If we take a page fault in the NMI then
> +	 * it could corrupt the CR2 value. If the NMI preempts a page fault
> +	 * handler before it was able to read the CR2 register, and then the
> +	 * NMI itself takes a page fault, the page fault that was preempted
> +	 * will read the information from the NMI page fault and not the
> +	 * origin fault. Save it off and restore it if it changes.
> +	 * Use the r12 callee-saved register.
> +	 */
> +	movq %cr2, %r12

> +	/* Did the NMI take a page fault? Restore cr2 if it did */
> +	movq %cr2, %rcx
> +	cmpq %rcx, %r12
> +	je 1f
> +	movq %r12, %cr2
> +1:
> +	

Btw., depending on how expensive cr2 is to read this could be 
optimized a bit, by matching on the race window RIP range instead of 
saving cr2 unconditionally.

It would take a bit of restructuring of the page fault entry path to 
get the race window into a single RIP range, but that should be 
fairly trivial and might even speed up the page fault handler as the 
CPU can start the cr2 load sooner in the page fault handling 
sequence.

It's all a function of how expensive a cr2 read is. If it's 2 cycles 
then it doesn't matter much. If it's 20 then it might be worthwhile.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-23  7:09                                   ` Linus Torvalds
@ 2013-10-23 20:48                                     ` Peter Zijlstra
  2013-10-24 10:52                                       ` Peter Zijlstra
  0 siblings, 1 reply; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-23 20:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Wed, Oct 23, 2013 at 08:09:53AM +0100, Linus Torvalds wrote:
> On Tue, Oct 22, 2013 at 10:12 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> >>
> >> Careful! There is one magic piece of state that you need to
> >> save-and-restore if you do this, namely %cr2. Taking a page fault
> >> always writes to %cr2, and we must *not* corrupt it in the NMI
> >> handler.
> >
> > It looks like this is already dealt with (a similar thing is done for
> > i386).
> 
> Oh, ok then, we should be good to go. I wonder why we needed that
> special "_nmi()" version, then..

Ah, the whole fault from nmi trickery from Steve is from after we did
the copy_from_user_nmi() thing. We're only just catching up :-)

> Please do check that NMI increment the irq-counts etc.. Otherwise
> you'll need to add the explicit "pagefault_disable/enable()" pair
> around the __copy_from_user_inatomic()..

Yeah, we add NMI_OFFSET to preempt_count on nmi_enter.

I'll also make sure to test we actually hit the fault path
by concurrently running something like:

 while :; echo 1 > /proc/sys/vm/drop_caches ; done

while doing perf top or so.. 

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-23 20:48                                     ` Peter Zijlstra
@ 2013-10-24 10:52                                       ` Peter Zijlstra
  2013-10-24 13:47                                         ` Don Zickus
                                                           ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-24 10:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Don Zickus, Andi Kleen, dave.hansen, Stephane Eranian, jmario,
	Linux Kernel Mailing List, Arnaldo Carvalho de Melo, Ingo Molnar

On Wed, Oct 23, 2013 at 10:48:38PM +0200, Peter Zijlstra wrote:
> I'll also make sure to test we actually hit the fault path
> by concurrently running something like:
> 
>  while :; echo 1 > /proc/sys/vm/drop_caches ; done
> 
> while doing perf top or so.. 

So the below appears to work; I've ran:

  while :; do echo 1 > /proc/sys/vm/drop_caches; sleep 1; done &
  while :; do make O=defconfig-build/ clean; perf record -a -g fp -e cycles:pp make O=defconfig-build/ -s -j64; done

And verified that the if (in_nmi()) trace_printk() was visible in the
trace output verifying we indeed took the fault from the NMI code.

I've had this running for ~ 30 minutes or so and the machine is still
healthy.

Don, can you give this stuff a spin on your system?

---
 arch/x86/lib/usercopy.c | 43 +++++++++++++++----------------------------
 arch/x86/mm/fault.c     | 43 +++++++++++++++++++++++--------------------
 2 files changed, 38 insertions(+), 48 deletions(-)

diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94c8d97..5465b8613944 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -11,39 +11,26 @@
 #include <linux/sched.h>
 
 /*
- * best effort, GUP based copy_from_user() that is NMI-safe
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
  */
 unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 {
-	unsigned long offset, addr = (unsigned long)from;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
+	unsigned long ret;
 
 	if (__range_not_ok(from, n, TASK_SIZE))
-		return len;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
+		return 0;
+
+	/*
+	 * Even though this function is typically called from NMI/IRQ context
+	 * disable pagefaults so that its behaviour is consistent even when
+	 * called from other contexts.
+	 */
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(to, from, n);
+	pagefault_enable();
+
+	return n - ret;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3aaeffcfd67a..506564b13ba7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -51,7 +51,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes kprobes_fault(struct pt_regs *regs)
 {
 	int ret = 0;
 
@@ -1048,7 +1048,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			return;
 
 		/* kprobes don't want to hook the spurious faults: */
-		if (notify_page_fault(regs))
+		if (kprobes_fault(regs))
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -1060,23 +1060,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (unlikely(notify_page_fault(regs)))
+	if (unlikely(kprobes_fault(regs)))
 		return;
-	/*
-	 * It's safe to allow irq's after cr2 has been saved and the
-	 * vmalloc fault has been handled.
-	 *
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet:
-	 */
-	if (user_mode_vm(regs)) {
-		local_irq_enable();
-		error_code |= PF_USER;
-		flags |= FAULT_FLAG_USER;
-	} else {
-		if (regs->flags & X86_EFLAGS_IF)
-			local_irq_enable();
-	}
 
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
@@ -1088,17 +1073,35 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		}
 	}
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
 	 */
 	if (unlikely(in_atomic() || !mm)) {
+		if (in_nmi())
+			trace_printk("YIPEE!!!\n");
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
 
+	/*
+	 * It's safe to allow irq's after cr2 has been saved and the
+	 * vmalloc fault has been handled.
+	 *
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet:
+	 */
+	if (user_mode_vm(regs)) {
+		local_irq_enable();
+		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
+	} else {
+		if (regs->flags & X86_EFLAGS_IF)
+			local_irq_enable();
+	}
+
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
 	if (error_code & PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
 

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-24 10:52                                       ` Peter Zijlstra
@ 2013-10-24 13:47                                         ` Don Zickus
  2013-10-24 14:06                                           ` Peter Zijlstra
  2013-10-25 16:33                                         ` Don Zickus
  2013-10-29 14:08                                         ` [tip:perf/core] perf/x86: Further optimize copy_from_user_nmi() tip-bot for Peter Zijlstra
  2 siblings, 1 reply; 47+ messages in thread
From: Don Zickus @ 2013-10-24 13:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, dave.hansen, Stephane Eranian,
	jmario, Linux Kernel Mailing List, Arnaldo Carvalho de Melo,
	Ingo Molnar

On Thu, Oct 24, 2013 at 12:52:06PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 23, 2013 at 10:48:38PM +0200, Peter Zijlstra wrote:
> > I'll also make sure to test we actually hit the fault path
> > by concurrently running something like:
> > 
> >  while :; echo 1 > /proc/sys/vm/drop_caches ; done
> > 
> > while doing perf top or so.. 
> 
> So the below appears to work; I've ran:
> 
>   while :; do echo 1 > /proc/sys/vm/drop_caches; sleep 1; done &
>   while :; do make O=defconfig-build/ clean; perf record -a -g fp -e cycles:pp make O=defconfig-build/ -s -j64; done
> 
> And verified that the if (in_nmi()) trace_printk() was visible in the
> trace output verifying we indeed took the fault from the NMI code.
> 
> I've had this running for ~ 30 minutes or so and the machine is still
> healthy.
> 
> Don, can you give this stuff a spin on your system?

I'll try to grab the machine I was testing with and see what this patch
does.  Thanks!  I assume this can go on top of the other patch that was
committed to -tip last week?

Cheers,
Don

> 
> ---
>  arch/x86/lib/usercopy.c | 43 +++++++++++++++----------------------------
>  arch/x86/mm/fault.c     | 43 +++++++++++++++++++++++--------------------
>  2 files changed, 38 insertions(+), 48 deletions(-)
> 
> diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
> index 4f74d94c8d97..5465b8613944 100644
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -11,39 +11,26 @@
>  #include <linux/sched.h>
>  
>  /*
> - * best effort, GUP based copy_from_user() that is NMI-safe
> + * We rely on the nested NMI work to allow atomic faults from the NMI path; the
> + * nested NMI paths are careful to preserve CR2.
>   */
>  unsigned long
>  copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  {
> -	unsigned long offset, addr = (unsigned long)from;
> -	unsigned long size, len = 0;
> -	struct page *page;
> -	void *map;
> -	int ret;
> +	unsigned long ret;
>  
>  	if (__range_not_ok(from, n, TASK_SIZE))
> -		return len;
> -
> -	do {
> -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> -		if (!ret)
> -			break;
> -
> -		offset = addr & (PAGE_SIZE - 1);
> -		size = min(PAGE_SIZE - offset, n - len);
> -
> -		map = kmap_atomic(page);
> -		memcpy(to, map+offset, size);
> -		kunmap_atomic(map);
> -		put_page(page);
> -
> -		len  += size;
> -		to   += size;
> -		addr += size;
> -
> -	} while (len < n);
> -
> -	return len;
> +		return 0;
> +
> +	/*
> +	 * Even though this function is typically called from NMI/IRQ context
> +	 * disable pagefaults so that its behaviour is consistent even when
> +	 * called from other contexts.
> +	 */
> +	pagefault_disable();
> +	ret = __copy_from_user_inatomic(to, from, n);
> +	pagefault_enable();
> +
> +	return n - ret;
>  }
>  EXPORT_SYMBOL_GPL(copy_from_user_nmi);
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 3aaeffcfd67a..506564b13ba7 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -51,7 +51,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
>  	return 0;
>  }
>  
> -static inline int __kprobes notify_page_fault(struct pt_regs *regs)
> +static inline int __kprobes kprobes_fault(struct pt_regs *regs)
>  {
>  	int ret = 0;
>  
> @@ -1048,7 +1048,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  			return;
>  
>  		/* kprobes don't want to hook the spurious faults: */
> -		if (notify_page_fault(regs))
> +		if (kprobes_fault(regs))
>  			return;
>  		/*
>  		 * Don't take the mm semaphore here. If we fixup a prefetch
> @@ -1060,23 +1060,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  	}
>  
>  	/* kprobes don't want to hook the spurious faults: */
> -	if (unlikely(notify_page_fault(regs)))
> +	if (unlikely(kprobes_fault(regs)))
>  		return;
> -	/*
> -	 * It's safe to allow irq's after cr2 has been saved and the
> -	 * vmalloc fault has been handled.
> -	 *
> -	 * User-mode registers count as a user access even for any
> -	 * potential system fault or CPU buglet:
> -	 */
> -	if (user_mode_vm(regs)) {
> -		local_irq_enable();
> -		error_code |= PF_USER;
> -		flags |= FAULT_FLAG_USER;
> -	} else {
> -		if (regs->flags & X86_EFLAGS_IF)
> -			local_irq_enable();
> -	}
>  
>  	if (unlikely(error_code & PF_RSVD))
>  		pgtable_bad(regs, error_code, address);
> @@ -1088,17 +1073,35 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  		}
>  	}
>  
> -	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
> -
>  	/*
>  	 * If we're in an interrupt, have no user context or are running
>  	 * in an atomic region then we must not take the fault:
>  	 */
>  	if (unlikely(in_atomic() || !mm)) {
> +		if (in_nmi())
> +			trace_printk("YIPEE!!!\n");
>  		bad_area_nosemaphore(regs, error_code, address);
>  		return;
>  	}
>  
> +	/*
> +	 * It's safe to allow irq's after cr2 has been saved and the
> +	 * vmalloc fault has been handled.
> +	 *
> +	 * User-mode registers count as a user access even for any
> +	 * potential system fault or CPU buglet:
> +	 */
> +	if (user_mode_vm(regs)) {
> +		local_irq_enable();
> +		error_code |= PF_USER;
> +		flags |= FAULT_FLAG_USER;
> +	} else {
> +		if (regs->flags & X86_EFLAGS_IF)
> +			local_irq_enable();
> +	}
> +
> +	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
> +
>  	if (error_code & PF_WRITE)
>  		flags |= FAULT_FLAG_WRITE;
>  

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-24 13:47                                         ` Don Zickus
@ 2013-10-24 14:06                                           ` Peter Zijlstra
  0 siblings, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-24 14:06 UTC (permalink / raw)
  To: Don Zickus
  Cc: Linus Torvalds, Andi Kleen, dave.hansen, Stephane Eranian,
	jmario, Linux Kernel Mailing List, Arnaldo Carvalho de Melo,
	Ingo Molnar

On Thu, Oct 24, 2013 at 09:47:06AM -0400, Don Zickus wrote:
> > Don, can you give this stuff a spin on your system?
> 
> I'll try to grab the machine I was testing with and see what this patch
> does.  Thanks!  I assume this can go on top of the other patch that was
> committed to -tip last week?

Yeah, I had that patch in my tree so it should apply fine.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-24 10:52                                       ` Peter Zijlstra
  2013-10-24 13:47                                         ` Don Zickus
@ 2013-10-25 16:33                                         ` Don Zickus
  2013-10-25 17:03                                           ` Peter Zijlstra
  2013-10-26 10:36                                           ` Ingo Molnar
  2013-10-29 14:08                                         ` [tip:perf/core] perf/x86: Further optimize copy_from_user_nmi() tip-bot for Peter Zijlstra
  2 siblings, 2 replies; 47+ messages in thread
From: Don Zickus @ 2013-10-25 16:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, dave.hansen, Stephane Eranian,
	jmario, Linux Kernel Mailing List, Arnaldo Carvalho de Melo,
	Ingo Molnar

On Thu, Oct 24, 2013 at 12:52:06PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 23, 2013 at 10:48:38PM +0200, Peter Zijlstra wrote:
> > I'll also make sure to test we actually hit the fault path
> > by concurrently running something like:
> > 
> >  while :; echo 1 > /proc/sys/vm/drop_caches ; done
> > 
> > while doing perf top or so.. 
> 
> So the below appears to work; I've ran:
> 
>   while :; do echo 1 > /proc/sys/vm/drop_caches; sleep 1; done &
>   while :; do make O=defconfig-build/ clean; perf record -a -g fp -e cycles:pp make O=defconfig-build/ -s -j64; done
> 
> And verified that the if (in_nmi()) trace_printk() was visible in the
> trace output verifying we indeed took the fault from the NMI code.
> 
> I've had this running for ~ 30 minutes or so and the machine is still
> healthy.
> 
> Don, can you give this stuff a spin on your system?

Hi Peter,

I finally had a chance to run this on my machine.  From my testing, it
looks good.  Better performance numbers.  I think my longest latency went
from 300K cycles down to 150K cycles and very few of those (most are under
100K cycles).

I also don't see perf throttling me down to 1500 samples, it stops around
7000.  So I see progress with this patch. :-)

Thanks!

Cheers,
Don

> 
> ---
>  arch/x86/lib/usercopy.c | 43 +++++++++++++++----------------------------
>  arch/x86/mm/fault.c     | 43 +++++++++++++++++++++++--------------------
>  2 files changed, 38 insertions(+), 48 deletions(-)
> 
> diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
> index 4f74d94c8d97..5465b8613944 100644
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -11,39 +11,26 @@
>  #include <linux/sched.h>
>  
>  /*
> - * best effort, GUP based copy_from_user() that is NMI-safe
> + * We rely on the nested NMI work to allow atomic faults from the NMI path; the
> + * nested NMI paths are careful to preserve CR2.
>   */
>  unsigned long
>  copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>  {
> -	unsigned long offset, addr = (unsigned long)from;
> -	unsigned long size, len = 0;
> -	struct page *page;
> -	void *map;
> -	int ret;
> +	unsigned long ret;
>  
>  	if (__range_not_ok(from, n, TASK_SIZE))
> -		return len;
> -
> -	do {
> -		ret = __get_user_pages_fast(addr, 1, 0, &page);
> -		if (!ret)
> -			break;
> -
> -		offset = addr & (PAGE_SIZE - 1);
> -		size = min(PAGE_SIZE - offset, n - len);
> -
> -		map = kmap_atomic(page);
> -		memcpy(to, map+offset, size);
> -		kunmap_atomic(map);
> -		put_page(page);
> -
> -		len  += size;
> -		to   += size;
> -		addr += size;
> -
> -	} while (len < n);
> -
> -	return len;
> +		return 0;
> +
> +	/*
> +	 * Even though this function is typically called from NMI/IRQ context
> +	 * disable pagefaults so that its behaviour is consistent even when
> +	 * called form other contexts.
> +	 */
> +	pagefault_disable();
> +	ret = __copy_from_user_inatomic(to, from, n);
> +	pagefault_enable();
> +
> +	return n - ret;
>  }
>  EXPORT_SYMBOL_GPL(copy_from_user_nmi);
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 3aaeffcfd67a..506564b13ba7 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -51,7 +51,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
>  	return 0;
>  }
>  
> -static inline int __kprobes notify_page_fault(struct pt_regs *regs)
> +static inline int __kprobes kprobes_fault(struct pt_regs *regs)
>  {
>  	int ret = 0;
>  
> @@ -1048,7 +1048,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  			return;
>  
>  		/* kprobes don't want to hook the spurious faults: */
> -		if (notify_page_fault(regs))
> +		if (kprobes_fault(regs))
>  			return;
>  		/*
>  		 * Don't take the mm semaphore here. If we fixup a prefetch
> @@ -1060,23 +1060,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  	}
>  
>  	/* kprobes don't want to hook the spurious faults: */
> -	if (unlikely(notify_page_fault(regs)))
> +	if (unlikely(kprobes_fault(regs)))
>  		return;
> -	/*
> -	 * It's safe to allow irq's after cr2 has been saved and the
> -	 * vmalloc fault has been handled.
> -	 *
> -	 * User-mode registers count as a user access even for any
> -	 * potential system fault or CPU buglet:
> -	 */
> -	if (user_mode_vm(regs)) {
> -		local_irq_enable();
> -		error_code |= PF_USER;
> -		flags |= FAULT_FLAG_USER;
> -	} else {
> -		if (regs->flags & X86_EFLAGS_IF)
> -			local_irq_enable();
> -	}
>  
>  	if (unlikely(error_code & PF_RSVD))
>  		pgtable_bad(regs, error_code, address);
> @@ -1088,17 +1073,35 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  		}
>  	}
>  
> -	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
> -
>  	/*
>  	 * If we're in an interrupt, have no user context or are running
>  	 * in an atomic region then we must not take the fault:
>  	 */
>  	if (unlikely(in_atomic() || !mm)) {
> +		if (in_nmi())
> +			trace_printk("YIPEE!!!\n");
>  		bad_area_nosemaphore(regs, error_code, address);
>  		return;
>  	}
>  
> +	/*
> +	 * It's safe to allow irq's after cr2 has been saved and the
> +	 * vmalloc fault has been handled.
> +	 *
> +	 * User-mode registers count as a user access even for any
> +	 * potential system fault or CPU buglet:
> +	 */
> +	if (user_mode_vm(regs)) {
> +		local_irq_enable();
> +		error_code |= PF_USER;
> +		flags |= FAULT_FLAG_USER;
> +	} else {
> +		if (regs->flags & X86_EFLAGS_IF)
> +			local_irq_enable();
> +	}
> +
> +	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
> +
>  	if (error_code & PF_WRITE)
>  		flags |= FAULT_FLAG_WRITE;
>  

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-25 16:33                                         ` Don Zickus
@ 2013-10-25 17:03                                           ` Peter Zijlstra
  2013-10-26 10:36                                           ` Ingo Molnar
  1 sibling, 0 replies; 47+ messages in thread
From: Peter Zijlstra @ 2013-10-25 17:03 UTC (permalink / raw)
  To: Don Zickus
  Cc: Linus Torvalds, Andi Kleen, dave.hansen, Stephane Eranian,
	jmario, Linux Kernel Mailing List, Arnaldo Carvalho de Melo,
	Ingo Molnar

On Fri, Oct 25, 2013 at 12:33:03PM -0400, Don Zickus wrote:
> Hi Peter,
> 
> I finally had a chance to run this on my machine.  From my testing, it
> looks good.  Better performance numbers.  I think my longest latency went
> from 300K cycles down to 150K cycles and very few of those (most are under
> 100K cycles).
> 
> I also don't see perf throttling me down to 1500 samples, it stops around
> 7000.  So I see progress with this patch. :-)

Awesome.. I'll write up a proper patch when I'm back home.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-25 16:33                                         ` Don Zickus
  2013-10-25 17:03                                           ` Peter Zijlstra
@ 2013-10-26 10:36                                           ` Ingo Molnar
  2013-10-28 13:19                                             ` Don Zickus
  1 sibling, 1 reply; 47+ messages in thread
From: Ingo Molnar @ 2013-10-26 10:36 UTC (permalink / raw)
  To: Don Zickus
  Cc: Peter Zijlstra, Linus Torvalds, Andi Kleen, dave.hansen,
	Stephane Eranian, jmario, Linux Kernel Mailing List,
	Arnaldo Carvalho de Melo


* Don Zickus <dzickus@redhat.com> wrote:

> On Thu, Oct 24, 2013 at 12:52:06PM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 23, 2013 at 10:48:38PM +0200, Peter Zijlstra wrote:
> > > I'll also make sure to test we actually hit the fault path
> > > by concurrently running something like:
> > > 
> > >  while :; echo 1 > /proc/sys/vm/drop_caches ; done
> > > 
> > > while doing perf top or so.. 
> > 
> > So the below appears to work; I've ran:
> > 
> >   while :; do echo 1 > /proc/sys/vm/drop_caches; sleep 1; done &
> >   while :; do make O=defconfig-build/ clean; perf record -a -g fp -e cycles:pp make O=defconfig-build/ -s -j64; done
> > 
> > And verified that the if (in_nmi()) trace_printk() was visible in the
> > trace output verifying we indeed took the fault from the NMI code.
> > 
> > I've had this running for ~ 30 minutes or so and the machine is still
> > healthy.
> > 
> > Don, can you give this stuff a spin on your system?
> 
> Hi Peter,
> 
> I finally had a chance to run this on my machine.  From my 
> testing, it looks good.  Better performance numbers.  I think my 
> longest latency went from 300K cycles down to 150K cycles and very 
> few of those (most are under 100K cycles).

Btw., do we know where those ~100k-150k cycles are spent 
specifically? 100k cycles is still an awful lot of time to spend in 
NMI context ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip()
  2013-10-26 10:36                                           ` Ingo Molnar
@ 2013-10-28 13:19                                             ` Don Zickus
  0 siblings, 0 replies; 47+ messages in thread
From: Don Zickus @ 2013-10-28 13:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Linus Torvalds, Andi Kleen, dave.hansen,
	Stephane Eranian, jmario, Linux Kernel Mailing List,
	Arnaldo Carvalho de Melo

On Sat, Oct 26, 2013 at 12:36:52PM +0200, Ingo Molnar wrote:
> 
> * Don Zickus <dzickus@redhat.com> wrote:
> 
> > On Thu, Oct 24, 2013 at 12:52:06PM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 23, 2013 at 10:48:38PM +0200, Peter Zijlstra wrote:
> > > > I'll also make sure to test we actually hit the fault path
> > > > by concurrently running something like:
> > > > 
> > > >  while :; echo 1 > /proc/sys/vm/drop_caches ; done
> > > > 
> > > > while doing perf top or so.. 
> > > 
> > > So the below appears to work; I've ran:
> > > 
> > >   while :; do echo 1 > /proc/sys/vm/drop_caches; sleep 1; done &
> > >   while :; do make O=defconfig-build/ clean; perf record -a -g fp -e cycles:pp make O=defconfig-build/ -s -j64; done
> > > 
> > > And verified that the if (in_nmi()) trace_printk() was visible in the
> > > trace output verifying we indeed took the fault from the NMI code.
> > > 
> > > I've had this running for ~ 30 minutes or so and the machine is still
> > > healthy.
> > > 
> > > Don, can you give this stuff a spin on your system?
> > 
> > Hi Peter,
> > 
> > I finally had a chance to run this on my machine.  From my 
> > testing, it looks good.  Better performance numbers.  I think my 
> > longest latency went from 300K cycles down to 150K cycles and very 
> > few of those (most are under 100K cycles).
> 
> Btw., do we know where those ~100k-150k cycles are spent 
> specifically? 100k cycles is still an awful lot of time to spend in 
> NMI context ...

I agree, there is still a bunch of latency in the nmi path.  I believe it
is still in the pebs code.  I share the machine with a colleague right
now, so I haven't been able to isolate it.

But going from a few hundred samples over a million cycles to about a
couple dozen over 100K was a big step.  :-)

I still see perf throttling and people are complaining about it here so I
still plan to keep investigating.  Just taking me a while.  Sorry about
that.

Cheers,
Don

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [tip:perf/urgent] perf/x86: Fix NMI measurements
  2013-10-17 13:33                 ` Peter Zijlstra
@ 2013-10-29 14:07                   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 47+ messages in thread
From: tip-bot for Peter Zijlstra @ 2013-10-29 14:07 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx, dzickus

Commit-ID:  e8a923cc1fff6e627f906655ad52ee694ef2f6d7
Gitweb:     http://git.kernel.org/tip/e8a923cc1fff6e627f906655ad52ee694ef2f6d7
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 17 Oct 2013 15:32:10 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 29 Oct 2013 12:01:20 +0100

perf/x86: Fix NMI measurements

OK, so what I'm actually seeing on my WSM is that sched/clock.c is
'broken' for the purpose we're using it for.

What triggered it is that my WSM-EP is broken :-(

  [    0.001000] tsc: Fast TSC calibration using PIT
  [    0.002000] tsc: Detected 2533.715 MHz processor
  [    0.500180] TSC synchronization [CPU#0 -> CPU#6]:
  [    0.505197] Measured 3 cycles TSC warp between CPUs, turning off TSC clock.
  [    0.004000] tsc: Marking TSC unstable due to check_tsc_sync_source failed

For some reason it consistently detects TSC skew, even though NHM+
should have a single clock domain for 'reasonable' systems.

This marks sched_clock_stable=0, which means that we do fancy stuff to
try and get a 'sane' clock. Part of this fancy stuff relies on the tick,
clearly that's gone when NOHZ=y. So for idle cpus time gets stuck, until
it either wakes up or gets kicked by another cpu.

While this is perfectly fine for the scheduler -- it only cares about
actually running stuff, and when we're running stuff we're obviously not
idle. This does somewhat break down for perf which can trigger events
just fine on an otherwise idle cpu.

So I've got NMIs that get 'measured' as taking ~1ms, which actually
don't last nearly that long:

          <idle>-0     [013] d.h.   886.311970: rcu_nmi_enter <-do_nmi
  ...
          <idle>-0     [013] d.h.   886.311997: perf_sample_event_took: HERE!!! : 1040990

So ftrace (which uses sched_clock(), not the fancy bits) only sees
~27us, but we measure ~1ms !!

Now since all this measurement stuff lives in x86 code, we can actually
fix it.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Link: http://lkml.kernel.org/r/20131017133350.GG3364@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 +++---
 arch/x86/kernel/nmi.c            | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d84491..8a87a32 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1276,16 +1276,16 @@ void perf_events_lapic_init(void)
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	int ret;
 	u64 start_clock;
 	u64 finish_clock;
+	int ret;
 
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
-	start_clock = local_clock();
+	start_clock = sched_clock();
 	ret = x86_pmu.handle_irq(regs);
-	finish_clock = local_clock();
+	finish_clock = sched_clock();
 
 	perf_sample_event_took(finish_clock - start_clock);
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ba77ebc..6fcb49c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -113,10 +113,10 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
 		u64 before, delta, whole_msecs;
 		int remainder_ns, decimal_msecs, thishandled;
 
-		before = local_clock();
+		before = sched_clock();
 		thishandled = a->handler(type, regs);
 		handled += thishandled;
-		delta = local_clock() - before;
+		delta = sched_clock() - before;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
 		if (delta < nmi_longest_ns)

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [tip:perf/core] perf/x86: Further optimize copy_from_user_nmi()
  2013-10-24 10:52                                       ` Peter Zijlstra
  2013-10-24 13:47                                         ` Don Zickus
  2013-10-25 16:33                                         ` Don Zickus
@ 2013-10-29 14:08                                         ` tip-bot for Peter Zijlstra
  2 siblings, 0 replies; 47+ messages in thread
From: tip-bot for Peter Zijlstra @ 2013-10-29 14:08 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, eranian, hpa, mingo, torvalds, peterz, acme, ak,
	tglx, dzickus

Commit-ID:  e00b12e64be9a34ef071de7b6052ca9ea29dd460
Gitweb:     http://git.kernel.org/tip/e00b12e64be9a34ef071de7b6052ca9ea29dd460
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 24 Oct 2013 12:52:06 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 29 Oct 2013 12:02:54 +0100

perf/x86: Further optimize copy_from_user_nmi()

Now that we can deal with nested NMI due to IRET re-enabling NMIs and
can deal with faults from NMI by making sure we preserve CR2 over NMIs
we can in fact simply access user-space memory from NMI context.

So rewrite copy_from_user_nmi() to use __copy_from_user_inatomic() and
rework the fault path to do the minimal required work before taking
the in_atomic() fault handler.

In particular avoid perf_sw_event() which would make perf recurse on
itself (it should be harmless as our recursion protections should be
able to deal with this -- but why tempt fate).

Also rename notify_page_fault() to kprobes_fault() as that is a much
better name; there is no notifier in it and its specific to kprobes.

Don measured that his worst case NMI path shrunk from ~300K cycles to
~150K cycles.

Cc: Stephane Eranian <eranian@google.com>
Cc: jmario@redhat.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: dave.hansen@linux.intel.com
Tested-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20131024105206.GM2490@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/usercopy.c | 43 +++++++++++++++----------------------------
 arch/x86/mm/fault.c     | 41 +++++++++++++++++++++--------------------
 2 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94..5465b86 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -11,39 +11,26 @@
 #include <linux/sched.h>
 
 /*
- * best effort, GUP based copy_from_user() that is NMI-safe
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
  */
 unsigned long
 copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 {
-	unsigned long offset, addr = (unsigned long)from;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
+	unsigned long ret;
 
 	if (__range_not_ok(from, n, TASK_SIZE))
-		return len;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
+		return 0;
+
+	/*
+	 * Even though this function is typically called from NMI/IRQ context
+	 * disable pagefaults so that its behaviour is consistent even when
+	 * called form other contexts.
+	 */
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(to, from, n);
+	pagefault_enable();
+
+	return n - ret;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3aaeffc..7a517bb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -51,7 +51,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes kprobes_fault(struct pt_regs *regs)
 {
 	int ret = 0;
 
@@ -1048,7 +1048,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			return;
 
 		/* kprobes don't want to hook the spurious faults: */
-		if (notify_page_fault(regs))
+		if (kprobes_fault(regs))
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -1060,23 +1060,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 
 	/* kprobes don't want to hook the spurious faults: */
-	if (unlikely(notify_page_fault(regs)))
+	if (unlikely(kprobes_fault(regs)))
 		return;
-	/*
-	 * It's safe to allow irq's after cr2 has been saved and the
-	 * vmalloc fault has been handled.
-	 *
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet:
-	 */
-	if (user_mode_vm(regs)) {
-		local_irq_enable();
-		error_code |= PF_USER;
-		flags |= FAULT_FLAG_USER;
-	} else {
-		if (regs->flags & X86_EFLAGS_IF)
-			local_irq_enable();
-	}
 
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
@@ -1088,8 +1073,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		}
 	}
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
@@ -1099,6 +1082,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	/*
+	 * It's safe to allow irq's after cr2 has been saved and the
+	 * vmalloc fault has been handled.
+	 *
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet:
+	 */
+	if (user_mode_vm(regs)) {
+		local_irq_enable();
+		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
+	} else {
+		if (regs->flags & X86_EFLAGS_IF)
+			local_irq_enable();
+	}
+
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
 	if (error_code & PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
 

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2013-10-29 14:09 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-14 20:35 x86, perf: throttling issues with long nmi latencies Don Zickus
2013-10-14 21:28 ` Andi Kleen
2013-10-15 10:14 ` Peter Zijlstra
2013-10-15 13:02   ` Peter Zijlstra
2013-10-15 14:32     ` Peter Zijlstra
2013-10-15 15:07       ` Peter Zijlstra
2013-10-15 15:41         ` Don Zickus
2013-10-16 10:57           ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Peter Zijlstra
2013-10-16 12:46             ` Don Zickus
2013-10-16 13:31               ` Peter Zijlstra
2013-10-16 13:54                 ` Don Zickus
2013-10-17 11:21                 ` Peter Zijlstra
2013-10-17 13:33                 ` Peter Zijlstra
2013-10-29 14:07                   ` [tip:perf/urgent] perf/x86: Fix NMI measurements tip-bot for Peter Zijlstra
2013-10-16 20:52             ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Andi Kleen
2013-10-16 21:03               ` Peter Zijlstra
2013-10-16 23:07                 ` Peter Zijlstra
2013-10-17  9:41                   ` Peter Zijlstra
2013-10-17 16:00                     ` Don Zickus
2013-10-17 16:04                       ` Don Zickus
2013-10-17 16:30                         ` Peter Zijlstra
2013-10-17 18:26                           ` Linus Torvalds
2013-10-17 21:08                             ` Peter Zijlstra
2013-10-17 21:11                               ` Peter Zijlstra
2013-10-17 22:01                             ` Peter Zijlstra
2013-10-17 22:27                               ` Linus Torvalds
2013-10-22 21:12                                 ` Peter Zijlstra
2013-10-23  7:09                                   ` Linus Torvalds
2013-10-23 20:48                                     ` Peter Zijlstra
2013-10-24 10:52                                       ` Peter Zijlstra
2013-10-24 13:47                                         ` Don Zickus
2013-10-24 14:06                                           ` Peter Zijlstra
2013-10-25 16:33                                         ` Don Zickus
2013-10-25 17:03                                           ` Peter Zijlstra
2013-10-26 10:36                                           ` Ingo Molnar
2013-10-28 13:19                                             ` Don Zickus
2013-10-29 14:08                                         ` [tip:perf/core] perf/x86: Further optimize copy_from_user_nmi() tip-bot for Peter Zijlstra
2013-10-23  7:44                                   ` [PATCH] perf, x86: Optimize intel_pmu_pebs_fixup_ip() Ingo Molnar
2013-10-17 14:49             ` Don Zickus
2013-10-17 14:51               ` Peter Zijlstra
2013-10-17 15:03                 ` Don Zickus
2013-10-17 15:09                   ` Peter Zijlstra
2013-10-17 15:11                     ` Peter Zijlstra
2013-10-17 16:50             ` [tip:perf/core] perf/x86: " tip-bot for Peter Zijlstra
2013-10-15 16:22         ` x86, perf: throttling issues with long nmi latencies Don Zickus
2013-10-15 14:36     ` Don Zickus
2013-10-15 14:39       ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.