linux-kernel.vger.kernel.org archive mirror
* [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
@ 2012-01-05 16:10 Jan Beulich
  2012-01-06 11:05 ` Ingo Molnar
  2012-01-26 13:40 ` [tip:x86/asm] x86-64: Fix " tip-bot for Jan Beulich
  0 siblings, 2 replies; 11+ messages in thread
From: Jan Beulich @ 2012-01-05 16:10 UTC (permalink / raw)
  To: mingo, tglx, hpa; +Cc: linux-kernel

While currently there doesn't appear to be any reachable in-tree case
where such large memory blocks may be passed to memset()
(alloc_bootmem() being the primary non-reachable one, as it gets called
with suitably large sizes in FLATMEM configurations), we have recently
hit the problem a second time in our Xen kernels. Rather than working
around it a second time, prevent others from falling into the same trap
by fixing this long standing limitation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

---
 arch/x86/lib/memset_64.S |   33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

--- 3.2/arch/x86/lib/memset_64.S
+++ 3.2-x86_64-memset/arch/x86/lib/memset_64.S
@@ -19,16 +19,15 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemset_c:
 	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d
-	movl %edx,%ecx
-	shrl $3,%ecx
+	movq %rdx,%rcx
+	andl $7,%edx
+	shrq $3,%rcx
 	/* expand byte value  */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
+	imulq %rsi,%rax
 	rep stosq
-	movl %r8d,%ecx
+	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -50,7 +49,7 @@
 .Lmemset_c_e:
 	movq %rdi,%r9
 	movb %sil,%al
-	movl %edx,%ecx
+	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -61,12 +60,11 @@ ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
 	movq %rdi,%r10
-	movq %rdx,%r11
 
 	/* expand byte value  */
 	movzbl %sil,%ecx
 	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
+	imulq  %rcx,%rax
 
 	/* align dst */
 	movl  %edi,%r9d
@@ -75,13 +73,13 @@ ENTRY(__memset)
 	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
-	movl %r11d,%ecx
-	shrl $6,%ecx
+	movq  %rdx,%rcx
+	shrq  $6,%rcx
 	jz	 .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
-	decl   %ecx
+	decq  %rcx
 	movq  %rax,(%rdi)
 	movq  %rax,8(%rdi)
 	movq  %rax,16(%rdi)
@@ -97,7 +95,7 @@ ENTRY(__memset)
 	   to predict jump tables. */
 	.p2align 4
 .Lhandle_tail:
-	movl	%r11d,%ecx
+	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
 	jz 		.Lhandle_7
 	shrl	$3,%ecx
@@ -109,12 +107,11 @@ ENTRY(__memset)
 	jnz    .Lloop_8
 
 .Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
+	andl	$7,%edx
 	jz      .Lende
 	.p2align 4
 .Lloop_1:
-	decl    %ecx
+	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
 	jnz     .Lloop_1
@@ -125,13 +122,13 @@ ENTRY(__memset)
 
 	CFI_RESTORE_STATE
 .Lbad_alignment:
-	cmpq $7,%r11
+	cmpq $7,%rdx
 	jbe	.Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
-	subq %r8,%r11
+	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
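
For reference, a C-level sketch of what the .Lmemset_c (rep stos) fast
path above computes (illustrative only, not part of the patch; the real
code is the assembly, this just models its semantics):

  #include <stdint.h>
  #include <stddef.h>

  /* Model of the rep-stos path: expand the fill byte into a 64-bit
   * pattern, store n/8 qwords, then the remaining n%8 bytes.  The bug
   * being fixed: the quotient and remainder used to be computed in
   * 32-bit registers, silently truncating counts of 4Gb and above to
   * their low 32 bits. */
  static void *memset_model(void *dst, int c, size_t n)
  {
          uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
          uint64_t *q = dst;
          uint8_t *b;
          size_t i;

          for (i = 0; i < (n >> 3); i++)  /* rep stosq */
                  *q++ = pattern;
          b = (uint8_t *)q;
          for (i = 0; i < (n & 7); i++)   /* rep stosb */
                  *b++ = (uint8_t)c;
          return dst;
  }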





* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-05 16:10 [PATCH] x86-64: fix memset() to support sizes of 4Gb and above Jan Beulich
@ 2012-01-06 11:05 ` Ingo Molnar
  2012-01-06 12:31   ` Jan Beulich
  2012-01-18 10:40   ` Jan Beulich
  2012-01-26 13:40 ` [tip:x86/asm] x86-64: Fix " tip-bot for Jan Beulich
  1 sibling, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-01-06 11:05 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, hpa, linux-kernel, Linus Torvalds, Andrew Morton


* Jan Beulich <JBeulich@suse.com> wrote:

> While currently there doesn't appear to be any reachable 
> in-tree case where such large memory blocks may be passed to 
> memset() (alloc_bootmem() being the primary non-reachable one, 
> as it gets called with suitably large sizes in FLATMEM 
> configurations), we have recently hit the problem a second 
> time in our Xen kernels. Rather than working around it a 
> second time, prevent others from falling into the same trap by 
> fixing this long standing limitation.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Have you checked the before/after size of the hotpath?

The patch suggests that it got shorter by 3 instructions:

> -	movl %edx,%r8d
> -	andl $7,%r8d
> -	movl %edx,%ecx
> -	shrl $3,%ecx
> +	movq %rdx,%rcx
> +	andl $7,%edx
> +	shrq $3,%rcx

[...]

>  	movq %rdi,%r10
> -	movq %rdx,%r11
>  

[...]

> -	movl	%r11d,%ecx
> -	andl	$7,%ecx
> +	andl	$7,%edx

Is that quick impression correct? I have not tried building or 
measuring it.

Also, note that we have some cool instrumentation tech upstream; 
lately we've added a way to measure the *kernel*'s memcpy 
routine performance in user-space, using perf bench:

  $ perf bench mem memcpy -r x86
  # Running mem/memcpy benchmark...
  Unknown routine:x86
  Available routines...
	default ... Default memcpy() provided by glibc
	x86-64-unrolled ... unrolled memcpy() in arch/x86/lib/memcpy_64.S

  $ perf bench mem memcpy -r x86-64-unrolled
  # Running mem/memcpy benchmark...
  # Copying 1MB Bytes ...

       1.888902 GB/Sec
      15.024038 GB/Sec (with prefault)

This builds the routine in arch/x86/lib/memcpy_64.S and measures 
its bandwidth in the cache-cold and cache-hot case as well.
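
To make the two numbers above concrete, here is a minimal standalone C
sketch of the cache-cold vs. prefaulted measurement idea (not perf's
actual implementation; the buffer size and clock are arbitrary choices
for illustration):

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <time.h>

  #define LEN (1 << 20)                   /* 1MB, as in the run above */

  static double copy_gb_per_sec(int prefault)
  {
          char *src = malloc(LEN), *dst = malloc(LEN);
          struct timespec t0, t1;
          double sec;

          memset(src, 1, LEN);            /* source is always populated */
          if (prefault)                   /* fault dst in so the timed copy
                                             sees no page faults (warm case) */
                  memset(dst, 0, LEN);

          clock_gettime(CLOCK_MONOTONIC, &t0);
          memcpy(dst, src, LEN);
          clock_gettime(CLOCK_MONOTONIC, &t1);

          sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
          free(src);
          free(dst);
          return LEN / sec / 1e9;
  }

  int main(void)
  {
          printf("%f GB/Sec\n", copy_gb_per_sec(0));
          printf("%f GB/Sec (with prefault)\n", copy_gb_per_sec(1));
          return 0;
  }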

Would be nice to add support for arch/x86/lib/memset_64.S as 
well, and look at the before/after performance of it. In 
userspace we can do a lot more accurate measurements of this 
kind:

 $ perf stat --repeat 1000 -e instructions -e cycles perf bench mem memcpy -r x86-64-unrolled >/dev/null

 Performance counter stats for 'perf bench mem memcpy -r x86-64-unrolled' (1000 runs):

         4,924,378 instructions              #    0.84  insns per cycle          ( +-  0.03% )
         5,892,603 cycles                    #    0.000 GHz                      ( +-  0.06% )

       0.002208000 seconds time elapsed                                          ( +-  0.10% )

Check how the confidence interval of the measurement is in the 
0.03% range; thus even a single cycle change in overhead caused 
by your patch should be measurable. Note, you'll need to rebuild 
perf with the different memset routines.

For example the kernel's memcpy routine is slightly faster than 
glibc's:

  $ perf stat --repeat 1000 -e instructions -e cycles perf bench mem memcpy -r default >/dev/null

 Performance counter stats for 'perf bench mem memcpy -r default' (1000 runs):

         4,927,173 instructions              #    0.83  insns per cycle          ( +-  0.03% )
         5,928,168 cycles                    #    0.000 GHz                      ( +-  0.06% )

       0.002157349 seconds time elapsed                                          ( +-  0.10% )

If such measurements all suggest equal or better performance, 
and if there's no erratum in current CPUs that would make 4G 
string copies dangerous [which your research suggests should be 
fine], i have no principled objection against this patch.

I'd not be surprised (at all) if other OSs did larger than 4GB 
memsets in certain circumstances.

Thanks,

	Ingo


* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-06 11:05 ` Ingo Molnar
@ 2012-01-06 12:31   ` Jan Beulich
  2012-01-06 19:01     ` Ingo Molnar
  2012-01-18 10:40   ` Jan Beulich
  1 sibling, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2012-01-06 12:31 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: tglx, Andrew Morton, Linus Torvalds, linux-kernel, hpa

>>> On 06.01.12 at 12:05, Ingo Molnar <mingo@elte.hu> wrote:

> * Jan Beulich <JBeulich@suse.com> wrote:
> 
>> While currently there doesn't appear to be any reachable 
>> in-tree case where such large memory blocks may be passed to 
>> memset() (alloc_bootmem() being the primary non-reachable one, 
>> as it gets called with suitably large sizes in FLATMEM 
>> configurations), we have recently hit the problem a second 
>> time in our Xen kernels. Rather than working around it a 
>> second time, prevent others from falling into the same trap by 
>> fixing this long standing limitation.
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Have you checked the before/after size of the hotpath?
> 
> The patch suggests that it got shorter by 3 instructions:
> 
>> -	movl %edx,%r8d
>> -	andl $7,%r8d
>> -	movl %edx,%ecx
>> -	shrl $3,%ecx
>> +	movq %rdx,%rcx
>> +	andl $7,%edx
>> +	shrq $3,%rcx
> 
> [...]
> 
>>  	movq %rdi,%r10
>> -	movq %rdx,%r11
>>  
> 
> [...]
> 
>> -	movl	%r11d,%ecx
>> -	andl	$7,%ecx
>> +	andl	$7,%edx
> 
> Is that quick impression correct? I have not tried building or 
> measuring it.

Yes, that's correct.

As the bodies of the individual flavors didn't change, I see no risk of
this change causing any performance degradation.

> Would be nice to add support for arch/x86/lib/memset_64.S as 
> well, and look at the before/after performance of it. In 
> userspace we can do a lot more accurate measurements of this 
> kind:

I'll see whether I can get this done, but I admit that I'm entirely
unfamiliar with this tool and its infrastructure. I hope doing this is
not going to be a requirement for acceptance of the patch.

Jan



* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-06 12:31   ` Jan Beulich
@ 2012-01-06 19:01     ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-01-06 19:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, Andrew Morton, Linus Torvalds, linux-kernel, hpa


* Jan Beulich <JBeulich@suse.com> wrote:

> > Would be nice to add support for arch/x86/lib/memset_64.S as 
> > well, and look at the before/after performance of it. In 
> > userspace we can do a lot more accurate measurements of this 
> > kind:
> 
> I'll see whether I can get this done, but I admit that I'm 
> entirely unfamiliar with this tool and its infrastructure. 
> [...]

Just check what it does for memcpy and duplicate it.

> [...] I hope doing this is not going to be a requirement for 
> acceptance of the patch.

No, but it would certainly help us make sure there's no 
performance side effect.

Thanks,

	Ingo


* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-06 11:05 ` Ingo Molnar
  2012-01-06 12:31   ` Jan Beulich
@ 2012-01-18 10:40   ` Jan Beulich
  2012-01-18 11:14     ` Ingo Molnar
  2012-01-18 18:16     ` Linus Torvalds
  1 sibling, 2 replies; 11+ messages in thread
From: Jan Beulich @ 2012-01-18 10:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: tglx, Andrew Morton, Linus Torvalds, linux-kernel, hpa

>>> On 06.01.12 at 12:05, Ingo Molnar <mingo@elte.hu> wrote:
>> * Jan Beulich <JBeulich@suse.com> wrote:
> Would be nice to add support for arch/x86/lib/memset_64.S as 
> well, and look at the before/after performance of it.

Got this done, will post the patch soon. However, ...

> For example the kernel's memcpy routine is slightly faster than 
> glibc's:

This is an illusion - since the kernel's memcpy_64.S also defines a
"memcpy" (not just "__memcpy"), the static linker resolves the
reference from mem-memcpy.c against this one. Apparent
performance differences rather point at effects like (guessing)
branch prediction (using the second vs the first entry of
routines[]). After fixing this, on my Westmere box glibc's is quite
a bit slower than the unrolled kernel variant (4% fewer
instructions, but about 15% more cycles).
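
A minimal illustration of the symbol-resolution pitfall (hypothetical
file name, standing in for the object built from the kernel's
memcpy_64.S; the point is only that a global definition in an
explicitly linked object wins over the C library's):

  /* kernel_memcpy_stub.c - because this object defines a global,
   * non-weak symbol named "memcpy", the static linker resolves the
   * plain memcpy() reference in mem-memcpy.c against it instead of
   * glibc's, so the "default" routine ends up measuring the kernel
   * variant as well. */
  #include <stddef.h>

  void *memcpy(void *dst, const void *src, size_t n)
  {
          char *d = dst;
          const char *s = src;

          while (n--)
                  *d++ = *s++;
          return dst;
  }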

> If such measurements all suggest equal or better performance, 
> and if there's no erratum in current CPUs that would make 4G 
> string copies dangerous [which your research suggests should be 
> fine], i have no principled objection against this patch.

If I interpreted things correctly, there's a tiny win with the changes
(also for not-yet-posted memcpy equivalent):

# Original 3.2:
 Performance counter stats for 'perf bench mem memcpy -r x86-64-unrolled' (1000 runs):

         5,237,848 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,187,712 cycles                    #    0.000 GHz                      ( +-  0.10% )

       0.003426133 seconds time elapsed                                          ( +-  0.30% )

         5,236,075 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,177,677 cycles                    #    0.000 GHz                      ( +-  0.08% )

       0.003423426 seconds time elapsed                                          ( +-  0.29% )

         5,236,887 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,180,640 cycles                    #    0.000 GHz                      ( +-  0.08% )

       0.003410956 seconds time elapsed                                          ( +-  0.32% )

 Performance counter stats for 'perf bench mem memset -r x86-64-unrolled' (1000 runs):

         4,300,600 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,442,449 cycles                    #    0.000 GHz                      ( +-  0.12% )

       0.002976608 seconds time elapsed                                          ( +-  0.29% )

         4,300,542 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,443,480 cycles                    #    0.000 GHz                      ( +-  0.10% )

       0.002942516 seconds time elapsed                                          ( +-  0.36% )

         4,300,400 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,439,363 cycles                    #    0.000 GHz                      ( +-  0.08% )

       0.002962733 seconds time elapsed                                          ( +-  0.32% )

# Patched (x86_64-mem*.patch):
 Performance counter stats for 'perf bench mem memcpy -r x86-64-unrolled' (1000 runs):

         5,236,674 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,182,292 cycles                    #    0.000 GHz                      ( +-  0.07% )

       0.003426389 seconds time elapsed                                          ( +-  0.29% )

         5,235,704 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,183,586 cycles                    #    0.000 GHz                      ( +-  0.08% )

       0.003414827 seconds time elapsed                                          ( +-  0.31% )

         5,236,240 instructions              #    1.01  insns per cycle          ( +-  0.01% )
         5,192,932 cycles                    #    0.000 GHz                      ( +-  0.10% )

       0.003404885 seconds time elapsed                                          ( +-  0.33% )

 Performance counter stats for 'perf bench mem memset -r x86-64-unrolled' (1000 runs):

         4,299,811 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,442,268 cycles                    #    0.000 GHz                      ( +-  0.09% )

       0.002957321 seconds time elapsed                                          ( +-  0.32% )

         4,300,057 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,438,804 cycles                    #    0.000 GHz                      ( +-  0.09% )

       0.002974749 seconds time elapsed                                          ( +-  0.29% )

         4,299,886 instructions              #    0.97  insns per cycle          ( +-  0.02% )
         4,444,117 cycles                    #    0.000 GHz                      ( +-  0.11% )

       0.002967353 seconds time elapsed                                          ( +-  0.30% )

Jan


* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-18 10:40   ` Jan Beulich
@ 2012-01-18 11:14     ` Ingo Molnar
  2012-01-18 13:33       ` Jan Beulich
  2012-01-18 18:16     ` Linus Torvalds
  1 sibling, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2012-01-18 11:14 UTC (permalink / raw)
  To: Jan Beulich; +Cc: tglx, Andrew Morton, Linus Torvalds, linux-kernel, hpa


* Jan Beulich <JBeulich@suse.com> wrote:

> >>> On 06.01.12 at 12:05, Ingo Molnar <mingo@elte.hu> wrote:
> >> * Jan Beulich <JBeulich@suse.com> wrote:
> > Would be nice to add support for arch/x86/lib/memset_64.S as 
> > well, and look at the before/after performance of it.
> 
> Got this done, will post the patch soon. However, ...
> 
> > For example the kernel's memcpy routine is slightly faster than 
> > glibc's:
> 
> This is an illusion [...]

Oh ...

> [...] - since the kernel's memcpy_64.S also defines a "memcpy" 
> (not just "__memcpy"), the static linker resolves the 
> reference from mem-memcpy.c against this one. Apparent 
> performance differences rather point at effects like 
> (guessing) branch prediction (using the second vs the first 
> entry of routines[]). After fixing this, on my Westmere box 
> glibc's is quite a bit slower than the unrolled kernel variant 
> (4% fewer instructions, but about 15% more cycles).

Cool and thanks for looking into this. Will wait for your 
patch(es).

> > If such measurements all suggest equal or better 
> > performance, and if there's no erratum in current CPUs that 
> > would make 4G string copies dangerous [which your research 
> > suggests should be fine], i have no principled objection 
> > against this patch.
> 
> If I interpreted things correctly, there's a tiny win with the 
> changes (also for the not-yet-posted memcpy equivalent):

Nice. That would be the expectation from the reduction in the 
instruction count. It seems to be slightly above the noise threshold 
of the measurement.

Note that sometimes there's variance between different perf 
bench runs larger than the reported standard deviation. This can 
be seen from the three repeated --repeat 1000 runs you did.

I believe this effect is due to memory layout artifacts - I've found 
no good way so far to move that kind of variance inside the perf 
stat --repeat runs.

Maybe we could allocate a random amount of memory in user-space, 
in the [0..1MB] range, before doing a repeat run (and freeing it 
after an iteration), and perhaps dup() stdout randomly, to fuzz 
the kmalloc and page allocation layout patterns?
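
A rough sketch of what that could look like per iteration (hypothetical
helper, not existing perf code; the size range and the dup() target are
just the values suggested above):

  #include <stdlib.h>
  #include <unistd.h>

  /* Perturb the heap and the fd table by a random amount around one
   * benchmark iteration, so that memory-layout variance shows up
   * inside a single perf stat --repeat series rather than only
   * between series. */
  static void fuzz_layout_and_run(void (*run_one_iteration)(void))
  {
          size_t sz = (size_t)rand() % (1 << 20); /* 0..1MB of heap noise */
          void *p = malloc(sz ? sz : 1);
          int fd = (rand() & 1) ? dup(STDOUT_FILENO) : -1;

          run_one_iteration();

          if (fd >= 0)
                  close(fd);
          free(p);
  }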

Thanks,

	Ingo


* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-18 11:14     ` Ingo Molnar
@ 2012-01-18 13:33       ` Jan Beulich
  0 siblings, 0 replies; 11+ messages in thread
From: Jan Beulich @ 2012-01-18 13:33 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: tglx, Andrew Morton, Linus Torvalds, linux-kernel, hpa

>>> On 18.01.12 at 12:14, Ingo Molnar <mingo@elte.hu> wrote:
> Cool and thanks for looking into this. Will wait for your 
> patch(es).

With them just submitted, is there anything else that I need to do to
have the original memset() change applied? I didn't want to submit
the memcpy equivalent (and a small follow-up one) without it having
a realistic chance of going in (which I assume it won't have when the
memset() one isn't acceptable for whatever reason).

Jan



* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-18 10:40   ` Jan Beulich
  2012-01-18 11:14     ` Ingo Molnar
@ 2012-01-18 18:16     ` Linus Torvalds
  2012-01-19  7:48       ` Jan Beulich
  1 sibling, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2012-01-18 18:16 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Ingo Molnar, tglx, Andrew Morton, linux-kernel, hpa

On Wed, Jan 18, 2012 at 2:40 AM, Jan Beulich <JBeulich@suse.com> wrote:
>
>> For example the kernel's memcpy routine is slightly faster than
>> glibc's:
>
> This is an illusion - since the kernel's memcpy_64.S also defines a
> "memcpy" (not just "__memcpy"), the static linker resolves the
> reference from mem-memcpy.c against this one. Apparent
> performance differences rather point at effects like (guessing)
> branch prediction (using the second vs the first entry of
> routines[]). After fixing this, on my Westmere box glibc's is quite
> a bit slower than the unrolled kernel variant (4% fewer
> instructions, but about 15% more cycles).

Please don't bother doing memcpy performance analysis using hot-cache
cases (or entirely cold-cache for that matter) and/or big memory
copies.

The *normal* memory copy size tends to be in the 10-30 byte range, and
the cache issues (both code *and* data) are unclear. Running
microbenchmarks is almost always counter-productive, since it actually
shows numbers for something that has absolutely *nothing* to do with
the actual patterns.

End result: people do crazy things and tune memcpy for their
benchmarks, doing things like instruction (or prefetch) scheduling
that only makes the code more complicated, and has no actual basis in
reality. And then other people are afraid to touch the end result,
even though it's shit - simply because it *looks* like it was done
with lots of actual testing and effort. Never mind that all the
testing and effort was likely crap.

                  Linus


* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-18 18:16     ` Linus Torvalds
@ 2012-01-19  7:48       ` Jan Beulich
  2012-01-19 12:18         ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2012-01-19  7:48 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, tglx, Andrew Morton, linux-kernel, hpa

>>> On 18.01.12 at 19:16, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Wed, Jan 18, 2012 at 2:40 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>
>>> For example the kernel's memcpy routine is slightly faster than
>>> glibc's:
>>
>> This is an illusion - since the kernel's memcpy_64.S also defines a
>> "memcpy" (not just "__memcpy"), the static linker resolves the
>> reference from mem-memcpy.c against this one. Apparent
>> performance differences rather point at effects like (guessing)
>> branch prediction (using the second vs the first entry of
>> routines[]). After fixing this, on my Westmere box glibc's is quite
>> a bit slower than the unrolled kernel variant (4% fewer
>> instructions, but about 15% more cycles).
> 
> Please don't bother doing memcpy performance analysis using hot-cache
> cases (or entirely cold-cache for that matter) and/or big memory
> copies.

I realize that - I was just asked to do this analysis, to (hopefully)
turn down arguments against the $subject patch.

> The *normal* memory copy size tends to be in the 10-30 byte range, and
> the cache issues (both code *and* data) are unclear. Running
> microbenchmarks is almost always counter-productive, since it actually
> shows numbers for something that has absolutely *nothing* to do with
> the actual patterns.

This is why I added a way to do meaningful measurement on small
size operations (albeit still cache-hot) with perf.

Jan



* Re: [PATCH] x86-64: fix memset() to support sizes of 4Gb and above
  2012-01-19  7:48       ` Jan Beulich
@ 2012-01-19 12:18         ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2012-01-19 12:18 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Linus Torvalds, tglx, Andrew Morton, linux-kernel, hpa


* Jan Beulich <JBeulich@suse.com> wrote:

> >>> On 18.01.12 at 19:16, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> > On Wed, Jan 18, 2012 at 2:40 AM, Jan Beulich <JBeulich@suse.com> wrote:
> >>
> >>> For example the kernel's memcpy routine is slightly faster than
> >>> glibc's:
> >>
> >> This is an illusion - since the kernel's memcpy_64.S also defines a
> >> "memcpy" (not just "__memcpy"), the static linker resolves the
> >> reference from mem-memcpy.c against this one. Apparent
> >> performance differences rather point at effects like (guessing)
> >> branch prediction (using the second vs the first entry of
> >> routines[]). After fixing this, on my Westmere box glibc's is quite
> >> a bit slower than the unrolled kernel variant (4% fewer
> >> instructions, but about 15% more cycles).
> > 
> > Please don't bother doing memcpy performance analysis using 
> > hot-cache cases (or entirely cold-cache for that matter) 
> > and/or big memory copies.
> 
> I realize that - I was just asked to do this analysis, to 
> (hopefully) turn down arguments against the $subject patch.

The other problem with such repeated measurements, beyond their 
very isolated and artificially sterile nature, is what i 
mentioned: the inter-test variability is not enough to signal 
the real variance that occurs in a live system. That too can be 
deceiving.

Note that your patch is a special case which makes measurement 
easier: from the nature of your changes i expected *at most* 
some minimal micro-performance impact, not any larger access 
pattern related changes.

But Linus is right that this cannot be generalized to the 
typical patch.

So i realize all those limitations and fully agree with being 
aware of them, but compared to measuring *nothing* (which is the 
current status quo) we have to start *somewhere*.

> > The *normal* memory copy size tends to be in the 10-30 byte 
> > range, and the cache issues (both code *and* data) are 
> > unclear. Running microbenchmarks is almost always 
> > counter-productive, since it actually shows numbers for 
> > something that has absolutely *nothing* to do with the 
> > actual patterns.
> 
> This is why I added a way to do meaningful measurement on 
> small size operations (albeit still cache-hot) with perf.

We could add test points for 10 and 30 bytes, and the two 
corner cases: one measurement with an I$ that is thrashing and a 
measurement where the D$ is thrashing in a non-trivial way.

( I have used test-code before to achieve high I$ thrashing: a
  function with a million NOPs. )
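
A sketch of how such an I$ thrasher could be generated at run time
(hypothetical helper, not existing perf code; the NOP count is
arbitrary):

  #define _GNU_SOURCE
  #include <string.h>
  #include <sys/mman.h>

  #define NOP_COUNT (1 << 20)             /* ~1M single-byte NOPs */

  /* Emit a huge run of NOPs followed by RET into an executable buffer;
   * calling the result between benchmark iterations pushes the
   * measured routine out of the instruction cache. */
  static void (*make_icache_thrasher(void))(void)
  {
          unsigned char *buf = mmap(NULL, NOP_COUNT + 1,
                                    PROT_READ | PROT_WRITE | PROT_EXEC,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED)
                  return NULL;
          memset(buf, 0x90, NOP_COUNT);   /* 0x90 = x86 NOP */
          buf[NOP_COUNT] = 0xc3;          /* RET */
          return (void (*)(void))buf;
  }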

Once we have the typical sizes and the edge cases covered we can 
at least hope that reality is a healthy mix of all those 
"eigen-vectors".

Once we have that in place we can at least have one meaningful 
result: if a patch improves *all* these edge cases on the CPU 
models that matter, then it's typically true that it will 
improve the generic 'mixed' workload as well.

If a patch is not so clear-cut then it has to be measured with 
real loads as well, etc.

Anyway, i'll apply your current patches and play with them a 
bit.

Thanks,

	Ingo


* [tip:x86/asm] x86-64: Fix memset() to support sizes of 4Gb and above
  2012-01-05 16:10 [PATCH] x86-64: fix memset() to support sizes of 4Gb and above Jan Beulich
  2012-01-06 11:05 ` Ingo Molnar
@ 2012-01-26 13:40 ` tip-bot for Jan Beulich
  1 sibling, 0 replies; 11+ messages in thread
From: tip-bot for Jan Beulich @ 2012-01-26 13:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, jbeulich, akpm, JBeulich,
	tglx, mingo

Commit-ID:  5d7244e7c984cecead412bde6395ce18618a4a37
Gitweb:     http://git.kernel.org/tip/5d7244e7c984cecead412bde6395ce18618a4a37
Author:     Jan Beulich <JBeulich@suse.com>
AuthorDate: Thu, 5 Jan 2012 16:10:42 +0000
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 26 Jan 2012 11:50:04 +0100

x86-64: Fix memset() to support sizes of 4Gb and above

While currently there doesn't appear to be any reachable in-tree
case where such large memory blocks may be passed to memset()
(alloc_bootmem() being the primary non-reachable one, as it gets
called with suitably large sizes in FLATMEM configurations), we
have recently hit the problem a second time in our Xen kernels.

Rather than working around it a second time, prevent others from
falling into the same trap by fixing this long standing
limitation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/memset_64.S |   33 +++++++++++++++------------------
 1 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 79bd454..2dcb380 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -19,16 +19,15 @@
 	.section .altinstr_replacement, "ax", @progbits
 .Lmemset_c:
 	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d
-	movl %edx,%ecx
-	shrl $3,%ecx
+	movq %rdx,%rcx
+	andl $7,%edx
+	shrq $3,%rcx
 	/* expand byte value  */
 	movzbl %sil,%esi
 	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
+	imulq %rsi,%rax
 	rep stosq
-	movl %r8d,%ecx
+	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -50,7 +49,7 @@
 .Lmemset_c_e:
 	movq %rdi,%r9
 	movb %sil,%al
-	movl %edx,%ecx
+	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
@@ -61,12 +60,11 @@ ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
 	movq %rdi,%r10
-	movq %rdx,%r11
 
 	/* expand byte value  */
 	movzbl %sil,%ecx
 	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
+	imulq  %rcx,%rax
 
 	/* align dst */
 	movl  %edi,%r9d
@@ -75,13 +73,13 @@ ENTRY(__memset)
 	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
-	movl %r11d,%ecx
-	shrl $6,%ecx
+	movq  %rdx,%rcx
+	shrq  $6,%rcx
 	jz	 .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
-	decl   %ecx
+	decq  %rcx
 	movq  %rax,(%rdi)
 	movq  %rax,8(%rdi)
 	movq  %rax,16(%rdi)
@@ -97,7 +95,7 @@ ENTRY(__memset)
 	   to predict jump tables. */
 	.p2align 4
 .Lhandle_tail:
-	movl	%r11d,%ecx
+	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
 	jz 		.Lhandle_7
 	shrl	$3,%ecx
@@ -109,12 +107,11 @@ ENTRY(__memset)
 	jnz    .Lloop_8
 
 .Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
+	andl	$7,%edx
 	jz      .Lende
 	.p2align 4
 .Lloop_1:
-	decl    %ecx
+	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
 	jnz     .Lloop_1
@@ -125,13 +122,13 @@ ENTRY(__memset)
 
 	CFI_RESTORE_STATE
 .Lbad_alignment:
-	cmpq $7,%r11
+	cmpq $7,%rdx
 	jbe	.Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
-	subq %r8,%r11
+	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC


Thread overview: 11+ messages
2012-01-05 16:10 [PATCH] x86-64: fix memset() to support sizes of 4Gb and above Jan Beulich
2012-01-06 11:05 ` Ingo Molnar
2012-01-06 12:31   ` Jan Beulich
2012-01-06 19:01     ` Ingo Molnar
2012-01-18 10:40   ` Jan Beulich
2012-01-18 11:14     ` Ingo Molnar
2012-01-18 13:33       ` Jan Beulich
2012-01-18 18:16     ` Linus Torvalds
2012-01-19  7:48       ` Jan Beulich
2012-01-19 12:18         ` Ingo Molnar
2012-01-26 13:40 ` [tip:x86/asm] x86-64: Fix " tip-bot for Jan Beulich
