linux-kernel.vger.kernel.org archive mirror
* [PATCH] lib/string: Bring optimized memcmp from glibc
@ 2021-07-21 13:59 Nikolay Borisov
  2021-07-21 14:32 ` Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-21 13:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: ndesaulniers, torvalds, linux-fsdevel, david, Nikolay Borisov

This is glibc's memcmp version. The upside is that for architectures
which don't have an optimized version the kernel can provide some
solace in the form of a generic, word-sized optimized memcmp. I tested
this with a heavy IOCTL_FIDEDUPERANGE(2) workload and here are the
results I got:

	    UNPATCHED		PATCHED
real	    1m13.127s		1m10.611s
user	    0m0.030s            0m0.033s
sys         0m5.890s            0m4.001s  (32% decrease)

Comparing the 2 perf profiles of the workload yields:

    30.44%    -29.39%  [kernel.vmlinux]         [k] memcmp

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 lib/string.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 293 insertions(+), 10 deletions(-)

diff --git a/lib/string.c b/lib/string.c
index 7548eb715ddb..915db7e49804 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -923,22 +923,305 @@ EXPORT_SYMBOL(memmove);
 #endif
 
 #ifndef __HAVE_ARCH_MEMCMP
+
+/* Threshold value for when to enter the unrolled loops.  */
+#define MEMCMP_THRESH 16
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+# define CMP_LT_OR_GT(a, b) memcmp_bytes((a), (b))
+/*
+ * Compare A and B bytewise in the byte order of the machine.
+ * A and B are known to be different. This is needed only on little-endian
+ * machines.
+ */
+static inline int memcmp_bytes(unsigned long a, unsigned long b)
+{
+	long srcp1 = (long) &a;
+	long srcp2 = (long) &b;
+	unsigned long a0, b0;
+
+	do {
+		a0 = ((uint8_t *) srcp1)[0];
+		b0 = ((uint8_t *) srcp2)[0];
+		srcp1 += 1;
+		srcp2 += 1;
+	} while (a0 == b0);
+	return a0 - b0;
+}
+
+#else
+# define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+# define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1)
+#endif
+
+/*
+ * The strategy of this memcmp is:
+ *  1. Compare bytes until one of the block pointers is aligned.
+ *
+ *  2. Compare using memcmp_common_alignment or memcmp_not_common_alignment,
+ *  regarding the alignment of the other block after the initial byte operations.
+ *  The maximum number of full words (of type unsigned long) are compared in
+ *  this way.
+ *
+ *  3. Compare the few remaining bytes.
+ */
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). Both SRCP1 and SRCP2 should be aligned for memory operations on
+ * `unsigned long's.
+ */
+static int memcmp_common_alignment(long srcp1, long srcp2, size_t len)
+{
+	unsigned long a0, a1;
+	unsigned long b0, b1;
+
+	switch (len % 4) {
+	default: /* Avoid warning about uninitialized local variables.  */
+	case 2:
+		a0 = ((unsigned long *) srcp1)[0];
+		b0 = ((unsigned long *) srcp2)[0];
+		srcp1 -= 2 * sizeof(unsigned long);
+		srcp2 -= 2 * sizeof(unsigned long);
+		len += 2;
+		goto do1;
+	case 3:
+		a1 = ((unsigned long *) srcp1)[0];
+		b1 = ((unsigned long *) srcp2)[0];
+		srcp1 -= sizeof(unsigned long);
+		srcp2 -= sizeof(unsigned long);
+		len += 1;
+		goto do2;
+	case 0:
+		if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+			return 0;
+		a0 = ((unsigned long *) srcp1)[0];
+		b0 = ((unsigned long *) srcp2)[0];
+		goto do3;
+	case 1:
+		a1 = ((unsigned long *) srcp1)[0];
+		b1 = ((unsigned long *) srcp2)[0];
+		srcp1 += sizeof(unsigned long);
+		srcp2 += sizeof(unsigned long);
+		len -= 1;
+		if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+			goto do0;
+		/* Fallthrough */
+	}
+
+	do {
+		a0 = ((unsigned long *) srcp1)[0];
+		b0 = ((unsigned long *) srcp2)[0];
+		if (a1 != b1)
+			return CMP_LT_OR_GT(a1, b1);
+
+do3:
+		a1 = ((unsigned long *) srcp1)[1];
+		b1 = ((unsigned long *) srcp2)[1];
+		if (a0 != b0)
+			return CMP_LT_OR_GT(a0, b0);
+
+do2:
+		a0 = ((unsigned long *) srcp1)[2];
+		b0 = ((unsigned long *) srcp2)[2];
+		if (a1 != b1)
+			return CMP_LT_OR_GT(a1, b1);
+
+do1:
+		a1 = ((unsigned long *) srcp1)[3];
+		b1 = ((unsigned long *) srcp2)[3];
+		if (a0 != b0)
+			return CMP_LT_OR_GT(a0, b0);
+
+		srcp1 += 4 * sizeof(unsigned long);
+		srcp2 += 4 * sizeof(unsigned long);
+		len -= 4;
+	}  while (len != 0);
+
+	/*
+	 * This is the right position for do0. Please don't move it into the
+	 * loop.
+	 */
+do0:
+	if (a1 != b1)
+		return CMP_LT_OR_GT(a1, b1);
+	return 0;
+}
+
+/*
+ * Compare blocks at SRCP1 and SRCP2 with LEN `unsigned long' objects (not LEN
+ * bytes!). SRCP2 should be aligned for memory operations on `unsigned long',
+ * but SRCP1 *should be unaligned*.
+ */
+static int memcmp_not_common_alignment(long srcp1, long srcp2, size_t len)
+{
+	unsigned long a0, a1, a2, a3;
+	unsigned long b0, b1, b2, b3;
+	unsigned long x;
+	int shl, shr;
+
+	/*
+	 * Calculate how to shift a word read at the memory operation
+	 * aligned srcp1 to make it aligned for comparison.
+	 */
+
+	shl = 8 * (srcp1 % sizeof(unsigned long));
+	shr = 8 * sizeof(unsigned long) - shl;
+
+	/*
+	 * Make SRCP1 aligned by rounding it down to the beginning of the
+	 * `unsigned long' it points in the middle of.
+	 */
+	srcp1 &= -sizeof(unsigned long);
+
+	switch (len % 4) {
+	default: /* Avoid warning about uninitialized local variables.  */
+	case 2:
+		a1 = ((unsigned long *) srcp1)[0];
+		a2 = ((unsigned long *) srcp1)[1];
+		b2 = ((unsigned long *) srcp2)[0];
+		srcp1 -= 1 * sizeof(unsigned long);
+		srcp2 -= 2 * sizeof(unsigned long);
+		len += 2;
+		goto do1;
+	case 3:
+		a0 = ((unsigned long *) srcp1)[0];
+		a1 = ((unsigned long *) srcp1)[1];
+		b1 = ((unsigned long *) srcp2)[0];
+		srcp2 -= 1 * sizeof(unsigned long);
+		len += 1;
+		goto do2;
+	case 0:
+		if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+			return 0;
+		a3 = ((unsigned long *) srcp1)[0];
+		a0 = ((unsigned long *) srcp1)[1];
+		b0 = ((unsigned long *) srcp2)[0];
+		srcp1 += 1 * sizeof(unsigned long);
+		goto do3;
+	case 1:
+		a2 = ((unsigned long *) srcp1)[0];
+		a3 = ((unsigned long *) srcp1)[1];
+		b3 = ((unsigned long *) srcp2)[0];
+		srcp1 += 2 * sizeof(unsigned long);
+		srcp2 += 1 * sizeof(unsigned long);
+		len -= 1;
+		if (MEMCMP_THRESH <= 3 * sizeof(unsigned long) && len == 0)
+			goto do0;
+		/* Fallthrough */
+	}
+
+	do {
+		a0 = ((unsigned long *) srcp1)[0];
+		b0 = ((unsigned long *) srcp2)[0];
+		x = MERGE(a2, shl, a3, shr);
+		if (x != b3)
+			return CMP_LT_OR_GT(x, b3);
+
+do3:
+		a1 = ((unsigned long *) srcp1)[1];
+		b1 = ((unsigned long *) srcp2)[1];
+		x = MERGE(a3, shl, a0, shr);
+		if (x != b0)
+			return CMP_LT_OR_GT(x, b0);
+
+do2:
+		a2 = ((unsigned long *) srcp1)[2];
+		b2 = ((unsigned long *) srcp2)[2];
+		x = MERGE(a0, shl, a1, shr);
+		if (x != b1)
+			return CMP_LT_OR_GT(x, b1);
+
+do1:
+		a3 = ((unsigned long *) srcp1)[3];
+		b3 = ((unsigned long *) srcp2)[3];
+		x = MERGE(a1, shl, a2, shr);
+		if (x != b2)
+			return CMP_LT_OR_GT(x, b2);
+
+		srcp1 += 4 * sizeof(unsigned long);
+		srcp2 += 4 * sizeof(unsigned long);
+		len -= 4;
+	} while (len != 0);
+
+	/*
+	 * This is the right position for do0. Please don't move it into
+	 * the loop.
+	 */
+do0:
+	x = MERGE(a2, shl, a3, shr);
+	if (x != b3)
+		return CMP_LT_OR_GT(x, b3);
+	return 0;
+}
+
+#undef memcmp
 /**
  * memcmp - Compare two areas of memory
- * @cs: One area of memory
- * @ct: Another area of memory
+ * @s1: One area of memory
+ * @s2: Another area of memory
  * @count: The size of the area.
  */
-#undef memcmp
-__visible int memcmp(const void *cs, const void *ct, size_t count)
+__visible int memcmp(const void *s1, const void *s2, size_t len)
 {
-	const unsigned char *su1, *su2;
-	int res = 0;
+	unsigned long a0;
+	unsigned long b0;
+	long srcp1 = (long) s1;
+	long srcp2 = (long) s2;
+	unsigned long res;
+
+	if (len >= MEMCMP_THRESH) {
+		/*
+		 * There are at least some bytes to compare. No need to test
+		 * for LEN == 0 in this alignment loop.
+		 */
+		while (!IS_ALIGNED(srcp2, sizeof(unsigned long))) {
+			a0 = ((uint8_t *) srcp1)[0];
+			b0 = ((uint8_t *) srcp2)[0];
+			srcp1 += 1;
+			srcp2 += 1;
+			res = a0 - b0;
+			if (res != 0)
+				return res;
+			len -= 1;
+		}
 
-	for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
-		if ((res = *su1 - *su2) != 0)
-			break;
-	return res;
+		/*
+		 * SRCP2 is now aligned for memory operations on `unsigned long'.
+		 * SRCP1 alignment determines if we can do a simple, aligned
+		 * compare or need to shuffle bits.
+		 */
+
+		if (IS_ALIGNED(srcp1, sizeof(unsigned long)))
+			res = memcmp_common_alignment(srcp1, srcp2,
+						len / sizeof(unsigned long));
+		else
+			res = memcmp_not_common_alignment(srcp1, srcp2,
+						len / sizeof(unsigned long));
+
+		if (res != 0)
+			return res;
+
+		/* Number of bytes remaining in the interval [0..sizeof(unsigned long)-1].  */
+		srcp1 += len & -sizeof(unsigned long);
+		srcp2 += len & -sizeof(unsigned long);
+		len %= sizeof(unsigned long);
+	}
+
+	/* There are just a few bytes to compare. Use byte memory operations.  */
+	while (len != 0) {
+		a0 = ((uint8_t *) srcp1)[0];
+		b0 = ((uint8_t *) srcp2)[0];
+		srcp1 += 1;
+		srcp2 += 1;
+		res = a0 - b0;
+		if (res != 0)
+			return res;
+		len -= 1;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL(memcmp);
 #endif
-- 
2.25.1
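
For readers of the hunk above: the MERGE()/shl/shr machinery in
memcmp_not_common_alignment() just rebuilds one comparison word from two
adjacent aligned loads of the unaligned buffer. Roughly, for the
little-endian case (illustrative only - the helper name is made up, and it
assumes the pointer really is unaligned, i.e. shl != 0):

static unsigned long merge_illustration(const unsigned char *p)
{
	unsigned int shl = 8 * ((unsigned long)p % sizeof(unsigned long));
	unsigned int shr = 8 * sizeof(unsigned long) - shl;
	/* Round p down to the aligned word it points into. */
	const unsigned long *wp = (const unsigned long *)
			((unsigned long)p & ~(sizeof(unsigned long) - 1));

	/*
	 * wp[0] supplies the bytes from p up to the end of that word,
	 * wp[1] supplies the remaining high bytes; the result equals the
	 * word an unaligned load at p would have returned.
	 */
	return (wp[0] >> shl) | (wp[1] << shr);
}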



* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 13:59 [PATCH] lib/string: Bring optimized memcmp from glibc Nikolay Borisov
@ 2021-07-21 14:32 ` Christoph Hellwig
  2021-07-21 14:35   ` Nikolay Borisov
  2021-07-21 18:00 ` Linus Torvalds
  2021-07-28 20:12 ` Florian Weimer
  2 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2021-07-21 14:32 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: linux-kernel, ndesaulniers, torvalds, linux-fsdevel, david

This seems to have lost the copyright notices from glibc.


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 14:32 ` Christoph Hellwig
@ 2021-07-21 14:35   ` Nikolay Borisov
  2021-07-21 14:42     ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-21 14:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-kernel, ndesaulniers, torvalds, linux-fsdevel, david



On 21.07.21 г. 17:32, Christoph Hellwig wrote:
> This seems to have lost the copyright notices from glibc.
> 

I copied over only the code, what else needs to be brought up:

 Copyright (C) 1991-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Torbjorn Granlund (tege@sics.se).

The rest is the generic GPL license txt ?


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 14:35   ` Nikolay Borisov
@ 2021-07-21 14:42     ` Christoph Hellwig
  2021-07-21 14:51       ` Matthew Wilcox
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2021-07-21 14:42 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Christoph Hellwig, linux-kernel, ndesaulniers, torvalds,
	linux-fsdevel, david

On Wed, Jul 21, 2021 at 05:35:42PM +0300, Nikolay Borisov wrote:
> 
> 
> On 21.07.21 г. 17:32, Christoph Hellwig wrote:
> > This seems to have lost the copyright notices from glibc.
> > 
> 
> I copied over only the code, what else needs to be brought up:
> 
>  Copyright (C) 1991-2021 Free Software Foundation, Inc.
>    This file is part of the GNU C Library.
>    Contributed by Torbjorn Granlund (tege@sics.se).
> 
> The rest is the generic GPL license txt ?

Last time I checked glibc is under LGPL.


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 14:42     ` Christoph Hellwig
@ 2021-07-21 14:51       ` Matthew Wilcox
  2021-07-21 15:17         ` Peter.Enderborg
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Wilcox @ 2021-07-21 14:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nikolay Borisov, linux-kernel, ndesaulniers, torvalds,
	linux-fsdevel, david

On Wed, Jul 21, 2021 at 03:42:10PM +0100, Christoph Hellwig wrote:
> On Wed, Jul 21, 2021 at 05:35:42PM +0300, Nikolay Borisov wrote:
> > 
> > 
> > On 21.07.21 г. 17:32, Christoph Hellwig wrote:
> > > This seems to have lost the copyright notices from glibc.
> > > 
> > 
> > I copied over only the code, what else needs to be brought up:
> > 
> >  Copyright (C) 1991-2021 Free Software Foundation, Inc.
> >    This file is part of the GNU C Library.
> >    Contributed by Torbjorn Granlund (tege@sics.se).
> > 
> > The rest is the generic GPL license txt ?
> 
> Last time I checked glibc is under LGPL.

This particular file is under LGPL-2.1, so we can distribute it under
GPL 2.


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 14:51       ` Matthew Wilcox
@ 2021-07-21 15:17         ` Peter.Enderborg
  2021-07-21 15:34           ` Matthew Wilcox
  0 siblings, 1 reply; 23+ messages in thread
From: Peter.Enderborg @ 2021-07-21 15:17 UTC (permalink / raw)
  To: willy, hch, tege
  Cc: nborisov, linux-kernel, ndesaulniers, torvalds, linux-fsdevel, david

On 7/21/21 4:51 PM, Matthew Wilcox wrote:
> On Wed, Jul 21, 2021 at 03:42:10PM +0100, Christoph Hellwig wrote:
>> On Wed, Jul 21, 2021 at 05:35:42PM +0300, Nikolay Borisov wrote:
>>>
>>> On 21.07.21 г. 17:32, Christoph Hellwig wrote:
>>>> This seems to have lost the copyright notices from glibc.
>>>>
>>> I copied over only the code, what else needs to be brought up:
>>>
>>>  Copyright (C) 1991-2021 Free Software Foundation, Inc.
>>>    This file is part of the GNU C Library.
>>>    Contributed by Torbjorn Granlund (tege@sics.se).
>>>
>>> The rest is the generic GPL license txt ?
>> Last time I checked glibc is under LGPL.
> This particular file is under LGPL-2.1, so we can distribute it under
> GPL 2.

Sure. But should not Torbjörn Granlund have some cred?

Sort of "Original-Author" tag or something?



* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 15:17         ` Peter.Enderborg
@ 2021-07-21 15:34           ` Matthew Wilcox
  2021-07-21 15:39             ` Peter.Enderborg
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Wilcox @ 2021-07-21 15:34 UTC (permalink / raw)
  To: Peter.Enderborg
  Cc: hch, tege, nborisov, linux-kernel, ndesaulniers, torvalds,
	linux-fsdevel, david

On Wed, Jul 21, 2021 at 03:17:53PM +0000, Peter.Enderborg@sony.com wrote:
> On 7/21/21 4:51 PM, Matthew Wilcox wrote:
> > On Wed, Jul 21, 2021 at 03:42:10PM +0100, Christoph Hellwig wrote:
> >> On Wed, Jul 21, 2021 at 05:35:42PM +0300, Nikolay Borisov wrote:
> >>>
> >>> On 21.07.21 г. 17:32, Christoph Hellwig wrote:
> >>>> This seems to have lost the copyright notices from glibc.
> >>>>
> >>> I copied over only the code, what else needs to be brought up:
> >>>
> >>>  Copyright (C) 1991-2021 Free Software Foundation, Inc.
> >>>    This file is part of the GNU C Library.
> >>>    Contributed by Torbjorn Granlund (tege@sics.se).
> >>>
> >>> The rest is the generic GPL license txt ?
> >> Last time I checked glibc is under LGPL.
> > This particular file is under LGPL-2.1, so we can distribute it under
> > GPL 2.
> 
> Sure. But should not Torbjörn Granlund have some cred?

I didn't say we could remove his copyright.  It's clearly still
copyright Torbjörn.


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 15:34           ` Matthew Wilcox
@ 2021-07-21 15:39             ` Peter.Enderborg
  0 siblings, 0 replies; 23+ messages in thread
From: Peter.Enderborg @ 2021-07-21 15:39 UTC (permalink / raw)
  To: willy
  Cc: hch, nborisov, linux-kernel, ndesaulniers, torvalds,
	linux-fsdevel, david

On 7/21/21 5:34 PM, Matthew Wilcox wrote:
> On Wed, Jul 21, 2021 at 03:17:53PM +0000, Peter.Enderborg@sony.com wrote:
>> On 7/21/21 4:51 PM, Matthew Wilcox wrote:
>>> On Wed, Jul 21, 2021 at 03:42:10PM +0100, Christoph Hellwig wrote:
>>>> On Wed, Jul 21, 2021 at 05:35:42PM +0300, Nikolay Borisov wrote:
>>>>>
>>>>> On 21.07.21 г. 17:32, Christoph Hellwig wrote:
>>>>>> This seems to have lost the copyright notices from glibc.
>>>>>>
>>>>> I copied over only the code, what else needs to be brought up:
>>>>>
>>>>>  Copyright (C) 1991-2021 Free Software Foundation, Inc.
>>>>>    This file is part of the GNU C Library.
>>>>>    Contributed by Torbjorn Granlund (tege@sics.se).
>>>>>
>>>>> The rest is the generic GPL license txt ?
>>>> Last time I checked glibc is under LGPL.
>>> This particular file is under LGPL-2.1, so we can distribute it under
>>> GPL 2.
>>
>> Sure. But should not Torbjörn Granlund have some cred?
> 
> I didn't say we could remove his copyright.  It's clearly still
> copyright Torbjörn.
> 

Yes, but how?
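
One conventional way (illustrative only, nothing agreed in this thread) is
simply to keep the original notice as a comment above the copied code when
it lands in lib/string.c, e.g.:

/*
 * Word-at-a-time memcmp adapted from the GNU C Library (string/memcmp.c):
 *
 *   Copyright (C) 1991-2021 Free Software Foundation, Inc.
 *   This file is part of the GNU C Library.
 *   Contributed by Torbjorn Granlund (tege@sics.se).
 */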


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 13:59 [PATCH] lib/string: Bring optimized memcmp from glibc Nikolay Borisov
  2021-07-21 14:32 ` Christoph Hellwig
@ 2021-07-21 18:00 ` Linus Torvalds
  2021-07-21 18:17   ` Nikolay Borisov
  2021-07-21 20:10   ` David Sterba
  2021-07-28 20:12 ` Florian Weimer
  2 siblings, 2 replies; 23+ messages in thread
From: Linus Torvalds @ 2021-07-21 18:00 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner

On Wed, Jul 21, 2021 at 6:59 AM Nikolay Borisov <nborisov@suse.com> wrote:
>
> This is glibc's memcmp version. The upside is that for architectures
> which don't have an optimized version the kernel can provide some
> solace in the form of a generic, word-sized optimized memcmp. I tested
> this with a heavy IOCTL_FIDEDUPERANGE(2) workload and here are the
> results I got:

Hmm. I suspect the usual kernel use of memcmp() is _very_ skewed to
very small memcmp calls, and I don't think I've ever seen that
(horribly bad) byte-wise default memcmp in most profiles.

I suspect that FIDEDUPERANGE thing is most likely a very special case.

So I don't think you're wrong to look at this, but I think you've gone
from our old "spend no effort at all" to "look at one special case".

And I think the glibc implementation is horrible and doesn't know
about machines where unaligned loads are cheap - which is all
reasonable ones.

That MERGE() macro is disgusting, and memcmp_not_common_alignment()
should not exist on any sane architecture. It's literally doing extra
work to make for slower accesses, when the hardware does it better
natively.

So honestly, I'd much rather see a much saner and simpler
implementation that works well on the architectures that matter, and
that don't want that "align things by hand".

Aligning one of the sources by hand is fine and makes sense - so that
_if_ the two strings end up being mutually aligned, all subsequent
accesses are aligned.

 But then trying to do shift-and-masking for the possibly remaining
unaligned source is crazy and garbage. Don't do it.

And you never saw that, because your special FIDEDUPERANGE testcase
will never have anything but mutually aligned cases.

Which just shows that going from "don't care at all" to "care about
one special case" is not the way to go.

So I'd much rather see a simple default function that works well for
the sane architectures, than go with the default code from glibc - and
bad for the common modern architectures.

Then architectures could choose that one with some

        select GENERIC_MEMCMP

the same way we have

        select GENERIC_STRNCPY_FROM_USER

for the (sane, for normal architectures) common optimized case for a
special string instruction that matters a lot for the kernel.

                     Linus


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:00 ` Linus Torvalds
@ 2021-07-21 18:17   ` Nikolay Borisov
  2021-07-21 18:45     ` Linus Torvalds
  2021-07-21 20:10   ` David Sterba
  1 sibling, 1 reply; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-21 18:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner



On 21.07.21 г. 21:00, Linus Torvalds wrote:
> On Wed, Jul 21, 2021 at 6:59 AM Nikolay Borisov <nborisov@suse.com> wrote:
>>
>> This is glibc's memcmp version. The upside is that for architectures
>> which don't have an optimized version the kernel can provide some
>> solace in the form of a generic, word-sized optimized memcmp. I tested
>> this with a heavy IOCTL_FIDEDUPERANGE(2) workload and here are the
>> results I got:
> 
> Hmm. I suspect the usual kernel use of memcmp() is _very_ skewed to
> very small memcmp calls, and I don't think I've ever seen that
> (horribly bad) byte-wise default memcmp in most profiles.
> 
> I suspect that FIDEDUPERANGE thing is most likely a very special case.
> 
> So I don't think you're wrong to look at this, but I think you've gone
> from our old "spend no effort at all" to "look at one special case".
> 
> And I think the glibc implementation is horrible and doesn't know
> about machines where unaligned loads are cheap - which is all
> reasonable ones.
> 
> That MERGE() macro is disgusting, and memcmp_not_common_alignment()
> should not exist on any sane architecture. It's literally doing extra
> work to make for slower accesses, when the hardware does it better
> natively.
> 
> So honestly, I'd much rather see a much saner and simpler
> implementation that works well on the architectures that matter, and
> that don't want that "align things by hand".
> 
> Aligning one of the sources by hand is fine and makes sense - so that
> _if_ the two strings end up being mutually aligned, all subsequent
> accesses are aligned.

I find it somewhat arbitrary that we choose to align the 2nd pointer and
not the first. Obviously it'll be easy to detect which one of the 2 is
unaligned and align it so that from there on memcmp can continue doing
aligned accesses. However, this means a check like that would be done
for *every* (well, barring some threshold value) call to memcmp.

> 
>  But then trying to do shift-and-masking for the possibly remaining
> unaligned source is crazy and garbage. Don't do it.
> 
> And you never saw that, because your special FIDEDUPERANGE testcase
> will never have anything but mutually aligned cases.
> 
> Which just shows that going from "don't care at all' to "care about
> one special case" is not the way to go.
> 
> So I'd much rather see a simple default function that works well for
> the sane architectures, than go with the default code from glibc - and
> bad for the common modern architectures.

So you are saying that the current memcmp could indeed use improvement
but you don't want it to be based on the glibc's code due to the ugly
misalignment handling?

> 
> Then architectures could choose that one with some

So you are suggesting keeping the current byte comparison one aka
'naive' and having another, more optimized generic implementation that
should be selected by GENERIC_MEMCMP or have I misunderstood you ?

> 
>         select GENERIC_MEMCMP
> 
> the same way we have
> 
>         select GENERIC_STRNCPY_FROM_USER
> 
> for the (sane, for normal architectures) common optimized case for a
> special string instruction that matters a lot for the kernel.
> 
>                      Linus
> 


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:17   ` Nikolay Borisov
@ 2021-07-21 18:45     ` Linus Torvalds
  2021-07-21 18:55       ` Linus Torvalds
                         ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Linus Torvalds @ 2021-07-21 18:45 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner

[-- Attachment #1: Type: text/plain, Size: 2562 bytes --]

On Wed, Jul 21, 2021 at 11:17 AM Nikolay Borisov <nborisov@suse.com> wrote:
>
> I find it somewhat arbitrary that we choose to align the 2nd pointer and
> not the first.

Yeah, that's a bit odd, but I don't think it matters.

The hope is obviously that they are mutually aligned, and in that case
it doesn't matter which one you aim to align.

> So you are saying that the current memcmp could indeed use improvement
> but you don't want it to be based on the glibc's code due to the ugly
> misalignment handling?

Yeah. I suspect that this (very simple) patch gives you the same
performance improvement that the glibc code does.

NOTE! I'm not saying this patch is perfect. This one doesn't even
_try_ to do the mutual alignment, because it's really silly. But I'm
throwing this out here for discussion, because

 - it's really simple

 - I suspect it gets you 99% of the way there

 - the code generation is actually quite good with both gcc and clang.
This is gcc:

        memcmp:
                jmp     .L60
        .L52:
                movq    (%rsi), %rax
                cmpq    %rax, (%rdi)
                jne     .L53
                addq    $8, %rdi
                addq    $8, %rsi
                subq    $8, %rdx
        .L60:
                cmpq    $7, %rdx
                ja      .L52
                testq   %rdx, %rdx
                je      .L61
        .L53:
                xorl    %ecx, %ecx
                jmp     .L56
        .L62:
                addq    $1, %rcx
                cmpq    %rcx, %rdx
                je      .L51
        .L56:
                movzbl  (%rdi,%rcx), %eax
                movzbl  (%rsi,%rcx), %r8d
                subl    %r8d, %eax
                je      .L62
        .L51:
                ret
        .L61:
                xorl    %eax, %eax
                ret

and notice how there are no spills, no extra garbage, just simple and
straightforward code.

Those things end up mattering too - it's good for I$, it's good for the
small cases, and it's good for debugging and reading the code.

If this is "good enough" for your test-case, I really would prefer
something like this. "Make it as simple as possible, but no simpler"

I can do the mutual alignment too, but I'd actually prefer to do it as
a separate patch, for when there are numbers for that.

And I wouldn't do it as a byte-by-byte case, because that's just stupid.

I'd do it using a separate first single "get unaligned word from both
sources, compare them for equality, and then only add enough bytes to
align"

                  Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 975 bytes --]

 lib/string.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/string.c b/lib/string.c
index 77bd0b1d3296..b2de45a581f4 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 
+#include <asm/unaligned.h>
 #include <asm/byteorder.h>
 #include <asm/word-at-a-time.h>
 #include <asm/page.h>
@@ -935,6 +936,21 @@ __visible int memcmp(const void *cs, const void *ct, size_t count)
 	const unsigned char *su1, *su2;
 	int res = 0;
 
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (count >= sizeof(unsigned long)) {
+		const unsigned long *u1 = cs;
+		const unsigned long *u2 = ct;
+		do {
+			if (get_unaligned(u1) != get_unaligned(u2))
+				break;
+			u1++;
+			u2++;
+			count -= sizeof(unsigned long);
+		} while (count >= sizeof(unsigned long));
+		cs = u1;
+		ct = u2;
+	}
+#endif
 	for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
 		if ((res = *su1 - *su2) != 0)
 			break;


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:45     ` Linus Torvalds
@ 2021-07-21 18:55       ` Linus Torvalds
  2021-07-21 19:26       ` Linus Torvalds
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Linus Torvalds @ 2021-07-21 18:55 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner

On Wed, Jul 21, 2021 at 11:45 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
>  - the code generation is actually quite good with both gcc and clang.

Side note: I only looked at whether the code looked good, I didn't
check whether it looked *correct*.

So caveat emptor.  It looks trivially correct both on the source and
assembly level, but it's entirely untested, which probably means that
it has some stupid bug.
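
Since the patch is explicitly untested, a throwaway self-check along these
lines (hypothetical, never posted in this thread) would exercise the word
loop, the bytewise tail and assorted mutual alignments:

/* Hypothetical self-check, not part of any posted patch. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/string.h>

static int __init memcmp_selftest(void)
{
	static unsigned char a[128], b[128];
	size_t off1, off2, len, i;

	for (off1 = 0; off1 < 8; off1++) {
		for (off2 = 0; off2 < 8; off2++) {
			for (len = 0; len <= 100; len++) {
				int want = 0, got;

				/* Identical ranges, then force a late mismatch. */
				get_random_bytes(a, sizeof(a));
				memcpy(b + off2, a + off1, len);
				if (len > 2)
					b[off2 + len - 2] += 1;

				/* Bytewise reference result. */
				for (i = 0; i < len; i++) {
					if (a[off1 + i] != b[off2 + i]) {
						want = a[off1 + i] < b[off2 + i] ? -1 : 1;
						break;
					}
				}

				got = memcmp(a + off1, b + off2, len);
				got = (got > 0) - (got < 0);
				if (got != want) {
					pr_err("memcmp selftest: off1=%zu off2=%zu len=%zu\n",
					       off1, off2, len);
					return -EINVAL;
				}
			}
		}
	}
	pr_info("memcmp selftest passed\n");
	return 0;
}
late_initcall(memcmp_selftest);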

               Linus


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:45     ` Linus Torvalds
  2021-07-21 18:55       ` Linus Torvalds
@ 2021-07-21 19:26       ` Linus Torvalds
  2021-07-22 11:28         ` Nikolay Borisov
  2021-07-22  8:28       ` Nikolay Borisov
  2021-07-23 14:02       ` David Laight
  3 siblings, 1 reply; 23+ messages in thread
From: Linus Torvalds @ 2021-07-21 19:26 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner

[-- Attachment #1: Type: text/plain, Size: 2877 bytes --]

On Wed, Jul 21, 2021 at 11:45 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> I can do the mutual alignment too, but I'd actually prefer to do it as
> a separate patch, for when there are numbers for that.
>
> And I wouldn't do it as a byte-by-byte case, because that's just stupid.

Here's that "try to align one of the pointers in order to avoid the
lots-of-unaligned case" patch.

It's not quite as simple, and the generated assembly isn't quite as
obvious. But it still generates code that looks good, it's just that
the code to align the first pointer ends up being a bit harder to
read.

And since it's a bit less obvious, the "this is probably buggy because
I didn't actually _test_ it" warning holds even more. But you can see
how much simpler the code still is than the horrendous glibc mess is.

And I removed the "CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS" checking,
because now it should be at least *somewhat* reasonable even on
machines that have a complicated "get_unaligned()".

But maybe I should have kept it. Only testing will tell.

Again: UNTESTED GARBAGE ATTACHED. Be careful. But it might work, and
it generates something that looks superficially reasonable.

Gcc again:

        memcmp:
                cmpq    $7, %rdx
                jbe     .L56
                movq    (%rsi), %rax
                cmpq    %rax, (%rdi)
                je      .L61
        .L55:
                xorl    %ecx, %ecx
                jmp     .L58
        .L62:
                addq    $1, %rcx
                cmpq    %rcx, %rdx
                je      .L51
        .L58:
                movzbl  (%rdi,%rcx), %eax
                movzbl  (%rsi,%rcx), %r8d
                subl    %r8d, %eax
                je      .L62
        .L51:
                ret
        .L56:
                testq   %rdx, %rdx
                jne     .L55
                xorl    %eax, %eax
                ret
        .L61:
                movq    %rdi, %rcx
                movl    $8, %eax
                andl    $7, %ecx
                subq    %rcx, %rax
                leaq    -8(%rcx,%rdx), %rdx
                addq    %rax, %rdi
                addq    %rax, %rsi
                cmpq    $7, %rdx
                ja      .L57
                jmp     .L56
        .L63:
                subq    $8, %rdx
                addq    $8, %rdi
                addq    $8, %rsi
                cmpq    $7, %rdx
                jbe     .L56
        .L57:
                movq    (%rsi), %rax
                cmpq    %rax, (%rdi)
                je      .L63
                jmp     .L55

but clang is similar (except clang isn't as eager to move basic blocks
around, so it's visually very different).

Note no spills, no odd shifts for unaligned accesses, no garbage.

Again: untested, so consider this a starting point rather than
anything good and proper.

                   Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 1172 bytes --]

 lib/string.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/string.c b/lib/string.c
index 77bd0b1d3296..3eb390fc4f73 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 
+#include <asm/unaligned.h>
 #include <asm/byteorder.h>
 #include <asm/word-at-a-time.h>
 #include <asm/page.h>
@@ -935,6 +936,31 @@ __visible int memcmp(const void *cs, const void *ct, size_t count)
 	const unsigned char *su1, *su2;
 	int res = 0;
 
+	if (count >= sizeof(unsigned long)) {
+		const unsigned long *u1 = cs;
+		const unsigned long *u2 = ct;
+		unsigned long offset;
+
+		if (get_unaligned(u1) != get_unaligned(u2))
+			goto bytewise;
+
+		/* Align 'u1' up */
+		offset = sizeof(*u1) - ((sizeof(*u1)-1) & (unsigned long)(u1));
+		u1 = cs + offset;
+		u2 = ct + offset;
+		count -= offset;
+
+		while (count >= sizeof(unsigned long)) {
+			if (*u1 != get_unaligned(u2))
+				break;
+			u1++;
+			u2++;
+			count -= sizeof(unsigned long);
+		}
+		cs = u1;
+		ct = u2;
+	}
+bytewise:
 	for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
 		if ((res = *su1 - *su2) != 0)
 			break;


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:00 ` Linus Torvalds
  2021-07-21 18:17   ` Nikolay Borisov
@ 2021-07-21 20:10   ` David Sterba
  2021-07-21 20:27     ` Linus Torvalds
  1 sibling, 1 reply; 23+ messages in thread
From: David Sterba @ 2021-07-21 20:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nikolay Borisov, Linux Kernel Mailing List, Nick Desaulniers,
	linux-fsdevel, Dave Chinner

On Wed, Jul 21, 2021 at 11:00:59AM -0700, Linus Torvalds wrote:
> On Wed, Jul 21, 2021 at 6:59 AM Nikolay Borisov <nborisov@suse.com> wrote:
> >
> > This is glibc's memcmp version. The upside is that for architectures
> > which don't have an optimized version the kernel can provide some
> > solace in the form of a generic, word-sized optimized memcmp. I tested
> > this with a heavy IOCTL_FIDEDUPERANGE(2) workload and here are the
> > results I got:
> 
> Hmm. I suspect the usual kernel use of memcmp() is _very_ skewed to
> very small memcmp calls, and I don't think I've ever seen that
> (horribly bad) byte-wise default memcmp in most profiles.
> 
> I suspect that FIDEDUPERANGE thing is most likely a very special case.
> 
> So I don't think you're wrong to look at this, but I think you've gone
> from our old "spend no effort at all" to "look at one special case".

The memcmp in question is fs/remap_range.c:vfs_dedupe_file_range_compare

   253                  src_addr = kmap_atomic(src_page);
   254                  dest_addr = kmap_atomic(dest_page);
   ...
   259                  if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
   260                          same = false;
   261  
   262                  kunmap_atomic(dest_addr);
   263                  kunmap_atomic(src_addr);

so adding a memcmp_large that compares by native words or u64 could be
the best option. There's some alignment of the starting offset and
length but that can be special cased and fall back to standard memcmp.
The dedupe ioctl is typically called on ranges spanning many pages so
the overhead of the non-paged portions should be insignificant.
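
Something along these lines is what such a helper could look like (rough
sketch only - the name and the assumption that both mapped buffers are
long-aligned are mine, not code from any posted patch):

/*
 * Sketch of a memcmp_large(): the caller guarantees both pointers are
 * aligned to sizeof(unsigned long) (e.g. kmapped pages with matching
 * in-page offsets). The tail, or the first mismatching word, falls back
 * to plain memcmp() so the sign of the result stays byte-order correct.
 */
static inline int memcmp_large(const void *a, const void *b, size_t len)
{
	const unsigned long *wa = a;
	const unsigned long *wb = b;

	while (len >= sizeof(unsigned long) && *wa == *wb) {
		wa++;
		wb++;
		len -= sizeof(unsigned long);
	}
	return len ? memcmp(wa, wb, len) : 0;
}

For the dedupe comparison only equality matters, so the fallback could even
be reduced to reporting "different".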


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 20:10   ` David Sterba
@ 2021-07-21 20:27     ` Linus Torvalds
  2021-07-22  5:54       ` Nikolay Borisov
  0 siblings, 1 reply; 23+ messages in thread
From: Linus Torvalds @ 2021-07-21 20:27 UTC (permalink / raw)
  To: David Sterba, Linus Torvalds, Nikolay Borisov,
	Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel,
	Dave Chinner

On Wed, Jul 21, 2021 at 1:13 PM David Sterba <dsterba@suse.cz> wrote:
>
> adding a memcmp_large that compares by native words or u64 could be
> the best option.

Yeah, we could just special-case that one place.

But see the patches I sent out - I think we can get the best of both worlds.

A small and simple memcmp() that is good enough and not the
_completely_ stupid thing we have now.

The second patch I sent out even gets the mutually aligned case right.

Of course, the glibc code also ended up unrolling things a bit, but
honestly, the way it did it was too disgusting for words.

And if it really turns out that the unrolling makes a big difference -
although I doubt it's meaningful with any modern core - I can add a
couple of lines to that simple patch I sent out to do that too.
Without getting the monster that is that glibc code.

Of course, my patch depends on the fact that "get_unaligned()" is
cheap on all CPU's that really matter, and that caches aren't
direct-mapped any more. The glibc code seems to be written for a world
where registers are cheap, unaligned accesses are prohibitively
expensive, and unrolling helps because L1 caches are direct-mapped and
you really want to do chunking to not get silly way conflicts.

If old-style Sparc or MIPS was our primary target, that would be one
thing. But it really isn't.

              Linus


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 20:27     ` Linus Torvalds
@ 2021-07-22  5:54       ` Nikolay Borisov
  0 siblings, 0 replies; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-22  5:54 UTC (permalink / raw)
  To: Linus Torvalds, David Sterba, Linux Kernel Mailing List,
	Nick Desaulniers, linux-fsdevel, Dave Chinner



On 21.07.21 г. 23:27, Linus Torvalds wrote:
> On Wed, Jul 21, 2021 at 1:13 PM David Sterba <dsterba@suse.cz> wrote:
>>
>> adding a memcmp_large that compares by native words or u64 could be
>> the best option.
> 
> Yeah, we could just special-case that one place.

This whole thread started because I first implemented a special case just
for dedupe, and Dave Chinner suggested that instead of playing whack-a-mole
we get something decent into the generic memcmp, so that we get an
improvement across the whole of the kernel.

> 
> But see the patches I sent out - I think we can get the best of both worlds.
> 
> A small and simple memcmp() that is good enough and not the
> _completely_ stupid thing we have now.
> 
> The second patch I sent out even gets the mutually aligned case right.
> 
> Of course, the glibc code also ended up unrolling things a bit, but
> honestly, the way it did it was too disgusting for words.
> 
> And if it really turns out that the unrolling makes a big difference -
> although I doubt it's meaningful with any modern core - I can add a
> couple of lines to that simple patch I sent out to do that too.
> Without getting the monster that is that glibc code.
> 
> Of course, my patch depends on the fact that "get_unaligned()" is
> cheap on all CPU's that really matter, and that caches aren't
> direct-mapped any more. The glibc code seems to be written for a world
> where registers are cheap, unaligned accesses are prohibitively
> expensive, and unrolling helps because L1 caches are direct-mapped and
> you really want to do chunking to not get silly way conflicts.
> 
> If old-style Sparc or MIPS was our primary target, that would be one
> thing. But it really isn't.
> 
>               Linus
> 


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:45     ` Linus Torvalds
  2021-07-21 18:55       ` Linus Torvalds
  2021-07-21 19:26       ` Linus Torvalds
@ 2021-07-22  8:28       ` Nikolay Borisov
  2021-07-23 14:02       ` David Laight
  3 siblings, 0 replies; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-22  8:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner



On 21.07.21 г. 21:45, Linus Torvalds wrote:
> On Wed, Jul 21, 2021 at 11:17 AM Nikolay Borisov <nborisov@suse.com> wrote:
>>
>> I find it somewhat arbitrary that we choose to align the 2nd pointer and
>> not the first.
> 
> Yeah, that's a bit odd, but I don't think it matters.
> 
> The hope is obviously that they are mutually aligned, and in that case
> it doesn't matter which one you aim to align.
> 
>> So you are saying that the current memcmp could indeed use improvement
>> but you don't want it to be based on the glibc's code due to the ugly
>> misalignment handling?
> 
> Yeah. I suspect that this (very simple) patch gives you the same
> performance improvement that the glibc code does.

You suspect correctly, perf profile:

30.44%    -29.38%  [kernel.vmlinux]         [k] memcmp

This is only on x86-64 as I don't have other arch handy. But this one is
definitely good.




* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 19:26       ` Linus Torvalds
@ 2021-07-22 11:28         ` Nikolay Borisov
  2021-07-22 16:40           ` Linus Torvalds
  0 siblings, 1 reply; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-22 11:28 UTC (permalink / raw)
  To: Linus Torvalds, Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner



On 21.07.21 г. 22:26, Linus Torvalds wrote:
> On Wed, Jul 21, 2021 at 11:45 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> I can do the mutual alignment too, but I'd actually prefer to do it as
>> a separate patch, for when there are numbers for that.
>>
>> And I wouldn't do it as a byte-by-byte case, because that's just stupid.
> 
> Here's that "try to align one of the pointers in order to avoid the
> lots-of-unaligned case" patch.
> 
> It's not quite as simple, and the generated assembly isn't quite as
> obvious. But it still generates code that looks good, it's just that
> the code to align the first pointer ends up being a bit harder to
> read.
> 

This one also works, tested only on x86-64. Looking at the perf diff:

    30.44%    -28.66%  [kernel.vmlinux]         [k] memcmp


Comparing the 2 versions that you submitted, the difference is:

     1.05%     +0.72%  [kernel.vmlinux]     [k] memcmp


So the pointer alignment one is slightly more expensive. However those
measurements were done only on x86-64.

Now on a more practical note, IIUC your 2nd version makes sense if the
cost of doing one unaligned access in the loop body is offset by the
fact we are doing a native word-sized comparison, right?


<snip>


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-22 11:28         ` Nikolay Borisov
@ 2021-07-22 16:40           ` Linus Torvalds
  2021-07-22 17:03             ` Nikolay Borisov
  2021-08-26  9:03             ` Nikolay Borisov
  0 siblings, 2 replies; 23+ messages in thread
From: Linus Torvalds @ 2021-07-22 16:40 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Nikolay Borisov, Linux Kernel Mailing List, Nick Desaulniers,
	linux-fsdevel, Dave Chinner

On Thu, Jul 22, 2021 at 4:28 AM Nikolay Borisov
<n.borisov.lkml@gmail.com> wrote:
>
> This one also works, tested only on x86-64. Looking at the perf diff:
>
>     30.44%    -28.66%  [kernel.vmlinux]         [k] memcmp

Ok.

So the one that doesn't even bother to align is

    30.44%    -29.38%  [kernel.vmlinux]         [k] memcmp

and the one that aligns the first one is

    30.44%    -28.66%  [kernel.vmlinux]         [k] memcmp

and the difference between the two is basically in the noise:

     1.05%     +0.72%  [kernel.vmlinux]     [k] memcmp

but the first one does seem to be slightly better.

> Now on a more practical note, IIUC your 2nd version makes sense if the
>> cost of doing one unaligned access in the loop body is offset by the
> fact we are doing a native word-sized comparison, right?

So honestly, the reason the first one seems to beat the second one is
that the cost of unaligned accesses on modern x86 is basically
epsilon.

For all normal unaligned accesses there simply is no cost at all.
There is a _tiny_ cost when the unaligned access crosses a cacheline
access boundary (which on older CPU's is every 32 bytes, on modern
ones it's 64 bytes). And then there is another slightly bigger cost
when the unaligned access actually crosses a page boundary.

But even those non-zero cost cases are basically in the noise, because
under most circumstances they will be hidden by any out-of-order
engine, and completely dwarfed by the _real_ costs which are branch
mispredicts and cache misses.

So on the whole, unaligned accesses are basically no cost at all. You
almost have to have unusual code sequences for them to be even
measurable.

So that second patch that aligns one of the sources is basically only
extra overhead for no real advantage. The cost of it is probably one
more branch mispredict, and possibly a cycle or two for the extra
instructions.

Which is why the first one wins: it's simpler, and the extra work the
second one does is basically not worth it on x86. Plus I suspect your
test-case was all aligned anyway to begin with, so the extra work is
_doubly_ pointless.

I suspect the second patch would be worthwhile if

 (a) there really were a lot of strings that weren't aligned (likelihood: low)

 (b) other microarchitectures that do worse on unaligned accesses -
some microarchitectures spend extra cycles on _any_ unaligned accesses
even if they don't cross cache access boundaries etc.

and I can see (b) happening quite easily. You just won't see it on a
modern x86-64 setup.

I suspect we should start with the first version. It's not only better
on x86, but it's simpler, and it's guarded by that

    #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

so it's fundamentally "safer" on architectures that are just horrible
about unaligned accesses.

Now, admittedly I don't personally even care about such architectures,
and because we use "get_unaligned()", the compiler should always
generate code that doesn't absolutely suck for bad architectures, but
considering how long we've gone with the completely brainlessly simple
"byte at a time" version without anybody even noticing, I think a
minimal change is a better change.

That said, I'm not convinced I want to apply even that minimal first
patch outside of the merge window.

So would you mind reminding me about this patch the next merge window?
Unless there was some big external reason why the performance of
memcmp() mattered to you so much (ie some user that actually noticed
and complained) and we really want to prioritize this..

              Linus


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-22 16:40           ` Linus Torvalds
@ 2021-07-22 17:03             ` Nikolay Borisov
  2021-08-26  9:03             ` Nikolay Borisov
  1 sibling, 0 replies; 23+ messages in thread
From: Nikolay Borisov @ 2021-07-22 17:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nikolay Borisov, Linux Kernel Mailing List, Nick Desaulniers,
	linux-fsdevel, Dave Chinner



On 22.07.21 г. 19:40, Linus Torvalds wrote:
> On Thu, Jul 22, 2021 at 4:28 AM Nikolay Borisov
> <n.borisov.lkml@gmail.com> wrote:
>>
>> This one also works, tested only on x86-64. Looking at the perf diff:
>>
>>     30.44%    -28.66%  [kernel.vmlinux]         [k] memcmp
> 
> Ok.
> 
> So the one that doesn't even bother to align is
> 
>     30.44%    -29.38%  [kernel.vmlinux]         [k] memcmp
> 
> and the one that aligns the first one is
> 
>     30.44%    -28.66%  [kernel.vmlinux]         [k] memcmp
> 
> and the difference between the two is basically in the noise:
> 
>      1.05%     +0.72%  [kernel.vmlinux]     [k] memcmp
> 
> but the first one does seem to be slightly better.
> 
>> Now on a more practical note, IIUC your 2nd version makes sense if the
>> cost of doing one unaligned access in the loop body is offset by the
>> fact we are doing a native word-sized comparison, right?
> 
> So honestly, the reason the first one seems to beat the second one is
> that the cost of unaligned accesses on modern x86 is basically
> epsilon.
> 
> For all normal unaligned accesses there simply is no cost at all.
> There is a _tiny_ cost when the unaligned access crosses a cacheline
> access boundary (which on older CPU's is every 32 bytes, on modern
> ones it's 64 bytes). And then there is another slightly bigger cost
> when the unaligned access actually crosses a page boundary.
> 
> But even those non-zero cost cases are basically in the noise, because
> under most circumstances they will be hidden by any out-of-order
> engine, and completely dwarfed by the _real_ costs which are branch
> mispredicts and cache misses.
> 
> So on the whole, unaligned accesses are basically no cost at all. You
> almost have to have unusual code sequences for them to be even
> measurable.
> 
> So that second patch that aligns one of the sources is basically only
> extra overhead for no real advantage. The cost of it is probably one
> more branch mispredict, and possibly a cycle or two for the extra
> instructions.
> 
> Which is why the first one wins: it's simpler, and the extra work the
> second one does is basically not worth it on x86. Plus I suspect your
> test-case was all aligned anyway to begin with, so the extra work is
> _doubly_ pointless.
> 
> I suspect the second patch would be worthwhile if
> 
>  (a) there really were a lot of strings that weren't aligned (likelihood: low)
> 
>  (b) other microarchitectures that do worse on unaligned accesses -
> some microarchitectures spend extra cycles on _any_ unaligned accesses
> even if they don't cross cache access boundaries etc.
> 
> and I can see (b) happening quite easily. You just won't see it on a
> modern x86-64 setup.
> 
> I suspect we should start with the first version. It's not only better
> on x86, but it's simpler, and it's guarded by that
> 
>     #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> 
> so it's fundamentally "safer" on architectures that are just horrible
> about unaligned accesses.
> 
> Now, admittedly I don't personally even care about such architectures,
> and because we use "get_unaligned()", the compiler should always
> generate code that doesn't absolutely suck for bad architectures, but
> considering how long we've gone with the completely brainlessly simple
> "byte at a time" version without anybody even noticing, I think a
> minimal change is a better change.
> 
> That said, I'm not convinced I want to apply even that minimal first
> patch outside of the merge window.
> 
> So would you mind reminding me about this patch the next merge window?
> Unless there was some big external reason why the performance of
> memcmp() mattered to you so much (ie some user that actually noticed
> and complained) and we really want to prioritize this..

I will do my best and hope I don't forget. OTOH there isn't anything
pressing; it's something I found while looking at fidedupe's performance,
and it's not even the major contributor. Still, it looks sensible to fix
it now, while I have a workload at hand which clearly demonstrates the
positive impact and makes it easy to measure any changes.
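
For reference, the workload boils down to issuing dedupe requests along
these lines (illustrative userspace sketch, offsets and error handling
trimmed; the helper name is made up):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIDEDUPERANGE, struct file_dedupe_range */

/* Dedupe 'len' bytes at offset 0 of src_fd against offset 0 of dest_fd. */
static long dedupe_one_range(int src_fd, int dest_fd, unsigned long long len)
{
	struct file_dedupe_range *range;
	long ret;

	range = calloc(1, sizeof(*range) + sizeof(range->info[0]));
	if (!range)
		return -1;

	range->src_offset = 0;
	range->src_length = len;
	range->dest_count = 1;
	range->info[0].dest_fd = dest_fd;
	range->info[0].dest_offset = 0;

	/* The kernel memcmp()s the ranges page by page before sharing them. */
	ret = ioctl(src_fd, FIDEDUPERANGE, range);
	if (ret == 0 && range->info[0].status == FILE_DEDUPE_RANGE_SAME)
		ret = (long)range->info[0].bytes_deduped;

	free(range);
	return ret;
}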
> 
>               Linus
> 


* RE: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 18:45     ` Linus Torvalds
                         ` (2 preceding siblings ...)
  2021-07-22  8:28       ` Nikolay Borisov
@ 2021-07-23 14:02       ` David Laight
  3 siblings, 0 replies; 23+ messages in thread
From: David Laight @ 2021-07-23 14:02 UTC (permalink / raw)
  To: 'Linus Torvalds', Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner

From: Linus Torvalds
> Sent: 21 July 2021 19:46
> 
> On Wed, Jul 21, 2021 at 11:17 AM Nikolay Borisov <nborisov@suse.com> wrote:
> >
> > I find it somewhat arbitrary that we choose to align the 2nd pointer and
> > not the first.
> 
> Yeah, that's a bit odd, but I don't think it matters.
> 
> The hope is obviously that they are mutually aligned, and in that case
> it doesn't matter which one you aim to align.
> 
> > So you are saying that the current memcmp could indeed use improvement
> > but you don't want it to be based on the glibc's code due to the ugly
> > misalignment handling?
> 
> Yeah. I suspect that this (very simple) patch gives you the same
> performance improvement that the glibc code does.
> 
> NOTE! I'm not saying this patch is perfect. This one doesn't even
> _try_ to do the mutual alignment, because it's really silly. But I'm
> throwing this out here for discussion, because
> 
>  - it's really simple
> 
>  - I suspect it gets you 99% of the way there
> 
>  - the code generation is actually quite good with both gcc and clang.
> This is gcc:
> 
>         memcmp:
>                 jmp     .L60
>         .L52:
>                 movq    (%rsi), %rax
>                 cmpq    %rax, (%rdi)
>                 jne     .L53
>                 addq    $8, %rdi
>                 addq    $8, %rsi
>                 subq    $8, %rdx
>         .L60:
>                 cmpq    $7, %rdx
>                 ja      .L52

I wonder how fast that can be made to run.
I think the two conditional branches have to run in separate clocks.
So you may get all 5 arithmetic operations to run in the same 2 clocks.
But that may be pushing things on everything except the very latest cpu.
The memory reads aren't limiting at all, the cpu can do two per clock.
So even though (IIRC) misaligned ones cost an extra clock it doesn't matter.

That looks like a *dst++ = *src++ loop.
The array copy dst[i] = src[i]; i++ requires one less 'addq'
provided the cpu has 'register + register' addressing.
Not decrementing the length also saves an 'addq'.
So the loop:
	for (i = 0; i < length - 7; i += 8)
		dst[i] = src[i];  /* Hacked to be right in C */
probably only has one addq and one cmpq per iteration.
That is much more likely to run in the 2 clocks.
(If you can persuade gcc not to transform it!)

It may also be possible to remove the cmpq by arranging
that the flags from the addq contain the right condition.
That needs something like:
	dst += len; src += len; len = -len
	do
		dst[len] = src[len];
	while ((len += 8) < 0);
That probably isn't necessary for x86, but is likely to help sparc.

For mips-like cpu (with 'compare and jump', only 'reg + constant'
addressing) you really want a loop like:
	dst_end = dst + length;
	do
		*dst++ = *src++;
	while (dst < dst_end);
This has two adds and a compare per iteration.
That might be a good compromise for aligned copies.
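
Carried over to the word-equality scan in Linus's patch, that shape would be
roughly (untested sketch; it assumes cs has already been aligned,
get_unaligned() covers the other side, and ordering is still resolved by
the existing bytewise tail):

	const unsigned long *u1 = cs;
	const unsigned long *u2 = ct;
	const unsigned long *end = u1 + count / sizeof(*u1);

	/* One data compare and two pointer increments per iteration. */
	while (u1 < end && *u1 == get_unaligned(u2)) {
		u1++;
		u2++;
	}
	count -= (const char *)u1 - (const char *)cs;
	cs = u1;
	ct = u2;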

I'm not at all sure it is ever worth aligning either pointer
if misaligned reads don't fault.
Most compares (of any size) will be aligned.
So you get the 'hit' of the test when it cannot help.
That almost certainly exceeds any benefit.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-21 13:59 [PATCH] lib/string: Bring optimized memcmp from glibc Nikolay Borisov
  2021-07-21 14:32 ` Christoph Hellwig
  2021-07-21 18:00 ` Linus Torvalds
@ 2021-07-28 20:12 ` Florian Weimer
  2 siblings, 0 replies; 23+ messages in thread
From: Florian Weimer @ 2021-07-28 20:12 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: linux-kernel, ndesaulniers, torvalds, linux-fsdevel, david

* Nikolay Borisov:

> +/*
> + * Compare A and B bytewise in the byte order of the machine.
> + * A and B are known to be different. This is needed only on little-endian
> + * machines.
> + */
> +static inline int memcmp_bytes(unsigned long a, unsigned long b)
> +{
> +	long srcp1 = (long) &a;
> +	long srcp2 = (long) &b;
> +	unsigned long a0, b0;
> +
> +	do {
> +		a0 = ((uint8_t *) srcp1)[0];
> +		b0 = ((uint8_t *) srcp2)[0];
> +		srcp1 += 1;
> +		srcp2 += 1;
> +	} while (a0 == b0);
> +	return a0 - b0;
> +}

Should this be this?

static inline int memcmp_bytes(unsigned long a, unsigned long b)
{
	if (sizeof(a) == 4)
		return __builtin_bswap32(a) < __builtin_bswap32(b) ? -1 : 0;
	else
		return __builtin_bswap64(a) < __builtin_bswap64(b) ? -1 : 0;
}

(Or whatever macro versions the kernel has for this.)
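
With the kernel's own byte-swap helpers that would be roughly (untested
sketch; note the result must be non-zero here, since the caller only gets
this far when the two words already differ):

static inline int memcmp_bytes(unsigned long a, unsigned long b)
{
	/*
	 * swab32()/swab64() from <linux/swab.h>; only the little-endian
	 * build uses this path, so byte-swapping gives lexicographic order.
	 */
	if (sizeof(a) == 4)
		return swab32(a) < swab32(b) ? -1 : 1;
	else
		return swab64(a) < swab64(b) ? -1 : 1;
}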

Or is the expectation that targets that don't have an assembler
implementation for memcmp also have bad bswap built-ins?

Thanks,
Florian



* Re: [PATCH] lib/string: Bring optimized memcmp from glibc
  2021-07-22 16:40           ` Linus Torvalds
  2021-07-22 17:03             ` Nikolay Borisov
@ 2021-08-26  9:03             ` Nikolay Borisov
  1 sibling, 0 replies; 23+ messages in thread
From: Nikolay Borisov @ 2021-08-26  9:03 UTC (permalink / raw)
  To: Linus Torvalds, Nikolay Borisov
  Cc: Linux Kernel Mailing List, Nick Desaulniers, linux-fsdevel, Dave Chinner



On 22.07.21 г. 19:40, Linus Torvalds wrote:
> So would you mind reminding me about this patch the next merge window?
> Unless there was some big extrernal reason why the performance of
> memcmp() mattered to you so much (ie some user that actually noticed
> and complained) and we really want to prioritize this..


I'm reminding you of spinning this patch up for 5.15 :)
