* [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
@ 2021-11-17 18:45 kernel test robot
  2021-11-17 18:55 ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: kernel test robot @ 2021-11-17 18:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: kbuild-all, linux-kernel, x86, Peter Zijlstra, Alexander Duyck


tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
head:   d31c3c683ee668ba5d87c0730610442fd672525f
commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
        git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
        git fetch --no-tags tip x86/core
        git checkout d31c3c683ee668ba5d87c0730610442fd672525f
        # save the attached .config to linux build tree
        make W=1 ARCH=um SUBARCH=x86_64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
>> arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
      98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
         |            ^~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/load_unaligned_zeropad +98 arch/x86/um/../lib/csum-partial_64.c

    23	
    24	/*
    25	 * Do a checksum on an arbitrary memory area.
    26	 * Returns a 32bit checksum.
    27	 *
    28	 * This isn't as time critical as it used to be because many NICs
    29	 * do hardware checksumming these days.
    30	 *
    31	 * Still, with CHECKSUM_COMPLETE this is called to compute
    32	 * checksums on IPv6 headers (40 bytes) and other small parts.
    33	 * it's best to have buff aligned on a 64-bit boundary
    34	 */
    35	__wsum csum_partial(const void *buff, int len, __wsum sum)
    36	{
    37		u64 temp64 = (__force u64)sum;
    38		unsigned odd, result;
    39	
    40		odd = 1 & (unsigned long) buff;
    41		if (unlikely(odd)) {
    42			if (unlikely(len == 0))
    43				return sum;
    44			temp64 += (*(unsigned char *)buff << 8);
    45			len--;
    46			buff++;
    47		}
    48	
    49		while (unlikely(len >= 64)) {
    50			asm("addq 0*8(%[src]),%[res]\n\t"
    51			    "adcq 1*8(%[src]),%[res]\n\t"
    52			    "adcq 2*8(%[src]),%[res]\n\t"
    53			    "adcq 3*8(%[src]),%[res]\n\t"
    54			    "adcq 4*8(%[src]),%[res]\n\t"
    55			    "adcq 5*8(%[src]),%[res]\n\t"
    56			    "adcq 6*8(%[src]),%[res]\n\t"
    57			    "adcq 7*8(%[src]),%[res]\n\t"
    58			    "adcq $0,%[res]"
    59			    : [res] "+r" (temp64)
    60			    : [src] "r" (buff)
    61			    : "memory");
    62			buff += 64;
    63			len -= 64;
    64		}
    65	
    66		if (len & 32) {
    67			asm("addq 0*8(%[src]),%[res]\n\t"
    68			    "adcq 1*8(%[src]),%[res]\n\t"
    69			    "adcq 2*8(%[src]),%[res]\n\t"
    70			    "adcq 3*8(%[src]),%[res]\n\t"
    71			    "adcq $0,%[res]"
    72				: [res] "+r" (temp64)
    73				: [src] "r" (buff)
    74				: "memory");
    75			buff += 32;
    76		}
    77		if (len & 16) {
    78			asm("addq 0*8(%[src]),%[res]\n\t"
    79			    "adcq 1*8(%[src]),%[res]\n\t"
    80			    "adcq $0,%[res]"
    81				: [res] "+r" (temp64)
    82				: [src] "r" (buff)
    83				: "memory");
    84			buff += 16;
    85		}
    86		if (len & 8) {
    87			asm("addq 0*8(%[src]),%[res]\n\t"
    88			    "adcq $0,%[res]"
    89				: [res] "+r" (temp64)
    90				: [src] "r" (buff)
    91				: "memory");
    92			buff += 8;
    93		}
    94		if (len & 7) {
    95			unsigned int shift = (8 - (len & 7)) * 8;
    96			unsigned long trail;
    97	
  > 98			trail = (load_unaligned_zeropad(buff) << shift) >> shift;
    99	
   100			asm("addq %[trail],%[res]\n\t"
   101			    "adcq $0,%[res]"
   102				: [res] "+r" (temp64)
   103				: [trail] "r" (trail));
   104		}
   105		result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
   106		if (unlikely(odd)) { 
   107			result = from32to16(result);
   108			result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
   109		}
   110		return (__force __wsum)result;
   111	}
   112	EXPORT_SYMBOL(csum_partial);
   113	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org



* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-17 18:45 [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' kernel test robot
@ 2021-11-17 18:55 ` Eric Dumazet
  2021-11-17 19:40   ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-17 18:55 UTC (permalink / raw)
  To: kernel test robot
  Cc: kbuild-all, linux-kernel, x86, Peter Zijlstra, Alexander Duyck

On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <lkp@intel.com> wrote:
>
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
> head:   d31c3c683ee668ba5d87c0730610442fd672525f
> commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
> config: um-x86_64_defconfig (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> reproduce (this is a W=1 build):
>         # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
>         git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
>         git fetch --no-tags tip x86/core
>         git checkout d31c3c683ee668ba5d87c0730610442fd672525f
>         # save the attached .config to linux build tree
>         make W=1 ARCH=um SUBARCH=x86_64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>    arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
> >> arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
>       98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
>          |            ^~~~~~~~~~~~~~~~~~~~~~
>    cc1: some warnings being treated as errors
>
>

Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-17 18:55 ` Eric Dumazet
@ 2021-11-17 19:40   ` Eric Dumazet
  2021-11-18 16:00     ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-17 19:40 UTC (permalink / raw)
  To: kernel test robot
  Cc: kbuild-all, linux-kernel, x86, Peter Zijlstra, Alexander Duyck

On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <lkp@intel.com> wrote:
> >
> > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
> > head:   d31c3c683ee668ba5d87c0730610442fd672525f
> > commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
> > config: um-x86_64_defconfig (attached as .config)
> > compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> > reproduce (this is a W=1 build):
> >         # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
> >         git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> >         git fetch --no-tags tip x86/core
> >         git checkout d31c3c683ee668ba5d87c0730610442fd672525f
> >         # save the attached .config to linux build tree
> >         make W=1 ARCH=um SUBARCH=x86_64
> >
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot <lkp@intel.com>
> >
> > All errors (new ones prefixed by >>):
> >
> >    arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
> > >> arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
> >       98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> >          |            ^~~~~~~~~~~~~~~~~~~~~~
> >    cc1: some warnings being treated as errors
> >
> >
>
> Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?

Perhaps something like the following ?

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 5ec35626945b6db2f7f41c6d46d5e422810eac46..d419b9345d6dba2e924887671bc6f11c3e17ebd7
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -91,12 +91,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
                        : "memory");
                buff += 8;
        }
-       if (len & 7) {
-               unsigned int shift = (8 - (len & 7)) * 8;
+       len &= 7;
+       if (len) {
                unsigned long trail;
+#ifndef CONFIG_DCACHE_WORD_ACCESS
+               union {
+                       unsigned long   ulval;
+                       u8              bytes[sizeof(long)];
+               } v;

-               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+               v.ulval = 0;
+               memcpy(v.bytes, buff, len);
+               trail = v.ulval;
+#else
+               unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;

+               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+#endif
                asm("addq %[trail],%[res]\n\t"
                    "adcq $0,%[res]"
                        : [res] "+r" (temp64)


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-17 19:40   ` Eric Dumazet
@ 2021-11-18 16:00     ` Peter Zijlstra
  2021-11-18 16:26       ` Johannes Berg
  2021-12-29  6:00       ` Al Viro
  0 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2021-11-18 16:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um


On Wed, Nov 17, 2021 at 11:40:35AM -0800, Eric Dumazet wrote:
> On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <lkp@intel.com> wrote:
> > >
> > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
> > > head:   d31c3c683ee668ba5d87c0730610442fd672525f
> > > commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
> > > config: um-x86_64_defconfig (attached as .config)
> > > compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> > > reproduce (this is a W=1 build):
> > >         # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
> > >         git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> > >         git fetch --no-tags tip x86/core
> > >         git checkout d31c3c683ee668ba5d87c0730610442fd672525f
> > >         # save the attached .config to linux build tree
> > >         make W=1 ARCH=um SUBARCH=x86_64
> > >
> > > If you fix the issue, kindly add following tag as appropriate
> > > Reported-by: kernel test robot <lkp@intel.com>
> > >
> > > All errors (new ones prefixed by >>):
> > >
> > >    arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
> > > >> arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
> > >       98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> > >          |            ^~~~~~~~~~~~~~~~~~~~~~
> > >    cc1: some warnings being treated as errors
> > >
> > >
> >
> > Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
> 
> Perhaps something like the following ?

Dear um folks, is this indeed the best solution? It's a bit sad to have
to add this to x86_64, but if that's the way it is...

> 
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 5ec35626945b6db2f7f41c6d46d5e422810eac46..d419b9345d6dba2e924887671bc6f11c3e17ebd7
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -91,12 +91,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>                         : "memory");
>                 buff += 8;
>         }
> -       if (len & 7) {
> -               unsigned int shift = (8 - (len & 7)) * 8;
> +       len &= 7;
> +       if (len) {
>                 unsigned long trail;
> +#ifndef CONFIG_DCACHE_WORD_ACCESS
> +               union {
> +                       unsigned long   ulval;
> +                       u8              bytes[sizeof(long)];
> +               } v;
> 
> -               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> +               v.ulval = 0;
> +               memcpy(v.bytes, buff, len);
> +               trail = v.ulval;
> +#else
> +               unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;
> 
> +               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> +#endif
>                 asm("addq %[trail],%[res]\n\t"
>                     "adcq $0,%[res]"
>                         : [res] "+r" (temp64)


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-18 16:00     ` Peter Zijlstra
@ 2021-11-18 16:26       ` Johannes Berg
  2021-11-18 16:57         ` Eric Dumazet
  2021-12-29  6:00       ` Al Viro
  1 sibling, 1 reply; 26+ messages in thread
From: Johannes Berg @ 2021-11-18 16:26 UTC (permalink / raw)
  To: Peter Zijlstra, Eric Dumazet
  Cc: kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um

On Thu, 2021-11-18 at 17:00 +0100, Peter Zijlstra wrote:
> On Wed, Nov 17, 2021 at 11:40:35AM -0800, Eric Dumazet wrote:
> > On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <edumazet@google.com> wrote:
> > > 
> > > On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <lkp@intel.com> wrote:
> > > > 
> > > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
> > > > head:   d31c3c683ee668ba5d87c0730610442fd672525f
> > > > commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
> > > > config: um-x86_64_defconfig (attached as .config)
> > > > compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> > > > reproduce (this is a W=1 build):
> > > >         # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
> > > >         git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> > > >         git fetch --no-tags tip x86/core
> > > >         git checkout d31c3c683ee668ba5d87c0730610442fd672525f
> > > >         # save the attached .config to linux build tree
> > > >         make W=1 ARCH=um SUBARCH=x86_64
> > > > 
> > > > If you fix the issue, kindly add following tag as appropriate
> > > > Reported-by: kernel test robot <lkp@intel.com>
> > > > 
> > > > All errors (new ones prefixed by >>):
> > > > 
> > > >    arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
> > > > > > arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
> > > >       98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> > > >          |            ^~~~~~~~~~~~~~~~~~~~~~
> > > >    cc1: some warnings being treated as errors
> > > > 
> > > > 
> > > 
> > > Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
> > 
> > Perhaps something like the following ?
> 
> Dear um folks, is this indeed the best solution? It's a bit sad to have
> to add this to x86_64, but if that's the way it is...
> 

I guess we can add load_unaligned_zeropad(), or even just somehow pull in
the header that provides it (asm/word-at-a-time.h) from x86?

johannes


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-18 16:26       ` Johannes Berg
@ 2021-11-18 16:57         ` Eric Dumazet
  2021-11-18 17:02           ` Eric Dumazet
  2021-11-25  1:58           ` Noah Goldstein
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2021-11-18 16:57 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Peter Zijlstra, kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um

On Thu, Nov 18, 2021 at 8:27 AM Johannes Berg <johannes@sipsolutions.net> wrote:
>
> On Thu, 2021-11-18 at 17:00 +0100, Peter Zijlstra wrote:
> > On Wed, Nov 17, 2021 at 11:40:35AM -0800, Eric Dumazet wrote:
> > > On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <lkp@intel.com> wrote:
> > > > >
> > > > > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
> > > > > head:   d31c3c683ee668ba5d87c0730610442fd672525f
> > > > > commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
> > > > > config: um-x86_64_defconfig (attached as .config)
> > > > > compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> > > > > reproduce (this is a W=1 build):
> > > > >         # https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
> > > > >         git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
> > > > >         git fetch --no-tags tip x86/core
> > > > >         git checkout d31c3c683ee668ba5d87c0730610442fd672525f
> > > > >         # save the attached .config to linux build tree
> > > > >         make W=1 ARCH=um SUBARCH=x86_64
> > > > >
> > > > > If you fix the issue, kindly add following tag as appropriate
> > > > > Reported-by: kernel test robot <lkp@intel.com>
> > > > >
> > > > > All errors (new ones prefixed by >>):
> > > > >
> > > > >    arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
> > > > > > > arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
> > > > >       98 |   trail = (load_unaligned_zeropad(buff) << shift) >> shift;
> > > > >          |            ^~~~~~~~~~~~~~~~~~~~~~
> > > > >    cc1: some warnings being treated as errors
> > > > >
> > > > >
> > > >
> > > > Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
> > >
> > > Perhaps something like the following ?
> >
> > Dear um folks, is this indeed the best solution? It's a bit sad to have
> > to add this to x86_64, but if that's the way it is...
> >
>
> I guess we can add load_unaligned_zeropad() or even just somehow add the
> include with it (asm/word-at-a-time.h) from x86?

Unless fixups can be handled, the signature of the function needs to
be different.

In UM, we would need to provide a number of bytes that can be read.


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-18 16:57         ` Eric Dumazet
@ 2021-11-18 17:02           ` Eric Dumazet
  2021-11-25  1:58           ` Noah Goldstein
  1 sibling, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2021-11-18 17:02 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Peter Zijlstra, kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um

On Thu, Nov 18, 2021 at 8:57 AM Eric Dumazet <edumazet@google.com> wrote:

>
> Unless fixups can be handled, the signature of the function needs to
> be different.
>
> In UM, we would need to provide a number of bytes that can be read.

We can make this a bit less ugly  of course.

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 5ec35626945b6db2f7f41c6d46d5e422810eac46..7a3c4e7e05c4b21566e1ee3813a071509a9d54ff
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,6 +21,25 @@ static inline unsigned short from32to16(unsigned a)
        return b;
 }

+
+static inline unsigned long load_partial_long(const void *buff, int len)
+{
+#ifndef CONFIG_DCACHE_WORD_ACCESS
+               union {
+                       unsigned long   ulval;
+                       u8              bytes[sizeof(long)];
+               } v;
+
+               v.ulval = 0;
+               memcpy(v.bytes, buff, len);
+               return v.ulval;
+#else
+               unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;
+
+               return (load_unaligned_zeropad(buff) << shift) >> shift;
+#endif
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -91,11 +110,9 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
                        : "memory");
                buff += 8;
        }
-       if (len & 7) {
-               unsigned int shift = (8 - (len & 7)) * 8;
-               unsigned long trail;
-
-               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+       len &= 7;
+       if (len) {
+               unsigned long trail = load_partial_long(buff, len);

                asm("addq %[trail],%[res]\n\t"
                    "adcq $0,%[res]"


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-18 16:57         ` Eric Dumazet
  2021-11-18 17:02           ` Eric Dumazet
@ 2021-11-25  1:58           ` Noah Goldstein
  2021-11-25  2:56             ` Eric Dumazet
  1 sibling, 1 reply; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  1:58 UTC (permalink / raw)
  To: edumazet, Johannes Berg
  Cc: alexanderduyck, kbuild-all, linux-kernel, linux-um, lkp, peterz,
	x86, goldstein.w.n

From: Eric Dumazet <edumazet@google.com>

On Thu, Nov 18, 2021 at 8:57 AM Eric Dumazet <edumazet@google.com> wrote:

>
> Unless fixups can be handled, the signature of the function needs to
> be different.
>
> In UM, we would need to provide a number of bytes that can be read.

We can make this a bit less ugly  of course.

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 5ec35626945b6db2f7f41c6d46d5e422810eac46..7a3c4e7e05c4b21566e1ee3813a071509a9d54ff
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,6 +21,25 @@ static inline unsigned short from32to16(unsigned a)
        return b;
 }

+
+static inline unsigned long load_partial_long(const void *buff, int len)
+{
+#ifndef CONFIG_DCACHE_WORD_ACCESS
+               union {
+                       unsigned long   ulval;
+                       u8              bytes[sizeof(long)];
+               } v;
+
+               v.ulval = 0;
+               memcpy(v.bytes, buff, len);
+               return v.ulval;
+#else
+               unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;
+
+               return (load_unaligned_zeropad(buff) << shift) >> shift;
+#endif
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -91,11 +110,9 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
                        : "memory");
                buff += 8;
        }
-       if (len & 7) {
-               unsigned int shift = (8 - (len & 7)) * 8;
-               unsigned long trail;
-
-               trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+       len &= 7;
+       if (len) {
+               unsigned long trail = load_partial_long(buff, len);

                asm("addq %[trail],%[res]\n\t"
                    "adcq $0,%[res]"

Hi, I'm not sure if this is intentional or not, but I noticed that the output
of 'csum_partial' is different after this patch. I figured the checksum
algorithm is fixed, so I just wanted to mention it in case it's a bug. If not,
sorry for the spam.

Example on x86_64:

Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
len : 11
sum : 0

csum_partial new : 2480936615
csum_partial HEAD: 2472089390


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  1:58           ` Noah Goldstein
@ 2021-11-25  2:56             ` Eric Dumazet
  2021-11-25  3:41               ` Noah Goldstein
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  2:56 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, linux-kernel,
	linux-um, lkp, peterz, x86

On Wed, Nov 24, 2021 at 5:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>

>
> Hi, I'm not sure if this is intentional or not, but I noticed that the output
> of 'csum_partial' is different after this patch. I figured that the checksum
> algorithm is fixed so just wanted mention it incase its a bug. If not sorry
> for the spam.
>
> Example on x86_64:
>
> Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
> len : 11
> sum : 0
>
> csum_partial new : 2480936615
> csum_partial HEAD: 2472089390

No worries.

skb->csum is 32bit, but really what matters is the 16bit folded value.

So make sure to apply csum_fold() before comparing the results.

A minimal C and generic version of csum_fold() would be something like

static unsigned short csum_fold(u32 csum)
{
  u32 sum = csum;
  sum = (sum & 0xffff) + (sum >> 16);
  sum = (sum & 0xffff) + (sum >> 16);
  return ~sum;
}

I bet that csum_fold(2480936615) == csum_fold(2472089390)
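
(Quick check, for what it's worth: 2480936615 is 0x93e016a7 and
2472089390 is 0x9359172e; 0x93e0 + 0x16a7 = 0x9359 + 0x172e = 0xaa87,
so both fold to ~0xaa87 = 0x5578.)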

It would be nice if we had a csum test suite, hint, hint ;)

Thanks !


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  2:56             ` Eric Dumazet
@ 2021-11-25  3:41               ` Noah Goldstein
  2021-11-25  4:00                 ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  3:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 8:56 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 24, 2021 at 5:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
>
> >
> > Hi, I'm not sure if this is intentional or not, but I noticed that the output
> > of 'csum_partial' is different after this patch. I figured that the checksum
> > algorithm is fixed so just wanted mention it incase its a bug. If not sorry
> > for the spam.
> >
> > Example on x86_64:
> >
> > Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
> > len : 11
> > sum : 0
> >
> > csum_partial new : 2480936615
> > csum_partial HEAD: 2472089390
>
> No worries.
>
> skb->csum is 32bit, but really what matters is the 16bit folded value.
>
> So make sure to apply csum_fold() before comparing the results.
>
> A minimal C and generic version of csum_fold() would be something like
>
> static unsigned short csum_fold(u32 csum)
> {
>   u32 sum = csum;
>   sum = (sum & 0xffff) + (sum >> 16);
>   sum = (sum & 0xffff) + (sum >> 16);
>   return ~sum;
> }
>
> I bet that csum_fold(2480936615) == csum_fold(2472089390)
>

Correct :)

The outputs seem to match if `buff` is 64-bit aligned. I still see a
difference in `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.

The comment at the top says it's "best" to have `buff` 64-bit aligned, but
the code logic seems meant to support the misaligned case, so I'm not sure
if it's an issue.

Example:

csum_fold(csum_partial) new : 0x3764
csum_fold(csum_partial) HEAD: 0x3a61

buff        : [ 11, ea, 75, 76, e9, ab, 86, 48 ]
buff addr   : ffff88eaf5fb0001
len         : 8
sum_in      : 25

> It would be nice if we had a csum test suite, hint, hint ;)

Where in the kernel would that belong?

>
> Thanks !


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  3:41               ` Noah Goldstein
@ 2021-11-25  4:00                 ` Eric Dumazet
  2021-11-25  4:08                   ` Eric Dumazet
  2021-11-26 17:18                   ` David Laight
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  4:00 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 7:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Nov 24, 2021 at 8:56 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Nov 24, 2021 at 5:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> >
> > >
> > > Hi, I'm not sure if this is intentional or not, but I noticed that the output
> > > of 'csum_partial' is different after this patch. I figured that the checksum
> > > algorithm is fixed so just wanted mention it incase its a bug. If not sorry
> > > for the spam.
> > >
> > > Example on x86_64:
> > >
> > > Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
> > > len : 11
> > > sum : 0
> > >
> > > csum_partial new : 2480936615
> > > csum_partial HEAD: 2472089390
> >
> > No worries.
> >
> > skb->csum is 32bit, but really what matters is the 16bit folded value.
> >
> > So make sure to apply csum_fold() before comparing the results.
> >
> > A minimal C and generic version of csum_fold() would be something like
> >
> > static unsigned short csum_fold(u32 csum)
> > {
> >   u32 sum = csum;
> >   sum = (sum & 0xffff) + (sum >> 16);
> >   sum = (sum & 0xffff) + (sum >> 16);
> >   return ~sum;
> > }
> >
> > I bet that csum_fold(2480936615) == csum_fold(2472089390)
> >
>
> Correct :)
>
> The outputs seem to match if `buff` is aligned to 64-bit. Still see
> difference with `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.
>
> The comment at the top says it's "best" to have `buff` 64-bit aligned but
> the code logic seems meant to support the misaligned case so not
> sure if it's an issue.
>

It is an issue in general, though not in standard cases, because network
headers are aligned.

I think it came in when I folded csum_partial() and do_csum(): I forgot
to ror() the seed.

I suspect the following would help:

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
        if (unlikely(odd)) {
                if (unlikely(len == 0))
                        return sum;
+               temp64 = ror32((__force u64)sum, 8);
                temp64 += (*(unsigned char *)buff << 8);
                len--;
                buff++;




> Example:
>
> csum_fold(csum_partial) new : 0x3764
> csum_fold(csum_partial) HEAD: 0x3a61
>
> buff        : [ 11, ea, 75, 76, e9, ab, 86, 48 ]
> buff addr   : ffff88eaf5fb0001
> len         : 8
> sum_in      : 25
>
> > It would be nice if we had a csum test suite, hint, hint ;)
>
> Where in the kernel would that belong?

This could be a module, like lib/test_csum.c
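
For illustration only, a minimal sketch of what such a module could look
like (the lib/test_csum.c name comes from the suggestion above; the exact
checks are my assumption, not an actual patch). It verifies that the folded
checksum does not depend on the buffer's alignment, which is the property
that broke here:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical lib/test_csum.c sketch: csum_fold(csum_partial()) must give
 * the same result for the same bytes at an aligned and at an odd address.
 */
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <net/checksum.h>

#define TEST_MAX_LEN 256

static int __init test_csum_init(void)
{
	u8 *buf;
	int len, failures = 0;

	buf = kmalloc(2 * TEST_MAX_LEN + 2, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	for (len = 0; len <= TEST_MAX_LEN; len++) {
		u8 *aligned = buf;                /* kmalloc() is at least 8-byte aligned */
		u8 *odd = buf + TEST_MAX_LEN + 1; /* deliberately odd address */
		__wsum seed;
		__sum16 a, b;

		get_random_bytes(aligned, len);
		get_random_bytes(&seed, sizeof(seed));
		memcpy(odd, aligned, len);

		a = csum_fold(csum_partial(aligned, len, seed));
		b = csum_fold(csum_partial(odd, len, seed));
		if (a != b) {
			pr_err("csum mismatch: len=%d %#x != %#x\n",
			       len, (__force u32)a, (__force u32)b);
			failures++;
		}
	}
	kfree(buf);
	pr_info("test_csum: %d failure(s)\n", failures);
	return failures ? -EINVAL : 0;
}

static void __exit test_csum_exit(void)
{
}

module_init(test_csum_init);
module_exit(test_csum_exit);
MODULE_LICENSE("GPL");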


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  4:00                 ` Eric Dumazet
@ 2021-11-25  4:08                   ` Eric Dumazet
  2021-11-25  4:20                     ` Eric Dumazet
  2021-11-26 17:18                   ` David Laight
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  4:08 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 8:00 PM Eric Dumazet <edumazet@google.com> wrote:

>
> It is an issue in general, not in standard cases because network
> headers are aligned.
>
> I think it came when I folded csum_partial() and do_csum(), I forgot
> to ror() the seed.
>
> I suspect the following would help:
>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>         if (unlikely(odd)) {
>                 if (unlikely(len == 0))
>                         return sum;
> +               temp64 = ror32((__force u64)sum, 8);
>                 temp64 += (*(unsigned char *)buff << 8);
>                 len--;
>                 buff++;
>
>

It is a bit late here, I will test the following later this week.

We probably can remove one conditional jump at the end of the function

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..15986ad42ed5ccb8241ff467a34188cf901ec98e
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -41,9 +41,11 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
        if (unlikely(odd)) {
                if (unlikely(len == 0))
                        return sum;
+               temp64 = ror32((__force u64)sum, 8);
                temp64 += (*(unsigned char *)buff << 8);
                len--;
                buff++;
+               odd = 8;
        }

        while (unlikely(len >= 64)) {
@@ -129,10 +131,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 #endif
        }
        result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
-       if (unlikely(odd)) {
-               result = from32to16(result);
-               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-       }
+       ror32(result, odd);
        return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  4:08                   ` Eric Dumazet
@ 2021-11-25  4:20                     ` Eric Dumazet
  2021-11-25  4:56                       ` Noah Goldstein
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  4:20 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 8:08 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 24, 2021 at 8:00 PM Eric Dumazet <edumazet@google.com> wrote:
>
> >
> > It is an issue in general, not in standard cases because network
> > headers are aligned.
> >
> > I think it came when I folded csum_partial() and do_csum(), I forgot
> > to ror() the seed.
> >
> > I suspect the following would help:
> >
> > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> > 100644
> > --- a/arch/x86/lib/csum-partial_64.c
> > +++ b/arch/x86/lib/csum-partial_64.c
> > @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> >         if (unlikely(odd)) {
> >                 if (unlikely(len == 0))
> >                         return sum;
> > +               temp64 = ror32((__force u64)sum, 8);
> >                 temp64 += (*(unsigned char *)buff << 8);
> >                 len--;
> >                 buff++;
> >
> >
>
> It is a bit late here, I will test the following later this week.
>
> We probably can remove one conditional jump at the end of the function
>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..15986ad42ed5ccb8241ff467a34188cf901ec98e
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -41,9 +41,11 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>         if (unlikely(odd)) {
>                 if (unlikely(len == 0))
>                         return sum;
> +               temp64 = ror32((__force u64)sum, 8);
>                 temp64 += (*(unsigned char *)buff << 8);
>                 len--;
>                 buff++;
> +               odd = 8;
>         }
>
>         while (unlikely(len >= 64)) {
> @@ -129,10 +131,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>  #endif
>         }
>         result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> -       if (unlikely(odd)) {
> -               result = from32to16(result);
> -               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> -       }
> +       ror32(result, odd);

this would be
          result = ror32(result, odd);

definitely time to stop working today for me.

>         return (__force __wsum)result;
>  }
>  EXPORT_SYMBOL(csum_partial);


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  4:20                     ` Eric Dumazet
@ 2021-11-25  4:56                       ` Noah Goldstein
  2021-11-25  5:09                         ` Noah Goldstein
  0 siblings, 1 reply; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  4:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 10:20 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 24, 2021 at 8:08 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Nov 24, 2021 at 8:00 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > >
> > > It is an issue in general, not in standard cases because network
> > > headers are aligned.
> > >
> > > I think it came when I folded csum_partial() and do_csum(), I forgot
> > > to ror() the seed.
> > >
> > > I suspect the following would help:
> > >
> > > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> > > 100644
> > > --- a/arch/x86/lib/csum-partial_64.c
> > > +++ b/arch/x86/lib/csum-partial_64.c
> > > @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> > >         if (unlikely(odd)) {
> > >                 if (unlikely(len == 0))
> > >                         return sum;
> > > +               temp64 = ror32((__force u64)sum, 8);
> > >                 temp64 += (*(unsigned char *)buff << 8);
> > >                 len--;
> > >                 buff++;
> > >
> > >
> >
> > It is a bit late here, I will test the following later this week.
> >
> > We probably can remove one conditional jump at the end of the function
> >
> > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..15986ad42ed5ccb8241ff467a34188cf901ec98e
> > 100644
> > --- a/arch/x86/lib/csum-partial_64.c
> > +++ b/arch/x86/lib/csum-partial_64.c
> > @@ -41,9 +41,11 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> >         if (unlikely(odd)) {
> >                 if (unlikely(len == 0))
> >                         return sum;
> > +               temp64 = ror32((__force u64)sum, 8);
> >                 temp64 += (*(unsigned char *)buff << 8);
> >                 len--;
> >                 buff++;
> > +               odd = 8;
> >         }
> >
> >         while (unlikely(len >= 64)) {
> > @@ -129,10 +131,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> >  #endif
> >         }
> >         result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> > -       if (unlikely(odd)) {
> > -               result = from32to16(result);
> > -               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> > -       }
> > +       ror32(result, odd);
>
> this would be
>           result = ror32(result, odd);
>
> definitely time to stop working today for me.
>
> >         return (__force __wsum)result;
> >  }
> >  EXPORT_SYMBOL(csum_partial);

All my tests pass with that change :)


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  4:56                       ` Noah Goldstein
@ 2021-11-25  5:09                         ` Noah Goldstein
  2021-11-25  6:32                           ` Eric Dumazet
  0 siblings, 1 reply; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  5:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 10:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Nov 24, 2021 at 10:20 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Nov 24, 2021 at 8:08 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Nov 24, 2021 at 8:00 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > >
> > > > It is an issue in general, not in standard cases because network
> > > > headers are aligned.
> > > >
> > > > I think it came when I folded csum_partial() and do_csum(), I forgot
> > > > to ror() the seed.
> > > >
> > > > I suspect the following would help:
> > > >
> > > > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > > > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> > > > 100644
> > > > --- a/arch/x86/lib/csum-partial_64.c
> > > > +++ b/arch/x86/lib/csum-partial_64.c
> > > > @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> > > >         if (unlikely(odd)) {
> > > >                 if (unlikely(len == 0))
> > > >                         return sum;
> > > > +               temp64 = ror32((__force u64)sum, 8);
> > > >                 temp64 += (*(unsigned char *)buff << 8);
> > > >                 len--;
> > > >                 buff++;
> > > >
> > > >
> > >
> > > It is a bit late here, I will test the following later this week.
> > >
> > > We probably can remove one conditional jump at the end of the function
> > >
> > > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..15986ad42ed5ccb8241ff467a34188cf901ec98e
> > > 100644
> > > --- a/arch/x86/lib/csum-partial_64.c
> > > +++ b/arch/x86/lib/csum-partial_64.c
> > > @@ -41,9 +41,11 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> > >         if (unlikely(odd)) {
> > >                 if (unlikely(len == 0))
> > >                         return sum;
> > > +               temp64 = ror32((__force u64)sum, 8);
> > >                 temp64 += (*(unsigned char *)buff << 8);
> > >                 len--;
> > >                 buff++;
> > > +               odd = 8;
> > >         }
> > >
> > >         while (unlikely(len >= 64)) {
> > > @@ -129,10 +131,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> > >  #endif
> > >         }
> > >         result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> > > -       if (unlikely(odd)) {
> > > -               result = from32to16(result);
> > > -               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> > > -       }
> > > +       ror32(result, odd);
> >
> > this would be
> >           result = ror32(result, odd);
> >
> > definitely time to stop working today for me.
> >
> > >         return (__force __wsum)result;
> > >  }
> > >  EXPORT_SYMBOL(csum_partial);
>
> All my tests pass with that change :)

Although I see slightly worse performance with aligned `buff` in
the branch-free approach. I imagine that if non-aligned `buff` is that
uncommon, it might be better to speculate past the work of `ror`.


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  5:09                         ` Noah Goldstein
@ 2021-11-25  6:32                           ` Eric Dumazet
  2021-11-25  6:45                             ` Eric Dumazet
  2021-11-25  6:47                             ` Noah Goldstein
  0 siblings, 2 replies; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  6:32 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 9:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> Although I see slightly worse performance with aligned `buff`  in
> the branch-free approach. Imagine if non-aligned `buff` is that
> uncommon might be better to speculate past the work of `ror`.

Yes, no clear win here from removing the conditional (same cost really),
although using ror32() removes the from32to16() helper and gets rid of
one folding.

I will formally submit this change, thanks !

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..cf4bd3ef66e56c681b3435d43011ece78438376d
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -11,16 +11,6 @@
 #include <asm/checksum.h>
 #include <asm/word-at-a-time.h>

-static inline unsigned short from32to16(unsigned a)
-{
-       unsigned short b = a >> 16;
-       asm("addw %w2,%w0\n\t"
-           "adcw $0,%w0\n"
-           : "=r" (b)
-           : "0" (b), "r" (a));
-       return b;
-}
-
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -41,6 +31,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
        if (unlikely(odd)) {
                if (unlikely(len == 0))
                        return sum;
+               temp64 = ror32((__force u32)sum, 8);
                temp64 += (*(unsigned char *)buff << 8);
                len--;
                buff++;
@@ -129,10 +120,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 #endif
        }
        result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
-       if (unlikely(odd)) {
-               result = from32to16(result);
-               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-       }
+       if (unlikely(odd))
+               result = ror32(result, 8);
        return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  6:32                           ` Eric Dumazet
@ 2021-11-25  6:45                             ` Eric Dumazet
  2021-11-25  6:49                               ` Noah Goldstein
  2021-11-25  6:47                             ` Noah Goldstein
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-25  6:45 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Wed, Nov 24, 2021 at 10:32 PM Eric Dumazet <edumazet@google.com> wrote:

> -       }
> +       if (unlikely(odd))
> +               result = ror32(result, 8);
>         return (__force __wsum)result;

Oh well, gcc at least removes the conditional and generates a ror and a cmov

        mov    %edx,%eax
        ror    $0x8,%eax
        test   %r8,%r8
        cmove  %edx,%eax
        ret

clang keeps the cond jmp
         test   $0x1,%dil
         je     93
         rol    $0x18,%eax
93:    ret


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  6:32                           ` Eric Dumazet
  2021-11-25  6:45                             ` Eric Dumazet
@ 2021-11-25  6:47                             ` Noah Goldstein
  1 sibling, 0 replies; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  6:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Thu, Nov 25, 2021 at 12:32 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 24, 2021 at 9:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> >
> >
> > Although I see slightly worse performance with aligned `buff`  in
> > the branch-free approach. Imagine if non-aligned `buff` is that
> > uncommon might be better to speculate past the work of `ror`.
>
> Yes, no clear win here removing the conditional (same cost really),
> although using a ror32() is removing the from32to16() helper and get
> rid of one folding.
>
> I will formally submit this change, thanks !

Great :)

Can you put me on the cc list? I have a patch for the function that I'll post
once yours gets through.

>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..cf4bd3ef66e56c681b3435d43011ece78438376d
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -11,16 +11,6 @@
>  #include <asm/checksum.h>
>  #include <asm/word-at-a-time.h>
>
> -static inline unsigned short from32to16(unsigned a)
> -{
> -       unsigned short b = a >> 16;
> -       asm("addw %w2,%w0\n\t"
> -           "adcw $0,%w0\n"
> -           : "=r" (b)
> -           : "0" (b), "r" (a));
> -       return b;
> -}
> -
>  /*
>   * Do a checksum on an arbitrary memory area.
>   * Returns a 32bit checksum.
> @@ -41,6 +31,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>         if (unlikely(odd)) {
>                 if (unlikely(len == 0))
>                         return sum;
> +               temp64 = ror32((__force u32)sum, 8);
>                 temp64 += (*(unsigned char *)buff << 8);
>                 len--;
>                 buff++;
> @@ -129,10 +120,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>  #endif
>         }
>         result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> -       if (unlikely(odd)) {
> -               result = from32to16(result);
> -               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> -       }
> +       if (unlikely(odd))
> +               result = ror32(result, 8);
>         return (__force __wsum)result;
>  }
>  EXPORT_SYMBOL(csum_partial);


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  6:45                             ` Eric Dumazet
@ 2021-11-25  6:49                               ` Noah Goldstein
  0 siblings, 0 replies; 26+ messages in thread
From: Noah Goldstein @ 2021-11-25  6:49 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

On Thu, Nov 25, 2021 at 12:46 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Nov 24, 2021 at 10:32 PM Eric Dumazet <edumazet@google.com> wrote:
>
> > -       }
> > +       if (unlikely(odd))
> > +               result = ror32(result, 8);
> >         return (__force __wsum)result;
>
> Oh well, gcc at least removes the conditional and generates a ror and a cmov

Seems like a missed optimization for `unlikely`, where dependency breaking
is pretty common.

Although it still saves a uop because of the `imm8` operand.

>
>         mov    %edx,%eax
>         ror    $0x8,%eax
>         test   %r8,%r8
>         cmove  %edx,%eax
>         ret
>
> clang keeps the cond jmp
>          test   $0x1,%dil
>          je     93
>          rol    $0x18,%eax
> 93:    ret


* RE: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-25  4:00                 ` Eric Dumazet
  2021-11-25  4:08                   ` Eric Dumazet
@ 2021-11-26 17:18                   ` David Laight
  2021-11-26 18:09                     ` Eric Dumazet
  1 sibling, 1 reply; 26+ messages in thread
From: David Laight @ 2021-11-26 17:18 UTC (permalink / raw)
  To: 'Eric Dumazet', Noah Goldstein
  Cc: Johannes Berg, alexanderduyck, kbuild-all, open list, linux-um,
	lkp, peterz, X86 ML

From: Eric Dumazet
> Sent: 25 November 2021 04:01
...
> > The outputs seem to match if `buff` is aligned to 64-bit. Still see
> > difference with `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.
> >
> > The comment at the top says it's "best" to have `buff` 64-bit aligned but
> > the code logic seems meant to support the misaligned case so not
> > sure if it's an issue.
> >
> 
> It is an issue in general, not in standard cases because network
> headers are aligned.
> 
> I think it came when I folded csum_partial() and do_csum(), I forgot
> to ror() the seed.
> 
> I suspect the following would help:
> 
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
>         if (unlikely(odd)) {
>                 if (unlikely(len == 0))
>                         return sum;
> +               temp64 = ror32((__force u64)sum, 8);
>                 temp64 += (*(unsigned char *)buff << 8);
>                 len--;
>                 buff++;

You can save an instruction (as if this path matters) by:
			temp64 = sum + *(unsigned char *)buff;
			temp64 <<= 8;
Although that probably falls foul of 64bit shifts being slow.
So maybe just:
			sum += *(unsigned char *)buff;
			temp64 = bswap32(sum);
AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
the same speed but may use different execution units.

Intel seem to have managed to slow down ror(x, %cl) to 3 clocks
in Sandy Bridge - and still haven't fixed it.
Although the compiler might be making a pigs-breakfast of the
register allocation when you tried setting 'odd = 8'.

Weeks can be spent fiddling with this code :-(

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-26 17:18                   ` David Laight
@ 2021-11-26 18:09                     ` Eric Dumazet
  2021-11-26 22:41                       ` David Laight
  0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2021-11-26 18:09 UTC (permalink / raw)
  To: David Laight
  Cc: Noah Goldstein, Johannes Berg, alexanderduyck, kbuild-all,
	open list, linux-um, lkp, peterz, X86 ML

On Fri, Nov 26, 2021 at 9:18 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Eric Dumazet
> > Sent: 25 November 2021 04:01
> ...
> > > The outputs seem to match if `buff` is aligned to 64-bit. Still see
> > > difference with `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.
> > >
> > > The comment at the top says it's "best" to have `buff` 64-bit aligned but
> > > the code logic seems meant to support the misaligned case so not
> > > sure if it's an issue.
> > >
> >
> > It is an issue in general, not in standard cases because network
> > headers are aligned.
> >
> > I think it came when I folded csum_partial() and do_csum(), I forgot
> > to ror() the seed.
> >
> > I suspect the following would help:
> >
> > diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> > index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> > 100644
> > --- a/arch/x86/lib/csum-partial_64.c
> > +++ b/arch/x86/lib/csum-partial_64.c
> > @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> >         if (unlikely(odd)) {
> >                 if (unlikely(len == 0))
> >                         return sum;
> > +               temp64 = ror32((__force u64)sum, 8);
> >                 temp64 += (*(unsigned char *)buff << 8);
> >                 len--;
> >                 buff++;
>
> You can save an instruction (as if this path matters) by:
>                         temp64 = sum + *(unsigned char *)buff;

This might overflow, sum is 32bit: e.g. sum = 0xffffff01 plus a 0xff byte
wraps in 32-bit arithmetic and the carry is lost.

Given that we have temp64 = (__force u64)sum;  already done at this
point, we can instead

temp64 += *(u8 *)buff;

>                         temp64 <<= 8;



> Although that probably falls foul of 64bit shifts being slow.

Are they slower than the ror32(sum, 8) ?

> So maybe just:
>                         sum += *(unsigned char *)buff;

we might miss a carry/overflow here

>                         temp64 = bswap32(sum);


> AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> the same speed but may use different execution units.
>
> Intel seem so have managed to slow down ror(x, %cl) to 3 clocks
> in sandy bridge - and still not fixed it.
> Although the compiler might be making a pigs-breakfast of the
> register allocation when you tried setting 'odd = 8'.
>
> Weeks can be spent fiddling with this code :-(

Yes, and in the end, it won't be able to compete with a
specialized/inlined ipv6_csum_partial()
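
For illustration, a rough sketch of what such a helper could look like
(untested, and the shape is only an assumption): with the length fixed at
40 bytes and no odd/tail handling needed, it collapses to five quadword
adds, folded with the same add32_with_carry() the file already uses.

static inline __wsum ipv6_csum_partial(const void *hdr, __wsum sum)
{
	u64 temp64 = (__force u64)sum;

	/* a 40-byte IPv6 header is exactly 5 quadwords */
	asm("addq 0*8(%[src]),%[res]\n\t"
	    "adcq 1*8(%[src]),%[res]\n\t"
	    "adcq 2*8(%[src]),%[res]\n\t"
	    "adcq 3*8(%[src]),%[res]\n\t"
	    "adcq 4*8(%[src]),%[res]\n\t"
	    "adcq $0,%[res]"
	    : [res] "+r" (temp64)
	    : [src] "r" (hdr)
	    : "memory");

	return (__force __wsum)add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
}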

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-26 18:09                     ` Eric Dumazet
@ 2021-11-26 22:41                       ` David Laight
  2021-11-26 23:04                         ` Noah Goldstein
  0 siblings, 1 reply; 26+ messages in thread
From: David Laight @ 2021-11-26 22:41 UTC (permalink / raw)
  To: 'Eric Dumazet'
  Cc: Noah Goldstein, Johannes Berg, alexanderduyck, kbuild-all,
	open list, linux-um, lkp, peterz, X86 ML

From: Eric Dumazet
> Sent: 26 November 2021 18:10
...
> > AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> > the same speed but may use different execution units.

The 64bit shifts/rotates are also only one clock.
It is the bswap64 that can be two.

> > Intel seem so have managed to slow down ror(x, %cl) to 3 clocks
> > in sandy bridge - and still not fixed it.
> > Although the compiler might be making a pigs-breakfast of the
> > register allocation when you tried setting 'odd = 8'.
> >
> > Weeks can be spent fiddling with this code :-(
> 
> Yes, and in the end, it won't be able to compete with  a
> specialized/inlined ipv6_csum_partial()

I bet most of the gain comes from knowing there is a non-zero
whole number of 32bit words.
The pesky edge conditions cost.

And even then you need to get it right!
The one for summing the 5-word IPv4 header is actually horrid
on Intel cpu prior to Haswell because 'adc' has a latency of 2.
On Sandy bridge the carry output is valid on the next clock,
so adding to alternate registers doubles throughput.
(That could easily be done in the current function and will
make a big difference on those cpu.)
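
As an untested sketch of that, applied to the existing 64-byte loop
(sum0/sum1 are just illustrative names): the single carry chain still
threads through every adc, but the destination alternates, so each
register is only written by every other instruction and the 2-clock adc
latency stops serialising the chain; the second accumulator is folded
back in with an end-around carry after the loop.

	u64 sum0 = temp64, sum1 = 0;

	while (unlikely(len >= 64)) {
		asm("addq 0*8(%[src]),%[a]\n\t"
		    "adcq 1*8(%[src]),%[b]\n\t"
		    "adcq 2*8(%[src]),%[a]\n\t"
		    "adcq 3*8(%[src]),%[b]\n\t"
		    "adcq 4*8(%[src]),%[a]\n\t"
		    "adcq 5*8(%[src]),%[b]\n\t"
		    "adcq 6*8(%[src]),%[a]\n\t"
		    "adcq 7*8(%[src]),%[b]\n\t"
		    "adcq $0,%[a]"		/* last carry into sum0 */
		    : [a] "+r" (sum0), [b] "+r" (sum1)
		    : [src] "r" (buff)
		    : "memory");
		buff += 64;
		len -= 64;
	}
	/* fold the second accumulator back in, keeping the carry */
	asm("addq %[b],%[a]\n\t"
	    "adcq $0,%[a]"
	    : [a] "+r" (sum0)
	    : [b] "r" (sum1));
	temp64 = sum0;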

But basically the current generic code has the loop unrolled
further than is necessary for modern (non-atom) cpu.
That just adds more code outside the loop.

I did manage to get 12 bytes/clock using adcx/adox with only
32 bytes each iteration.
That will require aligned buffers.

Alignment won't matter for 'adc' loops because there are two
'memory read' units - but there is the elephant:

Sandy bridge Cache bank conflicts
Each consecutive 128 bytes, or two cache lines, in the data cache is divided
into 8 banks of 16 bytes each. It is not possible to do two memory reads in
the same clock cycle if the two memory addresses have the same bank number,
i.e. if bit 4 - 6 in the two addresses are the same.
	; Example 9.5. Sandy bridge cache bank conflict
	mov eax, [rsi] ; Use bank 0, assuming rsi is divisible by 40H
	mov ebx, [rsi+100H] ; Use bank 0. Cache bank conflict
	mov ecx, [rsi+110H] ; Use bank 1. No cache bank conflict

That isn't a problem on Haswell, but it is probably worth ordering
the 'adc' in the loop to reduce the number of conflicts.
I didn't try to look for that though.
I only remember testing aligned buffers on Sandy/Ivy bridge.
Adding to alternate registers helped no end.

	David

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-26 22:41                       ` David Laight
@ 2021-11-26 23:04                         ` Noah Goldstein
  2021-11-28 18:30                           ` David Laight
  0 siblings, 1 reply; 26+ messages in thread
From: Noah Goldstein @ 2021-11-26 23:04 UTC (permalink / raw)
  To: David Laight
  Cc: Eric Dumazet, Johannes Berg, alexanderduyck, kbuild-all,
	open list, linux-um, lkp, peterz, X86 ML

On Fri, Nov 26, 2021 at 4:41 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Eric Dumazet
> > Sent: 26 November 2021 18:10
> ...
> > > AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> > > the same speed but may use different execution units.
>
> The 64bit shifts/rotates are also only one clock.
> It is the bswap64 that can be two.
>
> > > Intel seem so have managed to slow down ror(x, %cl) to 3 clocks
> > > in sandy bridge - and still not fixed it.
> > > Although the compiler might be making a pigs-breakfast of the
> > > register allocation when you tried setting 'odd = 8'.
> > >
> > > Weeks can be spent fiddling with this code :-(
> >
> > Yes, and in the end, it won't be able to compete with  a
> > specialized/inlined ipv6_csum_partial()
>
> I bet most of the gain comes from knowing there is a non-zero
> whole number of 32bit words.
> The pesky edge conditions cost.
>
> And even then you need to get it right!
> The one for summing the 5-word IPv4 header is actually horrid
> on Intel cpu prior to Haswell because 'adc' has a latency of 2.
> On Sandy bridge the carry output is valid on the next clock,
> so adding to alternate registers doubles throughput.
> (That could easily be done in the current function and will
> make a big different on those cpu.)
>
> But basically the current generic code has the loop unrolled
> further than is necessary for modern (non-atom) cpu.
> That just adds more code outside the loop.
>
> I did managed to get 12 bytes/clock using adco/adox with only
> 32 bytes each iteration.
> That will require aligned buffers.
>
> Alignment won't matter for 'adc' loops because there are two
> 'memory read' units - but there is the elephant:
>
> Sandy bridge Cache bank conflicts
> Each consecutive 128 bytes, or two cache lines, in the data cache is divided
> into 8 banks of 16 bytes each. It is not possible to do two memory reads in
> the same clock cycle if the two memory addresses have the same bank number,
> i.e. if bit 4 - 6 in the two addresses are the same.
>         ; Example 9.5. Sandy bridge cache bank conflict
>         mov eax, [rsi] ; Use bank 0, assuming rsi is divisible by 40H
>         mov ebx, [rsi+100H] ; Use bank 0. Cache bank conflict
>         mov ecx, [rsi+110H] ; Use bank 1. No cache bank conflict
>
> That isn't a problem on Haswell, but it is probably worth ordering
> the 'adc' in the loop to reduce the number of conflicts.
> I didn't try to look for that though.
> I only remember testing aligned buffers on Sandy/Ivy bridge.
> Adding to alternate registers helped no end.

Can't that just be solved by having the two independent adcx/adox chains work
from regions that are 16+ bytes apart? For a 40 byte ipv6 header it will be simple.
>
>         David
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-26 23:04                         ` Noah Goldstein
@ 2021-11-28 18:30                           ` David Laight
  0 siblings, 0 replies; 26+ messages in thread
From: David Laight @ 2021-11-28 18:30 UTC (permalink / raw)
  To: 'Noah Goldstein'
  Cc: Eric Dumazet, Johannes Berg, alexanderduyck, kbuild-all,
	open list, linux-um, lkp, peterz, X86 ML

From: Noah Goldstein
> Sent: 26 November 2021 23:04
> 
> On Fri, Nov 26, 2021 at 4:41 PM David Laight <David.Laight@aculab.com> wrote:
> >
> > From: Eric Dumazet
> > > Sent: 26 November 2021 18:10
> > ...
> > > > AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> > > > the same speed but may use different execution units.
> >
> > The 64bit shifts/rotates are also only one clock.
> > It is the bswap64 that can be two.
> >
> > > > Intel seem so have managed to slow down ror(x, %cl) to 3 clocks
> > > > in sandy bridge - and still not fixed it.
> > > > Although the compiler might be making a pigs-breakfast of the
> > > > register allocation when you tried setting 'odd = 8'.
> > > >
> > > > Weeks can be spent fiddling with this code :-(
> > >
> > > Yes, and in the end, it won't be able to compete with  a
> > > specialized/inlined ipv6_csum_partial()
> >
> > I bet most of the gain comes from knowing there is a non-zero
> > whole number of 32bit words.
> > The pesky edge conditions cost.
> >
> > And even then you need to get it right!
> > The one for summing the 5-word IPv4 header is actually horrid
> > on Intel cpu prior to Haswell because 'adc' has a latency of 2.
> > On Sandy bridge the carry output is valid on the next clock,
> > so adding to alternate registers doubles throughput.
> > (That could easily be done in the current function and will
> > make a big different on those cpu.)
> >
> > But basically the current generic code has the loop unrolled
> > further than is necessary for modern (non-atom) cpu.
> > That just adds more code outside the loop.
> >
> > I did managed to get 12 bytes/clock using adco/adox with only
> > 32 bytes each iteration.
> > That will require aligned buffers.
> >
> > Alignment won't matter for 'adc' loops because there are two
> > 'memory read' units - but there is the elephant:
> >
> > Sandy bridge Cache bank conflicts
> > Each consecutive 128 bytes, or two cache lines, in the data cache is divided
> > into 8 banks of 16 bytes each. It is not possible to do two memory reads in
> > the same clock cycle if the two memory addresses have the same bank number,
> > i.e. if bit 4 - 6 in the two addresses are the same.
> >         ; Example 9.5. Sandy bridge cache bank conflict
> >         mov eax, [rsi] ; Use bank 0, assuming rsi is divisible by 40H
> >         mov ebx, [rsi+100H] ; Use bank 0. Cache bank conflict
> >         mov ecx, [rsi+110H] ; Use bank 1. No cache bank conflict
> >
> > That isn't a problem on Haswell, but it is probably worth ordering
> > the 'adc' in the loop to reduce the number of conflicts.
> > I didn't try to look for that though.
> > I only remember testing aligned buffers on Sandy/Ivy bridge.
> > Adding to alternate registers helped no end.
> 
> Cant that just be solved by having the two independent adcx/adox chains work
> from region that are 16+ bytes apart? For 40 byte ipv6 header it will be simple.

Not relevant, adcx/adox are only supported from Haswell/Broadwell onwards,
which don't have the 'cache bank conflict' issue.

In any case using adcx/adox for only 40 bytes isn't worth the effort.

The other issue with adcx/adox is that some cpu that support them
have very slow decode times - so unless you've got them in a loop
it will be horrid.
Trying to 'loop carry' both the 'carry' and 'overflow' flags is also
fraught. The 'loop' instruction would do it - but that is horribly
slow on Intel cpu (I think it is ok on AMD ones).
You can use jcxz at the top of the loop and an unconditional jump at the bottom.
There might be an obscure method of doing a 64bit->32bit move into %ecx
and then a jrcxz at the loop bottom!
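
Roughly that shape, as an untested sketch (assumes a CPU with the ADX
extension and a whole number of aligned 32-byte blocks; the helper name is
made up): the pointer and the block count are advanced with lea, which
leaves both the carry and the overflow flag alone, jrcxz at the top is the
flag-free exit test, and the two chains are only folded after the loop.

static u64 csum_32byte_blocks(const void *buff, unsigned long blocks)
{
	u64 a = 0, b = 0, sum;

	asm("xorq	%[b],%[b]\n"		/* zero b, clear CF and OF       */
	    "1:\n\t"
	    "jrcxz	2f\n\t"			/* block count (rcx) exhausted?  */
	    "adcx	0*8(%[src]),%[a]\n\t"	/* chain 1 rides the carry flag  */
	    "adox	1*8(%[src]),%[b]\n\t"	/* chain 2 rides the overflow    */
	    "adcx	2*8(%[src]),%[a]\n\t"
	    "adox	3*8(%[src]),%[b]\n\t"
	    "leaq	32(%[src]),%[src]\n\t"	/* lea leaves the flags alone    */
	    "leaq	-1(%%rcx),%%rcx\n\t"
	    "jmp	1b\n"
	    "2:\n\t"
	    "seto	%%cl\n\t"		/* rcx is 0 here, so rcx := OF   */
	    "adcq	%%rcx,%[a]"		/* fold pending OF and CF into a */
	    : [a] "+r" (a), [b] "+r" (b), [src] "+r" (buff), "+c" (blocks)
	    :
	    : "memory");

	sum = a + b;			/* end-around carry of the two chains */
	return sum + (sum < a);
}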

For Ivy/Sandy bridge it is noted:
There is hardly any penalty for misaligned memory access with operand sizes
of 64 bits or less, except for the effect of using multiple cache banks.

That might mean that you can do a misaligned read every clock,
with the only issue arising when trying to do 2 reads/clock.
Given the checksum code needs to do 'adc', the carry flag constrains
you to 1 read/clock - so there may actually be no real penalty for
a misaligned buffer at all.

No one (except me) has actually noticed that the adc chain takes two
clocks per adc on sandy bridge, so if the misaligned memory reads
take two clocks it makes no difference.

(info from pdf's from www.agner.org/optimize)

I've not got the test systems and program I used back in May 2020
to hand any more.

I certainly found that efficiently handling the 'odd 7 bytes'
was actually more difficult than it might seem.

	David

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-11-18 16:00     ` Peter Zijlstra
  2021-11-18 16:26       ` Johannes Berg
@ 2021-12-29  6:00       ` Al Viro
  2022-01-31  2:29         ` Al Viro
  1 sibling, 1 reply; 26+ messages in thread
From: Al Viro @ 2021-12-29  6:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um

On Thu, Nov 18, 2021 at 05:00:58PM +0100, Peter Zijlstra wrote:

> Dear um folks, is this indeed the best solution? It's a bit sad to have
> to add this to x86_64, but if that's the way it is...

Something like this, perhaps? (absolutely untested)

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index e5a7b552bb384..a58b67ec8119d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -23,7 +23,6 @@ generic-y += softirq_stack.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
-generic-y += word-at-a-time.h
 generic-y += kprobes.h
 generic-y += mm_hooks.h
 generic-y += vga.h
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 95d26a69088b1..a812cc35092c6 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -9,6 +9,7 @@ endmenu
 config UML_X86
 	def_bool y
 	select GENERIC_FIND_FIRST_BIT
+	select DCACHE_WORD_ACCESS
 
 config 64BIT
 	bool "64-bit kernel" if "$(SUBARCH)" = "x86"

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad'
  2021-12-29  6:00       ` Al Viro
@ 2022-01-31  2:29         ` Al Viro
  0 siblings, 0 replies; 26+ messages in thread
From: Al Viro @ 2022-01-31  2:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, kernel test robot, kbuild-all, linux-kernel, x86,
	Alexander Duyck, linux-um

On Wed, Dec 29, 2021 at 06:00:56AM +0000, Al Viro wrote:
> On Thu, Nov 18, 2021 at 05:00:58PM +0100, Peter Zijlstra wrote:
> 
> > Dear um folks, is this indeed the best solution? It's a bit sad to have
> > to add this to x86_64, but if that's the way it is...
> 
> Something like this, perhaps? (absolutely untested)

[snip]
AFAICS, this (on top of current mainline) does the right thing.

commit 6692531df62d812de5d22c8bca0d90edc163aa84
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Sun Jan 30 21:25:53 2022 -0500

    uml/x86: use x86 load_unaligned_zeropad()
    
    allows, among other things, to drop !DCACHE_WORD_ACCESS mess in
    x86 csum-partial_64.c
    
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index e5a7b552bb384..a58b67ec8119d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -23,7 +23,6 @@ generic-y += softirq_stack.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
-generic-y += word-at-a-time.h
 generic-y += kprobes.h
 generic-y += mm_hooks.h
 generic-y += vga.h
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1f8a8f8951738..50734a23034c4 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -93,7 +93,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		buff += 8;
 	}
 	if (len & 7) {
-#ifdef CONFIG_DCACHE_WORD_ACCESS
 		unsigned int shift = (8 - (len & 7)) * 8;
 		unsigned long trail;
 
@@ -103,31 +102,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    "adcq $0,%[res]"
 			: [res] "+r" (temp64)
 			: [trail] "r" (trail));
-#else
-		if (len & 4) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u32 *)buff)
-				: "memory");
-			buff += 4;
-		}
-		if (len & 2) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u16 *)buff)
-				: "memory");
-			buff += 2;
-		}
-		if (len & 1) {
-			asm("addq %[val],%[res]\n\t"
-			    "adcq $0,%[res]"
-				: [res] "+r" (temp64)
-				: [val] "r" ((u64)*(u8 *)buff)
-				: "memory");
-		}
-#endif
 	}
 	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
 	if (unlikely(odd)) {
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 40d6a06e41c81..4eb47d3ba6250 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -8,6 +8,7 @@ endmenu
 
 config UML_X86
 	def_bool y
+	select DCACHE_WORD_ACCESS
 
 config 64BIT
 	bool "64-bit kernel" if "$(SUBARCH)" = "x86"

^ permalink raw reply related	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2022-01-31  2:30 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-17 18:45 [tip:x86/core 1/1] arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' kernel test robot
2021-11-17 18:55 ` Eric Dumazet
2021-11-17 19:40   ` Eric Dumazet
2021-11-18 16:00     ` Peter Zijlstra
2021-11-18 16:26       ` Johannes Berg
2021-11-18 16:57         ` Eric Dumazet
2021-11-18 17:02           ` Eric Dumazet
2021-11-25  1:58           ` Noah Goldstein
2021-11-25  2:56             ` Eric Dumazet
2021-11-25  3:41               ` Noah Goldstein
2021-11-25  4:00                 ` Eric Dumazet
2021-11-25  4:08                   ` Eric Dumazet
2021-11-25  4:20                     ` Eric Dumazet
2021-11-25  4:56                       ` Noah Goldstein
2021-11-25  5:09                         ` Noah Goldstein
2021-11-25  6:32                           ` Eric Dumazet
2021-11-25  6:45                             ` Eric Dumazet
2021-11-25  6:49                               ` Noah Goldstein
2021-11-25  6:47                             ` Noah Goldstein
2021-11-26 17:18                   ` David Laight
2021-11-26 18:09                     ` Eric Dumazet
2021-11-26 22:41                       ` David Laight
2021-11-26 23:04                         ` Noah Goldstein
2021-11-28 18:30                           ` David Laight
2021-12-29  6:00       ` Al Viro
2022-01-31  2:29         ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).