On Wed, Nov 24, 2021 at 9:09 PM Noah Goldstein wrote:
>
> Although I see slightly worse performance with aligned `buff` in
> the branch-free approach. I imagine if non-aligned `buff` is that
> uncommon it might be better to speculate past the work of `ror`.

Yes, there is no clear win here from removing the conditional (the cost
is really the same), but using ror32() removes the from32to16() helper
and gets rid of one folding step.

I will formally submit this change, thanks!

diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..cf4bd3ef66e56c681b3435d43011ece78438376d 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -11,16 +11,6 @@
 #include <asm/checksum.h>
 #include <asm/word-at-a-time.h>
 
-static inline unsigned short from32to16(unsigned a)
-{
-	unsigned short b = a >> 16;
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -41,6 +31,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	if (unlikely(odd)) {
 		if (unlikely(len == 0))
 			return sum;
+		temp64 = ror32((__force u32)sum, 8);
 		temp64 += (*(unsigned char *)buff << 8);
 		len--;
 		buff++;
@@ -129,10 +120,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 #endif
 	}
 	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
+	if (unlikely(odd))
+		result = ror32(result, 8);
 	return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);
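
For reference (not part of the patch), here is a quick standalone
userspace sketch of the identity the change relies on: in
ones-complement arithmetic, rotating the 32-bit partial sum right by 8
bits is congruent (mod 0xffff) to folding it down to 16 bits and
swapping the two bytes, which is what the removed from32to16() + byte
swap pair was doing for the odd-address case. ror32() and fold32to16()
below are local re-implementations for the sketch, not the kernel
helpers.

/*
 * Userspace check: fold-then-swap vs rotate-then-fold give the same
 * 16-bit ones-complement value for any 32-bit partial sum.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

/* Fold a 32-bit sum to 16 bits with end-around carry, like from32to16() did. */
static uint16_t fold32to16(uint32_t x)
{
	x = (x & 0xffff) + (x >> 16);	/* first fold, may carry into bit 16 */
	x = (x & 0xffff) + (x >> 16);	/* fold the carry back in */
	return (uint16_t)x;
}

int main(void)
{
	for (int i = 0; i < 1000000; i++) {
		uint32_t x = ((uint32_t)rand() << 16) ^ (uint32_t)rand();

		/* Old final step for the odd case: fold, then swap the two bytes. */
		uint16_t a = fold32to16(x);
		a = (uint16_t)((a >> 8) | (a << 8));

		/* New final step: rotate the 32-bit result; callers fold it later. */
		uint16_t b = fold32to16(ror32(x, 8));

		/* 0 and 0xffff are the same value in ones-complement arithmetic. */
		if (a % 0xffff != b % 0xffff) {
			printf("mismatch: x=%08x a=%04x b=%04x\n",
			       (unsigned)x, (unsigned)a, (unsigned)b);
			return 1;
		}
	}
	printf("fold+swap and ror32 agree for all tested values\n");
	return 0;
}

The same congruence also explains the new temp64 = ror32((__force
u32)sum, 8) in the odd branch: mod 0xffff both rotations multiply by
2^8, so the incoming sum is effectively multiplied by 2^16 == 1 and
comes out unchanged, while the final rotation alone corrects the
byte-shifted buffer contribution.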