linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
To: Eric Biggers <ebiggers3@gmail.com>
Cc: minchan@kernel.org, akpm@linux-foundation.org,
	bongkyu.kim@lge.com, rsalvaterra@gmail.com,
	sergey.senozhatsky@gmail.com, gregkh@linuxfoundation.org,
	linux-kernel@vger.kernel.org, herbert@gondor.apana.org.au,
	davem@davemloft.net, linux-crypto@vger.kernel.org,
	anton@enomsg.org, ccross@android.com, keescook@chromium.org,
	tony.luck@intel.com
Subject: Re: [PATCH] lz4: fix performance regressions
Date: Tue, 14 Feb 2017 11:33:44 +0100	[thread overview]
Message-ID: <20170214103344.GA5475@bierbaron.springfield.local> (raw)
In-Reply-To: <20170212233802.GA29621@zzz>

Hey Eric,

On Sun, Feb 12, 2017 at 03:38:02PM -0800, Eric Biggers wrote:
> Hi Sven,
> 
> On Sun, Feb 12, 2017 at 12:16:18PM +0100, Sven Schmidt wrote:
> >  /*-************************************
> >   *	Reading and writing into memory
> >   **************************************/
> > +typedef union {
> > +	U16 u16;
> > +	U32 u32;
> > +	size_t uArch;
> > +} __packed unalign;
> > 
> > -static inline U16 LZ4_read16(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr)
> >  {
> > -	U16 val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->u16;
> >  }
> > 
> > -static inline U32 LZ4_read32(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr)
> >  {
> > -	U32 val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->u32;
> >  }
> > 
> > -static inline size_t LZ4_read_ARCH(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr)
> >  {
> > -	size_t val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->uArch;
> >  }
> > 
> > -static inline void LZ4_write16(void *memPtr, U16 value)
> > +static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value)
> >  {
> > -	memcpy(memPtr, &value, sizeof(value));
> > +	((unalign *)memPtr)->u16 = value;
> >  }
> > 
> > -static inline void LZ4_write32(void *memPtr, U32 value)
> > -{
> > -	memcpy(memPtr, &value, sizeof(value));
> > +static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) {
> > +	((unalign *)memPtr)->u32 = value;
> >  }
> > 
> > -static inline U16 LZ4_readLE16(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr)
> >  {
> > -#ifdef __LITTLE_ENDIAN__
> > +#if LZ4_LITTLE_ENDIAN
> >  	return LZ4_read16(memPtr);
> >  #else
> >  	const BYTE *p = (const BYTE *)memPtr;
> > @@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr)
> >  #endif
> >  }
> 
> Since upstream LZ4 is intended to be compiled at -O3, this may allow it to get
> away with using memcpy() for unaligned memory accesses.  The reason it uses
> memcpy() is that, other than a byte-by-byte copy, it is the only portable way to
> express unaligned memory accesses.  But the Linux kernel is sometimes compiled
> optimized for size (-Os), and I wouldn't be *too* surprised if some of the
> memcpy()'s don't always get inlined then, which could be causing the performance
> regression being observed.  (Of course, this could be verified by checking
> whether CONFIG_CC_OPTIMIZE_FOR_SIZE=y is set, then reading the assembly.)
> 
> But I don't think accessing a __packed structure directly is the right
> alternative.  Instead, Linux already includes macros for unaligned memory
> accesses which have been optimized for every supported architecture.  Those
> should just be used instead, e.g. like this:
> 
> static FORCE_INLINE U16 LZ4_read16(const void *ptr)
> {
> 	return get_unaligned((const u16 *)ptr);
> }
> 
> static FORCE_INLINE U32 LZ4_read32(const void *ptr)
> {
> 	return get_unaligned((const u32 *)ptr);
> }
> 
> static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr)
> {
> 	return get_unaligned((const size_t *)ptr);
> }
> 
> static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value)
> {
> 	put_unaligned(value, (u16 *)memPtr);
> }
> 
> static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value)
> {
> 	put_unaligned(value, (u32 *)memPtr);
> }
> 
> static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr)
> {
> 	return get_unaligned_le16(memPtr);
> }
> 
> static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
> {
> 	return put_unaligned_le16(value, memPtr);
> }
> 
> static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
> {
> 	if (LZ4_64bits()) {
> 		u64 a = get_unaligned((const u64 *)src);
> 		put_unaligned(a, (u64 *)dst);
> 	} else {
> 		u32 a = get_unaligned((const u32 *)src);
> 		u32 b = get_unaligned((const u32 *)src + 1);
> 		put_unaligned(a, (u32 *)dst);
> 		put_unaligned(b, (u32 *)dst + 1);
> 	}
> }
> 
> 
> Note that I dropped __maybe_unused as it's not needed on inline functions.
> That should be done everywhere else the patch proposes to add it too.
> 
> > -#if LZ4_ARCH64
> > -#ifdef __BIG_ENDIAN__
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
> > +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
> > +{
> > +#if LZ4_LITTLE_ENDIAN
> > +#if LZ4_ARCH64 /* 64 Bits Little Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	static const int DeBruijnBytePos[64] = {
> > +		0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7,
> > +		0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7,
> > +		7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6,
> > +		7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
> > +	};
> > +
> > +	return DeBruijnBytePos[((U64)((val & -(long long)val)
> > +		* 0x0218A392CDABBD3FULL)) >> 58];
> >  #else
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
> > -#endif
> > +	return (__builtin_ctzll((U64)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#else /* 32 Bits Little Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	static const int DeBruijnBytePos[32] = {
> > +		0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1,
> > +		3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1
> > +	};
> > +
> > +	return DeBruijnBytePos[((U32)((val & -(S32)val)
> > +		* 0x077CB531U)) >> 27];
> >  #else
> > -#ifdef __BIG_ENDIAN__
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
> > +	return (__builtin_ctz((U32)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#endif /* LZ4_ARCH64 */
> > +#else /* Big Endian */
> > +#if LZ4_ARCH64 /* 64 Bits Big Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	unsigned int r;
> > +
> > +	if (!(val >> 32)) {
> > +		r = 4;
> > +	} else {
> > +		r = 0;
> > +		val >>= 32;
> > +	}
> > +
> > +	if (!(val >> 16)) {
> > +		r += 2;
> > +		val >>= 8;
> > +	} else {
> > +		val >>= 24;
> > +	}
> > +
> > +	r += (!val);
> > +
> > +	return r;
> >  #else
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
> > -#endif
> > -#endif
> > +	return (__builtin_clzll((U64)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#else /* 32 Bits Big Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	unsigned int r;
> > +
> > +	if (!(val >> 16)) {
> > +		r = 2;
> > +		val >>= 8;
> > +	} else {
> > +		r = 0;
> > +		val >>= 24;
> > +	}
> > +
> > +	r += (!val);
> > +
> > +	return r;
> > +#else
> > +	return (__builtin_clz((U32)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#endif /* LZ4_ARCH64 */
> > +#endif /* LZ4_LITTLE_ENDIAN */
> > +}
> 
> The reason LZ4_NbCommonBytes() in upstream LZ4 is so complicated is that it
> needs to provide portable fallbacks that work on *any* platform and compiler.
> This isn't needed in the Linux kernel, and it should just call the functions
> already defined that do the right thing:
> 
> static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
> {
>         if (LZ4_isLittleEndian())
>                 return __ffs(val) >> 3;
>         else
>                 return (BITS_PER_LONG - 1 - __fls(val)) >> 3;
> }                      
> 
> To be clear, when I said that upstream LZ4 shouldn't generally be changed, I'm
> primarily talking about the core code, not the platform-specific parts.  What we
> need to do is define platform-specific stuff, like LZ4_read*(), LZ4_write*(),
> LZ4_NbCommonBytes(), LZ4_64bits(), and FORCE_INLINE, in a way that makes sense
> for the Linux kernel and the environment it's compiled in.  Also I think it's
> fine, and maybe even necessary for performance, to *add* inline or FORCE_INLINE
> in some places too, given that LZ4 in Linux may get compiled with a lower
> optimization level than that intended to be used for upstream LZ4 --- though it
> may be worth considering updating the Makefile to just always compile the LZ4
> files with -O3 instead.  What should be avoided is making unnecessary changes to
> the *users* of the platform-specific code or to the core (de)compression
> parameters or templates.
> 
> Eric

I'm very thankful for your effort here! I included all your proposed changes (as well as -O3 as compiler flag in the Makefile)
and noticed further improvements concerning the performance in ZRAM (compared to current LZ4 in 4.10 RC8 as well as the previous changes discussed
here). I would never have thought that the NBcommonBytes function can be re-written in such an easy way, since the current LZ4 in the kernel
uses similar approach checking for 64/32 bit and endianess.

Also, thanks for your very detailed remarks. That's really helpful to me.

Regards,

Sven

  reply	other threads:[~2017-02-14 10:33 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <1482259992-16680-1-git-send-email-4sschmid@informatik.uni-hamburg.de>
     [not found] ` <1482259992-16680-2-git-send-email-4sschmid@informatik.uni-hamburg.de>
2016-12-22 13:25   ` [PATCH 1/3] crypto: Change lz4 modules to work with new lz4 compressor module version Sergey Senozhatsky
2017-01-07 16:55 ` [PATCH v2 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 1/4] lib: Update LZ4 compressor module based on LZ4 v1.7.2 Sven Schmidt
2017-01-08 11:22     ` Greg KH
2017-01-10  9:32       ` Sven Schmidt
2017-01-10  9:59         ` Greg KH
2017-01-08 11:25     ` Greg KH
2017-01-08 11:33       ` Rui Salvaterra
2017-01-10  9:21       ` Sven Schmidt
2017-01-10 10:00         ` Greg KH
2017-01-10 10:50           ` Willy Tarreau
2017-01-07 16:55   ` [PATCH v2 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-08 11:23     ` Greg KH
2017-01-07 16:55   ` [PATCH v2 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to comply " Sven Schmidt
2017-01-07 21:33     ` Kees Cook
2017-01-10  9:45       ` Sven Schmidt
2017-01-21 15:09 ` [PATCH v3 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-21 15:09   ` [PATCH 1/4] lib: " Sven Schmidt
2017-01-21 15:56     ` kbuild test robot
2017-01-21 16:16     ` kbuild test robot
2017-01-21 17:38     ` kbuild test robot
2017-01-22 11:05     ` Greg KH
2017-01-21 15:09   ` [PATCH 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-21 15:09   ` [PATCH 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-21 15:09   ` [PATCH 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-22 19:35 ` [PATCH v4 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 1/4] lib: " Sven Schmidt
2017-01-24  0:23     ` Andrew Morton
2017-01-24 16:48       ` Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57 ` [PATCH v5 0/5] Update LZ4 compressor module Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 1/5] lib: " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-01-26  9:19   ` [PATCH v5 0/5] Update LZ4 compressor module Eric Biggers
2017-01-26 14:15     ` Sven Schmidt
2017-01-27 22:01 ` [PATCH v6 " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 1/5] lib: " Sven Schmidt
2017-01-31 22:27     ` Jonathan Corbet
2017-02-01 20:18       ` Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-05 19:09 ` [PATCH v7 0/5] Update LZ4 compressor module Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 1/5] lib: " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-08 23:31   ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-09  0:24     ` Eric Biggers
2017-02-09  5:24       ` Eric Biggers
2017-02-09 11:05         ` Sven Schmidt
2017-02-09 18:20           ` Eric Biggers
2017-02-10  0:14         ` Minchan Kim
2017-02-09 11:02       ` Sven Schmidt
2017-02-09 18:29         ` Eric Biggers
2017-02-10  3:57           ` David Miller
2017-02-09 10:56     ` Sven Schmidt
2017-02-10  0:13       ` Minchan Kim
2017-02-12 11:16         ` Sven Schmidt
2017-02-12 11:16           ` [PATCH] lz4: fix performance regressions Sven Schmidt
2017-02-12 13:05             ` Willy Tarreau
2017-02-12 15:20               ` Sven Schmidt
2017-02-12 21:41                 ` Willy Tarreau
2017-02-13 11:53                   ` Sven Schmidt
2017-02-13 13:37                     ` Willy Tarreau
2017-02-12 23:38             ` Eric Biggers
2017-02-14 10:33               ` Sven Schmidt [this message]
2017-02-13  0:03           ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-13 12:08             ` Sven Schmidt
2017-02-15  7:29               ` Minchan Kim
2017-02-15 18:16 ` [PATCH v8 " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 1/5] lib: " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170214103344.GA5475@bierbaron.springfield.local \
    --to=4sschmid@informatik.uni-hamburg.de \
    --cc=akpm@linux-foundation.org \
    --cc=anton@enomsg.org \
    --cc=bongkyu.kim@lge.com \
    --cc=ccross@android.com \
    --cc=davem@davemloft.net \
    --cc=ebiggers3@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=herbert@gondor.apana.org.au \
    --cc=keescook@chromium.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=minchan@kernel.org \
    --cc=rsalvaterra@gmail.com \
    --cc=sergey.senozhatsky@gmail.com \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).