All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
To: Eric Biggers <ebiggers3@gmail.com>
Cc: minchan@kernel.org, akpm@linux-foundation.org,
	bongkyu.kim@lge.com, rsalvaterra@gmail.com,
	sergey.senozhatsky@gmail.com, gregkh@linuxfoundation.org,
	linux-kernel@vger.kernel.org, herbert@gondor.apana.org.au,
	davem@davemloft.net, linux-crypto@vger.kernel.org,
	anton@enomsg.org, ccross@android.com, keescook@chromium.org,
	tony.luck@intel.com
Subject: Re: [PATCH] lz4: fix performance regressions
Date: Tue, 14 Feb 2017 11:33:44 +0100	[thread overview]
Message-ID: <20170214103344.GA5475@bierbaron.springfield.local> (raw)
In-Reply-To: <20170212233802.GA29621@zzz>

Hey Eric,

On Sun, Feb 12, 2017 at 03:38:02PM -0800, Eric Biggers wrote:
> Hi Sven,
> 
> On Sun, Feb 12, 2017 at 12:16:18PM +0100, Sven Schmidt wrote:
> >  /*-************************************
> >   *	Reading and writing into memory
> >   **************************************/
> > +typedef union {
> > +	U16 u16;
> > +	U32 u32;
> > +	size_t uArch;
> > +} __packed unalign;
> > 
> > -static inline U16 LZ4_read16(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr)
> >  {
> > -	U16 val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->u16;
> >  }
> > 
> > -static inline U32 LZ4_read32(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr)
> >  {
> > -	U32 val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->u32;
> >  }
> > 
> > -static inline size_t LZ4_read_ARCH(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr)
> >  {
> > -	size_t val;
> > -
> > -	memcpy(&val, memPtr, sizeof(val));
> > -
> > -	return val;
> > +	return ((const unalign *)ptr)->uArch;
> >  }
> > 
> > -static inline void LZ4_write16(void *memPtr, U16 value)
> > +static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value)
> >  {
> > -	memcpy(memPtr, &value, sizeof(value));
> > +	((unalign *)memPtr)->u16 = value;
> >  }
> > 
> > -static inline void LZ4_write32(void *memPtr, U32 value)
> > -{
> > -	memcpy(memPtr, &value, sizeof(value));
> > +static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) {
> > +	((unalign *)memPtr)->u32 = value;
> >  }
> > 
> > -static inline U16 LZ4_readLE16(const void *memPtr)
> > +static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr)
> >  {
> > -#ifdef __LITTLE_ENDIAN__
> > +#if LZ4_LITTLE_ENDIAN
> >  	return LZ4_read16(memPtr);
> >  #else
> >  	const BYTE *p = (const BYTE *)memPtr;
> > @@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr)
> >  #endif
> >  }
> 
> Since upstream LZ4 is intended to be compiled at -O3, this may allow it to get
> away with using memcpy() for unaligned memory accesses.  The reason it uses
> memcpy() is that, other than a byte-by-byte copy, it is the only portable way to
> express unaligned memory accesses.  But the Linux kernel is sometimes compiled
> optimized for size (-Os), and I wouldn't be *too* surprised if some of the
> memcpy()'s don't always get inlined then, which could be causing the performance
> regression being observed.  (Of course, this could be verified by checking
> whether CONFIG_CC_OPTIMIZE_FOR_SIZE=y is set, then reading the assembly.)
> 
> But I don't think accessing a __packed structure directly is the right
> alternative.  Instead, Linux already includes macros for unaligned memory
> accesses which have been optimized for every supported architecture.  Those
> should just be used instead, e.g. like this:
> 
> static FORCE_INLINE U16 LZ4_read16(const void *ptr)
> {
> 	return get_unaligned((const u16 *)ptr);
> }
> 
> static FORCE_INLINE U32 LZ4_read32(const void *ptr)
> {
> 	return get_unaligned((const u32 *)ptr);
> }
> 
> static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr)
> {
> 	return get_unaligned((const size_t *)ptr);
> }
> 
> static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value)
> {
> 	put_unaligned(value, (u16 *)memPtr);
> }
> 
> static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value)
> {
> 	put_unaligned(value, (u32 *)memPtr);
> }
> 
> static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr)
> {
> 	return get_unaligned_le16(memPtr);
> }
> 
> static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
> {
> 	return put_unaligned_le16(value, memPtr);
> }
> 
> static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
> {
> 	if (LZ4_64bits()) {
> 		u64 a = get_unaligned((const u64 *)src);
> 		put_unaligned(a, (u64 *)dst);
> 	} else {
> 		u32 a = get_unaligned((const u32 *)src);
> 		u32 b = get_unaligned((const u32 *)src + 1);
> 		put_unaligned(a, (u32 *)dst);
> 		put_unaligned(b, (u32 *)dst + 1);
> 	}
> }
> 
> 
> Note that I dropped __maybe_unused as it's not needed on inline functions.
> That should be done everywhere else the patch proposes to add it too.
> 
> > -#if LZ4_ARCH64
> > -#ifdef __BIG_ENDIAN__
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
> > +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
> > +{
> > +#if LZ4_LITTLE_ENDIAN
> > +#if LZ4_ARCH64 /* 64 Bits Little Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	static const int DeBruijnBytePos[64] = {
> > +		0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7,
> > +		0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7,
> > +		7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6,
> > +		7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
> > +	};
> > +
> > +	return DeBruijnBytePos[((U64)((val & -(long long)val)
> > +		* 0x0218A392CDABBD3FULL)) >> 58];
> >  #else
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
> > -#endif
> > +	return (__builtin_ctzll((U64)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#else /* 32 Bits Little Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	static const int DeBruijnBytePos[32] = {
> > +		0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1,
> > +		3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1
> > +	};
> > +
> > +	return DeBruijnBytePos[((U32)((val & -(S32)val)
> > +		* 0x077CB531U)) >> 27];
> >  #else
> > -#ifdef __BIG_ENDIAN__
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
> > +	return (__builtin_ctz((U32)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#endif /* LZ4_ARCH64 */
> > +#else /* Big Endian */
> > +#if LZ4_ARCH64 /* 64 Bits Big Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	unsigned int r;
> > +
> > +	if (!(val >> 32)) {
> > +		r = 4;
> > +	} else {
> > +		r = 0;
> > +		val >>= 32;
> > +	}
> > +
> > +	if (!(val >> 16)) {
> > +		r += 2;
> > +		val >>= 8;
> > +	} else {
> > +		val >>= 24;
> > +	}
> > +
> > +	r += (!val);
> > +
> > +	return r;
> >  #else
> > -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
> > -#endif
> > -#endif
> > +	return (__builtin_clzll((U64)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#else /* 32 Bits Big Endian */
> > +#if defined(LZ4_FORCE_SW_BITCOUNT)
> > +	unsigned int r;
> > +
> > +	if (!(val >> 16)) {
> > +		r = 2;
> > +		val >>= 8;
> > +	} else {
> > +		r = 0;
> > +		val >>= 24;
> > +	}
> > +
> > +	r += (!val);
> > +
> > +	return r;
> > +#else
> > +	return (__builtin_clz((U32)val) >> 3);
> > +#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
> > +#endif /* LZ4_ARCH64 */
> > +#endif /* LZ4_LITTLE_ENDIAN */
> > +}
> 
> The reason LZ4_NbCommonBytes() in upstream LZ4 is so complicated is that it
> needs to provide portable fallbacks that work on *any* platform and compiler.
> This isn't needed in the Linux kernel, and it should just call the functions
> already defined that do the right thing:
> 
> static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
> {
>         if (LZ4_isLittleEndian())
>                 return __ffs(val) >> 3;
>         else
>                 return (BITS_PER_LONG - 1 - __fls(val)) >> 3;
> }                      
> 
> To be clear, when I said that upstream LZ4 shouldn't generally be changed, I'm
> primarily talking about the core code, not the platform-specific parts.  What we
> need to do is define platform-specific stuff, like LZ4_read*(), LZ4_write*(),
> LZ4_NbCommonBytes(), LZ4_64bits(), and FORCE_INLINE, in a way that makes sense
> for the Linux kernel and the environment it's compiled in.  Also I think it's
> fine, and maybe even necessary for performance, to *add* inline or FORCE_INLINE
> in some places too, given that LZ4 in Linux may get compiled with a lower
> optimization level than that intended to be used for upstream LZ4 --- though it
> may be worth considering updating the Makefile to just always compile the LZ4
> files with -O3 instead.  What should be avoided is making unnecessary changes to
> the *users* of the platform-specific code or to the core (de)compression
> parameters or templates.
> 
> Eric

I'm very thankful for your effort here! I included all your proposed changes (as well as -O3 as compiler flag in the Makefile)
and noticed further improvements concerning the performance in ZRAM (compared to current LZ4 in 4.10 RC8 as well as the previous changes discussed
here). I would never have thought that the NBcommonBytes function can be re-written in such an easy way, since the current LZ4 in the kernel
uses similar approach checking for 64/32 bit and endianess.

Also, thanks for your very detailed remarks. That's really helpful to me.

Regards,

Sven

  reply	other threads:[~2017-02-14 10:33 UTC|newest]

Thread overview: 104+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-20 18:53 [PATCH 0/3] Update LZ4 compressor module Sven Schmidt
2016-12-20 18:53 ` [PATCH 1/3] crypto: Change lz4 modules to work with new lz4 compressor module version Sven Schmidt
2016-12-21  5:25   ` kbuild test robot
2016-12-22 13:25   ` Sergey Senozhatsky
2016-12-20 18:53 ` [PATCH 2/3] fs/pstore: fs/squashfs: Change lz4 compressor functions to work with new version Sven Schmidt
2016-12-21  5:16   ` kbuild test robot
2016-12-21  5:18   ` kbuild test robot
2016-12-22 13:21   ` Sergey Senozhatsky
2016-12-22 15:31     ` Sven Schmidt
2016-12-22 15:36       ` Sergey Senozhatsky
2016-12-22 13:27   ` Sergey Senozhatsky
2016-12-20 18:53 ` [PATCH 3/3] lib: Update LZ4 compressor module based on LZ4 v1.7.2 Sven Schmidt
2016-12-20 19:52   ` Joe Perches
2016-12-21 20:04     ` Sven Schmidt
2016-12-22 17:31       ` Greg KH
2016-12-22 18:39         ` Sven Schmidt
2016-12-21  7:09   ` kbuild test robot
2016-12-22 17:29 ` [PATCH 0/3] Update LZ4 compressor module Greg KH
2016-12-22 18:35   ` Sven Schmidt
2016-12-23 20:53     ` Greg KH
2017-01-07 16:55 ` [PATCH v2 0/4] " Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 1/4] lib: Update LZ4 compressor module based on LZ4 v1.7.2 Sven Schmidt
2017-01-08 11:22     ` Greg KH
2017-01-10  9:32       ` Sven Schmidt
2017-01-10  9:59         ` Greg KH
2017-01-08 11:25     ` Greg KH
2017-01-08 11:33       ` Rui Salvaterra
2017-01-10  9:21       ` Sven Schmidt
2017-01-10 10:00         ` Greg KH
2017-01-10 10:50           ` Willy Tarreau
2017-01-07 16:55   ` [PATCH v2 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-08 11:23     ` Greg KH
2017-01-07 16:55   ` [PATCH v2 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to comply " Sven Schmidt
2017-01-07 21:33     ` Kees Cook
2017-01-10  9:45       ` Sven Schmidt
2017-01-21 15:09 ` [PATCH v3 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-21 15:09   ` [PATCH 1/4] lib: " Sven Schmidt
2017-01-21 15:56     ` kbuild test robot
2017-01-21 16:16     ` kbuild test robot
2017-01-21 17:38     ` kbuild test robot
2017-01-22 11:05     ` Greg KH
2017-01-21 15:09   ` [PATCH 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-21 15:09   ` [PATCH 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-21 15:09   ` [PATCH 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-22 19:35 ` [PATCH v4 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 1/4] lib: " Sven Schmidt
2017-01-24  0:23     ` Andrew Morton
2017-01-24 16:48       ` Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57 ` [PATCH v5 0/5] Update LZ4 compressor module Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 1/5] lib: " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-01-26  9:19   ` [PATCH v5 0/5] Update LZ4 compressor module Eric Biggers
2017-01-26 14:15     ` Sven Schmidt
2017-01-27 22:01 ` [PATCH v6 " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 1/5] lib: " Sven Schmidt
2017-01-31 22:27     ` Jonathan Corbet
2017-02-01 20:18       ` Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-05 19:09 ` [PATCH v7 0/5] Update LZ4 compressor module Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 1/5] lib: " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-08 23:31   ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-09  0:24     ` Eric Biggers
2017-02-09  5:24       ` Eric Biggers
2017-02-09 11:05         ` Sven Schmidt
2017-02-09 18:20           ` Eric Biggers
2017-02-10  0:14         ` Minchan Kim
2017-02-09 11:02       ` Sven Schmidt
2017-02-09 18:29         ` Eric Biggers
2017-02-10  3:57           ` David Miller
2017-02-09 10:56     ` Sven Schmidt
2017-02-10  0:13       ` Minchan Kim
2017-02-12 11:16         ` Sven Schmidt
2017-02-12 11:16           ` [PATCH] lz4: fix performance regressions Sven Schmidt
2017-02-12 11:16             ` Sven Schmidt
2017-02-12 13:05             ` Willy Tarreau
2017-02-12 15:20               ` Sven Schmidt
2017-02-12 21:41                 ` Willy Tarreau
2017-02-13 11:53                   ` Sven Schmidt
2017-02-13 13:37                     ` Willy Tarreau
2017-02-12 23:38             ` Eric Biggers
2017-02-14 10:33               ` Sven Schmidt [this message]
2017-02-13  0:03           ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-13 12:08             ` Sven Schmidt
2017-02-15  7:29               ` Minchan Kim
2017-02-15 18:16 ` [PATCH v8 " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 1/5] lib: " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170214103344.GA5475@bierbaron.springfield.local \
    --to=4sschmid@informatik.uni-hamburg.de \
    --cc=akpm@linux-foundation.org \
    --cc=anton@enomsg.org \
    --cc=bongkyu.kim@lge.com \
    --cc=ccross@android.com \
    --cc=davem@davemloft.net \
    --cc=ebiggers3@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=herbert@gondor.apana.org.au \
    --cc=keescook@chromium.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=minchan@kernel.org \
    --cc=rsalvaterra@gmail.com \
    --cc=sergey.senozhatsky@gmail.com \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.