linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
To: minchan@kernel.org
Cc: ebiggers3@gmail.com, akpm@linux-foundation.org,
	bongkyu.kim@lge.com, rsalvaterra@gmail.com,
	sergey.senozhatsky@gmail.com, gregkh@linuxfoundation.org,
	linux-kernel@vger.kernel.org, herbert@gondor.apana.org.au,
	davem@davemloft.net, linux-crypto@vger.kernel.org,
	anton@enomsg.org, ccross@android.com, keescook@chromium.org,
	tony.luck@intel.com,
	Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Subject: [PATCH] lz4: fix performance regressions
Date: Sun, 12 Feb 2017 12:16:18 +0100	[thread overview]
Message-ID: <1486898178-17125-2-git-send-email-4sschmid@informatik.uni-hamburg.de> (raw)
In-Reply-To: <1486898178-17125-1-git-send-email-4sschmid@informatik.uni-hamburg.de>

Fix performance regressions compared to the current kernel LZ4 implementation.

Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
---
 include/linux/lz4.h      |   2 +-
 lib/lz4/lz4_compress.c   | 157 +++++++++++++++++++++++-------------
 lib/lz4/lz4_decompress.c |  50 ++++++++----
 lib/lz4/lz4defs.h        | 203 ++++++++++++++++++++++++++++++++---------------
 lib/lz4/lz4hc_compress.c |   8 +-
 5 files changed, 281 insertions(+), 139 deletions(-)

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index a3912d7..394e3d9 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -82,7 +82,7 @@
 /*-************************************************************************
  *	STREAMING CONSTANTS AND STRUCTURES
  **************************************************************************/
-#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE - 3)) + 4)
 #define LZ4_STREAMSIZE	(LZ4_STREAMSIZE_U64 * sizeof(unsigned long long))

 #define LZ4_STREAMHCSIZE        262192
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index 697dbda..2cbbf99 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -39,27 +39,33 @@
 #include <linux/kernel.h>
 #include <asm/unaligned.h>

+static const int LZ4_minLength = (MFLIMIT + 1);
+static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT - 1));
+
 /*-******************************
  *	Compression functions
  ********************************/
-static U32 LZ4_hash4(U32 sequence, tableType_t const tableType)
+static FORCE_INLINE U32 LZ4_hash4(
+	U32 sequence,
+	tableType_t const tableType)
 {
 	if (tableType == byU16)
 		return ((sequence * 2654435761U)
-			>> ((MINMATCH*8) - (LZ4_HASHLOG + 1)));
+			>> ((MINMATCH * 8) - (LZ4_HASHLOG + 1)));
 	else
 		return ((sequence * 2654435761U)
-			>> ((MINMATCH*8) - LZ4_HASHLOG));
+			>> ((MINMATCH * 8) - LZ4_HASHLOG));
 }

-#if LZ4_ARCH64
-static U32 LZ4_hash5(U64 sequence, tableType_t const tableType)
+static FORCE_INLINE __maybe_unused U32 LZ4_hash5(
+	U64 sequence,
+	tableType_t const tableType)
 {
 	const U32 hashLog = (tableType == byU16)
 		? LZ4_HASHLOG + 1
 		: LZ4_HASHLOG;

-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
 	static const U64 prime5bytes = 889523592379ULL;

 	return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog));
@@ -69,9 +75,10 @@ static U32 LZ4_hash5(U64 sequence, tableType_t const tableType)
 	return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog));
 #endif
 }
-#endif

-static U32 LZ4_hashPosition(const void *p, tableType_t tableType)
+static FORCE_INLINE U32 LZ4_hashPosition(
+	const void *p,
+	tableType_t const tableType)
 {
 #if LZ4_ARCH64
 	if (tableType == byU32)
@@ -81,8 +88,12 @@ static U32 LZ4_hashPosition(const void *p, tableType_t tableType)
 	return LZ4_hash4(LZ4_read32(p), tableType);
 }

-static void LZ4_putPositionOnHash(const BYTE *p, U32 h, void *tableBase,
-	tableType_t const tableType, const BYTE *srcBase)
+static void LZ4_putPositionOnHash(
+	const BYTE *p,
+	U32 h,
+	void *tableBase,
+	tableType_t const tableType,
+	const BYTE *srcBase)
 {
 	switch (tableType) {
 	case byPtr:
@@ -109,16 +120,22 @@ static void LZ4_putPositionOnHash(const BYTE *p, U32 h, void *tableBase,
 	}
 }

-static inline void LZ4_putPosition(const BYTE *p, void *tableBase,
-	tableType_t tableType, const BYTE *srcBase)
+static FORCE_INLINE void LZ4_putPosition(
+	const BYTE *p,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
 {
 	U32 const h = LZ4_hashPosition(p, tableType);

 	LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
 }

-static const BYTE *LZ4_getPositionOnHash(U32 h, void *tableBase,
-	tableType_t tableType, const BYTE *srcBase)
+static const BYTE *LZ4_getPositionOnHash(
+	U32 h,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
 {
 	if (tableType == byPtr) {
 		const BYTE **hashTable = (const BYTE **) tableBase;
@@ -135,12 +152,16 @@ static const BYTE *LZ4_getPositionOnHash(U32 h, void *tableBase,
 	{
 		/* default, to ensure a return */
 		const U16 * const hashTable = (U16 *) tableBase;
+
 		return hashTable[h] + srcBase;
 	}
 }

-static inline const BYTE *LZ4_getPosition(const BYTE *p, void *tableBase,
-	tableType_t tableType, const BYTE *srcBase)
+static FORCE_INLINE const BYTE *LZ4_getPosition(
+	const BYTE *p,
+	void *tableBase,
+	tableType_t tableType,
+	const BYTE *srcBase)
 {
 	U32 const h = LZ4_hashPosition(p, tableType);

@@ -152,7 +173,7 @@ static inline const BYTE *LZ4_getPosition(const BYTE *p, void *tableBase,
  * LZ4_compress_generic() :
  * inlined, to ensure branches are decided at compilation time
  */
-static inline int LZ4_compress_generic(
+static FORCE_INLINE int LZ4_compress_generic(
 	LZ4_stream_t_internal * const dictPtr,
 	const char * const source,
 	char * const dest,
@@ -187,6 +208,7 @@ static inline int LZ4_compress_generic(
 		/* Unsupported inputSize, too large (or negative) */
 		return 0;
 	}
+
 	switch (dict) {
 	case noDict:
 	default:
@@ -216,7 +238,8 @@ static inline int LZ4_compress_generic(

 	/* First Byte */
 	LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
-	ip++; forwardH = LZ4_hashPosition(ip, tableType);
+	ip++;
+	forwardH = LZ4_hashPosition(ip, tableType);

 	/* Main Loop */
 	for ( ; ; ) {
@@ -227,15 +250,14 @@ static inline int LZ4_compress_generic(
 		{
 			const BYTE *forwardIp = ip;
 			unsigned int step = 1;
-			unsigned int searchMatchNb = acceleration
-				<< LZ4_skipTrigger;
+			unsigned int searchMatchNb = acceleration << LZ4_SKIPTRIGGER;

 			do {
 				U32 const h = forwardH;

 				ip = forwardIp;
 				forwardIp += step;
-				step = (searchMatchNb++ >> LZ4_skipTrigger);
+				step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);

 				if (unlikely(forwardIp > mflimit))
 					goto _last_literals;
@@ -243,6 +265,7 @@ static inline int LZ4_compress_generic(
 				match = LZ4_getPositionOnHash(h,
 					dictPtr->hashTable,
 					tableType, base);
+
 				if (dict == usingExtDict) {
 					if (match < (const BYTE *)source) {
 						refDelta = dictDelta;
@@ -251,11 +274,12 @@ static inline int LZ4_compress_generic(
 						refDelta = 0;
 						lowLimit = (const BYTE *)source;
 				}	 }
+
 				forwardH = LZ4_hashPosition(forwardIp,
 					tableType);
+
 				LZ4_putPositionOnHash(ip, h, dictPtr->hashTable,
 					tableType, base);
-
 			} while (((dictIssue == dictSmall)
 					? (match < lowRefLimit)
 					: 0)
@@ -268,31 +292,34 @@ static inline int LZ4_compress_generic(

 		/* Catch up */
 		while (((ip > anchor) & (match + refDelta > lowLimit))
-			&& (unlikely(ip[-1] == match[refDelta - 1]))) {
+				&& (unlikely(ip[-1] == match[refDelta - 1]))) {
 			ip--;
 			match--;
-			}
+		}

 		/* Encode Literals */
 		{
 			unsigned const int litLength = (unsigned int)(ip - anchor);

 			token = op++;
+
 			if ((outputLimited) &&
 				/* Check output buffer overflow */
 				(unlikely(op + litLength +
 					(2 + 1 + LASTLITERALS) +
-					(litLength/255) > olimit)))
+					(litLength / 255) > olimit)))
 				return 0;
+
 			if (litLength >= RUN_MASK) {
 				int len = (int)litLength - RUN_MASK;

-				*token = (RUN_MASK<<ML_BITS);
-				for (; len >= 255 ; len -= 255)
+				*token = (RUN_MASK << ML_BITS);
+
+				for (; len >= 255; len -= 255)
 					*op++ = 255;
 				*op++ = (BYTE)len;
 			} else
-				*token = (BYTE)(litLength<<ML_BITS);
+				*token = (BYTE)(litLength << ML_BITS);

 			/* Copy Literals */
 			LZ4_wildCopy(op, anchor, op + litLength);
@@ -301,7 +328,8 @@ static inline int LZ4_compress_generic(

 _next_match:
 		/* Encode Offset */
-		LZ4_writeLE16(op, (U16)(ip - match)); op += 2;
+		LZ4_writeLE16(op, (U16)(ip - match));
+		op += 2;

 		/* Encode MatchLength */
 		{
@@ -313,11 +341,15 @@ static inline int LZ4_compress_generic(

 				match += refDelta;
 				limit = ip + (dictEnd - match);
+
 				if (limit > matchlimit)
 					limit = matchlimit;
+
 				matchCode = LZ4_count(ip + MINMATCH,
 					match + MINMATCH, limit);
+
 				ip += MINMATCH + matchCode;
+
 				if (ip == limit) {
 					unsigned const int more = LZ4_count(ip,
 						(const BYTE *)source,
@@ -336,17 +368,20 @@ static inline int LZ4_compress_generic(
 				/* Check output buffer overflow */
 				(unlikely(op +
 					(1 + LASTLITERALS) +
-					(matchCode>>8) > olimit)))
+					(matchCode >> 8) > olimit)))
 				return 0;
+
 			if (matchCode >= ML_MASK) {
 				*token += ML_MASK;
 				matchCode -= ML_MASK;
 				LZ4_write32(op, 0xFFFFFFFF);
-				while (matchCode >= 4*255) {
+
+				while (matchCode >= 4 * 255) {
 					op += 4;
 					LZ4_write32(op, 0xFFFFFFFF);
-					matchCode -= 4*255;
+					matchCode -= 4 * 255;
 				}
+
 				op += matchCode / 255;
 				*op++ = (BYTE)(matchCode % 255);
 			} else
@@ -365,6 +400,7 @@ static inline int LZ4_compress_generic(
 		/* Test next position */
 		match = LZ4_getPosition(ip, dictPtr->hashTable,
 			tableType, base);
+
 		if (dict == usingExtDict) {
 			if (match < (const BYTE *)source) {
 				refDelta = dictDelta;
@@ -374,7 +410,9 @@ static inline int LZ4_compress_generic(
 				lowLimit = (const BYTE *)source;
 			}
 		}
+
 		LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
+
 		if (((dictIssue == dictSmall) ? (match >= lowRefLimit) : 1)
 			&& (match + MAX_DISTANCE >= ip)
 			&& (LZ4_read32(match + refDelta) == LZ4_read32(ip))) {
@@ -395,18 +433,21 @@ static inline int LZ4_compress_generic(
 		if ((outputLimited) &&
 			/* Check output buffer overflow */
 			((op - (BYTE *)dest) + lastRun + 1 +
-			((lastRun + 255 - RUN_MASK)/255) > (U32)maxOutputSize))
+			((lastRun + 255 - RUN_MASK) / 255) > (U32)maxOutputSize))
 			return 0;
+
 		if (lastRun >= RUN_MASK) {
 			size_t accumulator = lastRun - RUN_MASK;
 			*op++ = RUN_MASK << ML_BITS;
-			for (; accumulator >= 255 ; accumulator -= 255)
+			for (; accumulator >= 255; accumulator -= 255)
 				*op++ = 255;
 			*op++ = (BYTE) accumulator;
 		} else {
-			*op++ = (BYTE)(lastRun<<ML_BITS);
+			*op++ = (BYTE)(lastRun << ML_BITS);
 		}
+
 		memcpy(op, anchor, lastRun);
+
 		op += lastRun;
 	}

@@ -414,23 +455,27 @@ static inline int LZ4_compress_generic(
 	return (int) (((char *)op) - dest);
 }

-static int LZ4_compress_fast_extState(void *state, const char *source, char *dest,
-	int inputSize, int maxOutputSize, int acceleration)
+static int LZ4_compress_fast_extState(
+	void *state,
+	const char *source,
+	char *dest,
+	int inputSize,
+	int maxOutputSize,
+	int acceleration)
 {
-	#if LZ4_ARCH64
-	tableType_t tableType = byU32;
-	#else
-	tableType_t tableType = byPtr;
-	#endif
-
 	LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
+#if LZ4_ARCH64
+	const tableType_t tableType = byU32;
+#else
+	const tableType_t tableType = byPtr;
+#endif

 	LZ4_resetStream((LZ4_stream_t *)state);

 	if (acceleration < 1)
 		acceleration = LZ4_ACCELERATION_DEFAULT;

-	if (maxOutputSize >= LZ4_compressBound(inputSize)) {
+	if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize)) {
 		if (inputSize < LZ4_64Klimit)
 			return LZ4_compress_generic(ctx, source,
 				dest, inputSize, 0,
@@ -474,7 +519,6 @@ EXPORT_SYMBOL(LZ4_compress_default);
 /*-******************************
  *	*_destSize() variant
  ********************************/
-
 static int LZ4_compress_destSize_generic(
 	LZ4_stream_t_internal * const ctx,
 	const char * const src,
@@ -529,14 +573,14 @@ static int LZ4_compress_destSize_generic(
 		{
 			const BYTE *forwardIp = ip;
 			unsigned int step = 1;
-			unsigned int searchMatchNb = 1 << LZ4_skipTrigger;
+			unsigned int searchMatchNb = 1 << LZ4_SKIPTRIGGER;

 			do {
 				U32 h = forwardH;

 				ip = forwardIp;
 				forwardIp += step;
-				step = (searchMatchNb++ >> LZ4_skipTrigger);
+				step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);

 				if (unlikely(forwardIp > mflimit))
 					goto _last_literals;
@@ -559,8 +603,9 @@ static int LZ4_compress_destSize_generic(
 		while ((ip > anchor)
 			&& (match > lowLimit)
 			&& (unlikely(ip[-1] == match[-1]))) {
-			ip--; match--;
-			}
+			ip--;
+			match--;
+		}

 		/* Encode Literal length */
 		{
@@ -644,11 +689,11 @@ static int LZ4_compress_destSize_generic(
 		size_t lastRunSize = (size_t)(iend - anchor);

 		if (op + 1 /* token */
-			+ ((lastRunSize + 240)/255) /* litLength */
+			+ ((lastRunSize + 240) / 255) /* litLength */
 			+ lastRunSize /* literals */ > oend) {
 			/* adapt lastRunSize to fill 'dst' */
 			lastRunSize	= (oend - op) - 1;
-			lastRunSize -= (lastRunSize + 240)/255;
+			lastRunSize -= (lastRunSize + 240) / 255;
 		}
 		ip = anchor + lastRunSize;

@@ -656,7 +701,7 @@ static int LZ4_compress_destSize_generic(
 			size_t accumulator = lastRunSize - RUN_MASK;

 			*op++ = RUN_MASK << ML_BITS;
-			for (; accumulator >= 255 ; accumulator -= 255)
+			for (; accumulator >= 255; accumulator -= 255)
 				*op++ = 255;
 			*op++ = (BYTE) accumulator;
 		} else {
@@ -675,14 +720,14 @@ static int LZ4_compress_destSize_extState(LZ4_stream_t *state, const char *src,
 	char *dst, int *srcSizePtr, int targetDstSize)
 {
 	#if LZ4_ARCH64
-	tableType_t tableType = byU32;
+		const tableType_t tableType = byU32;
 	#else
-	tableType_t tableType = byPtr;
+		const tableType_t tableType = byPtr;
 	#endif

 	LZ4_resetStream(state);

-	if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) {
+	if (targetDstSize >= LZ4_COMPRESSBOUND(*srcSizePtr)) {
 		/* compression success is guaranteed */
 		return LZ4_compress_fast_extState(
 			state, src, dst, *srcSizePtr,
@@ -847,7 +892,7 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
 			result = LZ4_compress_generic(
 				streamPtr, source, dest, inputSize,
 				maxOutputSize, limitedOutput, byU32,
-				withPrefix64k, dictSmall,	acceleration);
+				withPrefix64k, dictSmall, acceleration);
 		} else {
 			result = LZ4_compress_generic(
 				streamPtr, source, dest, inputSize,
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index a7731ba..3bfc2f6 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -49,8 +49,8 @@
  * Note that it is important this generic function is really inlined,
  * in order to remove useless branches during compilation optimization.
  */
-static inline int LZ4_decompress_generic(
-	 const char *const source,
+static FORCE_INLINE int LZ4_decompress_generic(
+	 const char * const source,
 	 char * const dest,
 	 int inputSize,
 		/*
@@ -180,22 +180,28 @@ static inline int LZ4_decompress_generic(
 					goto _output_error;
 				}
 			}
+
 			memcpy(op, ip, length);
 			ip += length;
 			op += length;
 			/* Necessarily EOF, due to parsing restrictions */
 			break;
 		}
+
 		LZ4_wildCopy(op, ip, cpy);
-		ip += length; op = cpy;
+		ip += length;
+		op = cpy;

 		/* get offset */
-		offset = LZ4_readLE16(ip); ip += 2;
+		offset = LZ4_readLE16(ip);
+		ip += 2;
 		match = op - offset;
+
 		if ((checkOffset) && (unlikely(match < lowLimit))) {
 			/* Error : offset outside buffers */
 			goto _output_error;
 		}
+
 		/* costs ~1%; silence an msan warning when offset == 0 */
 		LZ4_write32(op, (U32)offset);

@@ -205,11 +211,14 @@ static inline int LZ4_decompress_generic(
 			unsigned int s;

 			do {
-			s = *ip++;
-			if ((endOnInput) && (ip > iend - LASTLITERALS))
-				goto _output_error;
-			length += s;
+				s = *ip++;
+
+				if ((endOnInput) && (ip > iend - LASTLITERALS))
+					goto _output_error;
+
+				length += s;
 			} while (s == 255);
+
 			if ((safeDecode)
 				&& unlikely(
 					(size_t)(op + length) < (size_t)op)) {
@@ -217,6 +226,7 @@ static inline int LZ4_decompress_generic(
 				goto _output_error;
 			}
 		}
+
 		length += MINMATCH;

 		/* check external dictionary */
@@ -227,12 +237,13 @@ static inline int LZ4_decompress_generic(
 			}

 			if (length <= (size_t)(lowPrefix - match)) {
-			/*
-			 * match can be copied as a single segment
-			 * from external dictionary
-			 */
-			memmove(op, dictEnd - (lowPrefix - match), length);
-			op += length;
+				/*
+				 * match can be copied as a single segment
+				 * from external dictionary
+				 */
+				memmove(op, dictEnd - (lowPrefix - match),
+					length);
+				op += length;
 			} else {
 				/*
 				 * match encompass external
@@ -256,11 +267,13 @@ static inline int LZ4_decompress_generic(
 					op += restSize;
 				}
 			}
+
 			continue;
 		}

 		/* copy match within block */
 		cpy = op + length;
+
 		if (unlikely(offset < 8)) {
 			const int dec64 = dec64table[offset];

@@ -272,7 +285,8 @@ static inline int LZ4_decompress_generic(
 			memcpy(op + 4, match, 4);
 			match -= dec64;
 		} else {
-			LZ4_copy8(op, match); match += 8;
+			LZ4_copy8(op, match);
+			match += 8;
 		}

 		op += 8;
@@ -287,18 +301,22 @@ static inline int LZ4_decompress_generic(
 				 */
 				goto _output_error;
 			}
+
 			if (op < oCopyLimit) {
 				LZ4_wildCopy(op, match, oCopyLimit);
 				match += oCopyLimit - op;
 				op = oCopyLimit;
 			}
+
 			while (op < cpy)
 				*op++ = *match++;
 		} else {
 			LZ4_copy8(op, match);
+
 			if (length > 16)
 				LZ4_wildCopy(op + 8, match + 8, cpy);
 		}
+
 		op = cpy; /* correction */
 	}

@@ -438,7 +456,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  * These decoding functions work the same as "_continue" ones,
  * the dictionary must be explicitly provided within parameters
  */
-static inline int LZ4_decompress_usingDict_generic(const char *source,
+static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
 	char *dest, int compressedSize, int maxOutputSize, int safe,
 	const char *dictStart, int dictSize)
 {
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 23e1a1b..47ef42b 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -38,14 +38,7 @@
 #include <asm/unaligned.h>
 #include <linux/string.h>	 /* memset, memcpy */

-/*
- * Detects 64 bits mode
-*/
-#if defined(CONFIG_64BIT)
-#define LZ4_ARCH64 1
-#else
-#define LZ4_ARCH64 0
-#endif
+#define FORCE_INLINE __always_inline

 /*-************************************
  *	Basic Types
@@ -60,14 +53,38 @@ typedef uint64_t U64;
 typedef uintptr_t uptrval;

 /*-************************************
+ *	Architecture specifics
+ **************************************/
+#if defined(CONFIG_64BIT)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+#if defined(__LITTLE_ENDIAN)
+#define LZ4_LITTLE_ENDIAN 1
+#else
+#define LZ4_LITTLE_ENDIAN 0
+#endif
+
+/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system
+ * does not support hardware bit count
+ */
+/* #define LZ4_FORCE_SW_BITCOUNT */
+
+/*-************************************
  *	Constants
  **************************************/
 #define MINMATCH 4

 #define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
-#define MFLIMIT (WILDCOPYLENGTH+MINMATCH)
-static const int LZ4_minLength = (MFLIMIT+1);
+#define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+
+/* Increase this value ==> compression runs slower on incompressible data */
+#define LZ4_SKIPTRIGGER 6

 #define KB (1<<10)
 #define MB (1<<20)
@@ -82,53 +99,42 @@ static const int LZ4_minLength = (MFLIMIT+1);
 #define RUN_BITS (8-ML_BITS)
 #define RUN_MASK ((1U<<RUN_BITS)-1)

-static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT-1));
-static const U32 LZ4_skipTrigger = 6;
-
 /*-************************************
  *	Reading and writing into memory
  **************************************/
+typedef union {
+	U16 u16;
+	U32 u32;
+	size_t uArch;
+} __packed unalign;

-static inline U16 LZ4_read16(const void *memPtr)
+static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr)
 {
-	U16 val;
-
-	memcpy(&val, memPtr, sizeof(val));
-
-	return val;
+	return ((const unalign *)ptr)->u16;
 }

-static inline U32 LZ4_read32(const void *memPtr)
+static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr)
 {
-	U32 val;
-
-	memcpy(&val, memPtr, sizeof(val));
-
-	return val;
+	return ((const unalign *)ptr)->u32;
 }

-static inline size_t LZ4_read_ARCH(const void *memPtr)
+static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr)
 {
-	size_t val;
-
-	memcpy(&val, memPtr, sizeof(val));
-
-	return val;
+	return ((const unalign *)ptr)->uArch;
 }

-static inline void LZ4_write16(void *memPtr, U16 value)
+static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value)
 {
-	memcpy(memPtr, &value, sizeof(value));
+	((unalign *)memPtr)->u16 = value;
 }

-static inline void LZ4_write32(void *memPtr, U32 value)
-{
-	memcpy(memPtr, &value, sizeof(value));
+static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) {
+	((unalign *)memPtr)->u32 = value;
 }

-static inline U16 LZ4_readLE16(const void *memPtr)
+static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr)
 {
-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
 	return LZ4_read16(memPtr);
 #else
 	const BYTE *p = (const BYTE *)memPtr;
@@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr)
 #endif
 }

-static inline void LZ4_writeLE16(void *memPtr, U16 value)
+static FORCE_INLINE __maybe_unused void LZ4_writeLE16(void *memPtr, U16 value)
 {
-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
 	LZ4_write16(memPtr, value);
 #else
 	BYTE *p = (BYTE *)memPtr;

 	p[0] = (BYTE) value;
-	p[1] = (BYTE)(value>>8);
+	p[1] = (BYTE)(value >> 8);
 #endif
 }

-static inline void LZ4_copy8(void *dst, const void *src)
+static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
 {
 	memcpy(dst, src, 8);
 }
@@ -158,7 +164,8 @@ static inline void LZ4_copy8(void *dst, const void *src)
  * customized variant of memcpy,
  * which can overwrite up to 7 bytes beyond dstEnd
  */
-static inline void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd)
+static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+	const void *srcPtr, void *dstEnd)
 {
 	BYTE *d = (BYTE *)dstPtr;
 	const BYTE *s = (const BYTE *)srcPtr;
@@ -171,49 +178,121 @@ static inline void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd)
 	} while (d < e);
 }

-#if LZ4_ARCH64
-#ifdef __BIG_ENDIAN__
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
+{
+#if LZ4_LITTLE_ENDIAN
+#if LZ4_ARCH64 /* 64 Bits Little Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+	static const int DeBruijnBytePos[64] = {
+		0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7,
+		0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7,
+		7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6,
+		7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+	};
+
+	return DeBruijnBytePos[((U64)((val & -(long long)val)
+		* 0x0218A392CDABBD3FULL)) >> 58];
 #else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
-#endif
+	return (__builtin_ctzll((U64)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#else /* 32 Bits Little Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+	static const int DeBruijnBytePos[32] = {
+		0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1,
+		3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1
+	};
+
+	return DeBruijnBytePos[((U32)((val & -(S32)val)
+		* 0x077CB531U)) >> 27];
 #else
-#ifdef __BIG_ENDIAN__
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
+	return (__builtin_ctz((U32)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#endif /* LZ4_ARCH64 */
+#else /* Big Endian */
+#if LZ4_ARCH64 /* 64 Bits Big Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+	unsigned int r;
+
+	if (!(val >> 32)) {
+		r = 4;
+	} else {
+		r = 0;
+		val >>= 32;
+	}
+
+	if (!(val >> 16)) {
+		r += 2;
+		val >>= 8;
+	} else {
+		val >>= 24;
+	}
+
+	r += (!val);
+
+	return r;
 #else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
-#endif
-#endif
+	return (__builtin_clzll((U64)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#else /* 32 Bits Big Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+	unsigned int r;
+
+	if (!(val >> 16)) {
+		r = 2;
+		val >>= 8;
+	} else {
+		r = 0;
+		val >>= 24;
+	}
+
+	r += (!val);
+
+	return r;
+#else
+	return (__builtin_clz((U32)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#endif /* LZ4_ARCH64 */
+#endif /* LZ4_LITTLE_ENDIAN */
+}

-static inline unsigned int LZ4_count(const BYTE *pIn, const BYTE *pMatch,
+static FORCE_INLINE __maybe_unused unsigned int LZ4_count(
+	const BYTE *pIn,
+	const BYTE *pMatch,
 	const BYTE *pInLimit)
 {
 	const BYTE *const pStart = pIn;

-	while (likely(pIn < pInLimit-(STEPSIZE-1))) {
-		size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+	while (likely(pIn < pInLimit - (STEPSIZE - 1))) {
+		size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);

 		if (!diff) {
 			pIn += STEPSIZE;
 			pMatch += STEPSIZE;
 			continue;
 		}
-		pIn += LZ4_NBCOMMONBYTES(diff);
+
+		pIn += LZ4_NbCommonBytes(diff);
+
 		return (unsigned int)(pIn - pStart);
 	}

-#ifdef LZ4_ARCH64
-	if ((pIn < (pInLimit-3))
+#if LZ4_ARCH64
+	if ((pIn < (pInLimit - 3))
 		&& (LZ4_read32(pMatch) == LZ4_read32(pIn))) {
-		pIn += 4; pMatch += 4;
+		pIn += 4;
+		pMatch += 4;
 	}
 #endif
-	if ((pIn < (pInLimit-1))
+
+	if ((pIn < (pInLimit - 1))
 		&& (LZ4_read16(pMatch) == LZ4_read16(pIn))) {
-		pIn += 2; pMatch += 2;
+		pIn += 2;
+		pMatch += 2;
 	}
+
 	if ((pIn < pInLimit) && (*pMatch == *pIn))
 		pIn++;
+
 	return (unsigned int)(pIn - pStart);
 }

diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index 8363292..c7271a1 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -71,7 +71,7 @@ static void LZ4HC_init(LZ4HC_CCtx_internal *hc4, const BYTE *start)
 }

 /* Update chains up to ip (excluded) */
-static inline void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
+static FORCE_INLINE void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
 	const BYTE *ip)
 {
 	U16 * const chainTable = hc4->chainTable;
@@ -96,7 +96,7 @@ static inline void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
 	hc4->nextToUpdate = target;
 }

-static inline int LZ4HC_InsertAndFindBestMatch(
+static FORCE_INLINE int LZ4HC_InsertAndFindBestMatch(
 	LZ4HC_CCtx_internal *hc4, /* Index table will be updated */
 	const BYTE *ip,
 	const BYTE * const iLimit,
@@ -165,7 +165,7 @@ static inline int LZ4HC_InsertAndFindBestMatch(
 	return (int)ml;
 }

-static inline int LZ4HC_InsertAndGetWiderMatch(
+static FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch(
 	LZ4HC_CCtx_internal *hc4,
 	const BYTE * const ip,
 	const BYTE * const iLowLimit,
@@ -259,7 +259,7 @@ static inline int LZ4HC_InsertAndGetWiderMatch(
 	return longest;
 }

-static inline int LZ4HC_encodeSequence(
+static FORCE_INLINE int LZ4HC_encodeSequence(
 	const BYTE **ip,
 	BYTE **op,
 	const BYTE **anchor,

  reply	other threads:[~2017-02-12 11:17 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <1482259992-16680-1-git-send-email-4sschmid@informatik.uni-hamburg.de>
     [not found] ` <1482259992-16680-2-git-send-email-4sschmid@informatik.uni-hamburg.de>
2016-12-22 13:25   ` [PATCH 1/3] crypto: Change lz4 modules to work with new lz4 compressor module version Sergey Senozhatsky
2017-01-07 16:55 ` [PATCH v2 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 1/4] lib: Update LZ4 compressor module based on LZ4 v1.7.2 Sven Schmidt
2017-01-08 11:22     ` Greg KH
2017-01-10  9:32       ` Sven Schmidt
2017-01-10  9:59         ` Greg KH
2017-01-08 11:25     ` Greg KH
2017-01-08 11:33       ` Rui Salvaterra
2017-01-10  9:21       ` Sven Schmidt
2017-01-10 10:00         ` Greg KH
2017-01-10 10:50           ` Willy Tarreau
2017-01-07 16:55   ` [PATCH v2 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-08 11:23     ` Greg KH
2017-01-07 16:55   ` [PATCH v2 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-07 16:55   ` [PATCH v2 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to comply " Sven Schmidt
2017-01-07 21:33     ` Kees Cook
2017-01-10  9:45       ` Sven Schmidt
2017-01-21 15:09 ` [PATCH v3 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-21 15:09   ` [PATCH 1/4] lib: " Sven Schmidt
2017-01-21 15:56     ` kbuild test robot
2017-01-21 16:16     ` kbuild test robot
2017-01-21 17:38     ` kbuild test robot
2017-01-22 11:05     ` Greg KH
2017-01-21 15:09   ` [PATCH 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-21 15:09   ` [PATCH 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-21 15:09   ` [PATCH 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-22 19:35 ` [PATCH v4 0/4] Update LZ4 compressor module Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 1/4] lib: " Sven Schmidt
2017-01-24  0:23     ` Andrew Morton
2017-01-24 16:48       ` Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 2/4] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 3/4] crypto: Change LZ4 modules " Sven Schmidt
2017-01-22 19:35   ` [PATCH v4 4/4] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57 ` [PATCH v5 0/5] Update LZ4 compressor module Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 1/5] lib: " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-26  7:57   ` [PATCH v5 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-01-26  9:19   ` [PATCH v5 0/5] Update LZ4 compressor module Eric Biggers
2017-01-26 14:15     ` Sven Schmidt
2017-01-27 22:01 ` [PATCH v6 " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 1/5] lib: " Sven Schmidt
2017-01-31 22:27     ` Jonathan Corbet
2017-02-01 20:18       ` Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-01-27 22:02   ` [PATCH v6 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-05 19:09 ` [PATCH v7 0/5] Update LZ4 compressor module Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 1/5] lib: " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-05 19:09   ` [PATCH v7 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt
2017-02-08 23:31   ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-09  0:24     ` Eric Biggers
2017-02-09  5:24       ` Eric Biggers
2017-02-09 11:05         ` Sven Schmidt
2017-02-09 18:20           ` Eric Biggers
2017-02-10  0:14         ` Minchan Kim
2017-02-09 11:02       ` Sven Schmidt
2017-02-09 18:29         ` Eric Biggers
2017-02-10  3:57           ` David Miller
2017-02-09 10:56     ` Sven Schmidt
2017-02-10  0:13       ` Minchan Kim
2017-02-12 11:16         ` Sven Schmidt
2017-02-12 11:16           ` Sven Schmidt [this message]
2017-02-12 13:05             ` [PATCH] lz4: fix performance regressions Willy Tarreau
2017-02-12 15:20               ` Sven Schmidt
2017-02-12 21:41                 ` Willy Tarreau
2017-02-13 11:53                   ` Sven Schmidt
2017-02-13 13:37                     ` Willy Tarreau
2017-02-12 23:38             ` Eric Biggers
2017-02-14 10:33               ` Sven Schmidt
2017-02-13  0:03           ` [PATCH v7 0/5] Update LZ4 compressor module Minchan Kim
2017-02-13 12:08             ` Sven Schmidt
2017-02-15  7:29               ` Minchan Kim
2017-02-15 18:16 ` [PATCH v8 " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 1/5] lib: " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 3/5] crypto: Change LZ4 modules " Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version Sven Schmidt
2017-02-15 18:16   ` [PATCH v8 5/5] lib/lz4: Remove back-compat wrappers Sven Schmidt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1486898178-17125-2-git-send-email-4sschmid@informatik.uni-hamburg.de \
    --to=4sschmid@informatik.uni-hamburg.de \
    --cc=akpm@linux-foundation.org \
    --cc=anton@enomsg.org \
    --cc=bongkyu.kim@lge.com \
    --cc=ccross@android.com \
    --cc=davem@davemloft.net \
    --cc=ebiggers3@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=herbert@gondor.apana.org.au \
    --cc=keescook@chromium.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=minchan@kernel.org \
    --cc=rsalvaterra@gmail.com \
    --cc=sergey.senozhatsky@gmail.com \
    --cc=tony.luck@intel.com \
    --subject='Re: [PATCH] lz4: fix performance regressions' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.