All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
@ 2019-05-15  0:43 Chenxi Mao
  2019-05-15  2:21 ` Gao Xiang
  0 siblings, 1 reply; 16+ messages in thread
From: Chenxi Mao @ 2019-05-15  0:43 UTC (permalink / raw)
  To: akpm
  Cc: gaoxiang25, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, chenxi . mao

FAST_DEC_LOOP was introduced from LZ4 1.9.
This change would be introduce 10% on decompress operation
according to LZ4 benchmark result on X86 devices.
Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements,
however clang compiler has downgrade if FAST_DEC_LOOP enabled.

So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.

Here is the test result on ARM64(cortex-A53)

Benchmark via ZRAM:

Test case:
fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
--buffer_compress_percentage=75 \
--scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
--filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
--name=seq-read --rw=read --stonewall --name=seq-readwrite \
--rw=rw --stonewall --name=rand-readwrite --rw=randrw --stonewall

Patched:
   READ: bw=7077MiB/s (7421MB/s)
Vanilla:
   READ: bw=5134MiB/s (5384MB/s)

Reference:
1. https://github.com/lz4/lz4/pull/645
2. https://github.com/lz4/lz4/pull/707

Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
---
 lib/lz4/lz4_decompress.c | 425 +++++++++++++++++++++++++++++++++------
 1 file changed, 361 insertions(+), 64 deletions(-)

diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 0c9d3ad17e0f..a4c87e32b3c0 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -50,6 +50,131 @@
 #define assert(condition) ((void)0)
 #endif
 
+#ifndef LZ4_FAST_DEC_LOOP
+#if defined(__i386__) || defined(__x86_64__)
+#define LZ4_FAST_DEC_LOOP 1
+#elif defined(__aarch64__) && !defined(__clang__)
+     /* On aarch64, we disable this optimization for clang because on certain
+      * mobile chipsets and clang, it reduces performance. For more information
+      * refer to https://github.com/lz4/lz4/pull/707. */
+#define LZ4_FAST_DEC_LOOP 1
+#else
+#define LZ4_FAST_DEC_LOOP 0
+#endif
+#endif
+
+static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
+static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
+
+#if LZ4_FAST_DEC_LOOP
+#define FASTLOOP_SAFE_DISTANCE 64
+FORCE_INLINE void
+LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
+			     const size_t offset)
+{
+	if (offset < 8) {
+		dstPtr[0] = srcPtr[0];
+
+		dstPtr[1] = srcPtr[1];
+		dstPtr[2] = srcPtr[2];
+		dstPtr[3] = srcPtr[3];
+		srcPtr += inc32table[offset];
+		memcpy(dstPtr + 4, srcPtr, 4);
+		srcPtr -= dec64table[offset];
+		dstPtr += 8;
+	} else {
+		memcpy(dstPtr, srcPtr, 8);
+		dstPtr += 8;
+		srcPtr += 8;
+	}
+
+	LZ4_wildCopy(dstPtr, srcPtr, dstEnd);
+}
+
+/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
+ * this version copies two times 16 bytes (instead of one time 32 bytes)
+ * because it must be compatible with offsets >= 16. */
+FORCE_INLINE void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
+{
+	BYTE *d = (BYTE *) dstPtr;
+	const BYTE *s = (const BYTE *)srcPtr;
+	BYTE *const e = (BYTE *) dstEnd;
+
+	do {
+		memcpy(d, s, 16);
+		memcpy(d + 16, s + 16, 16);
+		d += 32;
+		s += 32;
+	} while (d < e);
+}
+
+FORCE_INLINE void
+LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
+			const size_t offset)
+{
+	BYTE v[8];
+	switch (offset) {
+
+	case 1:
+		memset(v, *srcPtr, 8);
+		goto copy_loop;
+	case 2:
+		memcpy(v, srcPtr, 2);
+		memcpy(&v[2], srcPtr, 2);
+		memcpy(&v[4], &v[0], 4);
+		goto copy_loop;
+	case 4:
+		memcpy(v, srcPtr, 4);
+		memcpy(&v[4], srcPtr, 4);
+		goto copy_loop;
+	default:
+		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+		return;
+	}
+
+      copy_loop:
+	memcpy(dstPtr, v, 8);
+	dstPtr += 8;
+	while (dstPtr < dstEnd) {
+		memcpy(dstPtr, v, 8);
+		dstPtr += 8;
+	}
+}
+
+/* Read the variable-length literal or match length.
+ *
+ * ip - pointer to use as input.
+ * lencheck - end ip.  Return an error if ip advances >= lencheck.
+ * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
+ * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
+ * error (output) - error code.  Should be set to 0 before call.
+ */
+typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
+FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
+					   const BYTE *lencheck,
+					   int loop_check, int initial_check,
+					   variable_length_error *error)
+{
+	unsigned length = 0;
+	unsigned s;
+	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+		*error = initial_error;
+		return length;
+	}
+	do {
+		s = **ip;
+		(*ip)++;
+		length += s;
+		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+			*error = loop_error;
+			return length;
+		}
+	} while (s == 255);
+
+	return length;
+}
+#endif
+
 /*
  * LZ4_decompress_generic() :
  * This generic decompression function covers all use cases.
@@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	 const size_t dictSize
 	 )
 {
-	const BYTE *ip = (const BYTE *) src;
-	const BYTE * const iend = ip + srcSize;
+	const BYTE *ip = (const BYTE *)src;
+	const BYTE *const iend = ip + srcSize;
 
 	BYTE *op = (BYTE *) dst;
-	BYTE * const oend = op + outputSize;
+	BYTE *const oend = op + outputSize;
 	BYTE *cpy;
 
-	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
-	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
-	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
+	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
 
 	const int safeDecode = (endOnInput == endOnInputSize);
 	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
 
 	/* Set up the "end" pointers for the shortcut. */
 	const BYTE *const shortiend = iend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
 	const BYTE *const shortoend = oend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+
+	const BYTE *match;
+	size_t offset;
+	unsigned int token;
+	size_t length;
 
 	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
 		 srcSize, outputSize);
@@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	if ((endOnInput) && unlikely(srcSize == 0))
 		return -1;
 
-	/* Main Loop : decode sequences */
+#if LZ4_FAST_DEC_LOOP
+	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
+		DEBUGLOG(6, "skip fast decode loop");
+		goto safe_decode;
+	}
+
+	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
 	while (1) {
-		size_t length;
-		const BYTE *match;
-		size_t offset;
+		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
+		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
+		if (endOnInput) {
+			assert(ip < iend);
+		}
+		token = *ip++;
+		length = token >> ML_BITS;	/* literal length */
+
+		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
+
+		/* decode literal length */
+		if (length == RUN_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length <
+					(uptrval) (op))) {
+				goto _output_error;
+			}	/* overflow detection */
+			if ((safeDecode)
+			    && unlikely((uptrval) (ip) + length <
+					(uptrval) (ip))) {
+				goto _output_error;
+			}
+
+			/* overflow detection */
+			/* copy literals */
+			cpy = op + length;
+			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				if ((cpy > oend - 32)
+				    || (ip + length > iend - 32)) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy32(op, ip, cpy);
+			} else {	/* LZ4_decompress_fast() */
+				if (cpy > oend - 8) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy(op, ip, cpy);
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and only relies on end-of-block */
+				/* properties */
+			}
+			ip += length;
+			op = cpy;
+		} else {
+			cpy = op + length;
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				DEBUGLOG(7,
+					 "copy %u bytes in a 16-bytes stripe",
+					 (unsigned)length);
+				/* We don't need to check oend */
+				/* since we check it once for each loop below */
+				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
+					goto safe_literal_copy;
+				}
+				/* Literals can only be 14, but hope compilers optimize */
+				/*if we copy by a register size */
+				memcpy(op, ip, 16);
+			} else {
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and relies on end-of-block */
+				/* properties */
+				memcpy(op, ip, 8);
+				if (length > 8) {
+					memcpy(op + 8, ip + 8, 8);
+				}
+			}
+			ip += length;
+			op = cpy;
+		}
+
+		/* get offset */
+		offset = LZ4_readLE16(ip);
+		ip += 2;	/* end-of-block condition violated */
+		match = op - offset;
+
+		/* get matchlength */
+		length = token & ML_MASK;
 
-		/* get literal length */
-		unsigned int const token = *ip++;
-		length = token>>ML_BITS;
+		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
+			goto _output_error;
+		}
+		/* Error : offset outside buffers */
+		if (length == ML_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
+				goto _output_error;
+			}	/* overflow detection */
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+		} else {
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+
+			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
+			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
+				if (offset >= 8) {
+					memcpy(op, match, 8);
+					memcpy(op + 8, match + 8, 8);
+					memcpy(op + 16, match + 16, 2);
+					op += length;
+					continue;
+				}
+			}
+		}
+
+		/* match starting within external dictionary */
+		if ((dict == usingExtDict) && (match < lowPrefix)) {
+			if (unlikely(op + length > oend - LASTLITERALS)) {
+				if (partialDecoding) {
+					/* reach end of buffer */
+					length =
+					    min(length, (size_t) (oend - op));
+				} else {
+					/* end-of-block condition violated */
+					goto _output_error;
+				}
+			}
+
+			if (length <= (size_t) (lowPrefix - match)) {
+				/* match fits entirely within external dictionary : just copy */
+				memmove(op, dictEnd - (lowPrefix - match), length);
+				op += length;
+			} else {
+				/* match stretches into both external dict and current block */
+				size_t const copySize =
+				    (size_t) (lowPrefix - match);
+				size_t const restSize = length - copySize;
+				memcpy(op, dictEnd - copySize, copySize);
+				op += copySize;
+				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
+					BYTE *const endOfMatch = op + restSize;
+					const BYTE *copyFrom = lowPrefix;
+					while (op < endOfMatch) {
+						*op++ = *copyFrom++;
+					}
+				} else {
+					memcpy(op, lowPrefix, restSize);
+					op += restSize;
+				}
+			}
+			continue;
+		}
+
+		/* copy match within block */
+		cpy = op + length;
+
+		assert((op <= oend) && (oend - op >= 32));
+		if (unlikely(offset < 16)) {
+			LZ4_memcpy_using_offset(op, match, cpy, offset);
+		} else {
+			LZ4_wildCopy32(op, match, cpy);
+		}
+
+		op = cpy;	/* wildcopy correction */
+	}
+      safe_decode:
+#endif
+	/* Main Loop : decode sequences */
+	while (1) {
+		length = token >> ML_BITS;
 
 		/* ip < iend before the increment */
 		assert(!endOnInput || ip <= iend);
@@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		 * combined check for both stages).
 		 */
 		if ((endOnInput ? length != RUN_MASK : length <= 8)
-		   /*
-		    * strictly "less than" on input, to re-enter
-		    * the loop with at least one byte
-		    */
-		   && likely((endOnInput ? ip < shortiend : 1) &
-			     (op <= shortoend))) {
+		    /*
+		     * strictly "less than" on input, to re-enter
+		     * the loop with at least one byte
+		     */
+		    && likely((endOnInput ? ip < shortiend : 1) &
+			      (op <= shortoend))) {
 			/* Copy the literals */
 			memcpy(op, ip, endOnInput ? 16 : 8);
-			op += length; ip += length;
+			op += length;
+			ip += length;
 
 			/*
 			 * The second stage:
 			 * prepare for match copying, decode full info.
 			 * If it doesn't work out, the info won't be wasted.
 			 */
-			length = token & ML_MASK; /* match length */
+			length = token & ML_MASK;	/* match length */
 			offset = LZ4_readLE16(ip);
 			ip += 2;
 			match = op - offset;
-			assert(match <= op); /* check overflow */
+			assert(match <= op);	/* check overflow */
 
 			/* Do not deal with overlapping matches. */
 			if ((length != ML_MASK) &&
@@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* decode literal length */
 		if (length == RUN_MASK) {
-			unsigned int s;
 
-			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
-				/* overflow detection */
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error)
 				goto _output_error;
-			}
-			do {
-				s = *ip++;
-				length += s;
-			} while (likely(endOnInput
-				? ip < iend - RUN_MASK
-				: 1) & (s == 255));
 
 			if ((safeDecode)
-			    && unlikely((uptrval)(op) +
-					length < (uptrval)(op))) {
+			    && unlikely((uptrval) (op) +
+					length < (uptrval) (op))) {
 				/* overflow detection */
 				goto _output_error;
 			}
 			if ((safeDecode)
-			    && unlikely((uptrval)(ip) +
-					length < (uptrval)(ip))) {
+			    && unlikely((uptrval) (ip) +
+					length < (uptrval) (ip))) {
 				/* overflow detection */
 				goto _output_error;
 			}
@@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* copy literals */
 		cpy = op + length;
+#if LZ4_FAST_DEC_LOOP
+	      safe_literal_copy:
+#endif
 		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
 
 		if (((endOnInput) && ((cpy > oend - MFLIMIT)
-			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
-			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
+				      || (ip + length >
+					  iend - (2 + 1 + LASTLITERALS))))
+		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
 			if (partialDecoding) {
 				if (cpy > oend) {
 					/*
@@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					length = oend - op;
 				}
 				if ((endOnInput)
-					&& (ip + length > iend)) {
+				    && (ip + length > iend)) {
 					/*
 					 * Error :
 					 * read attempt beyond
@@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				}
 			} else {
 				if ((!endOnInput)
-					&& (cpy != oend)) {
+				    && (cpy != oend)) {
 					/*
 					 * Error :
 					 * block decoding must
@@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					goto _output_error;
 				}
 				if ((endOnInput)
-					&& ((ip + length != iend)
+				    && ((ip + length != iend)
 					|| (cpy > oend))) {
 					/*
 					 * Error :
@@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			goto _output_error;
 		}
 
-		/* costs ~1%; silence an msan warning when offset == 0 */
-		/*
-		 * note : when partialDecoding, there is no guarantee that
-		 * at least 4 bytes remain available in output buffer
-		 */
-		if (!partialDecoding) {
-			assert(oend > op);
-			assert(oend - op >= 4);
-
-			LZ4_write32(op, (U32)offset);
-		}
-
 		if (length == ML_MASK) {
-			unsigned int s;
-
-			do {
-				s = *ip++;
-
-				if ((endOnInput) && (ip > iend - LASTLITERALS))
-					goto _output_error;
 
-				length += s;
-			} while (s == 255);
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok)
+				goto _output_error;
 
 			if ((safeDecode)
 				&& unlikely(
@@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		length += MINMATCH;
 
+#if LZ4_FAST_DEC_LOOP
+safe_match_copy:
+#endif
+
 		/* match starting within external dictionary */
 		if ((dict == usingExtDict) && (match < lowPrefix)) {
 			if (unlikely(op + length > oend - LASTLITERALS)) {
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-15  0:43 [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64 Chenxi Mao
@ 2019-05-15  2:21 ` Gao Xiang
  2019-05-15  3:27   ` Mao, Chenxi
  0 siblings, 1 reply; 16+ messages in thread
From: Gao Xiang @ 2019-05-15  2:21 UTC (permalink / raw)
  To: Chenxi Mao
  Cc: akpm, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, Miao Xie

Hi Chenxi,

On 2019/5/15 8:43, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced from LZ4 1.9.
> This change would be introduce 10% on decompress operation
> according to LZ4 benchmark result on X86 devices.
> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements,
> however clang compiler has downgrade if FAST_DEC_LOOP enabled.

I noticed this optimization and lz4-1.9.0 [1] was just released month ago
with this big change therefore I'm a little afraid of its current
stability (especially some rare used decompression apis...)

Could you Cc more people (e.g. lz4 original author Yann Collet)
in order to get some more ideas about this if possible?

Anyway, I like this optimization as well since FAST_DEC_LOOP improves
decompression speed a lot :)

[1] https://github.com/lz4/lz4/releases/tag/v1.9.0

Thanks,
Gao Xiang

> 
> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
> 
> Here is the test result on ARM64(cortex-A53)
> 
> Benchmark via ZRAM:
> 
> Test case:
> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
> --buffer_compress_percentage=75 \
> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
> --name=seq-read --rw=read --stonewall --name=seq-readwrite \
> --rw=rw --stonewall --name=rand-readwrite --rw=randrw --stonewall
> 
> Patched:
>    READ: bw=7077MiB/s (7421MB/s)
> Vanilla:
>    READ: bw=5134MiB/s (5384MB/s)
> 
> Reference:
> 1. https://github.com/lz4/lz4/pull/645
> 2. https://github.com/lz4/lz4/pull/707
> 
> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
> ---
>  lib/lz4/lz4_decompress.c | 425 +++++++++++++++++++++++++++++++++------
>  1 file changed, 361 insertions(+), 64 deletions(-)
> 
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 0c9d3ad17e0f..a4c87e32b3c0 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,131 @@
>  #define assert(condition) ((void)0)
>  #endif
>  
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__)
> +#define LZ4_FAST_DEC_LOOP 1
> +#elif defined(__aarch64__) && !defined(__clang__)
> +     /* On aarch64, we disable this optimization for clang because on certain
> +      * mobile chipsets and clang, it reduces performance. For more information
> +      * refer to https://github.com/lz4/lz4/pull/707. */
> +#define LZ4_FAST_DEC_LOOP 1
> +#else
> +#define LZ4_FAST_DEC_LOOP 0
> +#endif
> +#endif
> +
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_INLINE void
> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
> +			     const size_t offset)
> +{
> +	if (offset < 8) {
> +		dstPtr[0] = srcPtr[0];
> +
> +		dstPtr[1] = srcPtr[1];
> +		dstPtr[2] = srcPtr[2];
> +		dstPtr[3] = srcPtr[3];
> +		srcPtr += inc32table[offset];
> +		memcpy(dstPtr + 4, srcPtr, 4);
> +		srcPtr -= dec64table[offset];
> +		dstPtr += 8;
> +	} else {
> +		memcpy(dstPtr, srcPtr, 8);
> +		dstPtr += 8;
> +		srcPtr += 8;
> +	}
> +
> +	LZ4_wildCopy(dstPtr, srcPtr, dstEnd);
> +}
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 bytes)
> + * because it must be compatible with offsets >= 16. */
> +FORCE_INLINE void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +	BYTE *d = (BYTE *) dstPtr;
> +	const BYTE *s = (const BYTE *)srcPtr;
> +	BYTE *const e = (BYTE *) dstEnd;
> +
> +	do {
> +		memcpy(d, s, 16);
> +		memcpy(d + 16, s + 16, 16);
> +		d += 32;
> +		s += 32;
> +	} while (d < e);
> +}
> +
> +FORCE_INLINE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			const size_t offset)
> +{
> +	BYTE v[8];
> +	switch (offset) {
> +
> +	case 1:
> +		memset(v, *srcPtr, 8);
> +		goto copy_loop;
> +	case 2:
> +		memcpy(v, srcPtr, 2);
> +		memcpy(&v[2], srcPtr, 2);
> +		memcpy(&v[4], &v[0], 4);
> +		goto copy_loop;
> +	case 4:
> +		memcpy(v, srcPtr, 4);
> +		memcpy(&v[4], srcPtr, 4);
> +		goto copy_loop;
> +	default:
> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +		return;
> +	}
> +
> +      copy_loop:
> +	memcpy(dstPtr, v, 8);
> +	dstPtr += 8;
> +	while (dstPtr < dstEnd) {
> +		memcpy(dstPtr, v, 8);
> +		dstPtr += 8;
> +	}
> +}
> +
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
> + * error (output) - error code.  Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
> +FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +					   const BYTE *lencheck,
> +					   int loop_check, int initial_check,
> +					   variable_length_error *error)
> +{
> +	unsigned length = 0;
> +	unsigned s;
> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +		*error = initial_error;
> +		return length;
> +	}
> +	do {
> +		s = **ip;
> +		(*ip)++;
> +		length += s;
> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +			*error = loop_error;
> +			return length;
> +		}
> +	} while (s == 255);
> +
> +	return length;
> +}
> +#endif
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	 const size_t dictSize
>  	 )
>  {
> -	const BYTE *ip = (const BYTE *) src;
> -	const BYTE * const iend = ip + srcSize;
> +	const BYTE *ip = (const BYTE *)src;
> +	const BYTE *const iend = ip + srcSize;
>  
>  	BYTE *op = (BYTE *) dst;
> -	BYTE * const oend = op + outputSize;
> +	BYTE *const oend = op + outputSize;
>  	BYTE *cpy;
>  
> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>  
>  	const int safeDecode = (endOnInput == endOnInputSize);
>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>  
>  	/* Set up the "end" pointers for the shortcut. */
>  	const BYTE *const shortiend = iend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>  	const BYTE *const shortoend = oend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +	const BYTE *match;
> +	size_t offset;
> +	unsigned int token;
> +	size_t length;
>  
>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>  		 srcSize, outputSize);
> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	if ((endOnInput) && unlikely(srcSize == 0))
>  		return -1;
>  
> -	/* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +		DEBUGLOG(6, "skip fast decode loop");
> +		goto safe_decode;
> +	}
> +
> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>  	while (1) {
> -		size_t length;
> -		const BYTE *match;
> -		size_t offset;
> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +		if (endOnInput) {
> +			assert(ip < iend);
> +		}
> +		token = *ip++;
> +		length = token >> ML_BITS;	/* literal length */
> +
> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
> +
> +		/* decode literal length */
> +		if (length == RUN_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length <
> +					(uptrval) (op))) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (ip) + length <
> +					(uptrval) (ip))) {
> +				goto _output_error;
> +			}
> +
> +			/* overflow detection */
> +			/* copy literals */
> +			cpy = op + length;
> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				if ((cpy > oend - 32)
> +				    || (ip + length > iend - 32)) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy32(op, ip, cpy);
> +			} else {	/* LZ4_decompress_fast() */
> +				if (cpy > oend - 8) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy(op, ip, cpy);
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and only relies on end-of-block */
> +				/* properties */
> +			}
> +			ip += length;
> +			op = cpy;
> +		} else {
> +			cpy = op + length;
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				DEBUGLOG(7,
> +					 "copy %u bytes in a 16-bytes stripe",
> +					 (unsigned)length);
> +				/* We don't need to check oend */
> +				/* since we check it once for each loop below */
> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
> +					goto safe_literal_copy;
> +				}
> +				/* Literals can only be 14, but hope compilers optimize */
> +				/*if we copy by a register size */
> +				memcpy(op, ip, 16);
> +			} else {
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and relies on end-of-block */
> +				/* properties */
> +				memcpy(op, ip, 8);
> +				if (length > 8) {
> +					memcpy(op + 8, ip + 8, 8);
> +				}
> +			}
> +			ip += length;
> +			op = cpy;
> +		}
> +
> +		/* get offset */
> +		offset = LZ4_readLE16(ip);
> +		ip += 2;	/* end-of-block condition violated */
> +		match = op - offset;
> +
> +		/* get matchlength */
> +		length = token & ML_MASK;
>  
> -		/* get literal length */
> -		unsigned int const token = *ip++;
> -		length = token>>ML_BITS;
> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +			goto _output_error;
> +		}
> +		/* Error : offset outside buffers */
> +		if (length == ML_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +		} else {
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +
> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +				if (offset >= 8) {
> +					memcpy(op, match, 8);
> +					memcpy(op + 8, match + 8, 8);
> +					memcpy(op + 16, match + 16, 2);
> +					op += length;
> +					continue;
> +				}
> +			}
> +		}
> +
> +		/* match starting within external dictionary */
> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
> +			if (unlikely(op + length > oend - LASTLITERALS)) {
> +				if (partialDecoding) {
> +					/* reach end of buffer */
> +					length =
> +					    min(length, (size_t) (oend - op));
> +				} else {
> +					/* end-of-block condition violated */
> +					goto _output_error;
> +				}
> +			}
> +
> +			if (length <= (size_t) (lowPrefix - match)) {
> +				/* match fits entirely within external dictionary : just copy */
> +				memmove(op, dictEnd - (lowPrefix - match), length);
> +				op += length;
> +			} else {
> +				/* match stretches into both external dict and current block */
> +				size_t const copySize =
> +				    (size_t) (lowPrefix - match);
> +				size_t const restSize = length - copySize;
> +				memcpy(op, dictEnd - copySize, copySize);
> +				op += copySize;
> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
> +					BYTE *const endOfMatch = op + restSize;
> +					const BYTE *copyFrom = lowPrefix;
> +					while (op < endOfMatch) {
> +						*op++ = *copyFrom++;
> +					}
> +				} else {
> +					memcpy(op, lowPrefix, restSize);
> +					op += restSize;
> +				}
> +			}
> +			continue;
> +		}
> +
> +		/* copy match within block */
> +		cpy = op + length;
> +
> +		assert((op <= oend) && (oend - op >= 32));
> +		if (unlikely(offset < 16)) {
> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
> +		} else {
> +			LZ4_wildCopy32(op, match, cpy);
> +		}
> +
> +		op = cpy;	/* wildcopy correction */
> +	}
> +      safe_decode:
> +#endif
> +	/* Main Loop : decode sequences */
> +	while (1) {
> +		length = token >> ML_BITS;
>  
>  		/* ip < iend before the increment */
>  		assert(!endOnInput || ip <= iend);
> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		 * combined check for both stages).
>  		 */
>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
> -		   /*
> -		    * strictly "less than" on input, to re-enter
> -		    * the loop with at least one byte
> -		    */
> -		   && likely((endOnInput ? ip < shortiend : 1) &
> -			     (op <= shortoend))) {
> +		    /*
> +		     * strictly "less than" on input, to re-enter
> +		     * the loop with at least one byte
> +		     */
> +		    && likely((endOnInput ? ip < shortiend : 1) &
> +			      (op <= shortoend))) {
>  			/* Copy the literals */
>  			memcpy(op, ip, endOnInput ? 16 : 8);
> -			op += length; ip += length;
> +			op += length;
> +			ip += length;
>  
>  			/*
>  			 * The second stage:
>  			 * prepare for match copying, decode full info.
>  			 * If it doesn't work out, the info won't be wasted.
>  			 */
> -			length = token & ML_MASK; /* match length */
> +			length = token & ML_MASK;	/* match length */
>  			offset = LZ4_readLE16(ip);
>  			ip += 2;
>  			match = op - offset;
> -			assert(match <= op); /* check overflow */
> +			assert(match <= op);	/* check overflow */
>  
>  			/* Do not deal with overlapping matches. */
>  			if ((length != ML_MASK) &&
> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* decode literal length */
>  		if (length == RUN_MASK) {
> -			unsigned int s;
>  
> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -				/* overflow detection */
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error)
>  				goto _output_error;
> -			}
> -			do {
> -				s = *ip++;
> -				length += s;
> -			} while (likely(endOnInput
> -				? ip < iend - RUN_MASK
> -				: 1) & (s == 255));
>  
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(op) +
> -					length < (uptrval)(op))) {
> +			    && unlikely((uptrval) (op) +
> +					length < (uptrval) (op))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(ip) +
> -					length < (uptrval)(ip))) {
> +			    && unlikely((uptrval) (ip) +
> +					length < (uptrval) (ip))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* copy literals */
>  		cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +	      safe_literal_copy:
> +#endif
>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>  
>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +				      || (ip + length >
> +					  iend - (2 + 1 + LASTLITERALS))))
> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					length = oend - op;
>  				}
>  				if ((endOnInput)
> -					&& (ip + length > iend)) {
> +				    && (ip + length > iend)) {
>  					/*
>  					 * Error :
>  					 * read attempt beyond
> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				}
>  			} else {
>  				if ((!endOnInput)
> -					&& (cpy != oend)) {
> +				    && (cpy != oend)) {
>  					/*
>  					 * Error :
>  					 * block decoding must
> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					goto _output_error;
>  				}
>  				if ((endOnInput)
> -					&& ((ip + length != iend)
> +				    && ((ip + length != iend)
>  					|| (cpy > oend))) {
>  					/*
>  					 * Error :
> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			goto _output_error;
>  		}
>  
> -		/* costs ~1%; silence an msan warning when offset == 0 */
> -		/*
> -		 * note : when partialDecoding, there is no guarantee that
> -		 * at least 4 bytes remain available in output buffer
> -		 */
> -		if (!partialDecoding) {
> -			assert(oend > op);
> -			assert(oend - op >= 4);
> -
> -			LZ4_write32(op, (U32)offset);
> -		}
> -
>  		if (length == ML_MASK) {
> -			unsigned int s;
> -
> -			do {
> -				s = *ip++;
> -
> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
> -					goto _output_error;
>  
> -				length += s;
> -			} while (s == 255);
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok)
> +				goto _output_error;
>  
>  			if ((safeDecode)
>  				&& unlikely(
> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		length += MINMATCH;
>  
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>  		/* match starting within external dictionary */
>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>  			if (unlikely(op + length > oend - LASTLITERALS)) {
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-15  2:21 ` Gao Xiang
@ 2019-05-15  3:27   ` Mao, Chenxi
       [not found]     ` <CADv+DGkGa3q4FJ0TeLcj626PrUEYCDMfyKdd0yx82VmAX2i98Q@mail.gmail.com>
  0 siblings, 1 reply; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-15  3:27 UTC (permalink / raw)
  To: Gao Xiang
  Cc: akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, yann.collet.73

Hi Xiang:

Thanks for your reply, I will have a stress test on my device later.
I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
I guess the clang performance downgrade is caused by some compiler optimization options.
I will double check it later if I got the device which can build kernel with clang.

BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy, 
however based on my test, original LZ4_wildCopy has the better performance. 
Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
That might be a problem for X86 devices for current patch. 

Chenxi

-----Original Message-----
From: Gao Xiang [mailto:gaoxiang25@huawei.com] 
Sent: Wednesday, May 15, 2019 10:21 AM
To: Mao, Chenxi <Chenxi.Mao@sony.com>
Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64

Hi Chenxi,

On 2019/5/15 8:43, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced from LZ4 1.9.
> This change would be introduce 10% on decompress operation according 
> to LZ4 benchmark result on X86 devices.
> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however 
> clang compiler has downgrade if FAST_DEC_LOOP enabled.

I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)

Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?

Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)

[1] https://github.com/lz4/lz4/releases/tag/v1.9.0

Thanks,
Gao Xiang

> 
> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
> 
> Here is the test result on ARM64(cortex-A53)
> 
> Benchmark via ZRAM:
> 
> Test case:
> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
> --buffer_compress_percentage=75 \
> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \ 
> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw 
> --stonewall --name=rand-readwrite --rw=randrw --stonewall
> 
> Patched:
>    READ: bw=7077MiB/s (7421MB/s)
> Vanilla:
>    READ: bw=5134MiB/s (5384MB/s)
> 
> Reference:
> 1. https://github.com/lz4/lz4/pull/645
> 2. https://github.com/lz4/lz4/pull/707
> 
> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
> ---
>  lib/lz4/lz4_decompress.c | 425 
> +++++++++++++++++++++++++++++++++------
>  1 file changed, 361 insertions(+), 64 deletions(-)
> 
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 
> 0c9d3ad17e0f..a4c87e32b3c0 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,131 @@
>  #define assert(condition) ((void)0)
>  #endif
>  
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__) #define 
> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
> +     /* On aarch64, we disable this optimization for clang because on certain
> +      * mobile chipsets and clang, it reduces performance. For more information
> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define 
> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
> +
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 }; 
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_INLINE void
> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
> +			     const size_t offset)
> +{
> +	if (offset < 8) {
> +		dstPtr[0] = srcPtr[0];
> +
> +		dstPtr[1] = srcPtr[1];
> +		dstPtr[2] = srcPtr[2];
> +		dstPtr[3] = srcPtr[3];
> +		srcPtr += inc32table[offset];
> +		memcpy(dstPtr + 4, srcPtr, 4);
> +		srcPtr -= dec64table[offset];
> +		dstPtr += 8;
> +	} else {
> +		memcpy(dstPtr, srcPtr, 8);
> +		dstPtr += 8;
> +		srcPtr += 8;
> +	}
> +
> +	LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes 
> +beyond dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 
> +bytes)
> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE 
> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
> +	BYTE *d = (BYTE *) dstPtr;
> +	const BYTE *s = (const BYTE *)srcPtr;
> +	BYTE *const e = (BYTE *) dstEnd;
> +
> +	do {
> +		memcpy(d, s, 16);
> +		memcpy(d + 16, s + 16, 16);
> +		d += 32;
> +		s += 32;
> +	} while (d < e);
> +}
> +
> +FORCE_INLINE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			const size_t offset)
> +{
> +	BYTE v[8];
> +	switch (offset) {
> +
> +	case 1:
> +		memset(v, *srcPtr, 8);
> +		goto copy_loop;
> +	case 2:
> +		memcpy(v, srcPtr, 2);
> +		memcpy(&v[2], srcPtr, 2);
> +		memcpy(&v[4], &v[0], 4);
> +		goto copy_loop;
> +	case 4:
> +		memcpy(v, srcPtr, 4);
> +		memcpy(&v[4], srcPtr, 4);
> +		goto copy_loop;
> +	default:
> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +		return;
> +	}
> +
> +      copy_loop:
> +	memcpy(dstPtr, v, 8);
> +	dstPtr += 8;
> +	while (dstPtr < dstEnd) {
> +		memcpy(dstPtr, v, 8);
> +		dstPtr += 8;
> +	}
> +}
> +
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
> + * error (output) - error code.  Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} 
> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +					   const BYTE *lencheck,
> +					   int loop_check, int initial_check,
> +					   variable_length_error *error) {
> +	unsigned length = 0;
> +	unsigned s;
> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +		*error = initial_error;
> +		return length;
> +	}
> +	do {
> +		s = **ip;
> +		(*ip)++;
> +		length += s;
> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +			*error = loop_error;
> +			return length;
> +		}
> +	} while (s == 255);
> +
> +	return length;
> +}
> +#endif
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	 const size_t dictSize
>  	 )
>  {
> -	const BYTE *ip = (const BYTE *) src;
> -	const BYTE * const iend = ip + srcSize;
> +	const BYTE *ip = (const BYTE *)src;
> +	const BYTE *const iend = ip + srcSize;
>  
>  	BYTE *op = (BYTE *) dst;
> -	BYTE * const oend = op + outputSize;
> +	BYTE *const oend = op + outputSize;
>  	BYTE *cpy;
>  
> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>  
>  	const int safeDecode = (endOnInput == endOnInputSize);
>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * 
> KB)));
>  
>  	/* Set up the "end" pointers for the shortcut. */
>  	const BYTE *const shortiend = iend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>  	const BYTE *const shortoend = oend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +	const BYTE *match;
> +	size_t offset;
> +	unsigned int token;
> +	size_t length;
>  
>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>  		 srcSize, outputSize);
> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	if ((endOnInput) && unlikely(srcSize == 0))
>  		return -1;
>  
> -	/* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +		DEBUGLOG(6, "skip fast decode loop");
> +		goto safe_decode;
> +	}
> +
> +	/* Fast loop : decode sequences as long as output < 
> +iend-FASTLOOP_SAFE_DISTANCE */
>  	while (1) {
> -		size_t length;
> -		const BYTE *match;
> -		size_t offset;
> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +		if (endOnInput) {
> +			assert(ip < iend);
> +		}
> +		token = *ip++;
> +		length = token >> ML_BITS;	/* literal length */
> +
> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
> +
> +		/* decode literal length */
> +		if (length == RUN_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length <
> +					(uptrval) (op))) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (ip) + length <
> +					(uptrval) (ip))) {
> +				goto _output_error;
> +			}
> +
> +			/* overflow detection */
> +			/* copy literals */
> +			cpy = op + length;
> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				if ((cpy > oend - 32)
> +				    || (ip + length > iend - 32)) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy32(op, ip, cpy);
> +			} else {	/* LZ4_decompress_fast() */
> +				if (cpy > oend - 8) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy(op, ip, cpy);
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and only relies on end-of-block */
> +				/* properties */
> +			}
> +			ip += length;
> +			op = cpy;
> +		} else {
> +			cpy = op + length;
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				DEBUGLOG(7,
> +					 "copy %u bytes in a 16-bytes stripe",
> +					 (unsigned)length);
> +				/* We don't need to check oend */
> +				/* since we check it once for each loop below */
> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
> +					goto safe_literal_copy;
> +				}
> +				/* Literals can only be 14, but hope compilers optimize */
> +				/*if we copy by a register size */
> +				memcpy(op, ip, 16);
> +			} else {
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and relies on end-of-block */
> +				/* properties */
> +				memcpy(op, ip, 8);
> +				if (length > 8) {
> +					memcpy(op + 8, ip + 8, 8);
> +				}
> +			}
> +			ip += length;
> +			op = cpy;
> +		}
> +
> +		/* get offset */
> +		offset = LZ4_readLE16(ip);
> +		ip += 2;	/* end-of-block condition violated */
> +		match = op - offset;
> +
> +		/* get matchlength */
> +		length = token & ML_MASK;
>  
> -		/* get literal length */
> -		unsigned int const token = *ip++;
> -		length = token>>ML_BITS;
> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +			goto _output_error;
> +		}
> +		/* Error : offset outside buffers */
> +		if (length == ML_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +		} else {
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +
> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +				if (offset >= 8) {
> +					memcpy(op, match, 8);
> +					memcpy(op + 8, match + 8, 8);
> +					memcpy(op + 16, match + 16, 2);
> +					op += length;
> +					continue;
> +				}
> +			}
> +		}
> +
> +		/* match starting within external dictionary */
> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
> +			if (unlikely(op + length > oend - LASTLITERALS)) {
> +				if (partialDecoding) {
> +					/* reach end of buffer */
> +					length =
> +					    min(length, (size_t) (oend - op));
> +				} else {
> +					/* end-of-block condition violated */
> +					goto _output_error;
> +				}
> +			}
> +
> +			if (length <= (size_t) (lowPrefix - match)) {
> +				/* match fits entirely within external dictionary : just copy */
> +				memmove(op, dictEnd - (lowPrefix - match), length);
> +				op += length;
> +			} else {
> +				/* match stretches into both external dict and current block */
> +				size_t const copySize =
> +				    (size_t) (lowPrefix - match);
> +				size_t const restSize = length - copySize;
> +				memcpy(op, dictEnd - copySize, copySize);
> +				op += copySize;
> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
> +					BYTE *const endOfMatch = op + restSize;
> +					const BYTE *copyFrom = lowPrefix;
> +					while (op < endOfMatch) {
> +						*op++ = *copyFrom++;
> +					}
> +				} else {
> +					memcpy(op, lowPrefix, restSize);
> +					op += restSize;
> +				}
> +			}
> +			continue;
> +		}
> +
> +		/* copy match within block */
> +		cpy = op + length;
> +
> +		assert((op <= oend) && (oend - op >= 32));
> +		if (unlikely(offset < 16)) {
> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
> +		} else {
> +			LZ4_wildCopy32(op, match, cpy);
> +		}
> +
> +		op = cpy;	/* wildcopy correction */
> +	}
> +      safe_decode:
> +#endif
> +	/* Main Loop : decode sequences */
> +	while (1) {
> +		length = token >> ML_BITS;
>  
>  		/* ip < iend before the increment */
>  		assert(!endOnInput || ip <= iend);
> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		 * combined check for both stages).
>  		 */
>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
> -		   /*
> -		    * strictly "less than" on input, to re-enter
> -		    * the loop with at least one byte
> -		    */
> -		   && likely((endOnInput ? ip < shortiend : 1) &
> -			     (op <= shortoend))) {
> +		    /*
> +		     * strictly "less than" on input, to re-enter
> +		     * the loop with at least one byte
> +		     */
> +		    && likely((endOnInput ? ip < shortiend : 1) &
> +			      (op <= shortoend))) {
>  			/* Copy the literals */
>  			memcpy(op, ip, endOnInput ? 16 : 8);
> -			op += length; ip += length;
> +			op += length;
> +			ip += length;
>  
>  			/*
>  			 * The second stage:
>  			 * prepare for match copying, decode full info.
>  			 * If it doesn't work out, the info won't be wasted.
>  			 */
> -			length = token & ML_MASK; /* match length */
> +			length = token & ML_MASK;	/* match length */
>  			offset = LZ4_readLE16(ip);
>  			ip += 2;
>  			match = op - offset;
> -			assert(match <= op); /* check overflow */
> +			assert(match <= op);	/* check overflow */
>  
>  			/* Do not deal with overlapping matches. */
>  			if ((length != ML_MASK) &&
> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* decode literal length */
>  		if (length == RUN_MASK) {
> -			unsigned int s;
>  
> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -				/* overflow detection */
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error)
>  				goto _output_error;
> -			}
> -			do {
> -				s = *ip++;
> -				length += s;
> -			} while (likely(endOnInput
> -				? ip < iend - RUN_MASK
> -				: 1) & (s == 255));
>  
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(op) +
> -					length < (uptrval)(op))) {
> +			    && unlikely((uptrval) (op) +
> +					length < (uptrval) (op))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(ip) +
> -					length < (uptrval)(ip))) {
> +			    && unlikely((uptrval) (ip) +
> +					length < (uptrval) (ip))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* copy literals */
>  		cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +	      safe_literal_copy:
> +#endif
>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>  
>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +				      || (ip + length >
> +					  iend - (2 + 1 + LASTLITERALS))))
> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					length = oend - op;
>  				}
>  				if ((endOnInput)
> -					&& (ip + length > iend)) {
> +				    && (ip + length > iend)) {
>  					/*
>  					 * Error :
>  					 * read attempt beyond
> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				}
>  			} else {
>  				if ((!endOnInput)
> -					&& (cpy != oend)) {
> +				    && (cpy != oend)) {
>  					/*
>  					 * Error :
>  					 * block decoding must
> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					goto _output_error;
>  				}
>  				if ((endOnInput)
> -					&& ((ip + length != iend)
> +				    && ((ip + length != iend)
>  					|| (cpy > oend))) {
>  					/*
>  					 * Error :
> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			goto _output_error;
>  		}
>  
> -		/* costs ~1%; silence an msan warning when offset == 0 */
> -		/*
> -		 * note : when partialDecoding, there is no guarantee that
> -		 * at least 4 bytes remain available in output buffer
> -		 */
> -		if (!partialDecoding) {
> -			assert(oend > op);
> -			assert(oend - op >= 4);
> -
> -			LZ4_write32(op, (U32)offset);
> -		}
> -
>  		if (length == ML_MASK) {
> -			unsigned int s;
> -
> -			do {
> -				s = *ip++;
> -
> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
> -					goto _output_error;
>  
> -				length += s;
> -			} while (s == 255);
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok)
> +				goto _output_error;
>  
>  			if ((safeDecode)
>  				&& unlikely(
> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		length += MINMATCH;
>  
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>  		/* match starting within external dictionary */
>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>  			if (unlikely(op + length > oend - LASTLITERALS)) {
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
       [not found]     ` <CADv+DGkGa3q4FJ0TeLcj626PrUEYCDMfyKdd0yx82VmAX2i98Q@mail.gmail.com>
@ 2019-05-15 17:03       ` Cyan
  2019-05-15 17:47         ` Gao Xiang
  0 siblings, 1 reply; 16+ messages in thread
From: Cyan @ 2019-05-15 17:03 UTC (permalink / raw)
  To: Mao, Chenxi
  Cc: Gao Xiang, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm,
	Robert 2, Takahashi, Masaya A (Sony Mobile),
	Miao Xie

Re-posted,
it seems the previous message was rejected by the linux-kernel server
due to some kind of format limitation (no html).


Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>
> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
> This was fixed in v1.9.1.
> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>
> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
> The deprecation warning is meant to "scare away" such usages.
> It's still possible to intentionally remove the warning, for users who know what they are doing.
>
> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
> This effectively limits its capability to copy up to 8 bytes at a time,
> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>
> LZ4_decompress_fast() still has some possible usage :
> it doesn't need the compressed size.
> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
> it cannot be replaced by `LZ4_decompress_safe()`.
> This is pretty rare, but can happen in some cases, erofs for example.
> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
> but I haven't thoroughly tested it, so can't yet vouch for it.
>
> If it doesn't, then it will be necessary to create a new entry point.
> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
> But as said, for the time being, such dedicated entry point doesn't exist.
>
> The new FAST_DEC_LOOP setting works well on x86 and x64.
> Recently, it has been extended to gcc+arm64 (`dev` branch).
> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>
>
> Y.
>
> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>
>> Hi Xiang:
>>
>> Thanks for your reply, I will have a stress test on my device later.
>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>> I guess the clang performance downgrade is caused by some compiler optimization options.
>> I will double check it later if I got the device which can build kernel with clang.
>>
>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>> however based on my test, original LZ4_wildCopy has the better performance.
>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>> That might be a problem for X86 devices for current patch.
>>
>> Chenxi
>>
>> -----Original Message-----
>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>> Sent: Wednesday, May 15, 2019 10:21 AM
>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>
>> Hi Chenxi,
>>
>> On 2019/5/15 8:43, Chenxi Mao wrote:
>> > FAST_DEC_LOOP was introduced from LZ4 1.9.
>> > This change would be introduce 10% on decompress operation according
>> > to LZ4 benchmark result on X86 devices.
>> > Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>> > clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>
>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>
>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>
>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>
>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>
>> Thanks,
>> Gao Xiang
>>
>> >
>> > So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>> >
>> > Here is the test result on ARM64(cortex-A53)
>> >
>> > Benchmark via ZRAM:
>> >
>> > Test case:
>> > fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>> > --buffer_compress_percentage=75 \
>> > --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>> > --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>> > --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>> > --stonewall --name=rand-readwrite --rw=randrw --stonewall
>> >
>> > Patched:
>> >    READ: bw=7077MiB/s (7421MB/s)
>> > Vanilla:
>> >    READ: bw=5134MiB/s (5384MB/s)
>> >
>> > Reference:
>> > 1. https://github.com/lz4/lz4/pull/645
>> > 2. https://github.com/lz4/lz4/pull/707
>> >
>> > Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>> > ---
>> >  lib/lz4/lz4_decompress.c | 425
>> > +++++++++++++++++++++++++++++++++------
>> >  1 file changed, 361 insertions(+), 64 deletions(-)
>> >
>> > diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>> > 0c9d3ad17e0f..a4c87e32b3c0 100644
>> > --- a/lib/lz4/lz4_decompress.c
>> > +++ b/lib/lz4/lz4_decompress.c
>> > @@ -50,6 +50,131 @@
>> >  #define assert(condition) ((void)0)
>> >  #endif
>> >
>> > +#ifndef LZ4_FAST_DEC_LOOP
>> > +#if defined(__i386__) || defined(__x86_64__) #define
>> > +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>> > +     /* On aarch64, we disable this optimization for clang because on certain
>> > +      * mobile chipsets and clang, it reduces performance. For more information
>> > +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>> > +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>> > +
>> > +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>> > +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>> > +
>> > +#if LZ4_FAST_DEC_LOOP
>> > +#define FASTLOOP_SAFE_DISTANCE 64
>> > +FORCE_INLINE void
>> > +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>> > +                          const size_t offset)
>> > +{
>> > +     if (offset < 8) {
>> > +             dstPtr[0] = srcPtr[0];
>> > +
>> > +             dstPtr[1] = srcPtr[1];
>> > +             dstPtr[2] = srcPtr[2];
>> > +             dstPtr[3] = srcPtr[3];
>> > +             srcPtr += inc32table[offset];
>> > +             memcpy(dstPtr + 4, srcPtr, 4);
>> > +             srcPtr -= dec64table[offset];
>> > +             dstPtr += 8;
>> > +     } else {
>> > +             memcpy(dstPtr, srcPtr, 8);
>> > +             dstPtr += 8;
>> > +             srcPtr += 8;
>> > +     }
>> > +
>> > +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>> > +
>> > +/* customized variant of memcpy, which can overwrite up to 32 bytes
>> > +beyond dstEnd
>> > + * this version copies two times 16 bytes (instead of one time 32
>> > +bytes)
>> > + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>> > +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>> > +     BYTE *d = (BYTE *) dstPtr;
>> > +     const BYTE *s = (const BYTE *)srcPtr;
>> > +     BYTE *const e = (BYTE *) dstEnd;
>> > +
>> > +     do {
>> > +             memcpy(d, s, 16);
>> > +             memcpy(d + 16, s + 16, 16);
>> > +             d += 32;
>> > +             s += 32;
>> > +     } while (d < e);
>> > +}
>> > +
>> > +FORCE_INLINE void
>> > +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>> > +                     const size_t offset)
>> > +{
>> > +     BYTE v[8];
>> > +     switch (offset) {
>> > +
>> > +     case 1:
>> > +             memset(v, *srcPtr, 8);
>> > +             goto copy_loop;
>> > +     case 2:
>> > +             memcpy(v, srcPtr, 2);
>> > +             memcpy(&v[2], srcPtr, 2);
>> > +             memcpy(&v[4], &v[0], 4);
>> > +             goto copy_loop;
>> > +     case 4:
>> > +             memcpy(v, srcPtr, 4);
>> > +             memcpy(&v[4], srcPtr, 4);
>> > +             goto copy_loop;
>> > +     default:
>> > +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>> > +             return;
>> > +     }
>> > +
>> > +      copy_loop:
>> > +     memcpy(dstPtr, v, 8);
>> > +     dstPtr += 8;
>> > +     while (dstPtr < dstEnd) {
>> > +             memcpy(dstPtr, v, 8);
>> > +             dstPtr += 8;
>> > +     }
>> > +}
>> > +
>> > +/* Read the variable-length literal or match length.
>> > + *
>> > + * ip - pointer to use as input.
>> > + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>> > + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>> > + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>> > + * error (output) - error code.  Should be set to 0 before call.
>> > + */
>> > +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>> > +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>> > +                                        const BYTE *lencheck,
>> > +                                        int loop_check, int initial_check,
>> > +                                        variable_length_error *error) {
>> > +     unsigned length = 0;
>> > +     unsigned s;
>> > +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>> > +             *error = initial_error;
>> > +             return length;
>> > +     }
>> > +     do {
>> > +             s = **ip;
>> > +             (*ip)++;
>> > +             length += s;
>> > +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>> > +                     *error = loop_error;
>> > +                     return length;
>> > +             }
>> > +     } while (s == 255);
>> > +
>> > +     return length;
>> > +}
>> > +#endif
>> > +
>> >  /*
>> >   * LZ4_decompress_generic() :
>> >   * This generic decompression function covers all use cases.
>> > @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >        const size_t dictSize
>> >        )
>> >  {
>> > -     const BYTE *ip = (const BYTE *) src;
>> > -     const BYTE * const iend = ip + srcSize;
>> > +     const BYTE *ip = (const BYTE *)src;
>> > +     const BYTE *const iend = ip + srcSize;
>> >
>> >       BYTE *op = (BYTE *) dst;
>> > -     BYTE * const oend = op + outputSize;
>> > +     BYTE *const oend = op + outputSize;
>> >       BYTE *cpy;
>> >
>> > -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>> > -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>> > -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>> > +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>> >
>> >       const int safeDecode = (endOnInput == endOnInputSize);
>> >       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>> > KB)));
>> >
>> >       /* Set up the "end" pointers for the shortcut. */
>> >       const BYTE *const shortiend = iend -
>> > -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>> > +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>> >       const BYTE *const shortoend = oend -
>> > -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> > +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> > +
>> > +     const BYTE *match;
>> > +     size_t offset;
>> > +     unsigned int token;
>> > +     size_t length;
>> >
>> >       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>> >                srcSize, outputSize);
>> > @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >       if ((endOnInput) && unlikely(srcSize == 0))
>> >               return -1;
>> >
>> > -     /* Main Loop : decode sequences */
>> > +#if LZ4_FAST_DEC_LOOP
>> > +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>> > +             DEBUGLOG(6, "skip fast decode loop");
>> > +             goto safe_decode;
>> > +     }
>> > +
>> > +     /* Fast loop : decode sequences as long as output <
>> > +iend-FASTLOOP_SAFE_DISTANCE */
>> >       while (1) {
>> > -             size_t length;
>> > -             const BYTE *match;
>> > -             size_t offset;
>> > +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>> > +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>> > +             if (endOnInput) {
>> > +                     assert(ip < iend);
>> > +             }
>> > +             token = *ip++;
>> > +             length = token >> ML_BITS;      /* literal length */
>> > +
>> > +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>> > +
>> > +             /* decode literal length */
>> > +             if (length == RUN_MASK) {
>> > +                     variable_length_error error = ok;
>> > +                     length +=
>> > +                         read_variable_length(&ip, iend - RUN_MASK,
>> > +                                              endOnInput, endOnInput,
>> > +                                              &error);
>> > +                     if (error == initial_error) {
>> > +                             goto _output_error;
>> > +                     }
>> > +                     if ((safeDecode)
>> > +                         && unlikely((uptrval) (op) + length <
>> > +                                     (uptrval) (op))) {
>> > +                             goto _output_error;
>> > +                     }       /* overflow detection */
>> > +                     if ((safeDecode)
>> > +                         && unlikely((uptrval) (ip) + length <
>> > +                                     (uptrval) (ip))) {
>> > +                             goto _output_error;
>> > +                     }
>> > +
>> > +                     /* overflow detection */
>> > +                     /* copy literals */
>> > +                     cpy = op + length;
>> > +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>> > +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>> > +                             if ((cpy > oend - 32)
>> > +                                 || (ip + length > iend - 32)) {
>> > +                                     goto safe_literal_copy;
>> > +                             }
>> > +                             LZ4_wildCopy32(op, ip, cpy);
>> > +                     } else {        /* LZ4_decompress_fast() */
>> > +                             if (cpy > oend - 8) {
>> > +                                     goto safe_literal_copy;
>> > +                             }
>> > +                             LZ4_wildCopy(op, ip, cpy);
>> > +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> > +                             /* it doesn't know input length, and only relies on end-of-block */
>> > +                             /* properties */
>> > +                     }
>> > +                     ip += length;
>> > +                     op = cpy;
>> > +             } else {
>> > +                     cpy = op + length;
>> > +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>> > +                             DEBUGLOG(7,
>> > +                                      "copy %u bytes in a 16-bytes stripe",
>> > +                                      (unsigned)length);
>> > +                             /* We don't need to check oend */
>> > +                             /* since we check it once for each loop below */
>> > +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>> > +                                     goto safe_literal_copy;
>> > +                             }
>> > +                             /* Literals can only be 14, but hope compilers optimize */
>> > +                             /*if we copy by a register size */
>> > +                             memcpy(op, ip, 16);
>> > +                     } else {
>> > +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> > +                             /* it doesn't know input length, and relies on end-of-block */
>> > +                             /* properties */
>> > +                             memcpy(op, ip, 8);
>> > +                             if (length > 8) {
>> > +                                     memcpy(op + 8, ip + 8, 8);
>> > +                             }
>> > +                     }
>> > +                     ip += length;
>> > +                     op = cpy;
>> > +             }
>> > +
>> > +             /* get offset */
>> > +             offset = LZ4_readLE16(ip);
>> > +             ip += 2;        /* end-of-block condition violated */
>> > +             match = op - offset;
>> > +
>> > +             /* get matchlength */
>> > +             length = token & ML_MASK;
>> >
>> > -             /* get literal length */
>> > -             unsigned int const token = *ip++;
>> > -             length = token>>ML_BITS;
>> > +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>> > +                     goto _output_error;
>> > +             }
>> > +             /* Error : offset outside buffers */
>> > +             if (length == ML_MASK) {
>> > +                     variable_length_error error = ok;
>> > +                     length +=
>> > +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>> > +                                              endOnInput, 0, &error);
>> > +                     if (error != ok) {
>> > +                             goto _output_error;
>> > +                     }
>> > +                     if ((safeDecode)
>> > +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>> > +                             goto _output_error;
>> > +                     }       /* overflow detection */
>> > +                     length += MINMATCH;
>> > +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> > +                             goto safe_match_copy;
>> > +                     }
>> > +             } else {
>> > +                     length += MINMATCH;
>> > +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> > +                             goto safe_match_copy;
>> > +                     }
>> > +
>> > +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>> > +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>> > +                             if (offset >= 8) {
>> > +                                     memcpy(op, match, 8);
>> > +                                     memcpy(op + 8, match + 8, 8);
>> > +                                     memcpy(op + 16, match + 16, 2);
>> > +                                     op += length;
>> > +                                     continue;
>> > +                             }
>> > +                     }
>> > +             }
>> > +
>> > +             /* match starting within external dictionary */
>> > +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>> > +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>> > +                             if (partialDecoding) {
>> > +                                     /* reach end of buffer */
>> > +                                     length =
>> > +                                         min(length, (size_t) (oend - op));
>> > +                             } else {
>> > +                                     /* end-of-block condition violated */
>> > +                                     goto _output_error;
>> > +                             }
>> > +                     }
>> > +
>> > +                     if (length <= (size_t) (lowPrefix - match)) {
>> > +                             /* match fits entirely within external dictionary : just copy */
>> > +                             memmove(op, dictEnd - (lowPrefix - match), length);
>> > +                             op += length;
>> > +                     } else {
>> > +                             /* match stretches into both external dict and current block */
>> > +                             size_t const copySize =
>> > +                                 (size_t) (lowPrefix - match);
>> > +                             size_t const restSize = length - copySize;
>> > +                             memcpy(op, dictEnd - copySize, copySize);
>> > +                             op += copySize;
>> > +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>> > +                                     BYTE *const endOfMatch = op + restSize;
>> > +                                     const BYTE *copyFrom = lowPrefix;
>> > +                                     while (op < endOfMatch) {
>> > +                                             *op++ = *copyFrom++;
>> > +                                     }
>> > +                             } else {
>> > +                                     memcpy(op, lowPrefix, restSize);
>> > +                                     op += restSize;
>> > +                             }
>> > +                     }
>> > +                     continue;
>> > +             }
>> > +
>> > +             /* copy match within block */
>> > +             cpy = op + length;
>> > +
>> > +             assert((op <= oend) && (oend - op >= 32));
>> > +             if (unlikely(offset < 16)) {
>> > +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>> > +             } else {
>> > +                     LZ4_wildCopy32(op, match, cpy);
>> > +             }
>> > +
>> > +             op = cpy;       /* wildcopy correction */
>> > +     }
>> > +      safe_decode:
>> > +#endif
>> > +     /* Main Loop : decode sequences */
>> > +     while (1) {
>> > +             length = token >> ML_BITS;
>> >
>> >               /* ip < iend before the increment */
>> >               assert(!endOnInput || ip <= iend);
>> > @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >                * combined check for both stages).
>> >                */
>> >               if ((endOnInput ? length != RUN_MASK : length <= 8)
>> > -                /*
>> > -                 * strictly "less than" on input, to re-enter
>> > -                 * the loop with at least one byte
>> > -                 */
>> > -                && likely((endOnInput ? ip < shortiend : 1) &
>> > -                          (op <= shortoend))) {
>> > +                 /*
>> > +                  * strictly "less than" on input, to re-enter
>> > +                  * the loop with at least one byte
>> > +                  */
>> > +                 && likely((endOnInput ? ip < shortiend : 1) &
>> > +                           (op <= shortoend))) {
>> >                       /* Copy the literals */
>> >                       memcpy(op, ip, endOnInput ? 16 : 8);
>> > -                     op += length; ip += length;
>> > +                     op += length;
>> > +                     ip += length;
>> >
>> >                       /*
>> >                        * The second stage:
>> >                        * prepare for match copying, decode full info.
>> >                        * If it doesn't work out, the info won't be wasted.
>> >                        */
>> > -                     length = token & ML_MASK; /* match length */
>> > +                     length = token & ML_MASK;       /* match length */
>> >                       offset = LZ4_readLE16(ip);
>> >                       ip += 2;
>> >                       match = op - offset;
>> > -                     assert(match <= op); /* check overflow */
>> > +                     assert(match <= op);    /* check overflow */
>> >
>> >                       /* Do not deal with overlapping matches. */
>> >                       if ((length != ML_MASK) &&
>> > @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >
>> >               /* decode literal length */
>> >               if (length == RUN_MASK) {
>> > -                     unsigned int s;
>> >
>> > -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>> > -                             /* overflow detection */
>> > +                     variable_length_error error = ok;
>> > +                     length +=
>> > +                         read_variable_length(&ip, iend - RUN_MASK,
>> > +                                              endOnInput, endOnInput,
>> > +                                              &error);
>> > +                     if (error == initial_error)
>> >                               goto _output_error;
>> > -                     }
>> > -                     do {
>> > -                             s = *ip++;
>> > -                             length += s;
>> > -                     } while (likely(endOnInput
>> > -                             ? ip < iend - RUN_MASK
>> > -                             : 1) & (s == 255));
>> >
>> >                       if ((safeDecode)
>> > -                         && unlikely((uptrval)(op) +
>> > -                                     length < (uptrval)(op))) {
>> > +                         && unlikely((uptrval) (op) +
>> > +                                     length < (uptrval) (op))) {
>> >                               /* overflow detection */
>> >                               goto _output_error;
>> >                       }
>> >                       if ((safeDecode)
>> > -                         && unlikely((uptrval)(ip) +
>> > -                                     length < (uptrval)(ip))) {
>> > +                         && unlikely((uptrval) (ip) +
>> > +                                     length < (uptrval) (ip))) {
>> >                               /* overflow detection */
>> >                               goto _output_error;
>> >                       }
>> > @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >
>> >               /* copy literals */
>> >               cpy = op + length;
>> > +#if LZ4_FAST_DEC_LOOP
>> > +           safe_literal_copy:
>> > +#endif
>> >               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>> >
>> >               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>> > -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>> > -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>> > +                                   || (ip + length >
>> > +                                       iend - (2 + 1 + LASTLITERALS))))
>> > +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>> >                       if (partialDecoding) {
>> >                               if (cpy > oend) {
>> >                                       /*
>> > @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >                                       length = oend - op;
>> >                               }
>> >                               if ((endOnInput)
>> > -                                     && (ip + length > iend)) {
>> > +                                 && (ip + length > iend)) {
>> >                                       /*
>> >                                        * Error :
>> >                                        * read attempt beyond
>> > @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >                               }
>> >                       } else {
>> >                               if ((!endOnInput)
>> > -                                     && (cpy != oend)) {
>> > +                                 && (cpy != oend)) {
>> >                                       /*
>> >                                        * Error :
>> >                                        * block decoding must
>> > @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >                                       goto _output_error;
>> >                               }
>> >                               if ((endOnInput)
>> > -                                     && ((ip + length != iend)
>> > +                                 && ((ip + length != iend)
>> >                                       || (cpy > oend))) {
>> >                                       /*
>> >                                        * Error :
>> > @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >                       goto _output_error;
>> >               }
>> >
>> > -             /* costs ~1%; silence an msan warning when offset == 0 */
>> > -             /*
>> > -              * note : when partialDecoding, there is no guarantee that
>> > -              * at least 4 bytes remain available in output buffer
>> > -              */
>> > -             if (!partialDecoding) {
>> > -                     assert(oend > op);
>> > -                     assert(oend - op >= 4);
>> > -
>> > -                     LZ4_write32(op, (U32)offset);
>> > -             }
>> > -
>> >               if (length == ML_MASK) {
>> > -                     unsigned int s;
>> > -
>> > -                     do {
>> > -                             s = *ip++;
>> > -
>> > -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>> > -                                     goto _output_error;
>> >
>> > -                             length += s;
>> > -                     } while (s == 255);
>> > +                     variable_length_error error = ok;
>> > +                     length +=
>> > +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>> > +                                              endOnInput, 0, &error);
>> > +                     if (error != ok)
>> > +                             goto _output_error;
>> >
>> >                       if ((safeDecode)
>> >                               && unlikely(
>> > @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>> >
>> >               length += MINMATCH;
>> >
>> > +#if LZ4_FAST_DEC_LOOP
>> > +safe_match_copy:
>> > +#endif
>> > +
>> >               /* match starting within external dictionary */
>> >               if ((dict == usingExtDict) && (match < lowPrefix)) {
>> >                       if (unlikely(op + length > oend - LASTLITERALS)) {
>> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-15 17:03       ` Cyan
@ 2019-05-15 17:47         ` Gao Xiang
  2019-05-15 23:48           ` Mao, Chenxi
  0 siblings, 1 reply; 16+ messages in thread
From: Gao Xiang @ 2019-05-15 17:47 UTC (permalink / raw)
  To: Cyan
  Cc: Mao, Chenxi, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm,
	Robert 2, Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

Hi Yann,

On 2019/5/16 1:03, Cyan wrote:
> Re-posted,
> it seems the previous message was rejected by the linux-kernel server
> due to some kind of format limitation (no html).
> 
> 
> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>
>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>> This was fixed in v1.9.1.
>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>
>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>> The deprecation warning is meant to "scare away" such usages.
>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>
>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>> This effectively limits its capability to copy up to 8 bytes at a time,
>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>
>> LZ4_decompress_fast() still has some possible usage :
>> it doesn't need the compressed size.
>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>> it cannot be replaced by `LZ4_decompress_safe()`.
>> This is pretty rare, but can happen in some cases, erofs for example.
>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>> but I haven't thoroughly tested it, so can't yet vouch for it.

Yes, that is my real concern...

In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...

It seemd lz4-1.8.3 is rather stable since no stability report from our internal
beta test and real consumers...

I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...

But anyway, the patch looks good to me since it has obvious performance gain...

>>
>> If it doesn't, then it will be necessary to create a new entry point.
>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>> But as said, for the time being, such dedicated entry point doesn't exist.

I agree, but current erofs still have to choose LZ4_decompress_fast...
I will test these 2 apis later and will report if something is wrong as well.

[1] https://www.usenix.org/conference/atc19/presentation/gao
    https://www.usenix.org/conference/atc19/technical-sessions

Thanks,
Gao Xiang


>>
>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>
>>
>> Y.
>>
>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>
>>> Hi Xiang:
>>>
>>> Thanks for your reply, I will have a stress test on my device later.
>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>> I will double check it later if I got the device which can build kernel with clang.
>>>
>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>> however based on my test, original LZ4_wildCopy has the better performance.
>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>> That might be a problem for X86 devices for current patch.
>>>
>>> Chenxi
>>>
>>> -----Original Message-----
>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>
>>> Hi Chenxi,
>>>
>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>> This change would be introduce 10% on decompress operation according
>>>> to LZ4 benchmark result on X86 devices.
>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>
>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>
>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>
>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>
>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>
>>> Thanks,
>>> Gao Xiang
>>>
>>>>
>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>
>>>> Here is the test result on ARM64(cortex-A53)
>>>>
>>>> Benchmark via ZRAM:
>>>>
>>>> Test case:
>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>> --buffer_compress_percentage=75 \
>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>
>>>> Patched:
>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>> Vanilla:
>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>
>>>> Reference:
>>>> 1. https://github.com/lz4/lz4/pull/645
>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>
>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>> ---
>>>>  lib/lz4/lz4_decompress.c | 425
>>>> +++++++++++++++++++++++++++++++++------
>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>
>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>> --- a/lib/lz4/lz4_decompress.c
>>>> +++ b/lib/lz4/lz4_decompress.c
>>>> @@ -50,6 +50,131 @@
>>>>  #define assert(condition) ((void)0)
>>>>  #endif
>>>>
>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>> +
>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>> +
>>>> +#if LZ4_FAST_DEC_LOOP
>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>> +FORCE_INLINE void
>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>> +                          const size_t offset)
>>>> +{
>>>> +     if (offset < 8) {
>>>> +             dstPtr[0] = srcPtr[0];
>>>> +
>>>> +             dstPtr[1] = srcPtr[1];
>>>> +             dstPtr[2] = srcPtr[2];
>>>> +             dstPtr[3] = srcPtr[3];
>>>> +             srcPtr += inc32table[offset];
>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>> +             srcPtr -= dec64table[offset];
>>>> +             dstPtr += 8;
>>>> +     } else {
>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>> +             dstPtr += 8;
>>>> +             srcPtr += 8;
>>>> +     }
>>>> +
>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>> +
>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>> +beyond dstEnd
>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>> +bytes)
>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>> +
>>>> +     do {
>>>> +             memcpy(d, s, 16);
>>>> +             memcpy(d + 16, s + 16, 16);
>>>> +             d += 32;
>>>> +             s += 32;
>>>> +     } while (d < e);
>>>> +}
>>>> +
>>>> +FORCE_INLINE void
>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>> +                     const size_t offset)
>>>> +{
>>>> +     BYTE v[8];
>>>> +     switch (offset) {
>>>> +
>>>> +     case 1:
>>>> +             memset(v, *srcPtr, 8);
>>>> +             goto copy_loop;
>>>> +     case 2:
>>>> +             memcpy(v, srcPtr, 2);
>>>> +             memcpy(&v[2], srcPtr, 2);
>>>> +             memcpy(&v[4], &v[0], 4);
>>>> +             goto copy_loop;
>>>> +     case 4:
>>>> +             memcpy(v, srcPtr, 4);
>>>> +             memcpy(&v[4], srcPtr, 4);
>>>> +             goto copy_loop;
>>>> +     default:
>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +      copy_loop:
>>>> +     memcpy(dstPtr, v, 8);
>>>> +     dstPtr += 8;
>>>> +     while (dstPtr < dstEnd) {
>>>> +             memcpy(dstPtr, v, 8);
>>>> +             dstPtr += 8;
>>>> +     }
>>>> +}
>>>> +
>>>> +/* Read the variable-length literal or match length.
>>>> + *
>>>> + * ip - pointer to use as input.
>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>> + */
>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>> +                                        const BYTE *lencheck,
>>>> +                                        int loop_check, int initial_check,
>>>> +                                        variable_length_error *error) {
>>>> +     unsigned length = 0;
>>>> +     unsigned s;
>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>> +             *error = initial_error;
>>>> +             return length;
>>>> +     }
>>>> +     do {
>>>> +             s = **ip;
>>>> +             (*ip)++;
>>>> +             length += s;
>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>> +                     *error = loop_error;
>>>> +                     return length;
>>>> +             }
>>>> +     } while (s == 255);
>>>> +
>>>> +     return length;
>>>> +}
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * LZ4_decompress_generic() :
>>>>   * This generic decompression function covers all use cases.
>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>        const size_t dictSize
>>>>        )
>>>>  {
>>>> -     const BYTE *ip = (const BYTE *) src;
>>>> -     const BYTE * const iend = ip + srcSize;
>>>> +     const BYTE *ip = (const BYTE *)src;
>>>> +     const BYTE *const iend = ip + srcSize;
>>>>
>>>>       BYTE *op = (BYTE *) dst;
>>>> -     BYTE * const oend = op + outputSize;
>>>> +     BYTE *const oend = op + outputSize;
>>>>       BYTE *cpy;
>>>>
>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>
>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>> KB)));
>>>>
>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>       const BYTE *const shortiend = iend -
>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>       const BYTE *const shortoend = oend -
>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>> +
>>>> +     const BYTE *match;
>>>> +     size_t offset;
>>>> +     unsigned int token;
>>>> +     size_t length;
>>>>
>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>                srcSize, outputSize);
>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>               return -1;
>>>>
>>>> -     /* Main Loop : decode sequences */
>>>> +#if LZ4_FAST_DEC_LOOP
>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>> +             goto safe_decode;
>>>> +     }
>>>> +
>>>> +     /* Fast loop : decode sequences as long as output <
>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>       while (1) {
>>>> -             size_t length;
>>>> -             const BYTE *match;
>>>> -             size_t offset;
>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>> +             if (endOnInput) {
>>>> +                     assert(ip < iend);
>>>> +             }
>>>> +             token = *ip++;
>>>> +             length = token >> ML_BITS;      /* literal length */
>>>> +
>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>> +
>>>> +             /* decode literal length */
>>>> +             if (length == RUN_MASK) {
>>>> +                     variable_length_error error = ok;
>>>> +                     length +=
>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>> +                                              endOnInput, endOnInput,
>>>> +                                              &error);
>>>> +                     if (error == initial_error) {
>>>> +                             goto _output_error;
>>>> +                     }
>>>> +                     if ((safeDecode)
>>>> +                         && unlikely((uptrval) (op) + length <
>>>> +                                     (uptrval) (op))) {
>>>> +                             goto _output_error;
>>>> +                     }       /* overflow detection */
>>>> +                     if ((safeDecode)
>>>> +                         && unlikely((uptrval) (ip) + length <
>>>> +                                     (uptrval) (ip))) {
>>>> +                             goto _output_error;
>>>> +                     }
>>>> +
>>>> +                     /* overflow detection */
>>>> +                     /* copy literals */
>>>> +                     cpy = op + length;
>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>> +                             if ((cpy > oend - 32)
>>>> +                                 || (ip + length > iend - 32)) {
>>>> +                                     goto safe_literal_copy;
>>>> +                             }
>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>> +                             if (cpy > oend - 8) {
>>>> +                                     goto safe_literal_copy;
>>>> +                             }
>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>> +                             /* properties */
>>>> +                     }
>>>> +                     ip += length;
>>>> +                     op = cpy;
>>>> +             } else {
>>>> +                     cpy = op + length;
>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>> +                             DEBUGLOG(7,
>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>> +                                      (unsigned)length);
>>>> +                             /* We don't need to check oend */
>>>> +                             /* since we check it once for each loop below */
>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>> +                                     goto safe_literal_copy;
>>>> +                             }
>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>> +                             /*if we copy by a register size */
>>>> +                             memcpy(op, ip, 16);
>>>> +                     } else {
>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>> +                             /* properties */
>>>> +                             memcpy(op, ip, 8);
>>>> +                             if (length > 8) {
>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>> +                             }
>>>> +                     }
>>>> +                     ip += length;
>>>> +                     op = cpy;
>>>> +             }
>>>> +
>>>> +             /* get offset */
>>>> +             offset = LZ4_readLE16(ip);
>>>> +             ip += 2;        /* end-of-block condition violated */
>>>> +             match = op - offset;
>>>> +
>>>> +             /* get matchlength */
>>>> +             length = token & ML_MASK;
>>>>
>>>> -             /* get literal length */
>>>> -             unsigned int const token = *ip++;
>>>> -             length = token>>ML_BITS;
>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>> +                     goto _output_error;
>>>> +             }
>>>> +             /* Error : offset outside buffers */
>>>> +             if (length == ML_MASK) {
>>>> +                     variable_length_error error = ok;
>>>> +                     length +=
>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>> +                                              endOnInput, 0, &error);
>>>> +                     if (error != ok) {
>>>> +                             goto _output_error;
>>>> +                     }
>>>> +                     if ((safeDecode)
>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>> +                             goto _output_error;
>>>> +                     }       /* overflow detection */
>>>> +                     length += MINMATCH;
>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>> +                             goto safe_match_copy;
>>>> +                     }
>>>> +             } else {
>>>> +                     length += MINMATCH;
>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>> +                             goto safe_match_copy;
>>>> +                     }
>>>> +
>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>> +                             if (offset >= 8) {
>>>> +                                     memcpy(op, match, 8);
>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>> +                                     op += length;
>>>> +                                     continue;
>>>> +                             }
>>>> +                     }
>>>> +             }
>>>> +
>>>> +             /* match starting within external dictionary */
>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>> +                             if (partialDecoding) {
>>>> +                                     /* reach end of buffer */
>>>> +                                     length =
>>>> +                                         min(length, (size_t) (oend - op));
>>>> +                             } else {
>>>> +                                     /* end-of-block condition violated */
>>>> +                                     goto _output_error;
>>>> +                             }
>>>> +                     }
>>>> +
>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>> +                             op += length;
>>>> +                     } else {
>>>> +                             /* match stretches into both external dict and current block */
>>>> +                             size_t const copySize =
>>>> +                                 (size_t) (lowPrefix - match);
>>>> +                             size_t const restSize = length - copySize;
>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>> +                             op += copySize;
>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>> +                                     while (op < endOfMatch) {
>>>> +                                             *op++ = *copyFrom++;
>>>> +                                     }
>>>> +                             } else {
>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>> +                                     op += restSize;
>>>> +                             }
>>>> +                     }
>>>> +                     continue;
>>>> +             }
>>>> +
>>>> +             /* copy match within block */
>>>> +             cpy = op + length;
>>>> +
>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>> +             if (unlikely(offset < 16)) {
>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>> +             } else {
>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>> +             }
>>>> +
>>>> +             op = cpy;       /* wildcopy correction */
>>>> +     }
>>>> +      safe_decode:
>>>> +#endif
>>>> +     /* Main Loop : decode sequences */
>>>> +     while (1) {
>>>> +             length = token >> ML_BITS;
>>>>
>>>>               /* ip < iend before the increment */
>>>>               assert(!endOnInput || ip <= iend);
>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>                * combined check for both stages).
>>>>                */
>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>> -                /*
>>>> -                 * strictly "less than" on input, to re-enter
>>>> -                 * the loop with at least one byte
>>>> -                 */
>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>> -                          (op <= shortoend))) {
>>>> +                 /*
>>>> +                  * strictly "less than" on input, to re-enter
>>>> +                  * the loop with at least one byte
>>>> +                  */
>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>> +                           (op <= shortoend))) {
>>>>                       /* Copy the literals */
>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>> -                     op += length; ip += length;
>>>> +                     op += length;
>>>> +                     ip += length;
>>>>
>>>>                       /*
>>>>                        * The second stage:
>>>>                        * prepare for match copying, decode full info.
>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>                        */
>>>> -                     length = token & ML_MASK; /* match length */
>>>> +                     length = token & ML_MASK;       /* match length */
>>>>                       offset = LZ4_readLE16(ip);
>>>>                       ip += 2;
>>>>                       match = op - offset;
>>>> -                     assert(match <= op); /* check overflow */
>>>> +                     assert(match <= op);    /* check overflow */
>>>>
>>>>                       /* Do not deal with overlapping matches. */
>>>>                       if ((length != ML_MASK) &&
>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>
>>>>               /* decode literal length */
>>>>               if (length == RUN_MASK) {
>>>> -                     unsigned int s;
>>>>
>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>> -                             /* overflow detection */
>>>> +                     variable_length_error error = ok;
>>>> +                     length +=
>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>> +                                              endOnInput, endOnInput,
>>>> +                                              &error);
>>>> +                     if (error == initial_error)
>>>>                               goto _output_error;
>>>> -                     }
>>>> -                     do {
>>>> -                             s = *ip++;
>>>> -                             length += s;
>>>> -                     } while (likely(endOnInput
>>>> -                             ? ip < iend - RUN_MASK
>>>> -                             : 1) & (s == 255));
>>>>
>>>>                       if ((safeDecode)
>>>> -                         && unlikely((uptrval)(op) +
>>>> -                                     length < (uptrval)(op))) {
>>>> +                         && unlikely((uptrval) (op) +
>>>> +                                     length < (uptrval) (op))) {
>>>>                               /* overflow detection */
>>>>                               goto _output_error;
>>>>                       }
>>>>                       if ((safeDecode)
>>>> -                         && unlikely((uptrval)(ip) +
>>>> -                                     length < (uptrval)(ip))) {
>>>> +                         && unlikely((uptrval) (ip) +
>>>> +                                     length < (uptrval) (ip))) {
>>>>                               /* overflow detection */
>>>>                               goto _output_error;
>>>>                       }
>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>
>>>>               /* copy literals */
>>>>               cpy = op + length;
>>>> +#if LZ4_FAST_DEC_LOOP
>>>> +           safe_literal_copy:
>>>> +#endif
>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>
>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>> +                                   || (ip + length >
>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>                       if (partialDecoding) {
>>>>                               if (cpy > oend) {
>>>>                                       /*
>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>                                       length = oend - op;
>>>>                               }
>>>>                               if ((endOnInput)
>>>> -                                     && (ip + length > iend)) {
>>>> +                                 && (ip + length > iend)) {
>>>>                                       /*
>>>>                                        * Error :
>>>>                                        * read attempt beyond
>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>                               }
>>>>                       } else {
>>>>                               if ((!endOnInput)
>>>> -                                     && (cpy != oend)) {
>>>> +                                 && (cpy != oend)) {
>>>>                                       /*
>>>>                                        * Error :
>>>>                                        * block decoding must
>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>                                       goto _output_error;
>>>>                               }
>>>>                               if ((endOnInput)
>>>> -                                     && ((ip + length != iend)
>>>> +                                 && ((ip + length != iend)
>>>>                                       || (cpy > oend))) {
>>>>                                       /*
>>>>                                        * Error :
>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>                       goto _output_error;
>>>>               }
>>>>
>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>> -             /*
>>>> -              * note : when partialDecoding, there is no guarantee that
>>>> -              * at least 4 bytes remain available in output buffer
>>>> -              */
>>>> -             if (!partialDecoding) {
>>>> -                     assert(oend > op);
>>>> -                     assert(oend - op >= 4);
>>>> -
>>>> -                     LZ4_write32(op, (U32)offset);
>>>> -             }
>>>> -
>>>>               if (length == ML_MASK) {
>>>> -                     unsigned int s;
>>>> -
>>>> -                     do {
>>>> -                             s = *ip++;
>>>> -
>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>> -                                     goto _output_error;
>>>>
>>>> -                             length += s;
>>>> -                     } while (s == 255);
>>>> +                     variable_length_error error = ok;
>>>> +                     length +=
>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>> +                                              endOnInput, 0, &error);
>>>> +                     if (error != ok)
>>>> +                             goto _output_error;
>>>>
>>>>                       if ((safeDecode)
>>>>                               && unlikely(
>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>
>>>>               length += MINMATCH;
>>>>
>>>> +#if LZ4_FAST_DEC_LOOP
>>>> +safe_match_copy:
>>>> +#endif
>>>> +
>>>>               /* match starting within external dictionary */
>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-15 17:47         ` Gao Xiang
@ 2019-05-15 23:48           ` Mao, Chenxi
  2019-05-16  1:35             ` Gao Xiang
  0 siblings, 1 reply; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-15 23:48 UTC (permalink / raw)
  To: Gao Xiang, Cyan
  Cc: akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

Hi Yann and Xiang:
For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
Here is my cherry-pick list:
2589c44 created LZ4_FAST_DEC_LOOP build macro
605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
5d7d116 decompress_generic: Limit fastpath to x86
75fb878 decompress_generic: Add fastpath for small offsets
faac110 decompress_generic: Unroll loops a bit more
1fbaf84 decompress_generic: remove msan write
28b8249 decompress_generic: re-add fastpath
232f1e2 decompress_generic: drop partial copy check in fast loop
59332a3 decompress_generic: Optimize literal copies
5dfa7d4 decompress_generic: optimize match copy
28356e0 decompress_generic: Add a loop fastpath
4da3360 decompress_generic: Refactor variable length fields

Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.

@Xiang:
Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?

Chenxi


On 5/16/19 1:47 AM, Gao Xiang wrote:
> Hi Yann,
> 
> On 2019/5/16 1:03, Cyan wrote:
>> Re-posted,
>> it seems the previous message was rejected by the linux-kernel server
>> due to some kind of format limitation (no html).
>>
>>
>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>
>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>> This was fixed in v1.9.1.
>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>
>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>> The deprecation warning is meant to "scare away" such usages.
>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>
>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>
>>> LZ4_decompress_fast() still has some possible usage :
>>> it doesn't need the compressed size.
>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>> This is pretty rare, but can happen in some cases, erofs for example.
>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>> but I haven't thoroughly tested it, so can't yet vouch for it.
> 
> Yes, that is my real concern...
> 
> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
> 
> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
> beta test and real consumers...
> 
> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
> 
> But anyway, the patch looks good to me since it has obvious performance gain...
> 
>>>
>>> If it doesn't, then it will be necessary to create a new entry point.
>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>> But as said, for the time being, such dedicated entry point doesn't exist.
> 
> I agree, but current erofs still have to choose LZ4_decompress_fast...
> I will test these 2 apis later and will report if something is wrong as well.
> 
> [1] https://www.usenix.org/conference/atc19/presentation/gao
>     https://www.usenix.org/conference/atc19/technical-sessions
> 
> Thanks,
> Gao Xiang
> 
> 
>>>
>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>
>>>
>>> Y.
>>>
>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>
>>>> Hi Xiang:
>>>>
>>>> Thanks for your reply, I will have a stress test on my device later.
>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>
>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>> That might be a problem for X86 devices for current patch.
>>>>
>>>> Chenxi
>>>>
>>>> -----Original Message-----
>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>
>>>> Hi Chenxi,
>>>>
>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>> This change would be introduce 10% on decompress operation according
>>>>> to LZ4 benchmark result on X86 devices.
>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>
>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>
>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>
>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>
>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>
>>>> Thanks,
>>>> Gao Xiang
>>>>
>>>>>
>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>
>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>
>>>>> Benchmark via ZRAM:
>>>>>
>>>>> Test case:
>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>> --buffer_compress_percentage=75 \
>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>
>>>>> Patched:
>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>> Vanilla:
>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>
>>>>> Reference:
>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>
>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>> ---
>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>> +++++++++++++++++++++++++++++++++------
>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>
>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>> @@ -50,6 +50,131 @@
>>>>>  #define assert(condition) ((void)0)
>>>>>  #endif
>>>>>
>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>> +
>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>> +
>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>> +FORCE_INLINE void
>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>> +                          const size_t offset)
>>>>> +{
>>>>> +     if (offset < 8) {
>>>>> +             dstPtr[0] = srcPtr[0];
>>>>> +
>>>>> +             dstPtr[1] = srcPtr[1];
>>>>> +             dstPtr[2] = srcPtr[2];
>>>>> +             dstPtr[3] = srcPtr[3];
>>>>> +             srcPtr += inc32table[offset];
>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>> +             srcPtr -= dec64table[offset];
>>>>> +             dstPtr += 8;
>>>>> +     } else {
>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>> +             dstPtr += 8;
>>>>> +             srcPtr += 8;
>>>>> +     }
>>>>> +
>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>> +
>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>> +beyond dstEnd
>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>> +bytes)
>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>> +
>>>>> +     do {
>>>>> +             memcpy(d, s, 16);
>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>> +             d += 32;
>>>>> +             s += 32;
>>>>> +     } while (d < e);
>>>>> +}
>>>>> +
>>>>> +FORCE_INLINE void
>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>> +                     const size_t offset)
>>>>> +{
>>>>> +     BYTE v[8];
>>>>> +     switch (offset) {
>>>>> +
>>>>> +     case 1:
>>>>> +             memset(v, *srcPtr, 8);
>>>>> +             goto copy_loop;
>>>>> +     case 2:
>>>>> +             memcpy(v, srcPtr, 2);
>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>> +             goto copy_loop;
>>>>> +     case 4:
>>>>> +             memcpy(v, srcPtr, 4);
>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>> +             goto copy_loop;
>>>>> +     default:
>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>> +             return;
>>>>> +     }
>>>>> +
>>>>> +      copy_loop:
>>>>> +     memcpy(dstPtr, v, 8);
>>>>> +     dstPtr += 8;
>>>>> +     while (dstPtr < dstEnd) {
>>>>> +             memcpy(dstPtr, v, 8);
>>>>> +             dstPtr += 8;
>>>>> +     }
>>>>> +}
>>>>> +
>>>>> +/* Read the variable-length literal or match length.
>>>>> + *
>>>>> + * ip - pointer to use as input.
>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>> + */
>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>> +                                        const BYTE *lencheck,
>>>>> +                                        int loop_check, int initial_check,
>>>>> +                                        variable_length_error *error) {
>>>>> +     unsigned length = 0;
>>>>> +     unsigned s;
>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>> +             *error = initial_error;
>>>>> +             return length;
>>>>> +     }
>>>>> +     do {
>>>>> +             s = **ip;
>>>>> +             (*ip)++;
>>>>> +             length += s;
>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>> +                     *error = loop_error;
>>>>> +                     return length;
>>>>> +             }
>>>>> +     } while (s == 255);
>>>>> +
>>>>> +     return length;
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>  /*
>>>>>   * LZ4_decompress_generic() :
>>>>>   * This generic decompression function covers all use cases.
>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>        const size_t dictSize
>>>>>        )
>>>>>  {
>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>
>>>>>       BYTE *op = (BYTE *) dst;
>>>>> -     BYTE * const oend = op + outputSize;
>>>>> +     BYTE *const oend = op + outputSize;
>>>>>       BYTE *cpy;
>>>>>
>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>
>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>> KB)));
>>>>>
>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>       const BYTE *const shortiend = iend -
>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>       const BYTE *const shortoend = oend -
>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>> +
>>>>> +     const BYTE *match;
>>>>> +     size_t offset;
>>>>> +     unsigned int token;
>>>>> +     size_t length;
>>>>>
>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>                srcSize, outputSize);
>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>               return -1;
>>>>>
>>>>> -     /* Main Loop : decode sequences */
>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>> +             goto safe_decode;
>>>>> +     }
>>>>> +
>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>       while (1) {
>>>>> -             size_t length;
>>>>> -             const BYTE *match;
>>>>> -             size_t offset;
>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>> +             if (endOnInput) {
>>>>> +                     assert(ip < iend);
>>>>> +             }
>>>>> +             token = *ip++;
>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>> +
>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>> +
>>>>> +             /* decode literal length */
>>>>> +             if (length == RUN_MASK) {
>>>>> +                     variable_length_error error = ok;
>>>>> +                     length +=
>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>> +                                              endOnInput, endOnInput,
>>>>> +                                              &error);
>>>>> +                     if (error == initial_error) {
>>>>> +                             goto _output_error;
>>>>> +                     }
>>>>> +                     if ((safeDecode)
>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>> +                                     (uptrval) (op))) {
>>>>> +                             goto _output_error;
>>>>> +                     }       /* overflow detection */
>>>>> +                     if ((safeDecode)
>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>> +                                     (uptrval) (ip))) {
>>>>> +                             goto _output_error;
>>>>> +                     }
>>>>> +
>>>>> +                     /* overflow detection */
>>>>> +                     /* copy literals */
>>>>> +                     cpy = op + length;
>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>> +                             if ((cpy > oend - 32)
>>>>> +                                 || (ip + length > iend - 32)) {
>>>>> +                                     goto safe_literal_copy;
>>>>> +                             }
>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>> +                             if (cpy > oend - 8) {
>>>>> +                                     goto safe_literal_copy;
>>>>> +                             }
>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>> +                             /* properties */
>>>>> +                     }
>>>>> +                     ip += length;
>>>>> +                     op = cpy;
>>>>> +             } else {
>>>>> +                     cpy = op + length;
>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>> +                             DEBUGLOG(7,
>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>> +                                      (unsigned)length);
>>>>> +                             /* We don't need to check oend */
>>>>> +                             /* since we check it once for each loop below */
>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>> +                                     goto safe_literal_copy;
>>>>> +                             }
>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>> +                             /*if we copy by a register size */
>>>>> +                             memcpy(op, ip, 16);
>>>>> +                     } else {
>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>> +                             /* properties */
>>>>> +                             memcpy(op, ip, 8);
>>>>> +                             if (length > 8) {
>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>> +                             }
>>>>> +                     }
>>>>> +                     ip += length;
>>>>> +                     op = cpy;
>>>>> +             }
>>>>> +
>>>>> +             /* get offset */
>>>>> +             offset = LZ4_readLE16(ip);
>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>> +             match = op - offset;
>>>>> +
>>>>> +             /* get matchlength */
>>>>> +             length = token & ML_MASK;
>>>>>
>>>>> -             /* get literal length */
>>>>> -             unsigned int const token = *ip++;
>>>>> -             length = token>>ML_BITS;
>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>> +                     goto _output_error;
>>>>> +             }
>>>>> +             /* Error : offset outside buffers */
>>>>> +             if (length == ML_MASK) {
>>>>> +                     variable_length_error error = ok;
>>>>> +                     length +=
>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>> +                                              endOnInput, 0, &error);
>>>>> +                     if (error != ok) {
>>>>> +                             goto _output_error;
>>>>> +                     }
>>>>> +                     if ((safeDecode)
>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>> +                             goto _output_error;
>>>>> +                     }       /* overflow detection */
>>>>> +                     length += MINMATCH;
>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>> +                             goto safe_match_copy;
>>>>> +                     }
>>>>> +             } else {
>>>>> +                     length += MINMATCH;
>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>> +                             goto safe_match_copy;
>>>>> +                     }
>>>>> +
>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>> +                             if (offset >= 8) {
>>>>> +                                     memcpy(op, match, 8);
>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>> +                                     op += length;
>>>>> +                                     continue;
>>>>> +                             }
>>>>> +                     }
>>>>> +             }
>>>>> +
>>>>> +             /* match starting within external dictionary */
>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>> +                             if (partialDecoding) {
>>>>> +                                     /* reach end of buffer */
>>>>> +                                     length =
>>>>> +                                         min(length, (size_t) (oend - op));
>>>>> +                             } else {
>>>>> +                                     /* end-of-block condition violated */
>>>>> +                                     goto _output_error;
>>>>> +                             }
>>>>> +                     }
>>>>> +
>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>> +                             op += length;
>>>>> +                     } else {
>>>>> +                             /* match stretches into both external dict and current block */
>>>>> +                             size_t const copySize =
>>>>> +                                 (size_t) (lowPrefix - match);
>>>>> +                             size_t const restSize = length - copySize;
>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>> +                             op += copySize;
>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>> +                                     while (op < endOfMatch) {
>>>>> +                                             *op++ = *copyFrom++;
>>>>> +                                     }
>>>>> +                             } else {
>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>> +                                     op += restSize;
>>>>> +                             }
>>>>> +                     }
>>>>> +                     continue;
>>>>> +             }
>>>>> +
>>>>> +             /* copy match within block */
>>>>> +             cpy = op + length;
>>>>> +
>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>> +             if (unlikely(offset < 16)) {
>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>> +             } else {
>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>> +             }
>>>>> +
>>>>> +             op = cpy;       /* wildcopy correction */
>>>>> +     }
>>>>> +      safe_decode:
>>>>> +#endif
>>>>> +     /* Main Loop : decode sequences */
>>>>> +     while (1) {
>>>>> +             length = token >> ML_BITS;
>>>>>
>>>>>               /* ip < iend before the increment */
>>>>>               assert(!endOnInput || ip <= iend);
>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>                * combined check for both stages).
>>>>>                */
>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>> -                /*
>>>>> -                 * strictly "less than" on input, to re-enter
>>>>> -                 * the loop with at least one byte
>>>>> -                 */
>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>> -                          (op <= shortoend))) {
>>>>> +                 /*
>>>>> +                  * strictly "less than" on input, to re-enter
>>>>> +                  * the loop with at least one byte
>>>>> +                  */
>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>> +                           (op <= shortoend))) {
>>>>>                       /* Copy the literals */
>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>> -                     op += length; ip += length;
>>>>> +                     op += length;
>>>>> +                     ip += length;
>>>>>
>>>>>                       /*
>>>>>                        * The second stage:
>>>>>                        * prepare for match copying, decode full info.
>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>                        */
>>>>> -                     length = token & ML_MASK; /* match length */
>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>                       offset = LZ4_readLE16(ip);
>>>>>                       ip += 2;
>>>>>                       match = op - offset;
>>>>> -                     assert(match <= op); /* check overflow */
>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>
>>>>>                       /* Do not deal with overlapping matches. */
>>>>>                       if ((length != ML_MASK) &&
>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>
>>>>>               /* decode literal length */
>>>>>               if (length == RUN_MASK) {
>>>>> -                     unsigned int s;
>>>>>
>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>> -                             /* overflow detection */
>>>>> +                     variable_length_error error = ok;
>>>>> +                     length +=
>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>> +                                              endOnInput, endOnInput,
>>>>> +                                              &error);
>>>>> +                     if (error == initial_error)
>>>>>                               goto _output_error;
>>>>> -                     }
>>>>> -                     do {
>>>>> -                             s = *ip++;
>>>>> -                             length += s;
>>>>> -                     } while (likely(endOnInput
>>>>> -                             ? ip < iend - RUN_MASK
>>>>> -                             : 1) & (s == 255));
>>>>>
>>>>>                       if ((safeDecode)
>>>>> -                         && unlikely((uptrval)(op) +
>>>>> -                                     length < (uptrval)(op))) {
>>>>> +                         && unlikely((uptrval) (op) +
>>>>> +                                     length < (uptrval) (op))) {
>>>>>                               /* overflow detection */
>>>>>                               goto _output_error;
>>>>>                       }
>>>>>                       if ((safeDecode)
>>>>> -                         && unlikely((uptrval)(ip) +
>>>>> -                                     length < (uptrval)(ip))) {
>>>>> +                         && unlikely((uptrval) (ip) +
>>>>> +                                     length < (uptrval) (ip))) {
>>>>>                               /* overflow detection */
>>>>>                               goto _output_error;
>>>>>                       }
>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>
>>>>>               /* copy literals */
>>>>>               cpy = op + length;
>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>> +           safe_literal_copy:
>>>>> +#endif
>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>
>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>> +                                   || (ip + length >
>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>                       if (partialDecoding) {
>>>>>                               if (cpy > oend) {
>>>>>                                       /*
>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>                                       length = oend - op;
>>>>>                               }
>>>>>                               if ((endOnInput)
>>>>> -                                     && (ip + length > iend)) {
>>>>> +                                 && (ip + length > iend)) {
>>>>>                                       /*
>>>>>                                        * Error :
>>>>>                                        * read attempt beyond
>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>                               }
>>>>>                       } else {
>>>>>                               if ((!endOnInput)
>>>>> -                                     && (cpy != oend)) {
>>>>> +                                 && (cpy != oend)) {
>>>>>                                       /*
>>>>>                                        * Error :
>>>>>                                        * block decoding must
>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>                                       goto _output_error;
>>>>>                               }
>>>>>                               if ((endOnInput)
>>>>> -                                     && ((ip + length != iend)
>>>>> +                                 && ((ip + length != iend)
>>>>>                                       || (cpy > oend))) {
>>>>>                                       /*
>>>>>                                        * Error :
>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>                       goto _output_error;
>>>>>               }
>>>>>
>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>> -             /*
>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>> -              * at least 4 bytes remain available in output buffer
>>>>> -              */
>>>>> -             if (!partialDecoding) {
>>>>> -                     assert(oend > op);
>>>>> -                     assert(oend - op >= 4);
>>>>> -
>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>> -             }
>>>>> -
>>>>>               if (length == ML_MASK) {
>>>>> -                     unsigned int s;
>>>>> -
>>>>> -                     do {
>>>>> -                             s = *ip++;
>>>>> -
>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>> -                                     goto _output_error;
>>>>>
>>>>> -                             length += s;
>>>>> -                     } while (s == 255);
>>>>> +                     variable_length_error error = ok;
>>>>> +                     length +=
>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>> +                                              endOnInput, 0, &error);
>>>>> +                     if (error != ok)
>>>>> +                             goto _output_error;
>>>>>
>>>>>                       if ((safeDecode)
>>>>>                               && unlikely(
>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>
>>>>>               length += MINMATCH;
>>>>>
>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>> +safe_match_copy:
>>>>> +#endif
>>>>> +
>>>>>               /* match starting within external dictionary */
>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-15 23:48           ` Mao, Chenxi
@ 2019-05-16  1:35             ` Gao Xiang
  2019-05-16  2:07               ` Mao, Chenxi
  2019-05-16  2:11               ` Mao, Chenxi
  0 siblings, 2 replies; 16+ messages in thread
From: Gao Xiang @ 2019-05-16  1:35 UTC (permalink / raw)
  To: Mao, Chenxi
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman



On 2019/5/16 7:48, Mao, Chenxi wrote:
> Hi Yann and Xiang:
> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
> Here is my cherry-pick list:
> 2589c44 created LZ4_FAST_DEC_LOOP build macro
> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
> 5d7d116 decompress_generic: Limit fastpath to x86
> 75fb878 decompress_generic: Add fastpath for small offsets
> faac110 decompress_generic: Unroll loops a bit more
> 1fbaf84 decompress_generic: remove msan write
> 28b8249 decompress_generic: re-add fastpath
> 232f1e2 decompress_generic: drop partial copy check in fast loop
> 59332a3 decompress_generic: Optimize literal copies
> 5dfa7d4 decompress_generic: optimize match copy
> 28356e0 decompress_generic: Add a loop fastpath
> 4da3360 decompress_generic: Refactor variable length fields
> 
> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.

The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
You should keep LZ4_decompress_generic from all known issues.

> 
> @Xiang:
> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?

I think you should pick all patches which are related to LZ4_decompress_generic()
and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.

Thanks,
Gao Xiang

> 
> Chenxi
> 
> 
> On 5/16/19 1:47 AM, Gao Xiang wrote:
>> Hi Yann,
>>
>> On 2019/5/16 1:03, Cyan wrote:
>>> Re-posted,
>>> it seems the previous message was rejected by the linux-kernel server
>>> due to some kind of format limitation (no html).
>>>
>>>
>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>
>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>> This was fixed in v1.9.1.
>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>
>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>> The deprecation warning is meant to "scare away" such usages.
>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>
>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>
>>>> LZ4_decompress_fast() still has some possible usage :
>>>> it doesn't need the compressed size.
>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>
>> Yes, that is my real concern...
>>
>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>
>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>> beta test and real consumers...
>>
>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>
>> But anyway, the patch looks good to me since it has obvious performance gain...
>>
>>>>
>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>
>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>> I will test these 2 apis later and will report if something is wrong as well.
>>
>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>     https://www.usenix.org/conference/atc19/technical-sessions
>>
>> Thanks,
>> Gao Xiang
>>
>>
>>>>
>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>
>>>>
>>>> Y.
>>>>
>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>
>>>>> Hi Xiang:
>>>>>
>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>
>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>> That might be a problem for X86 devices for current patch.
>>>>>
>>>>> Chenxi
>>>>>
>>>>> -----Original Message-----
>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>
>>>>> Hi Chenxi,
>>>>>
>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>> This change would be introduce 10% on decompress operation according
>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>
>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>
>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>
>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>
>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>
>>>>> Thanks,
>>>>> Gao Xiang
>>>>>
>>>>>>
>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>
>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>
>>>>>> Benchmark via ZRAM:
>>>>>>
>>>>>> Test case:
>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>> --buffer_compress_percentage=75 \
>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>
>>>>>> Patched:
>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>> Vanilla:
>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>
>>>>>> Reference:
>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>
>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>> ---
>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>
>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>> @@ -50,6 +50,131 @@
>>>>>>  #define assert(condition) ((void)0)
>>>>>>  #endif
>>>>>>
>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>> +
>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>> +
>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>> +FORCE_INLINE void
>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>> +                          const size_t offset)
>>>>>> +{
>>>>>> +     if (offset < 8) {
>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>> +
>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>> +             srcPtr += inc32table[offset];
>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>> +             srcPtr -= dec64table[offset];
>>>>>> +             dstPtr += 8;
>>>>>> +     } else {
>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>> +             dstPtr += 8;
>>>>>> +             srcPtr += 8;
>>>>>> +     }
>>>>>> +
>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>> +
>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>> +beyond dstEnd
>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>> +bytes)
>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>> +
>>>>>> +     do {
>>>>>> +             memcpy(d, s, 16);
>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>> +             d += 32;
>>>>>> +             s += 32;
>>>>>> +     } while (d < e);
>>>>>> +}
>>>>>> +
>>>>>> +FORCE_INLINE void
>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>> +                     const size_t offset)
>>>>>> +{
>>>>>> +     BYTE v[8];
>>>>>> +     switch (offset) {
>>>>>> +
>>>>>> +     case 1:
>>>>>> +             memset(v, *srcPtr, 8);
>>>>>> +             goto copy_loop;
>>>>>> +     case 2:
>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>> +             goto copy_loop;
>>>>>> +     case 4:
>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>> +             goto copy_loop;
>>>>>> +     default:
>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>> +             return;
>>>>>> +     }
>>>>>> +
>>>>>> +      copy_loop:
>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>> +     dstPtr += 8;
>>>>>> +     while (dstPtr < dstEnd) {
>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>> +             dstPtr += 8;
>>>>>> +     }
>>>>>> +}
>>>>>> +
>>>>>> +/* Read the variable-length literal or match length.
>>>>>> + *
>>>>>> + * ip - pointer to use as input.
>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>> + */
>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>> +                                        const BYTE *lencheck,
>>>>>> +                                        int loop_check, int initial_check,
>>>>>> +                                        variable_length_error *error) {
>>>>>> +     unsigned length = 0;
>>>>>> +     unsigned s;
>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>> +             *error = initial_error;
>>>>>> +             return length;
>>>>>> +     }
>>>>>> +     do {
>>>>>> +             s = **ip;
>>>>>> +             (*ip)++;
>>>>>> +             length += s;
>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>> +                     *error = loop_error;
>>>>>> +                     return length;
>>>>>> +             }
>>>>>> +     } while (s == 255);
>>>>>> +
>>>>>> +     return length;
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>>>  /*
>>>>>>   * LZ4_decompress_generic() :
>>>>>>   * This generic decompression function covers all use cases.
>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>        const size_t dictSize
>>>>>>        )
>>>>>>  {
>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>
>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>       BYTE *cpy;
>>>>>>
>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>
>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>> KB)));
>>>>>>
>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>       const BYTE *const shortiend = iend -
>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>       const BYTE *const shortoend = oend -
>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>> +
>>>>>> +     const BYTE *match;
>>>>>> +     size_t offset;
>>>>>> +     unsigned int token;
>>>>>> +     size_t length;
>>>>>>
>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>                srcSize, outputSize);
>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>               return -1;
>>>>>>
>>>>>> -     /* Main Loop : decode sequences */
>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>> +             goto safe_decode;
>>>>>> +     }
>>>>>> +
>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>       while (1) {
>>>>>> -             size_t length;
>>>>>> -             const BYTE *match;
>>>>>> -             size_t offset;
>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>> +             if (endOnInput) {
>>>>>> +                     assert(ip < iend);
>>>>>> +             }
>>>>>> +             token = *ip++;
>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>> +
>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>> +
>>>>>> +             /* decode literal length */
>>>>>> +             if (length == RUN_MASK) {
>>>>>> +                     variable_length_error error = ok;
>>>>>> +                     length +=
>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>> +                                              endOnInput, endOnInput,
>>>>>> +                                              &error);
>>>>>> +                     if (error == initial_error) {
>>>>>> +                             goto _output_error;
>>>>>> +                     }
>>>>>> +                     if ((safeDecode)
>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>> +                                     (uptrval) (op))) {
>>>>>> +                             goto _output_error;
>>>>>> +                     }       /* overflow detection */
>>>>>> +                     if ((safeDecode)
>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>> +                                     (uptrval) (ip))) {
>>>>>> +                             goto _output_error;
>>>>>> +                     }
>>>>>> +
>>>>>> +                     /* overflow detection */
>>>>>> +                     /* copy literals */
>>>>>> +                     cpy = op + length;
>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>> +                             if ((cpy > oend - 32)
>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>> +                                     goto safe_literal_copy;
>>>>>> +                             }
>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>> +                             if (cpy > oend - 8) {
>>>>>> +                                     goto safe_literal_copy;
>>>>>> +                             }
>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>> +                             /* properties */
>>>>>> +                     }
>>>>>> +                     ip += length;
>>>>>> +                     op = cpy;
>>>>>> +             } else {
>>>>>> +                     cpy = op + length;
>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>> +                             DEBUGLOG(7,
>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>> +                                      (unsigned)length);
>>>>>> +                             /* We don't need to check oend */
>>>>>> +                             /* since we check it once for each loop below */
>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>> +                                     goto safe_literal_copy;
>>>>>> +                             }
>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>> +                             /*if we copy by a register size */
>>>>>> +                             memcpy(op, ip, 16);
>>>>>> +                     } else {
>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>> +                             /* properties */
>>>>>> +                             memcpy(op, ip, 8);
>>>>>> +                             if (length > 8) {
>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>> +                             }
>>>>>> +                     }
>>>>>> +                     ip += length;
>>>>>> +                     op = cpy;
>>>>>> +             }
>>>>>> +
>>>>>> +             /* get offset */
>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>> +             match = op - offset;
>>>>>> +
>>>>>> +             /* get matchlength */
>>>>>> +             length = token & ML_MASK;
>>>>>>
>>>>>> -             /* get literal length */
>>>>>> -             unsigned int const token = *ip++;
>>>>>> -             length = token>>ML_BITS;
>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>> +                     goto _output_error;
>>>>>> +             }
>>>>>> +             /* Error : offset outside buffers */
>>>>>> +             if (length == ML_MASK) {
>>>>>> +                     variable_length_error error = ok;
>>>>>> +                     length +=
>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>> +                                              endOnInput, 0, &error);
>>>>>> +                     if (error != ok) {
>>>>>> +                             goto _output_error;
>>>>>> +                     }
>>>>>> +                     if ((safeDecode)
>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>> +                             goto _output_error;
>>>>>> +                     }       /* overflow detection */
>>>>>> +                     length += MINMATCH;
>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>> +                             goto safe_match_copy;
>>>>>> +                     }
>>>>>> +             } else {
>>>>>> +                     length += MINMATCH;
>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>> +                             goto safe_match_copy;
>>>>>> +                     }
>>>>>> +
>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>> +                             if (offset >= 8) {
>>>>>> +                                     memcpy(op, match, 8);
>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>> +                                     op += length;
>>>>>> +                                     continue;
>>>>>> +                             }
>>>>>> +                     }
>>>>>> +             }
>>>>>> +
>>>>>> +             /* match starting within external dictionary */
>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>> +                             if (partialDecoding) {
>>>>>> +                                     /* reach end of buffer */
>>>>>> +                                     length =
>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>> +                             } else {
>>>>>> +                                     /* end-of-block condition violated */
>>>>>> +                                     goto _output_error;
>>>>>> +                             }
>>>>>> +                     }
>>>>>> +
>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>> +                             op += length;
>>>>>> +                     } else {
>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>> +                             size_t const copySize =
>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>> +                             size_t const restSize = length - copySize;
>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>> +                             op += copySize;
>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>> +                                     while (op < endOfMatch) {
>>>>>> +                                             *op++ = *copyFrom++;
>>>>>> +                                     }
>>>>>> +                             } else {
>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>> +                                     op += restSize;
>>>>>> +                             }
>>>>>> +                     }
>>>>>> +                     continue;
>>>>>> +             }
>>>>>> +
>>>>>> +             /* copy match within block */
>>>>>> +             cpy = op + length;
>>>>>> +
>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>> +             if (unlikely(offset < 16)) {
>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>> +             } else {
>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>> +             }
>>>>>> +
>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>> +     }
>>>>>> +      safe_decode:
>>>>>> +#endif
>>>>>> +     /* Main Loop : decode sequences */
>>>>>> +     while (1) {
>>>>>> +             length = token >> ML_BITS;
>>>>>>
>>>>>>               /* ip < iend before the increment */
>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>                * combined check for both stages).
>>>>>>                */
>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>> -                /*
>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>> -                 * the loop with at least one byte
>>>>>> -                 */
>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>> -                          (op <= shortoend))) {
>>>>>> +                 /*
>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>> +                  * the loop with at least one byte
>>>>>> +                  */
>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>> +                           (op <= shortoend))) {
>>>>>>                       /* Copy the literals */
>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>> -                     op += length; ip += length;
>>>>>> +                     op += length;
>>>>>> +                     ip += length;
>>>>>>
>>>>>>                       /*
>>>>>>                        * The second stage:
>>>>>>                        * prepare for match copying, decode full info.
>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>                        */
>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>                       ip += 2;
>>>>>>                       match = op - offset;
>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>
>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>                       if ((length != ML_MASK) &&
>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>
>>>>>>               /* decode literal length */
>>>>>>               if (length == RUN_MASK) {
>>>>>> -                     unsigned int s;
>>>>>>
>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>> -                             /* overflow detection */
>>>>>> +                     variable_length_error error = ok;
>>>>>> +                     length +=
>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>> +                                              endOnInput, endOnInput,
>>>>>> +                                              &error);
>>>>>> +                     if (error == initial_error)
>>>>>>                               goto _output_error;
>>>>>> -                     }
>>>>>> -                     do {
>>>>>> -                             s = *ip++;
>>>>>> -                             length += s;
>>>>>> -                     } while (likely(endOnInput
>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>> -                             : 1) & (s == 255));
>>>>>>
>>>>>>                       if ((safeDecode)
>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>> -                                     length < (uptrval)(op))) {
>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>                               /* overflow detection */
>>>>>>                               goto _output_error;
>>>>>>                       }
>>>>>>                       if ((safeDecode)
>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>                               /* overflow detection */
>>>>>>                               goto _output_error;
>>>>>>                       }
>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>
>>>>>>               /* copy literals */
>>>>>>               cpy = op + length;
>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>> +           safe_literal_copy:
>>>>>> +#endif
>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>
>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>> +                                   || (ip + length >
>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>                       if (partialDecoding) {
>>>>>>                               if (cpy > oend) {
>>>>>>                                       /*
>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>                                       length = oend - op;
>>>>>>                               }
>>>>>>                               if ((endOnInput)
>>>>>> -                                     && (ip + length > iend)) {
>>>>>> +                                 && (ip + length > iend)) {
>>>>>>                                       /*
>>>>>>                                        * Error :
>>>>>>                                        * read attempt beyond
>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>                               }
>>>>>>                       } else {
>>>>>>                               if ((!endOnInput)
>>>>>> -                                     && (cpy != oend)) {
>>>>>> +                                 && (cpy != oend)) {
>>>>>>                                       /*
>>>>>>                                        * Error :
>>>>>>                                        * block decoding must
>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>                                       goto _output_error;
>>>>>>                               }
>>>>>>                               if ((endOnInput)
>>>>>> -                                     && ((ip + length != iend)
>>>>>> +                                 && ((ip + length != iend)
>>>>>>                                       || (cpy > oend))) {
>>>>>>                                       /*
>>>>>>                                        * Error :
>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>                       goto _output_error;
>>>>>>               }
>>>>>>
>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>> -             /*
>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>> -              */
>>>>>> -             if (!partialDecoding) {
>>>>>> -                     assert(oend > op);
>>>>>> -                     assert(oend - op >= 4);
>>>>>> -
>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>> -             }
>>>>>> -
>>>>>>               if (length == ML_MASK) {
>>>>>> -                     unsigned int s;
>>>>>> -
>>>>>> -                     do {
>>>>>> -                             s = *ip++;
>>>>>> -
>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>> -                                     goto _output_error;
>>>>>>
>>>>>> -                             length += s;
>>>>>> -                     } while (s == 255);
>>>>>> +                     variable_length_error error = ok;
>>>>>> +                     length +=
>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>> +                                              endOnInput, 0, &error);
>>>>>> +                     if (error != ok)
>>>>>> +                             goto _output_error;
>>>>>>
>>>>>>                       if ((safeDecode)
>>>>>>                               && unlikely(
>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>
>>>>>>               length += MINMATCH;
>>>>>>
>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>> +safe_match_copy:
>>>>>> +#endif
>>>>>> +
>>>>>>               /* match starting within external dictionary */
>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-16  1:35             ` Gao Xiang
@ 2019-05-16  2:07               ` Mao, Chenxi
  2019-05-16  2:10                 ` Gao Xiang
  2019-05-16  2:11               ` Mao, Chenxi
  1 sibling, 1 reply; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-16  2:07 UTC (permalink / raw)
  To: Gao Xiang
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

Hi Xiang:

I checked the deliver history.
There is only below delivery related with lz4.c
Pull request:616
4e3accc Fix Dict Size Test in `LZ4_compress_fast_continue()`
535636f Don't Attach Very Small Dictionaries
This 2 changes seems like ONLY bug fixes for dictionary fix baesd on 1.8.3

Based on current investigation result, I think pick decompress patches is safe for v1.8.3

If there is any misunderstanding or faults, please feel free to let me know.

Chenxi

On 5/16/19 9:35 AM, Gao Xiang wrote:
> 
> 
> On 2019/5/16 7:48, Mao, Chenxi wrote:
>> Hi Yann and Xiang:
>> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
>> Here is my cherry-pick list:
>> 2589c44 created LZ4_FAST_DEC_LOOP build macro
>> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
>> 5d7d116 decompress_generic: Limit fastpath to x86
>> 75fb878 decompress_generic: Add fastpath for small offsets
>> faac110 decompress_generic: Unroll loops a bit more
>> 1fbaf84 decompress_generic: remove msan write
>> 28b8249 decompress_generic: re-add fastpath
>> 232f1e2 decompress_generic: drop partial copy check in fast loop
>> 59332a3 decompress_generic: Optimize literal copies
>> 5dfa7d4 decompress_generic: optimize match copy
>> 28356e0 decompress_generic: Add a loop fastpath
>> 4da3360 decompress_generic: Refactor variable length fields
>>
>> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.
> 
> The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
> You should keep LZ4_decompress_generic from all known issues.
> 
>>
>> @Xiang:
>> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?
> 
> I think you should pick all patches which are related to LZ4_decompress_generic()
> and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.
> 
> Thanks,
> Gao Xiang
> 
>>
>> Chenxi
>>
>>
>> On 5/16/19 1:47 AM, Gao Xiang wrote:
>>> Hi Yann,
>>>
>>> On 2019/5/16 1:03, Cyan wrote:
>>>> Re-posted,
>>>> it seems the previous message was rejected by the linux-kernel server
>>>> due to some kind of format limitation (no html).
>>>>
>>>>
>>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>>
>>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>>> This was fixed in v1.9.1.
>>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>>
>>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>>> The deprecation warning is meant to "scare away" such usages.
>>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>>
>>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>>
>>>>> LZ4_decompress_fast() still has some possible usage :
>>>>> it doesn't need the compressed size.
>>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>>
>>> Yes, that is my real concern...
>>>
>>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>>
>>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>>> beta test and real consumers...
>>>
>>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>>
>>> But anyway, the patch looks good to me since it has obvious performance gain...
>>>
>>>>>
>>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>>
>>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>>> I will test these 2 apis later and will report if something is wrong as well.
>>>
>>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>>     https://www.usenix.org/conference/atc19/technical-sessions
>>>
>>> Thanks,
>>> Gao Xiang
>>>
>>>
>>>>>
>>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>>
>>>>>
>>>>> Y.
>>>>>
>>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>>
>>>>>> Hi Xiang:
>>>>>>
>>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>>
>>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>>> That might be a problem for X86 devices for current patch.
>>>>>>
>>>>>> Chenxi
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>>
>>>>>> Hi Chenxi,
>>>>>>
>>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>>> This change would be introduce 10% on decompress operation according
>>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>>
>>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>>
>>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>>
>>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>>
>>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>>
>>>>>> Thanks,
>>>>>> Gao Xiang
>>>>>>
>>>>>>>
>>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>>
>>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>>
>>>>>>> Benchmark via ZRAM:
>>>>>>>
>>>>>>> Test case:
>>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>>> --buffer_compress_percentage=75 \
>>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>>
>>>>>>> Patched:
>>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>>> Vanilla:
>>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>>
>>>>>>> Reference:
>>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>>
>>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>>> ---
>>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>>
>>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>>> @@ -50,6 +50,131 @@
>>>>>>>  #define assert(condition) ((void)0)
>>>>>>>  #endif
>>>>>>>
>>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>>> +
>>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>>> +
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>>> +FORCE_INLINE void
>>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>>> +                          const size_t offset)
>>>>>>> +{
>>>>>>> +     if (offset < 8) {
>>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>>> +
>>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>>> +             srcPtr += inc32table[offset];
>>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>>> +             srcPtr -= dec64table[offset];
>>>>>>> +             dstPtr += 8;
>>>>>>> +     } else {
>>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>>> +             dstPtr += 8;
>>>>>>> +             srcPtr += 8;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>>> +
>>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>>> +beyond dstEnd
>>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>>> +bytes)
>>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>>> +
>>>>>>> +     do {
>>>>>>> +             memcpy(d, s, 16);
>>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>>> +             d += 32;
>>>>>>> +             s += 32;
>>>>>>> +     } while (d < e);
>>>>>>> +}
>>>>>>> +
>>>>>>> +FORCE_INLINE void
>>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>>> +                     const size_t offset)
>>>>>>> +{
>>>>>>> +     BYTE v[8];
>>>>>>> +     switch (offset) {
>>>>>>> +
>>>>>>> +     case 1:
>>>>>>> +             memset(v, *srcPtr, 8);
>>>>>>> +             goto copy_loop;
>>>>>>> +     case 2:
>>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>>> +             goto copy_loop;
>>>>>>> +     case 4:
>>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>>> +             goto copy_loop;
>>>>>>> +     default:
>>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>>> +             return;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +      copy_loop:
>>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>>> +     dstPtr += 8;
>>>>>>> +     while (dstPtr < dstEnd) {
>>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>>> +             dstPtr += 8;
>>>>>>> +     }
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* Read the variable-length literal or match length.
>>>>>>> + *
>>>>>>> + * ip - pointer to use as input.
>>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>>> + */
>>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>>> +                                        const BYTE *lencheck,
>>>>>>> +                                        int loop_check, int initial_check,
>>>>>>> +                                        variable_length_error *error) {
>>>>>>> +     unsigned length = 0;
>>>>>>> +     unsigned s;
>>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>>> +             *error = initial_error;
>>>>>>> +             return length;
>>>>>>> +     }
>>>>>>> +     do {
>>>>>>> +             s = **ip;
>>>>>>> +             (*ip)++;
>>>>>>> +             length += s;
>>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>>> +                     *error = loop_error;
>>>>>>> +                     return length;
>>>>>>> +             }
>>>>>>> +     } while (s == 255);
>>>>>>> +
>>>>>>> +     return length;
>>>>>>> +}
>>>>>>> +#endif
>>>>>>> +
>>>>>>>  /*
>>>>>>>   * LZ4_decompress_generic() :
>>>>>>>   * This generic decompression function covers all use cases.
>>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>        const size_t dictSize
>>>>>>>        )
>>>>>>>  {
>>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>>
>>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>>       BYTE *cpy;
>>>>>>>
>>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>
>>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>>> KB)));
>>>>>>>
>>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>>       const BYTE *const shortiend = iend -
>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>       const BYTE *const shortoend = oend -
>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>> +
>>>>>>> +     const BYTE *match;
>>>>>>> +     size_t offset;
>>>>>>> +     unsigned int token;
>>>>>>> +     size_t length;
>>>>>>>
>>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>>                srcSize, outputSize);
>>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>>               return -1;
>>>>>>>
>>>>>>> -     /* Main Loop : decode sequences */
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>>> +             goto safe_decode;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>>       while (1) {
>>>>>>> -             size_t length;
>>>>>>> -             const BYTE *match;
>>>>>>> -             size_t offset;
>>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>>> +             if (endOnInput) {
>>>>>>> +                     assert(ip < iend);
>>>>>>> +             }
>>>>>>> +             token = *ip++;
>>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>>> +
>>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>>> +
>>>>>>> +             /* decode literal length */
>>>>>>> +             if (length == RUN_MASK) {
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>> +                                              &error);
>>>>>>> +                     if (error == initial_error) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>>> +                                     (uptrval) (op))) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }       /* overflow detection */
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>>> +                                     (uptrval) (ip))) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     /* overflow detection */
>>>>>>> +                     /* copy literals */
>>>>>>> +                     cpy = op + length;
>>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>> +                             if ((cpy > oend - 32)
>>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>>> +                             if (cpy > oend - 8) {
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>>> +                             /* properties */
>>>>>>> +                     }
>>>>>>> +                     ip += length;
>>>>>>> +                     op = cpy;
>>>>>>> +             } else {
>>>>>>> +                     cpy = op + length;
>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>> +                             DEBUGLOG(7,
>>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>>> +                                      (unsigned)length);
>>>>>>> +                             /* We don't need to check oend */
>>>>>>> +                             /* since we check it once for each loop below */
>>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>>> +                             /*if we copy by a register size */
>>>>>>> +                             memcpy(op, ip, 16);
>>>>>>> +                     } else {
>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>>> +                             /* properties */
>>>>>>> +                             memcpy(op, ip, 8);
>>>>>>> +                             if (length > 8) {
>>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +                     ip += length;
>>>>>>> +                     op = cpy;
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* get offset */
>>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>>> +             match = op - offset;
>>>>>>> +
>>>>>>> +             /* get matchlength */
>>>>>>> +             length = token & ML_MASK;
>>>>>>>
>>>>>>> -             /* get literal length */
>>>>>>> -             unsigned int const token = *ip++;
>>>>>>> -             length = token>>ML_BITS;
>>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>>> +                     goto _output_error;
>>>>>>> +             }
>>>>>>> +             /* Error : offset outside buffers */
>>>>>>> +             if (length == ML_MASK) {
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>> +                     if (error != ok) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }       /* overflow detection */
>>>>>>> +                     length += MINMATCH;
>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +                             goto safe_match_copy;
>>>>>>> +                     }
>>>>>>> +             } else {
>>>>>>> +                     length += MINMATCH;
>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +                             goto safe_match_copy;
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>>> +                             if (offset >= 8) {
>>>>>>> +                                     memcpy(op, match, 8);
>>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>>> +                                     op += length;
>>>>>>> +                                     continue;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* match starting within external dictionary */
>>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>> +                             if (partialDecoding) {
>>>>>>> +                                     /* reach end of buffer */
>>>>>>> +                                     length =
>>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>>> +                             } else {
>>>>>>> +                                     /* end-of-block condition violated */
>>>>>>> +                                     goto _output_error;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>>> +                             op += length;
>>>>>>> +                     } else {
>>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>>> +                             size_t const copySize =
>>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>>> +                             size_t const restSize = length - copySize;
>>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>>> +                             op += copySize;
>>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>>> +                                     while (op < endOfMatch) {
>>>>>>> +                                             *op++ = *copyFrom++;
>>>>>>> +                                     }
>>>>>>> +                             } else {
>>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>>> +                                     op += restSize;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +                     continue;
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* copy match within block */
>>>>>>> +             cpy = op + length;
>>>>>>> +
>>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>>> +             if (unlikely(offset < 16)) {
>>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>>> +             } else {
>>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>>> +     }
>>>>>>> +      safe_decode:
>>>>>>> +#endif
>>>>>>> +     /* Main Loop : decode sequences */
>>>>>>> +     while (1) {
>>>>>>> +             length = token >> ML_BITS;
>>>>>>>
>>>>>>>               /* ip < iend before the increment */
>>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                * combined check for both stages).
>>>>>>>                */
>>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>>> -                /*
>>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>>> -                 * the loop with at least one byte
>>>>>>> -                 */
>>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>> -                          (op <= shortoend))) {
>>>>>>> +                 /*
>>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>>> +                  * the loop with at least one byte
>>>>>>> +                  */
>>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>> +                           (op <= shortoend))) {
>>>>>>>                       /* Copy the literals */
>>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>>> -                     op += length; ip += length;
>>>>>>> +                     op += length;
>>>>>>> +                     ip += length;
>>>>>>>
>>>>>>>                       /*
>>>>>>>                        * The second stage:
>>>>>>>                        * prepare for match copying, decode full info.
>>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>>                        */
>>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>>                       ip += 2;
>>>>>>>                       match = op - offset;
>>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>>
>>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>>                       if ((length != ML_MASK) &&
>>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               /* decode literal length */
>>>>>>>               if (length == RUN_MASK) {
>>>>>>> -                     unsigned int s;
>>>>>>>
>>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>>> -                             /* overflow detection */
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>> +                                              &error);
>>>>>>> +                     if (error == initial_error)
>>>>>>>                               goto _output_error;
>>>>>>> -                     }
>>>>>>> -                     do {
>>>>>>> -                             s = *ip++;
>>>>>>> -                             length += s;
>>>>>>> -                     } while (likely(endOnInput
>>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>>> -                             : 1) & (s == 255));
>>>>>>>
>>>>>>>                       if ((safeDecode)
>>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>>> -                                     length < (uptrval)(op))) {
>>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>>                               /* overflow detection */
>>>>>>>                               goto _output_error;
>>>>>>>                       }
>>>>>>>                       if ((safeDecode)
>>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>>                               /* overflow detection */
>>>>>>>                               goto _output_error;
>>>>>>>                       }
>>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               /* copy literals */
>>>>>>>               cpy = op + length;
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +           safe_literal_copy:
>>>>>>> +#endif
>>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>
>>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>> +                                   || (ip + length >
>>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>                       if (partialDecoding) {
>>>>>>>                               if (cpy > oend) {
>>>>>>>                                       /*
>>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                                       length = oend - op;
>>>>>>>                               }
>>>>>>>                               if ((endOnInput)
>>>>>>> -                                     && (ip + length > iend)) {
>>>>>>> +                                 && (ip + length > iend)) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>>                                        * read attempt beyond
>>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                               }
>>>>>>>                       } else {
>>>>>>>                               if ((!endOnInput)
>>>>>>> -                                     && (cpy != oend)) {
>>>>>>> +                                 && (cpy != oend)) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>>                                        * block decoding must
>>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                                       goto _output_error;
>>>>>>>                               }
>>>>>>>                               if ((endOnInput)
>>>>>>> -                                     && ((ip + length != iend)
>>>>>>> +                                 && ((ip + length != iend)
>>>>>>>                                       || (cpy > oend))) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                       goto _output_error;
>>>>>>>               }
>>>>>>>
>>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>>> -             /*
>>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>>> -              */
>>>>>>> -             if (!partialDecoding) {
>>>>>>> -                     assert(oend > op);
>>>>>>> -                     assert(oend - op >= 4);
>>>>>>> -
>>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>>> -             }
>>>>>>> -
>>>>>>>               if (length == ML_MASK) {
>>>>>>> -                     unsigned int s;
>>>>>>> -
>>>>>>> -                     do {
>>>>>>> -                             s = *ip++;
>>>>>>> -
>>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>>> -                                     goto _output_error;
>>>>>>>
>>>>>>> -                             length += s;
>>>>>>> -                     } while (s == 255);
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>> +                     if (error != ok)
>>>>>>> +                             goto _output_error;
>>>>>>>
>>>>>>>                       if ((safeDecode)
>>>>>>>                               && unlikely(
>>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               length += MINMATCH;
>>>>>>>
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +safe_match_copy:
>>>>>>> +#endif
>>>>>>> +
>>>>>>>               /* match starting within external dictionary */
>>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-16  2:07               ` Mao, Chenxi
@ 2019-05-16  2:10                 ` Gao Xiang
  2019-05-16  6:52                   ` Mao, Chenxi
  0 siblings, 1 reply; 16+ messages in thread
From: Gao Xiang @ 2019-05-16  2:10 UTC (permalink / raw)
  To: Mao, Chenxi
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman



On 2019/5/16 10:07, Mao, Chenxi wrote:
> Hi Xiang:
> 
> I checked the deliver history.
> There is only below delivery related with lz4.c
> Pull request:616
> 4e3accc Fix Dict Size Test in `LZ4_compress_fast_continue()`
> 535636f Don't Attach Very Small Dictionaries
> This 2 changes seems like ONLY bug fixes for dictionary fix baesd on 1.8.3

Have you taken a glance at

https://github.com/lz4/lz4/commit/5a6d72447ae998b387794df1135023689aa89995   at least?

and issues,

https://github.com/lz4/lz4/issues/676
https://github.com/lz4/lz4/issues/681

That is what Yann said before.

Thanks,
Gao Xiang

> 
> Based on current investigation result, I think pick decompress patches is safe for v1.8.3
> 
> If there is any misunderstanding or faults, please feel free to let me know.
> 
> Chenxi
> 
> On 5/16/19 9:35 AM, Gao Xiang wrote:
>>
>>
>> On 2019/5/16 7:48, Mao, Chenxi wrote:
>>> Hi Yann and Xiang:
>>> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
>>> Here is my cherry-pick list:
>>> 2589c44 created LZ4_FAST_DEC_LOOP build macro
>>> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
>>> 5d7d116 decompress_generic: Limit fastpath to x86
>>> 75fb878 decompress_generic: Add fastpath for small offsets
>>> faac110 decompress_generic: Unroll loops a bit more
>>> 1fbaf84 decompress_generic: remove msan write
>>> 28b8249 decompress_generic: re-add fastpath
>>> 232f1e2 decompress_generic: drop partial copy check in fast loop
>>> 59332a3 decompress_generic: Optimize literal copies
>>> 5dfa7d4 decompress_generic: optimize match copy
>>> 28356e0 decompress_generic: Add a loop fastpath
>>> 4da3360 decompress_generic: Refactor variable length fields
>>>
>>> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.
>>
>> The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
>> You should keep LZ4_decompress_generic from all known issues.
>>
>>>
>>> @Xiang:
>>> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?
>>
>> I think you should pick all patches which are related to LZ4_decompress_generic()
>> and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.
>>
>> Thanks,
>> Gao Xiang
>>
>>>
>>> Chenxi
>>>
>>>
>>> On 5/16/19 1:47 AM, Gao Xiang wrote:
>>>> Hi Yann,
>>>>
>>>> On 2019/5/16 1:03, Cyan wrote:
>>>>> Re-posted,
>>>>> it seems the previous message was rejected by the linux-kernel server
>>>>> due to some kind of format limitation (no html).
>>>>>
>>>>>
>>>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>>>
>>>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>>>> This was fixed in v1.9.1.
>>>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>>>
>>>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>>>> The deprecation warning is meant to "scare away" such usages.
>>>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>>>
>>>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>>>
>>>>>> LZ4_decompress_fast() still has some possible usage :
>>>>>> it doesn't need the compressed size.
>>>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>>>
>>>> Yes, that is my real concern...
>>>>
>>>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>>>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>>>
>>>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>>>> beta test and real consumers...
>>>>
>>>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>>>
>>>> But anyway, the patch looks good to me since it has obvious performance gain...
>>>>
>>>>>>
>>>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>>>
>>>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>>>> I will test these 2 apis later and will report if something is wrong as well.
>>>>
>>>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>>>     https://www.usenix.org/conference/atc19/technical-sessions
>>>>
>>>> Thanks,
>>>> Gao Xiang
>>>>
>>>>
>>>>>>
>>>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>>>
>>>>>>
>>>>>> Y.
>>>>>>
>>>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>>>
>>>>>>> Hi Xiang:
>>>>>>>
>>>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>>>
>>>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>>>> That might be a problem for X86 devices for current patch.
>>>>>>>
>>>>>>> Chenxi
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>>>
>>>>>>> Hi Chenxi,
>>>>>>>
>>>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>>>> This change would be introduce 10% on decompress operation according
>>>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>>>
>>>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>>>
>>>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>>>
>>>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>>>
>>>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Gao Xiang
>>>>>>>
>>>>>>>>
>>>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>>>
>>>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>>>
>>>>>>>> Benchmark via ZRAM:
>>>>>>>>
>>>>>>>> Test case:
>>>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>>>> --buffer_compress_percentage=75 \
>>>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>>>
>>>>>>>> Patched:
>>>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>>>> Vanilla:
>>>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>>>
>>>>>>>> Reference:
>>>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>>>
>>>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>>>> ---
>>>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>>>> @@ -50,6 +50,131 @@
>>>>>>>>  #define assert(condition) ((void)0)
>>>>>>>>  #endif
>>>>>>>>
>>>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>>>> +
>>>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>>>> +
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>>>> +FORCE_INLINE void
>>>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>>>> +                          const size_t offset)
>>>>>>>> +{
>>>>>>>> +     if (offset < 8) {
>>>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>>>> +
>>>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>>>> +             srcPtr += inc32table[offset];
>>>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>>>> +             srcPtr -= dec64table[offset];
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +     } else {
>>>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +             srcPtr += 8;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>>>> +
>>>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>>>> +beyond dstEnd
>>>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>>>> +bytes)
>>>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>>>> +
>>>>>>>> +     do {
>>>>>>>> +             memcpy(d, s, 16);
>>>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>>>> +             d += 32;
>>>>>>>> +             s += 32;
>>>>>>>> +     } while (d < e);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +FORCE_INLINE void
>>>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>>>> +                     const size_t offset)
>>>>>>>> +{
>>>>>>>> +     BYTE v[8];
>>>>>>>> +     switch (offset) {
>>>>>>>> +
>>>>>>>> +     case 1:
>>>>>>>> +             memset(v, *srcPtr, 8);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     case 2:
>>>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     case 4:
>>>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     default:
>>>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>>>> +             return;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +      copy_loop:
>>>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>>>> +     dstPtr += 8;
>>>>>>>> +     while (dstPtr < dstEnd) {
>>>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +     }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/* Read the variable-length literal or match length.
>>>>>>>> + *
>>>>>>>> + * ip - pointer to use as input.
>>>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>>>> + */
>>>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>>>> +                                        const BYTE *lencheck,
>>>>>>>> +                                        int loop_check, int initial_check,
>>>>>>>> +                                        variable_length_error *error) {
>>>>>>>> +     unsigned length = 0;
>>>>>>>> +     unsigned s;
>>>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>>>> +             *error = initial_error;
>>>>>>>> +             return length;
>>>>>>>> +     }
>>>>>>>> +     do {
>>>>>>>> +             s = **ip;
>>>>>>>> +             (*ip)++;
>>>>>>>> +             length += s;
>>>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>>>> +                     *error = loop_error;
>>>>>>>> +                     return length;
>>>>>>>> +             }
>>>>>>>> +     } while (s == 255);
>>>>>>>> +
>>>>>>>> +     return length;
>>>>>>>> +}
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>>  /*
>>>>>>>>   * LZ4_decompress_generic() :
>>>>>>>>   * This generic decompression function covers all use cases.
>>>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>        const size_t dictSize
>>>>>>>>        )
>>>>>>>>  {
>>>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>>>
>>>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>>>       BYTE *cpy;
>>>>>>>>
>>>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>>
>>>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>>>> KB)));
>>>>>>>>
>>>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>>>       const BYTE *const shortiend = iend -
>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>>       const BYTE *const shortoend = oend -
>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>> +
>>>>>>>> +     const BYTE *match;
>>>>>>>> +     size_t offset;
>>>>>>>> +     unsigned int token;
>>>>>>>> +     size_t length;
>>>>>>>>
>>>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>>>                srcSize, outputSize);
>>>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>>>               return -1;
>>>>>>>>
>>>>>>>> -     /* Main Loop : decode sequences */
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>>>> +             goto safe_decode;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>>>       while (1) {
>>>>>>>> -             size_t length;
>>>>>>>> -             const BYTE *match;
>>>>>>>> -             size_t offset;
>>>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>>>> +             if (endOnInput) {
>>>>>>>> +                     assert(ip < iend);
>>>>>>>> +             }
>>>>>>>> +             token = *ip++;
>>>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>>>> +
>>>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>>>> +
>>>>>>>> +             /* decode literal length */
>>>>>>>> +             if (length == RUN_MASK) {
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>> +                                              &error);
>>>>>>>> +                     if (error == initial_error) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>>>> +                                     (uptrval) (op))) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }       /* overflow detection */
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>>>> +                                     (uptrval) (ip))) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     /* overflow detection */
>>>>>>>> +                     /* copy literals */
>>>>>>>> +                     cpy = op + length;
>>>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>> +                             if ((cpy > oend - 32)
>>>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>>>> +                             if (cpy > oend - 8) {
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>>>> +                             /* properties */
>>>>>>>> +                     }
>>>>>>>> +                     ip += length;
>>>>>>>> +                     op = cpy;
>>>>>>>> +             } else {
>>>>>>>> +                     cpy = op + length;
>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>> +                             DEBUGLOG(7,
>>>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>>>> +                                      (unsigned)length);
>>>>>>>> +                             /* We don't need to check oend */
>>>>>>>> +                             /* since we check it once for each loop below */
>>>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>>>> +                             /*if we copy by a register size */
>>>>>>>> +                             memcpy(op, ip, 16);
>>>>>>>> +                     } else {
>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>>>> +                             /* properties */
>>>>>>>> +                             memcpy(op, ip, 8);
>>>>>>>> +                             if (length > 8) {
>>>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +                     ip += length;
>>>>>>>> +                     op = cpy;
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* get offset */
>>>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>>>> +             match = op - offset;
>>>>>>>> +
>>>>>>>> +             /* get matchlength */
>>>>>>>> +             length = token & ML_MASK;
>>>>>>>>
>>>>>>>> -             /* get literal length */
>>>>>>>> -             unsigned int const token = *ip++;
>>>>>>>> -             length = token>>ML_BITS;
>>>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>>>> +                     goto _output_error;
>>>>>>>> +             }
>>>>>>>> +             /* Error : offset outside buffers */
>>>>>>>> +             if (length == ML_MASK) {
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>> +                     if (error != ok) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }       /* overflow detection */
>>>>>>>> +                     length += MINMATCH;
>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +                             goto safe_match_copy;
>>>>>>>> +                     }
>>>>>>>> +             } else {
>>>>>>>> +                     length += MINMATCH;
>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +                             goto safe_match_copy;
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>>>> +                             if (offset >= 8) {
>>>>>>>> +                                     memcpy(op, match, 8);
>>>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>>>> +                                     op += length;
>>>>>>>> +                                     continue;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* match starting within external dictionary */
>>>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>> +                             if (partialDecoding) {
>>>>>>>> +                                     /* reach end of buffer */
>>>>>>>> +                                     length =
>>>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>>>> +                             } else {
>>>>>>>> +                                     /* end-of-block condition violated */
>>>>>>>> +                                     goto _output_error;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>>>> +                             op += length;
>>>>>>>> +                     } else {
>>>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>>>> +                             size_t const copySize =
>>>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>>>> +                             size_t const restSize = length - copySize;
>>>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>>>> +                             op += copySize;
>>>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>>>> +                                     while (op < endOfMatch) {
>>>>>>>> +                                             *op++ = *copyFrom++;
>>>>>>>> +                                     }
>>>>>>>> +                             } else {
>>>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>>>> +                                     op += restSize;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +                     continue;
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* copy match within block */
>>>>>>>> +             cpy = op + length;
>>>>>>>> +
>>>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>>>> +             if (unlikely(offset < 16)) {
>>>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>>>> +             } else {
>>>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>>>> +     }
>>>>>>>> +      safe_decode:
>>>>>>>> +#endif
>>>>>>>> +     /* Main Loop : decode sequences */
>>>>>>>> +     while (1) {
>>>>>>>> +             length = token >> ML_BITS;
>>>>>>>>
>>>>>>>>               /* ip < iend before the increment */
>>>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                * combined check for both stages).
>>>>>>>>                */
>>>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>>>> -                /*
>>>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>>>> -                 * the loop with at least one byte
>>>>>>>> -                 */
>>>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>> -                          (op <= shortoend))) {
>>>>>>>> +                 /*
>>>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>>>> +                  * the loop with at least one byte
>>>>>>>> +                  */
>>>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>> +                           (op <= shortoend))) {
>>>>>>>>                       /* Copy the literals */
>>>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>>>> -                     op += length; ip += length;
>>>>>>>> +                     op += length;
>>>>>>>> +                     ip += length;
>>>>>>>>
>>>>>>>>                       /*
>>>>>>>>                        * The second stage:
>>>>>>>>                        * prepare for match copying, decode full info.
>>>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>>>                        */
>>>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>>>                       ip += 2;
>>>>>>>>                       match = op - offset;
>>>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>>>
>>>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>>>                       if ((length != ML_MASK) &&
>>>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               /* decode literal length */
>>>>>>>>               if (length == RUN_MASK) {
>>>>>>>> -                     unsigned int s;
>>>>>>>>
>>>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>>>> -                             /* overflow detection */
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>> +                                              &error);
>>>>>>>> +                     if (error == initial_error)
>>>>>>>>                               goto _output_error;
>>>>>>>> -                     }
>>>>>>>> -                     do {
>>>>>>>> -                             s = *ip++;
>>>>>>>> -                             length += s;
>>>>>>>> -                     } while (likely(endOnInput
>>>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>>>> -                             : 1) & (s == 255));
>>>>>>>>
>>>>>>>>                       if ((safeDecode)
>>>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>>>> -                                     length < (uptrval)(op))) {
>>>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>>>                               /* overflow detection */
>>>>>>>>                               goto _output_error;
>>>>>>>>                       }
>>>>>>>>                       if ((safeDecode)
>>>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>>>                               /* overflow detection */
>>>>>>>>                               goto _output_error;
>>>>>>>>                       }
>>>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               /* copy literals */
>>>>>>>>               cpy = op + length;
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +           safe_literal_copy:
>>>>>>>> +#endif
>>>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>>
>>>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>> +                                   || (ip + length >
>>>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>>                       if (partialDecoding) {
>>>>>>>>                               if (cpy > oend) {
>>>>>>>>                                       /*
>>>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                                       length = oend - op;
>>>>>>>>                               }
>>>>>>>>                               if ((endOnInput)
>>>>>>>> -                                     && (ip + length > iend)) {
>>>>>>>> +                                 && (ip + length > iend)) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>>                                        * read attempt beyond
>>>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                               }
>>>>>>>>                       } else {
>>>>>>>>                               if ((!endOnInput)
>>>>>>>> -                                     && (cpy != oend)) {
>>>>>>>> +                                 && (cpy != oend)) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>>                                        * block decoding must
>>>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                                       goto _output_error;
>>>>>>>>                               }
>>>>>>>>                               if ((endOnInput)
>>>>>>>> -                                     && ((ip + length != iend)
>>>>>>>> +                                 && ((ip + length != iend)
>>>>>>>>                                       || (cpy > oend))) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                       goto _output_error;
>>>>>>>>               }
>>>>>>>>
>>>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>>>> -             /*
>>>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>>>> -              */
>>>>>>>> -             if (!partialDecoding) {
>>>>>>>> -                     assert(oend > op);
>>>>>>>> -                     assert(oend - op >= 4);
>>>>>>>> -
>>>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>>>> -             }
>>>>>>>> -
>>>>>>>>               if (length == ML_MASK) {
>>>>>>>> -                     unsigned int s;
>>>>>>>> -
>>>>>>>> -                     do {
>>>>>>>> -                             s = *ip++;
>>>>>>>> -
>>>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>>>> -                                     goto _output_error;
>>>>>>>>
>>>>>>>> -                             length += s;
>>>>>>>> -                     } while (s == 255);
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>> +                     if (error != ok)
>>>>>>>> +                             goto _output_error;
>>>>>>>>
>>>>>>>>                       if ((safeDecode)
>>>>>>>>                               && unlikely(
>>>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               length += MINMATCH;
>>>>>>>>
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +safe_match_copy:
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>>               /* match starting within external dictionary */
>>>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-16  1:35             ` Gao Xiang
  2019-05-16  2:07               ` Mao, Chenxi
@ 2019-05-16  2:11               ` Mao, Chenxi
  2019-05-16  2:18                 ` Gao Xiang
  1 sibling, 1 reply; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-16  2:11 UTC (permalink / raw)
  To: Gao Xiang
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

Hi Xiang:

I checked the deliver history.
There is only below delivery related with lz4.c
Pull request:616
4e3accc Fix Dict Size Test in `LZ4_compress_fast_continue()`
535636f Don't Attach Very Small Dictionaries
This 2 changes seems like ONLY bug fixes for dictionary fix baesd on 1.8.3

Based on current investigation result, I think pick decompress patches is safe for v1.8.3

If there is any misunderstanding or faults, please feel free to let me know.

Chenxi

On 5/16/19 9:35 AM, Gao Xiang wrote:
> 
> 
> On 2019/5/16 7:48, Mao, Chenxi wrote:
>> Hi Yann and Xiang:
>> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
>> Here is my cherry-pick list:
>> 2589c44 created LZ4_FAST_DEC_LOOP build macro
>> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
>> 5d7d116 decompress_generic: Limit fastpath to x86
>> 75fb878 decompress_generic: Add fastpath for small offsets
>> faac110 decompress_generic: Unroll loops a bit more
>> 1fbaf84 decompress_generic: remove msan write
>> 28b8249 decompress_generic: re-add fastpath
>> 232f1e2 decompress_generic: drop partial copy check in fast loop
>> 59332a3 decompress_generic: Optimize literal copies
>> 5dfa7d4 decompress_generic: optimize match copy
>> 28356e0 decompress_generic: Add a loop fastpath
>> 4da3360 decompress_generic: Refactor variable length fields
>>
>> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.
> 
> The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
> You should keep LZ4_decompress_generic from all known issues.
> 
>>
>> @Xiang:
>> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?
> 
> I think you should pick all patches which are related to LZ4_decompress_generic()
> and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.
> 
> Thanks,
> Gao Xiang
> 
>>
>> Chenxi
>>
>>
>> On 5/16/19 1:47 AM, Gao Xiang wrote:
>>> Hi Yann,
>>>
>>> On 2019/5/16 1:03, Cyan wrote:
>>>> Re-posted,
>>>> it seems the previous message was rejected by the linux-kernel server
>>>> due to some kind of format limitation (no html).
>>>>
>>>>
>>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>>
>>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>>> This was fixed in v1.9.1.
>>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>>
>>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>>> The deprecation warning is meant to "scare away" such usages.
>>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>>
>>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>>
>>>>> LZ4_decompress_fast() still has some possible usage :
>>>>> it doesn't need the compressed size.
>>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>>
>>> Yes, that is my real concern...
>>>
>>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>>
>>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>>> beta test and real consumers...
>>>
>>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>>
>>> But anyway, the patch looks good to me since it has obvious performance gain...
>>>
>>>>>
>>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>>
>>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>>> I will test these 2 apis later and will report if something is wrong as well.
>>>
>>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>>     https://www.usenix.org/conference/atc19/technical-sessions
>>>
>>> Thanks,
>>> Gao Xiang
>>>
>>>
>>>>>
>>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>>
>>>>>
>>>>> Y.
>>>>>
>>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>>
>>>>>> Hi Xiang:
>>>>>>
>>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>>
>>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>>> That might be a problem for X86 devices for current patch.
>>>>>>
>>>>>> Chenxi
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>>
>>>>>> Hi Chenxi,
>>>>>>
>>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>>> This change would be introduce 10% on decompress operation according
>>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>>
>>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>>
>>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>>
>>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>>
>>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>>
>>>>>> Thanks,
>>>>>> Gao Xiang
>>>>>>
>>>>>>>
>>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>>
>>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>>
>>>>>>> Benchmark via ZRAM:
>>>>>>>
>>>>>>> Test case:
>>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>>> --buffer_compress_percentage=75 \
>>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>>
>>>>>>> Patched:
>>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>>> Vanilla:
>>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>>
>>>>>>> Reference:
>>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>>
>>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>>> ---
>>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>>
>>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>>> @@ -50,6 +50,131 @@
>>>>>>>  #define assert(condition) ((void)0)
>>>>>>>  #endif
>>>>>>>
>>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>>> +
>>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>>> +
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>>> +FORCE_INLINE void
>>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>>> +                          const size_t offset)
>>>>>>> +{
>>>>>>> +     if (offset < 8) {
>>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>>> +
>>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>>> +             srcPtr += inc32table[offset];
>>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>>> +             srcPtr -= dec64table[offset];
>>>>>>> +             dstPtr += 8;
>>>>>>> +     } else {
>>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>>> +             dstPtr += 8;
>>>>>>> +             srcPtr += 8;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>>> +
>>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>>> +beyond dstEnd
>>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>>> +bytes)
>>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>>> +
>>>>>>> +     do {
>>>>>>> +             memcpy(d, s, 16);
>>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>>> +             d += 32;
>>>>>>> +             s += 32;
>>>>>>> +     } while (d < e);
>>>>>>> +}
>>>>>>> +
>>>>>>> +FORCE_INLINE void
>>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>>> +                     const size_t offset)
>>>>>>> +{
>>>>>>> +     BYTE v[8];
>>>>>>> +     switch (offset) {
>>>>>>> +
>>>>>>> +     case 1:
>>>>>>> +             memset(v, *srcPtr, 8);
>>>>>>> +             goto copy_loop;
>>>>>>> +     case 2:
>>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>>> +             goto copy_loop;
>>>>>>> +     case 4:
>>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>>> +             goto copy_loop;
>>>>>>> +     default:
>>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>>> +             return;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +      copy_loop:
>>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>>> +     dstPtr += 8;
>>>>>>> +     while (dstPtr < dstEnd) {
>>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>>> +             dstPtr += 8;
>>>>>>> +     }
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* Read the variable-length literal or match length.
>>>>>>> + *
>>>>>>> + * ip - pointer to use as input.
>>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>>> + */
>>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>>> +                                        const BYTE *lencheck,
>>>>>>> +                                        int loop_check, int initial_check,
>>>>>>> +                                        variable_length_error *error) {
>>>>>>> +     unsigned length = 0;
>>>>>>> +     unsigned s;
>>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>>> +             *error = initial_error;
>>>>>>> +             return length;
>>>>>>> +     }
>>>>>>> +     do {
>>>>>>> +             s = **ip;
>>>>>>> +             (*ip)++;
>>>>>>> +             length += s;
>>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>>> +                     *error = loop_error;
>>>>>>> +                     return length;
>>>>>>> +             }
>>>>>>> +     } while (s == 255);
>>>>>>> +
>>>>>>> +     return length;
>>>>>>> +}
>>>>>>> +#endif
>>>>>>> +
>>>>>>>  /*
>>>>>>>   * LZ4_decompress_generic() :
>>>>>>>   * This generic decompression function covers all use cases.
>>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>        const size_t dictSize
>>>>>>>        )
>>>>>>>  {
>>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>>
>>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>>       BYTE *cpy;
>>>>>>>
>>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>
>>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>>> KB)));
>>>>>>>
>>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>>       const BYTE *const shortiend = iend -
>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>       const BYTE *const shortoend = oend -
>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>> +
>>>>>>> +     const BYTE *match;
>>>>>>> +     size_t offset;
>>>>>>> +     unsigned int token;
>>>>>>> +     size_t length;
>>>>>>>
>>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>>                srcSize, outputSize);
>>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>>               return -1;
>>>>>>>
>>>>>>> -     /* Main Loop : decode sequences */
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>>> +             goto safe_decode;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>>       while (1) {
>>>>>>> -             size_t length;
>>>>>>> -             const BYTE *match;
>>>>>>> -             size_t offset;
>>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>>> +             if (endOnInput) {
>>>>>>> +                     assert(ip < iend);
>>>>>>> +             }
>>>>>>> +             token = *ip++;
>>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>>> +
>>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>>> +
>>>>>>> +             /* decode literal length */
>>>>>>> +             if (length == RUN_MASK) {
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>> +                                              &error);
>>>>>>> +                     if (error == initial_error) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>>> +                                     (uptrval) (op))) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }       /* overflow detection */
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>>> +                                     (uptrval) (ip))) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     /* overflow detection */
>>>>>>> +                     /* copy literals */
>>>>>>> +                     cpy = op + length;
>>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>> +                             if ((cpy > oend - 32)
>>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>>> +                             if (cpy > oend - 8) {
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>>> +                             /* properties */
>>>>>>> +                     }
>>>>>>> +                     ip += length;
>>>>>>> +                     op = cpy;
>>>>>>> +             } else {
>>>>>>> +                     cpy = op + length;
>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>> +                             DEBUGLOG(7,
>>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>>> +                                      (unsigned)length);
>>>>>>> +                             /* We don't need to check oend */
>>>>>>> +                             /* since we check it once for each loop below */
>>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>>> +                                     goto safe_literal_copy;
>>>>>>> +                             }
>>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>>> +                             /*if we copy by a register size */
>>>>>>> +                             memcpy(op, ip, 16);
>>>>>>> +                     } else {
>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>>> +                             /* properties */
>>>>>>> +                             memcpy(op, ip, 8);
>>>>>>> +                             if (length > 8) {
>>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +                     ip += length;
>>>>>>> +                     op = cpy;
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* get offset */
>>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>>> +             match = op - offset;
>>>>>>> +
>>>>>>> +             /* get matchlength */
>>>>>>> +             length = token & ML_MASK;
>>>>>>>
>>>>>>> -             /* get literal length */
>>>>>>> -             unsigned int const token = *ip++;
>>>>>>> -             length = token>>ML_BITS;
>>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>>> +                     goto _output_error;
>>>>>>> +             }
>>>>>>> +             /* Error : offset outside buffers */
>>>>>>> +             if (length == ML_MASK) {
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>> +                     if (error != ok) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }
>>>>>>> +                     if ((safeDecode)
>>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>>> +                             goto _output_error;
>>>>>>> +                     }       /* overflow detection */
>>>>>>> +                     length += MINMATCH;
>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +                             goto safe_match_copy;
>>>>>>> +                     }
>>>>>>> +             } else {
>>>>>>> +                     length += MINMATCH;
>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>> +                             goto safe_match_copy;
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>>> +                             if (offset >= 8) {
>>>>>>> +                                     memcpy(op, match, 8);
>>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>>> +                                     op += length;
>>>>>>> +                                     continue;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* match starting within external dictionary */
>>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>> +                             if (partialDecoding) {
>>>>>>> +                                     /* reach end of buffer */
>>>>>>> +                                     length =
>>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>>> +                             } else {
>>>>>>> +                                     /* end-of-block condition violated */
>>>>>>> +                                     goto _output_error;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +
>>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>>> +                             op += length;
>>>>>>> +                     } else {
>>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>>> +                             size_t const copySize =
>>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>>> +                             size_t const restSize = length - copySize;
>>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>>> +                             op += copySize;
>>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>>> +                                     while (op < endOfMatch) {
>>>>>>> +                                             *op++ = *copyFrom++;
>>>>>>> +                                     }
>>>>>>> +                             } else {
>>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>>> +                                     op += restSize;
>>>>>>> +                             }
>>>>>>> +                     }
>>>>>>> +                     continue;
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             /* copy match within block */
>>>>>>> +             cpy = op + length;
>>>>>>> +
>>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>>> +             if (unlikely(offset < 16)) {
>>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>>> +             } else {
>>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>>> +             }
>>>>>>> +
>>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>>> +     }
>>>>>>> +      safe_decode:
>>>>>>> +#endif
>>>>>>> +     /* Main Loop : decode sequences */
>>>>>>> +     while (1) {
>>>>>>> +             length = token >> ML_BITS;
>>>>>>>
>>>>>>>               /* ip < iend before the increment */
>>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                * combined check for both stages).
>>>>>>>                */
>>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>>> -                /*
>>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>>> -                 * the loop with at least one byte
>>>>>>> -                 */
>>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>> -                          (op <= shortoend))) {
>>>>>>> +                 /*
>>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>>> +                  * the loop with at least one byte
>>>>>>> +                  */
>>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>> +                           (op <= shortoend))) {
>>>>>>>                       /* Copy the literals */
>>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>>> -                     op += length; ip += length;
>>>>>>> +                     op += length;
>>>>>>> +                     ip += length;
>>>>>>>
>>>>>>>                       /*
>>>>>>>                        * The second stage:
>>>>>>>                        * prepare for match copying, decode full info.
>>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>>                        */
>>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>>                       ip += 2;
>>>>>>>                       match = op - offset;
>>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>>
>>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>>                       if ((length != ML_MASK) &&
>>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               /* decode literal length */
>>>>>>>               if (length == RUN_MASK) {
>>>>>>> -                     unsigned int s;
>>>>>>>
>>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>>> -                             /* overflow detection */
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>> +                                              &error);
>>>>>>> +                     if (error == initial_error)
>>>>>>>                               goto _output_error;
>>>>>>> -                     }
>>>>>>> -                     do {
>>>>>>> -                             s = *ip++;
>>>>>>> -                             length += s;
>>>>>>> -                     } while (likely(endOnInput
>>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>>> -                             : 1) & (s == 255));
>>>>>>>
>>>>>>>                       if ((safeDecode)
>>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>>> -                                     length < (uptrval)(op))) {
>>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>>                               /* overflow detection */
>>>>>>>                               goto _output_error;
>>>>>>>                       }
>>>>>>>                       if ((safeDecode)
>>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>>                               /* overflow detection */
>>>>>>>                               goto _output_error;
>>>>>>>                       }
>>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               /* copy literals */
>>>>>>>               cpy = op + length;
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +           safe_literal_copy:
>>>>>>> +#endif
>>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>
>>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>> +                                   || (ip + length >
>>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>                       if (partialDecoding) {
>>>>>>>                               if (cpy > oend) {
>>>>>>>                                       /*
>>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                                       length = oend - op;
>>>>>>>                               }
>>>>>>>                               if ((endOnInput)
>>>>>>> -                                     && (ip + length > iend)) {
>>>>>>> +                                 && (ip + length > iend)) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>>                                        * read attempt beyond
>>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                               }
>>>>>>>                       } else {
>>>>>>>                               if ((!endOnInput)
>>>>>>> -                                     && (cpy != oend)) {
>>>>>>> +                                 && (cpy != oend)) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>>                                        * block decoding must
>>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                                       goto _output_error;
>>>>>>>                               }
>>>>>>>                               if ((endOnInput)
>>>>>>> -                                     && ((ip + length != iend)
>>>>>>> +                                 && ((ip + length != iend)
>>>>>>>                                       || (cpy > oend))) {
>>>>>>>                                       /*
>>>>>>>                                        * Error :
>>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>                       goto _output_error;
>>>>>>>               }
>>>>>>>
>>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>>> -             /*
>>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>>> -              */
>>>>>>> -             if (!partialDecoding) {
>>>>>>> -                     assert(oend > op);
>>>>>>> -                     assert(oend - op >= 4);
>>>>>>> -
>>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>>> -             }
>>>>>>> -
>>>>>>>               if (length == ML_MASK) {
>>>>>>> -                     unsigned int s;
>>>>>>> -
>>>>>>> -                     do {
>>>>>>> -                             s = *ip++;
>>>>>>> -
>>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>>> -                                     goto _output_error;
>>>>>>>
>>>>>>> -                             length += s;
>>>>>>> -                     } while (s == 255);
>>>>>>> +                     variable_length_error error = ok;
>>>>>>> +                     length +=
>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>> +                     if (error != ok)
>>>>>>> +                             goto _output_error;
>>>>>>>
>>>>>>>                       if ((safeDecode)
>>>>>>>                               && unlikely(
>>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>
>>>>>>>               length += MINMATCH;
>>>>>>>
>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>> +safe_match_copy:
>>>>>>> +#endif
>>>>>>> +
>>>>>>>               /* match starting within external dictionary */
>>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-16  2:11               ` Mao, Chenxi
@ 2019-05-16  2:18                 ` Gao Xiang
  0 siblings, 0 replies; 16+ messages in thread
From: Gao Xiang @ 2019-05-16  2:18 UTC (permalink / raw)
  To: Mao, Chenxi
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

It seems you just sent the same email twice, please check my previous reply.

Thanks,
Gao Xiang

On 2019/5/16 10:11, Mao, Chenxi wrote:
> Hi Xiang:
> 
> I checked the deliver history.
> There is only below delivery related with lz4.c
> Pull request:616
> 4e3accc Fix Dict Size Test in `LZ4_compress_fast_continue()`
> 535636f Don't Attach Very Small Dictionaries
> This 2 changes seems like ONLY bug fixes for dictionary fix baesd on 1.8.3
> 
> Based on current investigation result, I think pick decompress patches is safe for v1.8.3
> 
> If there is any misunderstanding or faults, please feel free to let me know.
> 
> Chenxi
> 
> On 5/16/19 9:35 AM, Gao Xiang wrote:
>>
>>
>> On 2019/5/16 7:48, Mao, Chenxi wrote:
>>> Hi Yann and Xiang:
>>> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
>>> Here is my cherry-pick list:
>>> 2589c44 created LZ4_FAST_DEC_LOOP build macro
>>> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
>>> 5d7d116 decompress_generic: Limit fastpath to x86
>>> 75fb878 decompress_generic: Add fastpath for small offsets
>>> faac110 decompress_generic: Unroll loops a bit more
>>> 1fbaf84 decompress_generic: remove msan write
>>> 28b8249 decompress_generic: re-add fastpath
>>> 232f1e2 decompress_generic: drop partial copy check in fast loop
>>> 59332a3 decompress_generic: Optimize literal copies
>>> 5dfa7d4 decompress_generic: optimize match copy
>>> 28356e0 decompress_generic: Add a loop fastpath
>>> 4da3360 decompress_generic: Refactor variable length fields
>>>
>>> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.
>>
>> The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
>> You should keep LZ4_decompress_generic from all known issues.
>>
>>>
>>> @Xiang:
>>> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?
>>
>> I think you should pick all patches which are related to LZ4_decompress_generic()
>> and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.
>>
>> Thanks,
>> Gao Xiang
>>
>>>
>>> Chenxi
>>>
>>>
>>> On 5/16/19 1:47 AM, Gao Xiang wrote:
>>>> Hi Yann,
>>>>
>>>> On 2019/5/16 1:03, Cyan wrote:
>>>>> Re-posted,
>>>>> it seems the previous message was rejected by the linux-kernel server
>>>>> due to some kind of format limitation (no html).
>>>>>
>>>>>
>>>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>>>
>>>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>>>> This was fixed in v1.9.1.
>>>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>>>
>>>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>>>> The deprecation warning is meant to "scare away" such usages.
>>>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>>>
>>>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>>>
>>>>>> LZ4_decompress_fast() still has some possible usage :
>>>>>> it doesn't need the compressed size.
>>>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>>>
>>>> Yes, that is my real concern...
>>>>
>>>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>>>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>>>
>>>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>>>> beta test and real consumers...
>>>>
>>>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>>>
>>>> But anyway, the patch looks good to me since it has obvious performance gain...
>>>>
>>>>>>
>>>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>>>
>>>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>>>> I will test these 2 apis later and will report if something is wrong as well.
>>>>
>>>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>>>     https://www.usenix.org/conference/atc19/technical-sessions
>>>>
>>>> Thanks,
>>>> Gao Xiang
>>>>
>>>>
>>>>>>
>>>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>>>
>>>>>>
>>>>>> Y.
>>>>>>
>>>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>>>
>>>>>>> Hi Xiang:
>>>>>>>
>>>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>>>
>>>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>>>> That might be a problem for X86 devices for current patch.
>>>>>>>
>>>>>>> Chenxi
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>>>
>>>>>>> Hi Chenxi,
>>>>>>>
>>>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>>>> This change would be introduce 10% on decompress operation according
>>>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>>>
>>>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>>>
>>>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>>>
>>>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>>>
>>>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Gao Xiang
>>>>>>>
>>>>>>>>
>>>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>>>
>>>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>>>
>>>>>>>> Benchmark via ZRAM:
>>>>>>>>
>>>>>>>> Test case:
>>>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>>>> --buffer_compress_percentage=75 \
>>>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>>>
>>>>>>>> Patched:
>>>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>>>> Vanilla:
>>>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>>>
>>>>>>>> Reference:
>>>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>>>
>>>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>>>> ---
>>>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>>>> @@ -50,6 +50,131 @@
>>>>>>>>  #define assert(condition) ((void)0)
>>>>>>>>  #endif
>>>>>>>>
>>>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>>>> +
>>>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>>>> +
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>>>> +FORCE_INLINE void
>>>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>>>> +                          const size_t offset)
>>>>>>>> +{
>>>>>>>> +     if (offset < 8) {
>>>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>>>> +
>>>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>>>> +             srcPtr += inc32table[offset];
>>>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>>>> +             srcPtr -= dec64table[offset];
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +     } else {
>>>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +             srcPtr += 8;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>>>> +
>>>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>>>> +beyond dstEnd
>>>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>>>> +bytes)
>>>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>>>> +
>>>>>>>> +     do {
>>>>>>>> +             memcpy(d, s, 16);
>>>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>>>> +             d += 32;
>>>>>>>> +             s += 32;
>>>>>>>> +     } while (d < e);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +FORCE_INLINE void
>>>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>>>> +                     const size_t offset)
>>>>>>>> +{
>>>>>>>> +     BYTE v[8];
>>>>>>>> +     switch (offset) {
>>>>>>>> +
>>>>>>>> +     case 1:
>>>>>>>> +             memset(v, *srcPtr, 8);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     case 2:
>>>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     case 4:
>>>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>>>> +             goto copy_loop;
>>>>>>>> +     default:
>>>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>>>> +             return;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +      copy_loop:
>>>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>>>> +     dstPtr += 8;
>>>>>>>> +     while (dstPtr < dstEnd) {
>>>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>>>> +             dstPtr += 8;
>>>>>>>> +     }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/* Read the variable-length literal or match length.
>>>>>>>> + *
>>>>>>>> + * ip - pointer to use as input.
>>>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>>>> + */
>>>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>>>> +                                        const BYTE *lencheck,
>>>>>>>> +                                        int loop_check, int initial_check,
>>>>>>>> +                                        variable_length_error *error) {
>>>>>>>> +     unsigned length = 0;
>>>>>>>> +     unsigned s;
>>>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>>>> +             *error = initial_error;
>>>>>>>> +             return length;
>>>>>>>> +     }
>>>>>>>> +     do {
>>>>>>>> +             s = **ip;
>>>>>>>> +             (*ip)++;
>>>>>>>> +             length += s;
>>>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>>>> +                     *error = loop_error;
>>>>>>>> +                     return length;
>>>>>>>> +             }
>>>>>>>> +     } while (s == 255);
>>>>>>>> +
>>>>>>>> +     return length;
>>>>>>>> +}
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>>  /*
>>>>>>>>   * LZ4_decompress_generic() :
>>>>>>>>   * This generic decompression function covers all use cases.
>>>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>        const size_t dictSize
>>>>>>>>        )
>>>>>>>>  {
>>>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>>>
>>>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>>>       BYTE *cpy;
>>>>>>>>
>>>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>>
>>>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>>>> KB)));
>>>>>>>>
>>>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>>>       const BYTE *const shortiend = iend -
>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>>       const BYTE *const shortoend = oend -
>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>> +
>>>>>>>> +     const BYTE *match;
>>>>>>>> +     size_t offset;
>>>>>>>> +     unsigned int token;
>>>>>>>> +     size_t length;
>>>>>>>>
>>>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>>>                srcSize, outputSize);
>>>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>>>               return -1;
>>>>>>>>
>>>>>>>> -     /* Main Loop : decode sequences */
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>>>> +             goto safe_decode;
>>>>>>>> +     }
>>>>>>>> +
>>>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>>>       while (1) {
>>>>>>>> -             size_t length;
>>>>>>>> -             const BYTE *match;
>>>>>>>> -             size_t offset;
>>>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>>>> +             if (endOnInput) {
>>>>>>>> +                     assert(ip < iend);
>>>>>>>> +             }
>>>>>>>> +             token = *ip++;
>>>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>>>> +
>>>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>>>> +
>>>>>>>> +             /* decode literal length */
>>>>>>>> +             if (length == RUN_MASK) {
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>> +                                              &error);
>>>>>>>> +                     if (error == initial_error) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>>>> +                                     (uptrval) (op))) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }       /* overflow detection */
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>>>> +                                     (uptrval) (ip))) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     /* overflow detection */
>>>>>>>> +                     /* copy literals */
>>>>>>>> +                     cpy = op + length;
>>>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>> +                             if ((cpy > oend - 32)
>>>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>>>> +                             if (cpy > oend - 8) {
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>>>> +                             /* properties */
>>>>>>>> +                     }
>>>>>>>> +                     ip += length;
>>>>>>>> +                     op = cpy;
>>>>>>>> +             } else {
>>>>>>>> +                     cpy = op + length;
>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>> +                             DEBUGLOG(7,
>>>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>>>> +                                      (unsigned)length);
>>>>>>>> +                             /* We don't need to check oend */
>>>>>>>> +                             /* since we check it once for each loop below */
>>>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>> +                             }
>>>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>>>> +                             /*if we copy by a register size */
>>>>>>>> +                             memcpy(op, ip, 16);
>>>>>>>> +                     } else {
>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>>>> +                             /* properties */
>>>>>>>> +                             memcpy(op, ip, 8);
>>>>>>>> +                             if (length > 8) {
>>>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +                     ip += length;
>>>>>>>> +                     op = cpy;
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* get offset */
>>>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>>>> +             match = op - offset;
>>>>>>>> +
>>>>>>>> +             /* get matchlength */
>>>>>>>> +             length = token & ML_MASK;
>>>>>>>>
>>>>>>>> -             /* get literal length */
>>>>>>>> -             unsigned int const token = *ip++;
>>>>>>>> -             length = token>>ML_BITS;
>>>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>>>> +                     goto _output_error;
>>>>>>>> +             }
>>>>>>>> +             /* Error : offset outside buffers */
>>>>>>>> +             if (length == ML_MASK) {
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>> +                     if (error != ok) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }
>>>>>>>> +                     if ((safeDecode)
>>>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>>>> +                             goto _output_error;
>>>>>>>> +                     }       /* overflow detection */
>>>>>>>> +                     length += MINMATCH;
>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +                             goto safe_match_copy;
>>>>>>>> +                     }
>>>>>>>> +             } else {
>>>>>>>> +                     length += MINMATCH;
>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>> +                             goto safe_match_copy;
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>>>> +                             if (offset >= 8) {
>>>>>>>> +                                     memcpy(op, match, 8);
>>>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>>>> +                                     op += length;
>>>>>>>> +                                     continue;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* match starting within external dictionary */
>>>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>> +                             if (partialDecoding) {
>>>>>>>> +                                     /* reach end of buffer */
>>>>>>>> +                                     length =
>>>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>>>> +                             } else {
>>>>>>>> +                                     /* end-of-block condition violated */
>>>>>>>> +                                     goto _output_error;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +
>>>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>>>> +                             op += length;
>>>>>>>> +                     } else {
>>>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>>>> +                             size_t const copySize =
>>>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>>>> +                             size_t const restSize = length - copySize;
>>>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>>>> +                             op += copySize;
>>>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>>>> +                                     while (op < endOfMatch) {
>>>>>>>> +                                             *op++ = *copyFrom++;
>>>>>>>> +                                     }
>>>>>>>> +                             } else {
>>>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>>>> +                                     op += restSize;
>>>>>>>> +                             }
>>>>>>>> +                     }
>>>>>>>> +                     continue;
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             /* copy match within block */
>>>>>>>> +             cpy = op + length;
>>>>>>>> +
>>>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>>>> +             if (unlikely(offset < 16)) {
>>>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>>>> +             } else {
>>>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>>>> +             }
>>>>>>>> +
>>>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>>>> +     }
>>>>>>>> +      safe_decode:
>>>>>>>> +#endif
>>>>>>>> +     /* Main Loop : decode sequences */
>>>>>>>> +     while (1) {
>>>>>>>> +             length = token >> ML_BITS;
>>>>>>>>
>>>>>>>>               /* ip < iend before the increment */
>>>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                * combined check for both stages).
>>>>>>>>                */
>>>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>>>> -                /*
>>>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>>>> -                 * the loop with at least one byte
>>>>>>>> -                 */
>>>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>> -                          (op <= shortoend))) {
>>>>>>>> +                 /*
>>>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>>>> +                  * the loop with at least one byte
>>>>>>>> +                  */
>>>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>> +                           (op <= shortoend))) {
>>>>>>>>                       /* Copy the literals */
>>>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>>>> -                     op += length; ip += length;
>>>>>>>> +                     op += length;
>>>>>>>> +                     ip += length;
>>>>>>>>
>>>>>>>>                       /*
>>>>>>>>                        * The second stage:
>>>>>>>>                        * prepare for match copying, decode full info.
>>>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>>>                        */
>>>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>>>                       ip += 2;
>>>>>>>>                       match = op - offset;
>>>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>>>
>>>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>>>                       if ((length != ML_MASK) &&
>>>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               /* decode literal length */
>>>>>>>>               if (length == RUN_MASK) {
>>>>>>>> -                     unsigned int s;
>>>>>>>>
>>>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>>>> -                             /* overflow detection */
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>> +                                              &error);
>>>>>>>> +                     if (error == initial_error)
>>>>>>>>                               goto _output_error;
>>>>>>>> -                     }
>>>>>>>> -                     do {
>>>>>>>> -                             s = *ip++;
>>>>>>>> -                             length += s;
>>>>>>>> -                     } while (likely(endOnInput
>>>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>>>> -                             : 1) & (s == 255));
>>>>>>>>
>>>>>>>>                       if ((safeDecode)
>>>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>>>> -                                     length < (uptrval)(op))) {
>>>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>>>                               /* overflow detection */
>>>>>>>>                               goto _output_error;
>>>>>>>>                       }
>>>>>>>>                       if ((safeDecode)
>>>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>>>                               /* overflow detection */
>>>>>>>>                               goto _output_error;
>>>>>>>>                       }
>>>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               /* copy literals */
>>>>>>>>               cpy = op + length;
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +           safe_literal_copy:
>>>>>>>> +#endif
>>>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>>
>>>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>> +                                   || (ip + length >
>>>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>>                       if (partialDecoding) {
>>>>>>>>                               if (cpy > oend) {
>>>>>>>>                                       /*
>>>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                                       length = oend - op;
>>>>>>>>                               }
>>>>>>>>                               if ((endOnInput)
>>>>>>>> -                                     && (ip + length > iend)) {
>>>>>>>> +                                 && (ip + length > iend)) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>>                                        * read attempt beyond
>>>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                               }
>>>>>>>>                       } else {
>>>>>>>>                               if ((!endOnInput)
>>>>>>>> -                                     && (cpy != oend)) {
>>>>>>>> +                                 && (cpy != oend)) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>>                                        * block decoding must
>>>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                                       goto _output_error;
>>>>>>>>                               }
>>>>>>>>                               if ((endOnInput)
>>>>>>>> -                                     && ((ip + length != iend)
>>>>>>>> +                                 && ((ip + length != iend)
>>>>>>>>                                       || (cpy > oend))) {
>>>>>>>>                                       /*
>>>>>>>>                                        * Error :
>>>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>                       goto _output_error;
>>>>>>>>               }
>>>>>>>>
>>>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>>>> -             /*
>>>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>>>> -              */
>>>>>>>> -             if (!partialDecoding) {
>>>>>>>> -                     assert(oend > op);
>>>>>>>> -                     assert(oend - op >= 4);
>>>>>>>> -
>>>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>>>> -             }
>>>>>>>> -
>>>>>>>>               if (length == ML_MASK) {
>>>>>>>> -                     unsigned int s;
>>>>>>>> -
>>>>>>>> -                     do {
>>>>>>>> -                             s = *ip++;
>>>>>>>> -
>>>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>>>> -                                     goto _output_error;
>>>>>>>>
>>>>>>>> -                             length += s;
>>>>>>>> -                     } while (s == 255);
>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>> +                     length +=
>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>> +                     if (error != ok)
>>>>>>>> +                             goto _output_error;
>>>>>>>>
>>>>>>>>                       if ((safeDecode)
>>>>>>>>                               && unlikely(
>>>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>
>>>>>>>>               length += MINMATCH;
>>>>>>>>
>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>> +safe_match_copy:
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>>               /* match starting within external dictionary */
>>>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-16  2:10                 ` Gao Xiang
@ 2019-05-16  6:52                   ` Mao, Chenxi
  0 siblings, 0 replies; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-16  6:52 UTC (permalink / raw)
  To: Gao Xiang
  Cc: Cyan, akpm, linux-kernel, Feng, Roy, Xu, Yuanli 2, Alm, Robert 2,
	Takahashi, Masaya A (Sony Mobile),
	Miao Xie, Greg Kroah-Hartman

Hi Xiang:

I checked my patch, this issue fixes already included in my first E-mail.
I am working on the patch PS2, I will send it later.

Chenxi

On 5/16/19 10:10 AM, Gao Xiang wrote:
> 
> 
> On 2019/5/16 10:07, Mao, Chenxi wrote:
>> Hi Xiang:
>>
>> I checked the deliver history.
>> There is only below delivery related with lz4.c
>> Pull request:616
>> 4e3accc Fix Dict Size Test in `LZ4_compress_fast_continue()`
>> 535636f Don't Attach Very Small Dictionaries
>> This 2 changes seems like ONLY bug fixes for dictionary fix baesd on 1.8.3
> 
> Have you taken a glance at
> 
> https://github.com/lz4/lz4/commit/5a6d72447ae998b387794df1135023689aa89995   at least?
> 
> and issues,
> 
> https://github.com/lz4/lz4/issues/676
> https://github.com/lz4/lz4/issues/681
> 
> That is what Yann said before.
> 
> Thanks,
> Gao Xiang
> 
>>
>> Based on current investigation result, I think pick decompress patches is safe for v1.8.3
>>
>> If there is any misunderstanding or faults, please feel free to let me know.
>>
>> Chenxi
>>
>> On 5/16/19 9:35 AM, Gao Xiang wrote:
>>>
>>>
>>> On 2019/5/16 7:48, Mao, Chenxi wrote:
>>>> Hi Yann and Xiang:
>>>> For this FAST_DEC_LOOP change, I only pick up decompress related patches to current kernel LZ4 implementation(based on 1.8.3).
>>>> Here is my cherry-pick list:
>>>> 2589c44 created LZ4_FAST_DEC_LOOP build macro
>>>> 605d811 enable LZ4_FAST_DEC_LOOP build macro on aarch64/GCC by default
>>>> 5d7d116 decompress_generic: Limit fastpath to x86
>>>> 75fb878 decompress_generic: Add fastpath for small offsets
>>>> faac110 decompress_generic: Unroll loops a bit more
>>>> 1fbaf84 decompress_generic: remove msan write
>>>> 28b8249 decompress_generic: re-add fastpath
>>>> 232f1e2 decompress_generic: drop partial copy check in fast loop
>>>> 59332a3 decompress_generic: Optimize literal copies
>>>> 5dfa7d4 decompress_generic: optimize match copy
>>>> 28356e0 decompress_generic: Add a loop fastpath
>>>> 4da3360 decompress_generic: Refactor variable length fields
>>>>
>>>> Only cherry-pick these changes to LZ4 1.8.3 would not be introduce risks.
>>>
>>> The fact is that ... LZ4_FAST_DEC_LOOP almost influences all decompression apis.
>>> You should keep LZ4_decompress_generic from all known issues.
>>>
>>>>
>>>> @Xiang:
>>>> Do you prefer to upgrade kernel lz4 to 1.9.1 directly or only pick some optimization patches?
>>>
>>> I think you should pick all patches which are related to LZ4_decompress_generic()
>>> and LZ4_FAST_DEC_LOOP since known issues of LZ4_FAST_DEC_LOOP should be fixed of course.
>>>
>>> Thanks,
>>> Gao Xiang
>>>
>>>>
>>>> Chenxi
>>>>
>>>>
>>>> On 5/16/19 1:47 AM, Gao Xiang wrote:
>>>>> Hi Yann,
>>>>>
>>>>> On 2019/5/16 1:03, Cyan wrote:
>>>>>> Re-posted,
>>>>>> it seems the previous message was rejected by the linux-kernel server
>>>>>> due to some kind of format limitation (no html).
>>>>>>
>>>>>>
>>>>>> Le mer. 15 mai 2019 à 09:56, Cyan <yann.collet.73@gmail.com> a écrit :
>>>>>>>
>>>>>>> The v1.9.0 version has a bug which makes it read a few bytes out of bound in certain cases.
>>>>>>> This was fixed in v1.9.1.
>>>>>>> Therefore, if you plan upgrading version, skip directly to v1.9.1.
>>>>>>>
>>>>>>> Note that, in v1.9.1, LZ4_decompress_fast() is now deprecated.
>>>>>>> LZ4_decompress_fast() is a security liability, since it's unable to cope with malicious inputs.
>>>>>>> While this is not a problem when the producer is under direct control (local production and consumption, no intermediate storage),
>>>>>>> this entry point seems misused in multiple scenarios where input cannot be decently trusted.
>>>>>>> The deprecation warning is meant to "scare away" such usages.
>>>>>>> It's still possible to intentionally remove the warning, for users who know what they are doing.
>>>>>>>
>>>>>>> Note that, on top of that, the _fast() variant is now slower that LZ4_decompress_safe(), removing its most important positive differentiator.
>>>>>>> That's because LZ4_decompress_fast() is "blind", it doesn't know the input size.
>>>>>>> It just guesses it, from the requested output size, and by relying on end-of-block conditions.
>>>>>>> This effectively limits its capability to copy up to 8 bytes at a time,
>>>>>>> which proves slower than the 32-bytes at a time used in LZ4_decompress_safe().
>>>>>>>
>>>>>>> LZ4_decompress_fast() still has some possible usage :
>>>>>>> it doesn't need the compressed size.
>>>>>>> Therefore, in a scenario where decompressed size is known, but compressed size isn't,
>>>>>>> it cannot be replaced by `LZ4_decompress_safe()`.
>>>>>>> This is pretty rare, but can happen in some cases, erofs for example.
>>>>>>> `LZ4_decompress_safe_partial()` *might* be able to replace it for such scenario,
>>>>>>> but I haven't thoroughly tested it, so can't yet vouch for it.
>>>>>
>>>>> Yes, that is my real concern...
>>>>>
>>>>> In fact, we have shipped erofs with LZ4_decompress_safe_partial() of lz4-1.8.3
>>>>> for our HUAWEI P30/P30 Pro and other HUAWEI EMUI 9.1 phones on the market...
>>>>>
>>>>> It seemd lz4-1.8.3 is rather stable since no stability report from our internal
>>>>> beta test and real consumers...
>>>>>
>>>>> I haven't tested lz4-1.9.0 yet since I am working on the erofs paper [1]...
>>>>>
>>>>> But anyway, the patch looks good to me since it has obvious performance gain...
>>>>>
>>>>>>>
>>>>>>> If it doesn't, then it will be necessary to create a new entry point.
>>>>>>> The main difference is that this new entry point will need a bound for the input size, in order to guarantee it never reads beyond input buffer.
>>>>>>> But as said, for the time being, such dedicated entry point doesn't exist.
>>>>>
>>>>> I agree, but current erofs still have to choose LZ4_decompress_fast...
>>>>> I will test these 2 apis later and will report if something is wrong as well.
>>>>>
>>>>> [1] https://www.usenix.org/conference/atc19/presentation/gao
>>>>>     https://www.usenix.org/conference/atc19/technical-sessions
>>>>>
>>>>> Thanks,
>>>>> Gao Xiang
>>>>>
>>>>>
>>>>>>>
>>>>>>> The new FAST_DEC_LOOP setting works well on x86 and x64.
>>>>>>> Recently, it has been extended to gcc+arm64 (`dev` branch).
>>>>>>> The combination of clang+arm64 is less successful though (impacts vary, depending on version and exact hardware), so it's still disabled by default.
>>>>>>> But anyone can test it by setting FAST_DEC_LOOP definition to 1, thus enabling the new decoder loop.
>>>>>>>
>>>>>>>
>>>>>>> Y.
>>>>>>>
>>>>>>> Le mar. 14 mai 2019 à 20:28, Mao, Chenxi <Chenxi.Mao@sony.com> a écrit :
>>>>>>>>
>>>>>>>> Hi Xiang:
>>>>>>>>
>>>>>>>> Thanks for your reply, I will have a stress test on my device later.
>>>>>>>> I didn't have chance to test LZ4 with clang build because of device limitation. I think I could do it later.
>>>>>>>> I guess the clang performance downgrade is caused by some compiler optimization options.
>>>>>>>> I will double check it later if I got the device which can build kernel with clang.
>>>>>>>>
>>>>>>>> BTW, the FAST_DEC_LOOP leverage LZ4_wildCopy8 instead of LZ4_wildCopy,
>>>>>>>> however based on my test, original LZ4_wildCopy has the better performance.
>>>>>>>> Original LZ4_wildCopy API still invoked by FAST_DEC_LOOP apis.
>>>>>>>> That might be a problem for X86 devices for current patch.
>>>>>>>>
>>>>>>>> Chenxi
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Gao Xiang [mailto:gaoxiang25@huawei.com]
>>>>>>>> Sent: Wednesday, May 15, 2019 10:21 AM
>>>>>>>> To: Mao, Chenxi <Chenxi.Mao@sony.com>
>>>>>>>> Cc: akpm@linux-foundation.org; linux-kernel@vger.kernel.org; Feng, Roy <Roy.Feng@sony.com>; Xu, Yuanli 2 <Yuanli.Xu@sony.com>; Alm, Robert 2 <Robert.Alm@sony.com>; Takahashi, Masaya A (Sony Mobile) <Masaya.A.Takahashi@sony.com>; Miao Xie <miaoxie@huawei.com>
>>>>>>>> Subject: Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
>>>>>>>>
>>>>>>>> Hi Chenxi,
>>>>>>>>
>>>>>>>> On 2019/5/15 8:43, Chenxi Mao wrote:
>>>>>>>>> FAST_DEC_LOOP was introduced from LZ4 1.9.
>>>>>>>>> This change would be introduce 10% on decompress operation according
>>>>>>>>> to LZ4 benchmark result on X86 devices.
>>>>>>>>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements, however
>>>>>>>>> clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>>>>>>>
>>>>>>>> I noticed this optimization and lz4-1.9.0 [1] was just released month ago with this big change therefore I'm a little afraid of its current stability (especially some rare used decompression apis...)
>>>>>>>>
>>>>>>>> Could you Cc more people (e.g. lz4 original author Yann Collet) in order to get some more ideas about this if possible?
>>>>>>>>
>>>>>>>> Anyway, I like this optimization as well since FAST_DEC_LOOP improves decompression speed a lot :)
>>>>>>>>
>>>>>>>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Gao Xiang
>>>>>>>>
>>>>>>>>>
>>>>>>>>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>>>>>>>>
>>>>>>>>> Here is the test result on ARM64(cortex-A53)
>>>>>>>>>
>>>>>>>>> Benchmark via ZRAM:
>>>>>>>>>
>>>>>>>>> Test case:
>>>>>>>>> fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>>>>>>>>> --buffer_compress_percentage=75 \
>>>>>>>>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
>>>>>>>>> --filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
>>>>>>>>> --name=seq-read --rw=read --stonewall --name=seq-readwrite \ --rw=rw
>>>>>>>>> --stonewall --name=rand-readwrite --rw=randrw --stonewall
>>>>>>>>>
>>>>>>>>> Patched:
>>>>>>>>>    READ: bw=7077MiB/s (7421MB/s)
>>>>>>>>> Vanilla:
>>>>>>>>>    READ: bw=5134MiB/s (5384MB/s)
>>>>>>>>>
>>>>>>>>> Reference:
>>>>>>>>> 1. https://github.com/lz4/lz4/pull/645
>>>>>>>>> 2. https://github.com/lz4/lz4/pull/707
>>>>>>>>>
>>>>>>>>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>>>>>>>>> ---
>>>>>>>>>  lib/lz4/lz4_decompress.c | 425
>>>>>>>>> +++++++++++++++++++++++++++++++++------
>>>>>>>>>  1 file changed, 361 insertions(+), 64 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index
>>>>>>>>> 0c9d3ad17e0f..a4c87e32b3c0 100644
>>>>>>>>> --- a/lib/lz4/lz4_decompress.c
>>>>>>>>> +++ b/lib/lz4/lz4_decompress.c
>>>>>>>>> @@ -50,6 +50,131 @@
>>>>>>>>>  #define assert(condition) ((void)0)
>>>>>>>>>  #endif
>>>>>>>>>
>>>>>>>>> +#ifndef LZ4_FAST_DEC_LOOP
>>>>>>>>> +#if defined(__i386__) || defined(__x86_64__) #define
>>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #elif defined(__aarch64__) && !defined(__clang__)
>>>>>>>>> +     /* On aarch64, we disable this optimization for clang because on certain
>>>>>>>>> +      * mobile chipsets and clang, it reduces performance. For more information
>>>>>>>>> +      * refer to https://github.com/lz4/lz4/pull/707. */ #define
>>>>>>>>> +LZ4_FAST_DEC_LOOP 1 #else #define LZ4_FAST_DEC_LOOP 0 #endif #endif
>>>>>>>>> +
>>>>>>>>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>>>>>>>>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>>>>>>>>> +
>>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>>> +#define FASTLOOP_SAFE_DISTANCE 64
>>>>>>>>> +FORCE_INLINE void
>>>>>>>>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>>>>>>>>> +                          const size_t offset)
>>>>>>>>> +{
>>>>>>>>> +     if (offset < 8) {
>>>>>>>>> +             dstPtr[0] = srcPtr[0];
>>>>>>>>> +
>>>>>>>>> +             dstPtr[1] = srcPtr[1];
>>>>>>>>> +             dstPtr[2] = srcPtr[2];
>>>>>>>>> +             dstPtr[3] = srcPtr[3];
>>>>>>>>> +             srcPtr += inc32table[offset];
>>>>>>>>> +             memcpy(dstPtr + 4, srcPtr, 4);
>>>>>>>>> +             srcPtr -= dec64table[offset];
>>>>>>>>> +             dstPtr += 8;
>>>>>>>>> +     } else {
>>>>>>>>> +             memcpy(dstPtr, srcPtr, 8);
>>>>>>>>> +             dstPtr += 8;
>>>>>>>>> +             srcPtr += 8;
>>>>>>>>> +     }
>>>>>>>>> +
>>>>>>>>> +     LZ4_wildCopy(dstPtr, srcPtr, dstEnd); }
>>>>>>>>> +
>>>>>>>>> +/* customized variant of memcpy, which can overwrite up to 32 bytes
>>>>>>>>> +beyond dstEnd
>>>>>>>>> + * this version copies two times 16 bytes (instead of one time 32
>>>>>>>>> +bytes)
>>>>>>>>> + * because it must be compatible with offsets >= 16. */ FORCE_INLINE
>>>>>>>>> +void LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd) {
>>>>>>>>> +     BYTE *d = (BYTE *) dstPtr;
>>>>>>>>> +     const BYTE *s = (const BYTE *)srcPtr;
>>>>>>>>> +     BYTE *const e = (BYTE *) dstEnd;
>>>>>>>>> +
>>>>>>>>> +     do {
>>>>>>>>> +             memcpy(d, s, 16);
>>>>>>>>> +             memcpy(d + 16, s + 16, 16);
>>>>>>>>> +             d += 32;
>>>>>>>>> +             s += 32;
>>>>>>>>> +     } while (d < e);
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +FORCE_INLINE void
>>>>>>>>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>>>>>>>>> +                     const size_t offset)
>>>>>>>>> +{
>>>>>>>>> +     BYTE v[8];
>>>>>>>>> +     switch (offset) {
>>>>>>>>> +
>>>>>>>>> +     case 1:
>>>>>>>>> +             memset(v, *srcPtr, 8);
>>>>>>>>> +             goto copy_loop;
>>>>>>>>> +     case 2:
>>>>>>>>> +             memcpy(v, srcPtr, 2);
>>>>>>>>> +             memcpy(&v[2], srcPtr, 2);
>>>>>>>>> +             memcpy(&v[4], &v[0], 4);
>>>>>>>>> +             goto copy_loop;
>>>>>>>>> +     case 4:
>>>>>>>>> +             memcpy(v, srcPtr, 4);
>>>>>>>>> +             memcpy(&v[4], srcPtr, 4);
>>>>>>>>> +             goto copy_loop;
>>>>>>>>> +     default:
>>>>>>>>> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>>>>>>>>> +             return;
>>>>>>>>> +     }
>>>>>>>>> +
>>>>>>>>> +      copy_loop:
>>>>>>>>> +     memcpy(dstPtr, v, 8);
>>>>>>>>> +     dstPtr += 8;
>>>>>>>>> +     while (dstPtr < dstEnd) {
>>>>>>>>> +             memcpy(dstPtr, v, 8);
>>>>>>>>> +             dstPtr += 8;
>>>>>>>>> +     }
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> +/* Read the variable-length literal or match length.
>>>>>>>>> + *
>>>>>>>>> + * ip - pointer to use as input.
>>>>>>>>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>>>>>>>>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>>>>>>>>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>>>>>>>>> + * error (output) - error code.  Should be set to 0 before call.
>>>>>>>>> + */
>>>>>>>>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0}
>>>>>>>>> +variable_length_error; FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>>>>>>>>> +                                        const BYTE *lencheck,
>>>>>>>>> +                                        int loop_check, int initial_check,
>>>>>>>>> +                                        variable_length_error *error) {
>>>>>>>>> +     unsigned length = 0;
>>>>>>>>> +     unsigned s;
>>>>>>>>> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow detection */
>>>>>>>>> +             *error = initial_error;
>>>>>>>>> +             return length;
>>>>>>>>> +     }
>>>>>>>>> +     do {
>>>>>>>>> +             s = **ip;
>>>>>>>>> +             (*ip)++;
>>>>>>>>> +             length += s;
>>>>>>>>> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* overflow detection */
>>>>>>>>> +                     *error = loop_error;
>>>>>>>>> +                     return length;
>>>>>>>>> +             }
>>>>>>>>> +     } while (s == 255);
>>>>>>>>> +
>>>>>>>>> +     return length;
>>>>>>>>> +}
>>>>>>>>> +#endif
>>>>>>>>> +
>>>>>>>>>  /*
>>>>>>>>>   * LZ4_decompress_generic() :
>>>>>>>>>   * This generic decompression function covers all use cases.
>>>>>>>>> @@ -80,25 +205,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>        const size_t dictSize
>>>>>>>>>        )
>>>>>>>>>  {
>>>>>>>>> -     const BYTE *ip = (const BYTE *) src;
>>>>>>>>> -     const BYTE * const iend = ip + srcSize;
>>>>>>>>> +     const BYTE *ip = (const BYTE *)src;
>>>>>>>>> +     const BYTE *const iend = ip + srcSize;
>>>>>>>>>
>>>>>>>>>       BYTE *op = (BYTE *) dst;
>>>>>>>>> -     BYTE * const oend = op + outputSize;
>>>>>>>>> +     BYTE *const oend = op + outputSize;
>>>>>>>>>       BYTE *cpy;
>>>>>>>>>
>>>>>>>>> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>>> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>>>>>>>>> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>>>>>>>>> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>>>>>>>>
>>>>>>>>>       const int safeDecode = (endOnInput == endOnInputSize);
>>>>>>>>>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 *
>>>>>>>>> KB)));
>>>>>>>>>
>>>>>>>>>       /* Set up the "end" pointers for the shortcut. */
>>>>>>>>>       const BYTE *const shortiend = iend -
>>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>>>>>>>>       const BYTE *const shortoend = oend -
>>>>>>>>> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>>> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>>>>>>>>> +
>>>>>>>>> +     const BYTE *match;
>>>>>>>>> +     size_t offset;
>>>>>>>>> +     unsigned int token;
>>>>>>>>> +     size_t length;
>>>>>>>>>
>>>>>>>>>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>>>>>>>>                srcSize, outputSize);
>>>>>>>>> @@ -117,15 +245,194 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>       if ((endOnInput) && unlikely(srcSize == 0))
>>>>>>>>>               return -1;
>>>>>>>>>
>>>>>>>>> -     /* Main Loop : decode sequences */
>>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>>> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>>>>>>>>> +             DEBUGLOG(6, "skip fast decode loop");
>>>>>>>>> +             goto safe_decode;
>>>>>>>>> +     }
>>>>>>>>> +
>>>>>>>>> +     /* Fast loop : decode sequences as long as output <
>>>>>>>>> +iend-FASTLOOP_SAFE_DISTANCE */
>>>>>>>>>       while (1) {
>>>>>>>>> -             size_t length;
>>>>>>>>> -             const BYTE *match;
>>>>>>>>> -             size_t offset;
>>>>>>>>> +             /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>>>>>>>>> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>>>>>>>>> +             if (endOnInput) {
>>>>>>>>> +                     assert(ip < iend);
>>>>>>>>> +             }
>>>>>>>>> +             token = *ip++;
>>>>>>>>> +             length = token >> ML_BITS;      /* literal length */
>>>>>>>>> +
>>>>>>>>> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the increment */
>>>>>>>>> +
>>>>>>>>> +             /* decode literal length */
>>>>>>>>> +             if (length == RUN_MASK) {
>>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>>> +                     length +=
>>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>>> +                                              &error);
>>>>>>>>> +                     if (error == initial_error) {
>>>>>>>>> +                             goto _output_error;
>>>>>>>>> +                     }
>>>>>>>>> +                     if ((safeDecode)
>>>>>>>>> +                         && unlikely((uptrval) (op) + length <
>>>>>>>>> +                                     (uptrval) (op))) {
>>>>>>>>> +                             goto _output_error;
>>>>>>>>> +                     }       /* overflow detection */
>>>>>>>>> +                     if ((safeDecode)
>>>>>>>>> +                         && unlikely((uptrval) (ip) + length <
>>>>>>>>> +                                     (uptrval) (ip))) {
>>>>>>>>> +                             goto _output_error;
>>>>>>>>> +                     }
>>>>>>>>> +
>>>>>>>>> +                     /* overflow detection */
>>>>>>>>> +                     /* copy literals */
>>>>>>>>> +                     cpy = op + length;
>>>>>>>>> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>>> +                             if ((cpy > oend - 32)
>>>>>>>>> +                                 || (ip + length > iend - 32)) {
>>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>>> +                             }
>>>>>>>>> +                             LZ4_wildCopy32(op, ip, cpy);
>>>>>>>>> +                     } else {        /* LZ4_decompress_fast() */
>>>>>>>>> +                             if (cpy > oend - 8) {
>>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>>> +                             }
>>>>>>>>> +                             LZ4_wildCopy(op, ip, cpy);
>>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>>> +                             /* it doesn't know input length, and only relies on end-of-block */
>>>>>>>>> +                             /* properties */
>>>>>>>>> +                     }
>>>>>>>>> +                     ip += length;
>>>>>>>>> +                     op = cpy;
>>>>>>>>> +             } else {
>>>>>>>>> +                     cpy = op + length;
>>>>>>>>> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
>>>>>>>>> +                             DEBUGLOG(7,
>>>>>>>>> +                                      "copy %u bytes in a 16-bytes stripe",
>>>>>>>>> +                                      (unsigned)length);
>>>>>>>>> +                             /* We don't need to check oend */
>>>>>>>>> +                             /* since we check it once for each loop below */
>>>>>>>>> +                             if (ip > iend - (16 + 1)) {     /*max lit + offset + nextToken */
>>>>>>>>> +                                     goto safe_literal_copy;
>>>>>>>>> +                             }
>>>>>>>>> +                             /* Literals can only be 14, but hope compilers optimize */
>>>>>>>>> +                             /*if we copy by a register size */
>>>>>>>>> +                             memcpy(op, ip, 16);
>>>>>>>>> +                     } else {
>>>>>>>>> +                             /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>>>>>>>>> +                             /* it doesn't know input length, and relies on end-of-block */
>>>>>>>>> +                             /* properties */
>>>>>>>>> +                             memcpy(op, ip, 8);
>>>>>>>>> +                             if (length > 8) {
>>>>>>>>> +                                     memcpy(op + 8, ip + 8, 8);
>>>>>>>>> +                             }
>>>>>>>>> +                     }
>>>>>>>>> +                     ip += length;
>>>>>>>>> +                     op = cpy;
>>>>>>>>> +             }
>>>>>>>>> +
>>>>>>>>> +             /* get offset */
>>>>>>>>> +             offset = LZ4_readLE16(ip);
>>>>>>>>> +             ip += 2;        /* end-of-block condition violated */
>>>>>>>>> +             match = op - offset;
>>>>>>>>> +
>>>>>>>>> +             /* get matchlength */
>>>>>>>>> +             length = token & ML_MASK;
>>>>>>>>>
>>>>>>>>> -             /* get literal length */
>>>>>>>>> -             unsigned int const token = *ip++;
>>>>>>>>> -             length = token>>ML_BITS;
>>>>>>>>> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>>>>>>>>> +                     goto _output_error;
>>>>>>>>> +             }
>>>>>>>>> +             /* Error : offset outside buffers */
>>>>>>>>> +             if (length == ML_MASK) {
>>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>>> +                     length +=
>>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>>> +                     if (error != ok) {
>>>>>>>>> +                             goto _output_error;
>>>>>>>>> +                     }
>>>>>>>>> +                     if ((safeDecode)
>>>>>>>>> +                         && unlikely((uptrval) (op) + length < (uptrval) op)) {
>>>>>>>>> +                             goto _output_error;
>>>>>>>>> +                     }       /* overflow detection */
>>>>>>>>> +                     length += MINMATCH;
>>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>>> +                             goto safe_match_copy;
>>>>>>>>> +                     }
>>>>>>>>> +             } else {
>>>>>>>>> +                     length += MINMATCH;
>>>>>>>>> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>>>>>>>>> +                             goto safe_match_copy;
>>>>>>>>> +                     }
>>>>>>>>> +
>>>>>>>>> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>>>>>>>>> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>>>>>>>>> +                             if (offset >= 8) {
>>>>>>>>> +                                     memcpy(op, match, 8);
>>>>>>>>> +                                     memcpy(op + 8, match + 8, 8);
>>>>>>>>> +                                     memcpy(op + 16, match + 16, 2);
>>>>>>>>> +                                     op += length;
>>>>>>>>> +                                     continue;
>>>>>>>>> +                             }
>>>>>>>>> +                     }
>>>>>>>>> +             }
>>>>>>>>> +
>>>>>>>>> +             /* match starting within external dictionary */
>>>>>>>>> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>>> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>>> +                             if (partialDecoding) {
>>>>>>>>> +                                     /* reach end of buffer */
>>>>>>>>> +                                     length =
>>>>>>>>> +                                         min(length, (size_t) (oend - op));
>>>>>>>>> +                             } else {
>>>>>>>>> +                                     /* end-of-block condition violated */
>>>>>>>>> +                                     goto _output_error;
>>>>>>>>> +                             }
>>>>>>>>> +                     }
>>>>>>>>> +
>>>>>>>>> +                     if (length <= (size_t) (lowPrefix - match)) {
>>>>>>>>> +                             /* match fits entirely within external dictionary : just copy */
>>>>>>>>> +                             memmove(op, dictEnd - (lowPrefix - match), length);
>>>>>>>>> +                             op += length;
>>>>>>>>> +                     } else {
>>>>>>>>> +                             /* match stretches into both external dict and current block */
>>>>>>>>> +                             size_t const copySize =
>>>>>>>>> +                                 (size_t) (lowPrefix - match);
>>>>>>>>> +                             size_t const restSize = length - copySize;
>>>>>>>>> +                             memcpy(op, dictEnd - copySize, copySize);
>>>>>>>>> +                             op += copySize;
>>>>>>>>> +                             if (restSize > (size_t) (op - lowPrefix)) {     /* overlap copy */
>>>>>>>>> +                                     BYTE *const endOfMatch = op + restSize;
>>>>>>>>> +                                     const BYTE *copyFrom = lowPrefix;
>>>>>>>>> +                                     while (op < endOfMatch) {
>>>>>>>>> +                                             *op++ = *copyFrom++;
>>>>>>>>> +                                     }
>>>>>>>>> +                             } else {
>>>>>>>>> +                                     memcpy(op, lowPrefix, restSize);
>>>>>>>>> +                                     op += restSize;
>>>>>>>>> +                             }
>>>>>>>>> +                     }
>>>>>>>>> +                     continue;
>>>>>>>>> +             }
>>>>>>>>> +
>>>>>>>>> +             /* copy match within block */
>>>>>>>>> +             cpy = op + length;
>>>>>>>>> +
>>>>>>>>> +             assert((op <= oend) && (oend - op >= 32));
>>>>>>>>> +             if (unlikely(offset < 16)) {
>>>>>>>>> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
>>>>>>>>> +             } else {
>>>>>>>>> +                     LZ4_wildCopy32(op, match, cpy);
>>>>>>>>> +             }
>>>>>>>>> +
>>>>>>>>> +             op = cpy;       /* wildcopy correction */
>>>>>>>>> +     }
>>>>>>>>> +      safe_decode:
>>>>>>>>> +#endif
>>>>>>>>> +     /* Main Loop : decode sequences */
>>>>>>>>> +     while (1) {
>>>>>>>>> +             length = token >> ML_BITS;
>>>>>>>>>
>>>>>>>>>               /* ip < iend before the increment */
>>>>>>>>>               assert(!endOnInput || ip <= iend);
>>>>>>>>> @@ -143,26 +450,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>                * combined check for both stages).
>>>>>>>>>                */
>>>>>>>>>               if ((endOnInput ? length != RUN_MASK : length <= 8)
>>>>>>>>> -                /*
>>>>>>>>> -                 * strictly "less than" on input, to re-enter
>>>>>>>>> -                 * the loop with at least one byte
>>>>>>>>> -                 */
>>>>>>>>> -                && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>>> -                          (op <= shortoend))) {
>>>>>>>>> +                 /*
>>>>>>>>> +                  * strictly "less than" on input, to re-enter
>>>>>>>>> +                  * the loop with at least one byte
>>>>>>>>> +                  */
>>>>>>>>> +                 && likely((endOnInput ? ip < shortiend : 1) &
>>>>>>>>> +                           (op <= shortoend))) {
>>>>>>>>>                       /* Copy the literals */
>>>>>>>>>                       memcpy(op, ip, endOnInput ? 16 : 8);
>>>>>>>>> -                     op += length; ip += length;
>>>>>>>>> +                     op += length;
>>>>>>>>> +                     ip += length;
>>>>>>>>>
>>>>>>>>>                       /*
>>>>>>>>>                        * The second stage:
>>>>>>>>>                        * prepare for match copying, decode full info.
>>>>>>>>>                        * If it doesn't work out, the info won't be wasted.
>>>>>>>>>                        */
>>>>>>>>> -                     length = token & ML_MASK; /* match length */
>>>>>>>>> +                     length = token & ML_MASK;       /* match length */
>>>>>>>>>                       offset = LZ4_readLE16(ip);
>>>>>>>>>                       ip += 2;
>>>>>>>>>                       match = op - offset;
>>>>>>>>> -                     assert(match <= op); /* check overflow */
>>>>>>>>> +                     assert(match <= op);    /* check overflow */
>>>>>>>>>
>>>>>>>>>                       /* Do not deal with overlapping matches. */
>>>>>>>>>                       if ((length != ML_MASK) &&
>>>>>>>>> @@ -187,28 +495,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>
>>>>>>>>>               /* decode literal length */
>>>>>>>>>               if (length == RUN_MASK) {
>>>>>>>>> -                     unsigned int s;
>>>>>>>>>
>>>>>>>>> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>>>>>>>>> -                             /* overflow detection */
>>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>>> +                     length +=
>>>>>>>>> +                         read_variable_length(&ip, iend - RUN_MASK,
>>>>>>>>> +                                              endOnInput, endOnInput,
>>>>>>>>> +                                              &error);
>>>>>>>>> +                     if (error == initial_error)
>>>>>>>>>                               goto _output_error;
>>>>>>>>> -                     }
>>>>>>>>> -                     do {
>>>>>>>>> -                             s = *ip++;
>>>>>>>>> -                             length += s;
>>>>>>>>> -                     } while (likely(endOnInput
>>>>>>>>> -                             ? ip < iend - RUN_MASK
>>>>>>>>> -                             : 1) & (s == 255));
>>>>>>>>>
>>>>>>>>>                       if ((safeDecode)
>>>>>>>>> -                         && unlikely((uptrval)(op) +
>>>>>>>>> -                                     length < (uptrval)(op))) {
>>>>>>>>> +                         && unlikely((uptrval) (op) +
>>>>>>>>> +                                     length < (uptrval) (op))) {
>>>>>>>>>                               /* overflow detection */
>>>>>>>>>                               goto _output_error;
>>>>>>>>>                       }
>>>>>>>>>                       if ((safeDecode)
>>>>>>>>> -                         && unlikely((uptrval)(ip) +
>>>>>>>>> -                                     length < (uptrval)(ip))) {
>>>>>>>>> +                         && unlikely((uptrval) (ip) +
>>>>>>>>> +                                     length < (uptrval) (ip))) {
>>>>>>>>>                               /* overflow detection */
>>>>>>>>>                               goto _output_error;
>>>>>>>>>                       }
>>>>>>>>> @@ -216,11 +520,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>
>>>>>>>>>               /* copy literals */
>>>>>>>>>               cpy = op + length;
>>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>>> +           safe_literal_copy:
>>>>>>>>> +#endif
>>>>>>>>>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>>>>>>>>
>>>>>>>>>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
>>>>>>>>> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
>>>>>>>>> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>>> +                                   || (ip + length >
>>>>>>>>> +                                       iend - (2 + 1 + LASTLITERALS))))
>>>>>>>>> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>>>>>>>>                       if (partialDecoding) {
>>>>>>>>>                               if (cpy > oend) {
>>>>>>>>>                                       /*
>>>>>>>>> @@ -231,7 +539,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>                                       length = oend - op;
>>>>>>>>>                               }
>>>>>>>>>                               if ((endOnInput)
>>>>>>>>> -                                     && (ip + length > iend)) {
>>>>>>>>> +                                 && (ip + length > iend)) {
>>>>>>>>>                                       /*
>>>>>>>>>                                        * Error :
>>>>>>>>>                                        * read attempt beyond
>>>>>>>>> @@ -241,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>                               }
>>>>>>>>>                       } else {
>>>>>>>>>                               if ((!endOnInput)
>>>>>>>>> -                                     && (cpy != oend)) {
>>>>>>>>> +                                 && (cpy != oend)) {
>>>>>>>>>                                       /*
>>>>>>>>>                                        * Error :
>>>>>>>>>                                        * block decoding must
>>>>>>>>> @@ -250,7 +558,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>                                       goto _output_error;
>>>>>>>>>                               }
>>>>>>>>>                               if ((endOnInput)
>>>>>>>>> -                                     && ((ip + length != iend)
>>>>>>>>> +                                 && ((ip + length != iend)
>>>>>>>>>                                       || (cpy > oend))) {
>>>>>>>>>                                       /*
>>>>>>>>>                                        * Error :
>>>>>>>>> @@ -288,29 +596,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>                       goto _output_error;
>>>>>>>>>               }
>>>>>>>>>
>>>>>>>>> -             /* costs ~1%; silence an msan warning when offset == 0 */
>>>>>>>>> -             /*
>>>>>>>>> -              * note : when partialDecoding, there is no guarantee that
>>>>>>>>> -              * at least 4 bytes remain available in output buffer
>>>>>>>>> -              */
>>>>>>>>> -             if (!partialDecoding) {
>>>>>>>>> -                     assert(oend > op);
>>>>>>>>> -                     assert(oend - op >= 4);
>>>>>>>>> -
>>>>>>>>> -                     LZ4_write32(op, (U32)offset);
>>>>>>>>> -             }
>>>>>>>>> -
>>>>>>>>>               if (length == ML_MASK) {
>>>>>>>>> -                     unsigned int s;
>>>>>>>>> -
>>>>>>>>> -                     do {
>>>>>>>>> -                             s = *ip++;
>>>>>>>>> -
>>>>>>>>> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
>>>>>>>>> -                                     goto _output_error;
>>>>>>>>>
>>>>>>>>> -                             length += s;
>>>>>>>>> -                     } while (s == 255);
>>>>>>>>> +                     variable_length_error error = ok;
>>>>>>>>> +                     length +=
>>>>>>>>> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
>>>>>>>>> +                                              endOnInput, 0, &error);
>>>>>>>>> +                     if (error != ok)
>>>>>>>>> +                             goto _output_error;
>>>>>>>>>
>>>>>>>>>                       if ((safeDecode)
>>>>>>>>>                               && unlikely(
>>>>>>>>> @@ -322,6 +615,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>>>>>>>>
>>>>>>>>>               length += MINMATCH;
>>>>>>>>>
>>>>>>>>> +#if LZ4_FAST_DEC_LOOP
>>>>>>>>> +safe_match_copy:
>>>>>>>>> +#endif
>>>>>>>>> +
>>>>>>>>>               /* match starting within external dictionary */
>>>>>>>>>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>>>>>>>>>                       if (unlikely(op + length > oend - LASTLITERALS)) {
>>>>>>>>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-17  6:35 ` Gao Xiang
@ 2019-05-17  7:06   ` Mao, Chenxi
  0 siblings, 0 replies; 16+ messages in thread
From: Mao, Chenxi @ 2019-05-17  7:06 UTC (permalink / raw)
  To: Gao Xiang
  Cc: akpm, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, yann.collet.73, miaoxie

Hi Xiang:

Got it and thanks.
Will follow the rules in next PS.

Chenxi

On 5/17/19 2:35 PM, Gao Xiang wrote:
> Hi Chenxi,
> 
> Some words about the patch format.. not important tho.
> 
> One suggestion is that the subject line should be better written
> as "[PATCH v2/v3/...] title" since it's more clear to know
> which patch is the latest patch among these emails.
> 
> On 2019/5/17 13:56, Chenxi Mao wrote:
>> FAST_DEC_LOOP was introduced from LZ4 1.9.0[1]
>> This change would be introduce 10% on decompress operation
>> according to LZ4 benchmark result on X86 devices.
>> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements on ARM64,
>> however clang compiler has downgrade if FAST_DEC_LOOP enabled.
>>
>> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
>>
>> LZ4 FAST_DEC_LOOP bug fixes include as well.
>> 1. fixed read-after input in LZ4_decompress_safe() (issue 681)
>> 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)
>>
>> PS2:
>> 1. Move common API to lz4defs.h
>> 2. Add PPC related inline Macro defination.
>> 3. Force inline new static apis.
>>
>> Here is the test result on ARM64(cortex-A53)
>> Benchmark via ZRAM:
>>
>> Test case:
>> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>> --buffer_compress_percentage=75  --size=700M \
>> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
>> --filename=/data/test/test --name=seq-read --rw=read --stonewall
>>
>> Patched:
>>     READ: bw=150MiB/s (157MB/s)
>> Vanilla:
>>     READ: bw=135MiB/s (142MB/s)
>>
>> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>>
>> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
>> ---
> 
> It's perferred to move all changelogs here if you don't want these changelog
> as a part of commit and there are some patches which can be referenced:
> https://lore.kernel.org/lkml/
> 
> Thanks,
> Gao Xiang
> 
>>  lib/lz4/lz4_compress.c   |   4 +-
>>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>>  lib/lz4/lz4defs.h        |  60 +++++-
>>  lib/lz4/lz4hc_compress.c |   2 +-
>>  4 files changed, 392 insertions(+), 71 deletions(-)
>>
>> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
>> index cc7b6d4cc7c7..b703ed1ca57d 100644
>> --- a/lib/lz4/lz4_compress.c
>> +++ b/lib/lz4/lz4_compress.c
>> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>>  				*token = (BYTE)(litLength << ML_BITS);
>>  
>>  			/* Copy Literals */
>> -			LZ4_wildCopy(op, anchor, op + litLength);
>> +			LZ4_wildCopy8(op, anchor, op + litLength);
>>  			op += litLength;
>>  		}
>>  
>> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>>  				*token = (BYTE)(litLength << ML_BITS);
>>  
>>  			/* Copy Literals */
>> -			LZ4_wildCopy(op, anchor, op + litLength);
>> +			LZ4_wildCopy8(op, anchor, op + litLength);
>>  			op += litLength;
>>  		}
>>  
>> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
>> index 0c9d3ad17e0f..8622922304c3 100644
>> --- a/lib/lz4/lz4_decompress.c
>> +++ b/lib/lz4/lz4_decompress.c
>> @@ -50,6 +50,96 @@
>>  #define assert(condition) ((void)0)
>>  #endif
>>  
>> +#ifndef LZ4_FAST_DEC_LOOP
>> +#if defined(__i386__) || defined(__x86_64__)
>> +#define LZ4_FAST_DEC_LOOP 1
>> +#elif defined(__aarch64__) && !defined(__clang__)
>> +     /* On aarch64, we disable this optimization for clang because on certain
>> +      * mobile chipsets and clang, it reduces performance. For more information
>> +      * refer to https://github.com/lz4/lz4/pull/707. */
>> +#define LZ4_FAST_DEC_LOOP 1
>> +#else
>> +#define LZ4_FAST_DEC_LOOP 0
>> +#endif
>> +#endif
>> +
>> +#if LZ4_FAST_DEC_LOOP
>> +#define FASTLOOP_SAFE_DISTANCE 64
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
>> +			     const size_t offset)
>> +{
>> +	if (offset < 8) {
>> +		dstPtr[0] = srcPtr[0];
>> +
>> +		dstPtr[1] = srcPtr[1];
>> +		dstPtr[2] = srcPtr[2];
>> +		dstPtr[3] = srcPtr[3];
>> +		srcPtr += inc32table[offset];
>> +		memcpy(dstPtr + 4, srcPtr, 4);
>> +		srcPtr -= dec64table[offset];
>> +		dstPtr += 8;
>> +	} else {
>> +		memcpy(dstPtr, srcPtr, 8);
>> +		dstPtr += 8;
>> +		srcPtr += 8;
>> +	}
>> +
>> +	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
>> +}
>> +
>> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
>> + * this version copies two times 16 bytes (instead of one time 32 bytes)
>> + * because it must be compatible with offsets >= 16. */
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
>> +{
>> +	BYTE *d = (BYTE *) dstPtr;
>> +	const BYTE *s = (const BYTE *)srcPtr;
>> +	BYTE *const e = (BYTE *) dstEnd;
>> +
>> +	do {
>> +		memcpy(d, s, 16);
>> +		memcpy(d + 16, s + 16, 16);
>> +		d += 32;
>> +		s += 32;
>> +	} while (d < e);
>> +}
>> +
>> +FORCE_O2_INLINE_GCC_PPC64LE void
>> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
>> +			const size_t offset)
>> +{
>> +	BYTE v[8];
>> +	switch (offset) {
>> +
>> +	case 1:
>> +		memset(v, *srcPtr, 8);
>> +		goto copy_loop;
>> +	case 2:
>> +		memcpy(v, srcPtr, 2);
>> +		memcpy(&v[2], srcPtr, 2);
>> +		memcpy(&v[4], &v[0], 4);
>> +		goto copy_loop;
>> +	case 4:
>> +		memcpy(v, srcPtr, 4);
>> +		memcpy(&v[4], srcPtr, 4);
>> +		goto copy_loop;
>> +	default:
>> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
>> +		return;
>> +	}
>> +
>> +      copy_loop:
>> +	memcpy(dstPtr, v, 8);
>> +	dstPtr += 8;
>> +	while (dstPtr < dstEnd) {
>> +		memcpy(dstPtr, v, 8);
>> +		dstPtr += 8;
>> +	}
>> +}
>> +#endif
>> +
>>  /*
>>   * LZ4_decompress_generic() :
>>   * This generic decompression function covers all use cases.
>> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  	 const size_t dictSize
>>  	 )
>>  {
>> -	const BYTE *ip = (const BYTE *) src;
>> -	const BYTE * const iend = ip + srcSize;
>> +	const BYTE *ip = (const BYTE *)src;
>> +	const BYTE *const iend = ip + srcSize;
>>  
>>  	BYTE *op = (BYTE *) dst;
>> -	BYTE * const oend = op + outputSize;
>> +	BYTE *const oend = op + outputSize;
>>  	BYTE *cpy;
>>  
>> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
>> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
>> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
>> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>>  
>>  	const int safeDecode = (endOnInput == endOnInputSize);
>>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>>  
>>  	/* Set up the "end" pointers for the shortcut. */
>>  	const BYTE *const shortiend = iend -
>> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>>  	const BYTE *const shortoend = oend -
>> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
>> +
>> +	const BYTE *match;
>> +	size_t offset;
>> +	unsigned int token;
>> +	size_t length;
>>  
>>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>>  		 srcSize, outputSize);
>> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  	if ((endOnInput) && unlikely(srcSize == 0))
>>  		return -1;
>>  
>> -	/* Main Loop : decode sequences */
>> +#if LZ4_FAST_DEC_LOOP
>> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
>> +		DEBUGLOG(6, "skip fast decode loop");
>> +		goto safe_decode;
>> +	}
>> +
>> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>>  	while (1) {
>> -		size_t length;
>> -		const BYTE *match;
>> -		size_t offset;
>> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
>> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
>> +		if (endOnInput) {
>> +			assert(ip < iend);
>> +		}
>> +		token = *ip++;
>> +		length = token >> ML_BITS;	/* literal length */
>> +
>> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
>> +
>> +		/* decode literal length */
>> +		if (length == RUN_MASK) {
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - RUN_MASK,
>> +						 endOnInput, endOnInput,
>> +						 &error);
>> +			if (error == initial_error) {
>> +				goto _output_error;
>> +			}
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (op) + length <
>> +					(uptrval) (op))) {
>> +				goto _output_error;
>> +			}	/* overflow detection */
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (ip) + length <
>> +					(uptrval) (ip))) {
>> +				goto _output_error;
>> +			}
>>  
>> -		/* get literal length */
>> -		unsigned int const token = *ip++;
>> -		length = token>>ML_BITS;
>> +			/* overflow detection */
>> +			/* copy literals */
>> +			cpy = op + length;
>> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>> +			if (endOnInput) {	/* LZ4_decompress_safe() */
>> +				if ((cpy > oend - 32)
>> +				    || (ip + length > iend - 32)) {
>> +					goto safe_literal_copy;
>> +				}
>> +				LZ4_wildCopy32(op, ip, cpy);
>> +			} else {	/* LZ4_decompress_fast() */
>> +				if (cpy > oend - 8) {
>> +					goto safe_literal_copy;
>> +				}
>> +				LZ4_wildCopy8(op, ip, cpy);
>> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> +				/* it doesn't know input length, and only relies on end-of-block */
>> +				/* properties */
>> +			}
>> +			ip += length;
>> +			op = cpy;
>> +		} else {
>> +			cpy = op + length;
>> +			if (endOnInput) {	/* LZ4_decompress_safe() */
>> +				DEBUGLOG(7,
>> +					 "copy %u bytes in a 16-bytes stripe",
>> +					 (unsigned)length);
>> +				/* We don't need to check oend */
>> +				/* since we check it once for each loop below */
>> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
>> +					goto safe_literal_copy;
>> +				}
>> +				/* Literals can only be 14, but hope compilers optimize */
>> +				/*if we copy by a register size */
>> +				memcpy(op, ip, 16);
>> +			} else {
>> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
>> +				/* it doesn't know input length, and relies on end-of-block */
>> +				/* properties */
>> +				memcpy(op, ip, 8);
>> +				if (length > 8) {
>> +					memcpy(op + 8, ip + 8, 8);
>> +				}
>> +			}
>> +			ip += length;
>> +			op = cpy;
>> +		}
>> +
>> +		/* get offset */
>> +		offset = LZ4_readLE16(ip);
>> +		ip += 2;	/* end-of-block condition violated */
>> +		match = op - offset;
>> +
>> +		/* get matchlength */
>> +		length = token & ML_MASK;
>> +
>> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
>> +			goto _output_error;
>> +		}
>> +		/* Error : offset outside buffers */
>> +		if (length == ML_MASK) {
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
>> +						 endOnInput, 0, &error);
>> +			if (error != ok) {
>> +				goto _output_error;
>> +			}
>> +			if ((safeDecode)
>> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
>> +				goto _output_error;
>> +			}	/* overflow detection */
>> +			length += MINMATCH;
>> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> +				goto safe_match_copy;
>> +			}
>> +		} else {
>> +			length += MINMATCH;
>> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
>> +				goto safe_match_copy;
>> +			}
>> +
>> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
>> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
>> +				if (offset >= 8) {
>> +					memcpy(op, match, 8);
>> +					memcpy(op + 8, match + 8, 8);
>> +					memcpy(op + 16, match + 16, 2);
>> +					op += length;
>> +					continue;
>> +				}
>> +			}
>> +		}
>> +
>> +		/* match starting within external dictionary */
>> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
>> +			if (unlikely(op + length > oend - LASTLITERALS)) {
>> +				if (partialDecoding) {
>> +					/* reach end of buffer */
>> +					length =
>> +					    min(length, (size_t) (oend - op));
>> +				} else {
>> +					/* end-of-block condition violated */
>> +					goto _output_error;
>> +				}
>> +			}
>> +
>> +			if (length <= (size_t) (lowPrefix - match)) {
>> +				/* match fits entirely within external dictionary : just copy */
>> +				memmove(op, dictEnd - (lowPrefix - match),
>> +					length);
>> +				op += length;
>> +			} else {
>> +				/* match stretches into both external dict and current block */
>> +				size_t const copySize =
>> +				    (size_t) (lowPrefix - match);
>> +				size_t const restSize = length - copySize;
>> +				memcpy(op, dictEnd - copySize, copySize);
>> +				op += copySize;
>> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
>> +					BYTE *const endOfMatch = op + restSize;
>> +					const BYTE *copyFrom = lowPrefix;
>> +					while (op < endOfMatch) {
>> +						*op++ = *copyFrom++;
>> +					}
>> +				} else {
>> +					memcpy(op, lowPrefix, restSize);
>> +					op += restSize;
>> +				}
>> +			}
>> +			continue;
>> +		}
>> +
>> +		/* copy match within block */
>> +		cpy = op + length;
>> +
>> +		assert((op <= oend) && (oend - op >= 32));
>> +		if (unlikely(offset < 16)) {
>> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
>> +		} else {
>> +			LZ4_wildCopy32(op, match, cpy);
>> +		}
>> +
>> +		op = cpy;	/* wildcopy correction */
>> +	}
>> +      safe_decode:
>> +#endif
>> +	/* Main Loop : decode sequences */
>> +	while (1) {
>> +		length = token >> ML_BITS;
>>  
>>  		/* ip < iend before the increment */
>>  		assert(!endOnInput || ip <= iend);
>> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  		 * combined check for both stages).
>>  		 */
>>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
>> -		   /*
>> -		    * strictly "less than" on input, to re-enter
>> -		    * the loop with at least one byte
>> -		    */
>> -		   && likely((endOnInput ? ip < shortiend : 1) &
>> -			     (op <= shortoend))) {
>> +		    /*
>> +		     * strictly "less than" on input, to re-enter
>> +		     * the loop with at least one byte
>> +		     */
>> +		    && likely((endOnInput ? ip < shortiend : 1) &
>> +			      (op <= shortoend))) {
>>  			/* Copy the literals */
>>  			memcpy(op, ip, endOnInput ? 16 : 8);
>> -			op += length; ip += length;
>> +			op += length;
>> +			ip += length;
>>  
>>  			/*
>>  			 * The second stage:
>>  			 * prepare for match copying, decode full info.
>>  			 * If it doesn't work out, the info won't be wasted.
>>  			 */
>> -			length = token & ML_MASK; /* match length */
>> +			length = token & ML_MASK;	/* match length */
>>  			offset = LZ4_readLE16(ip);
>>  			ip += 2;
>>  			match = op - offset;
>> -			assert(match <= op); /* check overflow */
>> +			assert(match <= op);	/* check overflow */
>>  
>>  			/* Do not deal with overlapping matches. */
>>  			if ((length != ML_MASK) &&
>> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		/* decode literal length */
>>  		if (length == RUN_MASK) {
>> -			unsigned int s;
>>  
>> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
>> -				/* overflow detection */
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - RUN_MASK,
>> +						 endOnInput, endOnInput,
>> +						 &error);
>> +			if (error == initial_error)
>>  				goto _output_error;
>> -			}
>> -			do {
>> -				s = *ip++;
>> -				length += s;
>> -			} while (likely(endOnInput
>> -				? ip < iend - RUN_MASK
>> -				: 1) & (s == 255));
>>  
>>  			if ((safeDecode)
>> -			    && unlikely((uptrval)(op) +
>> -					length < (uptrval)(op))) {
>> +			    && unlikely((uptrval) (op) +
>> +					length < (uptrval) (op))) {
>>  				/* overflow detection */
>>  				goto _output_error;
>>  			}
>>  			if ((safeDecode)
>> -			    && unlikely((uptrval)(ip) +
>> -					length < (uptrval)(ip))) {
>> +			    && unlikely((uptrval) (ip) +
>> +					length < (uptrval) (ip))) {
>>  				/* overflow detection */
>>  				goto _output_error;
>>  			}
>> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		/* copy literals */
>>  		cpy = op + length;
>> +#if LZ4_FAST_DEC_LOOP
>> +	      safe_literal_copy:
>> +#endif
>>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>>  
>>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
>> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
>> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>> +				      || (ip + length >
>> +					  iend - (2 + 1 + LASTLITERALS))))
>> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>>  			if (partialDecoding) {
>>  				if (cpy > oend) {
>>  					/*
>> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  					length = oend - op;
>>  				}
>>  				if ((endOnInput)
>> -					&& (ip + length > iend)) {
>> +				    && (ip + length > iend)) {
>>  					/*
>>  					 * Error :
>>  					 * read attempt beyond
>> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  				}
>>  			} else {
>>  				if ((!endOnInput)
>> -					&& (cpy != oend)) {
>> +				    && (cpy != oend)) {
>>  					/*
>>  					 * Error :
>>  					 * block decoding must
>> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  					goto _output_error;
>>  				}
>>  				if ((endOnInput)
>> -					&& ((ip + length != iend)
>> +				    && ((ip + length != iend)
>>  					|| (cpy > oend))) {
>>  					/*
>>  					 * Error :
>> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  				break;
>>  		} else {
>>  			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
>> -			LZ4_wildCopy(op, ip, cpy);
>> +			LZ4_wildCopy8(op, ip, cpy);
>>  			ip += length;
>>  			op = cpy;
>>  		}
>> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  			goto _output_error;
>>  		}
>>  
>> -		/* costs ~1%; silence an msan warning when offset == 0 */
>> -		/*
>> -		 * note : when partialDecoding, there is no guarantee that
>> -		 * at least 4 bytes remain available in output buffer
>> -		 */
>> -		if (!partialDecoding) {
>> -			assert(oend > op);
>> -			assert(oend - op >= 4);
>> -
>> -			LZ4_write32(op, (U32)offset);
>> -		}
>> -
>>  		if (length == ML_MASK) {
>> -			unsigned int s;
>> -
>> -			do {
>> -				s = *ip++;
>> -
>> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
>> -					goto _output_error;
>>  
>> -				length += s;
>> -			} while (s == 255);
>> +			variable_length_error error = ok;
>> +			length +=
>> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
>> +						 endOnInput, 0, &error);
>> +			if (error != ok)
>> +				goto _output_error;
>>  
>>  			if ((safeDecode)
>>  				&& unlikely(
>> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  
>>  		length += MINMATCH;
>>  
>> +#if LZ4_FAST_DEC_LOOP
>> +safe_match_copy:
>> +#endif
>> +
>>  		/* match starting within external dictionary */
>>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>>  			if (unlikely(op + length > oend - LASTLITERALS)) {
>> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  			}
>>  
>>  			if (op < oCopyLimit) {
>> -				LZ4_wildCopy(op, match, oCopyLimit);
>> +				LZ4_wildCopy8(op, match, oCopyLimit);
>>  				match += oCopyLimit - op;
>>  				op = oCopyLimit;
>>  			}
>> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>>  		} else {
>>  			LZ4_copy8(op, match);
>>  			if (length > 16)
>> -				LZ4_wildCopy(op + 8, match + 8, cpy);
>> +				LZ4_wildCopy8(op + 8, match + 8, cpy);
>>  		}
>>  		op = cpy; /* wildcopy correction */
>>  	}
>> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
>> index 1a7fa9d9170f..4cc17cf589ed 100644
>> --- a/lib/lz4/lz4defs.h
>> +++ b/lib/lz4/lz4defs.h
>> @@ -40,6 +40,28 @@
>>  
>>  #define FORCE_INLINE __always_inline
>>  
>> +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
>> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
>> + * together with a simple 8-byte copy loop as a fall-back path.
>> + * However, this optimization hurts the decompression speed by >30%,
>> + * because the execution does not go to the optimized loop
>> + * for typical compressible data, and all of the preamble checks
>> + * before going to the fall-back path become useless overhead.
>> + * This optimization happens only with the -O3 flag, and -O2 generates
>> + * a simple 8-byte copy loop.
>> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
>> + * functions are annotated with __attribute__((optimize("O2"))),
>> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
>> + * of LZ4_wildCopy8 does not affect the compression speed.
>> + */
>> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
>> +#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
>> +#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
>> +#else
>> +#  define FORCE_O2_GCC_PPC64LE		FORCE_INLINE
>> +#  define FORCE_O2_INLINE_GCC_PPC64LE	FORCE_INLINE
>> +#endif
>> +
>>  /*-************************************
>>   *	Basic Types
>>   **************************************/
>> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>>  #define RUN_BITS (8 - ML_BITS)
>>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>>  
>> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
>> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
>> +
>>  /*-************************************
>>   *	Reading and writing into memory
>>   **************************************/
>> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>>   * customized variant of memcpy,
>>   * which can overwrite up to 7 bytes beyond dstEnd
>>   */
>> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
>> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>>  	const void *srcPtr, void *dstEnd)
>>  {
>>  	BYTE *d = (BYTE *)dstPtr;
>> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>>  	return (unsigned int)(pIn - pStart);
>>  }
>>  
>> +/* Read the variable-length literal or match length.
>> + *
>> + * ip - pointer to use as input.
>> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
>> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
>> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
>> + * error (output) - error code.  Should be set to 0 before call.
>> + */
>> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
>> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
>> +					   const BYTE *lencheck,
>> +					   int loop_check, int initial_check,
>> +					   variable_length_error *error)
>> +{
>> +	unsigned length = 0;
>> +	unsigned s;
>> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
>> +		*error = initial_error;
>> +		return length;
>> +	}
>> +	do {
>> +		s = **ip;
>> +		(*ip)++;
>> +		length += s;
>> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
>> +			*error = loop_error;
>> +			return length;
>> +		}
>> +	} while (s == 255);
>> +
>> +	return length;
>> +}
>> +
>>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>>  
>> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
>> index 176f03b83e56..e02e041a01d9 100644
>> --- a/lib/lz4/lz4hc_compress.c
>> +++ b/lib/lz4/lz4hc_compress.c
>> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>>  		*token = (BYTE)(length<<ML_BITS);
>>  
>>  	/* Copy Literals */
>> -	LZ4_wildCopy(*op, *anchor, (*op) + length);
>> +	LZ4_wildCopy8(*op, *anchor, (*op) + length);
>>  	*op += length;
>>  
>>  	/* Encode Offset */
>>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
  2019-05-17  5:56 Chenxi Mao
@ 2019-05-17  6:35 ` Gao Xiang
  2019-05-17  7:06   ` Mao, Chenxi
  0 siblings, 1 reply; 16+ messages in thread
From: Gao Xiang @ 2019-05-17  6:35 UTC (permalink / raw)
  To: Chenxi Mao
  Cc: akpm, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, yann.collet.73, miaoxie

Hi Chenxi,

Some words about the patch format.. not important tho.

One suggestion is that the subject line should be better written
as "[PATCH v2/v3/...] title" since it's more clear to know
which patch is the latest patch among these emails.

On 2019/5/17 13:56, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced from LZ4 1.9.0[1]
> This change would be introduce 10% on decompress operation
> according to LZ4 benchmark result on X86 devices.
> Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements on ARM64,
> however clang compiler has downgrade if FAST_DEC_LOOP enabled.
> 
> So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.
> 
> LZ4 FAST_DEC_LOOP bug fixes include as well.
> 1. fixed read-after input in LZ4_decompress_safe() (issue 681)
> 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)
> 
> PS2:
> 1. Move common API to lz4defs.h
> 2. Add PPC related inline Macro defination.
> 3. Force inline new static apis.
> 
> Here is the test result on ARM64(cortex-A53)
> Benchmark via ZRAM:
> 
> Test case:
> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
> --buffer_compress_percentage=75  --size=700M \
> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
> --filename=/data/test/test --name=seq-read --rw=read --stonewall
> 
> Patched:
>     READ: bw=150MiB/s (157MB/s)
> Vanilla:
>     READ: bw=135MiB/s (142MB/s)
> 
> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
> 
> Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
> ---

It's perferred to move all changelogs here if you don't want these changelog
as a part of commit and there are some patches which can be referenced:
https://lore.kernel.org/lkml/

Thanks,
Gao Xiang

>  lib/lz4/lz4_compress.c   |   4 +-
>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>  lib/lz4/lz4defs.h        |  60 +++++-
>  lib/lz4/lz4hc_compress.c |   2 +-
>  4 files changed, 392 insertions(+), 71 deletions(-)
> 
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4cc7c7..b703ed1ca57d 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>  				*token = (BYTE)(litLength << ML_BITS);
>  
>  			/* Copy Literals */
> -			LZ4_wildCopy(op, anchor, op + litLength);
> +			LZ4_wildCopy8(op, anchor, op + litLength);
>  			op += litLength;
>  		}
>  
> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>  				*token = (BYTE)(litLength << ML_BITS);
>  
>  			/* Copy Literals */
> -			LZ4_wildCopy(op, anchor, op + litLength);
> +			LZ4_wildCopy8(op, anchor, op + litLength);
>  			op += litLength;
>  		}
>  
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 0c9d3ad17e0f..8622922304c3 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,96 @@
>  #define assert(condition) ((void)0)
>  #endif
>  
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__)
> +#define LZ4_FAST_DEC_LOOP 1
> +#elif defined(__aarch64__) && !defined(__clang__)
> +     /* On aarch64, we disable this optimization for clang because on certain
> +      * mobile chipsets and clang, it reduces performance. For more information
> +      * refer to https://github.com/lz4/lz4/pull/707. */
> +#define LZ4_FAST_DEC_LOOP 1
> +#else
> +#define LZ4_FAST_DEC_LOOP 0
> +#endif
> +#endif
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
> +			     const size_t offset)
> +{
> +	if (offset < 8) {
> +		dstPtr[0] = srcPtr[0];
> +
> +		dstPtr[1] = srcPtr[1];
> +		dstPtr[2] = srcPtr[2];
> +		dstPtr[3] = srcPtr[3];
> +		srcPtr += inc32table[offset];
> +		memcpy(dstPtr + 4, srcPtr, 4);
> +		srcPtr -= dec64table[offset];
> +		dstPtr += 8;
> +	} else {
> +		memcpy(dstPtr, srcPtr, 8);
> +		dstPtr += 8;
> +		srcPtr += 8;
> +	}
> +
> +	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
> +}
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 bytes)
> + * because it must be compatible with offsets >= 16. */
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +	BYTE *d = (BYTE *) dstPtr;
> +	const BYTE *s = (const BYTE *)srcPtr;
> +	BYTE *const e = (BYTE *) dstEnd;
> +
> +	do {
> +		memcpy(d, s, 16);
> +		memcpy(d + 16, s + 16, 16);
> +		d += 32;
> +		s += 32;
> +	} while (d < e);
> +}
> +
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			const size_t offset)
> +{
> +	BYTE v[8];
> +	switch (offset) {
> +
> +	case 1:
> +		memset(v, *srcPtr, 8);
> +		goto copy_loop;
> +	case 2:
> +		memcpy(v, srcPtr, 2);
> +		memcpy(&v[2], srcPtr, 2);
> +		memcpy(&v[4], &v[0], 4);
> +		goto copy_loop;
> +	case 4:
> +		memcpy(v, srcPtr, 4);
> +		memcpy(&v[4], srcPtr, 4);
> +		goto copy_loop;
> +	default:
> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +		return;
> +	}
> +
> +      copy_loop:
> +	memcpy(dstPtr, v, 8);
> +	dstPtr += 8;
> +	while (dstPtr < dstEnd) {
> +		memcpy(dstPtr, v, 8);
> +		dstPtr += 8;
> +	}
> +}
> +#endif
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	 const size_t dictSize
>  	 )
>  {
> -	const BYTE *ip = (const BYTE *) src;
> -	const BYTE * const iend = ip + srcSize;
> +	const BYTE *ip = (const BYTE *)src;
> +	const BYTE *const iend = ip + srcSize;
>  
>  	BYTE *op = (BYTE *) dst;
> -	BYTE * const oend = op + outputSize;
> +	BYTE *const oend = op + outputSize;
>  	BYTE *cpy;
>  
> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>  
>  	const int safeDecode = (endOnInput == endOnInputSize);
>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>  
>  	/* Set up the "end" pointers for the shortcut. */
>  	const BYTE *const shortiend = iend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>  	const BYTE *const shortoend = oend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +	const BYTE *match;
> +	size_t offset;
> +	unsigned int token;
> +	size_t length;
>  
>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>  		 srcSize, outputSize);
> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	if ((endOnInput) && unlikely(srcSize == 0))
>  		return -1;
>  
> -	/* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +		DEBUGLOG(6, "skip fast decode loop");
> +		goto safe_decode;
> +	}
> +
> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>  	while (1) {
> -		size_t length;
> -		const BYTE *match;
> -		size_t offset;
> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +		if (endOnInput) {
> +			assert(ip < iend);
> +		}
> +		token = *ip++;
> +		length = token >> ML_BITS;	/* literal length */
> +
> +		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
> +
> +		/* decode literal length */
> +		if (length == RUN_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length <
> +					(uptrval) (op))) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (ip) + length <
> +					(uptrval) (ip))) {
> +				goto _output_error;
> +			}
>  
> -		/* get literal length */
> -		unsigned int const token = *ip++;
> -		length = token>>ML_BITS;
> +			/* overflow detection */
> +			/* copy literals */
> +			cpy = op + length;
> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				if ((cpy > oend - 32)
> +				    || (ip + length > iend - 32)) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy32(op, ip, cpy);
> +			} else {	/* LZ4_decompress_fast() */
> +				if (cpy > oend - 8) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy8(op, ip, cpy);
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and only relies on end-of-block */
> +				/* properties */
> +			}
> +			ip += length;
> +			op = cpy;
> +		} else {
> +			cpy = op + length;
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				DEBUGLOG(7,
> +					 "copy %u bytes in a 16-bytes stripe",
> +					 (unsigned)length);
> +				/* We don't need to check oend */
> +				/* since we check it once for each loop below */
> +				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
> +					goto safe_literal_copy;
> +				}
> +				/* Literals can only be 14, but hope compilers optimize */
> +				/*if we copy by a register size */
> +				memcpy(op, ip, 16);
> +			} else {
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and relies on end-of-block */
> +				/* properties */
> +				memcpy(op, ip, 8);
> +				if (length > 8) {
> +					memcpy(op + 8, ip + 8, 8);
> +				}
> +			}
> +			ip += length;
> +			op = cpy;
> +		}
> +
> +		/* get offset */
> +		offset = LZ4_readLE16(ip);
> +		ip += 2;	/* end-of-block condition violated */
> +		match = op - offset;
> +
> +		/* get matchlength */
> +		length = token & ML_MASK;
> +
> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +			goto _output_error;
> +		}
> +		/* Error : offset outside buffers */
> +		if (length == ML_MASK) {
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
> +				goto _output_error;
> +			}	/* overflow detection */
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +		} else {
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +
> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +				if (offset >= 8) {
> +					memcpy(op, match, 8);
> +					memcpy(op + 8, match + 8, 8);
> +					memcpy(op + 16, match + 16, 2);
> +					op += length;
> +					continue;
> +				}
> +			}
> +		}
> +
> +		/* match starting within external dictionary */
> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
> +			if (unlikely(op + length > oend - LASTLITERALS)) {
> +				if (partialDecoding) {
> +					/* reach end of buffer */
> +					length =
> +					    min(length, (size_t) (oend - op));
> +				} else {
> +					/* end-of-block condition violated */
> +					goto _output_error;
> +				}
> +			}
> +
> +			if (length <= (size_t) (lowPrefix - match)) {
> +				/* match fits entirely within external dictionary : just copy */
> +				memmove(op, dictEnd - (lowPrefix - match),
> +					length);
> +				op += length;
> +			} else {
> +				/* match stretches into both external dict and current block */
> +				size_t const copySize =
> +				    (size_t) (lowPrefix - match);
> +				size_t const restSize = length - copySize;
> +				memcpy(op, dictEnd - copySize, copySize);
> +				op += copySize;
> +				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
> +					BYTE *const endOfMatch = op + restSize;
> +					const BYTE *copyFrom = lowPrefix;
> +					while (op < endOfMatch) {
> +						*op++ = *copyFrom++;
> +					}
> +				} else {
> +					memcpy(op, lowPrefix, restSize);
> +					op += restSize;
> +				}
> +			}
> +			continue;
> +		}
> +
> +		/* copy match within block */
> +		cpy = op + length;
> +
> +		assert((op <= oend) && (oend - op >= 32));
> +		if (unlikely(offset < 16)) {
> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
> +		} else {
> +			LZ4_wildCopy32(op, match, cpy);
> +		}
> +
> +		op = cpy;	/* wildcopy correction */
> +	}
> +      safe_decode:
> +#endif
> +	/* Main Loop : decode sequences */
> +	while (1) {
> +		length = token >> ML_BITS;
>  
>  		/* ip < iend before the increment */
>  		assert(!endOnInput || ip <= iend);
> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		 * combined check for both stages).
>  		 */
>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
> -		   /*
> -		    * strictly "less than" on input, to re-enter
> -		    * the loop with at least one byte
> -		    */
> -		   && likely((endOnInput ? ip < shortiend : 1) &
> -			     (op <= shortoend))) {
> +		    /*
> +		     * strictly "less than" on input, to re-enter
> +		     * the loop with at least one byte
> +		     */
> +		    && likely((endOnInput ? ip < shortiend : 1) &
> +			      (op <= shortoend))) {
>  			/* Copy the literals */
>  			memcpy(op, ip, endOnInput ? 16 : 8);
> -			op += length; ip += length;
> +			op += length;
> +			ip += length;
>  
>  			/*
>  			 * The second stage:
>  			 * prepare for match copying, decode full info.
>  			 * If it doesn't work out, the info won't be wasted.
>  			 */
> -			length = token & ML_MASK; /* match length */
> +			length = token & ML_MASK;	/* match length */
>  			offset = LZ4_readLE16(ip);
>  			ip += 2;
>  			match = op - offset;
> -			assert(match <= op); /* check overflow */
> +			assert(match <= op);	/* check overflow */
>  
>  			/* Do not deal with overlapping matches. */
>  			if ((length != ML_MASK) &&
> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* decode literal length */
>  		if (length == RUN_MASK) {
> -			unsigned int s;
>  
> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -				/* overflow detection */
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - RUN_MASK,
> +						 endOnInput, endOnInput,
> +						 &error);
> +			if (error == initial_error)
>  				goto _output_error;
> -			}
> -			do {
> -				s = *ip++;
> -				length += s;
> -			} while (likely(endOnInput
> -				? ip < iend - RUN_MASK
> -				: 1) & (s == 255));
>  
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(op) +
> -					length < (uptrval)(op))) {
> +			    && unlikely((uptrval) (op) +
> +					length < (uptrval) (op))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(ip) +
> -					length < (uptrval)(ip))) {
> +			    && unlikely((uptrval) (ip) +
> +					length < (uptrval) (ip))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		/* copy literals */
>  		cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +	      safe_literal_copy:
> +#endif
>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>  
>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +				      || (ip + length >
> +					  iend - (2 + 1 + LASTLITERALS))))
> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					length = oend - op;
>  				}
>  				if ((endOnInput)
> -					&& (ip + length > iend)) {
> +				    && (ip + length > iend)) {
>  					/*
>  					 * Error :
>  					 * read attempt beyond
> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				}
>  			} else {
>  				if ((!endOnInput)
> -					&& (cpy != oend)) {
> +				    && (cpy != oend)) {
>  					/*
>  					 * Error :
>  					 * block decoding must
> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					goto _output_error;
>  				}
>  				if ((endOnInput)
> -					&& ((ip + length != iend)
> +				    && ((ip + length != iend)
>  					|| (cpy > oend))) {
>  					/*
>  					 * Error :
> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				break;
>  		} else {
>  			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
> -			LZ4_wildCopy(op, ip, cpy);
> +			LZ4_wildCopy8(op, ip, cpy);
>  			ip += length;
>  			op = cpy;
>  		}
> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			goto _output_error;
>  		}
>  
> -		/* costs ~1%; silence an msan warning when offset == 0 */
> -		/*
> -		 * note : when partialDecoding, there is no guarantee that
> -		 * at least 4 bytes remain available in output buffer
> -		 */
> -		if (!partialDecoding) {
> -			assert(oend > op);
> -			assert(oend - op >= 4);
> -
> -			LZ4_write32(op, (U32)offset);
> -		}
> -
>  		if (length == ML_MASK) {
> -			unsigned int s;
> -
> -			do {
> -				s = *ip++;
> -
> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
> -					goto _output_error;
>  
> -				length += s;
> -			} while (s == 255);
> +			variable_length_error error = ok;
> +			length +=
> +			    read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						 endOnInput, 0, &error);
> +			if (error != ok)
> +				goto _output_error;
>  
>  			if ((safeDecode)
>  				&& unlikely(
> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>  		length += MINMATCH;
>  
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>  		/* match starting within external dictionary */
>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>  			if (unlikely(op + length > oend - LASTLITERALS)) {
> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			}
>  
>  			if (op < oCopyLimit) {
> -				LZ4_wildCopy(op, match, oCopyLimit);
> +				LZ4_wildCopy8(op, match, oCopyLimit);
>  				match += oCopyLimit - op;
>  				op = oCopyLimit;
>  			}
> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		} else {
>  			LZ4_copy8(op, match);
>  			if (length > 16)
> -				LZ4_wildCopy(op + 8, match + 8, cpy);
> +				LZ4_wildCopy8(op + 8, match + 8, cpy);
>  		}
>  		op = cpy; /* wildcopy correction */
>  	}
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 1a7fa9d9170f..4cc17cf589ed 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -40,6 +40,28 @@
>  
>  #define FORCE_INLINE __always_inline
>  
> +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
> + * together with a simple 8-byte copy loop as a fall-back path.
> + * However, this optimization hurts the decompression speed by >30%,
> + * because the execution does not go to the optimized loop
> + * for typical compressible data, and all of the preamble checks
> + * before going to the fall-back path become useless overhead.
> + * This optimization happens only with the -O3 flag, and -O2 generates
> + * a simple 8-byte copy loop.
> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
> + * functions are annotated with __attribute__((optimize("O2"))),
> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
> + * of LZ4_wildCopy8 does not affect the compression speed.
> + */
> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
> +#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
> +#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
> +#else
> +#  define FORCE_O2_GCC_PPC64LE		FORCE_INLINE
> +#  define FORCE_O2_INLINE_GCC_PPC64LE	FORCE_INLINE
> +#endif
> +
>  /*-************************************
>   *	Basic Types
>   **************************************/
> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>  #define RUN_BITS (8 - ML_BITS)
>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>  
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
>  /*-************************************
>   *	Reading and writing into memory
>   **************************************/
> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
>   */
> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>  	const void *srcPtr, void *dstEnd)
>  {
>  	BYTE *d = (BYTE *)dstPtr;
> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>  	return (unsigned int)(pIn - pStart);
>  }
>  
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
> + * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
> + * error (output) - error code.  Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +					   const BYTE *lencheck,
> +					   int loop_check, int initial_check,
> +					   variable_length_error *error)
> +{
> +	unsigned length = 0;
> +	unsigned s;
> +	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +		*error = initial_error;
> +		return length;
> +	}
> +	do {
> +		s = **ip;
> +		(*ip)++;
> +		length += s;
> +		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
> +			*error = loop_error;
> +			return length;
> +		}
> +	} while (s == 255);
> +
> +	return length;
> +}
> +
>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>  
> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
> index 176f03b83e56..e02e041a01d9 100644
> --- a/lib/lz4/lz4hc_compress.c
> +++ b/lib/lz4/lz4hc_compress.c
> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>  		*token = (BYTE)(length<<ML_BITS);
>  
>  	/* Copy Literals */
> -	LZ4_wildCopy(*op, *anchor, (*op) + length);
> +	LZ4_wildCopy8(*op, *anchor, (*op) + length);
>  	*op += length;
>  
>  	/* Encode Offset */
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
@ 2019-05-17  5:56 Chenxi Mao
  2019-05-17  6:35 ` Gao Xiang
  0 siblings, 1 reply; 16+ messages in thread
From: Chenxi Mao @ 2019-05-17  5:56 UTC (permalink / raw)
  To: akpm
  Cc: gaoxiang25, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, yann.collet.73, miaoxie, chenxi . mao

FAST_DEC_LOOP was introduced from LZ4 1.9.0[1]
This change would be introduce 10% on decompress operation
according to LZ4 benchmark result on X86 devices.
Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements on ARM64,
however clang compiler has downgrade if FAST_DEC_LOOP enabled.

So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.

LZ4 FAST_DEC_LOOP bug fixes include as well.
1. fixed read-after input in LZ4_decompress_safe() (issue 681)
2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)

PS2:
1. Move common API to lz4defs.h
2. Add PPC related inline Macro defination.
3. Force inline new static apis.

Here is the test result on ARM64(cortex-A53)
Benchmark via ZRAM:

Test case:
taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
--buffer_compress_percentage=75  --size=700M \
--scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
--filename=/data/test/test --name=seq-read --rw=read --stonewall

Patched:
    READ: bw=150MiB/s (157MB/s)
Vanilla:
    READ: bw=135MiB/s (142MB/s)

[1] https://github.com/lz4/lz4/releases/tag/v1.9.0

Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
---
 lib/lz4/lz4_compress.c   |   4 +-
 lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
 lib/lz4/lz4defs.h        |  60 +++++-
 lib/lz4/lz4hc_compress.c |   2 +-
 4 files changed, 392 insertions(+), 71 deletions(-)

diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index cc7b6d4cc7c7..b703ed1ca57d 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
 				*token = (BYTE)(litLength << ML_BITS);
 
 			/* Copy Literals */
-			LZ4_wildCopy(op, anchor, op + litLength);
+			LZ4_wildCopy8(op, anchor, op + litLength);
 			op += litLength;
 		}
 
@@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
 				*token = (BYTE)(litLength << ML_BITS);
 
 			/* Copy Literals */
-			LZ4_wildCopy(op, anchor, op + litLength);
+			LZ4_wildCopy8(op, anchor, op + litLength);
 			op += litLength;
 		}
 
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 0c9d3ad17e0f..8622922304c3 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -50,6 +50,96 @@
 #define assert(condition) ((void)0)
 #endif
 
+#ifndef LZ4_FAST_DEC_LOOP
+#if defined(__i386__) || defined(__x86_64__)
+#define LZ4_FAST_DEC_LOOP 1
+#elif defined(__aarch64__) && !defined(__clang__)
+     /* On aarch64, we disable this optimization for clang because on certain
+      * mobile chipsets and clang, it reduces performance. For more information
+      * refer to https://github.com/lz4/lz4/pull/707. */
+#define LZ4_FAST_DEC_LOOP 1
+#else
+#define LZ4_FAST_DEC_LOOP 0
+#endif
+#endif
+
+#if LZ4_FAST_DEC_LOOP
+#define FASTLOOP_SAFE_DISTANCE 64
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
+			     const size_t offset)
+{
+	if (offset < 8) {
+		dstPtr[0] = srcPtr[0];
+
+		dstPtr[1] = srcPtr[1];
+		dstPtr[2] = srcPtr[2];
+		dstPtr[3] = srcPtr[3];
+		srcPtr += inc32table[offset];
+		memcpy(dstPtr + 4, srcPtr, 4);
+		srcPtr -= dec64table[offset];
+		dstPtr += 8;
+	} else {
+		memcpy(dstPtr, srcPtr, 8);
+		dstPtr += 8;
+		srcPtr += 8;
+	}
+
+	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
+}
+
+/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
+ * this version copies two times 16 bytes (instead of one time 32 bytes)
+ * because it must be compatible with offsets >= 16. */
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
+{
+	BYTE *d = (BYTE *) dstPtr;
+	const BYTE *s = (const BYTE *)srcPtr;
+	BYTE *const e = (BYTE *) dstEnd;
+
+	do {
+		memcpy(d, s, 16);
+		memcpy(d + 16, s + 16, 16);
+		d += 32;
+		s += 32;
+	} while (d < e);
+}
+
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
+			const size_t offset)
+{
+	BYTE v[8];
+	switch (offset) {
+
+	case 1:
+		memset(v, *srcPtr, 8);
+		goto copy_loop;
+	case 2:
+		memcpy(v, srcPtr, 2);
+		memcpy(&v[2], srcPtr, 2);
+		memcpy(&v[4], &v[0], 4);
+		goto copy_loop;
+	case 4:
+		memcpy(v, srcPtr, 4);
+		memcpy(&v[4], srcPtr, 4);
+		goto copy_loop;
+	default:
+		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+		return;
+	}
+
+      copy_loop:
+	memcpy(dstPtr, v, 8);
+	dstPtr += 8;
+	while (dstPtr < dstEnd) {
+		memcpy(dstPtr, v, 8);
+		dstPtr += 8;
+	}
+}
+#endif
+
 /*
  * LZ4_decompress_generic() :
  * This generic decompression function covers all use cases.
@@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	 const size_t dictSize
 	 )
 {
-	const BYTE *ip = (const BYTE *) src;
-	const BYTE * const iend = ip + srcSize;
+	const BYTE *ip = (const BYTE *)src;
+	const BYTE *const iend = ip + srcSize;
 
 	BYTE *op = (BYTE *) dst;
-	BYTE * const oend = op + outputSize;
+	BYTE *const oend = op + outputSize;
 	BYTE *cpy;
 
-	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
-	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
-	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
+	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
 
 	const int safeDecode = (endOnInput == endOnInputSize);
 	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
 
 	/* Set up the "end" pointers for the shortcut. */
 	const BYTE *const shortiend = iend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
 	const BYTE *const shortoend = oend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+
+	const BYTE *match;
+	size_t offset;
+	unsigned int token;
+	size_t length;
 
 	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
 		 srcSize, outputSize);
@@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	if ((endOnInput) && unlikely(srcSize == 0))
 		return -1;
 
-	/* Main Loop : decode sequences */
+#if LZ4_FAST_DEC_LOOP
+	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
+		DEBUGLOG(6, "skip fast decode loop");
+		goto safe_decode;
+	}
+
+	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
 	while (1) {
-		size_t length;
-		const BYTE *match;
-		size_t offset;
+		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
+		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
+		if (endOnInput) {
+			assert(ip < iend);
+		}
+		token = *ip++;
+		length = token >> ML_BITS;	/* literal length */
+
+		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
+
+		/* decode literal length */
+		if (length == RUN_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length <
+					(uptrval) (op))) {
+				goto _output_error;
+			}	/* overflow detection */
+			if ((safeDecode)
+			    && unlikely((uptrval) (ip) + length <
+					(uptrval) (ip))) {
+				goto _output_error;
+			}
 
-		/* get literal length */
-		unsigned int const token = *ip++;
-		length = token>>ML_BITS;
+			/* overflow detection */
+			/* copy literals */
+			cpy = op + length;
+			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				if ((cpy > oend - 32)
+				    || (ip + length > iend - 32)) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy32(op, ip, cpy);
+			} else {	/* LZ4_decompress_fast() */
+				if (cpy > oend - 8) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy8(op, ip, cpy);
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and only relies on end-of-block */
+				/* properties */
+			}
+			ip += length;
+			op = cpy;
+		} else {
+			cpy = op + length;
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				DEBUGLOG(7,
+					 "copy %u bytes in a 16-bytes stripe",
+					 (unsigned)length);
+				/* We don't need to check oend */
+				/* since we check it once for each loop below */
+				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
+					goto safe_literal_copy;
+				}
+				/* Literals can only be 14, but hope compilers optimize */
+				/*if we copy by a register size */
+				memcpy(op, ip, 16);
+			} else {
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and relies on end-of-block */
+				/* properties */
+				memcpy(op, ip, 8);
+				if (length > 8) {
+					memcpy(op + 8, ip + 8, 8);
+				}
+			}
+			ip += length;
+			op = cpy;
+		}
+
+		/* get offset */
+		offset = LZ4_readLE16(ip);
+		ip += 2;	/* end-of-block condition violated */
+		match = op - offset;
+
+		/* get matchlength */
+		length = token & ML_MASK;
+
+		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
+			goto _output_error;
+		}
+		/* Error : offset outside buffers */
+		if (length == ML_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
+				goto _output_error;
+			}	/* overflow detection */
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+		} else {
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+
+			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
+			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
+				if (offset >= 8) {
+					memcpy(op, match, 8);
+					memcpy(op + 8, match + 8, 8);
+					memcpy(op + 16, match + 16, 2);
+					op += length;
+					continue;
+				}
+			}
+		}
+
+		/* match starting within external dictionary */
+		if ((dict == usingExtDict) && (match < lowPrefix)) {
+			if (unlikely(op + length > oend - LASTLITERALS)) {
+				if (partialDecoding) {
+					/* reach end of buffer */
+					length =
+					    min(length, (size_t) (oend - op));
+				} else {
+					/* end-of-block condition violated */
+					goto _output_error;
+				}
+			}
+
+			if (length <= (size_t) (lowPrefix - match)) {
+				/* match fits entirely within external dictionary : just copy */
+				memmove(op, dictEnd - (lowPrefix - match),
+					length);
+				op += length;
+			} else {
+				/* match stretches into both external dict and current block */
+				size_t const copySize =
+				    (size_t) (lowPrefix - match);
+				size_t const restSize = length - copySize;
+				memcpy(op, dictEnd - copySize, copySize);
+				op += copySize;
+				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
+					BYTE *const endOfMatch = op + restSize;
+					const BYTE *copyFrom = lowPrefix;
+					while (op < endOfMatch) {
+						*op++ = *copyFrom++;
+					}
+				} else {
+					memcpy(op, lowPrefix, restSize);
+					op += restSize;
+				}
+			}
+			continue;
+		}
+
+		/* copy match within block */
+		cpy = op + length;
+
+		assert((op <= oend) && (oend - op >= 32));
+		if (unlikely(offset < 16)) {
+			LZ4_memcpy_using_offset(op, match, cpy, offset);
+		} else {
+			LZ4_wildCopy32(op, match, cpy);
+		}
+
+		op = cpy;	/* wildcopy correction */
+	}
+      safe_decode:
+#endif
+	/* Main Loop : decode sequences */
+	while (1) {
+		length = token >> ML_BITS;
 
 		/* ip < iend before the increment */
 		assert(!endOnInput || ip <= iend);
@@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		 * combined check for both stages).
 		 */
 		if ((endOnInput ? length != RUN_MASK : length <= 8)
-		   /*
-		    * strictly "less than" on input, to re-enter
-		    * the loop with at least one byte
-		    */
-		   && likely((endOnInput ? ip < shortiend : 1) &
-			     (op <= shortoend))) {
+		    /*
+		     * strictly "less than" on input, to re-enter
+		     * the loop with at least one byte
+		     */
+		    && likely((endOnInput ? ip < shortiend : 1) &
+			      (op <= shortoend))) {
 			/* Copy the literals */
 			memcpy(op, ip, endOnInput ? 16 : 8);
-			op += length; ip += length;
+			op += length;
+			ip += length;
 
 			/*
 			 * The second stage:
 			 * prepare for match copying, decode full info.
 			 * If it doesn't work out, the info won't be wasted.
 			 */
-			length = token & ML_MASK; /* match length */
+			length = token & ML_MASK;	/* match length */
 			offset = LZ4_readLE16(ip);
 			ip += 2;
 			match = op - offset;
-			assert(match <= op); /* check overflow */
+			assert(match <= op);	/* check overflow */
 
 			/* Do not deal with overlapping matches. */
 			if ((length != ML_MASK) &&
@@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* decode literal length */
 		if (length == RUN_MASK) {
-			unsigned int s;
 
-			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
-				/* overflow detection */
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error)
 				goto _output_error;
-			}
-			do {
-				s = *ip++;
-				length += s;
-			} while (likely(endOnInput
-				? ip < iend - RUN_MASK
-				: 1) & (s == 255));
 
 			if ((safeDecode)
-			    && unlikely((uptrval)(op) +
-					length < (uptrval)(op))) {
+			    && unlikely((uptrval) (op) +
+					length < (uptrval) (op))) {
 				/* overflow detection */
 				goto _output_error;
 			}
 			if ((safeDecode)
-			    && unlikely((uptrval)(ip) +
-					length < (uptrval)(ip))) {
+			    && unlikely((uptrval) (ip) +
+					length < (uptrval) (ip))) {
 				/* overflow detection */
 				goto _output_error;
 			}
@@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* copy literals */
 		cpy = op + length;
+#if LZ4_FAST_DEC_LOOP
+	      safe_literal_copy:
+#endif
 		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
 
 		if (((endOnInput) && ((cpy > oend - MFLIMIT)
-			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
-			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
+				      || (ip + length >
+					  iend - (2 + 1 + LASTLITERALS))))
+		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
 			if (partialDecoding) {
 				if (cpy > oend) {
 					/*
@@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					length = oend - op;
 				}
 				if ((endOnInput)
-					&& (ip + length > iend)) {
+				    && (ip + length > iend)) {
 					/*
 					 * Error :
 					 * read attempt beyond
@@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				}
 			} else {
 				if ((!endOnInput)
-					&& (cpy != oend)) {
+				    && (cpy != oend)) {
 					/*
 					 * Error :
 					 * block decoding must
@@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					goto _output_error;
 				}
 				if ((endOnInput)
-					&& ((ip + length != iend)
+				    && ((ip + length != iend)
 					|| (cpy > oend))) {
 					/*
 					 * Error :
@@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				break;
 		} else {
 			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
-			LZ4_wildCopy(op, ip, cpy);
+			LZ4_wildCopy8(op, ip, cpy);
 			ip += length;
 			op = cpy;
 		}
@@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			goto _output_error;
 		}
 
-		/* costs ~1%; silence an msan warning when offset == 0 */
-		/*
-		 * note : when partialDecoding, there is no guarantee that
-		 * at least 4 bytes remain available in output buffer
-		 */
-		if (!partialDecoding) {
-			assert(oend > op);
-			assert(oend - op >= 4);
-
-			LZ4_write32(op, (U32)offset);
-		}
-
 		if (length == ML_MASK) {
-			unsigned int s;
-
-			do {
-				s = *ip++;
-
-				if ((endOnInput) && (ip > iend - LASTLITERALS))
-					goto _output_error;
 
-				length += s;
-			} while (s == 255);
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok)
+				goto _output_error;
 
 			if ((safeDecode)
 				&& unlikely(
@@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		length += MINMATCH;
 
+#if LZ4_FAST_DEC_LOOP
+safe_match_copy:
+#endif
+
 		/* match starting within external dictionary */
 		if ((dict == usingExtDict) && (match < lowPrefix)) {
 			if (unlikely(op + length > oend - LASTLITERALS)) {
@@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			}
 
 			if (op < oCopyLimit) {
-				LZ4_wildCopy(op, match, oCopyLimit);
+				LZ4_wildCopy8(op, match, oCopyLimit);
 				match += oCopyLimit - op;
 				op = oCopyLimit;
 			}
@@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		} else {
 			LZ4_copy8(op, match);
 			if (length > 16)
-				LZ4_wildCopy(op + 8, match + 8, cpy);
+				LZ4_wildCopy8(op + 8, match + 8, cpy);
 		}
 		op = cpy; /* wildcopy correction */
 	}
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 1a7fa9d9170f..4cc17cf589ed 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -40,6 +40,28 @@
 
 #define FORCE_INLINE __always_inline
 
+/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+ * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
+ * together with a simple 8-byte copy loop as a fall-back path.
+ * However, this optimization hurts the decompression speed by >30%,
+ * because the execution does not go to the optimized loop
+ * for typical compressible data, and all of the preamble checks
+ * before going to the fall-back path become useless overhead.
+ * This optimization happens only with the -O3 flag, and -O2 generates
+ * a simple 8-byte copy loop.
+ * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
+ * functions are annotated with __attribute__((optimize("O2"))),
+ * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
+ * of LZ4_wildCopy8 does not affect the compression speed.
+ */
+#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
+#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
+#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
+#else
+#  define FORCE_O2_GCC_PPC64LE		FORCE_INLINE
+#  define FORCE_O2_INLINE_GCC_PPC64LE	FORCE_INLINE
+#endif
+
 /*-************************************
  *	Basic Types
  **************************************/
@@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
 #define RUN_BITS (8 - ML_BITS)
 #define RUN_MASK ((1U << RUN_BITS) - 1)
 
+static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
+static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
+
 /*-************************************
  *	Reading and writing into memory
  **************************************/
@@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
  * customized variant of memcpy,
  * which can overwrite up to 7 bytes beyond dstEnd
  */
-static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
 	const void *srcPtr, void *dstEnd)
 {
 	BYTE *d = (BYTE *)dstPtr;
@@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
 	return (unsigned int)(pIn - pStart);
 }
 
+/* Read the variable-length literal or match length.
+ *
+ * ip - pointer to use as input.
+ * lencheck - end ip.  Return an error if ip advances >= lencheck.
+ * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
+ * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
+ * error (output) - error code.  Should be set to 0 before call.
+ */
+typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
+static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
+					   const BYTE *lencheck,
+					   int loop_check, int initial_check,
+					   variable_length_error *error)
+{
+	unsigned length = 0;
+	unsigned s;
+	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+		*error = initial_error;
+		return length;
+	}
+	do {
+		s = **ip;
+		(*ip)++;
+		length += s;
+		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+			*error = loop_error;
+			return length;
+		}
+	} while (s == 255);
+
+	return length;
+}
+
 typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
 typedef enum { byPtr, byU32, byU16 } tableType_t;
 
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index 176f03b83e56..e02e041a01d9 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
 		*token = (BYTE)(length<<ML_BITS);
 
 	/* Copy Literals */
-	LZ4_wildCopy(*op, *anchor, (*op) + length);
+	LZ4_wildCopy8(*op, *anchor, (*op) + length);
 	*op += length;
 
 	/* Encode Offset */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
@ 2019-05-17  1:44 Chenxi Mao
  0 siblings, 0 replies; 16+ messages in thread
From: Chenxi Mao @ 2019-05-17  1:44 UTC (permalink / raw)
  To: akpm
  Cc: gaoxiang25, linux-kernel, roy.feng, yuanli.xu, robert.alm,
	masaya.a.takahashi, yann.collet.73, miaoxie, chenxi . mao

FAST_DEC_LOOP was introduced from LZ4 1.9[1].
This change would be introduce 10% on decompress operation
according to LZ4 benchmark result on X86 devices.
Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements,
however clang compiler has downgrade if FAST_DEC_LOOP enabled.

So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build.

LZ4 FAST_DEC_LOOP bug fixes include as well.
1. fixed read-after input in LZ4_decompress_safe() (issue 681)
2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)

PS2:
1. Move common API to lz4defs.h
2. Add PPC related inline Macro defination.
3. Leverge specific LZ4_wildCopy32 for X86 and ARM64

Here is the test result on ARM64(cortex-A53)
Benchmark via ZRAM:

Test case:
fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
--buffer_compress_percentage=75 \
--scramble_buffers=1 --direct=1 --loops=100 --numjobs=8 \
--filename=/dev/block/zram0 --name=seq-write --rw=write --stonewall \
--name=seq-read --rw=read --stonewall --name=seq-readwrite \
--rw=rw --stonewall --name=rand-readwrite --rw=randrw --stonewall

Patched:
   READ: bw=8757MiB/s (9182MB/s)
Vanilla:
   READ: bw=7765MiB/s (8142MB/s)

[1] https://github.com/lz4/lz4/releases/tag/v1.9.0

Signed-off-by: chenxi.mao <chenxi.mao@sony.com>
---
 lib/lz4/lz4_compress.c   |   4 +-
 lib/lz4/lz4_decompress.c | 414 ++++++++++++++++++++++++++++++++-------
 lib/lz4/lz4defs.h        |  60 +++++-
 lib/lz4/lz4hc_compress.c |   2 +-
 4 files changed, 409 insertions(+), 71 deletions(-)

diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index cc7b6d4cc7c7..b703ed1ca57d 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
 				*token = (BYTE)(litLength << ML_BITS);
 
 			/* Copy Literals */
-			LZ4_wildCopy(op, anchor, op + litLength);
+			LZ4_wildCopy8(op, anchor, op + litLength);
 			op += litLength;
 		}
 
@@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
 				*token = (BYTE)(litLength << ML_BITS);
 
 			/* Copy Literals */
-			LZ4_wildCopy(op, anchor, op + litLength);
+			LZ4_wildCopy8(op, anchor, op + litLength);
 			op += litLength;
 		}
 
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 0c9d3ad17e0f..60c134e4699f 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -50,6 +50,113 @@
 #define assert(condition) ((void)0)
 #endif
 
+#ifndef LZ4_FAST_DEC_LOOP
+#if defined(__i386__) || defined(__x86_64__)
+#define LZ4_FAST_DEC_LOOP 1
+#elif defined(__aarch64__) && !defined(__clang__)
+     /* On aarch64, we disable this optimization for clang because on certain
+      * mobile chipsets and clang, it reduces performance. For more information
+      * refer to https://github.com/lz4/lz4/pull/707. */
+#define LZ4_FAST_DEC_LOOP 1
+#else
+#define LZ4_FAST_DEC_LOOP 0
+#endif
+#endif
+
+#if LZ4_FAST_DEC_LOOP
+#define FASTLOOP_SAFE_DISTANCE 64
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
+			     const size_t offset)
+{
+	if (offset < 8) {
+		dstPtr[0] = srcPtr[0];
+
+		dstPtr[1] = srcPtr[1];
+		dstPtr[2] = srcPtr[2];
+		dstPtr[3] = srcPtr[3];
+		srcPtr += inc32table[offset];
+		memcpy(dstPtr + 4, srcPtr, 4);
+		srcPtr -= dec64table[offset];
+		dstPtr += 8;
+	} else {
+		memcpy(dstPtr, srcPtr, 8);
+		dstPtr += 8;
+		srcPtr += 8;
+	}
+
+	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
+}
+
+/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
+ * this version copies two times 16 bytes (instead of one time 32 bytes)
+ * because it must be compatible with offsets >= 16. */
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
+{
+	BYTE *d = (BYTE *) dstPtr;
+	const BYTE *s = (const BYTE *)srcPtr;
+	BYTE *const e = (BYTE *) dstEnd;
+
+#if defined(__i386__) || defined(__x86_64__)
+	do {
+		memcpy(d, s, 16);
+		memcpy(d + 16, s + 16, 16);
+		d += 32;
+		s += 32;
+	} while (d < e);
+#elif defined(__aarch64__) && !defined(__clang__)
+	do {
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+	} while (d < e);
+#endif
+}
+
+FORCE_O2_INLINE_GCC_PPC64LE void
+LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
+			const size_t offset)
+{
+	BYTE v[8];
+	switch (offset) {
+
+	case 1:
+		memset(v, *srcPtr, 8);
+		goto copy_loop;
+	case 2:
+		memcpy(v, srcPtr, 2);
+		memcpy(&v[2], srcPtr, 2);
+		memcpy(&v[4], &v[0], 4);
+		goto copy_loop;
+	case 4:
+		memcpy(v, srcPtr, 4);
+		memcpy(&v[4], srcPtr, 4);
+		goto copy_loop;
+	default:
+		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+		return;
+	}
+
+      copy_loop:
+	memcpy(dstPtr, v, 8);
+	dstPtr += 8;
+	while (dstPtr < dstEnd) {
+		memcpy(dstPtr, v, 8);
+		dstPtr += 8;
+	}
+}
+#endif
+
 /*
  * LZ4_decompress_generic() :
  * This generic decompression function covers all use cases.
@@ -80,25 +187,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	 const size_t dictSize
 	 )
 {
-	const BYTE *ip = (const BYTE *) src;
-	const BYTE * const iend = ip + srcSize;
+	const BYTE *ip = (const BYTE *)src;
+	const BYTE *const iend = ip + srcSize;
 
 	BYTE *op = (BYTE *) dst;
-	BYTE * const oend = op + outputSize;
+	BYTE *const oend = op + outputSize;
 	BYTE *cpy;
 
-	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
-	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
-	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
+	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
 
 	const int safeDecode = (endOnInput == endOnInputSize);
 	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
 
 	/* Set up the "end" pointers for the shortcut. */
 	const BYTE *const shortiend = iend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
 	const BYTE *const shortoend = oend -
-		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+
+	const BYTE *match;
+	size_t offset;
+	unsigned int token;
+	size_t length;
 
 	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
 		 srcSize, outputSize);
@@ -117,15 +227,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	if ((endOnInput) && unlikely(srcSize == 0))
 		return -1;
 
-	/* Main Loop : decode sequences */
+#if LZ4_FAST_DEC_LOOP
+	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
+		DEBUGLOG(6, "skip fast decode loop");
+		goto safe_decode;
+	}
+
+	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
 	while (1) {
-		size_t length;
-		const BYTE *match;
-		size_t offset;
+		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
+		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
+		if (endOnInput) {
+			assert(ip < iend);
+		}
+		token = *ip++;
+		length = token >> ML_BITS;	/* literal length */
+
+		assert(!endOnInput || ip <= iend);	/* ip < iend before the increment */
+
+		/* decode literal length */
+		if (length == RUN_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length <
+					(uptrval) (op))) {
+				goto _output_error;
+			}	/* overflow detection */
+			if ((safeDecode)
+			    && unlikely((uptrval) (ip) + length <
+					(uptrval) (ip))) {
+				goto _output_error;
+			}
+
+			/* overflow detection */
+			/* copy literals */
+			cpy = op + length;
+			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				if ((cpy > oend - 32)
+				    || (ip + length > iend - 32)) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy32(op, ip, cpy);
+			} else {	/* LZ4_decompress_fast() */
+				if (cpy > oend - 8) {
+					goto safe_literal_copy;
+				}
+				LZ4_wildCopy8(op, ip, cpy);
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and only relies on end-of-block */
+				/* properties */
+			}
+			ip += length;
+			op = cpy;
+		} else {
+			cpy = op + length;
+			if (endOnInput) {	/* LZ4_decompress_safe() */
+				DEBUGLOG(7,
+					 "copy %u bytes in a 16-bytes stripe",
+					 (unsigned)length);
+				/* We don't need to check oend */
+				/* since we check it once for each loop below */
+				if (ip > iend - (16 + 1)) {	/*max lit + offset + nextToken */
+					goto safe_literal_copy;
+				}
+				/* Literals can only be 14, but hope compilers optimize */
+				/*if we copy by a register size */
+				memcpy(op, ip, 16);
+			} else {
+				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
+				/* it doesn't know input length, and relies on end-of-block */
+				/* properties */
+				memcpy(op, ip, 8);
+				if (length > 8) {
+					memcpy(op + 8, ip + 8, 8);
+				}
+			}
+			ip += length;
+			op = cpy;
+		}
+
+		/* get offset */
+		offset = LZ4_readLE16(ip);
+		ip += 2;	/* end-of-block condition violated */
+		match = op - offset;
+
+		/* get matchlength */
+		length = token & ML_MASK;
+
+		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
+			goto _output_error;
+		}
+		/* Error : offset outside buffers */
+		if (length == ML_MASK) {
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok) {
+				goto _output_error;
+			}
+			if ((safeDecode)
+			    && unlikely((uptrval) (op) + length < (uptrval) op)) {
+				goto _output_error;
+			}	/* overflow detection */
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+		} else {
+			length += MINMATCH;
+			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+				goto safe_match_copy;
+			}
+
+			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
+			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
+				if (offset >= 8) {
+					memcpy(op, match, 8);
+					memcpy(op + 8, match + 8, 8);
+					memcpy(op + 16, match + 16, 2);
+					op += length;
+					continue;
+				}
+			}
+		}
+
+		/* match starting within external dictionary */
+		if ((dict == usingExtDict) && (match < lowPrefix)) {
+			if (unlikely(op + length > oend - LASTLITERALS)) {
+				if (partialDecoding) {
+					/* reach end of buffer */
+					length =
+					    min(length, (size_t) (oend - op));
+				} else {
+					/* end-of-block condition violated */
+					goto _output_error;
+				}
+			}
+
+			if (length <= (size_t) (lowPrefix - match)) {
+				/* match fits entirely within external dictionary : just copy */
+				memmove(op, dictEnd - (lowPrefix - match),
+					length);
+				op += length;
+			} else {
+				/* match stretches into both external dict and current block */
+				size_t const copySize =
+				    (size_t) (lowPrefix - match);
+				size_t const restSize = length - copySize;
+				memcpy(op, dictEnd - copySize, copySize);
+				op += copySize;
+				if (restSize > (size_t) (op - lowPrefix)) {	/* overlap copy */
+					BYTE *const endOfMatch = op + restSize;
+					const BYTE *copyFrom = lowPrefix;
+					while (op < endOfMatch) {
+						*op++ = *copyFrom++;
+					}
+				} else {
+					memcpy(op, lowPrefix, restSize);
+					op += restSize;
+				}
+			}
+			continue;
+		}
+
+		/* copy match within block */
+		cpy = op + length;
 
-		/* get literal length */
-		unsigned int const token = *ip++;
-		length = token>>ML_BITS;
+		assert((op <= oend) && (oend - op >= 32));
+		if (unlikely(offset < 16)) {
+			LZ4_memcpy_using_offset(op, match, cpy, offset);
+		} else {
+			LZ4_wildCopy32(op, match, cpy);
+		}
+
+		op = cpy;	/* wildcopy correction */
+	}
+      safe_decode:
+#endif
+	/* Main Loop : decode sequences */
+	while (1) {
+		length = token >> ML_BITS;
 
 		/* ip < iend before the increment */
 		assert(!endOnInput || ip <= iend);
@@ -143,26 +433,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		 * combined check for both stages).
 		 */
 		if ((endOnInput ? length != RUN_MASK : length <= 8)
-		   /*
-		    * strictly "less than" on input, to re-enter
-		    * the loop with at least one byte
-		    */
-		   && likely((endOnInput ? ip < shortiend : 1) &
-			     (op <= shortoend))) {
+		    /*
+		     * strictly "less than" on input, to re-enter
+		     * the loop with at least one byte
+		     */
+		    && likely((endOnInput ? ip < shortiend : 1) &
+			      (op <= shortoend))) {
 			/* Copy the literals */
 			memcpy(op, ip, endOnInput ? 16 : 8);
-			op += length; ip += length;
+			op += length;
+			ip += length;
 
 			/*
 			 * The second stage:
 			 * prepare for match copying, decode full info.
 			 * If it doesn't work out, the info won't be wasted.
 			 */
-			length = token & ML_MASK; /* match length */
+			length = token & ML_MASK;	/* match length */
 			offset = LZ4_readLE16(ip);
 			ip += 2;
 			match = op - offset;
-			assert(match <= op); /* check overflow */
+			assert(match <= op);	/* check overflow */
 
 			/* Do not deal with overlapping matches. */
 			if ((length != ML_MASK) &&
@@ -187,28 +478,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* decode literal length */
 		if (length == RUN_MASK) {
-			unsigned int s;
 
-			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
-				/* overflow detection */
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - RUN_MASK,
+						 endOnInput, endOnInput,
+						 &error);
+			if (error == initial_error)
 				goto _output_error;
-			}
-			do {
-				s = *ip++;
-				length += s;
-			} while (likely(endOnInput
-				? ip < iend - RUN_MASK
-				: 1) & (s == 255));
 
 			if ((safeDecode)
-			    && unlikely((uptrval)(op) +
-					length < (uptrval)(op))) {
+			    && unlikely((uptrval) (op) +
+					length < (uptrval) (op))) {
 				/* overflow detection */
 				goto _output_error;
 			}
 			if ((safeDecode)
-			    && unlikely((uptrval)(ip) +
-					length < (uptrval)(ip))) {
+			    && unlikely((uptrval) (ip) +
+					length < (uptrval) (ip))) {
 				/* overflow detection */
 				goto _output_error;
 			}
@@ -216,11 +503,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		/* copy literals */
 		cpy = op + length;
+#if LZ4_FAST_DEC_LOOP
+	      safe_literal_copy:
+#endif
 		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
 
 		if (((endOnInput) && ((cpy > oend - MFLIMIT)
-			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
-			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
+				      || (ip + length >
+					  iend - (2 + 1 + LASTLITERALS))))
+		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
 			if (partialDecoding) {
 				if (cpy > oend) {
 					/*
@@ -231,7 +522,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					length = oend - op;
 				}
 				if ((endOnInput)
-					&& (ip + length > iend)) {
+				    && (ip + length > iend)) {
 					/*
 					 * Error :
 					 * read attempt beyond
@@ -241,7 +532,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				}
 			} else {
 				if ((!endOnInput)
-					&& (cpy != oend)) {
+				    && (cpy != oend)) {
 					/*
 					 * Error :
 					 * block decoding must
@@ -250,7 +541,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					goto _output_error;
 				}
 				if ((endOnInput)
-					&& ((ip + length != iend)
+				    && ((ip + length != iend)
 					|| (cpy > oend))) {
 					/*
 					 * Error :
@@ -269,7 +560,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				break;
 		} else {
 			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
-			LZ4_wildCopy(op, ip, cpy);
+			LZ4_wildCopy8(op, ip, cpy);
 			ip += length;
 			op = cpy;
 		}
@@ -288,29 +579,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			goto _output_error;
 		}
 
-		/* costs ~1%; silence an msan warning when offset == 0 */
-		/*
-		 * note : when partialDecoding, there is no guarantee that
-		 * at least 4 bytes remain available in output buffer
-		 */
-		if (!partialDecoding) {
-			assert(oend > op);
-			assert(oend - op >= 4);
-
-			LZ4_write32(op, (U32)offset);
-		}
-
 		if (length == ML_MASK) {
-			unsigned int s;
-
-			do {
-				s = *ip++;
-
-				if ((endOnInput) && (ip > iend - LASTLITERALS))
-					goto _output_error;
 
-				length += s;
-			} while (s == 255);
+			variable_length_error error = ok;
+			length +=
+			    read_variable_length(&ip, iend - LASTLITERALS + 1,
+						 endOnInput, 0, &error);
+			if (error != ok)
+				goto _output_error;
 
 			if ((safeDecode)
 				&& unlikely(
@@ -322,6 +598,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
 
 		length += MINMATCH;
 
+#if LZ4_FAST_DEC_LOOP
+safe_match_copy:
+#endif
+
 		/* match starting within external dictionary */
 		if ((dict == usingExtDict) && (match < lowPrefix)) {
 			if (unlikely(op + length > oend - LASTLITERALS)) {
@@ -418,7 +698,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			}
 
 			if (op < oCopyLimit) {
-				LZ4_wildCopy(op, match, oCopyLimit);
+				LZ4_wildCopy8(op, match, oCopyLimit);
 				match += oCopyLimit - op;
 				op = oCopyLimit;
 			}
@@ -427,7 +707,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		} else {
 			LZ4_copy8(op, match);
 			if (length > 16)
-				LZ4_wildCopy(op + 8, match + 8, cpy);
+				LZ4_wildCopy8(op + 8, match + 8, cpy);
 		}
 		op = cpy; /* wildcopy correction */
 	}
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 1a7fa9d9170f..9f09521fd2a7 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -40,6 +40,28 @@
 
 #define FORCE_INLINE __always_inline
 
+/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
+ * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
+ * together with a simple 8-byte copy loop as a fall-back path.
+ * However, this optimization hurts the decompression speed by >30%,
+ * because the execution does not go to the optimized loop
+ * for typical compressible data, and all of the preamble checks
+ * before going to the fall-back path become useless overhead.
+ * This optimization happens only with the -O3 flag, and -O2 generates
+ * a simple 8-byte copy loop.
+ * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
+ * functions are annotated with __attribute__((optimize("O2"))),
+ * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
+ * of LZ4_wildCopy8 does not affect the compression speed.
+ */
+#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
+#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
+#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
+#else
+#  define FORCE_O2_GCC_PPC64LE
+#  define FORCE_O2_INLINE_GCC_PPC64LE
+#endif
+
 /*-************************************
  *	Basic Types
  **************************************/
@@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
 #define RUN_BITS (8 - ML_BITS)
 #define RUN_MASK ((1U << RUN_BITS) - 1)
 
+static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
+static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
+
 /*-************************************
  *	Reading and writing into memory
  **************************************/
@@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
  * customized variant of memcpy,
  * which can overwrite up to 7 bytes beyond dstEnd
  */
-static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
 	const void *srcPtr, void *dstEnd)
 {
 	BYTE *d = (BYTE *)dstPtr;
@@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
 	return (unsigned int)(pIn - pStart);
 }
 
+/* Read the variable-length literal or match length.
+ *
+ * ip - pointer to use as input.
+ * lencheck - end ip.  Return an error if ip advances >= lencheck.
+ * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if so.
+ * initial_check - check ip >= lencheck before start of loop.  Returns initial_error if so.
+ * error (output) - error code.  Should be set to 0 before call.
+ */
+typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
+static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
+					   const BYTE *lencheck,
+					   int loop_check, int initial_check,
+					   variable_length_error *error)
+{
+	unsigned length = 0;
+	unsigned s;
+	if (initial_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+		*error = initial_error;
+		return length;
+	}
+	do {
+		s = **ip;
+		(*ip)++;
+		length += s;
+		if (loop_check && unlikely((*ip) >= lencheck)) {	/* overflow detection */
+			*error = loop_error;
+			return length;
+		}
+	} while (s == 255);
+
+	return length;
+}
+
 typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
 typedef enum { byPtr, byU32, byU16 } tableType_t;
 
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index 176f03b83e56..e02e041a01d9 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
 		*token = (BYTE)(length<<ML_BITS);
 
 	/* Copy Literals */
-	LZ4_wildCopy(*op, *anchor, (*op) + length);
+	LZ4_wildCopy8(*op, *anchor, (*op) + length);
 	*op += length;
 
 	/* Encode Offset */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2019-05-17  7:06 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-15  0:43 [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64 Chenxi Mao
2019-05-15  2:21 ` Gao Xiang
2019-05-15  3:27   ` Mao, Chenxi
     [not found]     ` <CADv+DGkGa3q4FJ0TeLcj626PrUEYCDMfyKdd0yx82VmAX2i98Q@mail.gmail.com>
2019-05-15 17:03       ` Cyan
2019-05-15 17:47         ` Gao Xiang
2019-05-15 23:48           ` Mao, Chenxi
2019-05-16  1:35             ` Gao Xiang
2019-05-16  2:07               ` Mao, Chenxi
2019-05-16  2:10                 ` Gao Xiang
2019-05-16  6:52                   ` Mao, Chenxi
2019-05-16  2:11               ` Mao, Chenxi
2019-05-16  2:18                 ` Gao Xiang
2019-05-17  1:44 Chenxi Mao
2019-05-17  5:56 Chenxi Mao
2019-05-17  6:35 ` Gao Xiang
2019-05-17  7:06   ` Mao, Chenxi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.