linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase
@ 2020-03-16 18:07 Raphael Moreira Zinsly
  2020-03-16 18:07 ` [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test Raphael Moreira Zinsly
                   ` (5 more replies)
  0 siblings, 6 replies; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto; +Cc: herbert, mpe, haren, abali

This patch series is intended to test the power8 and power9 Nest
Accelerator (NX) GZIP engine that is being introduced by
https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-March/205659.html
More information about how to access the NX can be found in that patch. A
complete userspace library and more documentation can be found at:
https://github.com/libnxz/power-gzip


Thanks,
Raphael



^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
@ 2020-03-16 18:07 ` Raphael Moreira Zinsly
  2020-03-18  3:48   ` Daniel Axtens
  2020-03-16 18:07 ` [PATCH 2/5] selftests/powerpc: Add header files for NX compression/decompression Raphael Moreira Zinsly
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto
  Cc: herbert, mpe, haren, abali, Raphael Moreira Zinsly

Add files to access the powerpc NX-GZIP engine in user space.

Signed-off-by: Bulent Abali <abali@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
---
 .../selftests/powerpc/nx-gzip/inc/crb.h       | 170 ++++++++++++++++++
 .../selftests/powerpc/nx-gzip/inc/nx-gzip.h   |  27 +++
 .../powerpc/nx-gzip/inc/nx-helpers.h          |  53 ++++++
 .../selftests/powerpc/nx-gzip/inc/nx.h        |  30 ++++
 4 files changed, 280 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx.h

diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h b/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
new file mode 100644
index 000000000000..6af25fb8461a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __CRB_H
+#define __CRB_H
+#include <linux/types.h>
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+
+/* From nx-842.h */
+
+/* CCW 842 CI/FC masks
+ * NX P8 workbook, section 4.3.1, figure 4-6
+ * "CI/FC Boundary by NX CT type"
+ */
+#define CCW_CI_842              (0x00003ff8)
+#define CCW_FC_842              (0x00000007)
+
+/* end - nx-842.h */
+
+#ifndef __aligned
+#define __aligned(x)            __attribute__((aligned(x)))
+#endif
+
+#ifndef __packed
+#define __packed        __attribute__((packed))
+#endif
+
+/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */
+
+#define CCB_VALUE		(0x3fffffffffffffff)
+#define CCB_ADDRESS		(0xfffffffffffffff8)
+#define CCB_CM			(0x0000000000000007)
+#define CCB_CM0			(0x0000000000000004)
+#define CCB_CM12		(0x0000000000000003)
+
+#define CCB_CM0_ALL_COMPLETIONS	(0x0)
+#define CCB_CM0_LAST_IN_CHAIN	(0x4)
+#define CCB_CM12_STORE		(0x0)
+#define CCB_CM12_INTERRUPT	(0x1)
+
+#define CCB_SIZE		(0x10)
+#define CCB_ALIGN		CCB_SIZE
+
+struct coprocessor_completion_block {
+	__be64 value;
+	__be64 address;
+} __packed __aligned(CCB_ALIGN);
+
+
+/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */
+
+#define CSB_V			(0x80)
+#define CSB_F			(0x04)
+#define CSB_CH			(0x03)
+#define CSB_CE_INCOMPLETE	(0x80)
+#define CSB_CE_TERMINATION	(0x40)
+#define CSB_CE_TPBC		(0x20)
+
+#define CSB_CC_SUCCESS		(0)
+#define CSB_CC_INVALID_ALIGN	(1)
+#define CSB_CC_OPERAND_OVERLAP	(2)
+#define CSB_CC_DATA_LENGTH	(3)
+#define CSB_CC_TRANSLATION	(5)
+#define CSB_CC_PROTECTION	(6)
+#define CSB_CC_RD_EXTERNAL	(7)
+#define CSB_CC_INVALID_OPERAND	(8)
+#define CSB_CC_PRIVILEGE	(9)
+#define CSB_CC_INTERNAL		(10)
+#define CSB_CC_WR_EXTERNAL	(12)
+#define CSB_CC_NOSPC		(13)
+#define CSB_CC_EXCESSIVE_DDE	(14)
+#define CSB_CC_WR_TRANSLATION	(15)
+#define CSB_CC_WR_PROTECTION	(16)
+#define CSB_CC_UNKNOWN_CODE	(17)
+#define CSB_CC_ABORT		(18)
+#define CSB_CC_TRANSPORT	(20)
+#define CSB_CC_SEGMENTED_DDL	(31)
+#define CSB_CC_PROGRESS_POINT	(32)
+#define CSB_CC_DDE_OVERFLOW	(33)
+#define CSB_CC_SESSION		(34)
+#define CSB_CC_PROVISION	(36)
+#define CSB_CC_CHAIN		(37)
+#define CSB_CC_SEQUENCE		(38)
+#define CSB_CC_HW		(39)
+
+#define CSB_SIZE		(0x10)
+#define CSB_ALIGN		CSB_SIZE
+
+struct coprocessor_status_block {
+	u8 flags;
+	u8 cs;
+	u8 cc;
+	u8 ce;
+	__be32 count;
+	__be64 address;
+} __packed __aligned(CSB_ALIGN);
+
+
+/* Chapter 6.5.10 Data-Descriptor List (DDL)
+ * each list contains one or more Data-Descriptor Entries (DDE)
+ */
+
+#define DDE_P			(0x8000)
+
+#define DDE_SIZE		(0x10)
+#define DDE_ALIGN		DDE_SIZE
+
+struct data_descriptor_entry {
+	__be16 flags;
+	u8 count;
+	u8 index;
+	__be32 length;
+	__be64 address;
+} __packed __aligned(DDE_ALIGN);
+
+
+/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
+
+#define CRB_SIZE		(0x80)
+#define CRB_ALIGN		(0x100) /* Errata: requires 256 alignment */
+
+
+/* Coprocessor Status Block field
+ *   ADDRESS	address of CSB
+ *   C		CCB is valid
+ *   AT		0 = addrs are virtual, 1 = addrs are phys
+ *   M		enable perf monitor
+ */
+#define CRB_CSB_ADDRESS		(0xfffffffffffffff0)
+#define CRB_CSB_C		(0x0000000000000008)
+#define CRB_CSB_AT		(0x0000000000000002)
+#define CRB_CSB_M		(0x0000000000000001)
+
+struct coprocessor_request_block {
+	__be32 ccw;
+	__be32 flags;
+	__be64 csb_addr;
+
+	struct data_descriptor_entry source;
+	struct data_descriptor_entry target;
+
+	struct coprocessor_completion_block ccb;
+
+	u8 reserved[48];
+
+	struct coprocessor_status_block csb;
+} __packed __aligned(CRB_ALIGN);
+
+/* CRB pointer accessors. NOTE(review): no 'stamp' member is declared above. */
+#define crb_csb_addr(c)         __be64_to_cpu((c)->csb_addr)
+#define crb_nx_fault_addr(c)    __be64_to_cpu((c)->stamp.nx.fault_storage_addr)
+#define crb_nx_flags(c)         ((c)->stamp.nx.flags)
+#define crb_nx_fault_status(c)  ((c)->stamp.nx.fault_status)
+#define crb_nx_pswid(c)		((c)->stamp.nx.pswid)
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1.1.1 RS
+ * Chapter 8.2.3 Coprocessor Directive
+ * Chapter 8.2.4 Execution
+ *
+ * The CCW must be converted to BE before passing to icswx()
+ */
+
+#define CCW_PS                  (0xff000000)
+#define CCW_CT                  (0x00ff0000)
+#define CCW_CD                  (0x0000ffff)
+#define CCW_CL                  (0x0000c000)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
new file mode 100644
index 000000000000..75482c45574d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ */
+
+#ifndef _UAPI_MISC_VAS_H
+#define _UAPI_MISC_VAS_H
+
+#include <asm/ioctl.h>
+
+#define VAS_FLAGS_PIN_WINDOW	0x1
+#define VAS_FLAGS_HIGH_PRI	0x2
+
+#define VAS_FTW_SETUP		_IOW('v', 1, struct vas_gzip_setup_attr)
+#define VAS_842_TX_WIN_OPEN	_IOW('v', 2, struct vas_gzip_setup_attr)
+#define VAS_GZIP_TX_WIN_OPEN	_IOW('v', 0x20, struct vas_gzip_setup_attr)
+
+struct vas_gzip_setup_attr {
+	int32_t		version;
+	int16_t		vas_id;
+	int16_t		reserved1;
+	int64_t		flags;
+	int64_t		reserved2[6];
+};
+
+#endif /* _UAPI_MISC_VAS_H */
diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
new file mode 100644
index 000000000000..201cf9f86a97
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include <sys/time.h>
+#include <asm/byteorder.h>
+#include <stdint.h>
+#include "crb.h"
+
+#define cpu_to_be32		__cpu_to_be32
+#define cpu_to_be64		__cpu_to_be64
+#define be32_to_cpu		__be32_to_cpu
+#define be64_to_cpu		__be64_to_cpu
+
+/*
+ * Several helpers/macros below were copied from the tree
+ * (kernel.h, nx-842.h, nx-ftw.h, asm-compat.h etc)
+ */
+
+/* from kernel.h; the mask arithmetic needs a and y to be powers of two */
+#define IS_ALIGNED(x, a)	(((x) & ((__typeof__(x))(a) - 1)) == 0)
+#define __round_mask(x, y)	((__typeof__(x))((y)-1))
+#define round_up(x, y)		((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y)	((x) & ~__round_mask(x, y))
+/* min_t: smaller of x and y after converting both operands to type t */
+#define min_t(t, x, y)	((t)(x) < (t)(y) ? (t)(x) : (t)(y))
+/*
+ * Get/Set bit fields. (from nx-842.h)
+ */
+#define GET_FIELD(m, v)         (((v) & (m)) >> MASK_LSH(m))
+#define MASK_LSH(m)             (__builtin_ffsl(m) - 1)
+#define SET_FIELD(m, v, val)    \
+		(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_LSH(m)) & (m)))
+
+/* From asm-compat.h */
+#define __stringify_in_c(...)	#__VA_ARGS__
+#define stringify_in_c(...)	__stringify_in_c(__VA_ARGS__) " "
+
+#define	pr_debug
+#define	pr_debug_ratelimited	printf
+#define	pr_err			printf
+#define	pr_err_ratelimited	printf
+/* Prints on every true x; do/while keeps a caller's "else" from binding */
+#define WARN_ON_ONCE(x)		do { if (x) printf("WARNING: %s:%d\n", \
+					__func__, __LINE__); } while (0)
+
+extern void dump_buffer(char *msg, char *buf, int len);
+extern void *alloc_aligned_mem(int len, int align, char *msg);
+extern void get_payload(char *buf, int len);
+extern void time_add(struct timeval *in, int seconds, struct timeval *out);
+
+typedef int bool;
+extern bool time_after(struct timeval *a, struct timeval *b);
+extern long time_delta(struct timeval *a, struct timeval *b);
+extern void dump_dde(struct data_descriptor_entry *dde, char *msg);
+extern void copy_paste_crb_data(struct coprocessor_request_block *crb);
diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h
new file mode 100644
index 000000000000..08c93f7fb96c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ */
+
+#define	NX_FUNC_COMP_842	1
+#define NX_FUNC_COMP_GZIP	2
+
+typedef int bool;
+
+struct nx842_func_args {
+	bool use_crc;
+	bool decompress;		/* true: decompress; false compress */
+	bool move_data;
+	int timeout;			/* seconds */
+};
+
+typedef struct {
+	int len;
+	char *buf;
+} nxbuf_t;
+
+/* @function should be EFT (aka 842), GZIP etc */
+extern void *nx_function_begin(int function, int pri);
+
+extern int nx_function(void *handle, nxbuf_t *in, nxbuf_t *out, void *arg);
+
+extern int nx_function_end(void *handle);
+
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/5] selftests/powerpc: Add header files for NX compression/decompression
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
  2020-03-16 18:07 ` [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test Raphael Moreira Zinsly
@ 2020-03-16 18:07 ` Raphael Moreira Zinsly
  2020-03-18 22:29   ` Daniel Axtens
  2020-03-16 18:07 ` [PATCH 3/5] selftests/powerpc: Add NX-GZIP engine compress testcase Raphael Moreira Zinsly
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto
  Cc: herbert, mpe, haren, abali, Raphael Moreira Zinsly

Add files to be able to compress and decompress files using the
powerpc NX-GZIP engine.

Signed-off-by: Bulent Abali <abali@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
---
 .../powerpc/nx-gzip/inc/copy-paste.h          |  54 ++
 .../selftests/powerpc/nx-gzip/inc/nx_dbg.h    |  95 +++
 .../selftests/powerpc/nx-gzip/inc/nxu.h       | 644 ++++++++++++++++++
 3 files changed, 793 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h

diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
new file mode 100644
index 000000000000..107139b6c7df
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "nx-helpers.h"
+
+/*
+ * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other
+ * header files.
+ */
+#define ___PPC_RA(a)    (((a) & 0x1f) << 16)
+#define ___PPC_RB(b)    (((b) & 0x1f) << 11)
+
+#define PPC_INST_COPY                   0x7c20060c
+#define PPC_INST_PASTE                  0x7c20070d
+
+#define PPC_COPY(a, b)          stringify_in_c(.long PPC_INST_COPY | \
+						___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_PASTE(a, b)         stringify_in_c(.long PPC_INST_PASTE | \
+						___PPC_RA(a) | ___PPC_RB(b))
+#define CR0_SHIFT	28
+#define CR0_MASK	0xF
+/*
+ * Copy/paste instructions:
+ *
+ *	copy RA,RB
+ *		Copy contents of address (RA) + effective_address(RB)
+ *		to internal copy-buffer.
+ *
+ *	paste RA,RB
+ *		Paste contents of internal copy-buffer to the address
+ *		(RA) + effective_address(RB)
+ */
+static inline int vas_copy(void *crb, int offset)
+{
+	asm volatile(PPC_COPY(%0, %1)";"
+		:
+		: "b" (offset), "b" (crb)
+		: "memory");
+
+	return 0;
+}
+
+static inline int vas_paste(void *paste_address, int offset)
+{
+	u32 cr;
+
+	cr = 0;
+	asm volatile(PPC_PASTE(%1, %2)";"
+		"mfocrf %0, 0x80;"
+		: "=r" (cr)
+		: "b" (offset), "b" (paste_address)
+		: "memory", "cr0");
+
+	return (cr >> CR0_SHIFT) & CR0_MASK;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
new file mode 100644
index 000000000000..f2c0eee2317e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright 2020 IBM Corporation
+ *
+ */
+
+#ifndef _NXU_DBG_H_
+#define _NXU_DBG_H_
+
+#include <sys/file.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+#include <pthread.h>
+
+extern FILE * nx_gzip_log;
+extern int nx_gzip_trace;
+extern unsigned int nx_gzip_inflate_impl;
+extern unsigned int nx_gzip_deflate_impl;
+extern unsigned int nx_gzip_inflate_flags;
+extern unsigned int nx_gzip_deflate_flags;
+
+extern int nx_dbg;
+pthread_mutex_t mutex_log;
+
+#define nx_gzip_trace_enabled()       (nx_gzip_trace & 0x1)
+#define nx_gzip_hw_trace_enabled()    (nx_gzip_trace & 0x2)
+#define nx_gzip_sw_trace_enabled()    (nx_gzip_trace & 0x4)
+#define nx_gzip_gather_statistics()   (nx_gzip_trace & 0x8)
+#define nx_gzip_per_stream_stat()     (nx_gzip_trace & 0x10)
+/* Timestamped, pid-tagged log line; serialized by mutex_log and flock() */
+#define prt(fmt, ...) do { \
+	pthread_mutex_lock(&mutex_log);					\
+	flock(fileno(nx_gzip_log), LOCK_EX);				\
+	time_t t; struct tm *m; time(&t); m = localtime(&t);		\
+	fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] "		\
+		"pid %d: " fmt,	\
+		(int)m->tm_year + 1900, (int)m->tm_mon+1, (int)m->tm_mday, \
+		(int)m->tm_hour, (int)m->tm_min, (int)m->tm_sec,	\
+		(int)getpid(), ## __VA_ARGS__);				\
+	fflush(nx_gzip_log);						\
+	flock(fileno(nx_gzip_log), LOCK_UN);				\
+	pthread_mutex_unlock(&mutex_log);				\
+} while (0)
+
+/* Use in case of an error */
+#define prt_err(fmt, ...) do { if (nx_dbg >= 0) {			\
+	prt("%s:%u: Error: "fmt,					\
+		__FILE__, __LINE__, ## __VA_ARGS__);			\
+}} while (0)
+
+/* Use in case of a warning */
+#define prt_warn(fmt, ...) do {	if (nx_dbg >= 1) {			\
+	prt("%s:%u: Warning: "fmt,					\
+		__FILE__, __LINE__, ## __VA_ARGS__);			\
+}} while (0)
+
+/* Informational printouts */
+#define prt_info(fmt, ...) do {	if (nx_dbg >= 2) {			\
+	prt("Info: "fmt, ## __VA_ARGS__);				\
+}} while (0)
+
+/* Trace zlib wrapper code */
+#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) {		\
+	prt("### "fmt, ## __VA_ARGS__);					\
+}} while (0)
+
+/* Trace statistics */
+#define prt_stat(fmt, ...) do {	if (nx_gzip_gather_statistics()) {	\
+	prt("### "fmt, ## __VA_ARGS__);					\
+}} while (0)
+
+/* Trace zlib hardware implementation */
+#define hw_trace(fmt, ...) do {						\
+		if (nx_gzip_hw_trace_enabled())				\
+			fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \
+	} while (0)
+
+/* Trace zlib software implementation */
+#define sw_trace(fmt, ...) do {						\
+		if (nx_gzip_sw_trace_enabled())				\
+			fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \
+	} while (0)
+
+
+/**
+ * str_to_num - Convert string into number and cope with endings like
+ *              KiB for kilobyte
+ *              MiB for megabyte
+ *              GiB for gigabyte
+ */
+uint64_t str_to_num(char *str);
+void nx_lib_debug(int onoff);
+
+#endif	/* _NXU_DBG_H_ */
diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h
new file mode 100644
index 000000000000..faa95ffc162a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h
@@ -0,0 +1,644 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Hardware interface of the NX-GZIP compression accelerator
+ *
+ * Copyright (C) IBM Corporation, 2020
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ */
+
+#ifndef _NXU_H
+#define _NXU_H
+
+#include <stdint.h>
+#include <endian.h>
+
+/* deflate */
+#define LLSZ   286
+#define DSZ    30
+
+/* nx */
+#define DHTSZ  18
+#define DHT_MAXSZ 288
+#define MAX_DDE_COUNT 256
+
+/* util */
+#ifdef NXDBG
+#define NXPRT(X) do { X; } while (0)
+#else
+#define NXPRT(X) do { ; } while (0)
+#endif
+
+#ifdef NXTIMER
+#include <sys/platform/ppc.h>
+#define NX_CLK(X)      do { X; } while (0)
+#define nx_get_time()  __ppc_get_timebase()
+#define nx_get_freq()  __ppc_get_timebase_freq()
+#else
+#define NX_CLK(X)      do { ; } while (0)
+#define nx_get_time()  (-1)
+#define nx_get_freq()  (-1)
+#endif
+
+/*
+ * Definitions of acronyms used here. See
+ * P9 NX Gzip Accelerator User's Manual for details
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce:       completion extension
+ * cpb:      coprocessor parameter block (metadata)
+ * crb:      coprocessor request block (command)
+ * csb:      coprocessor status block (status)
+ * dht:      dynamic huffman table
+ * dde:      data descriptor element (address, length)
+ * ddl:      list of ddes
+ * dh/fh:    dynamic and fixed huffman types
+ * fc:       coprocessor function code
+ * histlen:  history/dictionary length
+ * history:  sliding window of up to 32KB of data
+ * lzcount:  Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt:     source final block type; last block's type during decomp
+ * spbc:     source processed byte count
+ * subc:     source unprocessed bit count
+ * tebc:     target ending bit count; valid bits in the last byte
+ * tpbc:     target processed byte count
+ * vas:      virtual accelerator switch; the user mode interface
+ */
+
+typedef union {
+    uint32_t word[4];
+    uint64_t dword[2];
+} nx_qw_t __attribute__((aligned (16)));
+
+/*
+ * Note: NX registers with fewer than 32 bits are declared by
+ * convention as uint32_t variables in unions. If *_offset and *_mask
+ * are defined for a variable, then use get_ put_ macros to
+ * conveniently access the register fields for endian conversions.
+ */
+
+typedef struct {
+    /* Data Descriptor Element, Section 6.4 */
+    union {
+	uint32_t dde_count;
+	/* When dde_count == 0 ddead is a pointer to a data buffer;
+	 * ddebc is the buffer length bytes.
+	 * When dde_count > 0 dde is an indirect dde; ddead is a pointer
+	 * to a contiguous list of direct ddes; ddebc is the total length
+	 * of all data pointed to by the list of direct ddes.
+	 * Note that only one level of indirection is permitted.
+	 * See Section 6.4 of the user manual for additional details
+	 */
+    };
+    uint32_t ddebc; /* dde byte count */
+    uint64_t ddead; /* dde address */
+} nx_dde_t __attribute__((aligned (16)));
+
+typedef struct {
+    /* Coprocessor Status Block, Section 6.6  */
+    union {
+	uint32_t csb_v;
+	/* Valid bit. v must be set to 0 by the program
+	 * before submitting the coprocessor command.
+	 * Software can poll for the v bit
+	 */
+
+	uint32_t csb_f;
+	/* 16B CSB size. Written to 0 by DMA when it writes the CPB */
+
+	uint32_t csb_cs;
+	/* cs completion sequence; unused */
+
+	uint32_t csb_cc;
+	/* cc completion code; cc != 0 exception occurred */
+
+	uint32_t csb_ce;
+	/* ce completion extension */
+
+    };
+    uint32_t tpbc;
+    /* target processed byte count TPBC */
+
+    uint64_t fsaddr;
+    /* Section 6.12.1 CSB NonZero error summary.  FSA Failing storage
+     * address.  Address where error occurred. When available, written
+     * to A field of CSB
+     */
+} nx_csb_t __attribute__((aligned (16)));
+
+typedef struct {
+    /* Coprocessor Completion Block, Section 6.7 */
+
+    uint32_t reserved[3];
+    union {
+	/* When crb.c==0 (no ccb defined) it is reserved;
+	 * When crb.c==1 (ccb defined) it is cm
+	 */
+
+	uint32_t ccb_cm;
+	/* Signal interrupt of crb.c==1 and cm==1 */
+
+	uint32_t word;
+	/* generic access to the 32bit word */
+    };
+} nx_ccb_t __attribute__((aligned (16)));
+
+typedef struct {
+    /*
+     * CRB operand of the paste coprocessor instruction is stamped
+     * in quadword 4 with the information shown here as its written
+     * in to the receive FIFO of the coprocessor
+     */
+
+    union {
+	uint32_t vas_buf_num;
+	/* Verification only vas buffer number which correlates to
+	 * the low order bits of the atag in the paste command
+	 */
+
+	uint32_t send_wc_id;
+	/* Pointer to Send Window Context that provides for NX address
+	 * translation information, such as MSR and LPCR bits, job completion
+	 * interrupt RA, PSWID, and job utilization counter.
+	 */
+
+    };
+    union {
+	uint32_t recv_wc_id;
+	/* Pointer to Receive Window Context. NX uses this to return
+	 * credits to a Receive FIFO as entries are dequeued.
+	 */
+
+    };
+    uint32_t reserved2;
+    union {
+	uint32_t vas_invalid;
+	/* Invalid bit. If this bit is 1 the CRB is discarded by
+	 * NX upon fetching from the receive FIFO. If this bit is 0
+	 * the CRB is processed normally. The bit is stamped to 0
+	 * by VAS and may be written to 1 by hypervisor while
+	 * the CRB is in the receive FIFO (in memory).
+	 */
+
+    };
+} vas_stamped_crb_t;
+
+typedef struct {
+    /*
+     * A CRB that has a translation fault is stamped by NX in quadword 4
+     * and pasted to the Fault Send Window in VAS.
+     */
+    uint64_t fsa;
+    union {
+	uint32_t nxsf_t;
+	uint32_t nxsf_fs;
+    };
+    uint32_t pswid;
+} nx_stamped_fault_crb_t;
+
+typedef union {
+    vas_stamped_crb_t      vas;
+    nx_stamped_fault_crb_t nx;
+} stamped_crb_t;
+
+typedef struct {
+    /*
+     * Coprocessor Parameter Block In/Out are used to pass metadata
+     * to/from accelerator.  Tables 6.5 and 6.6 of the user manual.
+     */
+
+    /* CPBInput */
+
+    struct {
+	union {
+	    nx_qw_t qw0;
+	    struct {
+		uint32_t in_adler;            /* bits 0:31    */
+		uint32_t in_crc;              /* bits 32:63   */
+		union {
+		    uint32_t in_histlen;      /* bits 64:75   */
+		    uint32_t in_subc;         /* bits 93:95   */
+		};
+		union {
+		    uint32_t in_sfbt;         /* bits 108:111 */
+		    uint32_t in_rembytecnt;   /* bits 112:127 */
+		    uint32_t in_dhtlen;       /* bits 116:127 */
+		};
+	    };
+	};
+	union {
+	    nx_qw_t  in_dht[DHTSZ];           /* qw[1:18]     */
+	    char     in_dht_char[DHT_MAXSZ];  /* byte access  */
+	};
+	nx_qw_t  reserved[5];                 /* qw[19:23]    */
+    };
+
+    /* CPBOutput */
+
+    volatile struct {
+	union {
+	    nx_qw_t qw24;
+	    struct {
+		uint32_t out_adler;           /* bits 0:31  qw[24]   */
+		uint32_t out_crc;             /* bits 32:63 qw[24]   */
+		union {
+		    uint32_t out_tebc;        /* bits 77:79 qw[24]   */
+		    uint32_t out_subc;        /* bits 80:95 qw[24]   */
+		};
+		union {
+		    uint32_t out_sfbt;        /* bits 108:111 qw[24] */
+		    uint32_t out_rembytecnt;  /* bits 112:127 qw[24] */
+		    uint32_t out_dhtlen;      /* bits 116:127 qw[24] */
+		};
+	    };
+	};
+	union {
+	    nx_qw_t  qw25[79];              /* qw[25:103] */
+	    /* qw[25] compress no lzcounts or wrap */
+	    uint32_t out_spbc_comp_wrap;
+	    uint32_t out_spbc_wrap;         /* qw[25] wrap */
+	    uint32_t out_spbc_comp;         /* qw[25] compress no lzcounts */
+	    uint32_t out_lzcount[LLSZ+DSZ]; /* 286 LL and 30 D symbol counts */
+	    struct {
+		nx_qw_t  out_dht[DHTSZ];    /* qw[25:42] */
+		uint32_t out_spbc_decomp;   /* qw[43] decompress */
+	    };
+	};
+	/* qw[104] compress with lzcounts */
+	uint32_t out_spbc_comp_with_count;
+    };
+} nx_gzip_cpb_t  __attribute__((aligned (128)));
+
+typedef struct {
+    union {                   /* byte[0:3]   */
+	uint32_t gzip_fc;     /* bits[24-31] */
+    };
+    uint32_t reserved1;       /* byte[4:7]   */
+    union {
+	uint64_t csb_address; /* byte[8:15]  */
+	struct {
+	    uint32_t reserved2;
+	    union {
+		uint32_t crb_c;
+		/* c==0 no ccb defined */
+
+		uint32_t crb_at;
+		/* at==0 address type is ignored;
+		 * all addrs effective assumed.
+		 */
+
+	    };
+	};
+    };
+    nx_dde_t source_dde;           /* byte[16:31] */
+    nx_dde_t target_dde;           /* byte[32:47] */
+    volatile nx_ccb_t ccb;         /* byte[48:63] */
+    volatile union {
+	/* byte[64:239] shift csb by 128 bytes out of the crb; csb was in crb
+	 * earlier; JReilly says csb written with partial inject.
+	 */
+	nx_qw_t reserved64[11];
+	stamped_crb_t stamp;       /* byte[64:79] */
+    };
+    volatile nx_csb_t csb;
+} nx_gzip_crb_t __attribute__((aligned (128)));
+
+
+typedef struct {
+    nx_gzip_crb_t crb;
+    nx_gzip_cpb_t cpb;
+} nx_gzip_crb_cpb_t __attribute__((aligned (2048)));
+
+
+/*
+ * NX hardware convention has the msb bit on the left numbered 0.
+ * The defines below have *_offset defined as the rightmost bit
+ * position of a field.  x of size_mask(x) is the field width in bits.
+ */
+
+#define size_mask(x)          ((1U<<(x))-1)
+
+/*
+ * Offsets and Widths within the containing 32 bits of the various NX
+ * gzip hardware registers.  Use the getnn/putnn macros to access
+ * these regs
+ */
+
+#define dde_count_mask        size_mask(8)
+#define dde_count_offset      23
+
+/* CSB */
+
+#define csb_v_mask            size_mask(1)
+#define csb_v_offset          0
+#define csb_f_mask            size_mask(1)
+#define csb_f_offset          6
+#define csb_cs_mask           size_mask(8)
+#define csb_cs_offset         15
+#define csb_cc_mask           size_mask(8)
+#define csb_cc_offset         23
+#define csb_ce_mask           size_mask(8)
+#define csb_ce_offset         31
+
+/* CCB */
+
+#define ccb_cm_mask           size_mask(3)
+#define ccb_cm_offset         31
+
+/* VAS stamped CRB fields */
+
+#define vas_buf_num_mask      size_mask(6)
+#define vas_buf_num_offset    5
+#define send_wc_id_mask       size_mask(16)
+#define send_wc_id_offset     31
+#define recv_wc_id_mask       size_mask(16)
+#define recv_wc_id_offset     31
+#define vas_invalid_mask      size_mask(1)
+#define vas_invalid_offset    31
+
+/* NX stamped fault CRB fields */
+
+#define nxsf_t_mask           size_mask(1)
+#define nxsf_t_offset         23
+#define nxsf_fs_mask          size_mask(8)
+#define nxsf_fs_offset        31
+
+/* CPB input */
+
+#define in_histlen_mask       size_mask(12)
+#define in_histlen_offset     11
+#define in_dhtlen_mask        size_mask(12)
+#define in_dhtlen_offset      31
+#define in_subc_mask          size_mask(3)
+#define in_subc_offset        31
+#define in_sfbt_mask          size_mask(4)
+#define in_sfbt_offset        15
+#define in_rembytecnt_mask    size_mask(16)
+#define in_rembytecnt_offset  31
+
+/* CPB output */
+
+#define out_tebc_mask         size_mask(3)
+#define out_tebc_offset       15
+#define out_subc_mask         size_mask(16)
+#define out_subc_offset       31
+#define out_sfbt_mask         size_mask(4)
+#define out_sfbt_offset       15
+#define out_rembytecnt_mask   size_mask(16)
+#define out_rembytecnt_offset 31
+#define out_dhtlen_mask       size_mask(12)
+#define out_dhtlen_offset     31
+
+/* CRB */
+
+#define gzip_fc_mask          size_mask(8)
+#define gzip_fc_offset        31
+#define crb_c_mask            size_mask(1)
+#define crb_c_offset          28
+#define crb_at_mask           size_mask(1)
+#define crb_at_offset         30
+#define csb_address_mask      ~(15UL) /* mask off bottom 4b */
+
+/*
+ * Access macros for the registers.  Do not access registers directly
+ * because of the endian conversion.  P9 processor may run either as
+ * Little or Big endian. However the NX coprocessor regs are always
+ * big endian.
+ * Use the 32 and 64b macros to access respective
+ * register sizes.
+ * Use nn forms for the register fields shorter than 32 bits.
+ */
+
+#define getnn(ST, REG)      ((be32toh(ST.REG) >> (31-REG##_offset)) \
+				 & REG##_mask)
+#define getpnn(ST, REG)     ((be32toh((ST)->REG) >> (31-REG##_offset)) \
+				 & REG##_mask)
+#define get32(ST, REG)      (be32toh(ST.REG))
+#define getp32(ST, REG)     (be32toh((ST)->REG))
+#define get64(ST, REG)      (be64toh(ST.REG))
+#define getp64(ST, REG)     (be64toh((ST)->REG))
+
+#define unget32(ST, REG)    (get32(ST, REG) & ~((REG##_mask) \
+				<< (31-REG##_offset)))
+/* get 32bits less the REG field */
+
+#define ungetp32(ST, REG)   (getp32(ST, REG) & ~((REG##_mask) \
+				<< (31-REG##_offset)))
+/* get 32bits less the REG field */
+
+#define clear_regs(ST)      do { memset((void *)(&(ST)), 0, sizeof(ST)); \
+				} while (0)
+#define clear_dde(ST)       do { ST.dde_count = ST.ddebc = 0; ST.ddead = 0; \
+				} while (0)
+#define clearp_dde(ST)      do { (ST)->dde_count = (ST)->ddebc = 0; \
+				 (ST)->ddead = 0; \
+				} while (0)
+#define clear_struct(ST)    do { memset((void *)(&(ST)), 0, sizeof(ST)); \
+				} while (0)
+
+#define putnn(ST, REG, X)   do { ST.REG = htobe32(unget32(ST, REG) | (((X) \
+				 & REG##_mask) << (31-REG##_offset))); \
+				} while (0)
+#define putpnn(ST, REG, X)  do { (ST)->REG = htobe32(ungetp32(ST, REG) \
+				| (((X) & REG##_mask) << (31-REG##_offset))); \
+				} while (0)
+
+#define put32(ST, REG, X)   do { ST.REG = htobe32(X); } while (0)
+#define putp32(ST, REG, X)  do { (ST)->REG = htobe32(X); } while (0)
+#define put64(ST, REG, X)   do { ST.REG = htobe64(X); } while (0)
+#define putp64(ST, REG, X)  do { (ST)->REG = htobe64(X); } while (0)
+
+/*
+ * Completion extension ce(0) ce(1) ce(2).  Bits ce(3-7)
+ * unused.  Section 6.6 Figure 6.7.
+ */
+
+#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce))
+#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5)
+#define put_csb_ce_ms3b(ST, X) do { putnn(ST, csb_ce, ((uint32_t)(X) << 5)); \
+				   } while (0)
+
+#define CSB_CE_PARTIAL         0x4
+#define CSB_CE_TERMINATE       0x2
+#define CSB_CE_TPBC_VALID      0x1
+
+#define csb_ce_termination(X)         (!!((X) & CSB_CE_TERMINATE))
+/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */
+
+#define csb_ce_check_completion(X)    (!csb_ce_termination(X))
+/* if not terminated then check full or partial completion */
+
+#define csb_ce_partial_completion(X)  (!!((X) & CSB_CE_PARTIAL))
+#define csb_ce_full_completion(X)     (!csb_ce_partial_completion(X))
+#define csb_ce_tpbc_valid(X)          (!!((X) & CSB_CE_TPBC_VALID))
+/* TPBC indicates successfully stored data count */
+
+#define csb_ce_default_err(X)         csb_ce_termination(X)
+/* most error CEs have CE(0)=0 and CE(1)=1 */
+
+#define csb_ce_cc3_partial(X)         csb_ce_partial_completion(X)
+/* some CC=3 are partially completed, Table 6-8 */
+
+#define csb_ce_cc64(X)                (((X) & (CSB_CE_PARTIAL \
+					| CSB_CE_TERMINATE)) == 0)
+/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't
+ * compress smaller than source. Mask first: == binds tighter than &.
+ */
+
+/* Decompress SFBT combinations Tables 5-3, 6-4, 6-6 */
+
+#define SFBT_BFINAL 0x1
+#define SFBT_LIT    0x4
+#define SFBT_FHT    0x5
+#define SFBT_DHT    0x6
+#define SFBT_HDR    0x7
+
+/*
+ * NX gzip function codes. Table 6.2.
+ * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to
+ * select one of the two Byte Count Limits.
+ */
+
+#define GZIP_FC_LIMIT_MASK                               0x01
+#define GZIP_FC_COMPRESS_FHT                             0x00
+#define GZIP_FC_COMPRESS_DHT                             0x02
+#define GZIP_FC_COMPRESS_FHT_COUNT                       0x04
+#define GZIP_FC_COMPRESS_DHT_COUNT                       0x06
+#define GZIP_FC_COMPRESS_RESUME_FHT                      0x08
+#define GZIP_FC_COMPRESS_RESUME_DHT                      0x0a
+#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT                0x0c
+#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT                0x0e
+#define GZIP_FC_DECOMPRESS                               0x10
+#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND          0x12
+#define GZIP_FC_DECOMPRESS_RESUME                        0x14
+#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND   0x16
+#define GZIP_FC_WRAP                                     0x1e
+
+#define fc_is_compress(fc)  (((fc) & 0x10) == 0)
+#define fc_has_count(fc)    (fc_is_compress(fc) && (((fc) & 0x4) != 0))
+
+/* CSB.CC Error codes */
+
+#define ERR_NX_OK             0
+#define ERR_NX_ALIGNMENT      1
+#define ERR_NX_OPOVERLAP      2
+#define ERR_NX_DATA_LENGTH    3
+#define ERR_NX_TRANSLATION    5
+#define ERR_NX_PROTECTION     6
+#define ERR_NX_EXTERNAL_UE7   7
+#define ERR_NX_INVALID_OP     8
+#define ERR_NX_PRIVILEGE      9
+#define ERR_NX_INTERNAL_UE   10
+#define ERR_NX_EXTERN_UE_WR  12
+#define ERR_NX_TARGET_SPACE  13
+#define ERR_NX_EXCESSIVE_DDE 14
+#define ERR_NX_TRANSL_WR     15
+#define ERR_NX_PROTECT_WR    16
+#define ERR_NX_SUBFUNCTION   17
+#define ERR_NX_FUNC_ABORT    18
+#define ERR_NX_BYTE_MAX      19
+#define ERR_NX_CORRUPT_CRB   20
+#define ERR_NX_INVALID_CRB   21
+#define ERR_NX_INVALID_DDE   30
+#define ERR_NX_SEGMENTED_DDL 31
+#define ERR_NX_DDE_OVERFLOW  33
+#define ERR_NX_TPBC_GT_SPBC  64
+#define ERR_NX_MISSING_CODE  66
+#define ERR_NX_INVALID_DIST  67
+#define ERR_NX_INVALID_DHT   68
+#define ERR_NX_EXTERNAL_UE90 90
+#define ERR_NX_WDOG_TIMER   224
+#define ERR_NX_AT_FAULT     250
+#define ERR_NX_INTR_SERVER  252
+#define ERR_NX_UE253        253
+#define ERR_NX_NO_HW        254
+#define ERR_NX_HUNG_OP      255
+#define ERR_NX_END          256
+
+/* initial values for non-resume operations */
+#define INIT_CRC   0  /* crc32(0L, Z_NULL, 0) */
+#define INIT_ADLER 1  /* adler32(0L, Z_NULL, 0)  adler is initialized to 1 */
+
+/* prototypes */
+#ifdef NX_JOB_CALLBACK
+int nxu_run_job(nx_gzip_crb_cpb_t *c, void *handle,
+		int (*callback)(const void *));
+#else
+int nxu_run_job(nx_gzip_crb_cpb_t *c, void *handle);
+#endif
+
+
+/* caller supplies a print buffer 4*sizeof(crb) */
+
+char *nx_crb_str(nx_gzip_crb_t *crb, char *prbuf);
+char *nx_cpb_str(nx_gzip_cpb_t *cpb, char *prbuf);
+char *nx_prt_hex(void *cp, int sz, char *prbuf);
+char *nx_lzcount_str(nx_gzip_cpb_t *cpb, char *prbuf);
+char *nx_strerror(int e);
+
+#ifdef NX_SIM
+#include <stdio.h>
+int nx_sim_init(void *ctx);
+int nx_sim_end(void *ctx);
+int nxu_run_sim_job(nx_gzip_crb_cpb_t *c, void *ctx);
+#endif /* NX_SIM */
+
+/* Deflate stream manipulation */
+
+/*
+ * Set or clear bit 0 (the deflate BFINAL bit) of the lvalue x.  The
+ * argument is parenthesized so that any lvalue expression can be passed.
+ */
+#define set_final_bit(x) do { (x) |= (unsigned char)1; } while (0)
+#define clr_final_bit(x) do { (x) &= ~(unsigned char)1; } while (0)
+
+/* Store an empty fixed Huffman block in the two bytes at p. */
+#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \
+				    } while (0)
+/* append 10 bits 0000001b 00...... ;
+ * assumes appending starts on a byte boundary; b is the final bit.
+ */
+
+
+#ifdef NX_842
+
+/* 842 Engine */
+
+typedef struct {
+    union {                   /* byte[0:3]   */
+	uint32_t eft_fc;      /* bits[29-31] */
+    };
+    uint32_t reserved1;       /* byte[4:7]   */
+    union {
+	uint64_t csb_address; /* byte[8:15]  */
+	struct {
+	    uint32_t reserved2;
+	    union {
+		uint32_t crb_c;
+		/* c==0 no ccb defined */
+
+		uint32_t crb_at;
+		/* at==0 address type is ignored;
+		   all addrs effective assumed */
+
+	    };
+	};
+    };
+    nx_dde_t source_dde;           /* byte[16:31] */
+    nx_dde_t target_dde;           /* byte[32:47] */
+    nx_ccb_t ccb;                  /* byte[48:63] */
+    union {
+	nx_qw_t reserved64[3];     /* byte[64:96] */
+    };
+    nx_csb_t csb;
+} nx_eft_crb_t __attribute__((aligned (128)));
+
+/* 842 CRB */
+
+#define EFT_FC_MASK                 size_mask(3)
+#define EFT_FC_OFFSET               31
+#define EFT_FC_COMPRESS             0x0
+#define EFT_FC_COMPRESS_WITH_CRC    0x1
+#define EFT_FC_DECOMPRESS           0x2
+#define EFT_FC_DECOMPRESS_WITH_CRC  0x3
+#define EFT_FC_BLK_DATA_MOVE        0x4
+#endif /* NX_842 */
+
+#endif /* _NXU_H */
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 3/5] selftests/powerpc: Add NX-GZIP engine compress testcase
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
  2020-03-16 18:07 ` [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test Raphael Moreira Zinsly
  2020-03-16 18:07 ` [PATCH 2/5] selftests/powerpc: Add header files for NX compresion/decompression Raphael Moreira Zinsly
@ 2020-03-16 18:07 ` Raphael Moreira Zinsly
  2020-03-16 18:07 ` [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase Raphael Moreira Zinsly
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto
  Cc: herbert, mpe, haren, abali, Raphael Moreira Zinsly

Add a compression testcase for the powerpc NX-GZIP engine.

Signed-off-by: Bulent Abali <abali@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
---
 .../selftests/powerpc/nx-gzip/Makefile        |  21 +
 .../selftests/powerpc/nx-gzip/gzfht_test.c    | 475 ++++++++++++++++++
 .../selftests/powerpc/nx-gzip/gzip_vas.c      | 257 ++++++++++
 3 files changed, 753 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/Makefile
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c

diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
new file mode 100644
index 000000000000..ab903f63bbbd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
@@ -0,0 +1,21 @@
+CC = gcc
+CFLAGS = -O3
+INC = ./inc
+SRC = gzfht_test.c
+OBJ = $(SRC:.c=.o)
+TESTS = gzfht_test
+EXTRA_SOURCES = gzip_vas.c
+
+all:	$(TESTS)
+
+$(OBJ): %.o: %.c
+	$(CC) $(CFLAGS) -I$(INC) -c $<
+
+$(TESTS): $(OBJ)
+	$(CC) $(CFLAGS) -I$(INC) -o $@ $@.o $(EXTRA_SOURCES)
+
+run_tests: $(TESTS)
+	./gzfht_test gzip_vas.c
+
+clean:
+	rm -f $(TESTS) *.o *~ *.gz
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
new file mode 100644
index 000000000000..29d83fe2694f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
@@ -0,0 +1,475 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * P9 gzip sample code for demonstrating the P9 NX hardware interface.
+ * Not intended for production use or for performance or compression
+ * ratio measurements.  For simplicity of demonstration, this sample
+ * code compresses into fixed Huffman blocks only (Deflate btype=1)
+ * and has very simple memory management.  Dynamic Huffman blocks
+ * (Deflate btype=2) are more involved as detailed in the user guide.
+ * Note also that /dev/crypto/nx-gzip, VAS and skiboot support are
+ * required.
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * https://github.com/libnxz/power-gzip for zlib api and other utils
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ * Definitions of acronyms used here. See
+ * P9 NX Gzip Accelerator User's Manual for details
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce:       completion extension
+ * cpb:      coprocessor parameter block (metadata)
+ * crb:      coprocessor request block (command)
+ * csb:      coprocessor status block (status)
+ * dht:      dynamic huffman table
+ * dde:      data descriptor element (address, length)
+ * ddl:      list of ddes
+ * dh/fh:    dynamic and fixed huffman types
+ * fc:       coprocessor function code
+ * histlen:  history/dictionary length
+ * history:  sliding window of up to 32KB of data
+ * lzcount:  Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt:     source final block type; last block's type during decomp
+ * spbc:     source processed byte count
+ * subc:     source unprocessed bit count
+ * tebc:     target ending bit count; valid bits in the last byte
+ * tpbc:     target processed byte count
+ * vas:      virtual accelerator switch; the user mode interface
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+
+int nx_dbg = 0;
+FILE *nx_gzip_log = NULL;
+
+extern void *nx_fault_storage_address;
+extern void *nx_function_begin(int function, int pri);
+extern int nx_function_end(void *handle);
+
+#define NX_MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+
+/*
+ * Runs one fixed Huffman (FHT) compression job on the NX engine.
+ *
+ * src/srclen and dst/dstlen describe the source and target buffers; each
+ * is passed as one direct DDE.  with_count selects the *_COUNT function
+ * code.  LZ counts are returned in the user supplied nx_gzip_crb_cpb_t
+ * structure.  handle is passed through to nxu_run_job().
+ *
+ * Returns the CSB completion code (CC, Table 6-8), or the negative value
+ * returned by nxu_run_job() when the job could not be run.
+ */
+static int compress_fht_sample(char *src, uint32_t srclen, char *dst,
+				uint32_t dstlen, int with_count,
+				nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int cc;
+	int rc;
+	uint32_t fc;
+
+	assert(!!cmdp);
+
+	put32(cmdp->crb, gzip_fc, 0);  /* clear */
+	fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT :
+			    GZIP_FC_COMPRESS_RESUME_FHT;
+	putnn(cmdp->crb, gzip_fc, fc);
+	putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */
+	memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+
+	/* Section 6.6 programming notes; spbc may be in two different
+	 * places depending on FC.
+	 */
+	if (!with_count)
+		put32(cmdp->cpb, out_spbc_comp, 0);
+	else
+		put32(cmdp->cpb, out_spbc_comp_with_count, 0);
+
+	/* Figure 6-3 6-4; CSB location.  put64() assigns the whole field,
+	 * so no separate clearing store is needed.
+	 */
+	put64(cmdp->crb, csb_address,
+	      (uint64_t) &cmdp->crb.csb & csb_address_mask);
+
+	/* Source direct dde (scatter-gather list) */
+	clear_dde(cmdp->crb.source_dde);
+	putnn(cmdp->crb.source_dde, dde_count, 0);
+	put32(cmdp->crb.source_dde, ddebc, srclen);
+	put64(cmdp->crb.source_dde, ddead, (uint64_t) src);
+
+	/* Target direct dde (scatter-gather list) */
+	clear_dde(cmdp->crb.target_dde);
+	putnn(cmdp->crb.target_dde, dde_count, 0);
+	put32(cmdp->crb.target_dde, ddebc, dstlen);
+	put64(cmdp->crb.target_dde, ddead, (uint64_t) dst);
+
+	/* Submit the crb, the job descriptor, to the accelerator.  A
+	 * negative result means the CSB will not become valid, so report
+	 * it instead of polling forever below.
+	 */
+	rc = nxu_run_job(cmdp, handle);
+	if (rc < 0)
+		return rc;
+
+	/* Poll for the csb.v bit; you should also consider sleeping
+	 * or interrupts.
+	 */
+	do { ; } while (getnn(cmdp->crb.csb, csb_v) == 0);
+
+	/* CC Table 6-8 */
+	cc = getnn(cmdp->crb.csb, csb_cc);
+
+	return cc;
+}
+
+/*
+ * Stores a minimal gzip member header (FLG and MTIME are zero, so there
+ * is no file name and no timestamp) at buf and returns the number of
+ * bytes stored.
+ * Gzip specification at https://tools.ietf.org/html/rfc1952
+ */
+int gzip_header_blank(char *buf)
+{
+	static const unsigned char blank_hdr[] = {
+		0x1f,			/* ID1 */
+		0x8b,			/* ID2 */
+		0x08,			/* CM  */
+		0x00,			/* FLG */
+		0x00, 0x00, 0x00, 0x00,	/* MTIME */
+		0x04,			/* XFL 4=fastest */
+		0x03,			/* OS UNIX */
+	};
+
+	memcpy(buf, blank_hdr, sizeof(blank_hdr));
+	return (int) sizeof(blank_hdr);
+}
+
+/*
+ * Reads the whole file fname into a newly allocated buffer.
+ *
+ * On success stores the buffer in *buf and its length in *bufsize and
+ * returns 0; the caller must free the buffer.  On error prints a
+ * message, leaves *buf and *bufsize untouched and returns nonzero.
+ */
+int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
+{
+	struct stat statbuf;
+	FILE *fp;
+	char *p;
+	size_t file_size;
+	size_t num_bytes;
+
+	if (stat(fname, &statbuf)) {
+		perror(fname);
+		return -1;
+	}
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		perror(fname);
+		return -1;
+	}
+	file_size = (size_t) statbuf.st_size;
+	/* Not inside assert(): the allocation must survive -DNDEBUG.
+	 * malloc(0) may return NULL, so always request at least one byte.
+	 */
+	p = malloc(file_size ? file_size : 1);
+	if (p == NULL) {
+		perror(fname);
+		fclose(fp);
+		return -1;
+	}
+	num_bytes = fread(p, 1, file_size, fp);
+	if (ferror(fp) || (num_bytes != file_size)) {
+		/* A short read without a stream error does not set errno */
+		if (ferror(fp))
+			perror(fname);
+		else
+			fprintf(stderr, "%s: short read\n", fname);
+		free(p);
+		fclose(fp);
+		return -1;
+	}
+	fclose(fp);
+	*buf = p;
+	*bufsize = num_bytes;
+	return 0;
+}
+
+/*
+ * Writes bufsize bytes from buf to the file fname, opened with mode "w".
+ * Returns 0 on success and nonzero on error; the stream is closed on
+ * every path.
+ */
+int write_output_file(char *fname, char *buf, size_t bufsize)
+{
+	FILE *fp;
+	size_t num_bytes;
+
+	fp = fopen(fname, "w");
+	if (fp == NULL) {
+		perror(fname);
+		return -1;
+	}
+	num_bytes = fwrite(buf, 1, bufsize, fp);
+	if (ferror(fp) || (num_bytes != bufsize)) {
+		perror(fname);
+		fclose(fp);
+		return -1;
+	}
+	/* fclose() flushes buffered data, so its failure is a write error */
+	if (fclose(fp)) {
+		perror(fname);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Appends an empty stored block (Z_SYNC_FLUSH as described in zlib.h)
+ * at buf.  tebc is the number of valid bits in the byte just before
+ * buf (0 when the stream ends on a byte boundary); final is the BFINAL
+ * bit for the appended block.
+ * Returns number of appended bytes
+ */
+int append_sync_flush(char *buf, int tebc, int final)
+{
+	int used_bits = (tebc & 0x7);
+	int len_pos;
+	int nbytes;
+	int k;
+	uint64_t bits;
+
+	if (tebc > 0) {
+		/* Last byte is partially full: step back onto it and keep
+		 * only its valid low order bits.
+		 */
+		buf--;
+		*buf &= (unsigned char) ((1<<tebc)-1);
+	} else {
+		*buf = 0;
+	}
+
+	bits = ((0x1ULL & final) << used_bits) | *buf;
+	/* BFINAL and BTYPE take 3 bits; the length words start on the
+	 * next byte boundary after them.
+	 */
+	len_pos = (used_bits + 3 <= 8) ? 8 : 16;
+	bits |= (0xFFFF0000ULL) << len_pos; /* Zero length block */
+
+	nbytes = (len_pos + 32) / 8;
+	for (k = 0; k < nbytes; k++) {
+		buf[k] = (unsigned char) (bits & 0xffULL);
+		bits >>= 8;
+	}
+
+	if ((tebc > 5) || (tebc == 0))
+		return 5;
+	return 4;
+}
+
+/*
+ * Fault in pages prior to NX job submission. wr=1 may be required to
+ * touch writeable pages.  System zero pages do not fault-in the page as
+ * intended.  Typically set wr=1 for NX target pages and set wr=0 for NX
+ * source pages.
+ *
+ * Reads (and with wr set, writes back) one byte every page_len bytes of
+ * buf[0..buf_len-1], then the last byte.  Returns 0 on success and -1
+ * when buf is NULL or buf_len or page_len is not positive; the assert
+ * fires first for a NULL buf or negative buf_len unless NDEBUG is set.
+ */
+static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
+{
+	char *base = buf;
+	long off = 0;
+	volatile char t;
+
+	assert(buf_len >= 0 && !!buf);
+
+	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
+			(void *) (base + buf_len), buf_len, wr));
+
+	/* A non-positive page_len would never advance through the buffer */
+	if (buf_len <= 0 || buf == NULL || page_len <= 0)
+		return -1;
+
+	/* Offsets are used so that no pointer beyond the buffer is formed */
+	do {
+		t = base[off];
+		if (wr)
+			base[off] = t;
+		off += page_len;
+	} while (off < buf_len - 1);
+
+	/* When buf_len is small or buf tail is in another page */
+	t = base[buf_len - 1];
+	if (wr)
+		base[buf_len - 1] = t;
+
+	return 0;
+}
+
+/*
+ * Sets (bfinal nonzero) or clears (bfinal zero) bit 0 of the first byte
+ * at buf, the final deflate block bit.  This call assumes the block
+ * beginning is byte aligned.
+ */
+static void set_bfinal(void *buf, int bfinal)
+{
+	char *hdr = buf;
+
+	if (!bfinal) {
+		*hdr &= (unsigned char) 0xfe;
+		return;
+	}
+	*hdr |= (unsigned char) 0x01;
+}
+
+/*
+ * Compresses the file named by argv[1] into "<name>.nx.gz" with NX fixed
+ * Huffman jobs, one chunk at a time, framed as a single gzip member.
+ *
+ * argc/argv are the program arguments (exactly one file name is
+ * expected); handle is passed to compress_fht_sample().  Returns 0 on
+ * success.  Usage, allocation, I/O and NX errors print a message and
+ * terminate the process with exit(-1).
+ */
+int compress_file(int argc, char **argv, void *handle)
+{
+	char *inbuf, *outbuf, *srcbuf, *dstbuf;
+	char outname[1024];
+	uint32_t srclen, dstlen;
+	uint32_t flushlen, chunk;
+	size_t inlen, outlen, dsttotlen, srctotlen;
+	uint32_t crc = 0, spbc, tpbc, tebc;
+	uint32_t isize;
+	int lzcounts = 0;
+	int cc;
+	int namelen;
+	int num_hdr_bytes;
+	nx_gzip_crb_cpb_t *cmdp;
+	uint32_t pagelen = 65536;
+	int fault_tries = 50;
+
+	if (argc != 2) {
+		fprintf(stderr, "usage: %s <fname>\n", argv[0]);
+		exit(-1);
+	}
+
+	/* Build the output name first so an over-long name fails early
+	 * instead of overflowing outname.
+	 */
+	namelen = snprintf(outname, sizeof(outname), "%s.nx.gz", argv[1]);
+	if (namelen < 0 || (size_t) namelen >= sizeof(outname)) {
+		fprintf(stderr, "file name too long: %s\n", argv[1]);
+		exit(-1);
+	}
+
+	cmdp = (void *)(uintptr_t) aligned_alloc(sizeof(nx_gzip_crb_t),
+						 sizeof(nx_gzip_crb_cpb_t));
+	if (cmdp == NULL) {
+		fprintf(stderr, "cannot allocate the crb/cpb\n");
+		exit(-1);
+	}
+
+	if (read_alloc_input_file(argv[1], &inbuf, &inlen))
+		exit(-1);
+	fprintf(stderr, "file %s read, %zu bytes\n", argv[1], inlen);
+
+	/* Generous output buffer for header/trailer */
+	outlen = 2 * inlen + 1024;
+
+	/* Not inside assert(): the allocation must survive -DNDEBUG */
+	outbuf = malloc(outlen);
+	if (outbuf == NULL) {
+		fprintf(stderr, "cannot allocate the output buffer\n");
+		exit(-1);
+	}
+	nx_touch_pages(outbuf, outlen, pagelen, 1);
+
+	/* Compress piecemeal in smallish chunks */
+	chunk = 1<<22;
+
+	/* Write the gzip header to the stream */
+	num_hdr_bytes = gzip_header_blank(outbuf);
+	dstbuf    = outbuf + num_hdr_bytes;
+	outlen    = outlen - num_hdr_bytes;
+	dsttotlen = num_hdr_bytes;
+
+	srcbuf    = inbuf;
+	srctotlen = 0;
+
+	/* Init the CRB, the coprocessor request block */
+	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+	/* Initial gzip crc32 */
+	put32(cmdp->cpb, in_crc, 0);
+
+	while (inlen > 0) {
+
+		/* Submit chunk size source data per job */
+		srclen = NX_MIN(chunk, inlen);
+		/* Supply large target in case data expands */
+		dstlen = NX_MIN(2*srclen, outlen);
+
+		/* Page faults are handled by the user code */
+
+		/* Fault-in pages; an improved code wouldn't touch so
+		 * many pages but would try to estimate the
+		 * compression ratio and adjust both the src and dst
+		 * touch amounts.
+		 */
+		nx_touch_pages(cmdp, sizeof(nx_gzip_crb_cpb_t), pagelen, 1);
+		nx_touch_pages(srcbuf, srclen, pagelen, 0);
+		nx_touch_pages(dstbuf, dstlen, pagelen, 1);
+
+		cc = compress_fht_sample(
+			srcbuf, srclen,
+			dstbuf, dstlen,
+			lzcounts, cmdp, handle);
+
+		if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC &&
+		    cc != ERR_NX_TRANSLATION) {
+			fprintf(stderr, "nx error: cc= %d\n", cc);
+			exit(-1);
+		}
+
+		/* Page faults are handled by the user code */
+		if (cc == ERR_NX_TRANSLATION) {
+			/* Read the reported address to fault the page in */
+			volatile char touch = *(char *)cmdp->crb.csb.fsaddr;
+
+			(void) touch;
+			NXPRT(fprintf(stderr, "page fault: cc= %d, try= %d, "
+				  "fsa= %08llx\n", cc, fault_tries,
+				  (unsigned long long) cmdp->crb.csb.fsaddr));
+
+			fault_tries--;
+			if (fault_tries > 0)
+				continue;	/* resubmit the same chunk */
+
+			fprintf(stderr,
+				"error: cannot progress; too many faults\n");
+			exit(-1);
+		}
+
+		fault_tries = 50; /* Reset for the next chunk */
+
+		inlen     = inlen - srclen;
+		srcbuf    = srcbuf + srclen;
+		srctotlen = srctotlen + srclen;
+
+		/* Two possible locations for spbc depending on the function
+		 * code.
+		 */
+		spbc = (!lzcounts) ? get32(cmdp->cpb, out_spbc_comp) :
+			get32(cmdp->cpb, out_spbc_comp_with_count);
+		assert(spbc == srclen);
+
+		/* Target byte count */
+		tpbc = get32(cmdp->crb.csb, tpbc);
+		/* Target ending bit count */
+		tebc = getnn(cmdp->cpb, out_tebc);
+		NXPRT(fprintf(stderr, "compressed chunk %u to %u bytes, "
+				"tebc= %u\n", spbc, tpbc, tebc));
+
+		if (inlen > 0) { /* More chunks to go */
+			set_bfinal(dstbuf, 0);
+			dstbuf    = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen    = outlen - tpbc;
+			/* Round up to the next byte with a flush
+			 * block; do not set the BFINAL bit.
+			 */
+			flushlen  = append_sync_flush(dstbuf, tebc, 0);
+			dsttotlen = dsttotlen + flushlen;
+			outlen    = outlen - flushlen;
+			dstbuf    = dstbuf + flushlen;
+			NXPRT(fprintf(stderr, "added deflate sync_flush %u "
+					"bytes\n", flushlen));
+		} else {  /* Done */
+			/* Set the BFINAL bit of the last block per Deflate
+			 * specification.
+			 */
+			set_bfinal(dstbuf, 1);
+			dstbuf    = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen    = outlen - tpbc;
+		}
+
+		/* Resuming crc32 for the next chunk */
+		crc = get32(cmdp->cpb, out_crc);
+		put32(cmdp->cpb, in_crc, crc);
+		crc = be32toh(crc);
+	}
+
+	if (srctotlen == 0) {
+		/* Empty input: no job ran, so emit one empty final fixed
+		 * Huffman block to keep the deflate stream well formed;
+		 * crc stays at its initial value of 0.
+		 */
+		append_empty_fh_blk(dstbuf, 1);
+		dstbuf    = dstbuf + 2;
+		dsttotlen = dsttotlen + 2;
+		outlen    = outlen - 2;
+	}
+
+	/* Append crc32 and ISIZE to the end.  ISIZE is the input size
+	 * modulo 2^32 stored little endian (RFC 1952); copying the first
+	 * four bytes of a size_t is only right on little endian hosts.
+	 */
+	memcpy(dstbuf, &crc, 4);
+	isize = htole32((uint32_t) srctotlen);
+	memcpy(dstbuf+4, &isize, 4);
+	dsttotlen = dsttotlen + 8;
+	outlen    = outlen - 8;
+
+	if (write_output_file(outname, outbuf, dsttotlen)) {
+		fprintf(stderr, "write error: %s\n", outname);
+		exit(-1);
+	}
+
+	fprintf(stderr, "compressed %zu to %zu bytes total, "
+		"crc32 checksum = %08x\n", srctotlen, dsttotlen, crc);
+
+	free(inbuf);
+	free(outbuf);
+	free(cmdp);
+
+	return 0;
+}
+
+/*
+ * SIGSEGV handler installed by main() with SA_SIGINFO.  Prints the pid,
+ * signal number, si_code and faulting address, then records the address
+ * in nx_fault_storage_address.
+ * NOTE(review): fprintf() is not on the POSIX async-signal-safe list;
+ * presumably tolerated because this is sample code - confirm.
+ */
+void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
+{
+	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
+		sig, info->si_code, info->si_addr);
+
+	nx_fault_storage_address = info->si_addr;
+}
+
+/*
+ * Installs the SIGSEGV handler, opens the NX gzip function, compresses
+ * the file named on the command line and releases the NX handle.
+ * Returns the result of compress_file(); exits with -1 if the handler
+ * cannot be installed or NX cannot be initialized.
+ */
+int main(int argc, char **argv)
+{
+	int rc;
+	struct sigaction act;
+	void *handle;
+
+	/* Zero the whole struct rather than naming individual members;
+	 * this also covers sa_restorer without referring to it.
+	 */
+	memset(&act, 0, sizeof(act));
+	act.sa_sigaction = sigsegv_handler;
+	act.sa_flags = SA_SIGINFO;
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGSEGV, &act, NULL)) {
+		perror("sigaction");
+		exit(-1);
+	}
+
+	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+	if (!handle) {
+		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+		exit(-1);
+	}
+
+	rc = compress_file(argc, argv, handle);
+
+	nx_function_end(handle);
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
new file mode 100644
index 000000000000..6408d7c0b8ac
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nx-gzip.h"
+#include "nx.h"
+#include "copy-paste.h"
+#include "nxu.h"
+#include "nx_dbg.h"
+#include <sys/platform/ppc.h>
+
+#define barrier()
+#define hwsync()    asm volatile("hwsync" ::: "memory")
+
+#ifndef NX_NO_CPU_PRI
+#define cpu_pri_default()  asm volatile ("or 2, 2, 2")
+#define cpu_pri_low()      asm volatile ("or 31, 31, 31")
+#else
+#define cpu_pri_default()  do { ; } while (0)
+#define cpu_pri_low()      do { ; } while (0)
+#endif
+
+void *nx_fault_storage_address;
+uint64_t dbgtimer = 0;
+
+struct nx_handle {
+	int fd;
+	int function;
+	void *paste_addr;
+};
+
+/*
+ * Opens devname, opens a VAS send window on it with vas_id set to pri
+ * and maps 4096 bytes of the device.
+ *
+ * On success fills handle->fd and handle->paste_addr (the mapping plus
+ * 0x400) and returns 0; the descriptor stays open so that
+ * nx_function_end() can close it.  On failure returns a negative errno
+ * value and closes the descriptor.
+ */
+static int open_device_nodes(char *devname, int pri, struct nx_handle *handle)
+{
+	int rc, fd;
+	int err;
+	void *addr;
+	struct vas_gzip_setup_attr txattr;
+
+	fd = open(devname, O_RDWR);
+	if (fd < 0) {
+		err = errno;	/* fprintf() may change errno */
+		fprintf(stderr, " open device name %s\n", devname);
+		return -err;
+	}
+
+	memset(&txattr, 0, sizeof(txattr));
+	txattr.version = 1;
+	txattr.vas_id = pri;
+	rc = ioctl(fd, VAS_GZIP_TX_WIN_OPEN, (unsigned long)&txattr);
+	if (rc < 0) {
+		err = errno;
+		fprintf(stderr, "ioctl() n %d, error %d\n", rc, err);
+		rc = -err;
+		goto out;
+	}
+
+	addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0ULL);
+	if (addr == MAP_FAILED) {
+		err = errno;
+		fprintf(stderr, "mmap() failed, errno %d\n", err);
+		rc = -err;
+		goto out;
+	}
+	handle->fd = fd;
+	handle->paste_addr = (void *)((char *)addr + 0x400);
+
+	/* Do not fall into the error path: closing fd here would leave a
+	 * stale descriptor number in the handle.
+	 */
+	return 0;
+out:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Allocates an NX handle for the given function and opens the gzip
+ * device through open_device_nodes().  Only NX_FUNC_COMP_GZIP is
+ * accepted; pri is passed on as the VAS id.
+ *
+ * Returns the handle, to be released with nx_function_end(), or NULL
+ * with errno set (EINVAL, ENOMEM or the open_device_nodes() error).
+ */
+void *nx_function_begin(int function, int pri)
+{
+	int rc;
+	char *devname = "/dev/crypto/nx-gzip";
+	struct nx_handle *nxhandle;
+
+	/* errno is assigned last on each path: fprintf() and free() are
+	 * allowed to change it.
+	 */
+	if (function != NX_FUNC_COMP_GZIP) {
+		fprintf(stderr, " NX_FUNC_COMP_GZIP not found\n");
+		errno = EINVAL;
+		return NULL;
+	}
+
+
+	nxhandle = malloc(sizeof(*nxhandle));
+	if (!nxhandle) {
+		fprintf(stderr, " No memory\n");
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	nxhandle->function = function;
+	rc = open_device_nodes(devname, pri, nxhandle);
+	if (rc < 0) {
+		fprintf(stderr, " open_device_nodes failed\n");
+		free(nxhandle);	/* do not leak the handle on failure */
+		errno = -rc;
+		return NULL;
+	}
+
+	return nxhandle;
+}
+
+/*
+ * Releases a handle returned by nx_function_begin(): unmaps the 4096
+ * byte window (paste_addr minus 0x400), closes the descriptor and frees
+ * the handle.  The descriptor and the handle are released even when
+ * munmap() fails.
+ *
+ * Returns the munmap() result, or -1 if handle is NULL.
+ */
+int nx_function_end(void *handle)
+{
+	int rc;
+	struct nx_handle *nxhandle = handle;
+
+	if (!nxhandle)
+		return -1;
+
+	/* char * arithmetic: subtracting from a void * is a GNU extension */
+	rc = munmap((char *) nxhandle->paste_addr - 0x400, 4096);
+	if (rc < 0)
+		fprintf(stderr, "munmap() failed, errno %d\n", errno);
+
+	close(nxhandle->fd);
+	free(nxhandle);
+
+	return rc;
+}
+
+/*
+ * Polls the csb_v bit of cmdp->crb.csb until it is set.
+ *
+ * Returns 0 once csb_v is set, -EAGAIN when nx_fault_storage_address
+ * becomes non-NULL while polling, and -ETIMEDOUT when csb_v is still
+ * clear after more than CSB_MAX_POLL polls.
+ */
+static int nx_wait_for_csb(nx_gzip_crb_cpb_t *cmdp)
+{
+	volatile long poll = 0;
+	uint64_t t;
+
+	/* Save power and let other threads use the h/w. top may show
+	 * 100% but only because OS doesn't know we slowed this
+	 * h/w thread while polling. We're letting other threads have
+	 * higher throughput on the core.
+	 */
+	cpu_pri_low();
+
+#define CSB_MAX_POLL 200000000UL
+#define USLEEP_TH     300000UL
+
+	t = __ppc_get_timebase();
+
+	while (getnn(cmdp->crb.csb, csb_v) == 0) {
+		++poll;
+		hwsync();
+
+		cpu_pri_low();
+
+		/* usleep(0) takes around 29000 ticks ~60 us.
+		 * 300000 is spinning for about 600 us then
+		 * start sleeping.
+		 */
+		if ((__ppc_get_timebase() - t) > USLEEP_TH) {
+			cpu_pri_default();
+			usleep(1);
+		}
+
+		if (poll > CSB_MAX_POLL)
+			break;
+
+		/* Fault address from signal handler */
+		if (nx_fault_storage_address) {
+			cpu_pri_default();
+			return -EAGAIN;
+		}
+
+	}
+
+	cpu_pri_default();
+
+	/* hw has updated csb and output buffer */
+	hwsync();
+
+	/* Check CSB flags. */
+	if (getnn(cmdp->crb.csb, csb_v) == 0) {
+		/* Adjacent literals instead of a backslash continuation,
+		 * which put the source indentation into the message.
+		 */
+		fprintf(stderr, "CSB still not valid after %d polls, "
+			"giving up\n", (int) poll);
+		prt_err("CSB still not valid after %d polls, giving up.\n",
+			(int) poll);
+		return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/*
+ * Submits the CRB in cmdp to the accelerator with copy/paste and waits
+ * for its CSB, retrying the paste up to 5000 times.
+ *
+ * handle is the value returned by nx_function_begin().  Returns 0 when
+ * the CSB became valid, the negative nx_wait_for_csb() error otherwise,
+ * or -EBUSY when every paste attempt was rejected.
+ */
+int nxu_run_job(nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int i, retries;
+	int ret = -EBUSY;
+	struct nx_handle *nxhandle = handle;
+
+	assert(handle != NULL);
+	i = 0;
+	retries = 5000;
+	while (i++ < retries) {
+		hwsync();
+		vas_copy(&cmdp->crb, 0);
+		ret = vas_paste(nxhandle->paste_addr, 0);
+		hwsync();
+
+		NXPRT(fprintf(stderr, "Paste attempt %d/%d returns 0x%x\n",
+				i, retries, ret));
+
+		if ((ret == 2) || (ret == 3)) {
+
+			ret = nx_wait_for_csb(cmdp);
+			if (!ret) {
+				goto out;
+			} else if (ret == -EAGAIN) {
+				/* Touch the faulting address recorded by
+				 * the signal handler, then resubmit.
+				 */
+				volatile long x;
+				prt_err("Touching address %p, 0x%lx\n",
+					 nx_fault_storage_address,
+					 *(long *) nx_fault_storage_address);
+				x = *(long *) nx_fault_storage_address;
+				*(long *) nx_fault_storage_address = x;
+				nx_fault_storage_address = 0;
+				continue;
+			} else {
+				prt_err("wait_for_csb() returns %d\n", ret);
+				break;
+			}
+		} else {
+			if (i < 10) {
+				/* spin for few ticks */
+#define SPIN_TH 500UL
+				uint64_t fail_spin;
+				fail_spin = __ppc_get_timebase();
+				while ((__ppc_get_timebase() - fail_spin) <
+					 SPIN_TH)
+					{ ; }
+			} else {
+				/* sleep */
+				static unsigned int pr = 0;
+				if (pr++ % 100 == 0) {
+					prt_err("Paste attempt %d/%d, "
+						 "failed pid= %d\n", i, retries,
+						 getpid());
+				}
+				usleep(1);
+			}
+			continue;
+		}
+	}
+
+	/* Reached without a valid CSB.  A non-negative value here is the
+	 * last rejected paste code, which callers would not recognize as
+	 * a failure; report it as an error instead.
+	 */
+	if (ret >= 0)
+		ret = -EBUSY;
+
+out:
+	cpu_pri_default();
+
+	return ret;
+}
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
                   ` (2 preceding siblings ...)
  2020-03-16 18:07 ` [PATCH 3/5] selftests/powerpc: Add NX-GZIP engine compress testcase Raphael Moreira Zinsly
@ 2020-03-16 18:07 ` Raphael Moreira Zinsly
  2020-03-18  4:31   ` Daniel Axtens
  2020-03-18  6:18   ` Daniel Axtens
  2020-03-16 18:07 ` [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests Raphael Moreira Zinsly
  2020-03-16 21:50 ` [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Haren Myneni
  5 siblings, 2 replies; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto
  Cc: herbert, mpe, haren, abali, Raphael Moreira Zinsly

Include a decompression testcase for the powerpc NX-GZIP
engine.

Signed-off-by: Bulent Abali <abali@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
---
 .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
 .../selftests/powerpc/nx-gzip/gunz_test.c     | 1058 +++++++++++++++++
 2 files changed, 1062 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c

diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
index ab903f63bbbd..82abc19a49a0 100644
--- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
+++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
@@ -1,9 +1,9 @@
 CC = gcc
 CFLAGS = -O3
 INC = ./inc
-SRC = gzfht_test.c
+SRC = gzfht_test.c gunz_test.c
 OBJ = $(SRC:.c=.o)
-TESTS = gzfht_test
+TESTS = gzfht_test gunz_test
 EXTRA_SOURCES = gzip_vas.c
 
 all:	$(TESTS)
@@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
 
 run_tests: $(TESTS)
 	./gzfht_test gzip_vas.c
+	./gunz_test gzip_vas.c.nx.gz
 
 clean:
-	rm -f $(TESTS) *.o *~ *.gz
+	rm -f $(TESTS) *.o *~ *.gz *.gunzip
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
new file mode 100644
index 000000000000..653de92698cc
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
@@ -0,0 +1,1058 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * P9 gunzip sample code for demonstrating the P9 NX hardware
+ * interface.  Not intended for production use or for performance or
+ * compression ratio measurements.  Note also that /dev/crypto/nx-gzip,
+ * VAS and skiboot support are required.
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ * https://github.com/libnxz/power-gzip for zlib api and other utils
+ * Definitions of acronyms used here.  See
+ * P9 NX Gzip Accelerator User's Manual for details
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce:       completion extension
+ * cpb:      coprocessor parameter block (metadata)
+ * crb:      coprocessor request block (command)
+ * csb:      coprocessor status block (status)
+ * dht:      dynamic huffman table
+ * dde:      data descriptor element (address, length)
+ * ddl:      list of ddes
+ * dh/fh:    dynamic and fixed huffman types
+ * fc:       coprocessor function code
+ * histlen:  history/dictionary length
+ * history:  sliding window of up to 32KB of data
+ * lzcount:  Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt:     source final block type; last block's type during decomp
+ * spbc:     source processed byte count
+ * subc:     source unprocessed bit count
+ * tebc:     target ending bit count; valid bits in the last byte
+ * tpbc:     target processed byte count
+ * vas:      virtual accelerator switch; the user mode interface
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+
+int nx_dbg = 0;
+FILE *nx_gzip_log = NULL;
+
+#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
+#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
+
+#define mb()     asm volatile("sync" ::: "memory")
+#define rmb()    asm volatile("lwsync" ::: "memory")
+#define wmb()    rmb()
+
+const int fifo_in_len = 1<<24;
+const int fifo_out_len = 1<<24;
+const int page_sz = 1<<16;
+const int line_sz = 1<<7;
+const int window_max = 1<<15;
+const int retry_max = 50;
+
+extern void *nx_fault_storage_address;
+extern void *nx_function_begin(int function, int pri);
+extern int nx_function_end(void *handle);
+
+/*
+ * Fault in pages prior to NX job submission.  wr=1 may be required to
+ * touch writeable pages.  System zero pages do not fault-in the page as
+ * intended.  Typically set wr=1 for NX target pages and set wr=0 for
+ * NX source pages.
+ */
+static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
+{
+	char *begin = buf;
+	char *end = (char *) buf + buf_len - 1;
+	volatile char t;
+
+	assert(buf_len >= 0 && !!buf);
+
+	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
+			buf + buf_len, buf_len, wr));
+
+	if (buf_len <= 0 || buf == NULL)
+		return -1;
+
+	do {
+		t = *begin;
+		if (wr)
+			*begin = t;
+		begin = begin + page_len;
+	} while (begin < end);
+
+	/* When buf_len is small or buf tail is in another page. */
+	t = *end;
+	if (wr)
+		*end = t;
+
+	return 0;
+}
+
+void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
+{
+	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
+	       sig, info->si_code, info->si_addr);
+
+	nx_fault_storage_address = info->si_addr;
+}
+
+/*
+ * Adds an (address, len) pair to the list of ddes (ddl) and updates
+ * the base dde.  ddl[0] is the only dde in a direct dde which
+ * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
+ * the indirect (base) dde that points to a list of direct ddes.
+ * See Section 6.4 of the NX-gzip user manual for DDE description.
+ * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
+ * bytes in ddl.  Caller is responsible for allocating the array of
+ * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
+ * list, the ddl array must have N+1 entries minimum.
+ */
+static inline uint32_t nx_append_dde(nx_dde_t *ddl, void *addr, uint32_t len)
+{
+	uint32_t ddecnt;
+	uint32_t bytes;
+
+	if (addr == NULL && len == 0) {
+		clearp_dde(ddl);
+		return 0;
+	}
+
+	NXPRT(fprintf(stderr, "%d: nx_append_dde addr %p len %x\n", __LINE__,
+			addr, len));
+
+	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
+	ddecnt = getpnn(ddl, dde_count);
+	bytes = getp32(ddl, ddebc);
+
+	if (ddecnt == 0 && bytes == 0) {
+		/* First dde is unused; make it a direct dde */
+		bytes = len;
+		putp32(ddl, ddebc, bytes);
+		putp64(ddl, ddead, (uint64_t) addr);
+	} else if (ddecnt == 0) {
+		/* Converting direct to indirect dde
+		 * ddl[0] becomes head dde of ddl
+		 * copy direct to indirect first.
+		 */
+		ddl[1] = ddl[0];
+
+		/* Add the new dde next */
+		clear_dde(ddl[2]);
+		put32(ddl[2], ddebc, len);
+		put64(ddl[2], ddead, (uint64_t) addr);
+
+		/* Ddl head points to 2 direct ddes */
+		ddecnt = 2;
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes);
+		/* Pointer to the first direct dde */
+		putp64(ddl, ddead, (uint64_t) &ddl[1]);
+	} else {
+		/* Append a dde to an existing indirect ddl */
+		++ddecnt;
+		clear_dde(ddl[ddecnt]);
+		put64(ddl[ddecnt], ddead, (uint64_t) addr);
+		put32(ddl[ddecnt], ddebc, len);
+
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
+	}
+	return bytes;
+}
+
+/*
+ * Touch the specified number of pages, given as a number of bytes,
+ * beginning from the first buffer in a dde list.
+ * Do not touch the pages past buf_sz-th byte's page.
+ *
+ * Set buf_sz = 0 to touch all pages described by the ddep.
+ */
+static int nx_touch_pages_dde(nx_dde_t *ddep, long buf_sz, long page_sz,
+				int wr)
+{
+	uint32_t indirect_count;
+	uint32_t buf_len;
+	long total;
+	uint64_t buf_addr;
+	nx_dde_t *dde_list;
+	int i;
+
+	assert(!!ddep);
+
+	indirect_count = getpnn(ddep, dde_count);
+
+	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len \
+			0x%lx\n", indirect_count, buf_sz));
+
+	if (indirect_count == 0) {
+		/* Direct dde */
+		buf_len = getp32(ddep, ddebc);
+		buf_addr = getp64(ddep, ddead);
+
+		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
+				buf_len, (void *)buf_addr));
+
+		if (buf_sz == 0)
+			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+		else
+			nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
+					buf_sz), page_sz, wr);
+
+		return ERR_NX_OK;
+	}
+
+	/* Indirect dde */
+	if (indirect_count > MAX_DDE_COUNT)
+		return ERR_NX_EXCESSIVE_DDE;
+
+	/* First address of the list */
+	dde_list = (nx_dde_t *) getp64(ddep, ddead);
+
+	if (buf_sz == 0)
+		buf_sz = getp32(ddep, ddebc);
+
+	total = 0;
+	for (i = 0; i < indirect_count; i++) {
+		buf_len = get32(dde_list[i], ddebc);
+		buf_addr = get64(dde_list[i], ddead);
+		total += buf_len;
+
+		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total \
+				0x%lx\n", buf_len, (void *)buf_addr, total));
+
+		/* Touching fewer pages than encoded in the ddebc */
+		if (total > buf_sz) {
+			buf_len = NX_MIN(buf_len, total - buf_sz);
+			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+			NXPRT(fprintf(stderr, "touch loop break len 0x%x \
+				      ddead %p\n", buf_len, (void *)buf_addr));
+			break;
+		}
+		nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+	}
+	return ERR_NX_OK;
+}
+
+/*
+ * Src and dst buffers are supplied in scatter gather lists.
+ * NX function code and other parameters supplied in cmdp.
+ */
+static int nx_submit_job(nx_dde_t *src, nx_dde_t *dst, nx_gzip_crb_cpb_t *cmdp,
+			 void *handle)
+{
+	int cc;
+	uint64_t csbaddr;
+
+	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+
+	cmdp->crb.source_dde = *src;
+	cmdp->crb.target_dde = *dst;
+
+	/* Status, output byte count in tpbc */
+	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
+	put64(cmdp->crb, csb_address, csbaddr);
+
+	/* NX reports input bytes in spbc; cleared */
+	cmdp->cpb.out_spbc_comp_wrap = 0;
+	cmdp->cpb.out_spbc_comp_with_count = 0;
+	cmdp->cpb.out_spbc_decomp = 0;
+
+	/* Clear output */
+	put32(cmdp->cpb, out_crc, INIT_CRC);
+	put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+	cc = nxu_run_job(cmdp, handle);
+
+	if (!cc)
+		cc = getnn(cmdp->crb.csb, csb_cc);	/* CC Table 6-8 */
+
+	return cc;
+}
+
+/* fifo queue management */
+#define fifo_used_bytes(used) (used)
+#define fifo_free_bytes(used, len) ((len)-(used))
+/* amount of free bytes in the first and last parts */
+#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+						  ? (len)-((cur)+(used)) : 0)
+#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+						  ? (cur) : (len)-(used))
+/* amount of used bytes in the first and last parts */
+#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
+						  ? (used) : (len)-(cur))
+#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
+						  ? 0 : ((used)+(cur))-(len))
+/* first and last free parts start here */
+#define fifo_free_first_offset(cur, used)      ((cur)+(used))
+#define fifo_free_last_offset(cur, used, len)  \
+					   fifo_used_last_bytes(cur, used, len)
+/* first and last used parts start here */
+#define fifo_used_first_offset(cur)            (cur)
+#define fifo_used_last_offset(cur)             (0)
+
+int decompress_file(int argc, char **argv, void *devhandle)
+{
+	FILE *inpf;
+	FILE *outf;
+
+	int c, expect, i, cc, rc = 0;
+	char gzfname[1024];
+
+	/* Queuing, file ops, byte counting */
+	char *fifo_in, *fifo_out;
+	int used_in, cur_in, used_out, cur_out, read_sz, n;
+	int first_free, last_free, first_used, last_used;
+	int first_offset, last_offset;
+	int write_sz, free_space, source_sz;
+	int source_sz_estimate, target_sz_estimate;
+	uint64_t last_comp_ratio; /* 1000 max */
+	uint64_t total_out;
+	int is_final, is_eof;
+
+	/* nx hardware */
+	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
+	int history_len = 0;
+	nx_gzip_crb_cpb_t cmd, *cmdp;
+	nx_dde_t *ddl_in;
+	nx_dde_t dde_in[6] __attribute__((aligned (128)));
+	nx_dde_t *ddl_out;
+	nx_dde_t dde_out[6] __attribute__((aligned (128)));
+	int pgfault_retries;
+
+	/* when using mmap'ed files */
+	off_t input_file_offset;
+
+	if (argc > 2) {
+		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
+		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
+		return -1;
+	}
+
+	if (argc == 1) {
+		inpf = stdin;
+		outf = stdout;
+	} else if (argc == 2) {
+		char w[1024];
+		char *wp;
+		inpf = fopen(argv[1], "r");
+		if (inpf == NULL) {
+			perror(argv[1]);
+			return -1;
+		}
+
+		/* Make a new file name to write to.  Ignoring '.gz' */
+		wp = (NULL != (wp = strrchr(argv[1], '/'))) ? ++wp : argv[1];
+		strcpy(w, wp);
+		strcat(w, ".nx.gunzip");
+
+		outf = fopen(w, "w");
+		if (outf == NULL) {
+			perror(w);
+			return -1;
+		}
+	}
+
+#define GETINPC(X) fgetc(X)
+
+	/* Decode the gzip header */
+	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
+	if (c != expect)
+		goto err1;
+
+	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
+	if (c != expect)
+		goto err1;
+
+	c = GETINPC(inpf); expect = 0x08; /* CM */
+	if (c != expect)
+		goto err1;
+
+	int flg = GETINPC(inpf); /* FLG */
+	if (flg & 0b11100000 || flg & 0b100)
+		goto err2;
+
+	fprintf(stderr, "gzHeader FLG %x\n", flg);
+
+	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
+	 * sample code.
+	 */
+	for (i = 0; i < 6; i++) {
+		char tmp[10];
+		if (EOF == (tmp[i] = GETINPC(inpf)))
+			goto err3;
+		fprintf(stderr, "%02x ", tmp[i]);
+		if (i == 5)
+			fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
+
+	/* FNAME */
+	if (flg & 0b1000) {
+		int k = 0;
+		do {
+			if (EOF == (c = GETINPC(inpf)))
+				goto err3;
+			gzfname[k++] = c;
+		} while (c);
+		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
+	}
+
+	/* FHCRC */
+	if (flg & 0b10) {
+		c = GETINPC(inpf); c = GETINPC(inpf);
+		fprintf(stderr, "gzHeader FHCRC: ignored\n");
+	}
+
+	used_in = cur_in = used_out = cur_out = 0;
+	is_final = is_eof = 0;
+
+	/* Allocate one page larger to prevent page faults due to NX
+	 * overfetching.
+	 * Either do this (char*)(uintptr_t)aligned_alloc or use
+	 * -std=c11 flag to make the int-to-pointer warning go away.
+	 */
+	assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
+				   fifo_in_len + page_sz)) != NULL);
+	assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
+				   fifo_out_len + page_sz + line_sz)) != NULL);
+	/* Leave unused space due to history rounding rules */
+	fifo_out = fifo_out + line_sz;
+	nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
+
+	ddl_in  = &dde_in[0];
+	ddl_out = &dde_out[0];
+	cmdp = &cmd;
+	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+read_state:
+
+	/* Read from .gz file */
+
+	NXPRT(fprintf(stderr, "read_state:\n"));
+
+	if (is_eof != 0)
+		goto write_state;
+
+	/* We read in to fifo_in in two steps: first: read in to the free
+	 * space up to the end of the buffer.  last: if free space wrapped
+	 * around, read in to fifo_in from offset 0 up to offset cur_in.
+	 */
+
+	/* Reset fifo head to reduce unnecessary wrap arounds */
+	cur_in = (used_in == 0) ? 0 : cur_in;
+
+	/* Free space total is reduced by a gap */
+	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
+			    - line_sz);
+
+	/* Free space may wrap around as first and last */
+	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
+	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
+
+	/* Start offsets of the free memory */
+	first_offset = fifo_free_first_offset(cur_in, used_in);
+	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
+
+	/* Reduce read_sz because of the line_sz gap */
+	read_sz = NX_MIN(free_space, first_free);
+	n = 0;
+	if (read_sz > 0) {
+		/* Read in to offset cur_in + used_in */
+		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
+		used_in = used_in + n;
+		free_space = free_space - n;
+		assert(n <= read_sz);
+		if (n != read_sz) {
+			/* Either EOF or error; exit the read loop */
+			is_eof = 1;
+			goto write_state;
+		}
+	}
+
+	/* If free space wrapped around */
+	if (last_free > 0) {
+		/* Reduce read_sz because of the line_sz gap */
+		read_sz = NX_MIN(free_space, last_free);
+		n = 0;
+		if (read_sz > 0) {
+			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
+			used_in = used_in + n;       /* Increase used space */
+			free_space = free_space - n; /* Decrease free space */
+			assert(n <= read_sz);
+			if (n != read_sz) {
+				/* Either EOF or error; exit the read loop */
+				is_eof = 1;
+				goto write_state;
+			}
+		}
+	}
+
+	/* At this point we have used_in bytes in fifo_in with the
+	 * data head starting at cur_in and possibly wrapping around.
+	 */
+
+write_state:
+
+	/* Write decompressed data to output file */
+
+	NXPRT(fprintf(stderr, "write_state:\n"));
+
+	if (used_out == 0)
+		goto decomp_state;
+
+	/* If fifo_out has data waiting, write it out to the file to
+	 * free up target space for the accelerator.  The used bytes are
+	 * in the first and last parts of fifo_out.
+	 */
+
+	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
+	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
+
+	write_sz = first_used;
+
+	n = 0;
+	if (write_sz > 0) {
+		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
+		used_out = used_out - n;
+		/* Move head of the fifo */
+		cur_out = (cur_out + n) % fifo_out_len;
+		assert(n <= write_sz);
+		if (n != write_sz) {
+			fprintf(stderr, "error: write\n");
+			rc = -1;
+			goto err5;
+		}
+	}
+
+	if (last_used > 0) { /* If more data available in the last part */
+		write_sz = last_used; /* Keep it here for later */
+		n = 0;
+		if (write_sz > 0) {
+			n = fwrite(fifo_out, 1, write_sz, outf);
+			used_out = used_out - n;
+			cur_out = (cur_out + n) % fifo_out_len;
+			assert(n <= write_sz);
+			if (n != write_sz) {
+				fprintf(stderr, "error: write\n");
+				rc = -1;
+				goto err5;
+			}
+		}
+	}
+
+decomp_state:
+
+	/* NX decompresses input data */
+
+	NXPRT(fprintf(stderr, "decomp_state:\n"));
+
+	if (is_final)
+		goto finish_state;
+
+	/* Address/len lists */
+	clearp_dde(ddl_in);
+	clearp_dde(ddl_out);
+
+	/* FC, CRC, HistLen, Table 6-6 */
+	if (resuming) {
+		/* Resuming a partially decompressed input.
+		 * The key to resume is supplying the 32KB
+		 * dictionary (history) to NX, which is basically
+		 * the last 32KB of output produced.
+		 */
+		fc = GZIP_FC_DECOMPRESS_RESUME;
+
+		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
+		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
+
+		/* Round up the history size to quadword.  Section 2.10 */
+		history_len = (history_len + 15) / 16;
+		putnn(cmdp->cpb, in_histlen, history_len);
+		history_len = history_len * 16; /* bytes */
+
+		if (history_len > 0) {
+			/* Chain in the history buffer to the DDE list */
+			if (cur_out >= history_len) {
+				nx_append_dde(ddl_in, fifo_out
+					      + (cur_out - history_len),
+					      history_len);
+			} else {
+				nx_append_dde(ddl_in, fifo_out
+					      + ((fifo_out_len + cur_out)
+					      - history_len),
+					      history_len - cur_out);
+				/* Up to 32KB history wraps around fifo_out */
+				nx_append_dde(ddl_in, fifo_out, cur_out);
+			}
+
+		}
+	} else {
+		/* First decompress job */
+		fc = GZIP_FC_DECOMPRESS;
+
+		history_len = 0;
+		/* Writing 0 clears out subc as well */
+		cmdp->cpb.in_histlen = 0;
+		total_out = 0;
+
+		put32(cmdp->cpb, in_crc, INIT_CRC);
+		put32(cmdp->cpb, in_adler, INIT_ADLER);
+		put32(cmdp->cpb, out_crc, INIT_CRC);
+		put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+		/* Assuming 10% compression ratio initially; use the
+		 * most recently measured compression ratio as a
+		 * heuristic to estimate the input and output
+		 * sizes.  If we give too much input, the target buffer
+		 * overflows and NX cycles are wasted, and then we
+		 * must retry with smaller input size.  1000 is 100%.
+		 */
+		last_comp_ratio = 100UL;
+	}
+	cmdp->crb.gzip_fc = 0;
+	putnn(cmdp->crb, gzip_fc, fc);
+
+	/*
+	 * NX source buffers
+	 */
+	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
+	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
+
+	if (first_used > 0)
+		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
+
+	if (last_used > 0)
+		nx_append_dde(ddl_in, fifo_in, last_used);
+
+	/*
+	 * NX target buffers
+	 */
+	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
+	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
+
+	/* Reduce output free space amount not to overwrite the history */
+	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
+				- (1<<16));
+
+	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
+		      target_max));
+
+	first_free = NX_MIN(target_max, first_free);
+	if (first_free > 0) {
+		first_offset = fifo_free_first_offset(cur_out, used_out);
+		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
+	}
+
+	if (last_free > 0) {
+		last_free = NX_MIN(target_max - first_free, last_free);
+		if (last_free > 0) {
+			last_offset = fifo_free_last_offset(cur_out, used_out,
+							    fifo_out_len);
+			nx_append_dde(ddl_out, fifo_out + last_offset,
+				      last_free);
+		}
+	}
+
+	/* Target buffer size is used to limit the source data size
+	 * based on previous measurements of compression ratio.
+	 */
+
+	/* source_sz includes history */
+	source_sz = getp32(ddl_in, ddebc);
+	assert(source_sz > history_len);
+	source_sz = source_sz - history_len;
+
+	/* Estimating how much source is needed to 3/4 fill a
+	 * target_max size target buffer.  If we overshoot, then NX
+	 * must repeat the job with smaller input and we waste
+	 * bandwidth.  If we undershoot then we use more NX calls than
+	 * necessary.
+	 */
+
+	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
+				/ 4000;
+
+	if (source_sz_estimate < source_sz) {
+		/* Target might be small, therefore limiting the
+		 * source data.
+		 */
+		source_sz = source_sz_estimate;
+		target_sz_estimate = target_max;
+	} else {
+		/* Source file might be small, therefore limiting target
+		 * touch pages to a smaller value to save processor cycles.
+		 */
+		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
+					/ (last_comp_ratio + 1);
+		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
+					    target_max);
+	}
+
+	source_sz = source_sz + history_len;
+
+	/* Some NX condition codes require submitting the NX job again.
+	 * Kernel doesn't handle NX page faults. Expects user code to
+	 * touch pages.
+	 */
+	pgfault_retries = retry_max;
+
+restart_nx:
+
+	putp32(ddl_in, ddebc, source_sz);
+
+	/* Fault in pages */
+	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
+	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
+
+	/* Send job to NX */
+	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
+
+	switch (cc) {
+
+	case ERR_NX_TRANSLATION:
+
+		/* We touched the pages ahead of time.  In the most common case
+		 * we shouldn't be here.  But maybe some pages were paged out.
+		 * The kernel should have placed the faulting address in fsaddr.
+		 */
+		NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
+			      (void *)cmdp->crb.csb.fsaddr));
+
+		/* Touch 1 byte, read-only  */
+		nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
+
+		if (pgfault_retries == retry_max) {
+			/* Try once with exact number of pages */
+			--pgfault_retries;
+			goto restart_nx;
+		} else if (pgfault_retries > 0) {
+			/* If still faulting try fewer input pages
+			 * assuming memory outage
+			 */
+			if (source_sz > page_sz)
+				source_sz = NX_MAX(source_sz / 2, page_sz);
+			--pgfault_retries;
+			goto restart_nx;
+		} else {
+			fprintf(stderr, "cannot make progress; too many page \
+				fault retries cc= %d\n", cc);
+			rc = -1;
+			goto err5;
+		}
+
+	case ERR_NX_DATA_LENGTH:
+
+		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; not an error \
+			      usually; stream may have trailing data\n"));
+
+		/* Not an error in the most common case; it just says
+		 * there is trailing data that we must examine.
+		 *
+		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
+		 * Fig.6-7 and Table 6-8.
+		 */
+		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
+
+		if (!csb_ce_termination(nx_ce) &&
+		    csb_ce_partial_completion(nx_ce)) {
+			/* Check CPB for more information
+			 * spbc and tpbc are valid
+			 */
+			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
+			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
+			spbc = get32(cmdp->cpb, out_spbc_decomp);
+			tpbc = get32(cmdp->crb.csb, tpbc);
+			assert(target_max >= tpbc);
+
+			goto ok_cc3; /* not an error */
+		} else {
+			/* History length error when CE(1)=1 CE(0)=0. */
+			rc = -1;
+			fprintf(stderr, "history length error cc= %d\n", cc);
+			goto err5;
+		}
+
+	case ERR_NX_TARGET_SPACE:
+
+		/* Target buffer not large enough; retry smaller input
+		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
+		 */
+		assert(source_sz > history_len);
+		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
+		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with \
+			      smaller input data src %d hist %d\n", source_sz,
+			      history_len));
+		goto restart_nx;
+
+	case ERR_NX_OK:
+
+		/* This should not happen for gzip formatted data;
+		 * we need trailing crc and isize
+		 */
+		fprintf(stderr, "ERR_NX_OK\n");
+		spbc = get32(cmdp->cpb, out_spbc_decomp);
+		tpbc = get32(cmdp->crb.csb, tpbc);
+		assert(target_max >= tpbc);
+		assert(spbc >= history_len);
+		source_sz = spbc - history_len;
+		goto offsets_state;
+
+	default:
+		fprintf(stderr, "error: cc= %d\n", cc);
+		rc = -1;
+		goto err5;
+	}
+
+ok_cc3:
+
+	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
+
+	assert(spbc > history_len);
+	source_sz = spbc - history_len;
+
+	/* Table 6-4: Source Final Block Type (SFBT) describes the
+	 * last processed deflate block and tells the software how to
+	 * resume the next job.  SUBC indicates how many input bits NX
+	 * consumed but did not process.  SPBC indicates how many
+	 * bytes of source were given to the accelerator including
+	 * history bytes.
+	 */
+
+	switch (sfbt) {
+		int dhtlen;
+
+	case 0b0000: /* Deflate final EOB received */
+
+		/* Calculating the checksum start position. */
+
+		source_sz = source_sz - subc / 8;
+		is_final = 1;
+		break;
+
+		/* Resume decompression cases are below. Basically
+		 * indicates where NX has suspended and how to resume
+		 * the input stream.
+		 */
+
+	case 0b1000: /* Within a literal block; use rembytecount */
+	case 0b1001: /* Within a literal block; use rembytecount; bfinal=1 */
+
+		/* Supply the partially processed source byte again */
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* SUBC LS 3 bits: number of bits in the first source byte that
+		 * need to be processed.
+		 * 000 means all 8 bits;  Table 6-3
+		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
+		 */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
+						      out_rembytecnt));
+		break;
+
+	case 0b1010: /* Within a FH block; */
+	case 0b1011: /* Within a FH block; bfinal=1 */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+		break;
+
+	case 0b1100: /* Within a DH block; */
+	case 0b1101: /* Within a DH block; bfinal=1 */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+
+		dhtlen = getnn(cmdp->cpb, out_dhtlen);
+		putnn(cmdp->cpb, in_dhtlen, dhtlen);
+		assert(dhtlen >= 42);
+
+		/* Round up to a qword */
+		dhtlen = (dhtlen + 127) / 128;
+
+		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
+			--dhtlen;
+			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
+		}
+		break;
+
+	case 0b1110: /* Within a block header; bfinal=0; */
+		     /* Also given if source data exactly ends (SUBC=0) with
+		      * EOB code with BFINAL=0.  Means the next byte will
+		      * contain a block header.
+		      */
+	case 0b1111: /* within a block header with BFINAL=1. */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+	}
+
+offsets_state:
+
+	/* Adjust the source and target buffer offsets and lengths  */
+
+	NXPRT(fprintf(stderr, "offsets_state:\n"));
+
+	/* Delete input data from fifo_in */
+	used_in = used_in - source_sz;
+	cur_in = (cur_in + source_sz) % fifo_in_len;
+	input_file_offset = input_file_offset + source_sz;
+
+	/* Add output data to fifo_out */
+	used_out = used_out + tpbc;
+
+	assert(used_out <= fifo_out_len);
+
+	total_out = total_out + tpbc;
+
+	/* Deflate history is 32KB max.  No need to supply more
+	 * than 32KB on a resume.
+	 */
+	history_len = (total_out > window_max) ? window_max : total_out;
+
+	/* To estimate expected expansion in the next NX job; 500 means 50%.
+	 * Deflate best case is around 1 to 1000.
+	 */
+	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
+			  / ((uint64_t)tpbc + 1);
+	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
+	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
+		      last_comp_ratio, source_sz, spbc, tpbc));
+
+	resuming = 1;
+
+finish_state:
+
+	NXPRT(fprintf(stderr, "finish_state:\n"));
+
+	if (is_final) {
+		if (used_out)
+			goto write_state; /* More data to write out */
+		else if (used_in < 8) {
+			/* Need at least 8 more bytes containing gzip crc
+			 * and isize.
+			 */
+			rc = -1;
+			goto err4;
+		} else {
+			/* Compare checksums and exit */
+			int i;
+			char tail[8];
+			uint32_t cksum, isize;
+			for (i = 0; i < 8; i++)
+				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
+			fprintf(stderr, "computed checksum %08x isize %08x\n",
+				cmdp->cpb.out_crc, (uint32_t) (total_out
+				% (1ULL<<32)));
+			cksum = (tail[0] | tail[1]<<8 | tail[2]<<16
+				| tail[3]<<24);
+			isize = (tail[4] | tail[5]<<8 | tail[6]<<16
+				| tail[7]<<24);
+			fprintf(stderr, "stored   checksum %08x isize %08x\n",
+				cksum, isize);
+
+			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
+			    (total_out % (1ULL<<32))) {
+				rc = 0;	goto ok1;
+			} else {
+				rc = -1; goto err4;
+			}
+		}
+	} else
+		goto read_state;
+
+	return -1;
+
+err1:
+	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
+		expect, c);
+	return -1;
+
+err2:
+	fprintf(stderr, "error: the FLG byte is wrong or not handled by this \
+		code sample\n");
+	return -1;
+
+err3:
+	fprintf(stderr, "error: gzip header\n");
+	return -1;
+
+err4:
+	fprintf(stderr, "error: checksum\n");
+
+err5:
+ok1:
+	fprintf(stderr, "decomp is complete: fclose\n");
+	fclose(outf);
+
+	return rc;
+}
+
+
+int main(int argc, char **argv)
+{
+	int rc;
+	struct sigaction act;
+	void *handle;
+
+	act.sa_handler = 0;
+	act.sa_sigaction = sigsegv_handler;
+	act.sa_flags = SA_SIGINFO;
+	act.sa_restorer = 0;
+	sigemptyset(&act.sa_mask);
+	sigaction(SIGSEGV, &act, NULL);
+
+	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+	if (!handle) {
+		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+		exit(-1);
+	}
+
+	rc = decompress_file(argc, argv, handle);
+
+	nx_function_end(handle);
+
+	return rc;
+}
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
                   ` (3 preceding siblings ...)
  2020-03-16 18:07 ` [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase Raphael Moreira Zinsly
@ 2020-03-16 18:07 ` Raphael Moreira Zinsly
  2020-03-18  6:40   ` Daniel Axtens
  2020-03-16 21:50 ` [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Haren Myneni
  5 siblings, 1 reply; 14+ messages in thread
From: Raphael Moreira Zinsly @ 2020-03-16 18:07 UTC (permalink / raw)
  To: linuxppc-dev, linux-crypto
  Cc: herbert, mpe, haren, abali, Raphael Moreira Zinsly

Include a README file with the instructions to use the
testcases at selftests/powerpc/nx-gzip.

Signed-off-by: Bulent Abali <abali@us.ibm.com>
Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
---
 .../powerpc/nx-gzip/99-nx-gzip.rules          |  1 +
 .../testing/selftests/powerpc/nx-gzip/README  | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
 create mode 100644 tools/testing/selftests/powerpc/nx-gzip/README

diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
new file mode 100644
index 000000000000..5a7118495cb3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
@@ -0,0 +1 @@
+SUBSYSTEM=="nxgzip", KERNEL=="nx-gzip", MODE="0666"
diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README
new file mode 100644
index 000000000000..ff0c817a65c5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/README
@@ -0,0 +1,44 @@
+Test the nx-gzip function:
+=========================
+
+Verify that the following device exists:
+  /dev/crypto/nx-gzip
+If you get a permission error run as sudo or set the device permissions:
+   sudo chmod go+rw /dev/crypto/nx-gzip
+However, chmod may not survive across boots. You may create a udev file such
+as:
+   /etc/udev/rules.d/99-nx-gzip.rules
+
+
+Then make and run:
+$ make
+gcc -O3 -I./inc -o gzfht_test gzfht_test.c gzip_vas.c
+gcc -O3 -I./inc -o gunz_test gunz_test.c gzip_vas.c
+
+
+Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix:
+$ ./gzfht_test gzip_vas.c
+file gzip_vas.c read, 5276 bytes
+compressed 5276 to 2564 bytes total, crc32 checksum = b937a37d
+
+
+Uncompress the previous output. Output will have a .nx.gunzip suffix:
+$ ./gunz_test gzip_vas.c.nx.gz
+gzHeader FLG 0
+00 00 00 00 04 03
+gzHeader MTIME, XFL, OS ignored
+computed checksum b937a37d isize 0000149c
+stored   checksum b937a37d isize 0000149c
+decomp is complete: fclose
+
+
+Compare two files:
+$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c
+f041cd8581e8d920f79f6ce7f65411be5d026c2a  gzip_vas.c.nx.gz.nx.gunzip
+f041cd8581e8d920f79f6ce7f65411be5d026c2a  gzip_vas.c
+
+
+Note that the code here is intended for testing the nx-gzip hardware function.
+It is not intended for demonstrating performance or compression ratio.
+For more information and source code consider using:
+https://github.com/libnxz/power-gzip
-- 
2.21.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase
  2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
                   ` (4 preceding siblings ...)
  2020-03-16 18:07 ` [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests Raphael Moreira Zinsly
@ 2020-03-16 21:50 ` Haren Myneni
  5 siblings, 0 replies; 14+ messages in thread
From: Haren Myneni @ 2020-03-16 21:50 UTC (permalink / raw)
  To: Raphael Moreira Zinsly; +Cc: linuxppc-dev, linux-crypto, herbert, mpe, abali

On Mon, 2020-03-16 at 15:07 -0300, Raphael Moreira Zinsly wrote:
> This patch series are intended to test the power8 and power9 Nest
> Accelerator (NX) GZIP engine that is being introduced by
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-March/205659.html
> More information about how to access the NX can be found in that patch, also a
> complete userspace library and more documentation can be found at:
> https://github.com/libnxz/power-gzip
> 
Raphael, Please delete power8 reference. NX-GZIP engine and user space
support (with VAS) are introduced in P9. 

> 
> Thanks,
> Raphael
> 
> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test
  2020-03-16 18:07 ` [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test Raphael Moreira Zinsly
@ 2020-03-18  3:48   ` Daniel Axtens
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18  3:48 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, linuxppc-dev, linux-crypto
  Cc: Raphael Moreira Zinsly, haren, herbert, abali

Hi,

This is throwing a number of snowpatch warnings, as well as a whitespace
warning when I apply it. Please could you check the warnings at
https://patchwork.ozlabs.org/patch/1255779/

It looks like the rest of the series also throws some warnings - please
check those also.

Kind regards,
Daniel


Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Add files to access the powerpc NX-GZIP engine in user space.
>
> Signed-off-by: Bulent Abali <abali@us.ibm.com>
> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
> ---
>  .../selftests/powerpc/nx-gzip/inc/crb.h       | 170 ++++++++++++++++++
>  .../selftests/powerpc/nx-gzip/inc/nx-gzip.h   |  27 +++
>  .../powerpc/nx-gzip/inc/nx-helpers.h          |  53 ++++++
>  .../selftests/powerpc/nx-gzip/inc/nx.h        |  30 ++++
>  4 files changed, 280 insertions(+)
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx.h
>
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h b/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
> new file mode 100644
> index 000000000000..6af25fb8461a
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/crb.h
> @@ -0,0 +1,170 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef __CRB_H
> +#define __CRB_H
> +#include <linux/types.h>
> +
> +typedef unsigned char u8;
> +typedef unsigned int u32;
> +typedef unsigned long long u64;
> +
> +/* From nx-842.h */
> +
> +/* CCW 842 CI/FC masks
> + * NX P8 workbook, section 4.3.1, figure 4-6
> + * "CI/FC Boundary by NX CT type"
> + */
> +#define CCW_CI_842              (0x00003ff8)
> +#define CCW_FC_842              (0x00000007)
> +
> +/* end - nx-842.h */
> +
> +#ifndef __aligned
> +#define __aligned(x)            __attribute__((aligned(x)))
> +#endif
> +
> +#ifndef __packed
> +#define __packed        __attribute__((packed))
> +#endif
> +
> +/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */
> +
> +#define CCB_VALUE		(0x3fffffffffffffff)
> +#define CCB_ADDRESS		(0xfffffffffffffff8)
> +#define CCB_CM			(0x0000000000000007)
> +#define CCB_CM0			(0x0000000000000004)
> +#define CCB_CM12		(0x0000000000000003)
> +
> +#define CCB_CM0_ALL_COMPLETIONS	(0x0)
> +#define CCB_CM0_LAST_IN_CHAIN	(0x4)
> +#define CCB_CM12_STORE		(0x0)
> +#define CCB_CM12_INTERRUPT	(0x1)
> +
> +#define CCB_SIZE		(0x10)
> +#define CCB_ALIGN		CCB_SIZE
> +
> +struct coprocessor_completion_block {
> +	__be64 value;
> +	__be64 address;
> +} __packed __aligned(CCB_ALIGN);
> +
> +
> +/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */
> +
> +#define CSB_V			(0x80)
> +#define CSB_F			(0x04)
> +#define CSB_CH			(0x03)
> +#define CSB_CE_INCOMPLETE	(0x80)
> +#define CSB_CE_TERMINATION	(0x40)
> +#define CSB_CE_TPBC		(0x20)
> +
> +#define CSB_CC_SUCCESS		(0)
> +#define CSB_CC_INVALID_ALIGN	(1)
> +#define CSB_CC_OPERAND_OVERLAP	(2)
> +#define CSB_CC_DATA_LENGTH	(3)
> +#define CSB_CC_TRANSLATION	(5)
> +#define CSB_CC_PROTECTION	(6)
> +#define CSB_CC_RD_EXTERNAL	(7)
> +#define CSB_CC_INVALID_OPERAND	(8)
> +#define CSB_CC_PRIVILEGE	(9)
> +#define CSB_CC_INTERNAL		(10)
> +#define CSB_CC_WR_EXTERNAL	(12)
> +#define CSB_CC_NOSPC		(13)
> +#define CSB_CC_EXCESSIVE_DDE	(14)
> +#define CSB_CC_WR_TRANSLATION	(15)
> +#define CSB_CC_WR_PROTECTION	(16)
> +#define CSB_CC_UNKNOWN_CODE	(17)
> +#define CSB_CC_ABORT		(18)
> +#define CSB_CC_TRANSPORT	(20)
> +#define CSB_CC_SEGMENTED_DDL	(31)
> +#define CSB_CC_PROGRESS_POINT	(32)
> +#define CSB_CC_DDE_OVERFLOW	(33)
> +#define CSB_CC_SESSION		(34)
> +#define CSB_CC_PROVISION	(36)
> +#define CSB_CC_CHAIN		(37)
> +#define CSB_CC_SEQUENCE		(38)
> +#define CSB_CC_HW		(39)
> +
> +#define CSB_SIZE		(0x10)
> +#define CSB_ALIGN		CSB_SIZE
> +
> +struct coprocessor_status_block {
> +	u8 flags;
> +	u8 cs;
> +	u8 cc;
> +	u8 ce;
> +	__be32 count;
> +	__be64 address;
> +} __packed __aligned(CSB_ALIGN);
> +
> +
> +/* Chapter 6.5.10 Data-Descriptor List (DDL)
> + * each list contains one or more Data-Descriptor Entries (DDE)
> + */
> +
> +#define DDE_P			(0x8000)
> +
> +#define DDE_SIZE		(0x10)
> +#define DDE_ALIGN		DDE_SIZE
> +
> +struct data_descriptor_entry {
> +	__be16 flags;
> +	u8 count;
> +	u8 index;
> +	__be32 length;
> +	__be64 address;
> +} __packed __aligned(DDE_ALIGN);
> +
> +
> +/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
> +
> +#define CRB_SIZE		(0x80)
> +#define CRB_ALIGN		(0x100) /* Errata: requires 256 alignment */
> +
> +
> +/* Coprocessor Status Block field
> + *   ADDRESS	address of CSB
> + *   C		CCB is valid
> + *   AT		0 = addrs are virtual, 1 = addrs are phys
> + *   M		enable perf monitor
> + */
> +#define CRB_CSB_ADDRESS		(0xfffffffffffffff0)
> +#define CRB_CSB_C		(0x0000000000000008)
> +#define CRB_CSB_AT		(0x0000000000000002)
> +#define CRB_CSB_M		(0x0000000000000001)
> +
> +struct coprocessor_request_block {
> +	__be32 ccw;
> +	__be32 flags;
> +	__be64 csb_addr;
> +
> +	struct data_descriptor_entry source;
> +	struct data_descriptor_entry target;
> +
> +	struct coprocessor_completion_block ccb;
> +
> +	u8 reserved[48];
> +
> +	struct coprocessor_status_block csb;
> +} __packed __aligned(CRB_ALIGN);
> +
> +#define crb_csb_addr(c)         __be64_to_cpu(c->csb_addr)
> +#define crb_nx_fault_addr(c)    __be64_to_cpu(c->stamp.nx.fault_storage_addr)
> +#define crb_nx_flags(c)         c->stamp.nx.flags
> +#define crb_nx_fault_status(c)  c->stamp.nx.fault_status
> +#define crb_nx_pswid(c)		c->stamp.nx.pswid;
> +
> +
> +/* RFC02167 Initiate Coprocessor Instructions document
> + * Chapter 8.2.1.1.1 RS
> + * Chapter 8.2.3 Coprocessor Directive
> + * Chapter 8.2.4 Execution
> + *
> + * The CCW must be converted to BE before passing to icswx()
> + */
> +
> +#define CCW_PS                  (0xff000000)
> +#define CCW_CT                  (0x00ff0000)
> +#define CCW_CD                  (0x0000ffff)
> +#define CCW_CL                  (0x0000c000)
> +
> +#endif
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
> new file mode 100644
> index 000000000000..75482c45574d
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-gzip.h
> @@ -0,0 +1,27 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright 2020 IBM Corp.
> + *
> + */
> +
> +#ifndef _UAPI_MISC_VAS_H
> +#define _UAPI_MISC_VAS_H
> +
> +#include <asm/ioctl.h>
> +
> +#define VAS_FLAGS_PIN_WINDOW	0x1
> +#define VAS_FLAGS_HIGH_PRI	0x2
> +
> +#define VAS_FTW_SETUP		_IOW('v', 1, struct vas_gzip_setup_attr)
> +#define VAS_842_TX_WIN_OPEN	_IOW('v', 2, struct vas_gzip_setup_attr)
> +#define VAS_GZIP_TX_WIN_OPEN	_IOW('v', 0x20, struct vas_gzip_setup_attr)
> +
> +struct vas_gzip_setup_attr {
> +	int32_t		version;
> +	int16_t		vas_id;
> +	int16_t		reserved1;
> +	int64_t		flags;
> +	int64_t		reserved2[6];
> +};
> +
> +#endif /* _UAPI_MISC_VAS_H */
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
> new file mode 100644
> index 000000000000..201cf9f86a97
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx-helpers.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#include <sys/time.h>
> +#include <asm/byteorder.h>
> +#include <stdint.h>
> +#include "crb.h"
> +
> +#define cpu_to_be32		__cpu_to_be32
> +#define cpu_to_be64		__cpu_to_be64
> +#define be32_to_cpu		__be32_to_cpu
> +#define be64_to_cpu		__be64_to_cpu
> +
> +/*
> + * Several helpers/macros below were copied from the tree
> + * (kernel.h, nx-842.h, nx-ftw.h, asm-compat.h etc)
> + */
> +
> +/* from kernel.h */
> +#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)
> +#define __round_mask(x, y)	((__typeof__(x))((y)-1))
> +#define round_up(x, y)		((((x)-1) | __round_mask(x, y))+1)
> +#define round_down(x, y)	((x) & ~__round_mask(x, y))
> +
> +#define min_t(t, x, y)	((x) < (y) ? (x) : (y))
> +/*
> + * Get/Set bit fields. (from nx-842.h)
> + */
> +#define GET_FIELD(m, v)         (((v) & (m)) >> MASK_LSH(m))
> +#define MASK_LSH(m)             (__builtin_ffsl(m) - 1)
> +#define SET_FIELD(m, v, val)    \
> +		(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_LSH(m)) & (m)))
> +
> +/* From asm-compat.h */
> +#define __stringify_in_c(...)	#__VA_ARGS__
> +#define stringify_in_c(...)	__stringify_in_c(__VA_ARGS__) " "
> +
> +#define	pr_debug
> +#define	pr_debug_ratelimited	printf
> +#define	pr_err			printf
> +#define	pr_err_ratelimited	printf
> +
> +#define WARN_ON_ONCE(x)		if (x) \
> +				printf("WARNING: %s:%d\n", __func__, __LINE__)
> +
> +extern void dump_buffer(char *msg, char *buf, int len);
> +extern void *alloc_aligned_mem(int len, int align, char *msg);
> +extern void get_payload(char *buf, int len);
> +extern void time_add(struct timeval *in, int seconds, struct timeval *out);
> +
> +typedef int bool;
> +extern bool time_after(struct timeval *a, struct timeval *b);
> +extern long time_delta(struct timeval *a, struct timeval *b);
> +extern void dump_dde(struct data_descriptor_entry *dde, char *msg);
> +extern void copy_paste_crb_data(struct coprocessor_request_block *crb);
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h
> new file mode 100644
> index 000000000000..08c93f7fb96c
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright 2020 IBM Corp.
> + *
> + */
> +
> +#define	NX_FUNC_COMP_842	1
> +#define NX_FUNC_COMP_GZIP	2
> +
> +typedef int bool;
> +
> +struct nx842_func_args {
> +	bool use_crc;
> +	bool decompress;		/* true: decompress; false compress */
> +	bool move_data;
> +	int timeout;			/* seconds */
> +};
> +
> +typedef struct {
> +	int len;
> +	char *buf;
> +} nxbuf_t;
> +
> +/* @function should be EFT (aka 842), GZIP etc */
> +extern void *nx_function_begin(int function, int pri);
> +
> +extern int nx_function(void *handle, nxbuf_t *in, nxbuf_t *out, void *arg);
> +
> +extern int nx_function_end(void *handle);
> +
> -- 
> 2.21.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase
  2020-03-16 18:07 ` [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase Raphael Moreira Zinsly
@ 2020-03-18  4:31   ` Daniel Axtens
  2020-03-18  6:18   ` Daniel Axtens
  1 sibling, 0 replies; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18  4:31 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, linuxppc-dev, linux-crypto
  Cc: Raphael Moreira Zinsly, haren, herbert, abali

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Include a decompression testcase for the powerpc NX-GZIP
> engine.
>
> Signed-off-by: Bulent Abali <abali@us.ibm.com>
> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
> ---
>  .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
>  .../selftests/powerpc/nx-gzip/gunz_test.c     | 1058 +++++++++++++++++
>  2 files changed, 1062 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
> index ab903f63bbbd..82abc19a49a0 100644
> --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
> +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
> @@ -1,9 +1,9 @@
>  CC = gcc
>  CFLAGS = -O3
>  INC = ./inc
> -SRC = gzfht_test.c
> +SRC = gzfht_test.c gunz_test.c
>  OBJ = $(SRC:.c=.o)
> -TESTS = gzfht_test
> +TESTS = gzfht_test gunz_test
>  EXTRA_SOURCES = gzip_vas.c
>  
>  all:	$(TESTS)
> @@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
>  
>  run_tests: $(TESTS)
>  	./gzfht_test gzip_vas.c
> +	./gunz_test gzip_vas.c.nx.gz
>  
>  clean:
> -	rm -f $(TESTS) *.o *~ *.gz
> +	rm -f $(TESTS) *.o *~ *.gz *.gunzip
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> new file mode 100644
> index 000000000000..653de92698cc
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> @@ -0,0 +1,1058 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * P9 gunzip sample code for demonstrating the P9 NX hardware
> + * interface.  Not intended for production use or for performance or
> + * compression ratio measurements.  Note also that /dev/crypto/nx-gzip,
> + * VAS and skiboot support are required.
> + *
> + * Copyright 2020 IBM Corp.
> + *
> + * Author: Bulent Abali <abali@us.ibm.com>
> + *
> + * https://github.com/libnxz/power-gzip for zlib api and other utils
> + * Definitions of acronyms used here.  See
> + * P9 NX Gzip Accelerator User's Manual for details
> + *
> + * adler/crc: 32 bit checksums appended to stream tail
> + * ce:       completion extension
> + * cpb:      coprocessor parameter block (metadata)
> + * crb:      coprocessor request block (command)
> + * csb:      coprocessor status block (status)
> + * dht:      dynamic huffman table
> + * dde:      data descriptor element (address, length)
> + * ddl:      list of ddes
> + * dh/fh:    dynamic and fixed huffman types
> + * fc:       coprocessor function code
> + * histlen:  history/dictionary length
> + * history:  sliding window of up to 32KB of data
> + * lzcount:  Deflate LZ symbol counts
> + * rembytecnt: remaining byte count
> + * sfbt:     source final block type; last block's type during decomp
> + * spbc:     source processed byte count
> + * subc:     source unprocessed bit count
> + * tebc:     target ending bit count; valid bits in the last byte
> + * tpbc:     target processed byte count
> + * vas:      virtual accelerator switch; the user mode interface
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <stdint.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/time.h>
> +#include <sys/fcntl.h>
> +#include <sys/mman.h>
> +#include <endian.h>
> +#include <bits/endian.h>
> +#include <sys/ioctl.h>
> +#include <assert.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include "nxu.h"
> +#include "nx.h"
> +
> +int nx_dbg = 0;
> +FILE *nx_gzip_log = NULL;
> +
> +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
> +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
> +
> +#define mb()     asm volatile("sync" ::: "memory")
> +#define rmb()    asm volatile("lwsync" ::: "memory")
> +#define wmb()    rmb()
> +
> +const int fifo_in_len = 1<<24;
> +const int fifo_out_len = 1<<24;
> +const int page_sz = 1<<16;
> +const int line_sz = 1<<7;
> +const int window_max = 1<<15;
> +const int retry_max = 50;
> +
> +extern void *nx_fault_storage_address;
> +extern void *nx_function_begin(int function, int pri);
> +extern int nx_function_end(void *handle);
> +
> +/*
> + * Fault in pages prior to NX job submission.  wr=1 may be required to
> + * touch writeable pages.  System zero pages do not fault-in the page as
> + * intended.  Typically set wr=1 for NX target pages and set wr=0 for
> + * NX source pages.
> + */
> +static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
> +{
> +	char *begin = buf;
> +	char *end = (char *) buf + buf_len - 1;
> +	volatile char t;
> +
> +	assert(buf_len >= 0 && !!buf);
> +
> +	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
> +			buf + buf_len, buf_len, wr));
> +
> +	if (buf_len <= 0 || buf == NULL)
> +		return -1;
> +
> +	do {
> +		t = *begin;
> +		if (wr)
> +			*begin = t;
> +		begin = begin + page_len;
> +	} while (begin < end);
> +
> +	/* When buf_len is small or the buf tail is in another page. */
> +	t = *end;
> +	if (wr)
> +		*end = t;
> +
> +	return 0;
> +}
> +
> +void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
> +{
> +	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
> +	       sig, info->si_code, info->si_addr);
> +
> +	nx_fault_storage_address = info->si_addr;
> +}
> +
> +/*
> + * Adds an (address, len) pair to the list of ddes (ddl) and updates
> + * the base dde.  ddl[0] is the only dde in a direct dde which
> + * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
> + * the indirect (base) dde that points to a list of direct ddes.
> + * See Section 6.4 of the NX-gzip user manual for DDE description.
> + * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
> + * bytes in ddl.  Caller is responsible for allocating the array of
> + * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
> + * list, the ddl array must have N+1 entries minimum.
> + */
> +static inline uint32_t nx_append_dde(nx_dde_t *ddl, void *addr, uint32_t len)
> +{
> +	uint32_t ddecnt;
> +	uint32_t bytes;
> +
> +	if (addr == NULL && len == 0) {
> +		clearp_dde(ddl);
> +		return 0;
> +	}
> +
> +	NXPRT(fprintf(stderr, "%d: nx_append_dde addr %p len %x\n", __LINE__,
> +			addr, len));
> +
> +	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
> +	ddecnt = getpnn(ddl, dde_count);
> +	bytes = getp32(ddl, ddebc);
> +
> +	if (ddecnt == 0 && bytes == 0) {
> +		/* First dde is unused; make it a direct dde */
> +		bytes = len;
> +		putp32(ddl, ddebc, bytes);
> +		putp64(ddl, ddead, (uint64_t) addr);
> +	} else if (ddecnt == 0) {
> +		/* Converting direct to indirect dde
> +		 * ddl[0] becomes head dde of ddl
> +		 * copy direct to indirect first.
> +		 */
> +		ddl[1] = ddl[0];
> +
> +		/* Add the new dde next */
> +		clear_dde(ddl[2]);
> +		put32(ddl[2], ddebc, len);
> +		put64(ddl[2], ddead, (uint64_t) addr);
> +
> +		/* Ddl head points to 2 direct ddes */
> +		ddecnt = 2;
> +		putpnn(ddl, dde_count, ddecnt);
> +		bytes = bytes + len;
> +		putp32(ddl, ddebc, bytes);
> +		/* Pointer to the first direct dde */
> +		putp64(ddl, ddead, (uint64_t) &ddl[1]);
> +	} else {
> +		/* Append a dde to an existing indirect ddl */
> +		++ddecnt;
> +		clear_dde(ddl[ddecnt]);
> +		put64(ddl[ddecnt], ddead, (uint64_t) addr);
> +		put32(ddl[ddecnt], ddebc, len);
> +
> +		putpnn(ddl, dde_count, ddecnt);
> +		bytes = bytes + len;
> +		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
> +	}
> +	return bytes;
> +}
> +
> +/*
> + * Touch the specified number of pages, expressed as a byte count,
> + * beginning from the first buffer in a dde list.
> + * Do not touch the pages past the buf_sz-th byte's page.
> + *
> + * Set buf_sz = 0 to touch all pages described by the ddep.
> + */
> +static int nx_touch_pages_dde(nx_dde_t *ddep, long buf_sz, long page_sz,
> +				int wr)
> +{
> +	uint32_t indirect_count;
> +	uint32_t buf_len;
> +	long total;
> +	uint64_t buf_addr;
> +	nx_dde_t *dde_list;
> +	int i;
> +
> +	assert(!!ddep);
> +
> +	indirect_count = getpnn(ddep, dde_count);
> +
> +	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len \
> +			0x%lx\n", indirect_count, buf_sz));
You use \ to break a string into multiple lines throughout this test
case.

It leads to things like this being printed:

ERR_NX_DATA_LENGTH; not an error                              usually; stream may have trailing data

Notice the big chunk of whitespace in the middle.

Could you use this instead please:

+	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len "
+			"0x%lx\n", indirect_count, buf_sz));

Regards,
Daniel

> +
> +	if (indirect_count == 0) {
> +		/* Direct dde */
> +		buf_len = getp32(ddep, ddebc);
> +		buf_addr = getp64(ddep, ddead);
> +
> +		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
> +				buf_len, (void *)buf_addr));
> +
> +		if (buf_sz == 0)
> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +		else
> +			nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
> +					buf_sz), page_sz, wr);
> +
> +		return ERR_NX_OK;
> +	}
> +
> +	/* Indirect dde */
> +	if (indirect_count > MAX_DDE_COUNT)
> +		return ERR_NX_EXCESSIVE_DDE;
> +
> +	/* First address of the list */
> +	dde_list = (nx_dde_t *) getp64(ddep, ddead);
> +
> +	if (buf_sz == 0)
> +		buf_sz = getp32(ddep, ddebc);
> +
> +	total = 0;
> +	for (i = 0; i < indirect_count; i++) {
> +		buf_len = get32(dde_list[i], ddebc);
> +		buf_addr = get64(dde_list[i], ddead);
> +		total += buf_len;
> +
> +		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total \
> +				0x%lx\n", buf_len, (void *)buf_addr, total));
> +
> +		/* Touching fewer pages than encoded in the ddebc */
> +		if (total > buf_sz) {
> +			buf_len = NX_MIN(buf_len, total - buf_sz);
> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +			NXPRT(fprintf(stderr, "touch loop break len 0x%x \
> +				      ddead %p\n", buf_len, (void *)buf_addr));
> +			break;
> +		}
> +		nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +	}
> +	return ERR_NX_OK;
> +}
> +
> +/*
> + * Src and dst buffers are supplied in scatter gather lists.
> + * NX function code and other parameters supplied in cmdp.
> + */
> +static int nx_submit_job(nx_dde_t *src, nx_dde_t *dst, nx_gzip_crb_cpb_t *cmdp,
> +			 void *handle)
> +{
> +	int cc;
> +	uint64_t csbaddr;
> +
> +	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
> +
> +	cmdp->crb.source_dde = *src;
> +	cmdp->crb.target_dde = *dst;
> +
> +	/* Status, output byte count in tpbc */
> +	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
> +	put64(cmdp->crb, csb_address, csbaddr);
> +
> +	/* NX reports input bytes in spbc; cleared */
> +	cmdp->cpb.out_spbc_comp_wrap = 0;
> +	cmdp->cpb.out_spbc_comp_with_count = 0;
> +	cmdp->cpb.out_spbc_decomp = 0;
> +
> +	/* Clear output */
> +	put32(cmdp->cpb, out_crc, INIT_CRC);
> +	put32(cmdp->cpb, out_adler, INIT_ADLER);
> +
> +	cc = nxu_run_job(cmdp, handle);
> +
> +	if (!cc)
> +		cc = getnn(cmdp->crb.csb, csb_cc);	/* CC Table 6-8 */
> +
> +	return cc;
> +}
> +
> +/* fifo queue management */
> +#define fifo_used_bytes(used) (used)
> +#define fifo_free_bytes(used, len) ((len)-(used))
> +/* amount of free bytes in the first and last parts */
> +#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
> +						  ? (len)-((cur)+(used)) : 0)
> +#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
> +						  ? (cur) : (len)-(used))
> +/* amount of used bytes in the first and last parts */
> +#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
> +						  ? (used) : (len)-(cur))
> +#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
> +						  ? 0 : ((used)+(cur))-(len))
> +/* first and last free parts start here */
> +#define fifo_free_first_offset(cur, used)      ((cur)+(used))
> +#define fifo_free_last_offset(cur, used, len)  \
> +					   fifo_used_last_bytes(cur, used, len)
> +/* first and last used parts start here */
> +#define fifo_used_first_offset(cur)            (cur)
> +#define fifo_used_last_offset(cur)             (0)
> +
> +int decompress_file(int argc, char **argv, void *devhandle)
> +{
> +	FILE *inpf;
> +	FILE *outf;
> +
> +	int c, expect, i, cc, rc = 0;
> +	char gzfname[1024];
> +
> +	/* Queuing, file ops, byte counting */
> +	char *fifo_in, *fifo_out;
> +	int used_in, cur_in, used_out, cur_out, read_sz, n;
> +	int first_free, last_free, first_used, last_used;
> +	int first_offset, last_offset;
> +	int write_sz, free_space, source_sz;
> +	int source_sz_estimate, target_sz_estimate;
> +	uint64_t last_comp_ratio; /* 1000 max */
> +	uint64_t total_out;
> +	int is_final, is_eof;
> +
> +	/* nx hardware */
> +	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
> +	int history_len = 0;
> +	nx_gzip_crb_cpb_t cmd, *cmdp;
> +	nx_dde_t *ddl_in;
> +	nx_dde_t dde_in[6] __attribute__((aligned (128)));
> +	nx_dde_t *ddl_out;
> +	nx_dde_t dde_out[6] __attribute__((aligned (128)));
> +	int pgfault_retries;
> +
> +	/* when using mmap'ed files */
> +	off_t input_file_offset;
> +
> +	if (argc > 2) {
> +		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
> +		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
> +		return -1;
> +	}
> +
> +	if (argc == 1) {
> +		inpf = stdin;
> +		outf = stdout;
> +	} else if (argc == 2) {
> +		char w[1024];
> +		char *wp;
> +		inpf = fopen(argv[1], "r");
> +		if (inpf == NULL) {
> +			perror(argv[1]);
> +			return -1;
> +		}
> +
> +		/* Make a new file name to write to.  Ignoring '.gz' */
> +		wp = (NULL != (wp = strrchr(argv[1], '/'))) ? ++wp : argv[1];
> +		strcpy(w, wp);
> +		strcat(w, ".nx.gunzip");
> +
> +		outf = fopen(w, "w");
> +		if (outf == NULL) {
> +			perror(w);
> +			return -1;
> +		}
> +	}
> +
> +#define GETINPC(X) fgetc(X)
> +
> +	/* Decode the gzip header */
> +	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
> +	if (c != expect)
> +		goto err1;
> +
> +	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
> +	if (c != expect)
> +		goto err1;
> +
> +	c = GETINPC(inpf); expect = 0x08; /* CM */
> +	if (c != expect)
> +		goto err1;
> +
> +	int flg = GETINPC(inpf); /* FLG */
> +	if (flg & 0b11100000 || flg & 0b100)
> +		goto err2;
> +
> +	fprintf(stderr, "gzHeader FLG %x\n", flg);
> +
> +	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
> +	 * sample code.
> +	 */
> +	for (i = 0; i < 6; i++) {
> +		char tmp[10];
> +		if (EOF == (tmp[i] = GETINPC(inpf)))
> +			goto err3;
> +		fprintf(stderr, "%02x ", tmp[i]);
> +		if (i == 5)
> +			fprintf(stderr, "\n");
> +	}
> +	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
> +
> +	/* FNAME */
> +	if (flg & 0b1000) {
> +		int k = 0;
> +		do {
> +			if (EOF == (c = GETINPC(inpf)))
> +				goto err3;
> +			gzfname[k++] = c;
> +		} while (c);
> +		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
> +	}
> +
> +	/* FHCRC */
> +	if (flg & 0b10) {
> +		c = GETINPC(inpf); c = GETINPC(inpf);
> +		fprintf(stderr, "gzHeader FHCRC: ignored\n");
> +	}
> +
> +	used_in = cur_in = used_out = cur_out = 0;
> +	is_final = is_eof = 0;
> +
> +	/* Allocate one page larger to prevent page faults due to NX
> +	 * overfetching.
> +	 * Either do this (char*)(uintptr_t)aligned_alloc or use
> +	 * -std=c11 flag to make the int-to-pointer warning go away.
> +	 */
> +	assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
> +				   fifo_in_len + page_sz)) != NULL);
> +	assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
> +				   fifo_out_len + page_sz + line_sz)) != NULL);
> +	/* Leave unused space due to history rounding rules */
> +	fifo_out = fifo_out + line_sz;
> +	nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
> +
> +	ddl_in  = &dde_in[0];
> +	ddl_out = &dde_out[0];
> +	cmdp = &cmd;
> +	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
> +
> +read_state:
> +
> +	/* Read from .gz file */
> +
> +	NXPRT(fprintf(stderr, "read_state:\n"));
> +
> +	if (is_eof != 0)
> +		goto write_state;
> +
> +	/* We read into fifo_in in two steps.  First: read into the free
> +	 * space at cur_in + used_in, up to the end of the buffer.  Last: if
> +	 * free space wrapped around, read into fifo_in from offset 0.
> +	 */
> +
> +	/* Reset fifo head to reduce unnecessary wrap arounds */
> +	cur_in = (used_in == 0) ? 0 : cur_in;
> +
> +	/* Free space total is reduced by a gap */
> +	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
> +			    - line_sz);
> +
> +	/* Free space may wrap around as first and last */
> +	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
> +	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
> +
> +	/* Start offsets of the free memory */
> +	first_offset = fifo_free_first_offset(cur_in, used_in);
> +	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
> +
> +	/* Reduce read_sz because of the line_sz gap */
> +	read_sz = NX_MIN(free_space, first_free);
> +	n = 0;
> +	if (read_sz > 0) {
> +		/* Read in to offset cur_in + used_in */
> +		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
> +		used_in = used_in + n;
> +		free_space = free_space - n;
> +		assert(n <= read_sz);
> +		if (n != read_sz) {
> +			/* Either EOF or error; exit the read loop */
> +			is_eof = 1;
> +			goto write_state;
> +		}
> +	}
> +
> +	/* If free space wrapped around */
> +	if (last_free > 0) {
> +		/* Reduce read_sz because of the line_sz gap */
> +		read_sz = NX_MIN(free_space, last_free);
> +		n = 0;
> +		if (read_sz > 0) {
> +			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
> +			used_in = used_in + n;       /* Increase used space */
> +			free_space = free_space - n; /* Decrease free space */
> +			assert(n <= read_sz);
> +			if (n != read_sz) {
> +				/* Either EOF or error; exit the read loop */
> +				is_eof = 1;
> +				goto write_state;
> +			}
> +		}
> +	}
> +
> +	/* At this point we have used_in bytes in fifo_in with the
> +	 * data head starting at cur_in and possibly wrapping around.
> +	 */
> +
> +write_state:
> +
> +	/* Write decompressed data to output file */
> +
> +	NXPRT(fprintf(stderr, "write_state:\n"));
> +
> +	if (used_out == 0)
> +		goto decomp_state;
> +
> +	/* If fifo_out has data waiting, write it out to the file to
> +	 * make free target space for the accelerator used bytes in
> +	 * the first and last parts of fifo_out.
> +	 */
> +
> +	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
> +	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
> +
> +	write_sz = first_used;
> +
> +	n = 0;
> +	if (write_sz > 0) {
> +		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
> +		used_out = used_out - n;
> +		/* Move head of the fifo */
> +		cur_out = (cur_out + n) % fifo_out_len;
> +		assert(n <= write_sz);
> +		if (n != write_sz) {
> +			fprintf(stderr, "error: write\n");
> +			rc = -1;
> +			goto err5;
> +		}
> +	}
> +
> +	if (last_used > 0) { /* If more data available in the last part */
> +		write_sz = last_used; /* Keep it here for later */
> +		n = 0;
> +		if (write_sz > 0) {
> +			n = fwrite(fifo_out, 1, write_sz, outf);
> +			used_out = used_out - n;
> +			cur_out = (cur_out + n) % fifo_out_len;
> +			assert(n <= write_sz);
> +			if (n != write_sz) {
> +				fprintf(stderr, "error: write\n");
> +				rc = -1;
> +				goto err5;
> +			}
> +		}
> +	}
> +
> +decomp_state:
> +
> +	/* NX decompresses input data */
> +
> +	NXPRT(fprintf(stderr, "decomp_state:\n"));
> +
> +	if (is_final)
> +		goto finish_state;
> +
> +	/* Address/len lists */
> +	clearp_dde(ddl_in);
> +	clearp_dde(ddl_out);
> +
> +	/* FC, CRC, HistLen, Table 6-6 */
> +	if (resuming) {
> +		/* Resuming a partially decompressed input.
> +		 * The key to resume is supplying the 32KB
> +		 * dictionary (history) to NX, which is basically
> +		 * the last 32KB of output produced.
> +		 */
> +		fc = GZIP_FC_DECOMPRESS_RESUME;
> +
> +		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
> +		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
> +
> +		/* Round up the history size to quadword.  Section 2.10 */
> +		history_len = (history_len + 15) / 16;
> +		putnn(cmdp->cpb, in_histlen, history_len);
> +		history_len = history_len * 16; /* bytes */
> +
> +		if (history_len > 0) {
> +			/* Chain in the history buffer to the DDE list */
> +			if (cur_out >= history_len) {
> +				nx_append_dde(ddl_in, fifo_out
> +					      + (cur_out - history_len),
> +					      history_len);
> +			} else {
> +				nx_append_dde(ddl_in, fifo_out
> +					      + ((fifo_out_len + cur_out)
> +					      - history_len),
> +					      history_len - cur_out);
> +				/* Up to 32KB history wraps around fifo_out */
> +				nx_append_dde(ddl_in, fifo_out, cur_out);
> +			}
> +
> +		}
> +	} else {
> +		/* First decompress job */
> +		fc = GZIP_FC_DECOMPRESS;
> +
> +		history_len = 0;
> +		/* Writing 0 clears out subc as well */
> +		cmdp->cpb.in_histlen = 0;
> +		total_out = 0;
> +
> +		put32(cmdp->cpb, in_crc, INIT_CRC);
> +		put32(cmdp->cpb, in_adler, INIT_ADLER);
> +		put32(cmdp->cpb, out_crc, INIT_CRC);
> +		put32(cmdp->cpb, out_adler, INIT_ADLER);
> +
> +		/* Assuming 10% compression ratio initially; use the
> +		 * most recently measured compression ratio as a
> +		 * heuristic to estimate the input and output
> +		 * sizes.  If we give too much input, the target buffer
> +		 * overflows and NX cycles are wasted, and then we
> +		 * must retry with smaller input size.  1000 is 100%.
> +		 */
> +		last_comp_ratio = 100UL;
> +	}
> +	cmdp->crb.gzip_fc = 0;
> +	putnn(cmdp->crb, gzip_fc, fc);
> +
> +	/*
> +	 * NX source buffers
> +	 */
> +	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
> +	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
> +
> +	if (first_used > 0)
> +		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
> +
> +	if (last_used > 0)
> +		nx_append_dde(ddl_in, fifo_in, last_used);
> +
> +	/*
> +	 * NX target buffers
> +	 */
> +	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
> +	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
> +
> +	/* Reduce output free space amount not to overwrite the history */
> +	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
> +				- (1<<16));
> +
> +	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
> +		      target_max));
> +
> +	first_free = NX_MIN(target_max, first_free);
> +	if (first_free > 0) {
> +		first_offset = fifo_free_first_offset(cur_out, used_out);
> +		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
> +	}
> +
> +	if (last_free > 0) {
> +		last_free = NX_MIN(target_max - first_free, last_free);
> +		if (last_free > 0) {
> +			last_offset = fifo_free_last_offset(cur_out, used_out,
> +							    fifo_out_len);
> +			nx_append_dde(ddl_out, fifo_out + last_offset,
> +				      last_free);
> +		}
> +	}
> +
> +	/* Target buffer size is used to limit the source data size
> +	 * based on previous measurements of compression ratio.
> +	 */
> +
> +	/* source_sz includes history */
> +	source_sz = getp32(ddl_in, ddebc);
> +	assert(source_sz > history_len);
> +	source_sz = source_sz - history_len;
> +
> +	/* Estimating how much source is needed to 3/4 fill a
> +	 * target_max size target buffer.  If we overshoot, then NX
> +	 * must repeat the job with smaller input and we waste
> +	 * bandwidth.  If we undershoot then we use more NX calls than
> +	 * necessary.
> +	 */
> +
> +	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
> +				/ 4000;
> +
> +	if (source_sz_estimate < source_sz) {
> +		/* Target might be small, therefore limiting the
> +		 * source data.
> +		 */
> +		source_sz = source_sz_estimate;
> +		target_sz_estimate = target_max;
> +	} else {
> +		/* Source file might be small, therefore limiting target
> +		 * touch pages to a smaller value to save processor cycles.
> +		 */
> +		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
> +					/ (last_comp_ratio + 1);
> +		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
> +					    target_max);
> +	}
> +
> +	source_sz = source_sz + history_len;
> +
> +	/* Some NX condition codes require submitting the NX job again.
> +	 * Kernel doesn't handle NX page faults. Expects user code to
> +	 * touch pages.
> +	 */
> +	pgfault_retries = retry_max;
> +
> +restart_nx:
> +
> +	putp32(ddl_in, ddebc, source_sz);
> +
> +	/* Fault in pages */
> +	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
> +	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
> +
> +	/* Send job to NX */
> +	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
> +
> +	switch (cc) {
> +
> +	case ERR_NX_TRANSLATION:
> +
> +		/* We touched the pages ahead of time.  In the most common case
> +		 * we shouldn't be here.  But may be some pages were paged out.
> +		 * Kernel should have placed the faulting address to fsaddr.
> +		 */
> +		NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
> +			      (void *)cmdp->crb.csb.fsaddr));
> +
> +		/* Touch 1 byte, read-only  */
> +		nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
> +
> +		if (pgfault_retries == retry_max) {
> +			/* Try once with exact number of pages */
> +			--pgfault_retries;
> +			goto restart_nx;
> +		} else if (pgfault_retries > 0) {
> +			/* If still faulting try fewer input pages
> +			 * assuming memory outage
> +			 */
> +			if (source_sz > page_sz)
> +				source_sz = NX_MAX(source_sz / 2, page_sz);
> +			--pgfault_retries;
> +			goto restart_nx;
> +		} else {
> +			fprintf(stderr, "cannot make progress; too many page \
> +				fault retries cc= %d\n", cc);
> +			rc = -1;
> +			goto err5;
> +		}
> +
> +	case ERR_NX_DATA_LENGTH:
> +
> +		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; not an error \
> +			      usually; stream may have trailing data\n"));
> +
> +		/* Not an error in the most common case; it just says
> +		 * there is trailing data that we must examine.
> +		 *
> +		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
> +		 * Fig.6-7 and Table 6-8.
> +		 */
> +		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
> +
> +		if (!csb_ce_termination(nx_ce) &&
> +		    csb_ce_partial_completion(nx_ce)) {
> +			/* Check CPB for more information
> +			 * spbc and tpbc are valid
> +			 */
> +			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
> +			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
> +			spbc = get32(cmdp->cpb, out_spbc_decomp);
> +			tpbc = get32(cmdp->crb.csb, tpbc);
> +			assert(target_max >= tpbc);
> +
> +			goto ok_cc3; /* not an error */
> +		} else {
> +			/* History length error when CE(1)=1 CE(0)=0. */
> +			rc = -1;
> +			fprintf(stderr, "history length error cc= %d\n", cc);
> +			goto err5;
> +		}
> +
> +	case ERR_NX_TARGET_SPACE:
> +
> +		/* Target buffer not large enough; retry smaller input
> +		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
> +		 */
> +		assert(source_sz > history_len);
> +		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
> +		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with \
> +			      smaller input data src %d hist %d\n", source_sz,
> +			      history_len));
> +		goto restart_nx;
> +
> +	case ERR_NX_OK:
> +
> +		/* This should not happen for gzip formatted data;
> +		 * we need trailing crc and isize
> +		 */
> +		fprintf(stderr, "ERR_NX_OK\n");
> +		spbc = get32(cmdp->cpb, out_spbc_decomp);
> +		tpbc = get32(cmdp->crb.csb, tpbc);
> +		assert(target_max >= tpbc);
> +		assert(spbc >= history_len);
> +		source_sz = spbc - history_len;
> +		goto offsets_state;
> +
> +	default:
> +		fprintf(stderr, "error: cc= %d\n", cc);
> +		rc = -1;
> +		goto err5;
> +	}
> +
> +ok_cc3:
> +
> +	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
> +
> +	assert(spbc > history_len);
> +	source_sz = spbc - history_len;
> +
> +	/* Table 6-4: Source Final Block Type (SFBT) describes the
> +	 * last processed deflate block and clues the software how to
> +	 * resume the next job.  SUBC indicates how many input bits NX
> +	 * consumed but did not process.  SPBC indicates how many
> +	 * bytes of source were given to the accelerator including
> +	 * history bytes.
> +	 */
> +
> +	switch (sfbt) {
> +		int dhtlen;
> +
> +	case 0b0000: /* Deflate final EOB received */
> +
> +		/* Calculating the checksum start position. */
> +
> +		source_sz = source_sz - subc / 8;
> +		is_final = 1;
> +		break;
> +
> +		/* Resume decompression cases are below. Basically
> +		 * indicates where NX has suspended and how to resume
> +		 * the input stream.
> +		 */
> +
> +	case 0b1000: /* Within a literal block; use rembytecount */
> +	case 0b1001: /* Within a literal block; use rembytecount; bfinal=1 */
> +
> +		/* Supply the partially processed source byte again */
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* SUBC LS 3bits: number of bits in the first source byte need
> +		 * to be processed.
> +		 * 000 means all 8 bits;  Table 6-3
> +		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
> +		 */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
> +						      out_rembytecnt));
> +		break;
> +
> +	case 0b1010: /* Within a FH block; */
> +	case 0b1011: /* Within a FH block; bfinal=1 */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +		break;
> +
> +	case 0b1100: /* Within a DH block; */
> +	case 0b1101: /* Within a DH block; bfinal=1 */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +
> +		dhtlen = getnn(cmdp->cpb, out_dhtlen);
> +		putnn(cmdp->cpb, in_dhtlen, dhtlen);
> +		assert(dhtlen >= 42);
> +
> +		/* Round up to a qword */
> +		dhtlen = (dhtlen + 127) / 128;
> +
> +		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
> +			--dhtlen;
> +			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
> +		}
> +		break;
> +
> +	case 0b1110: /* Within a block header; bfinal=0; */
> +		     /* Also given if source data exactly ends (SUBC=0) with
> +		      * EOB code with BFINAL=0.  Means the next byte will
> +		      * contain a block header.
> +		      */
> +	case 0b1111: /* within a block header with BFINAL=1. */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +	}
> +
> +offsets_state:
> +
> +	/* Adjust the source and target buffer offsets and lengths  */
> +
> +	NXPRT(fprintf(stderr, "offsets_state:\n"));
> +
> +	/* Delete input data from fifo_in */
> +	used_in = used_in - source_sz;
> +	cur_in = (cur_in + source_sz) % fifo_in_len;
> +	input_file_offset = input_file_offset + source_sz;
> +
> +	/* Add output data to fifo_out */
> +	used_out = used_out + tpbc;
> +
> +	assert(used_out <= fifo_out_len);
> +
> +	total_out = total_out + tpbc;
> +
> +	/* Deflate history is 32KB max.  No need to supply more
> +	 * than 32KB on a resume.
> +	 */
> +	history_len = (total_out > window_max) ? window_max : total_out;
> +
> +	/* To estimate expected expansion in the next NX job; 500 means 50%.
> +	 * Deflate best case is around 1 to 1000.
> +	 */
> +	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
> +			  / ((uint64_t)tpbc + 1);
> +	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
> +	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
> +		      last_comp_ratio, source_sz, spbc, tpbc));
> +
> +	resuming = 1;
> +
> +finish_state:
> +
> +	NXPRT(fprintf(stderr, "finish_state:\n"));
> +
> +	if (is_final) {
> +		if (used_out)
> +			goto write_state; /* More data to write out */
> +		else if (used_in < 8) {
> +			/* Need at least 8 more bytes containing gzip crc
> +			 * and isize.
> +			 */
> +			rc = -1;
> +			goto err4;
> +		} else {
> +			/* Compare checksums and exit */
> +			int i;
> +			char tail[8];
> +			uint32_t cksum, isize;
> +			for (i = 0; i < 8; i++)
> +				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
> +			fprintf(stderr, "computed checksum %08x isize %08x\n",
> +				cmdp->cpb.out_crc, (uint32_t) (total_out
> +				% (1ULL<<32)));
> +			cksum = (tail[0] | tail[1]<<8 | tail[2]<<16
> +				| tail[3]<<24);
> +			isize = (tail[4] | tail[5]<<8 | tail[6]<<16
> +				| tail[7]<<24);
> +			fprintf(stderr, "stored   checksum %08x isize %08x\n",
> +				cksum, isize);
> +
> +			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
> +			    (total_out % (1ULL<<32))) {
> +				rc = 0;	goto ok1;
> +			} else {
> +				rc = -1; goto err4;
> +			}
> +		}
> +	} else
> +		goto read_state;
> +
> +	return -1;
> +
> +err1:
> +	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
> +		expect, c);
> +	return -1;
> +
> +err2:
> +	fprintf(stderr, "error: the FLG byte is wrong or not handled by this \
> +		code sample\n");
> +	return -1;
> +
> +err3:
> +	fprintf(stderr, "error: gzip header\n");
> +	return -1;
> +
> +err4:
> +	fprintf(stderr, "error: checksum\n");
> +
> +err5:
> +ok1:
> +	fprintf(stderr, "decomp is complete: fclose\n");
> +	fclose(outf);
> +
> +	return rc;
> +}
> +
> +
> +int main(int argc, char **argv)
> +{
> +	int rc;
> +	struct sigaction act;
> +	void *handle;
> +
> +	act.sa_handler = 0;
> +	act.sa_sigaction = sigsegv_handler;
> +	act.sa_flags = SA_SIGINFO;
> +	act.sa_restorer = 0;
> +	sigemptyset(&act.sa_mask);
> +	sigaction(SIGSEGV, &act, NULL);
> +
> +	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
> +	if (!handle) {
> +		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
> +		exit(-1);
> +	}
> +
> +	rc = decompress_file(argc, argv, handle);
> +
> +	nx_function_end(handle);
> +
> +	return rc;
> +}
> -- 
> 2.21.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase
  2020-03-16 18:07 ` [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase Raphael Moreira Zinsly
  2020-03-18  4:31   ` Daniel Axtens
@ 2020-03-18  6:18   ` Daniel Axtens
  2020-03-18 13:08     ` Raphael M Zinsly
  1 sibling, 1 reply; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18  6:18 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, linuxppc-dev, linux-crypto
  Cc: Raphael Moreira Zinsly, haren, herbert, abali

[-- Attachment #1: Type: text/plain, Size: 2501 bytes --]

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Include a decompression testcase for the powerpc NX-GZIP
> engine.

I compiled gzip with the AFL++ fuzzer and generated a corpus of tests to
run against this decompressor. I also fuzzed the decompressor
directly. I found a few issues. I _think_ they're just in the userspace
but I'm a bit too early in the process to know.

I realise this is self-test code but:
a) it stops me testing more deeply, and
b) it looks like some of this code is shared with https://github.com/libnxz/power-gzip/

The issues I've found are:

1) In the ERR_NX_DATA_LENGTH case, the decompressor doesn't check that
   you're making forward progress, so you can provoke it into an
   infinite loop.

Here's an _extremely_ ugly fix:

diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
index 653de92698cc..236a1f567656 100644
--- a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
+++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
@@ -343,6 +343,8 @@ int decompress_file(int argc, char **argv, void *devhandle)
        nx_dde_t dde_out[6] __attribute__((aligned (128)));
        int pgfault_retries;
 
+       int last_first_used = 0;
+
        /* when using mmap'ed files */
        off_t input_file_offset;
 
@@ -642,6 +644,11 @@ int decompress_file(int argc, char **argv, void *devhandle)
        first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
        last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
 
+       if (first_used > 0 && last_first_used > 0) {
+               assert(first_used != last_first_used);
+       }
+       last_first_used = first_used;
+
        if (first_used > 0)
                nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
 

2) It looks like you can provoke an out-of-bounds write. I've seen both
infinite loops printing something that seems to come from the file
content like:

57201: Got signal 11 si_code 3, si_addr 0xcacacacacacacac8

or a less bizarre address like

19285: Got signal 11 si_code 1, si_addr 0x7fffcf1b0000

Depending on the build I've also seen the stack smasher protection fire.

I don't understand the code well enough to figure out how this comes to
be just yet.

I've included a few test cases as attachments. I've preconverted them
with xxd to avoid anything that might flag suspicious gzip files!
Decompress them then use `xxd -r attachment testcase.gz` to convert them
back.

Regards,
Daniel


[-- Attachment #2: infloop.bz2 --]
[-- Type: application/octet-stream, Size: 79 bytes --]

[-- Attachment #3: sig1.bz2 --]
[-- Type: application/octet-stream, Size: 6100 bytes --]

[-- Attachment #4: sig676767.bz2 --]
[-- Type: application/octet-stream, Size: 1632 bytes --]

[-- Attachment #5: sigededed.bz2 --]
[-- Type: application/octet-stream, Size: 7267 bytes --]

[-- Attachment #6: Type: text/plain, Size: 33293 bytes --]



>
> Signed-off-by: Bulent Abali <abali@us.ibm.com>
> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
> ---
>  .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
>  .../selftests/powerpc/nx-gzip/gunz_test.c     | 1058 +++++++++++++++++
>  2 files changed, 1062 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
> index ab903f63bbbd..82abc19a49a0 100644
> --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
> +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
> @@ -1,9 +1,9 @@
>  CC = gcc
>  CFLAGS = -O3
>  INC = ./inc
> -SRC = gzfht_test.c
> +SRC = gzfht_test.c gunz_test.c
>  OBJ = $(SRC:.c=.o)
> -TESTS = gzfht_test
> +TESTS = gzfht_test gunz_test
>  EXTRA_SOURCES = gzip_vas.c
>  
>  all:	$(TESTS)
> @@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
>  
>  run_tests: $(TESTS)
>  	./gzfht_test gzip_vas.c
> +	./gunz_test gzip_vas.c.nx.gz
>  
>  clean:
> -	rm -f $(TESTS) *.o *~ *.gz
> +	rm -f $(TESTS) *.o *~ *.gz *.gunzip
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> new file mode 100644
> index 000000000000..653de92698cc
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> @@ -0,0 +1,1058 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * P9 gunzip sample code for demonstrating the P9 NX hardware
> + * interface.  Not intended for production use or for performance or
> + * compression ratio measurements.  Note also that /dev/crypto/gzip,
> + * VAS and skiboot support are required
> + *
> + * Copyright 2020 IBM Corp.
> + *
> + * Author: Bulent Abali <abali@us.ibm.com>
> + *
> + * https://github.com/libnxz/power-gzip for zlib api and other utils
> + * Definitions of acronyms used here.  See
> + * P9 NX Gzip Accelerator User's Manual for details
> + *
> + * adler/crc: 32 bit checksums appended to stream tail
> + * ce:       completion extension
> + * cpb:      coprocessor parameter block (metadata)
> + * crb:      coprocessor request block (command)
> + * csb:      coprocessor status block (status)
> + * dht:      dynamic huffman table
> + * dde:      data descriptor element (address, length)
> + * ddl:      list of ddes
> + * dh/fh:    dynamic and fixed huffman types
> + * fc:       coprocessor function code
> + * histlen:  history/dictionary length
> + * history:  sliding window of up to 32KB of data
> + * lzcount:  Deflate LZ symbol counts
> + * rembytecnt: remaining byte count
> + * sfbt:     source final block type; last block's type during decomp
> + * spbc:     source processed byte count
> + * subc:     source unprocessed bit count
> + * tebc:     target ending bit count; valid bits in the last byte
> + * tpbc:     target processed byte count
> + * vas:      virtual accelerator switch; the user mode interface
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <stdint.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/time.h>
> +#include <sys/fcntl.h>
> +#include <sys/mman.h>
> +#include <endian.h>
> +#include <bits/endian.h>
> +#include <sys/ioctl.h>
> +#include <assert.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include "nxu.h"
> +#include "nx.h"
> +
> +int nx_dbg = 0;
> +FILE *nx_gzip_log = NULL;
> +
> +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
> +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
> +
> +#define mb()     asm volatile("sync" ::: "memory")
> +#define rmb()    asm volatile("lwsync" ::: "memory")
> +#define wmb()    rmb()
> +
> +const int fifo_in_len = 1<<24;
> +const int fifo_out_len = 1<<24;
> +const int page_sz = 1<<16;
> +const int line_sz = 1<<7;
> +const int window_max = 1<<15;
> +const int retry_max = 50;
> +
> +extern void *nx_fault_storage_address;
> +extern void *nx_function_begin(int function, int pri);
> +extern int nx_function_end(void *handle);
> +
> +/*
> + * Fault in pages prior to NX job submission.  wr=1 may be required to
> + * touch writeable pages.  System zero pages do not fault-in the page as
> + * intended.  Typically set wr=1 for NX target pages and set wr=0 for
> + * NX source pages.
> + */
> +static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
> +{
> +	char *begin = buf;
> +	char *end = (char *) buf + buf_len - 1;
> +	volatile char t;
> +
> +	assert(buf_len >= 0 && !!buf);
> +
> +	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
> +			buf + buf_len, buf_len, wr));
> +
> +	if (buf_len <= 0 || buf == NULL)
> +		return -1;
> +
> +	do {
> +		t = *begin;
> +		if (wr)
> +			*begin = t;
> +		begin = begin + page_len;
> +	} while (begin < end);
> +
> +	/* When buf_sz is small or buf tail is in another page. */
> +	t = *end;
> +	if (wr)
> +		*end = t;
> +
> +	return 0;
> +}
> +
> +void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
> +{
> +	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
> +	       sig, info->si_code, info->si_addr);
> +
> +	nx_fault_storage_address = info->si_addr;
> +}
> +
> +/*
> + * Adds an (address, len) pair to the list of ddes (ddl) and updates
> + * the base dde.  ddl[0] is the only dde in a direct dde which
> + * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
> + * the indirect (base) dde that points to a list of direct ddes.
> + * See Section 6.4 of the NX-gzip user manual for DDE description.
> + * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
> + * bytes in ddl.  Caller is responsible for allocating the array of
> + * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
> + * list, the ddl array must have N+1 entries minimum.
> + */
> +static inline uint32_t nx_append_dde(nx_dde_t *ddl, void *addr, uint32_t len)
> +{
> +	uint32_t ddecnt;
> +	uint32_t bytes;
> +
> +	if (addr == NULL && len == 0) {
> +		clearp_dde(ddl);
> +		return 0;
> +	}
> +
> +	NXPRT(fprintf(stderr, "%d: nx_append_dde addr %p len %x\n", __LINE__,
> +			addr, len));
> +
> +	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
> +	ddecnt = getpnn(ddl, dde_count);
> +	bytes = getp32(ddl, ddebc);
> +
> +	if (ddecnt == 0 && bytes == 0) {
> +		/* First dde is unused; make it a direct dde */
> +		bytes = len;
> +		putp32(ddl, ddebc, bytes);
> +		putp64(ddl, ddead, (uint64_t) addr);
> +	} else if (ddecnt == 0) {
> +		/* Converting direct to indirect dde
> +		 * ddl[0] becomes head dde of ddl
> +		 * copy direct to indirect first.
> +		 */
> +		ddl[1] = ddl[0];
> +
> +		/* Add the new dde next */
> +		clear_dde(ddl[2]);
> +		put32(ddl[2], ddebc, len);
> +		put64(ddl[2], ddead, (uint64_t) addr);
> +
> +		/* Ddl head points to 2 direct ddes */
> +		ddecnt = 2;
> +		putpnn(ddl, dde_count, ddecnt);
> +		bytes = bytes + len;
> +		putp32(ddl, ddebc, bytes);
> +		/* Pointer to the first direct dde */
> +		putp64(ddl, ddead, (uint64_t) &ddl[1]);
> +	} else {
> +		/* Append a dde to an existing indirect ddl */
> +		++ddecnt;
> +		clear_dde(ddl[ddecnt]);
> +		put64(ddl[ddecnt], ddead, (uint64_t) addr);
> +		put32(ddl[ddecnt], ddebc, len);
> +
> +		putpnn(ddl, dde_count, ddecnt);
> +		bytes = bytes + len;
> +		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
> +	}
> +	return bytes;
> +}
> +
> +/*
> + * Touch specified number of pages represented in number of bytes
> + * beginning from the first buffer in a dde list.
> + * Do not touch the pages past buf_sz-th byte's page.
> + *
> + * Set buf_sz = 0 to touch all pages described by the ddep.
> + */
> +static int nx_touch_pages_dde(nx_dde_t *ddep, long buf_sz, long page_sz,
> +				int wr)
> +{
> +	uint32_t indirect_count;
> +	uint32_t buf_len;
> +	long total;
> +	uint64_t buf_addr;
> +	nx_dde_t *dde_list;
> +	int i;
> +
> +	assert(!!ddep);
> +
> +	indirect_count = getpnn(ddep, dde_count);
> +
> +	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len \
> +			0x%lx\n", indirect_count, buf_sz));
> +
> +	if (indirect_count == 0) {
> +		/* Direct dde */
> +		buf_len = getp32(ddep, ddebc);
> +		buf_addr = getp64(ddep, ddead);
> +
> +		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
> +				buf_len, (void *)buf_addr));
> +
> +		if (buf_sz == 0)
> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +		else
> +			nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
> +					buf_sz), page_sz, wr);
> +
> +		return ERR_NX_OK;
> +	}
> +
> +	/* Indirect dde */
> +	if (indirect_count > MAX_DDE_COUNT)
> +		return ERR_NX_EXCESSIVE_DDE;
> +
> +	/* First address of the list */
> +	dde_list = (nx_dde_t *) getp64(ddep, ddead);
> +
> +	if (buf_sz == 0)
> +		buf_sz = getp32(ddep, ddebc);
> +
> +	total = 0;
> +	for (i = 0; i < indirect_count; i++) {
> +		buf_len = get32(dde_list[i], ddebc);
> +		buf_addr = get64(dde_list[i], ddead);
> +		total += buf_len;
> +
> +		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total \
> +				0x%lx\n", buf_len, (void *)buf_addr, total));
> +
> +		/* Touching fewer pages than encoded in the ddebc */
> +		if (total > buf_sz) {
> +			buf_len = NX_MIN(buf_len, total - buf_sz);
> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +			NXPRT(fprintf(stderr, "touch loop break len 0x%x \
> +				      ddead %p\n", buf_len, (void *)buf_addr));
> +			break;
> +		}
> +		nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
> +	}
> +	return ERR_NX_OK;
> +}
> +
> +/*
> + * Src and dst buffers are supplied in scatter gather lists.
> + * NX function code and other parameters supplied in cmdp.
> + */
> +static int nx_submit_job(nx_dde_t *src, nx_dde_t *dst, nx_gzip_crb_cpb_t *cmdp,
> +			 void *handle)
> +{
> +	int cc;
> +	uint64_t csbaddr;
> +
> +	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
> +
> +	cmdp->crb.source_dde = *src;
> +	cmdp->crb.target_dde = *dst;
> +
> +	/* Status, output byte count in tpbc */
> +	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
> +	put64(cmdp->crb, csb_address, csbaddr);
> +
> +	/* NX reports input bytes in spbc; cleared */
> +	cmdp->cpb.out_spbc_comp_wrap = 0;
> +	cmdp->cpb.out_spbc_comp_with_count = 0;
> +	cmdp->cpb.out_spbc_decomp = 0;
> +
> +	/* Clear output */
> +	put32(cmdp->cpb, out_crc, INIT_CRC);
> +	put32(cmdp->cpb, out_adler, INIT_ADLER);
> +
> +	cc = nxu_run_job(cmdp, handle);
> +
> +	if (!cc)
> +		cc = getnn(cmdp->crb.csb, csb_cc);	/* CC Table 6-8 */
> +
> +	return cc;
> +}
> +
> +/* fifo queue management */
> +#define fifo_used_bytes(used) (used)
> +#define fifo_free_bytes(used, len) ((len)-(used))
> +/* amount of free bytes in the first and last parts */
> +#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
> +						  ? (len)-((cur)+(used)) : 0)
> +#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
> +						  ? (cur) : (len)-(used))
> +/* amount of used bytes in the first and last parts */
> +#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
> +						  ? (used) : (len)-(cur))
> +#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
> +						  ? 0 : ((used)+(cur))-(len))
> +/* first and last free parts start here */
> +#define fifo_free_first_offset(cur, used)      ((cur)+(used))
> +#define fifo_free_last_offset(cur, used, len)  \
> +					   fifo_used_last_bytes(cur, used, len)
> +/* first and last used parts start here */
> +#define fifo_used_first_offset(cur)            (cur)
> +#define fifo_used_last_offset(cur)             (0)
> +
> +int decompress_file(int argc, char **argv, void *devhandle)
> +{
> +	FILE *inpf;
> +	FILE *outf;
> +
> +	int c, expect, i, cc, rc = 0;
> +	char gzfname[1024];
> +
> +	/* Queuing, file ops, byte counting */
> +	char *fifo_in, *fifo_out;
> +	int used_in, cur_in, used_out, cur_out, read_sz, n;
> +	int first_free, last_free, first_used, last_used;
> +	int first_offset, last_offset;
> +	int write_sz, free_space, source_sz;
> +	int source_sz_estimate, target_sz_estimate;
> +	uint64_t last_comp_ratio; /* 1000 max */
> +	uint64_t total_out;
> +	int is_final, is_eof;
> +
> +	/* nx hardware */
> +	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
> +	int history_len = 0;
> +	nx_gzip_crb_cpb_t cmd, *cmdp;
> +	nx_dde_t *ddl_in;
> +	nx_dde_t dde_in[6] __attribute__((aligned (128)));
> +	nx_dde_t *ddl_out;
> +	nx_dde_t dde_out[6] __attribute__((aligned (128)));
> +	int pgfault_retries;
> +
> +	/* when using mmap'ed files */
> +	off_t input_file_offset;
> +
> +	if (argc > 2) {
> +		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
> +		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
> +		return -1;
> +	}
> +
> +	if (argc == 1) {
> +		inpf = stdin;
> +		outf = stdout;
> +	} else if (argc == 2) {
> +		char w[1024];
> +		char *wp;
> +		inpf = fopen(argv[1], "r");
> +		if (inpf == NULL) {
> +			perror(argv[1]);
> +			return -1;
> +		}
> +
> +		/* Make a new file name to write to.  Ignoring '.gz' */
> +		wp = (NULL != (wp = strrchr(argv[1], '/'))) ? ++wp : argv[1];
> +		strcpy(w, wp);
> +		strcat(w, ".nx.gunzip");
> +
> +		outf = fopen(w, "w");
> +		if (outf == NULL) {
> +			perror(w);
> +			return -1;
> +		}
> +	}
> +
> +#define GETINPC(X) fgetc(X)
> +
> +	/* Decode the gzip header */
> +	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
> +	if (c != expect)
> +		goto err1;
> +
> +	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
> +	if (c != expect)
> +		goto err1;
> +
> +	c = GETINPC(inpf); expect = 0x08; /* CM */
> +	if (c != expect)
> +		goto err1;
> +
> +	int flg = GETINPC(inpf); /* FLG */
> +	if (flg & 0b11100000 || flg & 0b100)
> +		goto err2;
> +
> +	fprintf(stderr, "gzHeader FLG %x\n", flg);
> +
> +	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
> +	 * sample code.
> +	 */
> +	for (i = 0; i < 6; i++) {
> +		char tmp[10];
> +		if (EOF == (tmp[i] = GETINPC(inpf)))
> +			goto err3;
> +		fprintf(stderr, "%02x ", tmp[i]);
> +		if (i == 5)
> +			fprintf(stderr, "\n");
> +	}
> +	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
> +
> +	/* FNAME */
> +	if (flg & 0b1000) {
> +		int k = 0;
> +		do {
> +			if (EOF == (c = GETINPC(inpf)))
> +				goto err3;
> +			gzfname[k++] = c;
> +		} while (c);
> +		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
> +	}
> +
> +	/* FHCRC */
> +	if (flg & 0b10) {
> +		c = GETINPC(inpf); c = GETINPC(inpf);
> +		fprintf(stderr, "gzHeader FHCRC: ignored\n");
> +	}
> +
> +	used_in = cur_in = used_out = cur_out = 0;
> +	is_final = is_eof = 0;
> +
> +	/* Allocate one page larger to prevent page faults due to NX
> +	 * overfetching.
> +	 * Either do this (char*)(uintptr_t)aligned_alloc or use
> +	 * -std=c11 flag to make the int-to-pointer warning go away.
> +	 */
> +	assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
> +				   fifo_in_len + page_sz)) != NULL);
> +	assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
> +				   fifo_out_len + page_sz + line_sz)) != NULL);
> +	/* Leave unused space due to history rounding rules */
> +	fifo_out = fifo_out + line_sz;
> +	nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
> +
> +	ddl_in  = &dde_in[0];
> +	ddl_out = &dde_out[0];
> +	cmdp = &cmd;
> +	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
> +
> +read_state:
> +
> +	/* Read from .gz file */
> +
> +	NXPRT(fprintf(stderr, "read_state:\n"));
> +
> +	if (is_eof != 0)
> +		goto write_state;
> +
> +	/* We read in to fifo_in in two steps: first: read in to from
> +	 * cur_in to the end of the buffer.  last: if free space wrapped
> +	 * around, read from fifo_in offset 0 to offset cur_in.
> +	 */
> +
> +	/* Reset fifo head to reduce unnecessary wrap arounds */
> +	cur_in = (used_in == 0) ? 0 : cur_in;
> +
> +	/* Free space total is reduced by a gap */
> +	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
> +			    - line_sz);
> +
> +	/* Free space may wrap around as first and last */
> +	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
> +	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
> +
> +	/* Start offsets of the free memory */
> +	first_offset = fifo_free_first_offset(cur_in, used_in);
> +	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
> +
> +	/* Reduce read_sz because of the line_sz gap */
> +	read_sz = NX_MIN(free_space, first_free);
> +	n = 0;
> +	if (read_sz > 0) {
> +		/* Read in to offset cur_in + used_in */
> +		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
> +		used_in = used_in + n;
> +		free_space = free_space - n;
> +		assert(n <= read_sz);
> +		if (n != read_sz) {
> +			/* Either EOF or error; exit the read loop */
> +			is_eof = 1;
> +			goto write_state;
> +		}
> +	}
> +
> +	/* If free space wrapped around */
> +	if (last_free > 0) {
> +		/* Reduce read_sz because of the line_sz gap */
> +		read_sz = NX_MIN(free_space, last_free);
> +		n = 0;
> +		if (read_sz > 0) {
> +			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
> +			used_in = used_in + n;       /* Increase used space */
> +			free_space = free_space - n; /* Decrease free space */
> +			assert(n <= read_sz);
> +			if (n != read_sz) {
> +				/* Either EOF or error; exit the read loop */
> +				is_eof = 1;
> +				goto write_state;
> +			}
> +		}
> +	}
> +
> +	/* At this point we have used_in bytes in fifo_in with the
> +	 * data head starting at cur_in and possibly wrapping around.
> +	 */
> +
> +write_state:
> +
> +	/* Write decompressed data to output file */
> +
> +	NXPRT(fprintf(stderr, "write_state:\n"));
> +
> +	if (used_out == 0)
> +		goto decomp_state;
> +
> +	/* If fifo_out has data waiting, write it out to the file to
> +	 * make free target space for the accelerator used bytes in
> +	 * the first and last parts of fifo_out.
> +	 */
> +
> +	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
> +	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
> +
> +	write_sz = first_used;
> +
> +	n = 0;
> +	if (write_sz > 0) {
> +		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
> +		used_out = used_out - n;
> +		/* Move head of the fifo */
> +		cur_out = (cur_out + n) % fifo_out_len;
> +		assert(n <= write_sz);
> +		if (n != write_sz) {
> +			fprintf(stderr, "error: write\n");
> +			rc = -1;
> +			goto err5;
> +		}
> +	}
> +
> +	if (last_used > 0) { /* If more data available in the last part */
> +		write_sz = last_used; /* Keep it here for later */
> +		n = 0;
> +		if (write_sz > 0) {
> +			n = fwrite(fifo_out, 1, write_sz, outf);
> +			used_out = used_out - n;
> +			cur_out = (cur_out + n) % fifo_out_len;
> +			assert(n <= write_sz);
> +			if (n != write_sz) {
> +				fprintf(stderr, "error: write\n");
> +				rc = -1;
> +				goto err5;
> +			}
> +		}
> +	}
> +
> +decomp_state:
> +
> +	/* NX decompresses input data */
> +
> +	NXPRT(fprintf(stderr, "decomp_state:\n"));
> +
> +	if (is_final)
> +		goto finish_state;
> +
> +	/* Address/len lists */
> +	clearp_dde(ddl_in);
> +	clearp_dde(ddl_out);
> +
> +	/* FC, CRC, HistLen, Table 6-6 */
> +	if (resuming) {
> +		/* Resuming a partially decompressed input.
> +		 * The key to resume is supplying the 32KB
> +		 * dictionary (history) to NX, which is basically
> +		 * the last 32KB of output produced.
> +		 */
> +		fc = GZIP_FC_DECOMPRESS_RESUME;
> +
> +		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
> +		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
> +
> +		/* Round up the history size to quadword.  Section 2.10 */
> +		history_len = (history_len + 15) / 16;
> +		putnn(cmdp->cpb, in_histlen, history_len);
> +		history_len = history_len * 16; /* bytes */
> +
> +		if (history_len > 0) {
> +			/* Chain in the history buffer to the DDE list */
> +			if (cur_out >= history_len) {
> +				nx_append_dde(ddl_in, fifo_out
> +					      + (cur_out - history_len),
> +					      history_len);
> +			} else {
> +				nx_append_dde(ddl_in, fifo_out
> +					      + ((fifo_out_len + cur_out)
> +					      - history_len),
> +					      history_len - cur_out);
> +				/* Up to 32KB history wraps around fifo_out */
> +				nx_append_dde(ddl_in, fifo_out, cur_out);
> +			}
> +
> +		}
> +	} else {
> +		/* First decompress job */
> +		fc = GZIP_FC_DECOMPRESS;
> +
> +		history_len = 0;
> +		/* Writing 0 clears out subc as well */
> +		cmdp->cpb.in_histlen = 0;
> +		total_out = 0;
> +
> +		put32(cmdp->cpb, in_crc, INIT_CRC);
> +		put32(cmdp->cpb, in_adler, INIT_ADLER);
> +		put32(cmdp->cpb, out_crc, INIT_CRC);
> +		put32(cmdp->cpb, out_adler, INIT_ADLER);
> +
> +		/* Assuming 10% compression ratio initially; use the
> +		 * most recently measured compression ratio as a
> +		 * heuristic to estimate the input and output
> +		 * sizes.  If we give too much input, the target buffer
> +		 * overflows and NX cycles are wasted, and then we
> +		 * must retry with smaller input size.  1000 is 100%.
> +		 */
> +		last_comp_ratio = 100UL;
> +	}
> +	cmdp->crb.gzip_fc = 0;
> +	putnn(cmdp->crb, gzip_fc, fc);
> +
> +	/*
> +	 * NX source buffers
> +	 */
> +	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
> +	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
> +
> +	if (first_used > 0)
> +		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
> +
> +	if (last_used > 0)
> +		nx_append_dde(ddl_in, fifo_in, last_used);
> +
> +	/*
> +	 * NX target buffers
> +	 */
> +	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
> +	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
> +
> +	/* Reduce output free space amount not to overwrite the history */
> +	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
> +				- (1<<16));
> +
> +	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
> +		      target_max));
> +
> +	first_free = NX_MIN(target_max, first_free);
> +	if (first_free > 0) {
> +		first_offset = fifo_free_first_offset(cur_out, used_out);
> +		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
> +	}
> +
> +	if (last_free > 0) {
> +		last_free = NX_MIN(target_max - first_free, last_free);
> +		if (last_free > 0) {
> +			last_offset = fifo_free_last_offset(cur_out, used_out,
> +							    fifo_out_len);
> +			nx_append_dde(ddl_out, fifo_out + last_offset,
> +				      last_free);
> +		}
> +	}
> +
> +	/* Target buffer size is used to limit the source data size
> +	 * based on previous measurements of compression ratio.
> +	 */
> +
> +	/* source_sz includes history */
> +	source_sz = getp32(ddl_in, ddebc);
> +	assert(source_sz > history_len);
> +	source_sz = source_sz - history_len;
> +
> +	/* Estimating how much source is needed to 3/4 fill a
> +	 * target_max size target buffer.  If we overshoot, then NX
> +	 * must repeat the job with smaller input and we waste
> +	 * bandwidth.  If we undershoot then we use more NX calls than
> +	 * necessary.
> +	 */
> +
> +	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
> +				/ 4000;
> +
> +	if (source_sz_estimate < source_sz) {
> +		/* Target might be small, therefore limiting the
> +		 * source data.
> +		 */
> +		source_sz = source_sz_estimate;
> +		target_sz_estimate = target_max;
> +	} else {
> +		/* Source file might be small, therefore limiting target
> +		 * touch pages to a smaller value to save processor cycles.
> +		 */
> +		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
> +					/ (last_comp_ratio + 1);
> +		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
> +					    target_max);
> +	}
> +
> +	source_sz = source_sz + history_len;
> +
> +	/* Some NX condition codes require submitting the NX job again.
> +	 * Kernel doesn't handle NX page faults. Expects user code to
> +	 * touch pages.
> +	 */
> +	pgfault_retries = retry_max;
> +
> +restart_nx:
> +
> +	putp32(ddl_in, ddebc, source_sz);
> +
> +	/* Fault in pages */
> +	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
> +	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
> +
> +	/* Send job to NX */
> +	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
> +
> +	switch (cc) {
> +
> +	case ERR_NX_TRANSLATION:
> +
> +		/* We touched the pages ahead of time.  In the most common case
> +		 * we shouldn't be here.  But may be some pages were paged out.
> +		 * Kernel should have placed the faulting address to fsaddr.
> +		 */
> +		NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
> +			      (void *)cmdp->crb.csb.fsaddr));
> +
> +		/* Touch 1 byte, read-only  */
> +		nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
> +
> +		if (pgfault_retries == retry_max) {
> +			/* Try once with exact number of pages */
> +			--pgfault_retries;
> +			goto restart_nx;
> +		} else if (pgfault_retries > 0) {
> +			/* If still faulting try fewer input pages
> +			 * assuming memory outage
> +			 */
> +			if (source_sz > page_sz)
> +				source_sz = NX_MAX(source_sz / 2, page_sz);
> +			--pgfault_retries;
> +			goto restart_nx;
> +		} else {
> +			fprintf(stderr, "cannot make progress; too many page \
> +				fault retries cc= %d\n", cc);
> +			rc = -1;
> +			goto err5;
> +		}
> +
> +	case ERR_NX_DATA_LENGTH:
> +
> +		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; not an error \
> +			      usually; stream may have trailing data\n"));
> +
> +		/* Not an error in the most common case; it just says
> +		 * there is trailing data that we must examine.
> +		 *
> +		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
> +		 * Fig.6-7 and Table 6-8.
> +		 */
> +		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
> +
> +		if (!csb_ce_termination(nx_ce) &&
> +		    csb_ce_partial_completion(nx_ce)) {
> +			/* Check CPB for more information
> +			 * spbc and tpbc are valid
> +			 */
> +			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
> +			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
> +			spbc = get32(cmdp->cpb, out_spbc_decomp);
> +			tpbc = get32(cmdp->crb.csb, tpbc);
> +			assert(target_max >= tpbc);
> +
> +			goto ok_cc3; /* not an error */
> +		} else {
> +			/* History length error when CE(1)=1 CE(0)=0. */
> +			rc = -1;
> +			fprintf(stderr, "history length error cc= %d\n", cc);
> +			goto err5;
> +		}
> +
> +	case ERR_NX_TARGET_SPACE:
> +
> +		/* Target buffer not large enough; retry smaller input
> +		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
> +		 */
> +		assert(source_sz > history_len);
> +		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
> +		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with \
> +			      smaller input data src %d hist %d\n", source_sz,
> +			      history_len));
> +		goto restart_nx;
> +
> +	case ERR_NX_OK:
> +
> +		/* This should not happen for gzip formatted data;
> +		 * we need trailing crc and isize
> +		 */
> +		fprintf(stderr, "ERR_NX_OK\n");
> +		spbc = get32(cmdp->cpb, out_spbc_decomp);
> +		tpbc = get32(cmdp->crb.csb, tpbc);
> +		assert(target_max >= tpbc);
> +		assert(spbc >= history_len);
> +		source_sz = spbc - history_len;
> +		goto offsets_state;
> +
> +	default:
> +		fprintf(stderr, "error: cc= %d\n", cc);
> +		rc = -1;
> +		goto err5;
> +	}
> +
> +ok_cc3:
> +
> +	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
> +
> +	assert(spbc > history_len);
> +	source_sz = spbc - history_len;
> +
> +	/* Table 6-4: Source Final Block Type (SFBT) describes the
> +	 * last processed deflate block and clues the software how to
> +	 * resume the next job.  SUBC indicates how many input bits NX
> +	 * consumed but did not process.  SPBC indicates how many
> +	 * bytes of source were given to the accelerator including
> +	 * history bytes.
> +	 */
> +
> +	switch (sfbt) {
> +		int dhtlen;
> +
> +	case 0b0000: /* Deflate final EOB received */
> +
> +		/* Calculating the checksum start position. */
> +
> +		source_sz = source_sz - subc / 8;
> +		is_final = 1;
> +		break;
> +
> +		/* Resume decompression cases are below. Basically
> +		 * indicates where NX has suspended and how to resume
> +		 * the input stream.
> +		 */
> +
> +	case 0b1000: /* Within a literal block; use rembytecount */
> +	case 0b1001: /* Within a literal block; use rembytecount; bfinal=1 */
> +
> +		/* Supply the partially processed source byte again */
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* SUBC LS 3bits: number of bits in the first source byte need
> +		 * to be processed.
> +		 * 000 means all 8 bits;  Table 6-3
> +		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
> +		 */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
> +						      out_rembytecnt));
> +		break;
> +
> +	case 0b1010: /* Within a FH block; */
> +	case 0b1011: /* Within a FH block; bfinal=1 */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +		break;
> +
> +	case 0b1100: /* Within a DH block; */
> +	case 0b1101: /* Within a DH block; bfinal=1 */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +
> +		dhtlen = getnn(cmdp->cpb, out_dhtlen);
> +		putnn(cmdp->cpb, in_dhtlen, dhtlen);
> +		assert(dhtlen >= 42);
> +
> +		/* Round up to a qword */
> +		dhtlen = (dhtlen + 127) / 128;
> +
> +		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
> +			--dhtlen;
> +			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
> +		}
> +		break;
> +
> +	case 0b1110: /* Within a block header; bfinal=0; */
> +		     /* Also given if source data exactly ends (SUBC=0) with
> +		      * EOB code with BFINAL=0.  Means the next byte will
> +		      * contain a block header.
> +		      */
> +	case 0b1111: /* within a block header with BFINAL=1. */
> +
> +		source_sz = source_sz - ((subc + 7) / 8);
> +
> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
> +		cmdp->cpb.in_subc = 0;
> +		cmdp->cpb.in_sfbt = 0;
> +		putnn(cmdp->cpb, in_subc, subc % 8);
> +		putnn(cmdp->cpb, in_sfbt, sfbt);
> +	}
> +
> +offsets_state:
> +
> +	/* Adjust the source and target buffer offsets and lengths  */
> +
> +	NXPRT(fprintf(stderr, "offsets_state:\n"));
> +
> +	/* Delete input data from fifo_in */
> +	used_in = used_in - source_sz;
> +	cur_in = (cur_in + source_sz) % fifo_in_len;
> +	input_file_offset = input_file_offset + source_sz;
> +
> +	/* Add output data to fifo_out */
> +	used_out = used_out + tpbc;
> +
> +	assert(used_out <= fifo_out_len);
> +
> +	total_out = total_out + tpbc;
> +
> +	/* Deflate history is 32KB max.  No need to supply more
> +	 * than 32KB on a resume.
> +	 */
> +	history_len = (total_out > window_max) ? window_max : total_out;
> +
> +	/* To estimate expected expansion in the next NX job; 500 means 50%.
> +	 * Deflate best case is around 1 to 1000.
> +	 */
> +	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
> +			  / ((uint64_t)tpbc + 1);
> +	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
> +	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
> +		      last_comp_ratio, source_sz, spbc, tpbc));
> +
> +	resuming = 1;
> +
> +finish_state:
> +
> +	NXPRT(fprintf(stderr, "finish_state:\n"));
> +
> +	if (is_final) {
> +		if (used_out)
> +			goto write_state; /* More data to write out */
> +		else if (used_in < 8) {
> +			/* Need at least 8 more bytes containing gzip crc
> +			 * and isize.
> +			 */
> +			rc = -1;
> +			goto err4;
> +		} else {
> +			/* Compare checksums and exit */
> +			int i;
> +			char tail[8];
> +			uint32_t cksum, isize;
> +			for (i = 0; i < 8; i++)
> +				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
> +			fprintf(stderr, "computed checksum %08x isize %08x\n",
> +				cmdp->cpb.out_crc, (uint32_t) (total_out
> +				% (1ULL<<32)));
> +			cksum = (tail[0] | tail[1]<<8 | tail[2]<<16
> +				| tail[3]<<24);
> +			isize = (tail[4] | tail[5]<<8 | tail[6]<<16
> +				| tail[7]<<24);
> +			fprintf(stderr, "stored   checksum %08x isize %08x\n",
> +				cksum, isize);
> +
> +			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
> +			    (total_out % (1ULL<<32))) {
> +				rc = 0;	goto ok1;
> +			} else {
> +				rc = -1; goto err4;
> +			}
> +		}
> +	} else
> +		goto read_state;
> +
> +	return -1;
> +
> +err1:
> +	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
> +		expect, c);
> +	return -1;
> +
> +err2:
> +	fprintf(stderr, "error: the FLG byte is wrong or not handled by this \
> +		code sample\n");
> +	return -1;
> +
> +err3:
> +	fprintf(stderr, "error: gzip header\n");
> +	return -1;
> +
> +err4:
> +	fprintf(stderr, "error: checksum\n");
> +
> +err5:
> +ok1:
> +	fprintf(stderr, "decomp is complete: fclose\n");
> +	fclose(outf);
> +
> +	return rc;
> +}
> +
> +
> +int main(int argc, char **argv)
> +{
> +	int rc;
> +	struct sigaction act;
> +	void *handle;
> +
> +	act.sa_handler = 0;
> +	act.sa_sigaction = sigsegv_handler;
> +	act.sa_flags = SA_SIGINFO;
> +	act.sa_restorer = 0;
> +	sigemptyset(&act.sa_mask);
> +	sigaction(SIGSEGV, &act, NULL);
> +
> +	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
> +	if (!handle) {
> +		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
> +		exit(-1);
> +	}
> +
> +	rc = decompress_file(argc, argv, handle);
> +
> +	nx_function_end(handle);
> +
> +	return rc;
> +}
> -- 
> 2.21.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests
  2020-03-16 18:07 ` [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests Raphael Moreira Zinsly
@ 2020-03-18  6:40   ` Daniel Axtens
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18  6:40 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, linuxppc-dev, linux-crypto
  Cc: Raphael Moreira Zinsly, haren, herbert, abali

This is a good readme, the instructions for compiling and testing work.

Reviewed-by: Daniel Axtens <dja@axtens.net>

Regards,
Daniel

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Include a README file with the instructions to use the
> testcases at selftests/powerpc/nx-gzip.
>
> Signed-off-by: Bulent Abali <abali@us.ibm.com>
> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
> ---
>  .../powerpc/nx-gzip/99-nx-gzip.rules          |  1 +
>  .../testing/selftests/powerpc/nx-gzip/README  | 44 +++++++++++++++++++
>  2 files changed, 45 insertions(+)
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/README
>
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
> new file mode 100644
> index 000000000000..5a7118495cb3
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
> @@ -0,0 +1 @@
> +SUBSYSTEM=="nxgzip", KERNEL=="nx-gzip", MODE="0666"
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README
> new file mode 100644
> index 000000000000..ff0c817a65c5
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/README
> @@ -0,0 +1,44 @@
> +Test the nx-gzip function:
> +=========================
> +
> +Verify that the following device exists:
> +  /dev/crypto/nx-gzip
> +If you get a permission error run as sudo or set the device permissions:
> +   sudo chmod go+rw /dev/crypto/nx-gzip
> +However, chmod may not survive across boots. You may create a udev file such
> +as:
> +   /etc/udev/rules.d/99-nx-gzip.rules
> +
> +
> +Then make and run:
> +$ make
> +gcc -O3 -I./inc -o gzfht_test gzfht_test.c gzip_vas.c
> +gcc -O3 -I./inc -o gunz_test gunz_test.c gzip_vas.c
> +
> +
> +Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix:
> +$ ./gzfht_test gzip_vas.c
> +file gzip_vas.c read, 5276 bytes
> +compressed 5276 to 2564 bytes total, crc32 checksum = b937a37d
> +
> +
> +Uncompress the previous output. Output will have a .nx.gunzip suffix:
> +$ ./gunz_test gzip_vas.c.nx.gz
> +gzHeader FLG 0
> +00 00 00 00 04 03
> +gzHeader MTIME, XFL, OS ignored
> +computed checksum b937a37d isize 0000149c
> +stored   checksum b937a37d isize 0000149c
> +decomp is complete: fclose
> +
> +
> +Compare two files:
> +$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c
> +f041cd8581e8d920f79f6ce7f65411be5d026c2a  gzip_vas.c.nx.gz.nx.gunzip
> +f041cd8581e8d920f79f6ce7f65411be5d026c2a  gzip_vas.c
> +
> +
> +Note that the code here is intended for testing the nx-gzip hardware function.
> +It is not intended for demonstrating performance or compression ratio.
> +For more information and source code consider using:
> +https://github.com/libnxz/power-gzip
> -- 
> 2.21.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase
  2020-03-18  6:18   ` Daniel Axtens
@ 2020-03-18 13:08     ` Raphael M Zinsly
  2020-03-18 22:19       ` Daniel Axtens
  0 siblings, 1 reply; 14+ messages in thread
From: Raphael M Zinsly @ 2020-03-18 13:08 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev, linux-crypto; +Cc: haren, herbert, abali

Thanks for the reviews, Daniel. I'll use your testcases and address the
issues you found. I still have some questions below:

On 18/03/2020 03:18, Daniel Axtens wrote:
> Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:
> 
>> Include a decompression testcase for the powerpc NX-GZIP
>> engine.
> 
> I compiled gzip with the AFL++ fuzzer and generated a corpus of tests to
> run against this decompressor. I also fuzzed the decompressor
> directly. I found a few issues. I _think_ they're just in the userspace
> but I'm a bit too early in the process to know.
> 
> I realise this is self-test code but:
> a) it stops me testing more deeply, and
I don't understand what you mean by that. What couldn't you test?

> b) it looks like some of this code is shared with https://github.com/libnxz/power-gzip/
Is that an issue?
This selftest was developed by the same team that develops the userspace
library; the first version of these tests was pushed there so that the
team could review it. It uses some of the library's headers to access
the accelerator, and part of the code from its samples.

Regards,
Raphael

> 
> The issues I've found are:
> 
> 1) In the ERR_NX_DATA_LENGTH case, the decompressor doesn't check that
>     you're making forward progress, so you can provoke it into an
>     infinite loop.
> 
> Here's an _extremely_ ugly fix:
> 
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> index 653de92698cc..236a1f567656 100644
> --- a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
> @@ -343,6 +343,8 @@ int decompress_file(int argc, char **argv, void *devhandle)
>          nx_dde_t dde_out[6] __attribute__((aligned (128)));
>          int pgfault_retries;
>   
> +       int last_first_used = 0;
> +
>          /* when using mmap'ed files */
>          off_t input_file_offset;
>   
> @@ -642,6 +644,11 @@ int decompress_file(int argc, char **argv, void *devhandle)
>          first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
>          last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
>   
> +       if (first_used > 0 && last_first_used > 0) {
> +               assert(first_used != last_first_used);
> +       }
> +       last_first_used = first_used;
> +
>          if (first_used > 0)
>                  nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
>   
> 
> 2) It looks like you can provoke an out-of-bounds write. I've seen both
> infinite loops printing something that seems to come from the file
> content like:
> 
> 57201: Got signal 11 si_code 3, si_addr 0xcacacacacacacac8
> 
> or a less bizarre address like
> 
> 19285: Got signal 11 si_code 1, si_addr 0x7fffcf1b0000
> 
> Depending on the build I've also seen the stack smasher protection fire.
> 
> I don't understand the code well enough to figure out how this comes to
> be just yet.
> 
> I've included a few test cases as attachments. I've preconverted them
> with xxd to avoid anything that might flag suspicious gzip files!
> Decompress them then use `xxd -r attachment testcase.gz` to convert them
> back.
> 
> Regards,
> Daniel
> 
> 
> 
> 
>>
>> Signed-off-by: Bulent Abali <abali@us.ibm.com>
>> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
>> ---
>>   .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
>>   .../selftests/powerpc/nx-gzip/gunz_test.c     | 1058 +++++++++++++++++
>>   2 files changed, 1062 insertions(+), 3 deletions(-)
>>   create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>>
>> diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
>> index ab903f63bbbd..82abc19a49a0 100644
>> --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
>> +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
>> @@ -1,9 +1,9 @@
>>   CC = gcc
>>   CFLAGS = -O3
>>   INC = ./inc
>> -SRC = gzfht_test.c
>> +SRC = gzfht_test.c gunz_test.c
>>   OBJ = $(SRC:.c=.o)
>> -TESTS = gzfht_test
>> +TESTS = gzfht_test gunz_test
>>   EXTRA_SOURCES = gzip_vas.c
>>   
>>   all:	$(TESTS)
>> @@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
>>   
>>   run_tests: $(TESTS)
>>   	./gzfht_test gzip_vas.c
>> +	./gunz_test gzip_vas.c.nx.gz
>>   
>>   clean:
>> -	rm -f $(TESTS) *.o *~ *.gz
>> +	rm -f $(TESTS) *.o *~ *.gz *.gunzip
>> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>> new file mode 100644
>> index 000000000000..653de92698cc
>> --- /dev/null
>> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>> @@ -0,0 +1,1058 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later
>> + *
>> + * P9 gunzip sample code for demonstrating the P9 NX hardware
>> + * interface.  Not intended for productive uses or for performance or
>> + * compression ratio measurements.  Note also that /dev/crypto/gzip,
>> + * VAS and skiboot support are required
>> + *
>> + * Copyright 2020 IBM Corp.
>> + *
>> + * Author: Bulent Abali <abali@us.ibm.com>
>> + *
>> + * https://github.com/libnxz/power-gzip for zlib api and other utils
>> + * Definitions of acronyms used here.  See
>> + * P9 NX Gzip Accelerator User's Manual for details
>> + *
>> + * adler/crc: 32 bit checksums appended to stream tail
>> + * ce:       completion extension
>> + * cpb:      coprocessor parameter block (metadata)
>> + * crb:      coprocessor request block (command)
>> + * csb:      coprocessor status block (status)
>> + * dht:      dynamic huffman table
>> + * dde:      data descriptor element (address, length)
>> + * ddl:      list of ddes
>> + * dh/fh:    dynamic and fixed huffman types
>> + * fc:       coprocessor function code
>> + * histlen:  history/dictionary length
>> + * history:  sliding window of up to 32KB of data
>> + * lzcount:  Deflate LZ symbol counts
>> + * rembytecnt: remaining byte count
>> + * sfbt:     source final block type; last block's type during decomp
>> + * spbc:     source processed byte count
>> + * subc:     source unprocessed bit count
>> + * tebc:     target ending bit count; valid bits in the last byte
>> + * tpbc:     target processed byte count
>> + * vas:      virtual accelerator switch; the user mode interface
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <unistd.h>
>> +#include <stdint.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include <sys/time.h>
>> +#include <sys/fcntl.h>
>> +#include <sys/mman.h>
>> +#include <endian.h>
>> +#include <bits/endian.h>
>> +#include <sys/ioctl.h>
>> +#include <assert.h>
>> +#include <errno.h>
>> +#include <signal.h>
>> +#include "nxu.h"
>> +#include "nx.h"
>> +
>> +int nx_dbg = 0;
>> +FILE *nx_gzip_log = NULL;
>> +
>> +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
>> +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
>> +
>> +#define mb()     asm volatile("sync" ::: "memory")
>> +#define rmb()    asm volatile("lwsync" ::: "memory")
>> +#define wmb()    rmb()
>> +
>> +const int fifo_in_len = 1<<24;
>> +const int fifo_out_len = 1<<24;
>> +const int page_sz = 1<<16;
>> +const int line_sz = 1<<7;
>> +const int window_max = 1<<15;
>> +const int retry_max = 50;
>> +
>> +extern void *nx_fault_storage_address;
>> +extern void *nx_function_begin(int function, int pri);
>> +extern int nx_function_end(void *handle);
>> +
>> +/*
>> + * Fault in pages prior to NX job submission.  wr=1 may be required to
>> + * touch writeable pages.  System zero pages do not fault-in the page as
>> + * intended.  Typically set wr=1 for NX target pages and set wr=0 for
>> + * NX source pages.
>> + *
>> + * buf:      first byte of the buffer to touch
>> + * buf_len:  number of bytes in the buffer
>> + * page_len: distance between two touched bytes
>> + * wr:       when non-zero every touched byte is also written back
>> + *
>> + * One byte is read every page_len bytes starting at buf, and the last
>> + * byte of the buffer is always touched.  Returns 0 on success and -1
>> + * when there is nothing to touch or page_len is not positive.
>> + */
>> +static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
>> +{
>> +	char *begin;
>> +	char *end;
>> +	volatile char t;
>> +
>> +	assert(buf_len >= 0 && !!buf);
>> +
>> +	/* Validate before doing any pointer arithmetic on buf.  A page_len
>> +	 * that is not positive would keep the loop below from ever
>> +	 * reaching the end of the buffer.
>> +	 */
>> +	if (buf_len <= 0 || buf == NULL || page_len <= 0)
>> +		return -1;
>> +
>> +	begin = buf;
>> +	end = begin + buf_len - 1;
>> +
>> +	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
>> +			(void *)(begin + buf_len), buf_len, wr));
>> +
>> +	do {
>> +		/* t is volatile so that the access is really performed */
>> +		t = *begin;
>> +		if (wr)
>> +			*begin = t;
>> +		begin = begin + page_len;
>> +	} while (begin < end);
>> +
>> +	/* When buf_sz is small or buf tail is in another page. */
>> +	t = *end;
>> +	if (wr)
>> +		*end = t;
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * SIGSEGV handler installed by main() with SA_SIGINFO.  It records the
>> + * faulting address in nx_fault_storage_address and reports the signal
>> + * on stderr.
>> + *
>> + * NOTE(review): fprintf() is not on the POSIX list of async-signal-safe
>> + * functions; kept as is because this is diagnostic test code.
>> + */
>> +void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
>> +{
>> +	void *fault_addr = info->si_addr;
>> +
>> +	nx_fault_storage_address = fault_addr;
>> +
>> +	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
>> +		sig, info->si_code, fault_addr);
>> +}
>> +
>> +/*
>> + * Append one (address, len) pair to a dde list and keep the head dde,
>> + * ddl[0], up to date.
>> + *
>> + * While the list holds a single pair, ddl[0] is a direct dde that
>> + * contains the pair itself.  As soon as a second pair is added,
>> + * ddl[0] becomes the indirect (base) dde: its count is the number of
>> + * pairs, its byte count is the sum of all lengths and its address
>> + * points to ddl[1], where the direct ddes are stored back to back.
>> + * See Section 6.4 of the NX-gzip user manual for the DDE description.
>> + *
>> + * Passing addr=NULL and len=0 clears ddl[0].
>> + *
>> + * The caller is responsible for allocating the nx_dde_t array.  If N
>> + * addresses are required in the scatter-gather list, the array must
>> + * have at least N+1 entries; no bound is checked here.
>> + *
>> + * Returns the total number of bytes described by the list.
>> + */
>> +static inline uint32_t nx_append_dde(nx_dde_t *ddl, void *addr, uint32_t len)
>> +{
>> +	uint32_t nr_entries;
>> +	uint32_t total_bytes;
>> +	uint32_t slot;
>> +
>> +	/* (NULL, 0) is the request to reset the head dde */
>> +	if (addr == NULL && len == 0) {
>> +		clearp_dde(ddl);
>> +		return 0;
>> +	}
>> +
>> +	NXPRT(fprintf(stderr, "%d: nx_append_dde addr %p len %x\n", __LINE__,
>> +			addr, len));
>> +
>> +	/* The count is 0 as long as the head is a direct dde */
>> +	nr_entries = getpnn(ddl, dde_count);
>> +	total_bytes = getp32(ddl, ddebc);
>> +
>> +	if (nr_entries == 0 && total_bytes == 0) {
>> +		/* Unused head: store the pair in the head itself */
>> +		putp32(ddl, ddebc, len);
>> +		putp64(ddl, ddead, (uint64_t) addr);
>> +		return len;
>> +	}
>> +
>> +	if (nr_entries == 0) {
>> +		/* The head holds one direct pair.  Move that pair to ddl[1]
>> +		 * before the head is overwritten, then make the head point
>> +		 * to it.
>> +		 */
>> +		ddl[1] = ddl[0];
>> +		putp64(ddl, ddead, (uint64_t) &ddl[1]);
>> +		nr_entries = 1;
>> +	}
>> +
>> +	/* The new pair goes right behind the last direct dde */
>> +	slot = nr_entries + 1;
>> +	clear_dde(ddl[slot]);
>> +	put64(ddl[slot], ddead, (uint64_t) addr);
>> +	put32(ddl[slot], ddebc, len);
>> +
>> +	/* Head: number of direct ddes and byte sum of all of them */
>> +	total_bytes = total_bytes + len;
>> +	putpnn(ddl, dde_count, slot);
>> +	putp32(ddl, ddebc, total_bytes);
>> +
>> +	return total_bytes;
>> +}
>> +
>> +/*
>> + * Touch specified number of pages represented in number bytes
>> + * beginning from the first buffer in a dde list.
>> + * Do not touch the pages past buf_sz-th byte's page.
>> + *
>> + * Set buf_sz = 0 to touch all pages described by the ddep.
>> + *
>> + * ddep:    head dde, either direct or indirect
>> + * buf_sz:  number of bytes to touch, counted from the first buffer
>> + * page_sz: stride passed on to nx_touch_pages()
>> + * wr:      passed on to nx_touch_pages(); non-zero also writes
>> + *
>> + * Returns ERR_NX_OK, or ERR_NX_EXCESSIVE_DDE when the head claims more
>> + * than MAX_DDE_COUNT entries.  Results of nx_touch_pages() are ignored.
>> + */
>> +static int nx_touch_pages_dde(nx_dde_t *ddep, long buf_sz, long page_sz,
>> +				int wr)
>> +{
>> +	uint32_t indirect_count;
>> +	uint32_t buf_len;
>> +	uint32_t i;
>> +	long total;
>> +	long in_limit;
>> +	uint64_t buf_addr;
>> +	nx_dde_t *dde_list;
>> +
>> +	assert(!!ddep);
>> +
>> +	indirect_count = getpnn(ddep, dde_count);
>> +
>> +	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len "
>> +			"0x%lx\n", indirect_count, buf_sz));
>> +
>> +	if (indirect_count == 0) {
>> +		/* Direct dde */
>> +		buf_len = getp32(ddep, ddebc);
>> +		buf_addr = getp64(ddep, ddead);
>> +
>> +		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
>> +				buf_len, (void *)buf_addr));
>> +
>> +		if (buf_sz == 0)
>> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
>> +		else
>> +			nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
>> +					buf_sz), page_sz, wr);
>> +
>> +		return ERR_NX_OK;
>> +	}
>> +
>> +	/* Indirect dde */
>> +	if (indirect_count > MAX_DDE_COUNT)
>> +		return ERR_NX_EXCESSIVE_DDE;
>> +
>> +	/* First address of the list */
>> +	dde_list = (nx_dde_t *) getp64(ddep, ddead);
>> +
>> +	if (buf_sz == 0)
>> +		buf_sz = getp32(ddep, ddebc);
>> +
>> +	total = 0;
>> +	for (i = 0; i < indirect_count; i++) {
>> +		buf_len = get32(dde_list[i], ddebc);
>> +		buf_addr = get64(dde_list[i], ddead);
>> +		total += buf_len;
>> +
>> +		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total "
>> +				"0x%lx\n", buf_len, (void *)buf_addr, total));
>> +
>> +		/* Touching fewer pages than encoded in the ddebc */
>> +		if (total > buf_sz) {
>> +			/* The limit falls inside this buffer.  Only the
>> +			 * leading part that is still below buf_sz has to
>> +			 * be touched: (total - buf_sz) is the excess, so
>> +			 * the wanted part is buf_len minus that excess.
>> +			 */
>> +			in_limit = (long)buf_len - (total - buf_sz);
>> +			buf_len = (in_limit > 0) ? (uint32_t)in_limit : 0;
>> +			if (buf_len > 0)
>> +				nx_touch_pages((void *)buf_addr, buf_len,
>> +					       page_sz, wr);
>> +			NXPRT(fprintf(stderr, "touch loop break len 0x%x "
>> +				      "ddead %p\n", buf_len, (void *)buf_addr));
>> +			break;
>> +		}
>> +		nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
>> +	}
>> +	return ERR_NX_OK;
>> +}
>> +
>> +/*
>> + * Submit one job to NX and wait for its result.
>> + *
>> + * src, dst: head ddes of the source and target scatter gather lists;
>> + *           both are copied into the crb
>> + * cmdp:     crb and cpb; the NX function code and the other job
>> + *           parameters must already be filled in by the caller
>> + * handle:   passed through to nxu_run_job()
>> + *
>> + * Returns the non-zero value of nxu_run_job() when it fails, otherwise
>> + * the completion code found in the csb (CC Table 6-8).
>> + */
>> +static int nx_submit_job(nx_dde_t *src, nx_dde_t *dst, nx_gzip_crb_cpb_t *cmdp,
>> +			 void *handle)
>> +{
>> +	uint64_t csb_ptr;
>> +	int status;
>> +
>> +	/* The status block must not carry anything from a previous job */
>> +	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
>> +
>> +	cmdp->crb.source_dde = *src;
>> +	cmdp->crb.target_dde = *dst;
>> +
>> +	/* Status, output byte count in tpbc */
>> +	csb_ptr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
>> +	put64(cmdp->crb, csb_address, csb_ptr);
>> +
>> +	/* NX reports input bytes in spbc; cleared */
>> +	cmdp->cpb.out_spbc_decomp = 0;
>> +	cmdp->cpb.out_spbc_comp_with_count = 0;
>> +	cmdp->cpb.out_spbc_comp_wrap = 0;
>> +
>> +	/* Clear output */
>> +	put32(cmdp->cpb, out_crc, INIT_CRC);
>> +	put32(cmdp->cpb, out_adler, INIT_ADLER);
>> +
>> +	status = nxu_run_job(cmdp, handle);
>> +	if (status != 0)
>> +		return status;
>> +
>> +	/* The job ran: hand back what NX stored in the csb */
>> +	return getnn(cmdp->crb.csb, csb_cc);
>> +}
>> +
>> +/* fifo queue management
>> + *
>> + * A fifo is a circular byte buffer described by three integers:
>> + *   cur  - offset of the first used byte
>> + *   used - number of used bytes, starting at cur and possibly wrapping
>> + *   len  - capacity of the buffer
>> + * The region that starts at cur + used (free) or at cur (used) and runs
>> + * towards the end of the buffer is called "first"; the region that
>> + * wrapped around and begins near offset 0 is called "last".
>> + */
>> +#define fifo_used_bytes(used) (used)
>> +#define fifo_free_bytes(used, len) ((len)-(used))
>> +/* amount of free bytes in the first and last parts; when the used bytes
>> + * wrap around, the first free part is empty and all free bytes are in
>> + * the last part
>> + */
>> +#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
>> +						  ? (len)-((cur)+(used)) : 0)
>> +#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
>> +						  ? (cur) : (len)-(used))
>> +/* amount of used bytes in the first and last parts; the last part is
>> + * empty unless the used bytes wrap around
>> + */
>> +#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
>> +						  ? (used) : (len)-(cur))
>> +#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
>> +						  ? 0 : ((used)+(cur))-(len))
>> +/* first and last free parts start here; the last free part begins
>> + * right behind the used bytes that wrapped around
>> + */
>> +#define fifo_free_first_offset(cur, used)      ((cur)+(used))
>> +#define fifo_free_last_offset(cur, used, len)  \
>> +					   fifo_used_last_bytes(cur, used, len)
>> +/* first and last used parts start here; a wrapped used part always
>> + * begins at offset 0
>> + */
>> +#define fifo_used_first_offset(cur)            (cur)
>> +#define fifo_used_last_offset(cur)             (0)
>> +
>> +/*
>> + * Decompress one gzip stream with the NX-GZIP accelerator.
>> + *
>> + * argc, argv: command line as received by main().  Without a file
>> + *             argument the stream is read from stdin and written to
>> + *             stdout.  With one argument that file is read and
>> + *             "<basename>.nx.gunzip" is created in the current
>> + *             directory.
>> + * devhandle:  handle passed to nx_submit_job() with every NX request.
>> + *
>> + * Returns 0 when the stream was decompressed and the stored CRC32 and
>> + * ISIZE match the computed values, -1 otherwise.
>> + *
>> + * The body is a goto driven state machine:
>> + *   read_state    - refill fifo_in from the input file
>> + *   write_state   - drain fifo_out to the output file
>> + *   decomp_state  - build the dde lists and submit one NX job
>> + *   offsets_state - account for consumed input and produced output
>> + *   finish_state  - loop again, or verify the gzip trailer and exit
>> + */
>> +int decompress_file(int argc, char **argv, void *devhandle)
>> +{
>> +	FILE *inpf = NULL;
>> +	FILE *outf = NULL;
>> +
>> +	int c, expect, i, cc, rc = 0;
>> +	char gzfname[1024];
>> +
>> +	/* Queuing, file ops, byte counting */
>> +	char *fifo_in = NULL, *fifo_out = NULL;
>> +	char *fifo_out_base = NULL; /* pointer to hand back to free() */
>> +	int used_in, cur_in, used_out, cur_out, read_sz, n;
>> +	int first_free, last_free, first_used, last_used;
>> +	int first_offset, last_offset;
>> +	int write_sz, free_space, source_sz;
>> +	int source_sz_estimate, target_sz_estimate;
>> +	int reduced_sz;
>> +	uint64_t last_comp_ratio; /* 1000 max */
>> +	uint64_t total_out;
>> +	int is_final, is_eof;
>> +	int stalled_jobs = 0; /* consecutive jobs without any progress */
>> +
>> +	/* nx hardware */
>> +	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
>> +	int history_len = 0;
>> +	nx_gzip_crb_cpb_t cmd, *cmdp;
>> +	nx_dde_t *ddl_in;
>> +	nx_dde_t dde_in[6] __attribute__((aligned (128)));
>> +	nx_dde_t *ddl_out;
>> +	nx_dde_t dde_out[6] __attribute__((aligned (128)));
>> +	int pgfault_retries;
>> +
>> +	/* when using mmap'ed files */
>> +	off_t input_file_offset = 0;
>> +
>> +	if (argc > 2) {
>> +		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
>> +		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
>> +		return -1;
>> +	}
>> +
>> +	if (argc == 2) {
>> +		char w[1024];
>> +		char *wp;
>> +		int len;
>> +
>> +		inpf = fopen(argv[1], "r");
>> +		if (inpf == NULL) {
>> +			perror(argv[1]);
>> +			return -1;
>> +		}
>> +
>> +		/* Make a new file name to write to.  Ignoring '.gz'.
>> +		 * Only the part behind the last '/' is used.
>> +		 */
>> +		wp = strrchr(argv[1], '/');
>> +		wp = (wp != NULL) ? wp + 1 : argv[1];
>> +		len = snprintf(w, sizeof(w), "%s.nx.gunzip", wp);
>> +		if (len < 0 || (size_t)len >= sizeof(w)) {
>> +			fprintf(stderr, "error: file name too long\n");
>> +			fclose(inpf);
>> +			return -1;
>> +		}
>> +
>> +		outf = fopen(w, "w");
>> +		if (outf == NULL) {
>> +			perror(w);
>> +			fclose(inpf);
>> +			return -1;
>> +		}
>> +	} else {
>> +		inpf = stdin;
>> +		outf = stdout;
>> +	}
>> +
>> +#define GETINPC(X) fgetc(X)
>> +
>> +	/* Decode the gzip header */
>> +	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
>> +	if (c != expect)
>> +		goto err1;
>> +
>> +	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
>> +	if (c != expect)
>> +		goto err1;
>> +
>> +	c = GETINPC(inpf); expect = 0x08; /* CM */
>> +	if (c != expect)
>> +		goto err1;
>> +
>> +	/* FLG: the reserved bits and FEXTRA are not handled.  EOF (-1)
>> +	 * has all bits set and is rejected here as well.
>> +	 */
>> +	int flg = GETINPC(inpf); /* FLG */
>> +	if (flg & 0b11100000 || flg & 0b100)
>> +		goto err2;
>> +
>> +	fprintf(stderr, "gzHeader FLG %x\n", flg);
>> +
>> +	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
>> +	 * sample code.  The byte stays in an int so that EOF can be
>> +	 * told apart from the data byte 0xff.
>> +	 */
>> +	for (i = 0; i < 6; i++) {
>> +		c = GETINPC(inpf);
>> +		if (c == EOF)
>> +			goto err3;
>> +		fprintf(stderr, "%02x ", c);
>> +		if (i == 5)
>> +			fprintf(stderr, "\n");
>> +	}
>> +	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
>> +
>> +	/* FNAME */
>> +	if (flg & 0b1000) {
>> +		size_t k = 0;
>> +
>> +		do {
>> +			c = GETINPC(inpf);
>> +			if (c == EOF)
>> +				goto err3;
>> +			/* Keep what fits; a longer name is truncated but
>> +			 * still consumed up to its terminating zero.
>> +			 */
>> +			if (k < sizeof(gzfname) - 1)
>> +				gzfname[k++] = c;
>> +		} while (c);
>> +		gzfname[k] = '\0';
>> +		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
>> +	}
>> +
>> +	/* FCOMMENT: zero terminated string that follows FNAME; skipped */
>> +	if (flg & 0b10000) {
>> +		do {
>> +			c = GETINPC(inpf);
>> +			if (c == EOF)
>> +				goto err3;
>> +		} while (c);
>> +		fprintf(stderr, "gzHeader FCOMMENT: ignored\n");
>> +	}
>> +
>> +	/* FHCRC */
>> +	if (flg & 0b10) {
>> +		for (i = 0; i < 2; i++) {
>> +			c = GETINPC(inpf);
>> +			if (c == EOF)
>> +				goto err3;
>> +		}
>> +		fprintf(stderr, "gzHeader FHCRC: ignored\n");
>> +	}
>> +
>> +	used_in = cur_in = used_out = cur_out = 0;
>> +	is_final = is_eof = 0;
>> +
>> +	/* Allocate one page larger to prevent page faults due to NX
>> +	 * overfetching.
>> +	 * Either do this (char*)(uintptr_t)aligned_alloc or use
>> +	 * -std=c11 flag to make the int-to-pointer warning go away.
>> +	 * The calls are not placed inside assert() because they would
>> +	 * disappear when NDEBUG is defined.
>> +	 */
>> +	fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz,
>> +				   fifo_in_len + page_sz);
>> +	fifo_out_base = (char *)(uintptr_t)aligned_alloc(line_sz,
>> +				   fifo_out_len + page_sz + line_sz);
>> +	if (fifo_in == NULL || fifo_out_base == NULL) {
>> +		fprintf(stderr, "error: cannot allocate fifo buffers\n");
>> +		rc = -1;
>> +		goto cleanup;
>> +	}
>> +	/* Leave unused space due to history rounding rules */
>> +	fifo_out = fifo_out_base + line_sz;
>> +	nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
>> +
>> +	ddl_in  = &dde_in[0];
>> +	ddl_out = &dde_out[0];
>> +	cmdp = &cmd;
>> +	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
>> +
>> +read_state:
>> +
>> +	/* Read from .gz file */
>> +
>> +	NXPRT(fprintf(stderr, "read_state:\n"));
>> +
>> +	if (is_eof != 0)
>> +		goto write_state;
>> +
>> +	/* We read in to fifo_in in two steps: first: read in to from
>> +	 * cur_in to the end of the buffer.  last: if free space wrapped
>> +	 * around, read from fifo_in offset 0 to offset cur_in.
>> +	 */
>> +
>> +	/* Reset fifo head to reduce unnecessary wrap arounds */
>> +	cur_in = (used_in == 0) ? 0 : cur_in;
>> +
>> +	/* Free space total is reduced by a gap */
>> +	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
>> +			    - line_sz);
>> +
>> +	/* Free space may wrap around as first and last */
>> +	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
>> +	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
>> +
>> +	/* Start offsets of the free memory */
>> +	first_offset = fifo_free_first_offset(cur_in, used_in);
>> +	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
>> +
>> +	/* Reduce read_sz because of the line_sz gap */
>> +	read_sz = NX_MIN(free_space, first_free);
>> +	n = 0;
>> +	if (read_sz > 0) {
>> +		/* Read in to offset cur_in + used_in */
>> +		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
>> +		used_in = used_in + n;
>> +		free_space = free_space - n;
>> +		assert(n <= read_sz);
>> +		if (n != read_sz) {
>> +			/* Either EOF or error; exit the read loop */
>> +			is_eof = 1;
>> +			goto write_state;
>> +		}
>> +	}
>> +
>> +	/* If free space wrapped around */
>> +	if (last_free > 0) {
>> +		/* Reduce read_sz because of the line_sz gap */
>> +		read_sz = NX_MIN(free_space, last_free);
>> +		n = 0;
>> +		if (read_sz > 0) {
>> +			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
>> +			used_in = used_in + n;       /* Increase used space */
>> +			free_space = free_space - n; /* Decrease free space */
>> +			assert(n <= read_sz);
>> +			if (n != read_sz) {
>> +				/* Either EOF or error; exit the read loop */
>> +				is_eof = 1;
>> +				goto write_state;
>> +			}
>> +		}
>> +	}
>> +
>> +	/* At this point we have used_in bytes in fifo_in with the
>> +	 * data head starting at cur_in and possibly wrapping around.
>> +	 */
>> +
>> +write_state:
>> +
>> +	/* Write decompressed data to output file */
>> +
>> +	NXPRT(fprintf(stderr, "write_state:\n"));
>> +
>> +	if (used_out == 0)
>> +		goto decomp_state;
>> +
>> +	/* If fifo_out has data waiting, write it out to the file to
>> +	 * make free target space for the accelerator used bytes in
>> +	 * the first and last parts of fifo_out.
>> +	 */
>> +
>> +	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
>> +	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
>> +
>> +	write_sz = first_used;
>> +
>> +	n = 0;
>> +	if (write_sz > 0) {
>> +		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
>> +		used_out = used_out - n;
>> +		/* Move head of the fifo */
>> +		cur_out = (cur_out + n) % fifo_out_len;
>> +		assert(n <= write_sz);
>> +		if (n != write_sz) {
>> +			fprintf(stderr, "error: write\n");
>> +			rc = -1;
>> +			goto err5;
>> +		}
>> +	}
>> +
>> +	if (last_used > 0) { /* If more data available in the last part */
>> +		write_sz = last_used; /* Keep it here for later */
>> +		n = 0;
>> +		if (write_sz > 0) {
>> +			n = fwrite(fifo_out, 1, write_sz, outf);
>> +			used_out = used_out - n;
>> +			cur_out = (cur_out + n) % fifo_out_len;
>> +			assert(n <= write_sz);
>> +			if (n != write_sz) {
>> +				fprintf(stderr, "error: write\n");
>> +				rc = -1;
>> +				goto err5;
>> +			}
>> +		}
>> +	}
>> +
>> +decomp_state:
>> +
>> +	/* NX decompresses input data */
>> +
>> +	NXPRT(fprintf(stderr, "decomp_state:\n"));
>> +
>> +	if (is_final)
>> +		goto finish_state;
>> +
>> +	/* Address/len lists */
>> +	clearp_dde(ddl_in);
>> +	clearp_dde(ddl_out);
>> +
>> +	/* FC, CRC, HistLen, Table 6-6 */
>> +	if (resuming) {
>> +		/* Resuming a partially decompressed input.
>> +		 * The key to resume is supplying the 32KB
>> +		 * dictionary (history) to NX, which is basically
>> +		 * the last 32KB of output produced.
>> +		 */
>> +		fc = GZIP_FC_DECOMPRESS_RESUME;
>> +
>> +		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
>> +		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
>> +
>> +		/* Round up the history size to quadword.  Section 2.10 */
>> +		history_len = (history_len + 15) / 16;
>> +		putnn(cmdp->cpb, in_histlen, history_len);
>> +		history_len = history_len * 16; /* bytes */
>> +
>> +		if (history_len > 0) {
>> +			/* Chain in the history buffer to the DDE list */
>> +			if (cur_out >= history_len) {
>> +				nx_append_dde(ddl_in, fifo_out
>> +					      + (cur_out - history_len),
>> +					      history_len);
>> +			} else {
>> +				nx_append_dde(ddl_in, fifo_out
>> +					      + ((fifo_out_len + cur_out)
>> +					      - history_len),
>> +					      history_len - cur_out);
>> +				/* Up to 32KB history wraps around fifo_out */
>> +				nx_append_dde(ddl_in, fifo_out, cur_out);
>> +			}
>> +
>> +		}
>> +	} else {
>> +		/* First decompress job */
>> +		fc = GZIP_FC_DECOMPRESS;
>> +
>> +		history_len = 0;
>> +		/* Writing 0 clears out subc as well */
>> +		cmdp->cpb.in_histlen = 0;
>> +		total_out = 0;
>> +
>> +		put32(cmdp->cpb, in_crc, INIT_CRC);
>> +		put32(cmdp->cpb, in_adler, INIT_ADLER);
>> +		put32(cmdp->cpb, out_crc, INIT_CRC);
>> +		put32(cmdp->cpb, out_adler, INIT_ADLER);
>> +
>> +		/* Assuming 10% compression ratio initially; use the
>> +		 * most recently measured compression ratio as a
>> +		 * heuristic to estimate the input and output
>> +		 * sizes.  If we give too much input, the target buffer
>> +		 * overflows and NX cycles are wasted, and then we
>> +		 * must retry with smaller input size.  1000 is 100%.
>> +		 */
>> +		last_comp_ratio = 100UL;
>> +	}
>> +	cmdp->crb.gzip_fc = 0;
>> +	putnn(cmdp->crb, gzip_fc, fc);
>> +
>> +	/*
>> +	 * NX source buffers
>> +	 */
>> +	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
>> +	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
>> +
>> +	if (first_used > 0)
>> +		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
>> +
>> +	if (last_used > 0)
>> +		nx_append_dde(ddl_in, fifo_in, last_used);
>> +
>> +	/*
>> +	 * NX target buffers
>> +	 */
>> +	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
>> +	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
>> +
>> +	/* Reduce output free space amount not to overwrite the history */
>> +	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
>> +				- (1<<16));
>> +
>> +	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
>> +		      target_max));
>> +
>> +	first_free = NX_MIN(target_max, first_free);
>> +	if (first_free > 0) {
>> +		first_offset = fifo_free_first_offset(cur_out, used_out);
>> +		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
>> +	}
>> +
>> +	if (last_free > 0) {
>> +		last_free = NX_MIN(target_max - first_free, last_free);
>> +		if (last_free > 0) {
>> +			last_offset = fifo_free_last_offset(cur_out, used_out,
>> +							    fifo_out_len);
>> +			nx_append_dde(ddl_out, fifo_out + last_offset,
>> +				      last_free);
>> +		}
>> +	}
>> +
>> +	/* Target buffer size is used to limit the source data size
>> +	 * based on previous measurements of compression ratio.
>> +	 */
>> +
>> +	/* source_sz includes history */
>> +	source_sz = getp32(ddl_in, ddebc);
>> +	assert(source_sz > history_len);
>> +	source_sz = source_sz - history_len;
>> +
>> +	/* Estimating how much source is needed to 3/4 fill a
>> +	 * target_max size target buffer.  If we overshoot, then NX
>> +	 * must repeat the job with smaller input and we waste
>> +	 * bandwidth.  If we undershoot then we use more NX calls than
>> +	 * necessary.
>> +	 */
>> +
>> +	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
>> +				/ 4000;
>> +
>> +	if (source_sz_estimate < source_sz) {
>> +		/* Target might be small, therefore limiting the
>> +		 * source data.
>> +		 */
>> +		source_sz = source_sz_estimate;
>> +		target_sz_estimate = target_max;
>> +	} else {
>> +		/* Source file might be small, therefore limiting target
>> +		 * touch pages to a smaller value to save processor cycles.
>> +		 */
>> +		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
>> +					/ (last_comp_ratio + 1);
>> +		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
>> +					    target_max);
>> +	}
>> +
>> +	source_sz = source_sz + history_len;
>> +
>> +	/* Some NX condition codes require submitting the NX job again.
>> +	 * Kernel doesn't handle NX page faults. Expects user code to
>> +	 * touch pages.
>> +	 */
>> +	pgfault_retries = retry_max;
>> +
>> +restart_nx:
>> +
>> +	putp32(ddl_in, ddebc, source_sz);
>> +
>> +	/* Fault in pages */
>> +	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
>> +	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
>> +
>> +	/* Send job to NX */
>> +	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
>> +
>> +	switch (cc) {
>> +
>> +	case ERR_NX_TRANSLATION:
>> +
>> +		/* We touched the pages ahead of time.  In the most common case
>> +		 * we shouldn't be here.  But may be some pages were paged out.
>> +		 * Kernel should have placed the faulting address to fsaddr.
>> +		 */
>> +		NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
>> +			      (void *)cmdp->crb.csb.fsaddr));
>> +
>> +		/* Touch 1 byte, read-only  */
>> +		nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
>> +
>> +		if (pgfault_retries == retry_max) {
>> +			/* Try once with exact number of pages */
>> +			--pgfault_retries;
>> +			goto restart_nx;
>> +		} else if (pgfault_retries > 0) {
>> +			/* If still faulting try fewer input pages
>> +			 * assuming memory outage
>> +			 */
>> +			if (source_sz > page_sz)
>> +				source_sz = NX_MAX(source_sz / 2, page_sz);
>> +			--pgfault_retries;
>> +			goto restart_nx;
>> +		} else {
>> +			fprintf(stderr, "cannot make progress; too many page "
>> +				"fault retries cc= %d\n", cc);
>> +			rc = -1;
>> +			goto err5;
>> +		}
>> +
>> +	case ERR_NX_DATA_LENGTH:
>> +
>> +		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; not an error "
>> +			      "usually; stream may have trailing data\n"));
>> +
>> +		/* Not an error in the most common case; it just says
>> +		 * there is trailing data that we must examine.
>> +		 *
>> +		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
>> +		 * Fig.6-7 and Table 6-8.
>> +		 */
>> +		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
>> +
>> +		if (!csb_ce_termination(nx_ce) &&
>> +		    csb_ce_partial_completion(nx_ce)) {
>> +			/* Check CPB for more information
>> +			 * spbc and tpbc are valid
>> +			 */
>> +			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
>> +			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
>> +			spbc = get32(cmdp->cpb, out_spbc_decomp);
>> +			tpbc = get32(cmdp->crb.csb, tpbc);
>> +			assert(target_max >= tpbc);
>> +
>> +			goto ok_cc3; /* not an error */
>> +		} else {
>> +			/* History length error when CE(1)=1 CE(0)=0. */
>> +			rc = -1;
>> +			fprintf(stderr, "history length error cc= %d\n", cc);
>> +			goto err5;
>> +		}
>> +
>> +	case ERR_NX_TARGET_SPACE:
>> +
>> +		/* Target buffer not large enough; retry smaller input
>> +		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
>> +		 */
>> +		assert(source_sz > history_len);
>> +		reduced_sz = ((source_sz - history_len + 2) / 2) + history_len;
>> +		if (reduced_sz >= source_sz) {
>> +			/* The halving above no longer shrinks the input
>> +			 * (1 or 2 bytes left); the same job would be
>> +			 * submitted again and again.
>> +			 */
>> +			fprintf(stderr, "cannot make progress; target space "
>> +				"too small cc= %d\n", cc);
>> +			rc = -1;
>> +			goto err5;
>> +		}
>> +		source_sz = reduced_sz;
>> +		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "
>> +			      "smaller input data src %d hist %d\n", source_sz,
>> +			      history_len));
>> +		goto restart_nx;
>> +
>> +	case ERR_NX_OK:
>> +
>> +		/* This should not happen for gzip formatted data;
>> +		 * we need trailing crc and isize
>> +		 */
>> +		fprintf(stderr, "ERR_NX_OK\n");
>> +		spbc = get32(cmdp->cpb, out_spbc_decomp);
>> +		tpbc = get32(cmdp->crb.csb, tpbc);
>> +		assert(target_max >= tpbc);
>> +		assert(spbc >= history_len);
>> +		source_sz = spbc - history_len;
>> +		goto offsets_state;
>> +
>> +	default:
>> +		fprintf(stderr, "error: cc= %d\n", cc);
>> +		rc = -1;
>> +		goto err5;
>> +	}
>> +
>> +ok_cc3:
>> +
>> +	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
>> +
>> +	assert(spbc > history_len);
>> +	source_sz = spbc - history_len;
>> +
>> +	/* Table 6-4: Source Final Block Type (SFBT) describes the
>> +	 * last processed deflate block and clues the software how to
>> +	 * resume the next job.  SUBC indicates how many input bits NX
>> +	 * consumed but did not process.  SPBC indicates how many
>> +	 * bytes of source were given to the accelerator including
>> +	 * history bytes.
>> +	 */
>> +
>> +	switch (sfbt) {
>> +		int dhtlen;
>> +
>> +	case 0b0000: /* Deflate final EOB received */
>> +
>> +		/* Calculating the checksum start position. */
>> +
>> +		source_sz = source_sz - subc / 8;
>> +		is_final = 1;
>> +		break;
>> +
>> +		/* Resume decompression cases are below. Basically
>> +		 * indicates where NX has suspended and how to resume
>> +		 * the input stream.
>> +		 */
>> +
>> +	case 0b1000: /* Within a literal block; use rembytecount */
>> +	case 0b1001: /* Within a literal block; use rembytecount; bfinal=1 */
>> +
>> +		/* Supply the partially processed source byte again */
>> +		source_sz = source_sz - ((subc + 7) / 8);
>> +
>> +		/* SUBC LS 3bits: number of bits in the first source byte need
>> +		 * to be processed.
>> +		 * 000 means all 8 bits;  Table 6-3
>> +		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
>> +		 */
>> +		cmdp->cpb.in_subc = 0;
>> +		cmdp->cpb.in_sfbt = 0;
>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>> +		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
>> +						      out_rembytecnt));
>> +		break;
>> +
>> +	case 0b1010: /* Within a FH block; */
>> +	case 0b1011: /* Within a FH block; bfinal=1 */
>> +
>> +		source_sz = source_sz - ((subc + 7) / 8);
>> +
>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>> +		cmdp->cpb.in_subc = 0;
>> +		cmdp->cpb.in_sfbt = 0;
>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>> +		break;
>> +
>> +	case 0b1100: /* Within a DH block; */
>> +	case 0b1101: /* Within a DH block; bfinal=1 */
>> +
>> +		source_sz = source_sz - ((subc + 7) / 8);
>> +
>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>> +		cmdp->cpb.in_subc = 0;
>> +		cmdp->cpb.in_sfbt = 0;
>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>> +
>> +		dhtlen = getnn(cmdp->cpb, out_dhtlen);
>> +		putnn(cmdp->cpb, in_dhtlen, dhtlen);
>> +		assert(dhtlen >= 42);
>> +
>> +		/* Round up to a qword */
>> +		dhtlen = (dhtlen + 127) / 128;
>> +
>> +		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
>> +			--dhtlen;
>> +			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
>> +		}
>> +		break;
>> +
>> +	case 0b1110: /* Within a block header; bfinal=0; */
>> +		     /* Also given if source data exactly ends (SUBC=0) with
>> +		      * EOB code with BFINAL=0.  Means the next byte will
>> +		      * contain a block header.
>> +		      */
>> +	case 0b1111: /* within a block header with BFINAL=1. */
>> +
>> +		source_sz = source_sz - ((subc + 7) / 8);
>> +
>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>> +		cmdp->cpb.in_subc = 0;
>> +		cmdp->cpb.in_sfbt = 0;
>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>> +		break;
>> +
>> +	default:
>> +		/* Other values are left alone, as before */
>> +		break;
>> +	}
>> +
>> +offsets_state:
>> +
>> +	/* Adjust the source and target buffer offsets and lengths  */
>> +
>> +	NXPRT(fprintf(stderr, "offsets_state:\n"));
>> +
>> +	/* Forward progress check.  Once the input file is exhausted the
>> +	 * fifo content cannot change any more, so a job that neither
>> +	 * consumed source bytes nor produced output would be repeated
>> +	 * forever.  Give up after retry_max such jobs in a row.
>> +	 */
>> +	if (!is_final && source_sz <= 0 && tpbc == 0) {
>> +		if (is_eof && ++stalled_jobs > retry_max) {
>> +			fprintf(stderr, "cannot make progress; input is "
>> +				"truncated or corrupt\n");
>> +			rc = -1;
>> +			goto err5;
>> +		}
>> +	} else {
>> +		stalled_jobs = 0;
>> +	}
>> +
>> +	/* Delete input data from fifo_in */
>> +	used_in = used_in - source_sz;
>> +	cur_in = (cur_in + source_sz) % fifo_in_len;
>> +	input_file_offset = input_file_offset + source_sz;
>> +
>> +	/* Add output data to fifo_out */
>> +	used_out = used_out + tpbc;
>> +
>> +	assert(used_out <= fifo_out_len);
>> +
>> +	total_out = total_out + tpbc;
>> +
>> +	/* Deflate history is 32KB max.  No need to supply more
>> +	 * than 32KB on a resume.
>> +	 */
>> +	history_len = (total_out > window_max) ? window_max : total_out;
>> +
>> +	/* To estimate expected expansion in the next NX job; 500 means 50%.
>> +	 * Deflate best case is around 1 to 1000.
>> +	 */
>> +	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
>> +			  / ((uint64_t)tpbc + 1);
>> +	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
>> +	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
>> +		      last_comp_ratio, source_sz, spbc, tpbc));
>> +
>> +	resuming = 1;
>> +
>> +finish_state:
>> +
>> +	NXPRT(fprintf(stderr, "finish_state:\n"));
>> +
>> +	if (is_final) {
>> +		if (used_out)
>> +			goto write_state; /* More data to write out */
>> +		else if (used_in < 8) {
>> +			/* Need at least 8 more bytes containing gzip crc
>> +			 * and isize.
>> +			 */
>> +			rc = -1;
>> +			goto err4;
>> +		} else {
>> +			/* Compare checksums and exit.  The trailer bytes
>> +			 * are kept unsigned and widened before shifting so
>> +			 * that values above 0x7f are not sign extended.
>> +			 */
>> +			unsigned char tail[8];
>> +			uint32_t cksum, isize;
>> +
>> +			for (i = 0; i < 8; i++)
>> +				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
>> +			fprintf(stderr, "computed checksum %08x isize %08x\n",
>> +				cmdp->cpb.out_crc, (uint32_t) (total_out
>> +				% (1ULL<<32)));
>> +			cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
>> +				| (uint32_t) tail[2]<<16
>> +				| (uint32_t) tail[3]<<24);
>> +			isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
>> +				| (uint32_t) tail[6]<<16
>> +				| (uint32_t) tail[7]<<24);
>> +			fprintf(stderr, "stored   checksum %08x isize %08x\n",
>> +				cksum, isize);
>> +
>> +			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
>> +			    (total_out % (1ULL<<32))) {
>> +				rc = 0;	goto ok1;
>> +			} else {
>> +				rc = -1; goto err4;
>> +			}
>> +		}
>> +	} else
>> +		goto read_state;
>> +
>> +err1:
>> +	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
>> +		expect, c);
>> +	rc = -1;
>> +	goto cleanup;
>> +
>> +err2:
>> +	fprintf(stderr, "error: the FLG byte is wrong or not handled by this "
>> +		"code sample\n");
>> +	rc = -1;
>> +	goto cleanup;
>> +
>> +err3:
>> +	fprintf(stderr, "error: gzip header\n");
>> +	rc = -1;
>> +	goto cleanup;
>> +
>> +err4:
>> +	fprintf(stderr, "error: checksum\n");
>> +
>> +err5:
>> +ok1:
>> +	fprintf(stderr, "decomp is complete: fclose\n");
>> +
>> +cleanup:
>> +	/* Every exit taken after the files were opened ends up here */
>> +	free(fifo_in);
>> +	free(fifo_out_base);
>> +	if (inpf != NULL && inpf != stdin)
>> +		fclose(inpf);
>> +	/* fclose() flushes; a failure means output data was lost */
>> +	if (outf != NULL && fclose(outf) != 0 && rc == 0) {
>> +		perror("fclose");
>> +		rc = -1;
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +
>> +/*
>> + * Test entry point: installs the SIGSEGV handler, opens the NX gzip
>> + * function, decompresses the input named on the command line (or
>> + * stdin) and closes the NX function again.
>> + *
>> + * Returns the result of decompress_file(); exits with -1 when the
>> + * signal handler cannot be installed or NX cannot be initialized.
>> + */
>> +int main(int argc, char **argv)
>> +{
>> +	int rc;
>> +	struct sigaction act;
>> +	void *handle;
>> +
>> +	/* Zero the whole structure so that no member is left
>> +	 * uninitialized, whatever members the platform defines.
>> +	 */
>> +	memset(&act, 0, sizeof(act));
>> +	act.sa_sigaction = sigsegv_handler;
>> +	act.sa_flags = SA_SIGINFO;
>> +	sigemptyset(&act.sa_mask);
>> +	if (sigaction(SIGSEGV, &act, NULL) != 0) {
>> +		perror("sigaction");
>> +		exit(-1);
>> +	}
>> +
>> +	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
>> +	if (!handle) {
>> +		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
>> +		exit(-1);
>> +	}
>> +
>> +	rc = decompress_file(argc, argv, handle);
>> +
>> +	nx_function_end(handle);
>> +
>> +	return rc;
>> +}
>> -- 
>> 2.21.0

-- 
Raphael Moreira Zinsly
IBM
Linux on Power Toolchain

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase
  2020-03-18 13:08     ` Raphael M Zinsly
@ 2020-03-18 22:19       ` Daniel Axtens
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18 22:19 UTC (permalink / raw)
  To: Raphael M Zinsly, linuxppc-dev, linux-crypto; +Cc: haren, herbert, abali

Raphael M Zinsly <rzinsly@linux.ibm.com> writes:

> Thanks for the reviews Daniel, I'll use your testcases and address the 
> issues you found, I still have some questions below:
>
> On 18/03/2020 03:18, Daniel Axtens wrote:
>> Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:
>> 
>>> Include a decompression testcase for the powerpc NX-GZIP
>>> engine.
>> 
>> I compiled gzip with the AFL++ fuzzer and generated a corpus of tests to
>> run against this decompressor. I also fuzzed the decompressor
>> directly. I found a few issues. I _think_ they're just in the userspace
>> but I'm a bit too early in the process to know.
>> 
>> I realise this is self-test code but:
>> a) it stops me testing more deeply, and
> I don't understand what you mean by that; what couldn't you test?

I'm trying to stress-test the accelerator by fuzzing it. If it hangs
with an infinite loop rather than cleanly exiting, that inhibits my
ability to stress-test it.

>> b) it looks like some of this code is shared with https://github.com/libnxz/power-gzip/
> Is that an issue?
> This selftest was developed by the same team that developed the userspace
> library; the first version of these tests was pushed there in order for
> the team to review. It uses some of the headers of the library to access
> the accelerator and part of the code in the samples.

What I mean is that if there's a bug in code we copied, we should also
report it to the developers of the library so that it gets fixed in both
places.

Regards,
Daniel

> Regards,
> Raphael
>
>> 
>> The issues I've found are:
>> 
>> 1) In the ERR_NX_DATA_LENGTH case, the decompressor doesn't check that
>>     you're making forward progress, so you can provoke it into an
>>     infinite loop.
>> 
>> Here's an _extremely_ ugly fix:
>> 
>> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>> index 653de92698cc..236a1f567656 100644
>> --- a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>> @@ -343,6 +343,8 @@ int decompress_file(int argc, char **argv, void *devhandle)
>>          nx_dde_t dde_out[6] __attribute__((aligned (128)));
>>          int pgfault_retries;
>>   
>> +       int last_first_used = 0;
>> +
>>          /* when using mmap'ed files */
>>          off_t input_file_offset;
>>   
>> @@ -642,6 +644,11 @@ int decompress_file(int argc, char **argv, void *devhandle)
>>          first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
>>          last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
>>   
>> +       if (first_used > 0 && last_first_used > 0) {
>> +               assert(first_used != last_first_used);
>> +       }
>> +       last_first_used = first_used;
>> +
>>          if (first_used > 0)
>>                  nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
>>   
>> 
>> 2) It looks like you can provoke an out-of-bounds write. I've seen both
>> infinite loops printing something that seems to come from the file
>> content like:
>> 
>> 57201: Got signal 11 si_code 3, si_addr 0xcacacacacacacac8
>> 
>> or a less bizarre address like
>> 
>> 19285: Got signal 11 si_code 1, si_addr 0x7fffcf1b0000
>> 
>> Depending on the build I've also seen the stack smasher protection fire.
>> 
>> I don't understand the code well enough to figure out how this comes to
>> be just yet.
>> 
>> I've included a few test cases as attachments. I've preconverted them
>> with xxd to avoid anything that might flag suspicious gzip files!
>> Decompress them then use `xxd -r attachment testcase.gz` to convert them
>> back.
>> 
>> Regards,
>> Daniel
>> 
>> 
>> 
>> 
>>>
>>> Signed-off-by: Bulent Abali <abali@us.ibm.com>
>>> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
>>> ---
>>>   .../selftests/powerpc/nx-gzip/Makefile        |    7 +-
>>>   .../selftests/powerpc/nx-gzip/gunz_test.c     | 1058 +++++++++++++++++
>>>   2 files changed, 1062 insertions(+), 3 deletions(-)
>>>   create mode 100644 tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>>>
>>> diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
>>> index ab903f63bbbd..82abc19a49a0 100644
>>> --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile
>>> +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
>>> @@ -1,9 +1,9 @@
>>>   CC = gcc
>>>   CFLAGS = -O3
>>>   INC = ./inc
>>> -SRC = gzfht_test.c
>>> +SRC = gzfht_test.c gunz_test.c
>>>   OBJ = $(SRC:.c=.o)
>>> -TESTS = gzfht_test
>>> +TESTS = gzfht_test gunz_test
>>>   EXTRA_SOURCES = gzip_vas.c
>>>   
>>>   all:	$(TESTS)
>>> @@ -16,6 +16,7 @@ $(TESTS): $(OBJ)
>>>   
>>>   run_tests: $(TESTS)
>>>   	./gzfht_test gzip_vas.c
>>> +	./gunz_test gzip_vas.c.nx.gz
>>>   
>>>   clean:
>>> -	rm -f $(TESTS) *.o *~ *.gz
>>> +	rm -f $(TESTS) *.o *~ *.gz *.gunzip
>>> diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>>> new file mode 100644
>>> index 000000000000..653de92698cc
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
>>> @@ -0,0 +1,1058 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-or-later
>>> + *
>>> + * P9 gunzip sample code for demonstrating the P9 NX hardware
>>> + * interface.  Not intended for productive uses or for performance or
>>> + * compression ratio measurements.  Note also that /dev/crypto/gzip,
>>> + * VAS and skiboot support are required
>>> + *
>>> + * Copyright 2020 IBM Corp.
>>> + *
>>> + * Author: Bulent Abali <abali@us.ibm.com>
>>> + *
>>> + * https://github.com/libnxz/power-gzip for zlib api and other utils
>>> + * Definitions of acronyms used here.  See
>>> + * P9 NX Gzip Accelerator User's Manual for details
>>> + *
>>> + * adler/crc: 32 bit checksums appended to stream tail
>>> + * ce:       completion extension
>>> + * cpb:      coprocessor parameter block (metadata)
>>> + * crb:      coprocessor request block (command)
>>> + * csb:      coprocessor status block (status)
>>> + * dht:      dynamic huffman table
>>> + * dde:      data descriptor element (address, length)
>>> + * ddl:      list of ddes
>>> + * dh/fh:    dynamic and fixed huffman types
>>> + * fc:       coprocessor function code
>>> + * histlen:  history/dictionary length
>>> + * history:  sliding window of up to 32KB of data
>>> + * lzcount:  Deflate LZ symbol counts
>>> + * rembytecnt: remaining byte count
>>> + * sfbt:     source final block type; last block's type during decomp
>>> + * spbc:     source processed byte count
>>> + * subc:     source unprocessed bit count
>>> + * tebc:     target ending bit count; valid bits in the last byte
>>> + * tpbc:     target processed byte count
>>> + * vas:      virtual accelerator switch; the user mode interface
>>> + */
>>> +
>>> +#include <stdio.h>
>>> +#include <stdlib.h>
>>> +#include <string.h>
>>> +#include <unistd.h>
>>> +#include <stdint.h>
>>> +#include <sys/types.h>
>>> +#include <sys/stat.h>
>>> +#include <sys/time.h>
>>> +#include <sys/fcntl.h>
>>> +#include <sys/mman.h>
>>> +#include <endian.h>
>>> +#include <bits/endian.h>
>>> +#include <sys/ioctl.h>
>>> +#include <assert.h>
>>> +#include <errno.h>
>>> +#include <signal.h>
>>> +#include "nxu.h"
>>> +#include "nx.h"
>>> +
>>> +int nx_dbg = 0;
>>> +FILE *nx_gzip_log = NULL;
>>> +
>>> +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
>>> +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
>>> +
>>> +#define mb()     asm volatile("sync" ::: "memory")
>>> +#define rmb()    asm volatile("lwsync" ::: "memory")
>>> +#define wmb()    rmb()
>>> +
>>> +const int fifo_in_len = 1<<24;
>>> +const int fifo_out_len = 1<<24;
>>> +const int page_sz = 1<<16;
>>> +const int line_sz = 1<<7;
>>> +const int window_max = 1<<15;
>>> +const int retry_max = 50;
>>> +
>>> +extern void *nx_fault_storage_address;
>>> +extern void *nx_function_begin(int function, int pri);
>>> +extern int nx_function_end(void *handle);
>>> +
>>> +/*
>>> + * Fault in pages prior to NX job submission.  wr=1 may be required to
>>> + * touch writeable pages.  System zero pages do not fault-in the page as
>>> + * intended.  Typically set wr=1 for NX target pages and set wr=0 for
>>> + * NX source pages.
>>> + */
>>> +static int nx_touch_pages(void *buf, long buf_len, long page_len, int wr)
>>> +{
>>> +	char *begin = buf;
>>> +	char *end = (char *) buf + buf_len - 1;
>>> +	volatile char t;
>>> +
>>> +	assert(buf_len >= 0 && !!buf);
>>> +
>>> +	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
>>> +			buf + buf_len, buf_len, wr));
>>> +
>>> +	if (buf_len <= 0 || buf == NULL)
>>> +		return -1;
>>> +
>>> +	do {
>>> +		t = *begin;
>>> +		if (wr)
>>> +			*begin = t;
>>> +		begin = begin + page_len;
>>> +	} while (begin < end);
>>> +
>>> +	/* When buf_sz is small or buf tail is in another page. */
>>> +	t = *end;
>>> +	if (wr)
>>> +		*end = t;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +void sigsegv_handler(int sig, siginfo_t *info, void *ctx)
>>> +{
>>> +	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
>>> +	       sig, info->si_code, info->si_addr);
>>> +
>>> +	nx_fault_storage_address = info->si_addr;
>>> +}
>>> +
>>> +/*
>>> + * Adds an (address, len) pair to the list of ddes (ddl) and updates
>>> + * the base dde.  ddl[0] is the only dde in a direct dde which
>>> + * contains a single (addr,len) pair.  For more pairs, ddl[0] becomes
>>> + * the indirect (base) dde that points to a list of direct ddes.
>>> + * See Section 6.4 of the NX-gzip user manual for DDE description.
>>> + * Addr=NULL, len=0 clears the ddl[0].  Returns the total number of
>>> + * bytes in ddl.  Caller is responsible for allocating the array of
>>> + * nx_dde_t *ddl.  If N addresses are required in the scatter-gather
>>> + * list, the ddl array must have N+1 entries minimum.
>>> + */
>>> +static inline uint32_t nx_append_dde(nx_dde_t *ddl, void *addr, uint32_t len)
>>> +{
>>> +	uint32_t ddecnt;
>>> +	uint32_t bytes;
>>> +
>>> +	if (addr == NULL && len == 0) {
>>> +		clearp_dde(ddl);
>>> +		return 0;
>>> +	}
>>> +
>>> +	NXPRT(fprintf(stderr, "%d: nx_append_dde addr %p len %x\n", __LINE__,
>>> +			addr, len));
>>> +
>>> +	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
>>> +	ddecnt = getpnn(ddl, dde_count);
>>> +	bytes = getp32(ddl, ddebc);
>>> +
>>> +	if (ddecnt == 0 && bytes == 0) {
>>> +		/* First dde is unused; make it a direct dde */
>>> +		bytes = len;
>>> +		putp32(ddl, ddebc, bytes);
>>> +		putp64(ddl, ddead, (uint64_t) addr);
>>> +	} else if (ddecnt == 0) {
>>> +		/* Converting direct to indirect dde
>>> +		 * ddl[0] becomes head dde of ddl
>>> +		 * copy direct to indirect first.
>>> +		 */
>>> +		ddl[1] = ddl[0];
>>> +
>>> +		/* Add the new dde next */
>>> +		clear_dde(ddl[2]);
>>> +		put32(ddl[2], ddebc, len);
>>> +		put64(ddl[2], ddead, (uint64_t) addr);
>>> +
>>> +		/* Ddl head points to 2 direct ddes */
>>> +		ddecnt = 2;
>>> +		putpnn(ddl, dde_count, ddecnt);
>>> +		bytes = bytes + len;
>>> +		putp32(ddl, ddebc, bytes);
>>> +		/* Pointer to the first direct dde */
>>> +		putp64(ddl, ddead, (uint64_t) &ddl[1]);
>>> +	} else {
>>> +		/* Append a dde to an existing indirect ddl */
>>> +		++ddecnt;
>>> +		clear_dde(ddl[ddecnt]);
>>> +		put64(ddl[ddecnt], ddead, (uint64_t) addr);
>>> +		put32(ddl[ddecnt], ddebc, len);
>>> +
>>> +		putpnn(ddl, dde_count, ddecnt);
>>> +		bytes = bytes + len;
>>> +		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
>>> +	}
>>> +	return bytes;
>>> +}
>>> +
>>> +/*
>>> + * Touch specified number of pages represented in number bytes
>>> + * beginning from the first buffer in a dde list.
>>> + * Do not touch the pages past buf_sz-th byte's page.
>>> + *
>>> + * Set buf_sz = 0 to touch all pages described by the ddep.
>>> + */
>>> +static int nx_touch_pages_dde(nx_dde_t *ddep, long buf_sz, long page_sz,
>>> +				int wr)
>>> +{
>>> +	uint32_t indirect_count;
>>> +	uint32_t buf_len;
>>> +	long total;
>>> +	uint64_t buf_addr;
>>> +	nx_dde_t *dde_list;
>>> +	int i;
>>> +
>>> +	assert(!!ddep);
>>> +
>>> +	indirect_count = getpnn(ddep, dde_count);
>>> +
>>> +	NXPRT(fprintf(stderr, "nx_touch_pages_dde dde_count %d request len \
>>> +			0x%lx\n", indirect_count, buf_sz));
>>> +
>>> +	if (indirect_count == 0) {
>>> +		/* Direct dde */
>>> +		buf_len = getp32(ddep, ddebc);
>>> +		buf_addr = getp64(ddep, ddead);
>>> +
>>> +		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
>>> +				buf_len, (void *)buf_addr));
>>> +
>>> +		if (buf_sz == 0)
>>> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
>>> +		else
>>> +			nx_touch_pages((void *)buf_addr, NX_MIN(buf_len,
>>> +					buf_sz), page_sz, wr);
>>> +
>>> +		return ERR_NX_OK;
>>> +	}
>>> +
>>> +	/* Indirect dde */
>>> +	if (indirect_count > MAX_DDE_COUNT)
>>> +		return ERR_NX_EXCESSIVE_DDE;
>>> +
>>> +	/* First address of the list */
>>> +	dde_list = (nx_dde_t *) getp64(ddep, ddead);
>>> +
>>> +	if (buf_sz == 0)
>>> +		buf_sz = getp32(ddep, ddebc);
>>> +
>>> +	total = 0;
>>> +	for (i = 0; i < indirect_count; i++) {
>>> +		buf_len = get32(dde_list[i], ddebc);
>>> +		buf_addr = get64(dde_list[i], ddead);
>>> +		total += buf_len;
>>> +
>>> +		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total \
>>> +				0x%lx\n", buf_len, (void *)buf_addr, total));
>>> +
>>> +		/* Touching fewer pages than encoded in the ddebc */
>>> +		if (total > buf_sz) {
>>> +			buf_len = NX_MIN(buf_len, total - buf_sz);
>>> +			nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
>>> +			NXPRT(fprintf(stderr, "touch loop break len 0x%x \
>>> +				      ddead %p\n", buf_len, (void *)buf_addr));
>>> +			break;
>>> +		}
>>> +		nx_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
>>> +	}
>>> +	return ERR_NX_OK;
>>> +}
>>> +
>>> +/*
>>> + * Src and dst buffers are supplied in scatter gather lists.
>>> + * NX function code and other parameters supplied in cmdp.
>>> + */
>>> +static int nx_submit_job(nx_dde_t *src, nx_dde_t *dst, nx_gzip_crb_cpb_t *cmdp,
>>> +			 void *handle)
>>> +{
>>> +	int cc;
>>> +	uint64_t csbaddr;
>>> +
>>> +	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
>>> +
>>> +	cmdp->crb.source_dde = *src;
>>> +	cmdp->crb.target_dde = *dst;
>>> +
>>> +	/* Status, output byte count in tpbc */
>>> +	csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
>>> +	put64(cmdp->crb, csb_address, csbaddr);
>>> +
>>> +	/* NX reports input bytes in spbc; cleared */
>>> +	cmdp->cpb.out_spbc_comp_wrap = 0;
>>> +	cmdp->cpb.out_spbc_comp_with_count = 0;
>>> +	cmdp->cpb.out_spbc_decomp = 0;
>>> +
>>> +	/* Clear output */
>>> +	put32(cmdp->cpb, out_crc, INIT_CRC);
>>> +	put32(cmdp->cpb, out_adler, INIT_ADLER);
>>> +
>>> +	cc = nxu_run_job(cmdp, handle);
>>> +
>>> +	if (!cc)
>>> +		cc = getnn(cmdp->crb.csb, csb_cc);	/* CC Table 6-8 */
>>> +
>>> +	return cc;
>>> +}
>>> +
>>> +/* fifo queue management */
>>> +#define fifo_used_bytes(used) (used)
>>> +#define fifo_free_bytes(used, len) ((len)-(used))
>>> +/* amount of free bytes in the first and last parts */
>>> +#define fifo_free_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
>>> +						  ? (len)-((cur)+(used)) : 0)
>>> +#define fifo_free_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
>>> +						  ? (cur) : (len)-(used))
>>> +/* amount of used bytes in the first and last parts */
>>> +#define fifo_used_first_bytes(cur, used, len)  ((((cur)+(used)) <= (len)) \
>>> +						  ? (used) : (len)-(cur))
>>> +#define fifo_used_last_bytes(cur, used, len)   ((((cur)+(used)) <= (len)) \
>>> +						  ? 0 : ((used)+(cur))-(len))
>>> +/* first and last free parts start here */
>>> +#define fifo_free_first_offset(cur, used)      ((cur)+(used))
>>> +#define fifo_free_last_offset(cur, used, len)  \
>>> +					   fifo_used_last_bytes(cur, used, len)
>>> +/* first and last used parts start here */
>>> +#define fifo_used_first_offset(cur)            (cur)
>>> +#define fifo_used_last_offset(cur)             (0)
>>> +
>>> +int decompress_file(int argc, char **argv, void *devhandle)
>>> +{
>>> +	FILE *inpf;
>>> +	FILE *outf;
>>> +
>>> +	int c, expect, i, cc, rc = 0;
>>> +	char gzfname[1024];
>>> +
>>> +	/* Queuing, file ops, byte counting */
>>> +	char *fifo_in, *fifo_out;
>>> +	int used_in, cur_in, used_out, cur_out, read_sz, n;
>>> +	int first_free, last_free, first_used, last_used;
>>> +	int first_offset, last_offset;
>>> +	int write_sz, free_space, source_sz;
>>> +	int source_sz_estimate, target_sz_estimate;
>>> +	uint64_t last_comp_ratio; /* 1000 max */
>>> +	uint64_t total_out;
>>> +	int is_final, is_eof;
>>> +
>>> +	/* nx hardware */
>>> +	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
>>> +	int history_len = 0;
>>> +	nx_gzip_crb_cpb_t cmd, *cmdp;
>>> +	nx_dde_t *ddl_in;
>>> +	nx_dde_t dde_in[6] __attribute__((aligned (128)));
>>> +	nx_dde_t *ddl_out;
>>> +	nx_dde_t dde_out[6] __attribute__((aligned (128)));
>>> +	int pgfault_retries;
>>> +
>>> +	/* when using mmap'ed files */
>>> +	off_t input_file_offset;
>>> +
>>> +	if (argc > 2) {
>>> +		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
>>> +		fprintf(stderr, "    writes to stdout or <fname>.nx.gunzip\n");
>>> +		return -1;
>>> +	}
>>> +
>>> +	if (argc == 1) {
>>> +		inpf = stdin;
>>> +		outf = stdout;
>>> +	} else if (argc == 2) {
>>> +		char w[1024];
>>> +		char *wp;
>>> +		inpf = fopen(argv[1], "r");
>>> +		if (inpf == NULL) {
>>> +			perror(argv[1]);
>>> +			return -1;
>>> +		}
>>> +
>>> +		/* Make a new file name to write to.  Ignoring '.gz' */
>>> +		wp = (NULL != (wp = strrchr(argv[1], '/'))) ? ++wp : argv[1];
>>> +		strcpy(w, wp);
>>> +		strcat(w, ".nx.gunzip");
>>> +
>>> +		outf = fopen(w, "w");
>>> +		if (outf == NULL) {
>>> +			perror(w);
>>> +			return -1;
>>> +		}
>>> +	}
>>> +
>>> +#define GETINPC(X) fgetc(X)
>>> +
>>> +	/* Decode the gzip header */
>>> +	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
>>> +	if (c != expect)
>>> +		goto err1;
>>> +
>>> +	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
>>> +	if (c != expect)
>>> +		goto err1;
>>> +
>>> +	c = GETINPC(inpf); expect = 0x08; /* CM */
>>> +	if (c != expect)
>>> +		goto err1;
>>> +
>>> +	int flg = GETINPC(inpf); /* FLG */
>>> +	if (flg & 0b11100000 || flg & 0b100)
>>> +		goto err2;
>>> +
>>> +	fprintf(stderr, "gzHeader FLG %x\n", flg);
>>> +
>>> +	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
>>> +	 * sample code.
>>> +	 */
>>> +	for (i = 0; i < 6; i++) {
>>> +		char tmp[10];
>>> +		if (EOF == (tmp[i] = GETINPC(inpf)))
>>> +			goto err3;
>>> +		fprintf(stderr, "%02x ", tmp[i]);
>>> +		if (i == 5)
>>> +			fprintf(stderr, "\n");
>>> +	}
>>> +	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
>>> +
>>> +	/* FNAME */
>>> +	if (flg & 0b1000) {
>>> +		int k = 0;
>>> +		do {
>>> +			if (EOF == (c = GETINPC(inpf)))
>>> +				goto err3;
>>> +			gzfname[k++] = c;
>>> +		} while (c);
>>> +		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
>>> +	}
>>> +
>>> +	/* FHCRC */
>>> +	if (flg & 0b10) {
>>> +		c = GETINPC(inpf); c = GETINPC(inpf);
>>> +		fprintf(stderr, "gzHeader FHCRC: ignored\n");
>>> +	}
>>> +
>>> +	used_in = cur_in = used_out = cur_out = 0;
>>> +	is_final = is_eof = 0;
>>> +
>>> +	/* Allocate one page larger to prevent page faults due to NX
>>> +	 * overfetching.
>>> +	 * Either do this (char*)(uintptr_t)aligned_alloc or use
>>> +	 * -std=c11 flag to make the int-to-pointer warning go away.
>>> +	 */
>>> +	assert((fifo_in  = (char *)(uintptr_t)aligned_alloc(line_sz,
>>> +				   fifo_in_len + page_sz)) != NULL);
>>> +	assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz,
>>> +				   fifo_out_len + page_sz + line_sz)) != NULL);
>>> +	/* Leave unused space due to history rounding rules */
>>> +	fifo_out = fifo_out + line_sz;
>>> +	nx_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
>>> +
>>> +	ddl_in  = &dde_in[0];
>>> +	ddl_out = &dde_out[0];
>>> +	cmdp = &cmd;
>>> +	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
>>> +
>>> +read_state:
>>> +
>>> +	/* Read from .gz file */
>>> +
>>> +	NXPRT(fprintf(stderr, "read_state:\n"));
>>> +
>>> +	if (is_eof != 0)
>>> +		goto write_state;
>>> +
>>> +	/* We read in to fifo_in in two steps: first: read in to from
>>> +	 * cur_in to the end of the buffer.  last: if free space wrapped
>>> +	 * around, read from fifo_in offset 0 to offset cur_in.
>>> +	 */
>>> +
>>> +	/* Reset fifo head to reduce unnecessary wrap arounds */
>>> +	cur_in = (used_in == 0) ? 0 : cur_in;
>>> +
>>> +	/* Free space total is reduced by a gap */
>>> +	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
>>> +			    - line_sz);
>>> +
>>> +	/* Free space may wrap around as first and last */
>>> +	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
>>> +	last_free  = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
>>> +
>>> +	/* Start offsets of the free memory */
>>> +	first_offset = fifo_free_first_offset(cur_in, used_in);
>>> +	last_offset  = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
>>> +
>>> +	/* Reduce read_sz because of the line_sz gap */
>>> +	read_sz = NX_MIN(free_space, first_free);
>>> +	n = 0;
>>> +	if (read_sz > 0) {
>>> +		/* Read in to offset cur_in + used_in */
>>> +		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
>>> +		used_in = used_in + n;
>>> +		free_space = free_space - n;
>>> +		assert(n <= read_sz);
>>> +		if (n != read_sz) {
>>> +			/* Either EOF or error; exit the read loop */
>>> +			is_eof = 1;
>>> +			goto write_state;
>>> +		}
>>> +	}
>>> +
>>> +	/* If free space wrapped around */
>>> +	if (last_free > 0) {
>>> +		/* Reduce read_sz because of the line_sz gap */
>>> +		read_sz = NX_MIN(free_space, last_free);
>>> +		n = 0;
>>> +		if (read_sz > 0) {
>>> +			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
>>> +			used_in = used_in + n;       /* Increase used space */
>>> +			free_space = free_space - n; /* Decrease free space */
>>> +			assert(n <= read_sz);
>>> +			if (n != read_sz) {
>>> +				/* Either EOF or error; exit the read loop */
>>> +				is_eof = 1;
>>> +				goto write_state;
>>> +			}
>>> +		}
>>> +	}
>>> +
>>> +	/* At this point we have used_in bytes in fifo_in with the
>>> +	 * data head starting at cur_in and possibly wrapping around.
>>> +	 */
>>> +
>>> +write_state:
>>> +
>>> +	/* Write decompressed data to output file */
>>> +
>>> +	NXPRT(fprintf(stderr, "write_state:\n"));
>>> +
>>> +	if (used_out == 0)
>>> +		goto decomp_state;
>>> +
>>> +	/* If fifo_out has data waiting, write it out to the file to
>>> +	 * make free target space for the accelerator used bytes in
>>> +	 * the first and last parts of fifo_out.
>>> +	 */
>>> +
>>> +	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
>>> +	last_used  = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
>>> +
>>> +	write_sz = first_used;
>>> +
>>> +	n = 0;
>>> +	if (write_sz > 0) {
>>> +		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
>>> +		used_out = used_out - n;
>>> +		/* Move head of the fifo */
>>> +		cur_out = (cur_out + n) % fifo_out_len;
>>> +		assert(n <= write_sz);
>>> +		if (n != write_sz) {
>>> +			fprintf(stderr, "error: write\n");
>>> +			rc = -1;
>>> +			goto err5;
>>> +		}
>>> +	}
>>> +
>>> +	if (last_used > 0) { /* If more data available in the last part */
>>> +		write_sz = last_used; /* Keep it here for later */
>>> +		n = 0;
>>> +		if (write_sz > 0) {
>>> +			n = fwrite(fifo_out, 1, write_sz, outf);
>>> +			used_out = used_out - n;
>>> +			cur_out = (cur_out + n) % fifo_out_len;
>>> +			assert(n <= write_sz);
>>> +			if (n != write_sz) {
>>> +				fprintf(stderr, "error: write\n");
>>> +				rc = -1;
>>> +				goto err5;
>>> +			}
>>> +		}
>>> +	}
>>> +
>>> +decomp_state:
>>> +
>>> +	/* NX decompresses input data */
>>> +
>>> +	NXPRT(fprintf(stderr, "decomp_state:\n"));
>>> +
>>> +	if (is_final)
>>> +		goto finish_state;
>>> +
>>> +	/* Address/len lists */
>>> +	clearp_dde(ddl_in);
>>> +	clearp_dde(ddl_out);
>>> +
>>> +	/* FC, CRC, HistLen, Table 6-6 */
>>> +	if (resuming) {
>>> +		/* Resuming a partially decompressed input.
>>> +		 * The key to resume is supplying the 32KB
>>> +		 * dictionary (history) to NX, which is basically
>>> +		 * the last 32KB of output produced.
>>> +		 */
>>> +		fc = GZIP_FC_DECOMPRESS_RESUME;
>>> +
>>> +		cmdp->cpb.in_crc   = cmdp->cpb.out_crc;
>>> +		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
>>> +
>>> +		/* Round up the history size to quadword.  Section 2.10 */
>>> +		history_len = (history_len + 15) / 16;
>>> +		putnn(cmdp->cpb, in_histlen, history_len);
>>> +		history_len = history_len * 16; /* bytes */
>>> +
>>> +		if (history_len > 0) {
>>> +			/* Chain in the history buffer to the DDE list */
>>> +			if (cur_out >= history_len) {
>>> +				nx_append_dde(ddl_in, fifo_out
>>> +					      + (cur_out - history_len),
>>> +					      history_len);
>>> +			} else {
>>> +				nx_append_dde(ddl_in, fifo_out
>>> +					      + ((fifo_out_len + cur_out)
>>> +					      - history_len),
>>> +					      history_len - cur_out);
>>> +				/* Up to 32KB history wraps around fifo_out */
>>> +				nx_append_dde(ddl_in, fifo_out, cur_out);
>>> +			}
>>> +
>>> +		}
>>> +	} else {
>>> +		/* First decompress job */
>>> +		fc = GZIP_FC_DECOMPRESS;
>>> +
>>> +		history_len = 0;
>>> +		/* Writing 0 clears out subc as well */
>>> +		cmdp->cpb.in_histlen = 0;
>>> +		total_out = 0;
>>> +
>>> +		put32(cmdp->cpb, in_crc, INIT_CRC);
>>> +		put32(cmdp->cpb, in_adler, INIT_ADLER);
>>> +		put32(cmdp->cpb, out_crc, INIT_CRC);
>>> +		put32(cmdp->cpb, out_adler, INIT_ADLER);
>>> +
>>> +		/* Assuming 10% compression ratio initially; use the
>>> +		 * most recently measured compression ratio as a
>>> +		 * heuristic to estimate the input and output
>>> +		 * sizes.  If we give too much input, the target buffer
>>> +		 * overflows and NX cycles are wasted, and then we
>>> +		 * must retry with smaller input size.  1000 is 100%.
>>> +		 */
>>> +		last_comp_ratio = 100UL;
>>> +	}
>>> +	cmdp->crb.gzip_fc = 0;
>>> +	putnn(cmdp->crb, gzip_fc, fc);
>>> +
>>> +	/*
>>> +	 * NX source buffers
>>> +	 */
>>> +	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
>>> +	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
>>> +
>>> +	if (first_used > 0)
>>> +		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
>>> +
>>> +	if (last_used > 0)
>>> +		nx_append_dde(ddl_in, fifo_in, last_used);
>>> +
>>> +	/*
>>> +	 * NX target buffers
>>> +	 */
>>> +	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
>>> +	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
>>> +
>>> +	/* Reduce output free space amount not to overwrite the history */
>>> +	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
>>> +				- (1<<16));
>>> +
>>> +	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
>>> +		      target_max));
>>> +
>>> +	first_free = NX_MIN(target_max, first_free);
>>> +	if (first_free > 0) {
>>> +		first_offset = fifo_free_first_offset(cur_out, used_out);
>>> +		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
>>> +	}
>>> +
>>> +	if (last_free > 0) {
>>> +		last_free = NX_MIN(target_max - first_free, last_free);
>>> +		if (last_free > 0) {
>>> +			last_offset = fifo_free_last_offset(cur_out, used_out,
>>> +							    fifo_out_len);
>>> +			nx_append_dde(ddl_out, fifo_out + last_offset,
>>> +				      last_free);
>>> +		}
>>> +	}
>>> +
>>> +	/* Target buffer size is used to limit the source data size
>>> +	 * based on previous measurements of compression ratio.
>>> +	 */
>>> +
>>> +	/* source_sz includes history */
>>> +	source_sz = getp32(ddl_in, ddebc);
>>> +	assert(source_sz > history_len);
>>> +	source_sz = source_sz - history_len;
>>> +
>>> +	/* Estimating how much source is needed to 3/4 fill a
>>> +	 * target_max size target buffer.  If we overshoot, then NX
>>> +	 * must repeat the job with smaller input and we waste
>>> +	 * bandwidth.  If we undershoot then we use more NX calls than
>>> +	 * necessary.
>>> +	 */
>>> +
>>> +	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
>>> +				/ 4000;
>>> +
>>> +	if (source_sz_estimate < source_sz) {
>>> +		/* Target might be small, therefore limiting the
>>> +		 * source data.
>>> +		 */
>>> +		source_sz = source_sz_estimate;
>>> +		target_sz_estimate = target_max;
>>> +	} else {
>>> +		/* Source file might be small, therefore limiting target
>>> +		 * touch pages to a smaller value to save processor cycles.
>>> +		 */
>>> +		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
>>> +					/ (last_comp_ratio + 1);
>>> +		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
>>> +					    target_max);
>>> +	}
>>> +
>>> +	source_sz = source_sz + history_len;
>>> +
>>> +	/* Some NX condition codes require submitting the NX job again.
>>> +	 * Kernel doesn't handle NX page faults. Expects user code to
>>> +	 * touch pages.
>>> +	 */
>>> +	pgfault_retries = retry_max;
>>> +
>>> +restart_nx:
>>> +
>>> +	putp32(ddl_in, ddebc, source_sz);
>>> +
>>> +	/* Fault in pages */
>>> +	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
>>> +	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
>>> +
>>> +	/* Send job to NX */
>>> +	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
>>> +
>>> +	switch (cc) {
>>> +
>>> +	case ERR_NX_TRANSLATION:
>>> +
>>> +		/* We touched the pages ahead of time.  In the most common case
>>> +		 * we shouldn't be here.  But may be some pages were paged out.
>>> +		 * Kernel should have placed the faulting address to fsaddr.
>>> +		 */
>>> +		NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n",
>>> +			      (void *)cmdp->crb.csb.fsaddr));
>>> +
>>> +		/* Touch 1 byte, read-only  */
>>> +		nx_touch_pages((void *)cmdp->crb.csb.fsaddr, 1, page_sz, 0);
>>> +
>>> +		if (pgfault_retries == retry_max) {
>>> +			/* Try once with exact number of pages */
>>> +			--pgfault_retries;
>>> +			goto restart_nx;
>>> +		} else if (pgfault_retries > 0) {
>>> +			/* If still faulting try fewer input pages
>>> +			 * assuming memory outage
>>> +			 */
>>> +			if (source_sz > page_sz)
>>> +				source_sz = NX_MAX(source_sz / 2, page_sz);
>>> +			--pgfault_retries;
>>> +			goto restart_nx;
>>> +		} else {
>>> +			fprintf(stderr, "cannot make progress; too many page \
>>> +				fault retries cc= %d\n", cc);
>>> +			rc = -1;
>>> +			goto err5;
>>> +		}
>>> +
>>> +	case ERR_NX_DATA_LENGTH:
>>> +
>>> +		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; not an error \
>>> +			      usually; stream may have trailing data\n"));
>>> +
>>> +		/* Not an error in the most common case; it just says
>>> +		 * there is trailing data that we must examine.
>>> +		 *
>>> +		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
>>> +		 * Fig.6-7 and Table 6-8.
>>> +		 */
>>> +		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
>>> +
>>> +		if (!csb_ce_termination(nx_ce) &&
>>> +		    csb_ce_partial_completion(nx_ce)) {
>>> +			/* Check CPB for more information
>>> +			 * spbc and tpbc are valid
>>> +			 */
>>> +			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
>>> +			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
>>> +			spbc = get32(cmdp->cpb, out_spbc_decomp);
>>> +			tpbc = get32(cmdp->crb.csb, tpbc);
>>> +			assert(target_max >= tpbc);
>>> +
>>> +			goto ok_cc3; /* not an error */
>>> +		} else {
>>> +			/* History length error when CE(1)=1 CE(0)=0. */
>>> +			rc = -1;
>>> +			fprintf(stderr, "history length error cc= %d\n", cc);
>>> +			goto err5;
>>> +		}
>>> +
>>> +	case ERR_NX_TARGET_SPACE:
>>> +
>>> +		/* Target buffer not large enough; retry smaller input
>>> +		 * data; give at least 1 byte.  SPBC/TPBC are not valid.
>>> +		 */
>>> +		assert(source_sz > history_len);
>>> +		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
>>> +		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with \
>>> +			      smaller input data src %d hist %d\n", source_sz,
>>> +			      history_len));
>>> +		goto restart_nx;
>>> +
>>> +	case ERR_NX_OK:
>>> +
>>> +		/* This should not happen for gzip formatted data;
>>> +		 * we need trailing crc and isize
>>> +		 */
>>> +		fprintf(stderr, "ERR_NX_OK\n");
>>> +		spbc = get32(cmdp->cpb, out_spbc_decomp);
>>> +		tpbc = get32(cmdp->crb.csb, tpbc);
>>> +		assert(target_max >= tpbc);
>>> +		assert(spbc >= history_len);
>>> +		source_sz = spbc - history_len;
>>> +		goto offsets_state;
>>> +
>>> +	default:
>>> +		fprintf(stderr, "error: cc= %d\n", cc);
>>> +		rc = -1;
>>> +		goto err5;
>>> +	}
>>> +
>>> +ok_cc3:
>>> +
>>> +	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
>>> +
>>> +	assert(spbc > history_len);
>>> +	source_sz = spbc - history_len;
>>> +
>>> +	/* Table 6-4: Source Final Block Type (SFBT) describes the
>>> +	 * last processed deflate block and clues the software how to
>>> +	 * resume the next job.  SUBC indicates how many input bits NX
>>> +	 * consumed but did not process.  SPBC indicates how many
>>> +	 * bytes of source were given to the accelerator including
>>> +	 * history bytes.
>>> +	 */
>>> +
>>> +	switch (sfbt) {
>>> +		int dhtlen;
>>> +
>>> +	case 0b0000: /* Deflate final EOB received */
>>> +
>>> +		/* Calculating the checksum start position. */
>>> +
>>> +		source_sz = source_sz - subc / 8;
>>> +		is_final = 1;
>>> +		break;
>>> +
>>> +		/* Resume decompression cases are below. Basically
>>> +		 * indicates where NX has suspended and how to resume
>>> +		 * the input stream.
>>> +		 */
>>> +
>>> +	case 0b1000: /* Within a literal block; use rembytecount */
>>> +	case 0b1001: /* Within a literal block; use rembytecount; bfinal=1 */
>>> +
>>> +		/* Supply the partially processed source byte again */
>>> +		source_sz = source_sz - ((subc + 7) / 8);
>>> +
>>> +		/* SUBC LS 3bits: number of bits in the first source byte need
>>> +		 * to be processed.
>>> +		 * 000 means all 8 bits;  Table 6-3
>>> +		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
>>> +		 */
>>> +		cmdp->cpb.in_subc = 0;
>>> +		cmdp->cpb.in_sfbt = 0;
>>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>>> +		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
>>> +						      out_rembytecnt));
>>> +		break;
>>> +
>>> +	case 0b1010: /* Within a FH block; */
>>> +	case 0b1011: /* Within a FH block; bfinal=1 */
>>> +
>>> +		source_sz = source_sz - ((subc + 7) / 8);
>>> +
>>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>>> +		cmdp->cpb.in_subc = 0;
>>> +		cmdp->cpb.in_sfbt = 0;
>>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>>> +		break;
>>> +
>>> +	case 0b1100: /* Within a DH block; */
>>> +	case 0b1101: /* Within a DH block; bfinal=1 */
>>> +
>>> +		source_sz = source_sz - ((subc + 7) / 8);
>>> +
>>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>>> +		cmdp->cpb.in_subc = 0;
>>> +		cmdp->cpb.in_sfbt = 0;
>>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>>> +
>>> +		dhtlen = getnn(cmdp->cpb, out_dhtlen);
>>> +		putnn(cmdp->cpb, in_dhtlen, dhtlen);
>>> +		assert(dhtlen >= 42);
>>> +
>>> +		/* Round up to a qword */
>>> +		dhtlen = (dhtlen + 127) / 128;
>>> +
>>> +		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
>>> +			--dhtlen;
>>> +			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
>>> +		}
>>> +		break;
>>> +
>>> +	case 0b1110: /* Within a block header; bfinal=0; */
>>> +		     /* Also given if source data exactly ends (SUBC=0) with
>>> +		      * EOB code with BFINAL=0.  Means the next byte will
>>> +		      * contain a block header.
>>> +		      */
>>> +	case 0b1111: /* within a block header with BFINAL=1. */
>>> +
>>> +		source_sz = source_sz - ((subc + 7) / 8);
>>> +
>>> +		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
>>> +		cmdp->cpb.in_subc = 0;
>>> +		cmdp->cpb.in_sfbt = 0;
>>> +		putnn(cmdp->cpb, in_subc, subc % 8);
>>> +		putnn(cmdp->cpb, in_sfbt, sfbt);
>>> +	}
>>> +
>>> +offsets_state:
>>> +
>>> +	/* Adjust the source and target buffer offsets and lengths  */
>>> +
>>> +	NXPRT(fprintf(stderr, "offsets_state:\n"));
>>> +
>>> +	/* Delete input data from fifo_in */
>>> +	used_in = used_in - source_sz;
>>> +	cur_in = (cur_in + source_sz) % fifo_in_len;
>>> +	input_file_offset = input_file_offset + source_sz;
>>> +
>>> +	/* Add output data to fifo_out */
>>> +	used_out = used_out + tpbc;
>>> +
>>> +	assert(used_out <= fifo_out_len);
>>> +
>>> +	total_out = total_out + tpbc;
>>> +
>>> +	/* Deflate history is 32KB max.  No need to supply more
>>> +	 * than 32KB on a resume.
>>> +	 */
>>> +	history_len = (total_out > window_max) ? window_max : total_out;
>>> +
>>> +	/* To estimate expected expansion in the next NX job; 500 means 50%.
>>> +	 * Deflate best case is around 1 to 1000.
>>> +	 */
>>> +	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
>>> +			  / ((uint64_t)tpbc + 1);
>>> +	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
>>> +	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
>>> +		      last_comp_ratio, source_sz, spbc, tpbc));
>>> +
>>> +	resuming = 1;
>>> +
>>> +finish_state:
>>> +
>>> +	NXPRT(fprintf(stderr, "finish_state:\n"));
>>> +
>>> +	if (is_final) {
>>> +		if (used_out)
>>> +			goto write_state; /* More data to write out */
>>> +		else if (used_in < 8) {
>>> +			/* Need at least 8 more bytes containing gzip crc
>>> +			 * and isize.
>>> +			 */
>>> +			rc = -1;
>>> +			goto err4;
>>> +		} else {
>>> +			/* Compare checksums and exit */
>>> +			int i;
>>> +			char tail[8];
>>> +			uint32_t cksum, isize;
>>> +			for (i = 0; i < 8; i++)
>>> +				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
>>> +			fprintf(stderr, "computed checksum %08x isize %08x\n",
>>> +				cmdp->cpb.out_crc, (uint32_t) (total_out
>>> +				% (1ULL<<32)));
>>> +			cksum = (tail[0] | tail[1]<<8 | tail[2]<<16
>>> +				| tail[3]<<24);
>>> +			isize = (tail[4] | tail[5]<<8 | tail[6]<<16
>>> +				| tail[7]<<24);
>>> +			fprintf(stderr, "stored   checksum %08x isize %08x\n",
>>> +				cksum, isize);
>>> +
>>> +			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
>>> +			    (total_out % (1ULL<<32))) {
>>> +				rc = 0;	goto ok1;
>>> +			} else {
>>> +				rc = -1; goto err4;
>>> +			}
>>> +		}
>>> +	} else
>>> +		goto read_state;
>>> +
>>> +	return -1;
>>> +
>>> +err1:
>>> +	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
>>> +		expect, c);
>>> +	return -1;
>>> +
>>> +err2:
>>> +	fprintf(stderr, "error: the FLG byte is wrong or not handled by this \
>>> +		code sample\n");
>>> +	return -1;
>>> +
>>> +err3:
>>> +	fprintf(stderr, "error: gzip header\n");
>>> +	return -1;
>>> +
>>> +err4:
>>> +	fprintf(stderr, "error: checksum\n");
>>> +
>>> +err5:
>>> +ok1:
>>> +	fprintf(stderr, "decomp is complete: fclose\n");
>>> +	fclose(outf);
>>> +
>>> +	return rc;
>>> +}
>>> +
>>> +
>>> +int main(int argc, char **argv)
>>> +{
>>> +	int rc;
>>> +	struct sigaction act;
>>> +	void *handle;
>>> +
>>> +	act.sa_handler = 0;
>>> +	act.sa_sigaction = sigsegv_handler;
>>> +	act.sa_flags = SA_SIGINFO;
>>> +	act.sa_restorer = 0;
>>> +	sigemptyset(&act.sa_mask);
>>> +	sigaction(SIGSEGV, &act, NULL);
>>> +
>>> +	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
>>> +	if (!handle) {
>>> +		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
>>> +		exit(-1);
>>> +	}
>>> +
>>> +	rc = decompress_file(argc, argv, handle);
>>> +
>>> +	nx_function_end(handle);
>>> +
>>> +	return rc;
>>> +}
>>> -- 
>>> 2.21.0
>
> -- 
> Raphael Moreira Zinsly
> IBM
> Linux on Power Toolchain

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/5] selftests/powerpc: Add header files for NX compresion/decompression
  2020-03-16 18:07 ` [PATCH 2/5] selftests/powerpc: Add header files for NX compresion/decompression Raphael Moreira Zinsly
@ 2020-03-18 22:29   ` Daniel Axtens
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Axtens @ 2020-03-18 22:29 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, linuxppc-dev, linux-crypto
  Cc: Raphael Moreira Zinsly, haren, herbert, abali

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Add files to be able to compress and decompress files using the
> powerpc NX-GZIP engine.
>
> Signed-off-by: Bulent Abali <abali@us.ibm.com>
> Signed-off-by: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
> ---
>  .../powerpc/nx-gzip/inc/copy-paste.h          |  54 ++
>  .../selftests/powerpc/nx-gzip/inc/nx_dbg.h    |  95 +++
>  .../selftests/powerpc/nx-gzip/inc/nxu.h       | 644 ++++++++++++++++++
>  3 files changed, 793 insertions(+)
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
>  create mode 100644 tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h
>
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
> new file mode 100644
> index 000000000000..107139b6c7df
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/copy-paste.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +#include "nx-helpers.h"
> +
> +/*
> + * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other
> + * header files.
> + */
> +#define ___PPC_RA(a)    (((a) & 0x1f) << 16)
> +#define ___PPC_RB(b)    (((b) & 0x1f) << 11)
> +
> +#define PPC_INST_COPY                   0x7c20060c
> +#define PPC_INST_PASTE                  0x7c20070d
> +
> +#define PPC_COPY(a, b)          stringify_in_c(.long PPC_INST_COPY | \
> +						___PPC_RA(a) | ___PPC_RB(b))
> +#define PPC_PASTE(a, b)         stringify_in_c(.long PPC_INST_PASTE | \
> +						___PPC_RA(a) | ___PPC_RB(b))
> +#define CR0_SHIFT	28
> +#define CR0_MASK	0xF
> +/*
> + * Copy/paste instructions:
> + *
> + *	copy RA,RB
> + *		Copy contents of address (RA) + effective_address(RB)
> + *		to internal copy-buffer.
> + *
> + *	paste RA,RB
> + *		Paste contents of internal copy-buffer to the address
> + *		(RA) + effective_address(RB)
> + */
> +static inline int vas_copy(void *crb, int offset)
> +{
> +	asm volatile(PPC_COPY(%0, %1)";"
> +		:
> +		: "b" (offset), "b" (crb)
> +		: "memory");
> +
> +	return 0;
> +}
> +
> +static inline int vas_paste(void *paste_address, int offset)
> +{
> +	u32 cr;
> +
> +	cr = 0;
> +	asm volatile(PPC_PASTE(%1, %2)";"
> +		"mfocrf %0, 0x80;"
> +		: "=r" (cr)
> +		: "b" (offset), "b" (paste_address)
> +		: "memory", "cr0");
> +
> +	return (cr >> CR0_SHIFT) & CR0_MASK;
> +}
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
> new file mode 100644
> index 000000000000..f2c0eee2317e
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nx_dbg.h
> @@ -0,0 +1,95 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright 2020 IBM Corporation
> + *
> + */
> +
> +#ifndef _NXU_DBG_H_
> +#define _NXU_DBG_H_
> +
> +#include <sys/file.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <time.h>
> +#include <pthread.h>
> +
> +extern FILE * nx_gzip_log;
> +extern int nx_gzip_trace;
> +extern unsigned int nx_gzip_inflate_impl;
> +extern unsigned int nx_gzip_deflate_impl;
> +extern unsigned int nx_gzip_inflate_flags;
> +extern unsigned int nx_gzip_deflate_flags;
> +
> +extern int nx_dbg;
> +pthread_mutex_t mutex_log;
> +
> +#define nx_gzip_trace_enabled()       (nx_gzip_trace & 0x1)
> +#define nx_gzip_hw_trace_enabled()    (nx_gzip_trace & 0x2)
> +#define nx_gzip_sw_trace_enabled()    (nx_gzip_trace & 0x4)
> +#define nx_gzip_gather_statistics()   (nx_gzip_trace & 0x8)
> +#define nx_gzip_per_stream_stat()     (nx_gzip_trace & 0x10)
> +
> +#define prt(fmt, ...) do { \
> +	pthread_mutex_lock(&mutex_log);					\
> +	flock(nx_gzip_log->_fileno, LOCK_EX);				\
> +	time_t t; struct tm *m; time(&t); m = localtime(&t);		\
> +	fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] "		\
> +		"pid %d: " fmt,	\
> +		(int)m->tm_year + 1900, (int)m->tm_mon+1, (int)m->tm_mday, \
> +		(int)m->tm_hour, (int)m->tm_min, (int)m->tm_sec,	\
> +		(int)getpid(), ## __VA_ARGS__);				\
> +	fflush(nx_gzip_log);						\
> +	flock(nx_gzip_log->_fileno, LOCK_UN);				\
> +	pthread_mutex_unlock(&mutex_log);				\
> +} while (0)
> +
> +/* Use in case of an error */
> +#define prt_err(fmt, ...) do { if (nx_dbg >= 0) {			\
> +	prt("%s:%u: Error: "fmt,					\
> +		__FILE__, __LINE__, ## __VA_ARGS__);			\
> +}} while (0)
> +
> +/* Use in case of an warning */
> +#define prt_warn(fmt, ...) do {	if (nx_dbg >= 1) {			\
> +	prt("%s:%u: Warning: "fmt,					\
> +		__FILE__, __LINE__, ## __VA_ARGS__);			\
> +}} while (0)
> +
> +/* Informational printouts */
> +#define prt_info(fmt, ...) do {	if (nx_dbg >= 2) {			\
> +	prt("Info: "fmt, ## __VA_ARGS__);				\
> +}} while (0)
> +
> +/* Trace zlib wrapper code */
> +#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) {		\
> +	prt("### "fmt, ## __VA_ARGS__);					\
> +}} while (0)
> +
> +/* Trace statistics */
> +#define prt_stat(fmt, ...) do {	if (nx_gzip_gather_statistics()) {	\
> +	prt("### "fmt, ## __VA_ARGS__);					\
> +}} while (0)
> +
> +/* Trace zlib hardware implementation */
> +#define hw_trace(fmt, ...) do {						\
> +		if (nx_gzip_hw_trace_enabled())				\
> +			fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \
> +	} while (0)
> +
> +/* Trace zlib software implementation */
> +#define sw_trace(fmt, ...) do {						\
> +		if (nx_gzip_sw_trace_enabled())				\
> +			fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \
> +	} while (0)
> +
> +
> +/**
> + * str_to_num - Convert string into number and copy with endings like
> + *              KiB for kilobyte
> + *              MiB for megabyte
> + *              GiB for gigabyte
> + */
> +uint64_t str_to_num(char *str);
> +void nx_lib_debug(int onoff);
> +
> +#endif	/* _NXU_DBG_H_ */
> diff --git a/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h
> new file mode 100644
> index 000000000000..faa95ffc162a
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/nx-gzip/inc/nxu.h
> @@ -0,0 +1,644 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Hardware interface of the NX-GZIP compression accelerator
> + *
> + * Copyright (C) IBM Corporation, 2020
> + *
> + * Author: Bulent Abali <abali@us.ibm.com>
> + *
> + */
> +
> +#ifndef _NXU_H
> +#define _NXU_H
> +
> +#include <stdint.h>
> +#include <endian.h>
> +
> +/* deflate */
> +#define LLSZ   286
> +#define DSZ    30
> +
> +/* nx */
> +#define DHTSZ  18
> +#define DHT_MAXSZ 288
> +#define MAX_DDE_COUNT 256
> +
> +/* util */
> +#ifdef NXDBG
> +#define NXPRT(X) do { X; } while (0)
> +#else
> +#define NXPRT(X) do { ; } while (0)
> +#endif
> +
> +#ifdef NXTIMER
> +#include <sys/platform/ppc.h>
> +#define NX_CLK(X)      do { X; } while (0)
> +#define nx_get_time()  __ppc_get_timebase()
> +#define nx_get_freq()  __ppc_get_timebase_freq()
> +#else
> +#define NX_CLK(X)      do { ; } while (0)
> +#define nx_get_time()  (-1)
> +#define nx_get_freq()  (-1)
> +#endif
> +
> +/*
> + * Definitions of acronyms used here. See
> + * P9 NX Gzip Accelerator User's Manual for details
> + *

If I google "P9 NX Gzip Accelerator User's Manual" I just find the
POWER9 Processor User's Manual which seems to be a different
thing... following the links in that document to a GitHub repo I think
the document I'm looking for is

https://github.com/abalib/power-gzip/blob/master/power_nx_gzip_um.pdf

If that's right and the URL is stable, could you include a link in the source
code? (Should perhaps the power-gzip repository be moved from a personal
GitHub account to the open-power organisation?)

Regards,
Daniel

> + * adler/crc: 32 bit checksums appended to stream tail
> + * ce:       completion extension
> + * cpb:      coprocessor parameter block (metadata)
> + * crb:      coprocessor request block (command)
> + * csb:      coprocessor status block (status)
> + * dht:      dynamic huffman table
> + * dde:      data descriptor element (address, length)
> + * ddl:      list of ddes
> + * dh/fh:    dynamic and fixed huffman types
> + * fc:       coprocessor function code
> + * histlen:  history/dictionary length
> + * history:  sliding window of up to 32KB of data
> + * lzcount:  Deflate LZ symbol counts
> + * rembytecnt: remaining byte count
> + * sfbt:     source final block type; last block's type during decomp
> + * spbc:     source processed byte count
> + * subc:     source unprocessed bit count
> + * tebc:     target ending bit count; valid bits in the last byte
> + * tpbc:     target processed byte count
> + * vas:      virtual accelerator switch; the user mode interface
> + */
> +
> +typedef union {
> +    uint32_t word[4];
> +    uint64_t dword[2];
> +} nx_qw_t __attribute__((aligned (16)));
> +
> +/*
> + * Note: NX registers with fewer than 32 bits are declared by
> + * convention as uint32_t variables in unions. If *_offset and *_mask
> + * are defined for a variable, then use get_ put_ macros to
> + * conveniently access the register fields for endian conversions.
> + */
> +
> +typedef struct {
> +    /* Data Descriptor Element, Section 6.4 */
> +    union {
> +	uint32_t dde_count;
> +	/* When dde_count == 0 ddead is a pointer to a data buffer;
> +	 * ddebc is the buffer length bytes.
> +	 * When dde_count > 0 dde is an indirect dde; ddead is a pointer
> +	 * to a contiguous list of direct ddes; ddebc is the total length
> +	 * of all data pointed to by the list of direct ddes.
> +	 * Note that only one level of indirection is permitted.
> +	 * See Section 6.4 of the user manual for additional details
> +	 */
> +    };
> +    uint32_t ddebc; /* dde byte count */
> +    uint64_t ddead; /* dde address */
> +} nx_dde_t __attribute__((aligned (16)));
> +
> +typedef struct {
> +    /* Coprocessor Status Block, Section 6.6  */
> +    union {
> +	uint32_t csb_v;
> +	/* Valid bit. v must be set to 0 by the program
> +	 * before submitting the coprocessor command.
> +	 * Software can poll for the v bit
> +	 */
> +
> +	uint32_t csb_f;
> +	/* 16B CSB size. Written to 0 by DMA when it writes the CPB */
> +
> +	uint32_t csb_cs;
> +	/* cs completion sequence; unused */
> +
> +	uint32_t csb_cc;
> +	/* cc completion code; cc != 0 exception occurred */
> +
> +	uint32_t csb_ce;
> +	/* ce completion extension */
> +
> +    };
> +    uint32_t tpbc;
> +    /* target processed byte count TPBC */
> +
> +    uint64_t fsaddr;
> +    /* Section 6.12.1 CSB NonZero error summary.  FSA Failing storage
> +     * address.  Address where error occurred. When available, written
> +     * to A field of CSB
> +     */
> +} nx_csb_t __attribute__((aligned (16)));
> +
> +typedef struct {
> +    /* Coprocessor Completion Block, Section 6.7 */
> +
> +    uint32_t reserved[3];
> +    union {
> +	/* When crb.c==0 (no ccb defined) it is reserved;
> +	 * When crb.c==1 (ccb defined) it is cm
> +	 */
> +
> +	uint32_t ccb_cm;
> +	/* Signal interrupt of crb.c==1 and cm==1 */
> +
> +	uint32_t word;
> +	/* generic access to the 32bit word */
> +    };
> +} nx_ccb_t __attribute__((aligned (16)));
> +
> +typedef struct {
> +    /*
> +     * CRB operand of the paste coprocessor instruction is stamped
> +     * in quadword 4 with the information shown here as its written
> +     * in to the receive FIFO of the coprocessor
> +     */
> +
> +    union {
> +	uint32_t vas_buf_num;
> +	/* Verification only vas buffer number which correlates to
> +	 * the low order bits of the atag in the paste command
> +	 */
> +
> +	uint32_t send_wc_id;
> +	/* Pointer to Send Window Context that provides for NX address
> +	 * translation information, such as MSR and LPCR bits, job completion
> +	 * interrupt RA, PSWID, and job utilization counter.
> +	 */
> +
> +    };
> +    union {
> +	uint32_t recv_wc_id;
> +	/* Pointer to Receive Window Context. NX uses this to return
> +	 * credits to a Receive FIFO as entries are dequeued.
> +	 */
> +
> +    };
> +    uint32_t reserved2;
> +    union {
> +	uint32_t vas_invalid;
> +	/* Invalid bit. If this bit is 1 the CRB is discarded by
> +	 * NX upon fetching from the receive FIFO. If this bit is 0
> +	 * the CRB is processed normally. The bit is stamped to 0
> +	 * by VAS and may be written to 1 by hypervisor while
> +	 * the CRB is in the receive FIFO (in memory).
> +	 */
> +
> +    };
> +} vas_stamped_crb_t;
> +
> +typedef struct {
> +    /*
> +     * A CRB that has a translation fault is stamped by NX in quadword 4
> +     * and pasted to the Fault Send Window in VAS.
> +     */
> +    uint64_t fsa;
> +    union {
> +	uint32_t nxsf_t;
> +	uint32_t nxsf_fs;
> +    };
> +    uint32_t pswid;
> +} nx_stamped_fault_crb_t;
> +
> +typedef union {
> +    vas_stamped_crb_t      vas;
> +    nx_stamped_fault_crb_t nx;
> +} stamped_crb_t;
> +
> +typedef struct {
> +    /*
> +     * Coprocessor Parameter Block In/Out are used to pass metadata
> +     * to/from accelerator.  Tables 6.5 and 6.6 of the user manual.
> +     */
> +
> +    /* CPBInput */
> +
> +    struct {
> +	union {
> +	    nx_qw_t qw0;
> +	    struct {
> +		uint32_t in_adler;            /* bits 0:31    */
> +		uint32_t in_crc;              /* bits 32:63   */
> +		union {
> +		    uint32_t in_histlen;      /* bits 64:75   */
> +		    uint32_t in_subc;         /* bits 93:95   */
> +		};
> +		union {
> +		    uint32_t in_sfbt;         /* bits 108:111 */
> +		    uint32_t in_rembytecnt;   /* bits 112:127 */
> +		    uint32_t in_dhtlen;       /* bits 116:127 */
> +		};
> +	    };
> +	};
> +	union {
> +	    nx_qw_t  in_dht[DHTSZ];           /* qw[1:18]     */
> +	    char     in_dht_char[DHT_MAXSZ];  /* byte access  */
> +	};
> +	nx_qw_t  reserved[5];                 /* qw[19:23]    */
> +    };
> +
> +    /* CPBOutput */
> +
> +    volatile struct {
> +	union {
> +	    nx_qw_t qw24;
> +	    struct {
> +		uint32_t out_adler;           /* bits 0:31  qw[24]   */
> +		uint32_t out_crc;             /* bits 32:63 qw[24]   */
> +		union {
> +		    uint32_t out_tebc;        /* bits 77:79 qw[24]   */
> +		    uint32_t out_subc;        /* bits 80:95 qw[24]   */
> +		};
> +		union {
> +		    uint32_t out_sfbt;        /* bits 108:111 qw[24] */
> +		    uint32_t out_rembytecnt;  /* bits 112:127 qw[24] */
> +		    uint32_t out_dhtlen;      /* bits 116:127 qw[24] */
> +		};
> +	    };
> +	};
> +	union {
> +	    nx_qw_t  qw25[79];              /* qw[25:103] */
> +	    /* qw[25] compress no lzcounts or wrap */
> +	    uint32_t out_spbc_comp_wrap;
> +	    uint32_t out_spbc_wrap;         /* qw[25] wrap */
> +	    uint32_t out_spbc_comp;         /* qw[25] compress no lzcounts */
> +	    uint32_t out_lzcount[LLSZ+DSZ]; /* 286 LL and 30 D symbol counts */
> +	    struct {
> +		nx_qw_t  out_dht[DHTSZ];    /* qw[25:42] */
> +		uint32_t out_spbc_decomp;   /* qw[43] decompress */
> +	    };
> +	};
> +	/* qw[104] compress with lzcounts */
> +	uint32_t out_spbc_comp_with_count;
> +    };
> +} nx_gzip_cpb_t  __attribute__((aligned (128)));
> +
> +typedef struct {
> +    union {                   /* byte[0:3]   */
> +	uint32_t gzip_fc;     /* bits[24-31] */
> +    };
> +    uint32_t reserved1;       /* byte[4:7]   */
> +    union {
> +	uint64_t csb_address; /* byte[8:15]  */
> +	struct {
> +	    uint32_t reserved2;
> +	    union {
> +		uint32_t crb_c;
> +		/* c==0 no ccb defined */
> +
> +		uint32_t crb_at;
> +		/* at==0 address type is ignored;
> +		 * all addrs effective assumed.
> +		 */
> +
> +	    };
> +	};
> +    };
> +    nx_dde_t source_dde;           /* byte[16:31] */
> +    nx_dde_t target_dde;           /* byte[32:47] */
> +    volatile nx_ccb_t ccb;         /* byte[48:63] */
> +    volatile union {
> +	/* byte[64:239] shift csb by 128 bytes out of the crb; csb was in crb
> +	 * earlier; JReilly says csb written with partial inject.
> +	 */
> +	nx_qw_t reserved64[11];
> +	stamped_crb_t stamp;       /* byte[64:79] */
> +    };
> +    volatile nx_csb_t csb;
> +} nx_gzip_crb_t __attribute__((aligned (128)));
> +
> +
> +typedef struct {
> +    nx_gzip_crb_t crb;
> +    nx_gzip_cpb_t cpb;
> +} nx_gzip_crb_cpb_t __attribute__((aligned (2048)));
> +
> +
> +/*
> + * NX hardware convention has the msb bit on the left numbered 0.
> + * The defines below has *_offset defined as the right most bit
> + * position of a field.  x of size_mask(x) is the field width in bits.
> + */
> +
> +#define size_mask(x)          ((1U<<(x))-1)
> +
> +/*
> + * Offsets and Widths within the containing 32 bits of the various NX
> + * gzip hardware registers.  Use the getnn/putnn macros to access
> + * these regs
> + */
> +
> +#define dde_count_mask        size_mask(8)
> +#define dde_count_offset      23
> +
> +/* CSB */
> +
> +#define csb_v_mask            size_mask(1)
> +#define csb_v_offset          0
> +#define csb_f_mask            size_mask(1)
> +#define csb_f_offset          6
> +#define csb_cs_mask           size_mask(8)
> +#define csb_cs_offset         15
> +#define csb_cc_mask           size_mask(8)
> +#define csb_cc_offset         23
> +#define csb_ce_mask           size_mask(8)
> +#define csb_ce_offset         31
> +
> +/* CCB */
> +
> +#define ccb_cm_mask           size_mask(3)
> +#define ccb_cm_offset         31
> +
> +/* VAS stamped CRB fields */
> +
> +#define vas_buf_num_mask      size_mask(6)
> +#define vas_buf_num_offset    5
> +#define send_wc_id_mask       size_mask(16)
> +#define send_wc_id_offset     31
> +#define recv_wc_id_mask       size_mask(16)
> +#define recv_wc_id_offset     31
> +#define vas_invalid_mask      size_mask(1)
> +#define vas_invalid_offset    31
> +
> +/* NX stamped fault CRB fields */
> +
> +#define nxsf_t_mask           size_mask(1)
> +#define nxsf_t_offset         23
> +#define nxsf_fs_mask          size_mask(8)
> +#define nxsf_fs_offset        31
> +
> +/* CPB input */
> +
> +#define in_histlen_mask       size_mask(12)
> +#define in_histlen_offset     11
> +#define in_dhtlen_mask        size_mask(12)
> +#define in_dhtlen_offset      31
> +#define in_subc_mask          size_mask(3)
> +#define in_subc_offset        31
> +#define in_sfbt_mask          size_mask(4)
> +#define in_sfbt_offset        15
> +#define in_rembytecnt_mask    size_mask(16)
> +#define in_rembytecnt_offset  31
> +
> +/* CPB output */
> +
> +#define out_tebc_mask         size_mask(3)
> +#define out_tebc_offset       15
> +#define out_subc_mask         size_mask(16)
> +#define out_subc_offset       31
> +#define out_sfbt_mask         size_mask(4)
> +#define out_sfbt_offset       15
> +#define out_rembytecnt_mask   size_mask(16)
> +#define out_rembytecnt_offset 31
> +#define out_dhtlen_mask       size_mask(12)
> +#define out_dhtlen_offset     31
> +
> +/* CRB */
> +
> +#define gzip_fc_mask          size_mask(8)
> +#define gzip_fc_offset        31
> +#define crb_c_mask            size_mask(1)
> +#define crb_c_offset          28
> +#define crb_at_mask           size_mask(1)
> +#define crb_at_offset         30
> +#define csb_address_mask      ~(15UL) /* mask off bottom 4b */
> +
> +/*
> + * Access macros for the registers.  Do not access registers directly
> + * because of the endian conversion.  P9 processor may run either as
> + * Little or Big endian. However the NX coprocessor regs are always
> + * big endian.
> + * Use the 32 and 64b macros to access respective
> + * register sizes.
> + * Use nn forms for the register fields shorter than 32 bits.
> + */
> +
> +#define getnn(ST, REG)      ((be32toh(ST.REG) >> (31-REG##_offset)) \
> +				 & REG##_mask)
> +#define getpnn(ST, REG)     ((be32toh((ST)->REG) >> (31-REG##_offset)) \
> +				 & REG##_mask)
> +#define get32(ST, REG)      (be32toh(ST.REG))
> +#define getp32(ST, REG)     (be32toh((ST)->REG))
> +#define get64(ST, REG)      (be64toh(ST.REG))
> +#define getp64(ST, REG)     (be64toh((ST)->REG))
> +
> +#define unget32(ST, REG)    (get32(ST, REG) & ~((REG##_mask) \
> +				<< (31-REG##_offset)))
> +/* get 32bits less the REG field */
> +
> +#define ungetp32(ST, REG)   (getp32(ST, REG) & ~((REG##_mask) \
> +				<< (31-REG##_offset)))
> +/* get 32bits less the REG field */
> +
> +#define clear_regs(ST)      do { memset((void *)(&(ST)), 0, sizeof(ST)); \
> +				} while (0)
> +#define clear_dde(ST)       do { ST.dde_count = ST.ddebc = 0; ST.ddead = 0; \
> +				} while (0)
> +#define clearp_dde(ST)      do { (ST)->dde_count = (ST)->ddebc = 0; \
> +				 (ST)->ddead = 0; \
> +				} while (0)
> +#define clear_struct(ST)    do { memset((void *)(&(ST)), 0, sizeof(ST)); \
> +				} while (0)
> +
> +#define putnn(ST, REG, X)   do { ST.REG = htobe32(unget32(ST, REG) | (((X) \
> +				 & REG##_mask) << (31-REG##_offset))); \
> +				} while (0)
> +#define putpnn(ST, REG, X)  do { (ST)->REG = htobe32(ungetp32(ST, REG) \
> +				| (((X) & REG##_mask) << (31-REG##_offset))); \
> +				} while (0)
> +
> +#define put32(ST, REG, X)   do { ST.REG = htobe32(X); } while (0)
> +#define putp32(ST, REG, X)  do { (ST)->REG = htobe32(X); } while (0)
> +#define put64(ST, REG, X)   do { ST.REG = htobe64(X); } while (0)
> +#define putp64(ST, REG, X)  do { (ST)->REG = htobe64(X); } while (0)
> +
> +/*
> + * Completion extension ce(0) ce(1) ce(2).  Bits ce(3-7)
> + * unused.  Section 6.6 Figure 6.7.
> + */
> +
> +#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce))
> +#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5)
> +#define put_csb_ce_ms3b(ST, X) do { putnn(ST, csb_ce, ((uint32_t)(X) << 5)); \
> +				   } while (0)
> +
> +#define CSB_CE_PARTIAL         0x4
> +#define CSB_CE_TERMINATE       0x2
> +#define CSB_CE_TPBC_VALID      0x1
> +
> +#define csb_ce_termination(X)         (!!((X) & CSB_CE_TERMINATE))
> +/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */
> +
> +#define csb_ce_check_completion(X)    (!csb_ce_termination(X))
> +/* if not terminated then check full or partial completion */
> +
> +#define csb_ce_partial_completion(X)  (!!((X) & CSB_CE_PARTIAL))
> +#define csb_ce_full_completion(X)     (!csb_ce_partial_completion(X))
> +#define csb_ce_tpbc_valid(X)          (!!((X) & CSB_CE_TPBC_VALID))
> +/* TPBC indicates successfully stored data count */
> +
> +#define csb_ce_default_err(X)         csb_ce_termination(X)
> +/* most error CEs have CE(0)=0 and CE(1)=1 */
> +
> +#define csb_ce_cc3_partial(X)         csb_ce_partial_completion(X)
> +/* some CC=3 are partially completed, Table 6-8 */
> +
> +#define csb_ce_cc64(X)                ((X)&(CSB_CE_PARTIAL \
> +					| CSB_CE_TERMINATE) == 0)
> +/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't
> + * compress smaller than source.
> + */
> +
> +/* Decompress SFBT combinations Tables 5-3, 6-4, 6-6 */
> +
> +#define SFBT_BFINAL 0x1
> +#define SFBT_LIT    0x4
> +#define SFBT_FHT    0x5
> +#define SFBT_DHT    0x6
> +#define SFBT_HDR    0x7
> +
> +/*
> + * NX gzip function codes. Table 6.2.
> + * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to
> + * select one of the two Byte Count Limits.
> + */
> +
> +#define GZIP_FC_LIMIT_MASK                               0x01
> +#define GZIP_FC_COMPRESS_FHT                             0x00
> +#define GZIP_FC_COMPRESS_DHT                             0x02
> +#define GZIP_FC_COMPRESS_FHT_COUNT                       0x04
> +#define GZIP_FC_COMPRESS_DHT_COUNT                       0x06
> +#define GZIP_FC_COMPRESS_RESUME_FHT                      0x08
> +#define GZIP_FC_COMPRESS_RESUME_DHT                      0x0a
> +#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT                0x0c
> +#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT                0x0e
> +#define GZIP_FC_DECOMPRESS                               0x10
> +#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND          0x12
> +#define GZIP_FC_DECOMPRESS_RESUME                        0x14
> +#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND   0x16
> +#define GZIP_FC_WRAP                                     0x1e
> +
> +#define fc_is_compress(fc)  (((fc) & 0x10) == 0)
> +#define fc_has_count(fc)    (fc_is_compress(fc) && (((fc) & 0x4) != 0))
> +
> +/* CSB.CC Error codes */
> +
> +#define ERR_NX_OK             0
> +#define ERR_NX_ALIGNMENT      1
> +#define ERR_NX_OPOVERLAP      2
> +#define ERR_NX_DATA_LENGTH    3
> +#define ERR_NX_TRANSLATION    5
> +#define ERR_NX_PROTECTION     6
> +#define ERR_NX_EXTERNAL_UE7   7
> +#define ERR_NX_INVALID_OP     8
> +#define ERR_NX_PRIVILEGE      9
> +#define ERR_NX_INTERNAL_UE   10
> +#define ERR_NX_EXTERN_UE_WR  12
> +#define ERR_NX_TARGET_SPACE  13
> +#define ERR_NX_EXCESSIVE_DDE 14
> +#define ERR_NX_TRANSL_WR     15
> +#define ERR_NX_PROTECT_WR    16
> +#define ERR_NX_SUBFUNCTION   17
> +#define ERR_NX_FUNC_ABORT    18
> +#define ERR_NX_BYTE_MAX      19
> +#define ERR_NX_CORRUPT_CRB   20
> +#define ERR_NX_INVALID_CRB   21
> +#define ERR_NX_INVALID_DDE   30
> +#define ERR_NX_SEGMENTED_DDL 31
> +#define ERR_NX_DDE_OVERFLOW  33
> +#define ERR_NX_TPBC_GT_SPBC  64
> +#define ERR_NX_MISSING_CODE  66
> +#define ERR_NX_INVALID_DIST  67
> +#define ERR_NX_INVALID_DHT   68
> +#define ERR_NX_EXTERNAL_UE90 90
> +#define ERR_NX_WDOG_TIMER   224
> +#define ERR_NX_AT_FAULT     250
> +#define ERR_NX_INTR_SERVER  252
> +#define ERR_NX_UE253        253
> +#define ERR_NX_NO_HW        254
> +#define ERR_NX_HUNG_OP      255
> +#define ERR_NX_END          256
> +
> +/* initial values for non-resume operations */
> +#define INIT_CRC   0  /* crc32(0L, Z_NULL, 0) */
> +#define INIT_ADLER 1  /* adler32(0L, Z_NULL, 0)  adler is initialized to 1 */
> +
> +/* prototypes */
> +#ifdef NX_JOB_CALLBACK
> +int nxu_run_job(nx_gzip_crb_cpb_t *c, void *handle,
> +		int (*callback)(const void *));
> +#else
> +int nxu_run_job(nx_gzip_crb_cpb_t *c, void *handle);
> +#endif
> +
> +
> +/* caller supplies a print buffer 4*sizeof(crb) */
> +
> +char *nx_crb_str(nx_gzip_crb_t *crb, char *prbuf);
> +char *nx_cpb_str(nx_gzip_cpb_t *cpb, char *prbuf);
> +char *nx_prt_hex(void *cp, int sz, char *prbuf);
> +char *nx_lzcount_str(nx_gzip_cpb_t *cpb, char *prbuf);
> +char *nx_strerror(int e);
> +
> +#ifdef NX_SIM
> +#include <stdio.h>
> +int nx_sim_init(void *ctx);
> +int nx_sim_end(void *ctx);
> +int nxu_run_sim_job(nx_gzip_crb_cpb_t *c, void *ctx);
> +#endif /* NX_SIM */
> +
> +/* Deflate stream manipulation */
> +
> +#define set_final_bit(x) do { x |= (unsigned char)1; } while (0)
> +#define clr_final_bit(x) do { x &= ~(unsigned char)1; } while (0)
> +
> +#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \
> +				    } while (0)
> +/* append 10 bits 0000001b 00...... ;
> + * assumes appending starts on a byte boundary; b is the final bit.
> + */
> +
> +
> +#ifdef NX_842
> +
> +/* 842 Engine */
> +
> +typedef struct {
> +    union {                   /* byte[0:3]   */
> +	uint32_t eft_fc;      /* bits[29-31] */
> +    };
> +    uint32_t reserved1;       /* byte[4:7]   */
> +    union {
> +	uint64_t csb_address; /* byte[8:15]  */
> +	struct {
> +	    uint32_t reserved2;
> +	    union {
> +		uint32_t crb_c;
> +		/* c==0 no ccb defined */
> +
> +		uint32_t crb_at;
> +		/* at==0 address type is ignored;
> +		   all addrs effective assumed */
> +
> +	    };
> +	};
> +    };
> +    nx_dde_t source_dde;           /* byte[16:31] */
> +    nx_dde_t target_dde;           /* byte[32:47] */
> +    nx_ccb_t ccb;                  /* byte[48:63] */
> +    union {
> +	nx_qw_t reserved64[3];     /* byte[64:111] */
> +    };
> +    nx_csb_t csb;
> +} nx_eft_crb_t __attribute__((aligned (128)));
> +
> +/* 842 CRB */
> +
> +#define EFT_FC_MASK                 size_mask(3)
> +#define EFT_FC_OFFSET               31
> +#define EFT_FC_COMPRESS             0x0
> +#define EFT_FC_COMPRESS_WITH_CRC    0x1
> +#define EFT_FC_DECOMPRESS           0x2
> +#define EFT_FC_DECOMPRESS_WITH_CRC  0x3
> +#define EFT_FC_BLK_DATA_MOVE        0x4
> +#endif /* NX_842 */
> +
> +#endif /* _NXU_H */
> -- 
> 2.21.0

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-03-18 22:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-16 18:07 [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Raphael Moreira Zinsly
2020-03-16 18:07 ` [PATCH 1/5] selftests/powerpc: Add header files for GZIP engine test Raphael Moreira Zinsly
2020-03-18  3:48   ` Daniel Axtens
2020-03-16 18:07 ` [PATCH 2/5] selftests/powerpc: Add header files for NX compresion/decompression Raphael Moreira Zinsly
2020-03-18 22:29   ` Daniel Axtens
2020-03-16 18:07 ` [PATCH 3/5] selftests/powerpc: Add NX-GZIP engine compress testcase Raphael Moreira Zinsly
2020-03-16 18:07 ` [PATCH 4/5] selftests/powerpc: Add NX-GZIP engine decompress testcase Raphael Moreira Zinsly
2020-03-18  4:31   ` Daniel Axtens
2020-03-18  6:18   ` Daniel Axtens
2020-03-18 13:08     ` Raphael M Zinsly
2020-03-18 22:19       ` Daniel Axtens
2020-03-16 18:07 ` [PATCH 5/5] selftests/powerpc: Add README for GZIP engine tests Raphael Moreira Zinsly
2020-03-18  6:40   ` Daniel Axtens
2020-03-16 21:50 ` [PATCH 0/5] selftests/powerpc: Add NX-GZIP engine testcase Haren Myneni

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).