From: "Elliott, Robert (Server Storage)" <Elliott@hp.com>
To: Dan Williams <dan.j.williams@intel.com>,
	"axboe@kernel.dk" <axboe@kernel.dk>
Cc: "linux-nvdimm@lists.01.org" <linux-nvdimm@lists.01.org>,
	"neilb@suse.de" <neilb@suse.de>,
	"gregkh@linuxfoundation.org" <gregkh@linuxfoundation.org>,
	"Rafael J. Wysocki" <rafael.j.wysocki@intel.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Robert Moore <robert.moore@intel.com>,
	"linux-acpi@vger.kernel.org" <linux-acpi@vger.kernel.org>,
	Lv Zheng <lv.zheng@intel.com>, "hch@lst.de" <hch@lst.de>,
	"mingo@kernel.org" <mingo@kernel.org>,
	"Kani,
	Toshimitsu" <toshi.kani@hp.com>Christoph Hellwig <hch@lst.de>,
	"Boaz Harrosh (boaz@plexistor.com)" <boaz@plexistor.com>
Subject: RE: [PATCH v3 20/21] nfit-test: manufactured NFITs for interface development
Date: Mon, 25 May 2015 07:02:57 +0000	[thread overview]
Message-ID: <94D0CD8314A33A4D9D801C0FE68B40295A9217B0@G9W0745.americas.hpqcorp.net> (raw)
In-Reply-To: <20150520205800.32249.74581.stgit@dwillia2-desk3.amr.corp.intel.com>

[-- Attachment #1: Type: text/plain, Size: 667 bytes --]

> -----Original Message-----
> From: Linux-nvdimm [mailto:linux-nvdimm-bounces@lists.01.org] On Behalf
> Of Dan Williams
> Sent: Wednesday, May 20, 2015 3:58 PM
> To: axboe@kernel.dk
> Subject: [PATCH v3 20/21] nfit-test: manufactured NFITs for interface
> development
...

Attached is some experimental code to try pmem with different 
cache types (UC, WB, WC, and WT) and memcpy functions using x86 
AVX non-temporal load and store instructions.

It depends on Toshi's WT patch series:
	https://lkml.org/lkml/2015/5/13/866

If you don't have that, you can just comment out the lines related
to ioremap_wt.
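
Alternatively, a one-line stub near the top of pmem.c keeps it
building (untested sketch; it falls back to a UC mapping instead
of WT):

	#define ioremap_wt(offset, size)	ioremap_nocache(offset, size)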

---
Rob Elliott, HP Server Storage


[-- Attachment #2: 0001-pmem-cache-type --]
[-- Type: application/octet-stream, Size: 19027 bytes --]

From 18e75a7134e0130b925fffab13f41c1ffc4d9f05 Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Fri, 22 May 2015 16:46:21 -0500
Subject: [PATCH] pmem cache type patch

Author: Robert Elliott <elliott@hp.com>
Date:   Tue Apr 28 19:14:53 2015 -0500

    pmem: cache_type, non-temporal memcpy experiments

    WARNING: Not for inclusion in the kernel - just for experimentation
    (for one thing, the AVX helpers below are not wrapped in
    kernel_fpu_begin()/kernel_fpu_end(), so they clobber user FPU state).

    Add modparams to select cache_type and various kinds of
    memcpy with non-temporal loads and stores.  Parameters
    are printed to the kernel serial log at module load time.

    Example usage:
    modprobe pmem pmem_cachetype=2 pmem_readscan=1 pmem_ntw=1 pmem_ntr=1

    x86 offers several non-temporal instructions:
    *  8 byte: movnti (store) from normal registers
    * 16 byte: movntdq (store) and movntdqa (load) using xmm registers (SSE)
    * 32 byte: vmovntdq and vmovntdqa using ymm registers (AVX)
    * 64 byte: vmovntdq and vmovntdqa using zmm registers (AVX512)

    The 32-byte AVX instructions are used by this patch.
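
    For reference, a minimal 8-byte movnti store (sketch; not used by
    this patch) looks like this, where val is a u64 and dst is the
    destination pointer:

    	__asm__ __volatile__ ("movnti %0, (%1)"
    		: : "r" (val), "r" (dst) : "memory");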

    Normal memcpy is used for unaligned pmem_rw_bytes accesses, so
    that path is unsafe in WB mode (writes can linger in the CPU
    caches).

    Module parameters
    =================
    pmem_cachetype=n	(default 0)
    	Select the cache type (which ioremap function to use to
    	map the NVDIMM memory)
    	0 = UC (uncacheable) - slow writes, slow reads
    	1 = WB (writeback) - fast unsafe writes, fast reads
    	2 = WC (write combining) - fast writes, slow reads
    	3 = WT (writethrough) - slow writes, fast reads

    	WB writes are safe if:
    	* non-temporal stores are exclusively used
    	* clflush instructions are added
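
    	For example (sketch, not implemented by this patch), a WB-safe
    	write path could flush explicitly after each copy:

    		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
    		clflush_cache_range(pmem->virt_addr + pmem_off, len);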

    pmem_readscan=n		(default 0)
    	0 = no read scan
    	1 = read the entire memory range, looking to trigger
    	uncorrectable (UC) memory errors

    	The rate is also printed, serving as a quick performance
    	check (uses a 64 byte loop with NT loads).

    pmem_clean=n		(default 0)
    	0 = no clean
    	1 = overwrite the entire memory range, possibly
    	clearing uncorrectable (UC) memory errors (dangerous,
    	destroys all data)

    	The rate is also printed, serving as a quick performance
    	check (uses a 64 byte loop with NT stores).

    pmem_ntw=n		(default 3)
    	Use non-temporal stores when writing persistent memory

    	0 = memcpy (unsafe for WB)
    	1 = 64 byte loop with NT stores
    	2 = 128 byte loop with NT stores
    	3 = 64 byte loop with NT stores, plus use NT loads from
    	  normal memory (may be better cache usage)
    	4 = 128 byte loop with NT stores, plus use NT loads from
    	  normal memory
    	5 = __copy_from_user (existing kernel function with
    	  8 byte NT instructions)
    	6 = no write at all (nop)(dangerous)
    	7 = 64-byte loop, store only (write garbage)(dangerous)

    pmem_ntr=n		(default 3)
    	Use non-temporal loads when reading persistent memory

    	0 = memcpy
    	1 = 64 byte loop with NT loads
    	2 = 128 byte loop with NT loads
    	3 = 64 byte loop with NT loads, plus use NT stores to
    	  normal memory
    	4 = 128 byte loop with NT loads, plus use NT stores to
    	  normal memory
    	5 = memcpy
    	6 = no load at all (nop)(dangerous)
    	7 = 64-byte loop, load only (return garbage)(dangerous)

    pmem_ntw=6 pmem_ntr=6 exhibits the block layer IOPS limits.
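
    As a rough benchmark against those limits (assuming the device
    shows up as /dev/pmem0), something like:

    	fio --name=pmem --filename=/dev/pmem0 --rw=randread --bs=4k \
    		--direct=1 --ioengine=libaio --iodepth=32 --numjobs=4 \
    		--runtime=30 --time_based --group_reporting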

    Signed-off-by: Robert Elliott <elliott@hp.com>
---
 drivers/block/nd/pmem.c | 550 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 539 insertions(+), 11 deletions(-)

diff --git a/drivers/block/nd/pmem.c b/drivers/block/nd/pmem.c
index 7b5cedf1f2a4..f378ef81733f 100644
--- a/drivers/block/nd/pmem.c
+++ b/drivers/block/nd/pmem.c
@@ -26,6 +26,382 @@
 #include <linux/nd.h>
 #include "nd.h"
 
+static int pmem_cachetype;	/* default UC */
+module_param(pmem_cachetype, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(pmem_cachetype,
+	"Select cache attribute for pmem driver (0=UC, 1=WB 2=WC 3=WT)");
+
+static int pmem_readscan;
+module_param(pmem_readscan, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(pmem_readscan,
+	"Read scan pmem device upon init (trigger ECC errors)");
+
+static int pmem_clean;
+module_param(pmem_clean, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(pmem_clean,
+	"Clean pmem device upon init (write garbage, but cleans the ECC)");
+
+static int pmem_ntw = 3;
+module_param(pmem_ntw, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(pmem_ntw,
+	"Use non-temporal stores for block writes in pmem (0=memcpy, 1=64 byte NT, 2=128 byte NT, 3=64 dual NT, 4=128 dual NT, 5=copy_from_user, 6=nop, 7=64-byte NT-store only)");
+
+static int pmem_ntr = 3;
+module_param(pmem_ntr, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(pmem_ntr,
+	"Use non-temporal loads for block reads in pmem (0=memcpy, 1=64 byte NT, 2=128 byte NT, 3=64 dual NT, 4=128 dual NT, 5=memcpy, 6=nop, 7=64-byte NT-load only)");
+
+/* load: normal, store: non-temporal, loop: 64 bytes */
+static void memcpy_lt_snt_64(void *to, const void *from, size_t size)
+{
+	u64 bs = 64;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		/* 16-byte SSE instructions */
+		"movdqa (%0), %%xmm0\n"
+		"movdqa 16(%0), %%xmm1\n"
+		"movdqa 32(%0), %%xmm2\n"
+		"movdqa 48(%0), %%xmm3\n"
+		"movntdq %%xmm0, (%1)\n"
+		"movntdq %%xmm1, 16(%1)\n"
+		"movntdq %%xmm2, 32(%1)\n"
+		"movntdq %%xmm3, 48(%1)\n"
+#endif
+		/* 32-byte AVX instructions */
+		"vmovdqa (%0), %%ymm0\n"
+		"vmovdqa 32(%0), %%ymm1\n"
+		"vmovntdq %%ymm0, (%1)\n"
+		"vmovntdq %%ymm1, 32(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: skip, store: non-temporal, loop: 64 bytes */
+static void memcpy_lskip_snt_64(void *to, const void *from, size_t size)
+{
+	u64 bs = 64;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"movntdq %%xmm0, (%1)\n"
+		"movntdq %%xmm1, 16(%1)\n"
+		"movntdq %%xmm2, 32(%1)\n"
+		"movntdq %%xmm3, 48(%1)\n"
+#endif
+		"vmovntdq %%ymm0, (%1)\n"
+		"vmovntdq %%ymm1, 32(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: non-temporal, store: non-temporal, loop: 64 bytes */
+static void memcpy_lnt_snt_64(void *to, const void *from, size_t size)
+{
+	u64 bs = 64;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+		"movntdq %%xmm0, (%1)\n"
+		"movntdq %%xmm1, 16(%1)\n"
+		"movntdq %%xmm2, 32(%1)\n"
+		"movntdq %%xmm3, 48(%1)\n"
+#endif
+		"vmovntdqa (%0), %%ymm0\n"
+		"vmovntdqa 32(%0), %%ymm1\n"
+		"vmovntdq %%ymm0, (%1)\n"
+		"vmovntdq %%ymm1, 32(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: normal, store: non-temporal, loop: 128 bytes */
+static void memcpy_lt_snt_128(void *to, const void *from, size_t size)
+{
+	u64 bs = 128;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		/* hard to use prefetch effectively */
+		"prefetchnta 128(%0)\n"
+		"prefetchnta 192(%0)\n"
+#endif
+#if 0
+		"movdqa (%0), %%xmm0\n"
+		"movdqa 16(%0), %%xmm1\n"
+		"movdqa 32(%0), %%xmm2\n"
+		"movdqa 48(%0), %%xmm3\n"
+		"movdqa 64(%0), %%xmm4\n"
+		"movdqa 80(%0), %%xmm5\n"
+		"movdqa 96(%0), %%xmm6\n"
+		"movdqa 112(%0), %%xmm7\n"
+		"movntdq %%xmm0, (%1)\n"
+		"movntdq %%xmm1, 16(%1)\n"
+		"movntdq %%xmm2, 32(%1)\n"
+		"movntdq %%xmm3, 48(%1)\n"
+		"movntdq %%xmm4, 64(%1)\n"
+		"movntdq %%xmm5, 80(%1)\n"
+		"movntdq %%xmm6, 96(%1)\n"
+		"movntdq %%xmm7, 112(%1)\n"
+#endif
+		"vmovdqa (%0), %%ymm0\n"
+		"vmovdqa 32(%0), %%ymm1\n"
+		"vmovdqa 64(%0), %%ymm2\n"
+		"vmovdqa 96(%0), %%ymm3\n"
+		"vmovntdq %%ymm0, (%1)\n"
+		"vmovntdq %%ymm1, 32(%1)\n"
+		"vmovntdq %%ymm2, 64(%1)\n"
+		"vmovntdq %%ymm3, 96(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: non-temporal, store: non-temporal, loop: 128 bytes */
+static void memcpy_lnt_snt_128(void *to, const void *from, size_t size)
+{
+	u64 bs = 128;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"prefetchnta 128(%0)\n"
+		"prefetchnta 192(%0)\n"
+#endif
+#if 0
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+		"movntdqa 64(%0), %%xmm4\n"
+		"movntdqa 80(%0), %%xmm5\n"
+		"movntdqa 96(%0), %%xmm6\n"
+		"movntdqa 112(%0), %%xmm7\n"
+		"movntdq %%xmm0, (%1)\n"
+		"movntdq %%xmm1, 16(%1)\n"
+		"movntdq %%xmm2, 32(%1)\n"
+		"movntdq %%xmm3, 48(%1)\n"
+		"movntdq %%xmm4, 64(%1)\n"
+		"movntdq %%xmm5, 80(%1)\n"
+		"movntdq %%xmm6, 96(%1)\n"
+		"movntdq %%xmm7, 112(%1)\n"
+#endif
+		"vmovntdqa (%0), %%ymm0\n"
+		"vmovntdqa 32(%0), %%ymm1\n"
+		"vmovntdqa 64(%0), %%ymm2\n"
+		"vmovntdqa 96(%0), %%ymm3\n"
+		"vmovntdq %%ymm0, (%1)\n"
+		"vmovntdq %%ymm1, 32(%1)\n"
+		"vmovntdq %%ymm2, 64(%1)\n"
+		"vmovntdq %%ymm3, 96(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: non-temporal, store: normal, loop: 64 bytes */
+static void memcpy_lnt_st_64(void *to, const void *from, size_t size)
+{
+	u64 bs = 64;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+		"movdqa %%xmm0, (%1)\n"
+		"movdqa %%xmm1, 16(%1)\n"
+		"movdqa %%xmm2, 32(%1)\n"
+		"movdqa %%xmm3, 48(%1)\n"
+#endif
+		"vmovntdqa (%0), %%ymm0\n"
+		"vmovntdqa 32(%0), %%ymm1\n"
+		"vmovdqa %%ymm0, (%1)\n"
+		"vmovdqa %%ymm1, 32(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: non-temporal, store: skip, loop: 64 bytes */
+static void memcpy_lnt_sskip_64(void *to, const void *from, size_t size)
+{
+	u64 bs = 64;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+#endif
+		"vmovntdqa (%0), %%ymm0\n"
+		"vmovntdqa 32(%0), %%ymm1\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
+/* load: non-temporal, store: normal, loop: 128 bytes */
+static void memcpy_lnt_st_128(void *to, const void *from, size_t size)
+{
+	u64 bs = 128;
+	size_t i;
+
+	BUG_ON(!IS_ALIGNED(size, bs));
+	BUG_ON(!IS_ALIGNED((u64)to, bs));
+	BUG_ON(!IS_ALIGNED((u64)from, bs));
+
+	for (i = 0; i < size; i += bs) {
+		__asm__ __volatile__ (
+#if 0
+		"prefetchnta 128(%0)\n"
+		"prefetchnta 192(%0)\n"
+#endif
+#if 0
+		"movntdqa (%0), %%xmm0\n"
+		"movntdqa 16(%0), %%xmm1\n"
+		"movntdqa 32(%0), %%xmm2\n"
+		"movntdqa 48(%0), %%xmm3\n"
+		"movntdqa 64(%0), %%xmm4\n"
+		"movntdqa 80(%0), %%xmm5\n"
+		"movntdqa 96(%0), %%xmm6\n"
+		"movntdqa 112(%0), %%xmm7\n"
+		"movdqa %%xmm0, (%1)\n"
+		"movdqa %%xmm1, 16(%1)\n"
+		"movdqa %%xmm2, 32(%1)\n"
+		"movdqa %%xmm3, 48(%1)\n"
+		"movdqa %%xmm4, 64(%1)\n"
+		"movdqa %%xmm5, 80(%1)\n"
+		"movdqa %%xmm6, 96(%1)\n"
+		"movdqa %%xmm7, 112(%1)\n"
+#endif
+		"vmovntdqa (%0), %%ymm0\n"
+		"vmovntdqa 32(%0), %%ymm1\n"
+		"vmovntdqa 64(%0), %%ymm2\n"
+		"vmovntdqa 96(%0), %%ymm3\n"
+		"vmovdqa %%ymm0, (%1)\n"
+		"vmovdqa %%ymm1, 32(%1)\n"
+		"vmovdqa %%ymm2, 64(%1)\n"
+		"vmovdqa %%ymm3, 96(%1)\n"
+		:
+		: "r" (from), "r" (to)
+		: "memory");
+
+		to += bs;
+		from += bs;
+	}
+
+	__asm__ __volatile__ (
+		" sfence\n" : :
+	);
+}
+
 struct pmem_device {
 	struct request_queue	*pmem_queue;
 	struct gendisk		*pmem_disk;
@@ -37,6 +413,81 @@ struct pmem_device {
 	size_t			size;
 };
 
+/* pick the type of memcpy for a read from NVDIMMs */
+static void memcpy_ntr(void *to, const void *from, size_t size)
+{
+	switch (pmem_ntr) {
+	case 1:
+		memcpy_lnt_st_64(to, from, size);
+		break;
+	case 2:
+		memcpy_lnt_st_128(to, from, size);
+		break;
+	case 3:
+		memcpy_lnt_snt_64(to, from, size);
+		break;
+	case 4:
+		memcpy_lnt_snt_128(to, from, size);
+		break;
+	case 6:
+		/* nop */
+		break;
+	case 7:
+		memcpy_lnt_sskip_64(to, from, size);
+		break;
+	default:
+		memcpy(to, from, size);
+		break;
+	}
+}
+
+/* pick the type of memcpy for a write to NVDIMMs */
+static void memcpy_ntw(void *to, const void *from, size_t size)
+{
+	int ret;
+	switch (pmem_ntw) {
+	case 1:
+		memcpy_lt_snt_64(to, from, size);
+		ret = 0;
+		break;
+	case 2:
+		memcpy_lt_snt_128(to, from, size);
+		ret = 0;
+		break;
+	case 3:
+		memcpy_lnt_snt_64(to, from, size);
+		ret = 0;
+		break;
+	case 4:
+		memcpy_lnt_snt_128(to, from, size);
+		ret = 0;
+		break;
+	case 5:
+		ret = __copy_from_user(to, from, size);
+		if (ret)
+			goto exit;
+	case 6:		/* case 5 falls through here on success */
+		/* nop */
+		ret = 0;
+		break;
+	case 7:
+		memcpy_lskip_snt_64(to, from, size);
+		ret = 0;
+		break;
+	default:
+		memcpy(to, from, size);
+		ret = 0;
+		break;
+	}
+exit:
+	/* If __copy_from_user or another memcpy variant with a return
+	 * value is used, the return value should really be propagated
+	 * upstream.  Since most memcpy interfaces assume success,
+	 * forgo this for now.
+	 */
+	return;
+}
+
 static int pmem_major;
 
 static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
@@ -47,11 +498,11 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	size_t pmem_off = sector << 9;
 
 	if (rw == READ) {
-		memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+		memcpy_ntr(mem + off, pmem->virt_addr + pmem_off, len);
 		flush_dcache_page(page);
 	} else {
 		flush_dcache_page(page);
-		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+		memcpy_ntw(pmem->virt_addr + pmem_off, mem + off, len);
 	}
 
 	kunmap_atomic(mem);
@@ -109,10 +560,26 @@ static int pmem_rw_bytes(struct nd_io *ndio, void *buf, size_t offset,
 		return -EFAULT;
 	}
 
-	if (rw == READ)
-		memcpy(buf, pmem->virt_addr + offset, n);
-	else
-		memcpy(pmem->virt_addr + offset, buf, n);
+	/* NOTE: Plain memcpy is used for unaligned accesses, meaning
+	 * this is not safe for WB mode.
+	 *
+	 * All btt accesses come through here; many are not aligned.
+	 */
+	if (rw == READ) {
+		if (IS_ALIGNED((u64) buf, 64) &&
+		    IS_ALIGNED((u64) pmem->virt_addr + offset, 64) &&
+		    IS_ALIGNED(n, 64))
+			memcpy_ntr(buf, pmem->virt_addr + offset, n);
+		else
+			memcpy(buf, pmem->virt_addr + offset, n);
+	} else {
+		if (IS_ALIGNED((u64) buf, 64) &&
+		    IS_ALIGNED((u64) pmem->virt_addr + offset, 64) &&
+		    IS_ALIGNED(n, 64))
+			memcpy_ntw(pmem->virt_addr + offset, buf, n);
+		else
+			memcpy(pmem->virt_addr + offset, buf, n);
+	}
 
 	return 0;
 }
@@ -143,6 +610,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res,
 	struct pmem_device *pmem;
 	struct gendisk *disk;
 	int err;
+	u64 ts, te;
 
 	err = -ENOMEM;
 	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
@@ -152,21 +620,78 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res,
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
 
+	dev_info(dev,
+		"mapping phys=0x%llx (%lld GiB) size=0x%zx (%ld GiB)\n",
+		pmem->phys_addr, pmem->phys_addr / (1024*1024*1024),
+		pmem->size, pmem->size / (1024*1024*1024));
+
 	err = -EINVAL;
 	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
 		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size);
 		goto out_free_dev;
 	}
 
-	/*
-	 * Map the memory as non-cachable, as we can't write back the contents
-	 * of the CPU caches in case of a crash.
-	 */
 	err = -ENOMEM;
-	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+	switch (pmem_cachetype) {
+	case 0: /* UC */
+		pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+		break;
+	case 1: /* WB */
+		/* WB is unsafe unless system flushes caches on power loss */
+		pmem->virt_addr = ioremap_cache(pmem->phys_addr, pmem->size);
+		break;
+	case 2: /* WC */
+		/* WC is unsafe unless system flushes buffers on power loss */
+		pmem->virt_addr = ioremap_wc(pmem->phys_addr, pmem->size);
+		break;
+	case 3: /* WT */
+	default:
+		pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size);
+		break;
+	}
+
+	dev_info(dev,
+		"mapped: cache_type=%d virt=0x%p phys=0x%llx (%lld GiB) size=0x%zx (%ld GiB)\n",
+		pmem_cachetype,
+		pmem->virt_addr,
+		pmem->phys_addr, pmem->phys_addr / (1024*1024*1024),
+		pmem->size, pmem->size / (1024*1024*1024));
+
 	if (!pmem->virt_addr)
 		goto out_release_region;
 
+	if (pmem_clean) {
+		/* write all of NVDIMM memory to clear any ECC errors */
+		dev_info(dev,
+			"write clean starting: virt=0x%p phys=0x%llx (%lld GiB) size=0x%zx (%ld GiB)\n",
+			pmem->virt_addr,
+			pmem->phys_addr, pmem->phys_addr / (1024*1024*1024),
+			pmem->size, pmem->size / (1024*1024*1024));
+		ts = local_clock();
+		memcpy_lskip_snt_64(pmem->virt_addr, NULL, pmem->size);
+		te = local_clock();
+		dev_info(dev,
+			"write clean complete: ct=%d in %lld GB/s\n",
+			pmem_cachetype,
+			pmem->size / (te - ts));	/* B/ns equals GB/s */
+	}
+
+	/* read all of NVDIMM memory to trigger any ECC errors now */
+	if (pmem_readscan) {
+		dev_info(dev,
+			"read scan starting: virt=0x%p phys=0x%llx (%lld GiB) size=0x%zx (%ld GiB)\n",
+			pmem->virt_addr,
+			pmem->phys_addr, pmem->phys_addr / (1024*1024*1024),
+			pmem->size, pmem->size / (1024*1024*1024));
+		ts = local_clock();
+		memcpy_lnt_sskip_64(NULL, pmem->virt_addr, pmem->size);
+		te = local_clock();
+		dev_info(dev,
+			"read scan complete: ct=%d in %lld GB/s\n",
+			pmem_cachetype,
+			pmem->size / (te - ts));	/* B/ns equals GB/s */
+	}
+
 	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!pmem->pmem_queue)
 		goto out_unmap;
@@ -276,6 +801,9 @@ static int __init pmem_init(void)
 {
 	int error;
 
+	pr_info("pmem loading with pmem_readscan=%d pmem_clean=%d pmem_cachetype=%d pmem_ntw=%d pmem_ntr=%d\n",
+		pmem_readscan, pmem_clean, pmem_cachetype, pmem_ntw, pmem_ntr);
+
 	pmem_major = register_blkdev(0, "pmem");
 	if (pmem_major < 0)
 		return pmem_major;
-- 
1.8.3.1

