All of lore.kernel.org
 help / color / mirror / Atom feed
* another pmem variant V3
@ 2015-04-01  7:12 Christoph Hellwig
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-01  7:12 UTC (permalink / raw)
  To: linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe; +Cc: ross.zwisler, boaz

Here is another version of the same trivial pmem driver, because two
obviously aren't enough.  The first patch is the same pmem driver
that Ross posted a short time ago, just modified to use platform_devices
to find the persistant memory region instead of hardconding it in the
Kconfig.  This allows to keep pmem.c separate from any discovery mechanism,
but still allow auto-discovery.

The other two patches are a heavily rewritten version of the code that
Intel gave to various storage vendors to discover the type 12 (and earlier
type 6) nvdimms, which I massaged into a form that is hopefully suitable
for mainline.

Note that pmem.c really is the minimal version as I think we need something
included ASAP.  We'll eventually need to be able to do other I/O from and
to it, and as most people know everyone has their own preferre method to
do it, which I'd like to discuss once we have the basic driver in.

This has been tested on a real NVDIMM on a system with a type 12
capable BIOS.

Changes since V2:
  - dropped support for the memmap= kernel command line override
  - dropped the not needed memblock_reserve call
  - merged various cleanups from Boaz in pmem.c

Changes since V1:
  - s/E820_PROTECTED_KERN/E820_PMEM/g
  - map the persistent memory as uncached
  - better kernel parameter description
  - various typo fixes
  - MODULE_LICENSE fix


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-01  7:12 another pmem variant V3 Christoph Hellwig
@ 2015-04-01  7:12 ` Christoph Hellwig
  2015-04-01 14:25   ` [PATCH] SQUASHME: Fixes to e820 handling of pmem Boaz Harrosh
                     ` (3 more replies)
  2015-04-01  7:12 ` [PATCH 2/2] pmem: add a driver for persistent memory Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 4 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-01  7:12 UTC (permalink / raw)
  To: linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe; +Cc: ross.zwisler, boaz

Various recent BIOSes support NVDIMMs or ADR using a non-standard
e820 memory type, and Intel supplied reference Linux code using this
type to various vendors.

Wire this e820 table type up to export platform devices for the pmem
driver so that we can use it in Linux.

Based on arlier work from Dave Jiang <dave.jiang@intel.com> and
Dan Williams <dan.j.williams@intel.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig                 | 10 ++++++++
 arch/x86/include/uapi/asm/e820.h | 10 ++++++++
 arch/x86/kernel/Makefile         |  1 +
 arch/x86/kernel/e820.c           | 28 ++++++++++++++++-----
 arch/x86/kernel/pmem.c           | 53 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/pmem.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..c0e8ee3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY
+	bool "Support non-standard NVDIMMs and ADR protected memory"
+	help
+	  Treat memory marked using the non-standard e820 type of 12 as used
+	  by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+	  The kernel will offer these regions to the pmem driver so
+	  they can be used for persistent storage.
+
+	  Say Y if unsure.
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33..ce0d0bf 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot.  The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
+ *
+ * Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently.  Some
+ * time they will learn..
+ */
+#define E820_PRAM	12
 
 /*
  * reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..971f18c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201de..e26ca56 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PRAM:
+		printk(KERN_CONT "persistent (type %u)", type);
+		break;
 	default:
 		printk(KERN_CONT "type %u", type);
 		break;
@@ -688,8 +691,15 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 			register_nosave_region(pfn, PFN_UP(ei->addr));
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
-		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+
+		switch (ei->type) {
+		case E820_RAM:
+		case E820_PRAM:
+		case E820_RESERVED_KERN:
+			break;
+		default:
 			register_nosave_region(PFN_UP(ei->addr), pfn);
+		}
 
 		if (pfn >= limit_pfn)
 			break;
@@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		if (ei->type != type)
+		/*
+		 * Persistent memory is accounted as ram for purposes of
+		 * establishing max_pfn and mem_map.
+		 */
+		if (ei->type != E820_RAM && ei->type != E820_PRAM)
 			continue;
 
 		start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +798,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 }
 unsigned long __init e820_end_of_ram_pfn(void)
 {
-	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+	return e820_end_pfn(MAX_ARCH_PFN);
 }
 
 unsigned long __init e820_end_of_low_ram_pfn(void)
 {
-	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT));
 }
 
 static void early_panic(char *msg)
@@ -907,6 +921,7 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
+	case E820_PRAM: return "Persistent RAM";
 	default:	return "reserved";
 	}
 }
@@ -941,7 +956,8 @@ void __init e820_reserve_resources(void)
 		 * pcibios_resource_survey()
 		 */
 		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
-			res->flags |= IORESOURCE_BUSY;
+			if (e820.map[i].type != E820_PRAM)
+				res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
 		res++;
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 0000000..89ab53c
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+	struct platform_device *pdev;
+	int error;
+
+	pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+	if (!pdev)
+		return;
+
+	error = platform_device_add_resources(pdev, res, 1);
+	if (error)
+		goto out_put_pdev;
+
+	error = platform_device_add(pdev);
+	if (error)
+		goto out_put_pdev;
+	return;
+
+out_put_pdev:
+	dev_warn(&pdev->dev, "failed to add pmem device!\n");
+	platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->type == E820_PRAM) {
+			struct resource res = {
+				.flags	= IORESOURCE_MEM,
+				.start	= ei->addr,
+				.end	= ei->addr + ei->size - 1,
+			};
+			register_pmem_device(&res);
+		}
+	}
+
+	return 0;
+}
+device_initcall(register_pmem_devices);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 2/2] pmem: add a driver for persistent memory
  2015-04-01  7:12 another pmem variant V3 Christoph Hellwig
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
@ 2015-04-01  7:12 ` Christoph Hellwig
  2015-04-01 15:18   ` Boaz Harrosh
  2015-04-02 12:31   ` [tip:x86/pmem] drivers/block/pmem: Add " tip-bot for Ross Zwisler
  2015-04-02 15:31 ` [PATCH] pmem: Add prints at module load and unload Boaz Harrosh
  2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
  3 siblings, 2 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-01  7:12 UTC (permalink / raw)
  To: linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe; +Cc: ross.zwisler, boaz

From: Ross Zwisler <ross.zwisler@linux.intel.com>

PMEM is a new driver that presents a reserved range of memory as a
block device.  This is useful for developing with NV-DIMMs, and
can be used with volatile memory as a development platform.

This patch contains the initial driver from Ross Zwisler, with
various changes from Boaz Harrosh and me.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[hch: convert to use a platform_device for discovery, fix partition
 support, merged various patches from Boaz]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS            |   6 ++
 drivers/block/Kconfig  |  11 +++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 260 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 drivers/block/pmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6afa..4517613 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8071,6 +8071,12 @@ S:	Maintained
 F:	Documentation/blockdev/ramdisk.txt
 F:	drivers/block/brd.c
 
+PERSISTENT MEMORY DRIVER
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/block/pmem.c
+
 RANDOM NUMBER DRIVER
 M:	"Theodore Ts'o" <tytso@mit.edu>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..34753cf 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
 	  and will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+	help
+	  Saying Y here will allow you to use a contiguous range of reserved
+	  memory as one or more block devices.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called pmem.
+
+	  If unsure, say N.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_PMEM)	+= pmem.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 0000000..0232ab7
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,260 @@
+/*
+ * Persistent Memory Driver
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define PMEM_MINORS		16
+
+struct pmem_device {
+	struct request_queue	*pmem_queue;
+	struct gendisk		*pmem_disk;
+
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;
+	void			*virt_addr;
+	size_t			size;
+};
+
+static int pmem_major;
+static atomic_t pmem_index;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	void *mem = kmap_atomic(page);
+	size_t pmem_off = sector << 9;
+
+	if (rw == READ) {
+		memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+	}
+
+	kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	int rw;
+	struct bio_vec bvec;
+	sector_t sector;
+	struct bvec_iter iter;
+	int err = 0;
+
+	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
+		err = -EIO;
+		goto out;
+	}
+
+	BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+	rw = bio_data_dir(bio);
+	sector = bio->bi_iter.bi_sector;
+	bio_for_each_segment(bvec, bio, iter) {
+		pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+			     rw, sector);
+		sector += bvec.bv_len >> 9;
+	}
+
+out:
+	bio_endio(bio, err);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, int rw)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, 0);
+	return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+			      void **kaddr, unsigned long *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	size_t offset = sector << 9;
+
+	if (!pmem)
+		return -ENODEV;
+
+	*kaddr = pmem->virt_addr + offset;
+	*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+	return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		pmem_rw_page,
+	.direct_access =	pmem_direct_access,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+{
+	struct pmem_device *pmem;
+	struct gendisk *disk;
+	int idx, err;
+
+	err = -ENOMEM;
+	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+	if (!pmem)
+		goto out;
+
+	pmem->phys_addr = res->start;
+	pmem->size = resource_size(res);
+
+	err = -EINVAL;
+	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+		dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
+			   pmem->phys_addr, pmem->size);
+		goto out_free_dev;
+	}
+
+	/*
+	 * Map the memory as non-cachable, as we can't write back the contents
+	 * of the CPU caches in case of a crash.
+	 */
+	err = -ENOMEM;
+	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+	if (!pmem->virt_addr)
+		goto out_release_region;
+
+	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+	if (!pmem->pmem_queue)
+		goto out_unmap;
+
+	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+
+	disk = alloc_disk(PMEM_MINORS);
+	if (!disk)
+		goto out_free_queue;
+
+	idx = atomic_inc_return(&pmem_index) - 1;
+
+	disk->major		= pmem_major;
+	disk->first_minor	= PMEM_MINORS * idx;
+	disk->fops		= &pmem_fops;
+	disk->private_data	= pmem;
+	disk->queue		= pmem->pmem_queue;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "pmem%d", idx);
+	disk->driverfs_dev = dev;
+	set_capacity(disk, pmem->size >> 9);
+	pmem->pmem_disk = disk;
+
+	add_disk(disk);
+	return pmem;
+
+out_free_queue:
+	blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+	iounmap(pmem->virt_addr);
+out_release_region:
+	release_mem_region(pmem->phys_addr, pmem->size);
+out_free_dev:
+	kfree(pmem);
+out:
+	return ERR_PTR(err);
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+	del_gendisk(pmem->pmem_disk);
+	put_disk(pmem->pmem_disk);
+	blk_cleanup_queue(pmem->pmem_queue);
+	iounmap(pmem->virt_addr);
+	release_mem_region(pmem->phys_addr, pmem->size);
+	kfree(pmem);
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+	struct pmem_device *pmem;
+	struct resource *res;
+
+	if (WARN_ON(pdev->num_resources > 1))
+		return -ENXIO;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	pmem = pmem_alloc(&pdev->dev, res);
+	if (IS_ERR(pmem))
+		return PTR_ERR(pmem);
+
+	platform_set_drvdata(pdev, pmem);
+	return 0;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+	struct pmem_device *pmem = platform_get_drvdata(pdev);
+
+	pmem_free(pmem);
+	return 0;
+}
+
+static struct platform_driver pmem_driver = {
+	.probe		= pmem_probe,
+	.remove		= pmem_remove,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "pmem",
+	},
+};
+
+static int __init pmem_init(void)
+{
+	int error;
+
+	pmem_major = register_blkdev(0, "pmem");
+	if (pmem_major < 0)
+		return pmem_major;
+
+	error = platform_driver_register(&pmem_driver);
+	if (error)
+		unregister_blkdev(pmem_major, "pmem");
+	return error;
+}
+
+static void pmem_exit(void)
+{
+	platform_driver_unregister(&pmem_driver);
+	unregister_blkdev(pmem_major, "pmem");
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+
+module_init(pmem_init);
+module_exit(pmem_exit);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
@ 2015-04-01 14:25   ` Boaz Harrosh
  2015-04-02  9:30     ` Christoph Hellwig
  2015-04-02 12:31   ` [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type tip-bot for Christoph Hellwig
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-01 14:25 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler


Finally with these fixes I'm able to define memmap=!
regions in NUMA machines. Any combination cross or
not cross NUMA boundary.

And not only the memmap=! regions had problems also
the real type-12 NvDIMMs had the same NUMA problems.
Now it all works.

Also
I have kept the "don't merge PRAM" regions for ease
of emulated NUMA setups.

Also I have reverted the change Ch did to
e820_mark_nosave_regions. From comment above the function
and if I'm reading register_nosave_region() correctly,
We certainly do not want the system to try and save any
pmem to swap or hibernate. (Actually it will be the
opposite right). Can we actually define swap on a /dev/pmemX ? ;-)

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---
 Documentation/kernel-parameters.txt |  6 ++++++
 arch/x86/kernel/e820.c              | 20 +++++++++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a6..c87122d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			         or
 			         memmap=0x10000$0x18690000
 
+	memmap=nn[KMG]!ss[KMG]
+			[KNL,X86] Mark specific memory as protected.
+			Region of memory to be used, from ss to ss+nn.
+			The memory region may be marked as e820 type 12 (0xc)
+			and is NVDIMM or ADR memory.
+
 	memory_corruption_check=0/1 [X86]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e26ca56..3572a22 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -346,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		 * continue building up new bios map based on this
 		 * information
 		 */
-		if (current_type != last_type)	{
+		if (current_type != last_type || current_type == E820_PRAM) {
 			if (last_type != 0)	 {
 				new_bios[new_bios_entry].size =
 					change_point[chgidx]->addr - last_addr;
@@ -692,14 +692,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
 
-		switch (ei->type) {
-		case E820_RAM:
-		case E820_PRAM:
-		case E820_RESERVED_KERN:
-			break;
-		default:
+		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
 			register_nosave_region(PFN_UP(ei->addr), pfn);
-		}
 
 		if (pfn >= limit_pfn)
 			break;
@@ -880,6 +874,9 @@ static int __init parse_memmap_one(char *p)
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
 		e820_add_region(start_at, mem_size, E820_RESERVED);
+	} else if (*p == '!') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_PRAM);
 	} else
 		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
@@ -955,9 +952,10 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
-			if (e820.map[i].type != E820_PRAM)
-				res->flags |= IORESOURCE_BUSY;
+		if (((e820.map[i].type != E820_RESERVED) &&
+		     (e820.map[i].type != E820_PRAM)) ||
+		     res->start < (1ULL<<20)) {
+			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
 		res++;
-- 
1.9.3



^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] pmem: add a driver for persistent memory
  2015-04-01  7:12 ` [PATCH 2/2] pmem: add a driver for persistent memory Christoph Hellwig
@ 2015-04-01 15:18   ` Boaz Harrosh
  2015-04-02  9:32     ` Christoph Hellwig
  2015-04-02 12:31   ` [tip:x86/pmem] drivers/block/pmem: Add " tip-bot for Ross Zwisler
  1 sibling, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-01 15:18 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler, boaz

On 04/01/2015 10:12 AM, Christoph Hellwig wrote:
> From: Ross Zwisler <ross.zwisler@linux.intel.com>
> 
> PMEM is a new driver that presents a reserved range of memory as a
> block device.  This is useful for developing with NV-DIMMs, and
> can be used with volatile memory as a development platform.
> 
> This patch contains the initial driver from Ross Zwisler, with
> various changes from Boaz Harrosh and me.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> [hch: convert to use a platform_device for discovery, fix partition
>  support, merged various patches from Boaz]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> Acked-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  MAINTAINERS            |   6 ++
>  drivers/block/Kconfig  |  11 +++
>  drivers/block/Makefile |   1 +
>  drivers/block/pmem.c   | 260 +++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 278 insertions(+)
>  create mode 100644 drivers/block/pmem.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1de6afa..4517613 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8071,6 +8071,12 @@ S:	Maintained
>  F:	Documentation/blockdev/ramdisk.txt
>  F:	drivers/block/brd.c
>  
> +PERSISTENT MEMORY DRIVER
> +M:	Ross Zwisler <ross.zwisler@linux.intel.com>
> +L:	linux-nvdimm@lists.01.org
> +S:	Supported
> +F:	drivers/block/pmem.c
> +
>  RANDOM NUMBER DRIVER
>  M:	"Theodore Ts'o" <tytso@mit.edu>
>  S:	Maintained
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 1b8094d..34753cf 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
>  	  and will prevent RAM block device backing store memory from being
>  	  allocated from highmem (only a problem for highmem systems).
>  
> +config BLK_DEV_PMEM
> +	tristate "Persistent memory block device support"
> +	help
> +	  Saying Y here will allow you to use a contiguous range of reserved
> +	  memory as one or more block devices.
> +
> +	  To compile this driver as a module, choose M here: the module will be
> +	  called pmem.
> +
> +	  If unsure, say N.
> +
>  config CDROM_PKTCDVD
>  	tristate "Packet writing on CD/DVD media"
>  	depends on !UML
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 02b688d..9cc6c18 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
>  obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
>  obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
>  obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
> +obj-$(CONFIG_BLK_DEV_PMEM)	+= pmem.o
>  obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
>  obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
>  obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> new file mode 100644
> index 0000000..0232ab7
> --- /dev/null
> +++ b/drivers/block/pmem.c
> @@ -0,0 +1,260 @@
> +/*
> + * Persistent Memory Driver
> + * Copyright (c) 2014, Intel Corporation.
> + * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
> + * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <asm/cacheflush.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/init.h>
> +#include <linux/platform_device.h>
> +#include <linux/module.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +
> +#define PMEM_MINORS		16
> +
> +struct pmem_device {
> +	struct request_queue	*pmem_queue;
> +	struct gendisk		*pmem_disk;
> +
> +	/* One contiguous memory region per device */
> +	phys_addr_t		phys_addr;
> +	void			*virt_addr;
> +	size_t			size;
> +};
> +
> +static int pmem_major;
> +static atomic_t pmem_index;
> +
> +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
> +			unsigned int len, unsigned int off, int rw,
> +			sector_t sector)
> +{
> +	void *mem = kmap_atomic(page);
> +	size_t pmem_off = sector << 9;
> +
> +	if (rw == READ) {
> +		memcpy(mem + off, pmem->virt_addr + pmem_off, len);
> +		flush_dcache_page(page);
> +	} else {
> +		flush_dcache_page(page);
> +		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
> +	}
> +
> +	kunmap_atomic(mem);
> +}
> +
> +static void pmem_make_request(struct request_queue *q, struct bio *bio)
> +{
> +	struct block_device *bdev = bio->bi_bdev;
> +	struct pmem_device *pmem = bdev->bd_disk->private_data;
> +	int rw;
> +	struct bio_vec bvec;
> +	sector_t sector;
> +	struct bvec_iter iter;
> +	int err = 0;
> +
> +	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	BUG_ON(bio->bi_rw & REQ_DISCARD);
> +
> +	rw = bio_data_dir(bio);

OK cool I was afraid since you did not like my unlikely patch
that you might have missed this fix.

> +	sector = bio->bi_iter.bi_sector;
> +	bio_for_each_segment(bvec, bio, iter) {
> +		pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
> +			     rw, sector);
> +		sector += bvec.bv_len >> 9;
> +	}
> +
> +out:
> +	bio_endio(bio, err);
> +}
> +
> +static int pmem_rw_page(struct block_device *bdev, sector_t sector,
> +		       struct page *page, int rw)
> +{
> +	struct pmem_device *pmem = bdev->bd_disk->private_data;
> +
> +	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
> +	page_endio(page, rw & WRITE, 0);

You might as well rw & WRITE also for the above pmem_do_bvec call.
If there can be such problem than the rw == READ inside
pmem_do_bvec will fail as well.

> +	return 0;
> +}
> +
> +static long pmem_direct_access(struct block_device *bdev, sector_t sector,
> +			      void **kaddr, unsigned long *pfn, long size)
> +{
> +	struct pmem_device *pmem = bdev->bd_disk->private_data;
> +	size_t offset = sector << 9;
> +
> +	if (!pmem)
> +		return -ENODEV;
> +
> +	*kaddr = pmem->virt_addr + offset;
> +	*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
> +
> +	return pmem->size - offset;
> +}
> +
> +static const struct block_device_operations pmem_fops = {
> +	.owner =		THIS_MODULE,
> +	.rw_page =		pmem_rw_page,
> +	.direct_access =	pmem_direct_access,
> +};
> +
> +static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
> +{
> +	struct pmem_device *pmem;
> +	struct gendisk *disk;
> +	int idx, err;
> +
> +	err = -ENOMEM;
> +	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
> +	if (!pmem)
> +		goto out;
> +
> +	pmem->phys_addr = res->start;
> +	pmem->size = resource_size(res);
> +
> +	err = -EINVAL;
> +	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
> +		dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
> +			   pmem->phys_addr, pmem->size);
> +		goto out_free_dev;
> +	}
> +
> +	/*
> +	 * Map the memory as non-cachable, as we can't write back the contents
> +	 * of the CPU caches in case of a crash.
> +	 */
> +	err = -ENOMEM;
> +	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
> +	if (!pmem->virt_addr)
> +		goto out_release_region;

You have removed the pmem_mapmem() helper and embedded all this here.
The patch to add page-structs that I'm posting on top of
this, uses the abstraction to switch between implementations.

I'll add the split-out to the page-structs patch.

Am heavy testing this set (Together with my small fix to e820)
And will report tomorrow if the lab is able to survive the night.

Thanks
Boaz

> +
> +	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
> +	if (!pmem->pmem_queue)
> +		goto out_unmap;
> +
> +	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
> +	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
> +	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
> +
> +	disk = alloc_disk(PMEM_MINORS);
> +	if (!disk)
> +		goto out_free_queue;
> +
> +	idx = atomic_inc_return(&pmem_index) - 1;
> +
> +	disk->major		= pmem_major;
> +	disk->first_minor	= PMEM_MINORS * idx;
> +	disk->fops		= &pmem_fops;
> +	disk->private_data	= pmem;
> +	disk->queue		= pmem->pmem_queue;
> +	disk->flags		= GENHD_FL_EXT_DEVT;
> +	sprintf(disk->disk_name, "pmem%d", idx);
> +	disk->driverfs_dev = dev;
> +	set_capacity(disk, pmem->size >> 9);
> +	pmem->pmem_disk = disk;
> +
> +	add_disk(disk);
> +	return pmem;
> +
> +out_free_queue:
> +	blk_cleanup_queue(pmem->pmem_queue);
> +out_unmap:
> +	iounmap(pmem->virt_addr);
> +out_release_region:
> +	release_mem_region(pmem->phys_addr, pmem->size);
> +out_free_dev:
> +	kfree(pmem);
> +out:
> +	return ERR_PTR(err);
> +}
> +
> +static void pmem_free(struct pmem_device *pmem)
> +{
> +	del_gendisk(pmem->pmem_disk);
> +	put_disk(pmem->pmem_disk);
> +	blk_cleanup_queue(pmem->pmem_queue);
> +	iounmap(pmem->virt_addr);
> +	release_mem_region(pmem->phys_addr, pmem->size);
> +	kfree(pmem);
> +}
> +
> +static int pmem_probe(struct platform_device *pdev)
> +{
> +	struct pmem_device *pmem;
> +	struct resource *res;
> +
> +	if (WARN_ON(pdev->num_resources > 1))
> +		return -ENXIO;
> +
> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	if (!res)
> +		return -ENXIO;
> +
> +	pmem = pmem_alloc(&pdev->dev, res);
> +	if (IS_ERR(pmem))
> +		return PTR_ERR(pmem);
> +
> +	platform_set_drvdata(pdev, pmem);
> +	return 0;
> +}
> +
> +static int pmem_remove(struct platform_device *pdev)
> +{
> +	struct pmem_device *pmem = platform_get_drvdata(pdev);
> +
> +	pmem_free(pmem);
> +	return 0;
> +}
> +
> +static struct platform_driver pmem_driver = {
> +	.probe		= pmem_probe,
> +	.remove		= pmem_remove,
> +	.driver		= {
> +		.owner	= THIS_MODULE,
> +		.name	= "pmem",
> +	},
> +};
> +
> +static int __init pmem_init(void)
> +{
> +	int error;
> +
> +	pmem_major = register_blkdev(0, "pmem");
> +	if (pmem_major < 0)
> +		return pmem_major;
> +
> +	error = platform_driver_register(&pmem_driver);
> +	if (error)
> +		unregister_blkdev(pmem_major, "pmem");
> +	return error;
> +}
> +
> +static void pmem_exit(void)
> +{
> +	platform_driver_unregister(&pmem_driver);
> +	unregister_blkdev(pmem_major, "pmem");
> +}
> +
> +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
> +MODULE_LICENSE("GPL v2");
> +
> +module_init(pmem_init);
> +module_exit(pmem_exit);
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-01 14:25   ` [PATCH] SQUASHME: Fixes to e820 handling of pmem Boaz Harrosh
@ 2015-04-02  9:30     ` Christoph Hellwig
  2015-04-02  9:37       ` Ingo Molnar
  2015-04-02 11:20       ` Boaz Harrosh
  0 siblings, 2 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-02  9:30 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
>  		pfn = PFN_DOWN(ei->addr + ei->size);
>  
> -		switch (ei->type) {
> -		case E820_RAM:
> -		case E820_PRAM:
> -		case E820_RESERVED_KERN:
> -			break;
> -		default:
> +		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
>  			register_nosave_region(PFN_UP(ei->addr), pfn);
> -		}

I guess this makes sense - if the content is persistent already we don't need
to save it.

> -		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> -			if (e820.map[i].type != E820_PRAM)
> -				res->flags |= IORESOURCE_BUSY;
> +		if (((e820.map[i].type != E820_RESERVED) &&
> +		     (e820.map[i].type != E820_PRAM)) ||
> +		     res->start < (1ULL<<20)) {

So now we also trigger for PRAM regions under 1ULL<<20, was that the
intentional change?  Honestly I don't really understand this 1ULL<<20
magic here even for the existing case.  Guess this is magic from the
old ISA PC days?  

> +			res->flags |= IORESOURCE_BUSY;

Guess this is the real change, and I'd love to understand why this
makes a difference for you.  IORESOURCE_BUSY is checked almost never,
and is intented to mean it's a driver mapping.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 2/2] pmem: add a driver for persistent memory
  2015-04-01 15:18   ` Boaz Harrosh
@ 2015-04-02  9:32     ` Christoph Hellwig
  0 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-02  9:32 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On Wed, Apr 01, 2015 at 06:18:54PM +0300, Boaz Harrosh wrote:
> You might as well rw & WRITE also for the above pmem_do_bvec call.
> If there can be such problem than the rw == READ inside
> pmem_do_bvec will fail as well.

Might be worth to pass a bool to pmem_do_bvec, but we're getting into
serious bikeshed territory here..


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-02  9:30     ` Christoph Hellwig
@ 2015-04-02  9:37       ` Ingo Molnar
  2015-04-02  9:40         ` Christoph Hellwig
  2015-04-02 11:18         ` Christoph Hellwig
  2015-04-02 11:20       ` Boaz Harrosh
  1 sibling, 2 replies; 50+ messages in thread
From: Ingo Molnar @ 2015-04-02  9:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Boaz Harrosh, linux-nvdimm, linux-fsdevel, linux-kernel, x86,
	axboe, ross.zwisler


* Christoph Hellwig <hch@lst.de> wrote:

> On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
> >  		pfn = PFN_DOWN(ei->addr + ei->size);
> >  
> > -		switch (ei->type) {
> > -		case E820_RAM:
> > -		case E820_PRAM:
> > -		case E820_RESERVED_KERN:
> > -			break;
> > -		default:
> > +		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
> >  			register_nosave_region(PFN_UP(ei->addr), pfn);
> > -		}
> 
> I guess this makes sense - if the content is persistent already we don't need
> to save it.
> 
> > -		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> > -			if (e820.map[i].type != E820_PRAM)
> > -				res->flags |= IORESOURCE_BUSY;
> > +		if (((e820.map[i].type != E820_RESERVED) &&
> > +		     (e820.map[i].type != E820_PRAM)) ||
> > +		     res->start < (1ULL<<20)) {
> 
> So now we also trigger for PRAM regions under 1ULL<<20, was that the
> intentional change?  Honestly I don't really understand this 1ULL<<20
> magic here even for the existing case.  Guess this is magic from the
> old ISA PC days?  
> 
> > +			res->flags |= IORESOURCE_BUSY;
> 
> Guess this is the real change, and I'd love to understand why this 
> makes a difference for you.  IORESOURCE_BUSY is checked almost 
> never, and is intented to mean it's a driver mapping.

So assuming this works on your test setup I'm inclined to squash 
Boaz's fixes into the original patch, assuming you see no outright bug 
in them. Anything else can be done as delta improvements.

Agreed?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-02  9:37       ` Ingo Molnar
@ 2015-04-02  9:40         ` Christoph Hellwig
  2015-04-02 11:18         ` Christoph Hellwig
  1 sibling, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-02  9:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Boaz Harrosh, linux-nvdimm, linux-fsdevel,
	linux-kernel, x86, axboe, ross.zwisler

On Thu, Apr 02, 2015 at 11:37:40AM +0200, Ingo Molnar wrote:
> So assuming this works on your test setup I'm inclined to squash 
> Boaz's fixes into the original patch, assuming you see no outright bug 
> in them. Anything else can be done as delta improvements.

It looks sensible, but I'd really like to understand the changes a bit
better.  In the meantime I'll test them on my setup.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-02  9:37       ` Ingo Molnar
  2015-04-02  9:40         ` Christoph Hellwig
@ 2015-04-02 11:18         ` Christoph Hellwig
  1 sibling, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-02 11:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Boaz Harrosh, linux-nvdimm, linux-fsdevel,
	linux-kernel, x86, axboe, ross.zwisler

On Thu, Apr 02, 2015 at 11:37:40AM +0200, Ingo Molnar wrote:
> So assuming this works on your test setup

Boaz's changes work fine here.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] SQUASHME: Fixes to e820 handling of pmem
  2015-04-02  9:30     ` Christoph Hellwig
  2015-04-02  9:37       ` Ingo Molnar
@ 2015-04-02 11:20       ` Boaz Harrosh
  1 sibling, 0 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-02 11:20 UTC (permalink / raw)
  To: Christoph Hellwig, Boaz Harrosh
  Cc: linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe, ross.zwisler

On 04/02/2015 12:30 PM, Christoph Hellwig wrote:
> On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
>>  		pfn = PFN_DOWN(ei->addr + ei->size);
>>  
>> -		switch (ei->type) {
>> -		case E820_RAM:
>> -		case E820_PRAM:
>> -		case E820_RESERVED_KERN:
>> -			break;
>> -		default:
>> +		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
>>  			register_nosave_region(PFN_UP(ei->addr), pfn);
>> -		}
> 
> I guess this makes sense - if the content is persistent already we don't need
> to save it.
> 

Yes

>> -		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
>> -			if (e820.map[i].type != E820_PRAM)
>> -				res->flags |= IORESOURCE_BUSY;
>> +		if (((e820.map[i].type != E820_RESERVED) &&
>> +		     (e820.map[i].type != E820_PRAM)) ||
>> +		     res->start < (1ULL<<20)) {
> 
> So now we also trigger for PRAM regions under 1ULL<<20, was that the
> intentional change?  Honestly I don't really understand this 1ULL<<20
> magic here even for the existing case.  Guess this is magic from the
> old ISA PC days?  
> 

OK so by now I have had a chance to test all cases and figure out
what happened. I was running on your V1 and then V2. And had huge
problems with NUMA. The thing that actually fixed everything and
brought the system back to life, was already in your V3.

It was the remove of reserve_pmem() and the call to memblock_reserve()
and init_memory_mapping() from within. It was making a mess.

So your V3 was already running nice. But I already fixed everything
on my side by the time you sent V3, and what I sent here is the
diff from what I had and your V3.

But these are all good fixes still. (Though not fatal as V2 was)

3 small fixes here:
* Adding back the memmap=! now that everything works perfectly
  as expected.

* The one above that fixes register_nosave_region

and ...
>> +			res->flags |= IORESOURCE_BUSY;
> 
> Guess this is the real change, and I'd love to understand why this
> makes a difference for you.  IORESOURCE_BUSY is checked almost never,
> and is intented to mean it's a driver mapping.

This here is a very minor thing. I have lots of experience with this
code and with the internals of insert_resource() from my old e820.c
fixes (Which are BTW still relevant but no longer needed for any
current platform)

So what happens here is just the adding of the IORESOURCE_BUSY flag.
Actually all these resources are already in the resource list and this
code is just changing the flag. So if you are not changing the flag there
is just no point in calling insert_resource(). It will change nothing.

>From what I understand from the history of this file and from prints
that I put in this file and at the resource manager. The logic is that
it wants to make all E820_XXX busy so to not let any driver try and claim them.
And Also the code does not want to allow any kind of e820 resource below the 1M
be allowed for drivers, reserved or otherwise.

Any E820_RESERVED regions it allows for driver use by not setting IORESOURCE_BUSY.

Your code and mine are effectively the same only that I optimize out the call to
insert_resource() since the flags were not changed.

Testing status:
We had some birth problems with the new system and the APIs that changed so the
testing rig broke half through the night. But we have wrinkled out the last
problems and the machines are pumping away full steam, NUMA and everything.
So far it looks good. I will very soon now, Send you a tested-by: That
confirms these patches are just as good as what we had until now out-of-tree.
(I'm running with my page-struct patches on top of your pmem driver. Will publish
 trees soon)

Thank you for your patience
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
  2015-04-01 14:25   ` [PATCH] SQUASHME: Fixes to e820 handling of pmem Boaz Harrosh
@ 2015-04-02 12:31   ` tip-bot for Christoph Hellwig
  2015-04-02 19:08     ` Andy Lutomirski
  2015-04-02 20:28     ` Yinghai Lu
  2015-04-02 20:23   ` [PATCH 1/2] x86: add " Yinghai Lu
  2015-04-03 16:14   ` [Linux-nvdimm] " Toshi Kani
  3 siblings, 2 replies; 50+ messages in thread
From: tip-bot for Christoph Hellwig @ 2015-04-02 12:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: boaz, axboe, axboe, tglx, linux-kernel, dan.j.williams, hpa,
	luto, bp, akpm, torvalds, keith.busch, willy, mingo, hch,
	ross.zwisler

Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
Author:     Christoph Hellwig <hch@lst.de>
AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 1 Apr 2015 17:02:43 +0200

x86/mm: Add support for the non-standard protected e820 type

Various recent BIOSes support NVDIMMs or ADR using a
non-standard e820 memory type, and Intel supplied reference
Linux code using this type to various vendors.

Wire this e820 table type up to export platform devices for the
pmem driver so that we can use it in Linux.

Based on earlier work from:

   Dave Jiang <dave.jiang@intel.com>
   Dan Williams <dan.j.williams@intel.com>

Includes fixes for NUMA regions from Boaz Harrosh.

Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-nvdimm@ml01.01.org
Link: http://lkml.kernel.org/r/1427872339-6688-2-git-send-email-hch@lst.de
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/kernel-parameters.txt |  6 +++++
 arch/x86/Kconfig                    | 10 +++++++
 arch/x86/include/uapi/asm/e820.h    | 10 +++++++
 arch/x86/kernel/Makefile            |  1 +
 arch/x86/kernel/e820.c              | 26 +++++++++++++-----
 arch/x86/kernel/pmem.c              | 53 +++++++++++++++++++++++++++++++++++++
 6 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a6..c87122d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			         or
 			         memmap=0x10000$0x18690000
 
+	memmap=nn[KMG]!ss[KMG]
+			[KNL,X86] Mark specific memory as protected.
+			Region of memory to be used, from ss to ss+nn.
+			The memory region may be marked as e820 type 12 (0xc)
+			and is NVDIMM or ADR memory.
+
 	memory_corruption_check=0/1 [X86]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..9e3bcd6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY
+	bool "Support non-standard NVDIMMs and ADR protected memory"
+	help
+	  Treat memory marked using the non-standard e820 type of 12 as used
+	  by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+	  The kernel will offer these regions to the 'pmem' driver so
+	  they can be used for persistent storage.
+
+	  Say Y if unsure.
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33..960a8a9 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot.  The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ *   but newer versions switched to 12 as 6 was assigned differently.  Some
+ *   time they will learn... )
+ */
+#define E820_PRAM	12
 
 /*
  * reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..971f18c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201de..11cc7d5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PRAM:
+		printk(KERN_CONT "persistent (type %u)", type);
+		break;
 	default:
 		printk(KERN_CONT "type %u", type);
 		break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		 * continue building up new bios map based on this
 		 * information
 		 */
-		if (current_type != last_type)	{
+		if (current_type != last_type || current_type == E820_PRAM) {
 			if (last_type != 0)	 {
 				new_bios[new_bios_entry].size =
 					change_point[chgidx]->addr - last_addr;
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 			register_nosave_region(pfn, PFN_UP(ei->addr));
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
+
 		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
 			register_nosave_region(PFN_UP(ei->addr), pfn);
 
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		if (ei->type != type)
+		/*
+		 * Persistent memory is accounted as ram for purposes of
+		 * establishing max_pfn and mem_map.
+		 */
+		if (ei->type != E820_RAM && ei->type != E820_PRAM)
 			continue;
 
 		start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 }
 unsigned long __init e820_end_of_ram_pfn(void)
 {
-	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+	return e820_end_pfn(MAX_ARCH_PFN);
 }
 
 unsigned long __init e820_end_of_low_ram_pfn(void)
 {
-	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+	return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
 static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
 		e820_add_region(start_at, mem_size, E820_RESERVED);
+	} else if (*p == '!') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_PRAM);
 	} else
 		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
+	case E820_PRAM: return "Persistent RAM";
 	default:	return "reserved";
 	}
 }
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+		if (((e820.map[i].type != E820_RESERVED) &&
+		     (e820.map[i].type != E820_PRAM)) ||
+		     res->start < (1ULL<<20)) {
 			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 0000000..3420c87
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+	struct platform_device *pdev;
+	int error;
+
+	pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+	if (!pdev)
+		return;
+
+	error = platform_device_add_resources(pdev, res, 1);
+	if (error)
+		goto out_put_pdev;
+
+	error = platform_device_add(pdev);
+	if (error)
+		goto out_put_pdev;
+	return;
+
+out_put_pdev:
+	dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
+	platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->type == E820_PRAM) {
+			struct resource res = {
+				.flags	= IORESOURCE_MEM,
+				.start	= ei->addr,
+				.end	= ei->addr + ei->size - 1,
+			};
+			register_pmem_device(&res);
+		}
+	}
+
+	return 0;
+}
+device_initcall(register_pmem_devices);

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [tip:x86/pmem] drivers/block/pmem: Add a driver for persistent memory
  2015-04-01  7:12 ` [PATCH 2/2] pmem: add a driver for persistent memory Christoph Hellwig
  2015-04-01 15:18   ` Boaz Harrosh
@ 2015-04-02 12:31   ` tip-bot for Ross Zwisler
  1 sibling, 0 replies; 50+ messages in thread
From: tip-bot for Ross Zwisler @ 2015-04-02 12:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: ross.zwisler, dan.j.williams, tglx, keith.busch, mingo, bp, akpm,
	hpa, linux-kernel, luto, willy, boaz, axboe, hch, torvalds,
	axboe

Commit-ID:  9e853f2313e5eb163cb1ea461b23c2332cf6438a
Gitweb:     http://git.kernel.org/tip/9e853f2313e5eb163cb1ea461b23c2332cf6438a
Author:     Ross Zwisler <ross.zwisler@linux.intel.com>
AuthorDate: Wed, 1 Apr 2015 09:12:19 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 1 Apr 2015 17:03:56 +0200

drivers/block/pmem: Add a driver for persistent memory

PMEM is a new driver that presents a reserved range of memory as
a block device.  This is useful for developing with NV-DIMMs,
and can be used with volatile memory as a development platform.

This patch contains the initial driver from Ross Zwisler, with
various changes: converted it to use a platform_device for
discovery, fixed partition support and merged various patches
from Boaz Harrosh.

Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-nvdimm@ml01.01.org
Link: http://lkml.kernel.org/r/1427872339-6688-3-git-send-email-hch@lst.de
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 MAINTAINERS            |   6 ++
 drivers/block/Kconfig  |  11 +++
 drivers/block/Makefile |   1 +
 drivers/block/pmem.c   | 263 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 281 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6afa..4517613 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8071,6 +8071,12 @@ S:	Maintained
 F:	Documentation/blockdev/ramdisk.txt
 F:	drivers/block/brd.c
 
+PERSISTENT MEMORY DRIVER
+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/block/pmem.c
+
 RANDOM NUMBER DRIVER
 M:	"Theodore Ts'o" <tytso@mit.edu>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..eb1fed5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
 	  and will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+	help
+	  Saying Y here will allow you to use a contiguous range of reserved
+	  memory as one or more persistent block devices.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called 'pmem'.
+
+	  If unsure, say N.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_PMEM)	+= pmem.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 0000000..988f384
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,263 @@
+/*
+ * Persistent Memory Driver
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define PMEM_MINORS		16
+
+struct pmem_device {
+	struct request_queue	*pmem_queue;
+	struct gendisk		*pmem_disk;
+
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;
+	void			*virt_addr;
+	size_t			size;
+};
+
+static int pmem_major;
+static atomic_t pmem_index;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	void *mem = kmap_atomic(page);
+	size_t pmem_off = sector << 9;
+
+	if (rw == READ) {
+		memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+	}
+
+	kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	int rw;
+	struct bio_vec bvec;
+	sector_t sector;
+	struct bvec_iter iter;
+	int err = 0;
+
+	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
+		err = -EIO;
+		goto out;
+	}
+
+	BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+	rw = bio_data_dir(bio);
+	sector = bio->bi_iter.bi_sector;
+	bio_for_each_segment(bvec, bio, iter) {
+		pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+			     rw, sector);
+		sector += bvec.bv_len >> 9;
+	}
+
+out:
+	bio_endio(bio, err);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, int rw)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, 0);
+
+	return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+			      void **kaddr, unsigned long *pfn, long size)
+{
+	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	size_t offset = sector << 9;
+
+	if (!pmem)
+		return -ENODEV;
+
+	*kaddr = pmem->virt_addr + offset;
+	*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+	return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		pmem_rw_page,
+	.direct_access =	pmem_direct_access,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+{
+	struct pmem_device *pmem;
+	struct gendisk *disk;
+	int idx, err;
+
+	err = -ENOMEM;
+	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+	if (!pmem)
+		goto out;
+
+	pmem->phys_addr = res->start;
+	pmem->size = resource_size(res);
+
+	err = -EINVAL;
+	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+		dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
+			   pmem->phys_addr, pmem->size);
+		goto out_free_dev;
+	}
+
+	/*
+	 * Map the memory as non-cachable, as we can't write back the contents
+	 * of the CPU caches in case of a crash.
+	 */
+	err = -ENOMEM;
+	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+	if (!pmem->virt_addr)
+		goto out_release_region;
+
+	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+	if (!pmem->pmem_queue)
+		goto out_unmap;
+
+	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+
+	disk = alloc_disk(PMEM_MINORS);
+	if (!disk)
+		goto out_free_queue;
+
+	idx = atomic_inc_return(&pmem_index) - 1;
+
+	disk->major		= pmem_major;
+	disk->first_minor	= PMEM_MINORS * idx;
+	disk->fops		= &pmem_fops;
+	disk->private_data	= pmem;
+	disk->queue		= pmem->pmem_queue;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "pmem%d", idx);
+	disk->driverfs_dev = dev;
+	set_capacity(disk, pmem->size >> 9);
+	pmem->pmem_disk = disk;
+
+	add_disk(disk);
+
+	return pmem;
+
+out_free_queue:
+	blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+	iounmap(pmem->virt_addr);
+out_release_region:
+	release_mem_region(pmem->phys_addr, pmem->size);
+out_free_dev:
+	kfree(pmem);
+out:
+	return ERR_PTR(err);
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+	del_gendisk(pmem->pmem_disk);
+	put_disk(pmem->pmem_disk);
+	blk_cleanup_queue(pmem->pmem_queue);
+	iounmap(pmem->virt_addr);
+	release_mem_region(pmem->phys_addr, pmem->size);
+	kfree(pmem);
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+	struct pmem_device *pmem;
+	struct resource *res;
+
+	if (WARN_ON(pdev->num_resources > 1))
+		return -ENXIO;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	pmem = pmem_alloc(&pdev->dev, res);
+	if (IS_ERR(pmem))
+		return PTR_ERR(pmem);
+
+	platform_set_drvdata(pdev, pmem);
+
+	return 0;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+	struct pmem_device *pmem = platform_get_drvdata(pdev);
+
+	pmem_free(pmem);
+	return 0;
+}
+
+static struct platform_driver pmem_driver = {
+	.probe		= pmem_probe,
+	.remove		= pmem_remove,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "pmem",
+	},
+};
+
+static int __init pmem_init(void)
+{
+	int error;
+
+	pmem_major = register_blkdev(0, "pmem");
+	if (pmem_major < 0)
+		return pmem_major;
+
+	error = platform_driver_register(&pmem_driver);
+	if (error)
+		unregister_blkdev(pmem_major, "pmem");
+	return error;
+}
+module_init(pmem_init);
+
+static void pmem_exit(void)
+{
+	platform_driver_unregister(&pmem_driver);
+	unregister_blkdev(pmem_major, "pmem");
+}
+module_exit(pmem_exit);
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH] pmem: Add prints at module load and unload
  2015-04-01  7:12 another pmem variant V3 Christoph Hellwig
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
  2015-04-01  7:12 ` [PATCH 2/2] pmem: add a driver for persistent memory Christoph Hellwig
@ 2015-04-02 15:31 ` Boaz Harrosh
  2015-04-02 15:39   ` [Linux-nvdimm] " Dan Williams
  2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
  3 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-02 15:31 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler

Hi Christoph, Ingo

Please consider this small patch below just a small print at module
load/unload so to know at user systems how things progressed.
As it is now, we know nothing. For any other disk kind we have two
tuns of prints.

---
From: Boaz Harrosh <boaz@plexistor.com>
Date: Thu, 2 Apr 2015 16:43:48 +0300
Subject: [PATCH] pmem: Add prints at module load and unload

When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.

The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.

While at it fix up a small issue with rw flags.

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---
 drivers/block/pmem.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..3f15fbc 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 
+	rw &= WRITE;
 	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
-	page_endio(page, rw & WRITE, 0);
+	page_endio(page, rw, 0);
 
 	return 0;
 }
@@ -248,6 +249,9 @@ static int __init pmem_init(void)
 	error = platform_driver_register(&pmem_driver);
 	if (error)
 		unregister_blkdev(pmem_major, "pmem");
+
+	pr_info("pmem: init %d devices => %d\n",
+		atomic_read(&pmem_index), error);
 	return error;
 }
 module_init(pmem_init);
@@ -256,6 +260,7 @@ static void pmem_exit(void)
 {
 	platform_driver_unregister(&pmem_driver);
 	unregister_blkdev(pmem_major, "pmem");
+	pr_info("pmem: exit\n");
 }
 module_exit(pmem_exit);
 
-- 
1.9.3



^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-02 15:31 ` [PATCH] pmem: Add prints at module load and unload Boaz Harrosh
@ 2015-04-02 15:39   ` Dan Williams
  2015-04-02 15:47     ` Boaz Harrosh
  0 siblings, 1 reply; 50+ messages in thread
From: Dan Williams @ 2015-04-02 15:39 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	X86 ML, Jens Axboe

On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> Hi Christoph, Ingo
>
> Please consider this small patch below just a small print at module
> load/unload so to know at user systems how things progressed.
> As it is now, we know nothing. For any other disk kind we have two
> tuns of prints.
>
> ---
> From: Boaz Harrosh <boaz@plexistor.com>
> Date: Thu, 2 Apr 2015 16:43:48 +0300
> Subject: [PATCH] pmem: Add prints at module load and unload
>
> When debugging people's systems it is helpful
> to see what went on. The load and unload of pmem is
> an important event.
>
> The importance is the number of loaded devices and
> error status. The exact device's addresses created
> we can see from the other prints at e820 so no need
> to duplicate this information.
>
> While at it fix up a small issue with rw flags.

Separate patch for fixes please.

>
> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
> ---
>  drivers/block/pmem.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> index 988f384..3f15fbc 100644
> --- a/drivers/block/pmem.c
> +++ b/drivers/block/pmem.c
> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>  {
>         struct pmem_device *pmem = bdev->bd_disk->private_data;
>
> +       rw &= WRITE;
>         pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
> -       page_endio(page, rw & WRITE, 0);
> +       page_endio(page, rw, 0);
>
>         return 0;
>  }
> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
>         error = platform_driver_register(&pmem_driver);
>         if (error)
>                 unregister_blkdev(pmem_major, "pmem");
> +
> +       pr_info("pmem: init %d devices => %d\n",
> +               atomic_read(&pmem_index), error);

If anything I think these should be dev_dbg().

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-02 15:39   ` [Linux-nvdimm] " Dan Williams
@ 2015-04-02 15:47     ` Boaz Harrosh
  2015-04-02 16:01       ` Dan Williams
  0 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-02 15:47 UTC (permalink / raw)
  To: Dan Williams
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	X86 ML, Jens Axboe, Ingo Molnar

On 04/02/2015 06:39 PM, Dan Williams wrote:
> On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>> Hi Christoph, Ingo
>>
>> Please consider this small patch below just a small print at module
>> load/unload so to know at user systems how things progressed.
>> As it is now, we know nothing. For any other disk kind we have two
>> tuns of prints.
>>
>> ---
>> From: Boaz Harrosh <boaz@plexistor.com>
>> Date: Thu, 2 Apr 2015 16:43:48 +0300
>> Subject: [PATCH] pmem: Add prints at module load and unload
>>
>> When debugging people's systems it is helpful
>> to see what went on. The load and unload of pmem is
>> an important event.
>>
>> The importance is the number of loaded devices and
>> error status. The exact device's addresses created
>> we can see from the other prints at e820 so no need
>> to duplicate this information.
>>
>> While at it fix up a small issue with rw flags.
> 
> Separate patch for fixes please.

Sigh, OK I was preparing this and hopping it would just
be SQUASHED into the original patch but Ingo bit me to it
and already submitted the patches.

> 
>>
>> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
>> ---
>>  drivers/block/pmem.c | 7 ++++++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>> index 988f384..3f15fbc 100644
>> --- a/drivers/block/pmem.c
>> +++ b/drivers/block/pmem.c
>> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>>  {
>>         struct pmem_device *pmem = bdev->bd_disk->private_data;
>>
>> +       rw &= WRITE;
>>         pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
>> -       page_endio(page, rw & WRITE, 0);
>> +       page_endio(page, rw, 0);
>>
>>         return 0;
>>  }
>> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
>>         error = platform_driver_register(&pmem_driver);
>>         if (error)
>>                 unregister_blkdev(pmem_major, "pmem");
>> +
>> +       pr_info("pmem: init %d devices => %d\n",
>> +               atomic_read(&pmem_index), error);
> 
> If anything I think these should be dev_dbg().

We do not have a dev at any of this point, and it does not
belong to any specific device. Also I would like this
_info and not _dbg so to always have it, also for production.
See the chatter for a single SCSI disk the minimum we need
is just the small print that tells all that we need (for now)

Please consider as is?

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-02 15:47     ` Boaz Harrosh
@ 2015-04-02 16:01       ` Dan Williams
  2015-04-02 16:44         ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Dan Williams @ 2015-04-02 16:01 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	X86 ML, Jens Axboe, Ingo Molnar

On Thu, Apr 2, 2015 at 8:47 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 04/02/2015 06:39 PM, Dan Williams wrote:
>> On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
>>> Hi Christoph, Ingo
>>>
>>> Please consider this small patch below just a small print at module
>>> load/unload so to know at user systems how things progressed.
>>> As it is now, we know nothing. For any other disk kind we have two
>>> tuns of prints.
>>>
>>> ---
>>> From: Boaz Harrosh <boaz@plexistor.com>
>>> Date: Thu, 2 Apr 2015 16:43:48 +0300
>>> Subject: [PATCH] pmem: Add prints at module load and unload
>>>
>>> When debugging people's systems it is helpful
>>> to see what went on. The load and unload of pmem is
>>> an important event.
>>>
>>> The importance is the number of loaded devices and
>>> error status. The exact device's addresses created
>>> we can see from the other prints at e820 so no need
>>> to duplicate this information.
>>>
>>> While at it fix up a small issue with rw flags.
>>
>> Separate patch for fixes please.
>
> Sigh, OK I was preparing this and hopping it would just
> be SQUASHED into the original patch but Ingo bit me to it
> and already submitted the patches.
>
>>
>>>
>>> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
>>> ---
>>>  drivers/block/pmem.c | 7 ++++++-
>>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>>> index 988f384..3f15fbc 100644
>>> --- a/drivers/block/pmem.c
>>> +++ b/drivers/block/pmem.c
>>> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>>>  {
>>>         struct pmem_device *pmem = bdev->bd_disk->private_data;
>>>
>>> +       rw &= WRITE;
>>>         pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
>>> -       page_endio(page, rw & WRITE, 0);
>>> +       page_endio(page, rw, 0);
>>>
>>>         return 0;
>>>  }
>>> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
>>>         error = platform_driver_register(&pmem_driver);
>>>         if (error)
>>>                 unregister_blkdev(pmem_major, "pmem");
>>> +
>>> +       pr_info("pmem: init %d devices => %d\n",
>>> +               atomic_read(&pmem_index), error);
>>
>> If anything I think these should be dev_dbg().
>
> We do not have a dev at any of this point, and it does not
> belong to any specific device.

Ah, true this is prior to the driver attaching... that said it seems
more relevant to print from probe() (where we do have a device) than
init where the device may remain idle due to some other policy.

> Also I would like this
> _info and not _dbg so to always have it, also for production.
> See the chatter for a single SCSI disk the minimum we need
> is just the small print that tells all that we need (for now)

Not sure we want to follow so closely in the footsteps of SCSI's chattiness.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-02 16:01       ` Dan Williams
@ 2015-04-02 16:44         ` Christoph Hellwig
  2015-04-05  8:50           ` Boaz Harrosh
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-02 16:44 UTC (permalink / raw)
  To: Dan Williams
  Cc: Boaz Harrosh, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	linux-kernel, X86 ML, Jens Axboe, Ingo Molnar

On Thu, Apr 02, 2015 at 09:01:14AM -0700, Dan Williams wrote:
> >> If anything I think these should be dev_dbg().
> >
> > We do not have a dev at any of this point, and it does not
> > belong to any specific device.
> 
> Ah, true this is prior to the driver attaching... that said it seems
> more relevant to print from probe() (where we do have a device) than
> init where the device may remain idle due to some other policy.
> 
> > Also I would like this
> > _info and not _dbg so to always have it, also for production.
> > See the chatter for a single SCSI disk the minimum we need
> > is just the small print that tells all that we need (for now)
> 
> Not sure we want to follow so closely in the footsteps of SCSI's chattiness.

Defintively not!  A single dev_info at ->probe time sounds ok, something
like:

	dev_info(dev, "registering region [0x%pa:0x%zx]\n",
			&pmem->phys_addr, pmem->size);

but there are plenty other drivers not that chatty, e.g. virtio and we're
doing just fine for them.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-02 12:31   ` [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type tip-bot for Christoph Hellwig
@ 2015-04-02 19:08     ` Andy Lutomirski
  2015-04-02 19:13       ` Ingo Molnar
  2015-04-02 20:28     ` Yinghai Lu
  1 sibling, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2015-04-02 19:08 UTC (permalink / raw)
  To: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin,
	Andy Lutomirski, Jens Axboe, linux-kernel, Thomas Gleixner,
	Borislav Petkov, Andrew Morton, Linus Torvalds,
	Christoph Hellwig, Ross Zwisler, Ingo Molnar, Matthew Wilcox,
	keith.busch
  Cc: linux-tip-commits

On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
<tipbot@zytor.com> wrote:
> Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Author:     Christoph Hellwig <hch@lst.de>
> AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>
> x86/mm: Add support for the non-standard protected e820 type
>
> Various recent BIOSes support NVDIMMs or ADR using a
> non-standard e820 memory type, and Intel supplied reference
> Linux code using this type to various vendors.
>
> Wire this e820 table type up to export platform devices for the
> pmem driver so that we can use it in Linux.

This scares me a bit.  Do we know that the upcoming ACPI 6.0
enumeration mechanism *won't* use e820 type 12?  If it will, what
stops a non-legacy device from being incorrectly claimed as a legacy
device?

--Andy

>
> Based on earlier work from:
>
>    Dave Jiang <dave.jiang@intel.com>
>    Dan Williams <dan.j.williams@intel.com>
>
> Includes fixes for NUMA regions from Boaz Harrosh.
>
> Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Acked-by: Dan Williams <dan.j.williams@intel.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Andy Lutomirski <luto@amacapital.net>
> Cc: Boaz Harrosh <boaz@plexistor.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: H. Peter Anvin <hpa@zytor.com>
> Cc: Jens Axboe <axboe@fb.com>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Keith Busch <keith.busch@intel.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Matthew Wilcox <willy@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: linux-nvdimm@ml01.01.org
> Link: http://lkml.kernel.org/r/1427872339-6688-2-git-send-email-hch@lst.de
> [ Minor cleanups. ]
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> ---
>  Documentation/kernel-parameters.txt |  6 +++++
>  arch/x86/Kconfig                    | 10 +++++++
>  arch/x86/include/uapi/asm/e820.h    | 10 +++++++
>  arch/x86/kernel/Makefile            |  1 +
>  arch/x86/kernel/e820.c              | 26 +++++++++++++-----
>  arch/x86/kernel/pmem.c              | 53 +++++++++++++++++++++++++++++++++++++
>  6 files changed, 100 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index bfcb1a6..c87122d 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
>                                  or
>                                  memmap=0x10000$0x18690000
>
> +       memmap=nn[KMG]!ss[KMG]
> +                       [KNL,X86] Mark specific memory as protected.
> +                       Region of memory to be used, from ss to ss+nn.
> +                       The memory region may be marked as e820 type 12 (0xc)
> +                       and is NVDIMM or ADR memory.
> +
>         memory_corruption_check=0/1 [X86]
>                         Some BIOSes seem to corrupt the first 64k of
>                         memory when doing things like suspend/resume.
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index b7d31ca..9e3bcd6 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
>
>  source "mm/Kconfig"
>
> +config X86_PMEM_LEGACY
> +       bool "Support non-standard NVDIMMs and ADR protected memory"
> +       help
> +         Treat memory marked using the non-standard e820 type of 12 as used
> +         by the Intel Sandy Bridge-EP reference BIOS as protected memory.
> +         The kernel will offer these regions to the 'pmem' driver so
> +         they can be used for persistent storage.
> +
> +         Say Y if unsure.
> +
>  config HIGHPTE
>         bool "Allocate 3rd-level pagetables from highmem"
>         depends on HIGHMEM
> diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
> index d993e33..960a8a9 100644
> --- a/arch/x86/include/uapi/asm/e820.h
> +++ b/arch/x86/include/uapi/asm/e820.h
> @@ -33,6 +33,16 @@
>  #define E820_NVS       4
>  #define E820_UNUSABLE  5
>
> +/*
> + * This is a non-standardized way to represent ADR or NVDIMM regions that
> + * persist over a reboot.  The kernel will ignore their special capabilities
> + * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
> + *
> + * ( Note that older platforms also used 6 for the same type of memory,
> + *   but newer versions switched to 12 as 6 was assigned differently.  Some
> + *   time they will learn... )
> + */
> +#define E820_PRAM      12
>
>  /*
>   * reserved RAM used by kernel itself
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index cdb1b70..971f18c 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST)               += kvm.o kvmclock.o
>  obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
>  obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
>  obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
> +obj-$(CONFIG_X86_PMEM_LEGACY)  += pmem.o
>
>  obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
>
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 46201de..11cc7d5 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
>         case E820_UNUSABLE:
>                 printk(KERN_CONT "unusable");
>                 break;
> +       case E820_PRAM:
> +               printk(KERN_CONT "persistent (type %u)", type);
> +               break;
>         default:
>                 printk(KERN_CONT "type %u", type);
>                 break;
> @@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
>                  * continue building up new bios map based on this
>                  * information
>                  */
> -               if (current_type != last_type)  {
> +               if (current_type != last_type || current_type == E820_PRAM) {
>                         if (last_type != 0)      {
>                                 new_bios[new_bios_entry].size =
>                                         change_point[chgidx]->addr - last_addr;
> @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
>                         register_nosave_region(pfn, PFN_UP(ei->addr));
>
>                 pfn = PFN_DOWN(ei->addr + ei->size);
> +
>                 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
>                         register_nosave_region(PFN_UP(ei->addr), pfn);
>
> @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>  /*
>   * Find the highest page frame number we have available
>   */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>  {
>         int i;
>         unsigned long last_pfn = 0;
> @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>                 unsigned long start_pfn;
>                 unsigned long end_pfn;
>
> -               if (ei->type != type)
> +               /*
> +                * Persistent memory is accounted as ram for purposes of
> +                * establishing max_pfn and mem_map.
> +                */
> +               if (ei->type != E820_RAM && ei->type != E820_PRAM)
>                         continue;
>
>                 start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>  }
>  unsigned long __init e820_end_of_ram_pfn(void)
>  {
> -       return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> +       return e820_end_pfn(MAX_ARCH_PFN);
>  }
>
>  unsigned long __init e820_end_of_low_ram_pfn(void)
>  {
> -       return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> +       return e820_end_pfn(1UL << (32-PAGE_SHIFT));
>  }
>
>  static void early_panic(char *msg)
> @@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
>         } else if (*p == '$') {
>                 start_at = memparse(p+1, &p);
>                 e820_add_region(start_at, mem_size, E820_RESERVED);
> +       } else if (*p == '!') {
> +               start_at = memparse(p+1, &p);
> +               e820_add_region(start_at, mem_size, E820_PRAM);
>         } else
>                 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
>
> @@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
>         case E820_ACPI: return "ACPI Tables";
>         case E820_NVS:  return "ACPI Non-volatile Storage";
>         case E820_UNUSABLE:     return "Unusable memory";
> +       case E820_PRAM: return "Persistent RAM";
>         default:        return "reserved";
>         }
>  }
> @@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
>                  * pci device BAR resource and insert them later in
>                  * pcibios_resource_survey()
>                  */
> -               if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> +               if (((e820.map[i].type != E820_RESERVED) &&
> +                    (e820.map[i].type != E820_PRAM)) ||
> +                    res->start < (1ULL<<20)) {
>                         res->flags |= IORESOURCE_BUSY;
>                         insert_resource(&iomem_resource, res);
>                 }
> diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
> new file mode 100644
> index 0000000..3420c87
> --- /dev/null
> +++ b/arch/x86/kernel/pmem.c
> @@ -0,0 +1,53 @@
> +/*
> + * Copyright (c) 2015, Christoph Hellwig.
> + */
> +#include <linux/memblock.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +#include <asm/e820.h>
> +#include <asm/page_types.h>
> +#include <asm/setup.h>
> +
> +static __init void register_pmem_device(struct resource *res)
> +{
> +       struct platform_device *pdev;
> +       int error;
> +
> +       pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
> +       if (!pdev)
> +               return;
> +
> +       error = platform_device_add_resources(pdev, res, 1);
> +       if (error)
> +               goto out_put_pdev;
> +
> +       error = platform_device_add(pdev);
> +       if (error)
> +               goto out_put_pdev;
> +       return;
> +
> +out_put_pdev:
> +       dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
> +       platform_device_put(pdev);
> +}
> +
> +static __init int register_pmem_devices(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < e820.nr_map; i++) {
> +               struct e820entry *ei = &e820.map[i];
> +
> +               if (ei->type == E820_PRAM) {
> +                       struct resource res = {
> +                               .flags  = IORESOURCE_MEM,
> +                               .start  = ei->addr,
> +                               .end    = ei->addr + ei->size - 1,
> +                       };
> +                       register_pmem_device(&res);
> +               }
> +       }
> +
> +       return 0;
> +}
> +device_initcall(register_pmem_devices);



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-02 19:08     ` Andy Lutomirski
@ 2015-04-02 19:13       ` Ingo Molnar
  2015-04-02 19:51         ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: Ingo Molnar @ 2015-04-02 19:13 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin, Jens Axboe,
	linux-kernel, Thomas Gleixner, Borislav Petkov, Andrew Morton,
	Linus Torvalds, Christoph Hellwig, Ross Zwisler, Matthew Wilcox,
	keith.busch, linux-tip-commits


* Andy Lutomirski <luto@amacapital.net> wrote:

> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
> <tipbot@zytor.com> wrote:
> > Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> > Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> > Author:     Christoph Hellwig <hch@lst.de>
> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> > Committer:  Ingo Molnar <mingo@kernel.org>
> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
> >
> > x86/mm: Add support for the non-standard protected e820 type
> >
> > Various recent BIOSes support NVDIMMs or ADR using a
> > non-standard e820 memory type, and Intel supplied reference
> > Linux code using this type to various vendors.
> >
> > Wire this e820 table type up to export platform devices for the
> > pmem driver so that we can use it in Linux.
> 
> This scares me a bit.  Do we know that the upcoming ACPI 6.0
> enumeration mechanism *won't* use e820 type 12? [...]

So I know nothing about it, but I'd be surprised if e820 was touched 
at all, as e820 isn't really well suited to enumerate more complex 
resources, and it appears pmem wants to grow into complex directions?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-02 19:13       ` Ingo Molnar
@ 2015-04-02 19:51         ` Andy Lutomirski
  2015-04-16 22:31           ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2015-04-02 19:51 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin, Jens Axboe,
	linux-kernel, Thomas Gleixner, Borislav Petkov, Andrew Morton,
	Linus Torvalds, Christoph Hellwig, Ross Zwisler, Matthew Wilcox,
	keith.busch, linux-tip-commits

On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Andy Lutomirski <luto@amacapital.net> wrote:
>
>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>> <tipbot@zytor.com> wrote:
>> > Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> > Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> > Author:     Christoph Hellwig <hch@lst.de>
>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>> > Committer:  Ingo Molnar <mingo@kernel.org>
>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>> >
>> > x86/mm: Add support for the non-standard protected e820 type
>> >
>> > Various recent BIOSes support NVDIMMs or ADR using a
>> > non-standard e820 memory type, and Intel supplied reference
>> > Linux code using this type to various vendors.
>> >
>> > Wire this e820 table type up to export platform devices for the
>> > pmem driver so that we can use it in Linux.
>>
>> This scares me a bit.  Do we know that the upcoming ACPI 6.0
>> enumeration mechanism *won't* use e820 type 12? [...]
>
> So I know nothing about it, but I'd be surprised if e820 was touched
> at all, as e820 isn't really well suited to enumerate more complex
> resources, and it appears pmem wants to grow into complex directions?

I hope so, but I have no idea what the ACPI committee's schemes are.

We could require pmem.enable_legacy_e820=Y to load the driver for now
if we're concerned about it.

--Andy

>
> Thanks,
>
>         Ingo



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
  2015-04-01 14:25   ` [PATCH] SQUASHME: Fixes to e820 handling of pmem Boaz Harrosh
  2015-04-02 12:31   ` [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type tip-bot for Christoph Hellwig
@ 2015-04-02 20:23   ` Yinghai Lu
  2015-04-03 16:14   ` [Linux-nvdimm] " Toshi Kani
  3 siblings, 0 replies; 50+ messages in thread
From: Yinghai Lu @ 2015-04-02 20:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvdimm, linux-fsdevel, Linux Kernel Mailing List,
	the arch/x86 maintainers, Jens Axboe, ross.zwisler, Boaz Harrosh

On Wed, Apr 1, 2015 at 12:12 AM, Christoph Hellwig <hch@lst.de> wrote:
> Various recent BIOSes support NVDIMMs or ADR using a non-standard
> e820 memory type, and Intel supplied reference Linux code using this
> type to various vendors.
>
> Wire this e820 table type up to export platform devices for the pmem
> driver so that we can use it in Linux.
>
> Based on arlier work from Dave Jiang <dave.jiang@intel.com> and
> Dan Williams <dan.j.williams@intel.com>.
>
...
> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>  /*
>   * Find the highest page frame number we have available
>   */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>  {
>         int i;
>         unsigned long last_pfn = 0;
> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>                 unsigned long start_pfn;
>                 unsigned long end_pfn;
>
> -               if (ei->type != type)
> +               /*
> +                * Persistent memory is accounted as ram for purposes of
> +                * establishing max_pfn and mem_map.
> +                */
> +               if (ei->type != E820_RAM && ei->type != E820_PRAM)
>                         continue;
>
>                 start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +798,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>  }
>  unsigned long __init e820_end_of_ram_pfn(void)
>  {
> -       return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> +       return e820_end_pfn(MAX_ARCH_PFN);
>  }
>
>  unsigned long __init e820_end_of_low_ram_pfn(void)
>  {
> -       return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> +       return e820_end_pfn(1UL<<(32 - PAGE_SHIFT));
>  }

You are using iomap_nocache to access pmem.

Do you still need to account E820_PRAM to get those end_of_ram ?
You should not need that to help to set kernel mapping.

So please drop those not needed changes.

Thanks

Yinghai

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-02 12:31   ` [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type tip-bot for Christoph Hellwig
  2015-04-02 19:08     ` Andy Lutomirski
@ 2015-04-02 20:28     ` Yinghai Lu
  1 sibling, 0 replies; 50+ messages in thread
From: Yinghai Lu @ 2015-04-02 20:28 UTC (permalink / raw)
  To: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin,
	Andy Lutomirski, Jens Axboe, Linux Kernel Mailing List,
	Thomas Gleixner, Borislav Petkov, Andrew Morton, Linus Torvalds,
	Christoph Hellwig, ross.zwisler, Ingo Molnar, Matthew Wilcox,
	keith.busch
  Cc: linux-tip-commits

On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
<tipbot@zytor.com> wrote:
> Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Author:     Christoph Hellwig <hch@lst.de>
> AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>
> x86/mm: Add support for the non-standard protected e820 type
..
> @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
>                         register_nosave_region(pfn, PFN_UP(ei->addr));
>
>                 pfn = PFN_DOWN(ei->addr + ei->size);
> +
>                 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
>                         register_nosave_region(PFN_UP(ei->addr), pfn);
>

related?

> @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>  /*
>   * Find the highest page frame number we have available
>   */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>  {
>         int i;
>         unsigned long last_pfn = 0;
> @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>                 unsigned long start_pfn;
>                 unsigned long end_pfn;
>
> -               if (ei->type != type)
> +               /*
> +                * Persistent memory is accounted as ram for purposes of
> +                * establishing max_pfn and mem_map.
> +                */
> +               if (ei->type != E820_RAM && ei->type != E820_PRAM)
>                         continue;
>
>                 start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>  }
>  unsigned long __init e820_end_of_ram_pfn(void)
>  {
> -       return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> +       return e820_end_pfn(MAX_ARCH_PFN);
>  }
>
>  unsigned long __init e820_end_of_low_ram_pfn(void)
>  {
> -       return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> +       return e820_end_pfn(1UL << (32-PAGE_SHIFT));
>  }
>
>  static void early_panic(char *msg)
>

those eno_of_ram changes are not needed, as it will not used for
kernel mapping setup.

Thanks

Yinghai

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
                     ` (2 preceding siblings ...)
  2015-04-02 20:23   ` [PATCH 1/2] x86: add " Yinghai Lu
@ 2015-04-03 16:14   ` Toshi Kani
  2015-04-03 17:12     ` Yinghai Lu
  3 siblings, 1 reply; 50+ messages in thread
From: Toshi Kani @ 2015-04-03 16:14 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe

On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
  :
> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>  /*
>   * Find the highest page frame number we have available
>   */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>  {
>  	int i;
>  	unsigned long last_pfn = 0;
> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>  		unsigned long start_pfn;
>  		unsigned long end_pfn;
>  
> -		if (ei->type != type)
> +		/*
> +		 * Persistent memory is accounted as ram for purposes of
> +		 * establishing max_pfn and mem_map.
> +		 */
> +		if (ei->type != E820_RAM && ei->type != E820_PRAM)
>  			continue;

Should we also delete this code, accounting E820_PRAM as ram, along with
the deletion of reserve_pmem() in this version?

Thanks,
-Toshi




^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-03 16:14   ` [Linux-nvdimm] " Toshi Kani
@ 2015-04-03 17:12     ` Yinghai Lu
  2015-04-03 20:54       ` Toshi Kani
                         ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: Yinghai Lu @ 2015-04-03 17:12 UTC (permalink / raw)
  To: Toshi Kani
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 1181 bytes --]

On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>   :
>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>>  /*
>>   * Find the highest page frame number we have available
>>   */
>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>>  {
>>       int i;
>>       unsigned long last_pfn = 0;
>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>               unsigned long start_pfn;
>>               unsigned long end_pfn;
>>
>> -             if (ei->type != type)
>> +             /*
>> +              * Persistent memory is accounted as ram for purposes of
>> +              * establishing max_pfn and mem_map.
>> +              */
>> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
>>                       continue;
>
> Should we also delete this code, accounting E820_PRAM as ram, along with
> the deletion of reserve_pmem() in this version?

should revert those end_of_ram change as attached.

[-- Attachment #2: revert_end_of_ram_change.patch --]
[-- Type: text/x-patch, Size: 1298 bytes --]

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85d..e09a346 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -752,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -763,11 +763,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		/*
-		 * Persistent memory is accounted as ram for purposes of
-		 * establishing max_pfn and mem_map.
-		 */
-		if (ei->type != E820_RAM && ei->type != E820_PRAM)
+		if (ei->type != type)
 			continue;
 
 		start_pfn = ei->addr >> PAGE_SHIFT;
@@ -792,12 +788,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 }
 unsigned long __init e820_end_of_ram_pfn(void)
 {
-	return e820_end_pfn(MAX_ARCH_PFN);
+	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
 }
 
 unsigned long __init e820_end_of_low_ram_pfn(void)
 {
-	return e820_end_pfn(1UL << (32-PAGE_SHIFT));
+	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
 }
 
 static void early_panic(char *msg)

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-03 17:12     ` Yinghai Lu
@ 2015-04-03 20:54       ` Toshi Kani
  2015-04-04  9:40         ` Ingo Molnar
  2015-04-05  9:18       ` Boaz Harrosh
  2015-04-06 15:55       ` Christoph Hellwig
  2 siblings, 1 reply; 50+ messages in thread
From: Toshi Kani @ 2015-04-03 20:54 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> >   :
> >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> >>  /*
> >>   * Find the highest page frame number we have available
> >>   */
> >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> >>  {
> >>       int i;
> >>       unsigned long last_pfn = 0;
> >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> >>               unsigned long start_pfn;
> >>               unsigned long end_pfn;
> >>
> >> -             if (ei->type != type)
> >> +             /*
> >> +              * Persistent memory is accounted as ram for purposes of
> >> +              * establishing max_pfn and mem_map.
> >> +              */
> >> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
> >>                       continue;
> >
> > Should we also delete this code, accounting E820_PRAM as ram, along with
> > the deletion of reserve_pmem() in this version?
> 
> should revert those end_of_ram change as attached.

I confirmed that the pmem driver works with the change, and last_pfn is
updated as expected.

Thanks,
-Toshi


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-03 20:54       ` Toshi Kani
@ 2015-04-04  9:40         ` Ingo Molnar
  2015-04-05  7:44           ` Yinghai Lu
  2015-04-06 17:29           ` Toshi Kani
  0 siblings, 2 replies; 50+ messages in thread
From: Ingo Molnar @ 2015-04-04  9:40 UTC (permalink / raw)
  To: Toshi Kani
  Cc: Yinghai Lu, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe


* Toshi Kani <toshi.kani@hp.com> wrote:

> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> > >   :
> > >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> > >>  /*
> > >>   * Find the highest page frame number we have available
> > >>   */
> > >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> > >>  {
> > >>       int i;
> > >>       unsigned long last_pfn = 0;
> > >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > >>               unsigned long start_pfn;
> > >>               unsigned long end_pfn;
> > >>
> > >> -             if (ei->type != type)
> > >> +             /*
> > >> +              * Persistent memory is accounted as ram for purposes of
> > >> +              * establishing max_pfn and mem_map.
> > >> +              */
> > >> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
> > >>                       continue;
> > >
> > > Should we also delete this code, accounting E820_PRAM as ram, along with
> > > the deletion of reserve_pmem() in this version?
> > 
> > should revert those end_of_ram change as attached.
> 
> I confirmed that the pmem driver works with the change, and last_pfn is
> updated as expected.

Could someone please send the fix with a changelog, etc?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-04  9:40         ` Ingo Molnar
@ 2015-04-05  7:44           ` Yinghai Lu
  2015-04-06  7:27             ` Ingo Molnar
  2015-04-06 17:29           ` Toshi Kani
  1 sibling, 1 reply; 50+ messages in thread
From: Yinghai Lu @ 2015-04-05  7:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Toshi Kani, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Sat, Apr 4, 2015 at 2:40 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Toshi Kani <toshi.kani@hp.com> wrote:
>
>> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
>> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
>> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>> >
>> > should revert those end_of_ram change as attached.
>>
>> I confirmed that the pmem driver works with the change, and last_pfn is
>> updated as expected.
>
> Could someone please send the fix with a changelog, etc?
>

Why just fold those change into that commit.
Or you can just drop the patch and ask Christoph to resubmit updated
patch again.

I asked Christoph to remove reserved_pmem, and he agreed to do that

http://lkml.iu.edu/hypermail/linux/kernel/1503.3/02919.html
http://lkml.iu.edu/hypermail/linux/kernel/1503.3/03119.html

but sadly, he did not put me on the CC list for while sending updated patch.
and next day you picked it up to tip/pmem branch.
otherwise we could find the problem early

and he even did not put my name on the changelog :-(
with that, I could find the email early too..

Yinghai

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-02 16:44         ` Christoph Hellwig
@ 2015-04-05  8:50           ` Boaz Harrosh
  2015-04-07 15:19             ` Christoph Hellwig
  0 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-05  8:50 UTC (permalink / raw)
  To: Christoph Hellwig, Dan Williams
  Cc: linux-nvdimm, linux-fsdevel, linux-kernel, X86 ML, Jens Axboe,
	Ingo Molnar

On 04/02/2015 07:44 PM, Christoph Hellwig wrote:
> On Thu, Apr 02, 2015 at 09:01:14AM -0700, Dan Williams wrote:
>>>> If anything I think these should be dev_dbg().
>>>
>>> We do not have a dev at any of this point, and it does not
>>> belong to any specific device.
>>
>> Ah, true this is prior to the driver attaching... that said it seems
>> more relevant to print from probe() (where we do have a device) than
>> init where the device may remain idle due to some other policy.
>>
>>> Also I would like this
>>> _info and not _dbg so to always have it, also for production.
>>> See the chatter for a single SCSI disk the minimum we need
>>> is just the small print that tells all that we need (for now)
>>
>> Not sure we want to follow so closely in the footsteps of SCSI's chattiness.
> 
> Defintively not!  A single dev_info at ->probe time sounds ok, something
> like:
> 
> 	dev_info(dev, "registering region [0x%pa:0x%zx]\n",
> 			&pmem->phys_addr, pmem->size);
> 
> but there are plenty other drivers not that chatty, e.g. virtio and we're
> doing just fine for them.
> 

For me, my print is just the exact amount.

So I will see in my dmesg:

[  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)

...
[  +0.000537] pmem: init 2 devices => 0

So I have all the information. And I know the driver was actually loaded
successfully on the expected two regions.

Your print above will give me two prints with information I already have.
All I want to know from users logs is that the driver was actually loaded
successful or not. And when it was unloaded. I think this is the bear minimum
that gives me all the information I need. Less then this I would be missing
a very important event.

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-03 17:12     ` Yinghai Lu
  2015-04-03 20:54       ` Toshi Kani
@ 2015-04-05  9:18       ` Boaz Harrosh
  2015-04-05 20:06         ` Yinghai Lu
  2015-04-06 15:55       ` Christoph Hellwig
  2 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-05  9:18 UTC (permalink / raw)
  To: Yinghai Lu, Toshi Kani
  Cc: Jens Axboe, linux-nvdimm, the arch/x86 maintainers,
	Linux Kernel Mailing List, linux-fsdevel, Christoph Hellwig

On 04/03/2015 08:12 PM, Yinghai Lu wrote:
> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
>> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>>   :
>>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>>>  /*
>>>   * Find the highest page frame number we have available
>>>   */
>>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>>>  {
>>>       int i;
>>>       unsigned long last_pfn = 0;
>>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>>               unsigned long start_pfn;
>>>               unsigned long end_pfn;
>>>
>>> -             if (ei->type != type)
>>> +             /*
>>> +              * Persistent memory is accounted as ram for purposes of
>>> +              * establishing max_pfn and mem_map.
>>> +              */
>>> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
>>>                       continue;
>>
>> Should we also delete this code, accounting E820_PRAM as ram, along with
>> the deletion of reserve_pmem() in this version?
> 

Hi Yinghai, Toshi

In my old patches I did not have these updates as well, and everything
was very much usable, for a long time.

However. I actually liked these changes in Christoph's patches and
thought they should stay, here is why.

Today I will be sending patches to make pmem be supported with
page-struct as an optional alternative to the use of ioremap.
This is for advanced users that wants to RDMA direct_IO and so
on directly out of pmem.
At one point we had a BUG in some mm/memory.c code that was checking max_pfn.
Actually that was a bug and we do not go through this code anymore. And between
us that global variable max_pfn is a bad hack. But I kind of like to have it as
long as it is used. So code that wants to protect by max_pfn can still accept
pmem memory submitted to it.

I have tried to audit the Kernel use of max_pfn and I do not see how
this can hurt? I do see were it would theoretically help.

Think of a system that looks like this as a memory map:
1. VM (Volitile mem)
2. PM
3. VM
4. PM

Which is what is returned by current and planned NUMA implementations.
So pmem region-2 will be covered by max_pfn. But pmem region 4 will not.
If any code checks for max_pfn it will be OK with pmem-2 but *not* with
pmem-4. This is highly unexpected.

I think the all max_pfn should be killed ASAP, but until it is then
it will not hurt for pmem to be covered.

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-05  9:18       ` Boaz Harrosh
@ 2015-04-05 20:06         ` Yinghai Lu
  2015-04-06  7:16           ` Boaz Harrosh
  0 siblings, 1 reply; 50+ messages in thread
From: Yinghai Lu @ 2015-04-05 20:06 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Toshi Kani, Jens Axboe, linux-nvdimm, the arch/x86 maintainers,
	Linux Kernel Mailing List, linux-fsdevel, Christoph Hellwig

On Sun, Apr 5, 2015 at 2:18 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
> On 04/03/2015 08:12 PM, Yinghai Lu wrote:
>> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
>>> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>>>   :
>>>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>>>>  /*
>>>>   * Find the highest page frame number we have available
>>>>   */
>>>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>>>>  {
>>>>       int i;
>>>>       unsigned long last_pfn = 0;
>>>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>>>               unsigned long start_pfn;
>>>>               unsigned long end_pfn;
>>>>
>>>> -             if (ei->type != type)
>>>> +             /*
>>>> +              * Persistent memory is accounted as ram for purposes of
>>>> +              * establishing max_pfn and mem_map.
>>>> +              */
>>>> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
>>>>                       continue;
>>>
>>> Should we also delete this code, accounting E820_PRAM as ram, along with
>>> the deletion of reserve_pmem() in this version?
>>
>
> Hi Yinghai, Toshi
>
> In my old patches I did not have these updates as well, and everything
> was very much usable, for a long time.
>
> However. I actually liked these changes in Christoph's patches and
> thought they should stay, here is why.
>
> Today I will be sending patches to make pmem be supported with
> page-struct as an optional alternative to the use of ioremap.
> This is for advanced users that wants to RDMA direct_IO and so
> on directly out of pmem.

but it is not related.  Should just remove those lines.

And even his original changes about memblock is not needed.

| You did not modify memblock_x86_fill() to treat
| E820_PRAM as E820_RAM, so memblock will not have any
| entry for E820_PRAM, so you do not need to call memblock_reserve
| there.
|
| And the same time, init_memory_mapping() will call
| init_range_memory_mapping/for_each_mem_pfn_range() to
| set kernel mapping for memory range in memblock only.
| So here calling init_memory_mapping will not do anything.
| then just drop calling to that init_memory_mapping.
| --- so will not kernel mapping pmem, is that what you intended to have?
|
| After those two changes, You do not need this reserve_pmem at all.
| Just drop it.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-05 20:06         ` Yinghai Lu
@ 2015-04-06  7:16           ` Boaz Harrosh
  0 siblings, 0 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-06  7:16 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Toshi Kani, Jens Axboe, linux-nvdimm, the arch/x86 maintainers,
	Linux Kernel Mailing List, linux-fsdevel, Christoph Hellwig

On 04/05/2015 11:06 PM, Yinghai Lu wrote:
> On Sun, Apr 5, 2015 at 2:18 AM, Boaz Harrosh <boaz@plexistor.com> wrote:
<>
>> Hi Yinghai, Toshi
>>
>> In my old patches I did not have these updates as well, and everything
>> was very much usable, for a long time.
>>
>> However. I actually liked these changes in Christoph's patches and
>> thought they should stay, here is why.
>>
>> Today I will be sending patches to make pmem be supported with
>> page-struct as an optional alternative to the use of ioremap.
>> This is for advanced users that wants to RDMA direct_IO and so
>> on directly out of pmem.
> 
> but it is not related.  Should just remove those lines.
> 
> And even his original changes about memblock is not needed.
> 

Never mind. Has I said it hit us once in the passed but do to
a bug. I do not mind you can remove the max_pfn thing I can do
without it.

Thanks
Boaz

> | You did not modify memblock_x86_fill() to treat
> | E820_PRAM as E820_RAM, so memblock will not have any
> | entry for E820_PRAM, so you do not need to call memblock_reserve
> | there.
> |
> | And the same time, init_memory_mapping() will call
> | init_range_memory_mapping/for_each_mem_pfn_range() to
> | set kernel mapping for memory range in memblock only.
> | So here calling init_memory_mapping will not do anything.
> | then just drop calling to that init_memory_mapping.
> | --- so will not kernel mapping pmem, is that what you intended to have?
> |
> | After those two changes, You do not need this reserve_pmem at all.
> | Just drop it.
> 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-05  7:44           ` Yinghai Lu
@ 2015-04-06  7:27             ` Ingo Molnar
  0 siblings, 0 replies; 50+ messages in thread
From: Ingo Molnar @ 2015-04-06  7:27 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Toshi Kani, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe


* Yinghai Lu <yinghai@kernel.org> wrote:

> On Sat, Apr 4, 2015 at 2:40 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * Toshi Kani <toshi.kani@hp.com> wrote:
> >
> >> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> >> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> >> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> >> >
> >> > should revert those end_of_ram change as attached.
> >>
> >> I confirmed that the pmem driver works with the change, and last_pfn is
> >> updated as expected.
> >
> > Could someone please send the fix with a changelog, etc?
> >
> 
> Why just fold those change into that commit.

Because we try to avoid doing rebases of otherwise tested trees, and 
because fixes will outline the thinking behind the code as well.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-03 17:12     ` Yinghai Lu
  2015-04-03 20:54       ` Toshi Kani
  2015-04-05  9:18       ` Boaz Harrosh
@ 2015-04-06 15:55       ` Christoph Hellwig
  2 siblings, 0 replies; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-06 15:55 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Toshi Kani, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Fri, Apr 03, 2015 at 10:12:39AM -0700, Yinghai Lu wrote:
> > Should we also delete this code, accounting E820_PRAM as ram, along with
> > the deletion of reserve_pmem() in this version?
> 
> should revert those end_of_ram change as attached.

This works fine for me:

Tested-by: Christoph Hellwig <hch@lst.de>


> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)

But I'd prefer not to re-add the argument here, it only obsfucates the
code.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-04  9:40         ` Ingo Molnar
  2015-04-05  7:44           ` Yinghai Lu
@ 2015-04-06 17:29           ` Toshi Kani
  2015-04-06 18:26             ` Yinghai Lu
  1 sibling, 1 reply; 50+ messages in thread
From: Toshi Kani @ 2015-04-06 17:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:
> * Toshi Kani <toshi.kani@hp.com> wrote:
> 
> > On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> > > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> > > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> > > >   :
> > > >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> > > >>  /*
> > > >>   * Find the highest page frame number we have available
> > > >>   */
> > > >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > > >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> > > >>  {
> > > >>       int i;
> > > >>       unsigned long last_pfn = 0;
> > > >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > > >>               unsigned long start_pfn;
> > > >>               unsigned long end_pfn;
> > > >>
> > > >> -             if (ei->type != type)
> > > >> +             /*
> > > >> +              * Persistent memory is accounted as ram for purposes of
> > > >> +              * establishing max_pfn and mem_map.
> > > >> +              */
> > > >> +             if (ei->type != E820_RAM && ei->type != E820_PRAM)
> > > >>                       continue;
> > > >
> > > > Should we also delete this code, accounting E820_PRAM as ram, along with
> > > > the deletion of reserve_pmem() in this version?
> > > 
> > > should revert those end_of_ram change as attached.
> > 
> > I confirmed that the pmem driver works with the change, and last_pfn is
> > updated as expected.
> 
> Could someone please send the fix with a changelog, etc?

OK, I will send a patch for the fix, with suggested update from
Christoph of not to re-add 'type' argument to e820_end_pfn().

Yinghai, I will add your sign-off to the patch.  Let me know if you have
any concern.

Thanks,
-Toshi


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-06 18:26             ` Yinghai Lu
@ 2015-04-06 18:23               ` Toshi Kani
  0 siblings, 0 replies; 50+ messages in thread
From: Toshi Kani @ 2015-04-06 18:23 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Mon, 2015-04-06 at 11:26 -0700, Yinghai Lu wrote:
> On Mon, Apr 6, 2015 at 10:29 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> > On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:
> 
> > OK, I will send a patch for the fix, with suggested update from
> > Christoph of not to re-add 'type' argument to e820_end_pfn().
> >
> > Yinghai, I will add your sign-off to the patch.  Let me know if you have
> > any concern.
> 
> I think we should restore all to original.
> 
> e820_end_pfn(unsigned long limit_pfn, unsigned type)
> e820_end_of_ram_pfn
> e820_end_of_low_ram_pfn
> 
> otherwise it will cause confusing, because last two really have ram_pfn,
> and the first one does not has ram in the function name.

Good point.  I will revert back to the original.  

Thanks,
-Toshi



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH 1/2] x86: add support for the non-standard protected e820 type
  2015-04-06 17:29           ` Toshi Kani
@ 2015-04-06 18:26             ` Yinghai Lu
  2015-04-06 18:23               ` Toshi Kani
  0 siblings, 1 reply; 50+ messages in thread
From: Yinghai Lu @ 2015-04-06 18:26 UTC (permalink / raw)
  To: Toshi Kani
  Cc: Ingo Molnar, Christoph Hellwig, linux-nvdimm, linux-fsdevel,
	Linux Kernel Mailing List, the arch/x86 maintainers, Jens Axboe

On Mon, Apr 6, 2015 at 10:29 AM, Toshi Kani <toshi.kani@hp.com> wrote:
> On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:

> OK, I will send a patch for the fix, with suggested update from
> Christoph of not to re-add 'type' argument to e820_end_pfn().
>
> Yinghai, I will add your sign-off to the patch.  Let me know if you have
> any concern.

I think we should restore all to original.

e820_end_pfn(unsigned long limit_pfn, unsigned type)
e820_end_of_ram_pfn
e820_end_of_low_ram_pfn

otherwise it will cause confusing, because last two really have ram_pfn,
and the first one does not has ram in the function name.

Thanks

Yinghai

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-05  8:50           ` Boaz Harrosh
@ 2015-04-07 15:19             ` Christoph Hellwig
  2015-04-07 15:34               ` Boaz Harrosh
  0 siblings, 1 reply; 50+ messages in thread
From: Christoph Hellwig @ 2015-04-07 15:19 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, Dan Williams, linux-nvdimm, linux-fsdevel,
	linux-kernel, X86 ML, Jens Axboe, Ingo Molnar

On Sun, Apr 05, 2015 at 11:50:06AM +0300, Boaz Harrosh wrote:
> [  +0.000537] pmem: init 2 devices => 0
> 
> So I have all the information. And I know the driver was actually loaded
> successfully on the expected two regions.

The second number will always be 0, so no point in printing it.
Also device can be hotplugged at runtime, e.g. your magic PCIe device,
so iff you really want to print anything ->probe is the place for it.

But I still don't think we need it, once booted you can trivially look
up the information in sysfs.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Linux-nvdimm] [PATCH] pmem: Add prints at module load and unload
  2015-04-07 15:19             ` Christoph Hellwig
@ 2015-04-07 15:34               ` Boaz Harrosh
  0 siblings, 0 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-07 15:34 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Dan Williams, linux-nvdimm, linux-fsdevel, linux-kernel, X86 ML,
	Jens Axboe, Ingo Molnar

On 04/07/2015 06:19 PM, Christoph Hellwig wrote:
> On Sun, Apr 05, 2015 at 11:50:06AM +0300, Boaz Harrosh wrote:
>> [  +0.000537] pmem: init 2 devices => 0
>>
>> So I have all the information. And I know the driver was actually loaded
>> successfully on the expected two regions.
> 
> The second number will always be 0, so no point in printing it.

Why it can be any error that was collected in the load process its
the error we are returning to the Kernel from pmem_init()
Any none zero will mean module will not load and modprobe will
return with an exit code.

(Errors from probe() will be returned here)

> Also device can be hotplugged at runtime, e.g. your magic PCIe device,
> so iff you really want to print anything ->probe is the place for it.
> 

Yes but for now it is only very much static. We could add it later
when that happens, no?

> But I still don't think we need it, once booted you can trivially look
> up the information in sysfs.
> 

Its not for the systems I can probe around in sysfs and or just ls /dev/pmem*

It is postmortem when all I have is the logs and say a stack trace.
For us in the lab it is very important, all we need is the single
line on load/unload. But at probe time is fine as well, just more
chatty as you were complaining.

I will post two versions of this right away, I did it and forgot
to post.

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-01  7:12 another pmem variant V3 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2015-04-02 15:31 ` [PATCH] pmem: Add prints at module load and unload Boaz Harrosh
@ 2015-04-07 15:46 ` Boaz Harrosh
  2015-04-07 15:47   ` [PATCH 1A] pmem: Add prints at pmem_probe/remove Boaz Harrosh
                     ` (2 more replies)
  3 siblings, 3 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-07 15:46 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler

Hi Christoph, Ingo

It is important in the lab for postmortem analysis to know if
pmem driver loaded and/or unloaded. And the return code from this
operation.

I submit two versions [A] more chatty and version [B]. Both give me
the info I need.

I like [B] because [A] prints more lines, and also the driver might not
load at the end and we would still not see it from [A]'s prints.

But it does not matter that much just take any one you guys like
better.

Here are the commit logs:
---
[PATCH 1A] pmem: Add prints at pmem_probe/remove

Add small prints at creation/remove of pmem devices.
So we can see in dmesg logs when users loaded/unloaded
the pmem driver and what devices were created.

The prints will look like this:
Printed by e820 on load:
 [  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
 [  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
 ...
Printed by modprobe pmem:
 [  +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
 [  +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
 ...
Printed by modprobe -r pmem:
 [ +16.299145] pmem pmem.1.auto: remove
 [  +0.011155] pmem pmem.0.auto: remove

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>

---
[PATCH 1B] pmem: Add prints at module load/unload

When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.

The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.

After this patch the important info at dmesg will be:

Printed by the e820 code on load:
[  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)

...
Printed by pmem modprobe:
[  +0.000537] pmem: init 2 devices => 0

...
Printed by pmem modprobe -r:
[  +0.000537] pmem: exit

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---

Thanks
Boaz



^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH 1A] pmem: Add prints at pmem_probe/remove
  2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
@ 2015-04-07 15:47   ` Boaz Harrosh
  2015-04-07 15:47   ` [PATCH 1B] pmem: Add prints at module load and unload Boaz Harrosh
  2015-04-13  9:05   ` [PATCH A+B] " Greg KH
  2 siblings, 0 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-07 15:47 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler


Add small prints at creation/remove of pmem devices.
So we can see in dmesg logs when users loaded/unloaded
the pmem driver and what devices were created.

The prints will look like this:
Printed by e820 on load:
 [  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
 [  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
 ...
Printed by modprobe pmem:
 [  +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
 [  +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
 ...
Printed by modprobe -r pmem:
 [ +16.299145] pmem pmem.1.auto: remove
 [  +0.011155] pmem pmem.0.auto: remove

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---
 drivers/block/pmem.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..36017f1 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -216,6 +216,8 @@ static int pmem_probe(struct platform_device *pdev)
 		return PTR_ERR(pmem);
 
 	platform_set_drvdata(pdev, pmem);
+	dev_info(&pdev->dev, "probe [%pa:0x%zx]\n",
+		 &pmem->phys_addr, pmem->size);
 
 	return 0;
 }
@@ -224,6 +226,7 @@ static int pmem_remove(struct platform_device *pdev)
 {
 	struct pmem_device *pmem = platform_get_drvdata(pdev);
 
+	dev_info(&pdev->dev, "remove\n");
 	pmem_free(pmem);
 	return 0;
 }
-- 
1.9.3



^ permalink raw reply related	[flat|nested] 50+ messages in thread

* [PATCH 1B] pmem: Add prints at module load and unload
  2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
  2015-04-07 15:47   ` [PATCH 1A] pmem: Add prints at pmem_probe/remove Boaz Harrosh
@ 2015-04-07 15:47   ` Boaz Harrosh
  2015-04-13  9:05   ` [PATCH A+B] " Greg KH
  2 siblings, 0 replies; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-07 15:47 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel, x86, axboe
  Cc: ross.zwisler


When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.

The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.

After this patch the important info at dmesg will be:

Printed by the e820 code on load:
[  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)

...
Printed by pmem modprobe:
[  +0.000537] pmem: init 2 devices => 0

...
Printed by pmem modprobe -r:
[  +0.000537] pmem: exit

Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---
 drivers/block/pmem.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..44d3f33 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -248,6 +248,9 @@ static int __init pmem_init(void)
 	error = platform_driver_register(&pmem_driver);
 	if (error)
 		unregister_blkdev(pmem_major, "pmem");
+
+	pr_info("pmem: init %d devices => %d\n",
+		atomic_read(&pmem_index), error);
 	return error;
 }
 module_init(pmem_init);
@@ -256,6 +259,7 @@ static void pmem_exit(void)
 {
 	platform_driver_unregister(&pmem_driver);
 	unregister_blkdev(pmem_major, "pmem");
+	pr_info("pmem: exit\n");
 }
 module_exit(pmem_exit);
 
-- 
1.9.3



^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
  2015-04-07 15:47   ` [PATCH 1A] pmem: Add prints at pmem_probe/remove Boaz Harrosh
  2015-04-07 15:47   ` [PATCH 1B] pmem: Add prints at module load and unload Boaz Harrosh
@ 2015-04-13  9:05   ` Greg KH
  2015-04-13 12:05     ` Boaz Harrosh
  2 siblings, 1 reply; 50+ messages in thread
From: Greg KH @ 2015-04-13  9:05 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
> Hi Christoph, Ingo
> 
> It is important in the lab for postmortem analysis to know if
> pmem driver loaded and/or unloaded. And the return code from this
> operation.
> 
> I submit two versions [A] more chatty and version [B]. Both give me
> the info I need.
> 
> I like [B] because [A] prints more lines, and also the driver might not
> load at the end and we would still not see it from [A]'s prints.
> 
> But it does not matter that much just take any one you guys like
> better.
> 
> Here are the commit logs:
> ---
> [PATCH 1A] pmem: Add prints at pmem_probe/remove
> 
> Add small prints at creation/remove of pmem devices.
> So we can see in dmesg logs when users loaded/unloaded
> the pmem driver and what devices were created.
> 
> The prints will look like this:
> Printed by e820 on load:
>  [  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
>  [  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
>  ...
> Printed by modprobe pmem:
>  [  +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
>  [  +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
>  ...
> Printed by modprobe -r pmem:
>  [ +16.299145] pmem pmem.1.auto: remove
>  [  +0.011155] pmem pmem.0.auto: remove
> 
> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>

Don't polute the kernel logs with "chatty" things like this, just
trigger off of the block device uevent if you really want to know if the
block device is still around or not.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-13  9:05   ` [PATCH A+B] " Greg KH
@ 2015-04-13 12:05     ` Boaz Harrosh
  2015-04-13 12:36       ` Greg KH
  0 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-13 12:05 UTC (permalink / raw)
  To: Greg KH, Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On 04/13/2015 12:05 PM, Greg KH wrote:
> On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
>> Hi Christoph, Ingo
>>
>> It is important in the lab for postmortem analysis to know if
>> pmem driver loaded and/or unloaded. And the return code from this
>> operation.
>>
>> I submit two versions [A] more chatty and version [B]. Both give me
>> the info I need.
>>
>> I like [B] because [A] prints more lines, and also the driver might not
>> load at the end and we would still not see it from [A]'s prints.
>>
>> But it does not matter that much just take any one you guys like
>> better.
>>
>> Here are the commit logs:
>> ---
>> [PATCH 1A] pmem: Add prints at pmem_probe/remove
>>
>> Add small prints at creation/remove of pmem devices.
>> So we can see in dmesg logs when users loaded/unloaded
>> the pmem driver and what devices were created.
>>
>> The prints will look like this:
>> Printed by e820 on load:
>>  [  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
>>  [  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
>>  ...
>> Printed by modprobe pmem:
>>  [  +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
>>  [  +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
>>  ...
>> Printed by modprobe -r pmem:
>>  [ +16.299145] pmem pmem.1.auto: remove
>>  [  +0.011155] pmem pmem.0.auto: remove
>>
>> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
> 
> Don't polute the kernel logs with "chatty" things like this, 

Why do you say this is chatty. With [B] This is a single line of print
on modprobe. With [A] it is a print per device (Is why I like [B])
Compare to all the other block-devices in the system, say scsi, that print
bunch of info for each device, this is very very minimalistic.

> just
> trigger off of the block device uevent if you really want to know if the
> block device is still around or not.
> 

Again I do not need this for run time. At run time I have two tons of ways
to check and see. BTW a uevent is already triggered for insertion as part
of regular block core operation.

I need this at dmesg for when analyzing users logs, say when a crash happens.
I need to see what/when drivers were loaded/unloaded. It is common practice
in dmseg for block devices to leave foot prints.

Sigh, do you not believe that this single line in dmseg makes my life much
easier?

> thanks,
> greg k-h

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-13 12:05     ` Boaz Harrosh
@ 2015-04-13 12:36       ` Greg KH
  2015-04-13 13:20         ` Boaz Harrosh
  0 siblings, 1 reply; 50+ messages in thread
From: Greg KH @ 2015-04-13 12:36 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On Mon, Apr 13, 2015 at 03:05:27PM +0300, Boaz Harrosh wrote:
> On 04/13/2015 12:05 PM, Greg KH wrote:
> > On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
> >> Hi Christoph, Ingo
> >>
> >> It is important in the lab for postmortem analysis to know if
> >> pmem driver loaded and/or unloaded. And the return code from this
> >> operation.
> >>
> >> I submit two versions [A] more chatty and version [B]. Both give me
> >> the info I need.
> >>
> >> I like [B] because [A] prints more lines, and also the driver might not
> >> load at the end and we would still not see it from [A]'s prints.
> >>
> >> But it does not matter that much just take any one you guys like
> >> better.
> >>
> >> Here are the commit logs:
> >> ---
> >> [PATCH 1A] pmem: Add prints at pmem_probe/remove
> >>
> >> Add small prints at creation/remove of pmem devices.
> >> So we can see in dmesg logs when users loaded/unloaded
> >> the pmem driver and what devices were created.
> >>
> >> The prints will look like this:
> >> Printed by e820 on load:
> >>  [  +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
> >>  [  +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
> >>  ...
> >> Printed by modprobe pmem:
> >>  [  +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
> >>  [  +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
> >>  ...
> >> Printed by modprobe -r pmem:
> >>  [ +16.299145] pmem pmem.1.auto: remove
> >>  [  +0.011155] pmem pmem.0.auto: remove
> >>
> >> Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
> > 
> > Don't polute the kernel logs with "chatty" things like this, 
> 
> Why do you say this is chatty. With [B] This is a single line of print
> on modprobe. With [A] it is a print per device (Is why I like [B])
> Compare to all the other block-devices in the system, say scsi, that print
> bunch of info for each device, this is very very minimalistic.
> 
> > just
> > trigger off of the block device uevent if you really want to know if the
> > block device is still around or not.
> > 
> 
> Again I do not need this for run time. At run time I have two tons of ways
> to check and see. BTW a uevent is already triggered for insertion as part
> of regular block core operation.
> 
> I need this at dmesg for when analyzing users logs, say when a crash happens.
> I need to see what/when drivers were loaded/unloaded. It is common practice
> in dmseg for block devices to leave foot prints.
> 
> Sigh, do you not believe that this single line in dmseg makes my life much
> easier?

it might, as grepping kernel logs is a "lazy" way of doing things.  We
are working hard to make it so that each and every device found in the
system does _not_ print out kernel log messages, as they really are
pretty pointless and useless overall for the 99.99% of the time.

Anyway, I'm not the maintainer here of this driver, so I don't have the
final say, but really, if you are relying on kernel log messages for
specific things to happen in your system, you are doing it wrong as they
can change and disappear in any future kernel release, they are NOT an
api.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-13 12:36       ` Greg KH
@ 2015-04-13 13:20         ` Boaz Harrosh
  2015-04-13 13:36           ` Greg KH
  0 siblings, 1 reply; 50+ messages in thread
From: Boaz Harrosh @ 2015-04-13 13:20 UTC (permalink / raw)
  To: Greg KH
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On 04/13/2015 03:36 PM, Greg KH wrote:
> if you are relying on kernel log messages for
> specific things to happen in your system, you are doing it wrong as they
> can change and disappear in any future kernel release, they are NOT an
> api.
> 

Again. I am not doing anything with these messages. Yes messages are not
API. I do not need them for a live and running system, at all.

I need it for a dead system. for a long dead system. A system that has failed
3 days ago and all I have is the system/application logs and the Kernel logs.

I guess I'm antic and you guys have other means for these things, I wish to
learn then. So far all we are asking to keep are the logs, Is there something
else I need to store from a *past* running system?

Until now this gave me a very good place to start my investigation. With
empty log files, how do I then proceed?

> thanks,
> greg k-h

It looks like everyone agrees with you though. I wish I knew something you
guys know, I do not see how I can analyze failing systems without logs.

[I promise this is my last email on the subject]

Thanks
Boaz


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH A+B] pmem: Add prints at module load and unload
  2015-04-13 13:20         ` Boaz Harrosh
@ 2015-04-13 13:36           ` Greg KH
  0 siblings, 0 replies; 50+ messages in thread
From: Greg KH @ 2015-04-13 13:36 UTC (permalink / raw)
  To: Boaz Harrosh
  Cc: Christoph Hellwig, linux-nvdimm, linux-fsdevel, linux-kernel,
	x86, axboe, ross.zwisler

On Mon, Apr 13, 2015 at 04:20:37PM +0300, Boaz Harrosh wrote:
> On 04/13/2015 03:36 PM, Greg KH wrote:
> > if you are relying on kernel log messages for
> > specific things to happen in your system, you are doing it wrong as they
> > can change and disappear in any future kernel release, they are NOT an
> > api.
> > 
> 
> Again. I am not doing anything with these messages. Yes messages are not
> API. I do not need them for a live and running system, at all.
> 
> I need it for a dead system. for a long dead system. A system that has failed
> 3 days ago and all I have is the system/application logs and the Kernel logs.
> 
> I guess I'm antic and you guys have other means for these things, I wish to
> learn then. So far all we are asking to keep are the logs, Is there something
> else I need to store from a *past* running system?
> 
> Until now this gave me a very good place to start my investigation. With
> empty log files, how do I then proceed?

What would this simple "the device was removed" log message help you out
with on a dead and old system?

Same goes for the "device found!" message, how does that help anyone
out?  If they do help in some manner with debugging, then make them
debugging messages, and enable them on your systems when doing testing.
But for general purpose systems, if they don't do anything except be
noisy, might as well turn them off, right?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-02 19:51         ` Andy Lutomirski
@ 2015-04-16 22:31           ` Andy Lutomirski
  2015-04-17  0:55             ` Elliott, Robert (Server Storage)
  0 siblings, 1 reply; 50+ messages in thread
From: Andy Lutomirski @ 2015-04-16 22:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin, Jens Axboe,
	linux-kernel, Thomas Gleixner, Borislav Petkov, Andrew Morton,
	Linus Torvalds, Christoph Hellwig, Ross Zwisler, Matthew Wilcox,
	keith.busch, linux-tip-commits

On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <mingo@kernel.org> wrote:
>>
>> * Andy Lutomirski <luto@amacapital.net> wrote:
>>
>>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>>> <tipbot@zytor.com> wrote:
>>> > Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>>> > Gitweb:     http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>>> > Author:     Christoph Hellwig <hch@lst.de>
>>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>>> > Committer:  Ingo Molnar <mingo@kernel.org>
>>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>>> >
>>> > x86/mm: Add support for the non-standard protected e820 type
>>> >
>>> > Various recent BIOSes support NVDIMMs or ADR using a
>>> > non-standard e820 memory type, and Intel supplied reference
>>> > Linux code using this type to various vendors.
>>> >
>>> > Wire this e820 table type up to export platform devices for the
>>> > pmem driver so that we can use it in Linux.
>>>
>>> This scares me a bit.  Do we know that the upcoming ACPI 6.0
>>> enumeration mechanism *won't* use e820 type 12? [...]
>>
>> So I know nothing about it, but I'd be surprised if e820 was touched
>> at all, as e820 isn't really well suited to enumerate more complex
>> resources, and it appears pmem wants to grow into complex directions?
>
> I hope so, but I have no idea what the ACPI committee's schemes are.
>
> We could require pmem.enable_legacy_e820=Y to load the driver for now
> if we're concerned about it.
>

ACPI 6.0 is out:

http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf

AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
(EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
e820 type 7.

Type 12 is still "OEM defined".  See table 15-312.  Maybe I'm reading
this wrong.

*However*, ACPI 6.0 unsurprisingly also has a real enumeration
mechanism for NVDIMMs and such, and those should take precedence.

So this driver could plausibly be safe even on ACPI 6.0 systems.
Someone from one of the relevant vendors should probably confirm that.
I'm still a bit nervous, though.

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-16 22:31           ` Andy Lutomirski
@ 2015-04-17  0:55             ` Elliott, Robert (Server Storage)
  2015-04-17  0:59               ` Andy Lutomirski
  0 siblings, 1 reply; 50+ messages in thread
From: Elliott, Robert (Server Storage) @ 2015-04-17  0:55 UTC (permalink / raw)
  To: Andy Lutomirski, Ingo Molnar
  Cc: axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin, Jens Axboe,
	linux-kernel, Thomas Gleixner, Borislav Petkov, Andrew Morton,
	Linus Torvalds, Christoph Hellwig, Ross Zwisler, Matthew Wilcox,
	keith.busch, linux-tip-commits

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 3532 bytes --]



> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Andy Lutomirski
> Sent: Thursday, April 16, 2015 5:31 PM
> To: Ingo Molnar
> Subject: Re: [tip:x86/pmem] x86/mm: Add support for the non-standard
> protected e820 type
> 
> On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <luto@amacapital.net>
> wrote:
> > On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <mingo@kernel.org> wrote:
> >>
> >> * Andy Lutomirski <luto@amacapital.net> wrote:
> >>
> >>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
> >>> <tipbot@zytor.com> wrote:
> >>> > Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> >>> > Gitweb:
> http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> >>> > Author:     Christoph Hellwig <hch@lst.de>
> >>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> >>> > Committer:  Ingo Molnar <mingo@kernel.org>
> >>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
> >>> >
> >>> > x86/mm: Add support for the non-standard protected e820 type
> >>> >
> >>> > Various recent BIOSes support NVDIMMs or ADR using a
> >>> > non-standard e820 memory type, and Intel supplied reference
> >>> > Linux code using this type to various vendors.
> >>> >
> >>> > Wire this e820 table type up to export platform devices for the
> >>> > pmem driver so that we can use it in Linux.
> >>>
> >>> This scares me a bit.  Do we know that the upcoming ACPI 6.0
> >>> enumeration mechanism *won't* use e820 type 12? [...]
> >>
> >> So I know nothing about it, but I'd be surprised if e820 was touched
> >> at all, as e820 isn't really well suited to enumerate more complex
> >> resources, and it appears pmem wants to grow into complex directions?
> >
> > I hope so, but I have no idea what the ACPI committee's schemes are.
> >
> > We could require pmem.enable_legacy_e820=Y to load the driver for now
> > if we're concerned about it.
> >
> 
> ACPI 6.0 is out:
> 
> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
> 
> AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
> (EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
> e820 type 7.
> 
> Type 12 is still "OEM defined".  See table 15-312.  Maybe I'm reading
> this wrong.
> 
> *However*, ACPI 6.0 unsurprisingly also has a real enumeration
> mechanism for NVDIMMs and such, and those should take precedence.
> 
> So this driver could plausibly be safe even on ACPI 6.0 systems.
> Someone from one of the relevant vendors should probably confirm that.
> I'm still a bit nervous, though.

That value was set aside on behalf of this pre-standard usage, to
keep future ACPI revisions from standardizing it for anything else.
The kernel should be just as safe in continuing to recognize that
value as it is now.

New legacy BIOS systems should follow ACPI 6.0 going forward and
report type 7 in their E820 tables, along with meeting the other
ACPI 6.0 requirements.

UEFI systems don't provide an E820 table; that's an artificial
creation by the bootloader (e.g., grub2) or the kernel with its
add_efi_memmmap parameter. These systems will just report the
EFI memory type 14. There was no pre-standard EFI type
identified that needed to be blocked out.

Any software creating a fake E820 table (grub2, etc.) should 
map EFI type 14 to E820 type 7.

---
Robert Elliott, HP Server Storage
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type
  2015-04-17  0:55             ` Elliott, Robert (Server Storage)
@ 2015-04-17  0:59               ` Andy Lutomirski
  0 siblings, 0 replies; 50+ messages in thread
From: Andy Lutomirski @ 2015-04-17  0:59 UTC (permalink / raw)
  To: Elliott, Robert (Server Storage)
  Cc: Ingo Molnar, axboe, Boaz Harrosh, Dan Williams, H. Peter Anvin,
	Jens Axboe, linux-kernel, Thomas Gleixner, Borislav Petkov,
	Andrew Morton, Linus Torvalds, Christoph Hellwig, Ross Zwisler,
	Matthew Wilcox, keith.busch, linux-tip-commits

On Thu, Apr 16, 2015 at 5:55 PM, Elliott, Robert (Server Storage)
<Elliott@hp.com> wrote:
>
>
>> -----Original Message-----
>> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
>> owner@vger.kernel.org] On Behalf Of Andy Lutomirski
>> Sent: Thursday, April 16, 2015 5:31 PM
>> To: Ingo Molnar
>> Subject: Re: [tip:x86/pmem] x86/mm: Add support for the non-standard
>> protected e820 type
>>
>> On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <luto@amacapital.net>
>> wrote:
>> > On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <mingo@kernel.org> wrote:
>> >>
>> >> * Andy Lutomirski <luto@amacapital.net> wrote:
>> >>
>> >>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>> >>> <tipbot@zytor.com> wrote:
>> >>> > Commit-ID:  ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> >>> > Gitweb:
>> http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> >>> > Author:     Christoph Hellwig <hch@lst.de>
>> >>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>> >>> > Committer:  Ingo Molnar <mingo@kernel.org>
>> >>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>> >>> >
>> >>> > x86/mm: Add support for the non-standard protected e820 type
>> >>> >
>> >>> > Various recent BIOSes support NVDIMMs or ADR using a
>> >>> > non-standard e820 memory type, and Intel supplied reference
>> >>> > Linux code using this type to various vendors.
>> >>> >
>> >>> > Wire this e820 table type up to export platform devices for the
>> >>> > pmem driver so that we can use it in Linux.
>> >>>
>> >>> This scares me a bit.  Do we know that the upcoming ACPI 6.0
>> >>> enumeration mechanism *won't* use e820 type 12? [...]
>> >>
>> >> So I know nothing about it, but I'd be surprised if e820 was touched
>> >> at all, as e820 isn't really well suited to enumerate more complex
>> >> resources, and it appears pmem wants to grow into complex directions?
>> >
>> > I hope so, but I have no idea what the ACPI committee's schemes are.
>> >
>> > We could require pmem.enable_legacy_e820=Y to load the driver for now
>> > if we're concerned about it.
>> >
>>
>> ACPI 6.0 is out:
>>
>> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
>>
>> AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
>> (EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
>> e820 type 7.
>>
>> Type 12 is still "OEM defined".  See table 15-312.  Maybe I'm reading
>> this wrong.
>>
>> *However*, ACPI 6.0 unsurprisingly also has a real enumeration
>> mechanism for NVDIMMs and such, and those should take precedence.
>>
>> So this driver could plausibly be safe even on ACPI 6.0 systems.
>> Someone from one of the relevant vendors should probably confirm that.
>> I'm still a bit nervous, though.
>
> That value was set aside on behalf of this pre-standard usage, to
> keep future ACPI revisions from standardizing it for anything else.
> The kernel should be just as safe in continuing to recognize that
> value as it is now.
>
> New legacy BIOS systems should follow ACPI 6.0 going forward and
> report type 7 in their E820 tables, along with meeting the other
> ACPI 6.0 requirements.
>
> UEFI systems don't provide an E820 table; that's an artificial
> creation by the bootloader (e.g., grub2) or the kernel with its
> add_efi_memmmap parameter. These systems will just report the
> EFI memory type 14. There was no pre-standard EFI type
> identified that needed to be blocked out.
>
> Any software creating a fake E820 table (grub2, etc.) should
> map EFI type 14 to E820 type 7.

Great!

Off-topic: do you know what an NVDIMM control block is?  Are we really
supposed to access NVDIMM registers using MMIO?  If so, that must have
been a beast to get right in the memory controller.  Do you know if
any vendor has published docs for this stuff?

(I'm trying to figure out whether we're supposed to use SMBUS cycles
for any purpose so I can either fix my driver or relegate it to
legacy-only status.)

--Andy

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2015-04-17  1:00 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-01  7:12 another pmem variant V3 Christoph Hellwig
2015-04-01  7:12 ` [PATCH 1/2] x86: add support for the non-standard protected e820 type Christoph Hellwig
2015-04-01 14:25   ` [PATCH] SQUASHME: Fixes to e820 handling of pmem Boaz Harrosh
2015-04-02  9:30     ` Christoph Hellwig
2015-04-02  9:37       ` Ingo Molnar
2015-04-02  9:40         ` Christoph Hellwig
2015-04-02 11:18         ` Christoph Hellwig
2015-04-02 11:20       ` Boaz Harrosh
2015-04-02 12:31   ` [tip:x86/pmem] x86/mm: Add support for the non-standard protected e820 type tip-bot for Christoph Hellwig
2015-04-02 19:08     ` Andy Lutomirski
2015-04-02 19:13       ` Ingo Molnar
2015-04-02 19:51         ` Andy Lutomirski
2015-04-16 22:31           ` Andy Lutomirski
2015-04-17  0:55             ` Elliott, Robert (Server Storage)
2015-04-17  0:59               ` Andy Lutomirski
2015-04-02 20:28     ` Yinghai Lu
2015-04-02 20:23   ` [PATCH 1/2] x86: add " Yinghai Lu
2015-04-03 16:14   ` [Linux-nvdimm] " Toshi Kani
2015-04-03 17:12     ` Yinghai Lu
2015-04-03 20:54       ` Toshi Kani
2015-04-04  9:40         ` Ingo Molnar
2015-04-05  7:44           ` Yinghai Lu
2015-04-06  7:27             ` Ingo Molnar
2015-04-06 17:29           ` Toshi Kani
2015-04-06 18:26             ` Yinghai Lu
2015-04-06 18:23               ` Toshi Kani
2015-04-05  9:18       ` Boaz Harrosh
2015-04-05 20:06         ` Yinghai Lu
2015-04-06  7:16           ` Boaz Harrosh
2015-04-06 15:55       ` Christoph Hellwig
2015-04-01  7:12 ` [PATCH 2/2] pmem: add a driver for persistent memory Christoph Hellwig
2015-04-01 15:18   ` Boaz Harrosh
2015-04-02  9:32     ` Christoph Hellwig
2015-04-02 12:31   ` [tip:x86/pmem] drivers/block/pmem: Add " tip-bot for Ross Zwisler
2015-04-02 15:31 ` [PATCH] pmem: Add prints at module load and unload Boaz Harrosh
2015-04-02 15:39   ` [Linux-nvdimm] " Dan Williams
2015-04-02 15:47     ` Boaz Harrosh
2015-04-02 16:01       ` Dan Williams
2015-04-02 16:44         ` Christoph Hellwig
2015-04-05  8:50           ` Boaz Harrosh
2015-04-07 15:19             ` Christoph Hellwig
2015-04-07 15:34               ` Boaz Harrosh
2015-04-07 15:46 ` [PATCH A+B] " Boaz Harrosh
2015-04-07 15:47   ` [PATCH 1A] pmem: Add prints at pmem_probe/remove Boaz Harrosh
2015-04-07 15:47   ` [PATCH 1B] pmem: Add prints at module load and unload Boaz Harrosh
2015-04-13  9:05   ` [PATCH A+B] " Greg KH
2015-04-13 12:05     ` Boaz Harrosh
2015-04-13 12:36       ` Greg KH
2015-04-13 13:20         ` Boaz Harrosh
2015-04-13 13:36           ` Greg KH

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.