All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] Remove memory from nodes for memtrace.
@ 2017-02-22 21:39 Rashmica Gupta
  2017-02-23  3:38 ` Stewart Smith
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Rashmica Gupta @ 2017-02-22 21:39 UTC (permalink / raw)
  To: linuxppc-dev, npiggin, bsingharora, dllehr, mpe; +Cc: Rashmica Gupta

 Some powerpc hardware features may want to gain access to a
 chunk of undisturbed real memory.  This update provides a means to unplug
 said memory from the kernel with a set of debugfs calls.  By writing an integer
 containing the size of memory to be unplugged into
 /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
 memory from the end of each available chip's memory space. In addition, the
 means to read out the contents of the unplugged memory is also provided by
 reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.

Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
---
Written by Douglas Lehr <dllehr@us.ibm.com>.
Have tested and seems to work as I would expect. Only change I have made from
the original is to check that the value being written to the debugfs file is
not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
kernel attempts to access data at 0xfffffffe0.  

Thoughts about doing this with hot unplug or other changes?

 arch/powerpc/mm/hash_native_64.c          |  39 +++-
 arch/powerpc/platforms/powernv/Makefile   |   1 +
 arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
 3 files changed, 321 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/memtrace.c

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index cc33260..44cc6ce 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -3,7 +3,7 @@
  *
  * SMP scalability work:
  *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
 	while (1) {
 		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
 			break;
-		while(test_bit(HPTE_LOCK_BIT, word))
+		while (test_bit(HPTE_LOCK_BIT, word))
 			cpu_relax();
 	}
 }
@@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	}
 
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
+		if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
 			/* retry with lock held */
 			native_lock_hpte(hptep);
-			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
+			if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
 				break;
 			native_unlock_hpte(hptep);
 		}
@@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	tlbie(vpn, psize, psize, ssize, 0);
 }
 
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+	unsigned long vpn;
+	unsigned long vsid;
+	long slot;
+	struct hash_pte *hptep;
+
+	vsid = get_kernel_vsid(ea, ssize);
+	vpn = hpt_vpn(ea, vsid, ssize);
+
+	slot = native_hpte_find(vpn, psize, ssize);
+	if (slot == -1)
+		return -ENOENT;
+
+	hptep = htab_address + slot;
+
+	/* Invalidate the hpte */
+	hptep->v = 0;
+
+	/* Invalidate the TLB */
+	tlbie(vpn, psize, psize, ssize, 0);
+	return 0;
+}
+
+
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 				   int bpsize, int apsize, int ssize, int local)
 {
@@ -722,6 +752,7 @@ void __init hpte_init_native(void)
 	mmu_hash_ops.hpte_invalidate	= native_hpte_invalidate;
 	mmu_hash_ops.hpte_updatepp	= native_hpte_updatepp;
 	mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+	mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
 	mmu_hash_ops.hpte_insert	= native_hpte_insert;
 	mmu_hash_ops.hpte_remove	= native_hpte_remove;
 	mmu_hash_ops.hpte_clear_all	= native_hpte_clear;
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b5d98cb..2026661 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
+obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
 obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
new file mode 100644
index 0000000..fdba3ee
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -0,0 +1,285 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#define pr_fmt(fmt) "powernv-memtrace: " fmt
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <asm/machdep.h>
+
+struct memtrace_entry {
+	void *mem;
+	u64 start;
+	u64 size;
+	u32 nid;
+	struct dentry *dir;
+	char name[16];
+};
+
+static struct memtrace_entry *memtrace_array;
+static unsigned int memtrace_array_nr;
+
+static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	struct memtrace_entry *ent = filp->private_data;
+
+	return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
+}
+
+static bool valid_memtrace_range(struct memtrace_entry *dev,
+				 unsigned long start, unsigned long size)
+{
+	if ((dev->start <= start) &&
+	    ((start + size) <= (dev->start + dev->size)))
+		return true;
+
+	return false;
+}
+
+static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct memtrace_entry *dev = filp->private_data;
+
+	if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	if (io_remap_pfn_range(vma, vma->vm_start,
+			       vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
+			       size, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static const struct file_operations memtrace_fops = {
+	.llseek = default_llseek,
+	.read	= memtrace_read,
+	.mmap	= memtrace_mmap,
+	.open	= simple_open,
+};
+
+static void flush_memory_region(u64 base, u64 size)
+{
+	unsigned long line_size = ppc64_caches.dline_size;
+	u64 end = base + size;
+	u64 addr;
+
+	base = round_down(base, line_size);
+	end = round_up(end, line_size);
+
+	for (addr = base; addr < end; addr += line_size)
+		asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
+}
+
+static int check_memblock_online(struct memory_block *mem, void *arg)
+{
+	if (mem->state != MEM_ONLINE)
+		return -1;
+
+	return 0;
+}
+
+static int change_memblock_state(struct memory_block *mem, void *arg)
+{
+	unsigned long state = (unsigned long)arg;
+
+	mem->state = state;
+	return 0;
+}
+
+static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+{
+	u64 end_pfn = start_pfn + nr_pages - 1;
+
+	if (walk_memory_range(start_pfn, end_pfn, NULL,
+	    check_memblock_online))
+		return false;
+
+	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
+			  change_memblock_state);
+
+	if (offline_pages(start_pfn, nr_pages)) {
+		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
+				  change_memblock_state);
+		return false;
+	}
+
+	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
+			  change_memblock_state);
+
+	/* RCU grace period? */
+	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
+
+	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+
+	return true;
+}
+
+static u64 memtrace_alloc_node(u32 nid, u64 size)
+{
+	u64 start_pfn, end_pfn, nr_pages;
+	u64 base_pfn;
+
+	if (!NODE_DATA(nid) || !node_spanned_pages(nid))
+		return 0;
+
+	start_pfn = node_start_pfn(nid);
+	end_pfn = node_end_pfn(nid);
+	nr_pages = size >> PAGE_SHIFT;
+
+	/* Trace memory needs to be aligned to the size */
+	start_pfn = round_up(start_pfn, nr_pages);
+
+	for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
+		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true)
+			return base_pfn << PAGE_SHIFT;
+	}
+
+	return 0;
+}
+
+static int memtrace_init_regions_runtime(u64 size)
+{
+	u64 m;
+	u32 nid;
+
+	memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
+				num_online_nodes(), GFP_KERNEL);
+	if (!memtrace_array) {
+		pr_err("Failed to allocate memtrace_array\n");
+		return -EINVAL;
+	}
+
+	for_each_online_node(nid) {
+		m = memtrace_alloc_node(nid, size);
+		/*
+		 * A node might not have any local memory, so warn but
+		 * continue on.
+		 */
+		if (!m) {
+			pr_err("Failed to allocate trace memory on node %d\n",
+				 nid);
+		} else {
+			pr_info("Allocated trace memory on node %d at "
+				"0x%016llx\n", nid, m);
+
+			memtrace_array[memtrace_array_nr].start = m;
+			memtrace_array[memtrace_array_nr].size = size;
+			memtrace_array[memtrace_array_nr].nid = nid;
+			memtrace_array_nr++;
+		}
+	}
+
+	return 0;
+}
+
+static struct dentry *memtrace_debugfs_dir;
+
+static int memtrace_init_debugfs(void)
+{
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < memtrace_array_nr; i++) {
+		struct dentry *dir;
+		struct memtrace_entry *ent = &memtrace_array[i];
+
+		ent->mem = ioremap(ent->start, ent->size);
+		/* Warn but continue on */
+		if (!ent->mem) {
+			pr_err("Failed to map trace memory at 0x%llx\n",
+				 ent->start);
+			ret = -1;
+			continue;
+		}
+
+		snprintf(ent->name, 16, "%08x", ent->nid);
+		dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
+		if (!dir)
+			return -1;
+
+		ent->dir = dir;
+		debugfs_create_file("trace", S_IRUSR, dir, ent, &memtrace_fops);
+		debugfs_create_x64("start", S_IRUSR, dir, &ent->start);
+		debugfs_create_x64("size", S_IRUSR, dir, &ent->size);
+		debugfs_create_u32("node", S_IRUSR, dir, &ent->nid);
+	}
+
+	return ret;
+}
+
+static u64 memtrace_size;
+
+static int memtrace_enable_set(void *data, u64 val)
+{
+	if (memtrace_size)
+		return -EINVAL;
+
+	if (!val)
+		return -EINVAL;
+
+	/* Make sure size is aligned to a memory block */
+	if (val & (memory_block_size_bytes()-1))
+		return -EINVAL;
+
+	if (memtrace_init_regions_runtime(val))
+		return -EINVAL;
+
+	if (memtrace_init_debugfs())
+		return -EINVAL;
+
+	memtrace_size = val;
+
+	return 0;
+}
+
+static int memtrace_enable_get(void *data, u64 *val)
+{
+	*val = memtrace_size;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, memtrace_enable_set, "0x%016llx\n");
+
+static int memtrace_init(void)
+{
+	memtrace_debugfs_dir = debugfs_create_dir("memtrace", powerpc_debugfs_root);
+	if (!memtrace_debugfs_dir)
+		return -1;
+
+	debugfs_create_file("enable", S_IRUSR|S_IWUSR, memtrace_debugfs_dir,
+			    NULL, &memtrace_init_fops);
+
+	return 0;
+}
+machine_device_initcall(powernv, memtrace_init);
+
+
+
+/* XXX FIXME DEBUG CRAP */
+machine_device_initcall(pseries, memtrace_init);
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-22 21:39 [RFC] Remove memory from nodes for memtrace Rashmica Gupta
@ 2017-02-23  3:38 ` Stewart Smith
  2017-02-23  3:56 ` Nicholas Piggin
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Stewart Smith @ 2017-02-23  3:38 UTC (permalink / raw)
  To: Rashmica Gupta, linuxppc-dev, npiggin, bsingharora, dllehr, mpe
  Cc: Rashmica Gupta

Rashmica Gupta <rashmica.g@gmail.com> writes:
> +}
> +machine_device_initcall(powernv, memtrace_init);
> +
> +
> +
> +/* XXX FIXME DEBUG CRAP */
> +machine_device_initcall(pseries, memtrace_init);

Should the fixme be there?

-- 
Stewart Smith
OPAL Architect, IBM.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-22 21:39 [RFC] Remove memory from nodes for memtrace Rashmica Gupta
  2017-02-23  3:38 ` Stewart Smith
@ 2017-02-23  3:56 ` Nicholas Piggin
  2017-02-23  4:01   ` Andrew Donnellan
                     ` (2 more replies)
  2017-02-24 23:47 ` Balbir Singh
  2017-02-26 23:54 ` Oliver O'Halloran
  3 siblings, 3 replies; 10+ messages in thread
From: Nicholas Piggin @ 2017-02-23  3:56 UTC (permalink / raw)
  To: Rashmica Gupta; +Cc: linuxppc-dev, bsingharora, dllehr, mpe

On Thu, 23 Feb 2017 08:39:10 +1100
Rashmica Gupta <rashmica.g@gmail.com> wrote:

>  Some powerpc hardware features may want to gain access to a
>  chunk of undisturbed real memory.  This update provides a means to unplug
>  said memory from the kernel with a set of sysfs calls.  By writing an integer
>  containing  the size of memory to be unplugged into
>  /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
>  memory from the end of each available chip's memory space. In addition, the
>  means to read out the contents of the unplugged memory is also provided by
>  reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.
> 
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>

Thanks for picking this up. A few suggestions.

> ---
> Written by Douglas Lehr <dllehr@us.ibm.com>.

You could mention original author in the changelog. I thought Anton
wrote some of it too?


>  arch/powerpc/mm/hash_native_64.c          |  39 +++-
>  arch/powerpc/platforms/powernv/Makefile   |   1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
>  3 files changed, 321 insertions(+), 4 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
> 
> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
>   *
>   * SMP scalability work:
>   *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
> - * 
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
>  	while (1) {
>  		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
>  			break;
> -		while(test_bit(HPTE_LOCK_BIT, word))
> +		while (test_bit(HPTE_LOCK_BIT, word))
>  			cpu_relax();
>  	}
>  }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
>  	}
>  
>  	for (i = 0; i < HPTES_PER_GROUP; i++) {
> -		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> +		if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>  			/* retry with lock held */
>  			native_lock_hpte(hptep);
> -			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> +			if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
>  				break;
>  			native_unlock_hpte(hptep);

Suggest dropping the whitespace cleanups.


>  		}
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
>  	tlbie(vpn, psize, psize, ssize, 0);
>  }
>  
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> +	unsigned long vpn;
> +	unsigned long vsid;
> +	long slot;
> +	struct hash_pte *hptep;
> +
> +	vsid = get_kernel_vsid(ea, ssize);
> +	vpn = hpt_vpn(ea, vsid, ssize);
> +
> +	slot = native_hpte_find(vpn, psize, ssize);
> +	if (slot == -1)
> +		return -ENOENT;
> +
> +	hptep = htab_address + slot;
> +
> +	/* Invalidate the hpte */
> +	hptep->v = 0;
> +
> +	/* Invalidate the TLB */
> +	tlbie(vpn, psize, psize, ssize, 0);
> +	return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>  				   int bpsize, int apsize, int ssize, int local)
>  {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
>  	mmu_hash_ops.hpte_invalidate	= native_hpte_invalidate;
>  	mmu_hash_ops.hpte_updatepp	= native_hpte_updatepp;
>  	mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +	mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;

I would submit this as a separate patch. Consider if it should be made
dependent on any CONFIG options (e.g., MEMORY_HOTREMOVE), and Aneesh had
been looking at this too so he could help review it.

>  	mmu_hash_ops.hpte_insert	= native_hpte_insert;
>  	mmu_hash_ops.hpte_remove	= native_hpte_remove;
>  	mmu_hash_ops.hpte_clear_all	= native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
>  obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
> +obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
>  obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o

I would make a new Kconfig option for it, for the time being.


> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> +	if (mem->state != MEM_ONLINE)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> +	unsigned long state = (unsigned long)arg;
> +
> +	mem->state = state;
> +	return 0;
> +}
> +
> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +{
> +	u64 end_pfn = start_pfn + nr_pages - 1;
> +
> +	if (walk_memory_range(start_pfn, end_pfn, NULL,
> +	    check_memblock_online))
> +		return false;
> +
> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> +			  change_memblock_state);
> +
> +	if (offline_pages(start_pfn, nr_pages)) {
> +		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> +				  change_memblock_state);
> +		return false;
> +	}
> +
> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> +			  change_memblock_state);
> +
> +	/* RCU grace period? */
> +	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
> +
> +	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
> +
> +	return true;
> +}

This is the tricky part. Memory hotplug APIs don't seem well suited for
what we're trying to do... Anyway, do a bit of grepping around for
definitions of some of these calls, and how other code uses them. For
example, remove_memory comment says caller must hold 
lock_device_hotplug() first, so we're missing that at least. I think
that's also needed over the memblock state changes.

We don't need an RCU grace period there AFAICT, because offline_pages
should have us covered.


I haven't looked at memory hotplug enough to know why we're open-coding
the memblock stuff there. It would be nice to just be able to call
memblock_remove() like the pseries hotplug code does.

I *think* it is because hot remove mostly comes from when we know about
an online region of memory and we want to take it down. In this case we
also are trying to discover if those addresses are covered by online
memory. Still, I wonder if there are better memblock APIs to do this
with? Balbir may have a better idea of that?


> +/* XXX FIXME DEBUG CRAP */
> +machine_device_initcall(pseries, memtrace_init);

Can remove that, I think.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-23  3:56 ` Nicholas Piggin
@ 2017-02-23  4:01   ` Andrew Donnellan
  2017-02-23  4:23     ` RashmicaGupta
  2017-02-23  4:29   ` RashmicaGupta
  2017-02-23  6:19   ` RashmicaGupta
  2 siblings, 1 reply; 10+ messages in thread
From: Andrew Donnellan @ 2017-02-23  4:01 UTC (permalink / raw)
  To: Nicholas Piggin, Rashmica Gupta; +Cc: dllehr, linuxppc-dev

On 23/02/17 14:56, Nicholas Piggin wrote:
>> Written by Douglas Lehr <dllehr@us.ibm.com>.
>
> You could mention original author in the changelog. I thought Anton
> wrote some of it too?

You'll want to include the Signed-off-by of all authors involved.


-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-23  4:01   ` Andrew Donnellan
@ 2017-02-23  4:23     ` RashmicaGupta
  2017-02-23  5:01       ` Andrew Donnellan
  0 siblings, 1 reply; 10+ messages in thread
From: RashmicaGupta @ 2017-02-23  4:23 UTC (permalink / raw)
  To: Andrew Donnellan, Nicholas Piggin, Rashmica Gupta; +Cc: dllehr, linuxppc-dev



On 23/02/17 15:01, Andrew Donnellan wrote:
> On 23/02/17 14:56, Nicholas Piggin wrote:
>>> Written by Douglas Lehr <dllehr@us.ibm.com>.
>>
>> You could mention original author in the changelog. I thought Anton
>> wrote some of it too?
>
> You'll want to include the Signed-off-by of all authors involved

Even without them seeing any changes first?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-23  3:56 ` Nicholas Piggin
  2017-02-23  4:01   ` Andrew Donnellan
@ 2017-02-23  4:29   ` RashmicaGupta
  2017-02-23  6:19   ` RashmicaGupta
  2 siblings, 0 replies; 10+ messages in thread
From: RashmicaGupta @ 2017-02-23  4:29 UTC (permalink / raw)
  To: Nicholas Piggin, Rashmica Gupta; +Cc: linuxppc-dev, bsingharora, dllehr, mpe



On 23/02/17 14:56, Nicholas Piggin wrote:
> On Thu, 23 Feb 2017 08:39:10 +1100
> Rashmica Gupta <rashmica.g@gmail.com> wrote:
>
>>   Some powerpc hardware features may want to gain access to a
>>   chunk of undisturbed real memory.  This update provides a means to unplug
>>   said memory from the kernel with a set of sysfs calls.  By writing an integer
>>   containing  the size of memory to be unplugged into
>>   /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
>>   memory from the end of each available chip's memory space. In addition, the
>>   means to read out the contents of the unplugged memory is also provided by
>>   reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.
>>
>> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> Thanks for picking this up. A few suggestions.
>
>> ---
>> Written by Douglas Lehr <dllehr@us.ibm.com>.
> You could mention original author in the changelog. I thought Anton
> wrote some of it too?
Yup he did. Will do that.
>
>>   arch/powerpc/mm/hash_native_64.c          |  39 +++-
>>   arch/powerpc/platforms/powernv/Makefile   |   1 +
>>   arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
>>   3 files changed, 321 insertions(+), 4 deletions(-)
>>   create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>>
>> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
>> index cc33260..44cc6ce 100644
>> --- a/arch/powerpc/mm/hash_native_64.c
>> +++ b/arch/powerpc/mm/hash_native_64.c
>> @@ -3,7 +3,7 @@
>>    *
>>    * SMP scalability work:
>>    *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
>> - *
>> + *
>>    * This program is free software; you can redistribute it and/or
>>    * modify it under the terms of the GNU General Public License
>>    * as published by the Free Software Foundation; either version
>> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
>>   	while (1) {
>>   		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
>>   			break;
>> -		while(test_bit(HPTE_LOCK_BIT, word))
>> +		while (test_bit(HPTE_LOCK_BIT, word))
>>   			cpu_relax();
>>   	}
>>   }
>> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
>>   	}
>>   
>>   	for (i = 0; i < HPTES_PER_GROUP; i++) {
>> -		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>> +		if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>>   			/* retry with lock held */
>>   			native_lock_hpte(hptep);
>> -			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
>> +			if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
>>   				break;
>>   			native_unlock_hpte(hptep);
> Suggest dropping the whitespace cleanups.
>
>
>>   		}
>> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
>>   	tlbie(vpn, psize, psize, ssize, 0);
>>   }
>>   
>> +/*
>> + * Remove a bolted kernel entry. Memory hotplug uses this.
>> + *
>> + * No need to lock here because we should be the only user.
>> + */
>> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
>> +{
>> +	unsigned long vpn;
>> +	unsigned long vsid;
>> +	long slot;
>> +	struct hash_pte *hptep;
>> +
>> +	vsid = get_kernel_vsid(ea, ssize);
>> +	vpn = hpt_vpn(ea, vsid, ssize);
>> +
>> +	slot = native_hpte_find(vpn, psize, ssize);
>> +	if (slot == -1)
>> +		return -ENOENT;
>> +
>> +	hptep = htab_address + slot;
>> +
>> +	/* Invalidate the hpte */
>> +	hptep->v = 0;
>> +
>> +	/* Invalidate the TLB */
>> +	tlbie(vpn, psize, psize, ssize, 0);
>> +	return 0;
>> +}
>> +
>> +
>>   static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>>   				   int bpsize, int apsize, int ssize, int local)
>>   {
>> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
>>   	mmu_hash_ops.hpte_invalidate	= native_hpte_invalidate;
>>   	mmu_hash_ops.hpte_updatepp	= native_hpte_updatepp;
>>   	mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
>> +	mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
> I would submit this as a separate patch. Consider if it should be made
> dependent on any CONFIG options (e.g., MEMORY_HOTREMOVE), and Aneesh had
> been looking at this too so he could help review it.
>
>>   	mmu_hash_ops.hpte_insert	= native_hpte_insert;
>>   	mmu_hash_ops.hpte_remove	= native_hpte_remove;
>>   	mmu_hash_ops.hpte_clear_all	= native_hpte_clear;
>> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
>> index b5d98cb..2026661 100644
>> --- a/arch/powerpc/platforms/powernv/Makefile
>> +++ b/arch/powerpc/platforms/powernv/Makefile
>> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
>>   obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
>>   obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
>>   obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
>> +obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
>>   obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
> I would make a new Kconfig option for it, for the time being.
>
>
>> +static int check_memblock_online(struct memory_block *mem, void *arg)
>> +{
>> +	if (mem->state != MEM_ONLINE)
>> +		return -1;
>> +
>> +	return 0;
>> +}
>> +
>> +static int change_memblock_state(struct memory_block *mem, void *arg)
>> +{
>> +	unsigned long state = (unsigned long)arg;
>> +
>> +	mem->state = state;
>> +	return 0;
>> +}
>> +
>> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
>> +{
>> +	u64 end_pfn = start_pfn + nr_pages - 1;
>> +
>> +	if (walk_memory_range(start_pfn, end_pfn, NULL,
>> +	    check_memblock_online))
>> +		return false;
>> +
>> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
>> +			  change_memblock_state);
>> +
>> +	if (offline_pages(start_pfn, nr_pages)) {
>> +		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
>> +				  change_memblock_state);
>> +		return false;
>> +	}
>> +
>> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
>> +			  change_memblock_state);
>> +
>> +	/* RCU grace period? */
>> +	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
>> +
>> +	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
>> +
>> +	return true;
>> +}
> This is the tricky part. Memory hotplug APIs don't seem well suited for
> what we're trying to do... Anyway, do a bit of grepping around for
> definitions of some of these calls, and how other code uses them. For
> example, remove_memory comment says caller must hold
> lock_device_hotplug() first, so we're missing that at least. I think
> that's also needed over the memblock state changes.

Yup, realised I missed the lock_device_hotplug()
right after I sent the email...

>
> We don't need an RCU grace period there AFAICT, because offline_pages
> should have us covered.
>
>
> I haven't looked at memory hotplug enough to know why we're open-coding
> the memblock stuff there. It would be nice to just be able to call
> memblock_remove() like the pseries hotplug code does.
>
> I *think* it is because hot remove mostly comes from when we know about
> an online region of memory and we want to take it down. In this case we
> also are trying to discover if those addresses are covered by online
> memory. Still, I wonder if there are better memblock APIs to do this
> with? Balbir may have a better idea of that?
>

Gotcha. Will look into that and other ways to do this.

  

>> +/* XXX FIXME DEBUG CRAP */
>> +machine_device_initcall(pseries, memtrace_init);
> Can remove that, I think.
>
> Thanks,
> Nick

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-23  4:23     ` RashmicaGupta
@ 2017-02-23  5:01       ` Andrew Donnellan
  0 siblings, 0 replies; 10+ messages in thread
From: Andrew Donnellan @ 2017-02-23  5:01 UTC (permalink / raw)
  To: RashmicaGupta, Nicholas Piggin, Rashmica Gupta; +Cc: dllehr, linuxppc-dev

On 23/02/17 15:23, RashmicaGupta wrote:
>> You'll want to include the Signed-off-by of all authors involved
>
> Even without them seeing any changes first?

IIUC, you should either run the changes past them first, or add a note 
(e.g. "[rashmica: general cleanup, refactored foo]") with the changes 
between what they gave you and what you submitted between their SOB and 
your SOB.

Though generally this is a somewhat academic discussion :)

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-23  3:56 ` Nicholas Piggin
  2017-02-23  4:01   ` Andrew Donnellan
  2017-02-23  4:29   ` RashmicaGupta
@ 2017-02-23  6:19   ` RashmicaGupta
  2 siblings, 0 replies; 10+ messages in thread
From: RashmicaGupta @ 2017-02-23  6:19 UTC (permalink / raw)
  To: Nicholas Piggin, bsingharora; +Cc: linuxppc-dev, bsingharora, dllehr, mpe



On 23/02/17 14:56, Nicholas Piggin wrote:
>
>> +
>> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
>> +{
>> +	u64 end_pfn = start_pfn + nr_pages - 1;
>> +
>> +	if (walk_memory_range(start_pfn, end_pfn, NULL,
>> +	    check_memblock_online))
>> +		return false;
>> +
>> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
>> +			  change_memblock_state);
>> +
>> +	if (offline_pages(start_pfn, nr_pages)) {
>> +		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
>> +				  change_memblock_state);
>> +		return false;
>> +	}
>> +
>> +	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
>> +			  change_memblock_state);
>> +
>> +	/* RCU grace period? */
>> +	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
>> +
>> +	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
>> +
>> +	return true;
>> +}
> This is the tricky part. Memory hotplug APIs don't seem well suited for
> what we're trying to do... Anyway, do a bit of grepping around for
> definitions of some of these calls, and how other code uses them. For
> example, remove_memory comment says caller must hold
> lock_device_hotplug() first, so we're missing that at least. I think
> that's also needed over the memblock state changes.
>
> We don't need an RCU grace period there AFAICT, because offline_pages
> should have us covered.
>
>
> I haven't looked at memory hotplug enough to know why we're open-coding
> the memblock stuff there. It would be nice to just be able to call
> memblock_remove() like the pseries hotplug code does.
remove_memory() calls memblock_remove() after confirming that
the memory is offlined. That seems sensible to me.
>
> I *think* it is because hot remove mostly comes from when we know about
> an online region of memory and we want to take it down. In this case we
> also are trying to discover if those addresses are covered by online
> memory. Still, I wonder if there are better memblock APIs to do this
> with? Balbir may have a better idea of that?
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-22 21:39 [RFC] Remove memory from nodes for memtrace Rashmica Gupta
  2017-02-23  3:38 ` Stewart Smith
  2017-02-23  3:56 ` Nicholas Piggin
@ 2017-02-24 23:47 ` Balbir Singh
  2017-02-26 23:54 ` Oliver O'Halloran
  3 siblings, 0 replies; 10+ messages in thread
From: Balbir Singh @ 2017-02-24 23:47 UTC (permalink / raw)
  To: Rashmica Gupta
  Cc: open list:LINUX FOR POWERPC (32-BIT AND 64-BIT),
	Nicholas Piggin, dllehr, Michael Ellerman

On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta <rashmica.g@gmail.com> wrote:
>  Some powerpc hardware features may want to gain access to a
>  chunk of undisturbed real memory.  This update provides a means to unplug
>  said memory from the kernel with a set of sysfs calls.  By writing an integer
>  containing  the size of memory to be unplugged into
>  /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
>  memory from the end of each available chip's memory space.

Could we define this better - what is chip's memory space, the local node?

In addition, the
>  means to read out the contents of the unplugged memory is also provided by
>  reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.
>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> Written by Douglas Lehr <dllehr@us.ibm.com>.
> Have tested and seems to work as I would expect. Only change I have made from
> the original is to check that the value being written to the debugfs file is
> not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
> kernel attempts to access data at 0xfffffffe0.
>
> Thoughts about doing this with hot unplug or other changes?
>
>  arch/powerpc/mm/hash_native_64.c          |  39 +++-
>  arch/powerpc/platforms/powernv/Makefile   |   1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
>  3 files changed, 321 insertions(+), 4 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>
> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
>   *
>   * SMP scalability work:
>   *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
> - *
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
>         while (1) {
>                 if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
>                         break;
> -               while(test_bit(HPTE_LOCK_BIT, word))
> +               while (test_bit(HPTE_LOCK_BIT, word))

Can we segregate white space fixes from actual changes

>                         cpu_relax();
>         }
>  }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
>         }
>
>         for (i = 0; i < HPTES_PER_GROUP; i++) {
> -               if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> +               if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>                         /* retry with lock held */
>                         native_lock_hpte(hptep);
> -                       if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> +                       if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
>                                 break;
>                         native_unlock_hpte(hptep);
>                 }
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
>         tlbie(vpn, psize, psize, ssize, 0);
>  }
>
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> +       unsigned long vpn;
> +       unsigned long vsid;
> +       long slot;
> +       struct hash_pte *hptep;
> +
> +       vsid = get_kernel_vsid(ea, ssize);
> +       vpn = hpt_vpn(ea, vsid, ssize);
> +

Do we need checks to ensure that ea does not belong to _stext or _etext

> +       slot = native_hpte_find(vpn, psize, ssize);
> +       if (slot == -1)
> +               return -ENOENT;
> +

What does this mean, we have an invalid address?

> +       hptep = htab_address + slot;
> +
> +       /* Invalidate the hpte */
> +       hptep->v = 0;
> +
> +       /* Invalidate the TLB */
> +       tlbie(vpn, psize, psize, ssize, 0);
> +       return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>                                    int bpsize, int apsize, int ssize, int local)
>  {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
>         mmu_hash_ops.hpte_invalidate    = native_hpte_invalidate;
>         mmu_hash_ops.hpte_updatepp      = native_hpte_updatepp;
>         mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +       mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
>         mmu_hash_ops.hpte_insert        = native_hpte_insert;
>         mmu_hash_ops.hpte_remove        = native_hpte_remove;
>         mmu_hash_ops.hpte_clear_all     = native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)     += eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
>  obj-$(CONFIG_TRACEPOINTS)      += opal-tracepoints.o
> +obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
>  obj-$(CONFIG_OPAL_PRD) += opal-prd.o
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
> new file mode 100644
> index 0000000..fdba3ee
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -0,0 +1,285 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * Copyright (C) IBM Corporation, 2014
> + *
> + * Author: Anton Blanchard <anton@au.ibm.com>
> + */
> +
> +#define pr_fmt(fmt) "powernv-memtrace: " fmt
> +
> +#include <linux/bitops.h>
> +#include <linux/string.h>
> +#include <linux/memblock.h>
> +#include <linux/init.h>
> +#include <linux/moduleparam.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/slab.h>
> +#include <linux/memory.h>
> +#include <linux/memory_hotplug.h>
> +#include <asm/machdep.h>
> +
> +struct memtrace_entry {
> +       void *mem;
> +       u64 start;
> +       u64 size;
> +       u32 nid;
> +       struct dentry *dir;
> +       char name[16];
> +};
> +
> +static struct memtrace_entry *memtrace_array;
> +static unsigned int memtrace_array_nr;
> +
> +static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
> +                            size_t count, loff_t *ppos)
> +{
> +       struct memtrace_entry *ent = filp->private_data;
> +
> +       return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
> +}
> +
> +static bool valid_memtrace_range(struct memtrace_entry *dev,
> +                                unsigned long start, unsigned long size)
> +{
> +       if ((dev->start <= start) &&
> +           ((start + size) <= (dev->start + dev->size)))
> +               return true;
> +
> +       return false;
> +}
> +
> +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +       unsigned long size = vma->vm_end - vma->vm_start;
> +       struct memtrace_entry *dev = filp->private_data;
> +
> +       if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
> +               return -EINVAL;
> +
> +       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +       if (io_remap_pfn_range(vma, vma->vm_start,
> +                              vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
> +                              size, vma->vm_page_prot))
> +               return -EAGAIN;
> +
> +       return 0;
> +}
> +
> +static const struct file_operations memtrace_fops = {
> +       .llseek = default_llseek,
> +       .read   = memtrace_read,
> +       .mmap   = memtrace_mmap,
> +       .open   = simple_open,
> +};
> +
> +static void flush_memory_region(u64 base, u64 size)
> +{
> +       unsigned long line_size = ppc64_caches.dline_size;
> +       u64 end = base + size;
> +       u64 addr;
> +
> +       base = round_down(base, line_size);
> +       end = round_up(end, line_size);
> +
> +       for (addr = base; addr < end; addr += line_size)
> +               asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
> +}
> +
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> +       if (mem->state != MEM_ONLINE)
> +               return -1;
> +
> +       return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> +       unsigned long state = (unsigned long)arg;
> +
> +       mem->state = state;
> +       return 0;
> +}
> +
> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +{
> +       u64 end_pfn = start_pfn + nr_pages - 1;
> +
> +       if (walk_memory_range(start_pfn, end_pfn, NULL,
> +           check_memblock_online))
> +               return false;
> +
> +       walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> +                         change_memblock_state);
> +
> +       if (offline_pages(start_pfn, nr_pages)) {
> +               walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> +                                 change_memblock_state);
> +               return false;
> +       }
> +
> +       walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> +                         change_memblock_state);
> +
> +       /* RCU grace period? */
> +       flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
> +
> +       remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
> +
> +       return true;
> +}
> +
> +static u64 memtrace_alloc_node(u32 nid, u64 size)
> +{
> +       u64 start_pfn, end_pfn, nr_pages;
> +       u64 base_pfn;
> +

I wonder if it makes more sense to find a free region in the node
via the allocator and then offline it, but that can be racy as well
I suppose

> +       if (!NODE_DATA(nid) || !node_spanned_pages(nid))
> +               return 0;
> +
> +       start_pfn = node_start_pfn(nid);
> +       end_pfn = node_end_pfn(nid);
> +       nr_pages = size >> PAGE_SHIFT;
> +
> +       /* Trace memory needs to be aligned to the size */
> +       start_pfn = round_up(start_pfn, nr_pages);
> +
> +       for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
> +               if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true)
> +                       return base_pfn << PAGE_SHIFT;
> +       }
> +
> +       return 0;
> +}
> +
> +static int memtrace_init_regions_runtime(u64 size)
> +{
> +       u64 m;
> +       u32 nid;
> +
> +       memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
> +                               num_online_nodes(), GFP_KERNEL);
> +       if (!memtrace_array) {
> +               pr_err("Failed to allocate memtrace_array\n");
> +               return -EINVAL;
> +       }
> +
> +       for_each_online_node(nid) {

What makes this hotplug safe?

> +               m = memtrace_alloc_node(nid, size);
> +               /*
> +                * A node might not have any local memory, so warn but
> +                * continue on.
> +                */

Honestly, I think we need some more design around this. The patches work, but
I am not quite sure if there is a better way to release memory. I've
not looked at the memblock
APIs, but I am concerned about blowing holes in memory that might impact our
organization of zones, generally we do ZONE_DMA, so we should be good. What
happens if we end up off-lining all our CMA memory? I don't know if we
are guarded
against it.

Balbir Singh

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] Remove memory from nodes for memtrace.
  2017-02-22 21:39 [RFC] Remove memory from nodes for memtrace Rashmica Gupta
                   ` (2 preceding siblings ...)
  2017-02-24 23:47 ` Balbir Singh
@ 2017-02-26 23:54 ` Oliver O'Halloran
  3 siblings, 0 replies; 10+ messages in thread
From: Oliver O'Halloran @ 2017-02-26 23:54 UTC (permalink / raw)
  To: Rashmica Gupta
  Cc: linuxppc-dev, Nicholas Piggin, Balbir Singh, dllehr, Michael Ellerman

On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta <rashmica.g@gmail.com> wrote:
>  Some powerpc hardware features may want to gain access to a
>  chunk of undisturbed real memory.  This update provides a means to unplug
>  said memory from the kernel with a set of sysfs calls.  By writing an integer
>  containing  the size of memory to be unplugged into
>  /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
>  memory from the end of each available chip's memory space. In addition, the
>  means to read out the contents of the unplugged memory is also provided by
>  reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.
>
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> Written by Douglas Lehr <dllehr@us.ibm.com>.
> Have tested and seems to work as I would expect. Only change I have made from
> the original is to check that the value being written to the debugfs file is
> not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
> kernel attempts to access data at 0xfffffffe0.
>
> Thoughts about doing this with hot unplug or other changes?
>
>  arch/powerpc/mm/hash_native_64.c          |  39 +++-
>  arch/powerpc/platforms/powernv/Makefile   |   1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
>  3 files changed, 321 insertions(+), 4 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>
> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
>   *
>   * SMP scalability work:
>   *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
> - *
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
>         while (1) {
>                 if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
>                         break;
> -               while(test_bit(HPTE_LOCK_BIT, word))
> +               while (test_bit(HPTE_LOCK_BIT, word))
>                         cpu_relax();
>         }
>  }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
>         }
>
>         for (i = 0; i < HPTES_PER_GROUP; i++) {
> -               if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> +               if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>                         /* retry with lock held */
>                         native_lock_hpte(hptep);
> -                       if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> +                       if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
>                                 break;
>                         native_unlock_hpte(hptep);
>                 }
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
>         tlbie(vpn, psize, psize, ssize, 0);
>  }
>
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> +       unsigned long vpn;
> +       unsigned long vsid;
> +       long slot;
> +       struct hash_pte *hptep;
> +
> +       vsid = get_kernel_vsid(ea, ssize);
> +       vpn = hpt_vpn(ea, vsid, ssize);
> +
> +       slot = native_hpte_find(vpn, psize, ssize);
> +       if (slot == -1)
> +               return -ENOENT;
> +
> +       hptep = htab_address + slot;
> +
> +       /* Invalidate the hpte */
> +       hptep->v = 0;
> +
> +       /* Invalidate the TLB */
> +       tlbie(vpn, psize, psize, ssize, 0);
> +       return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>                                    int bpsize, int apsize, int ssize, int local)
>  {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
>         mmu_hash_ops.hpte_invalidate    = native_hpte_invalidate;
>         mmu_hash_ops.hpte_updatepp      = native_hpte_updatepp;
>         mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +       mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
>         mmu_hash_ops.hpte_insert        = native_hpte_insert;
>         mmu_hash_ops.hpte_remove        = native_hpte_remove;
>         mmu_hash_ops.hpte_clear_all     = native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)     += eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
>  obj-$(CONFIG_TRACEPOINTS)      += opal-tracepoints.o
> +obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
>  obj-$(CONFIG_OPAL_PRD) += opal-prd.o
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
> new file mode 100644
> index 0000000..fdba3ee
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -0,0 +1,285 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * Copyright (C) IBM Corporation, 2014
> + *
> + * Author: Anton Blanchard <anton@au.ibm.com>
> + */
> +
> +#define pr_fmt(fmt) "powernv-memtrace: " fmt
> +
> +#include <linux/bitops.h>
> +#include <linux/string.h>
> +#include <linux/memblock.h>
> +#include <linux/init.h>
> +#include <linux/moduleparam.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/slab.h>
> +#include <linux/memory.h>
> +#include <linux/memory_hotplug.h>
> +#include <asm/machdep.h>
> +
> +struct memtrace_entry {
> +       void *mem;
> +       u64 start;
> +       u64 size;
> +       u32 nid;
> +       struct dentry *dir;
> +       char name[16];
> +};
> +
> +static struct memtrace_entry *memtrace_array;
> +static unsigned int memtrace_array_nr;
> +
> +static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
> +                            size_t count, loff_t *ppos)
> +{
> +       struct memtrace_entry *ent = filp->private_data;
> +
> +       return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
> +}
> +
> +static bool valid_memtrace_range(struct memtrace_entry *dev,
> +                                unsigned long start, unsigned long size)
> +{
> +       if ((dev->start <= start) &&
> +           ((start + size) <= (dev->start + dev->size)))
> +               return true;
> +
> +       return false;
> +}
> +
> +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +       unsigned long size = vma->vm_end - vma->vm_start;
> +       struct memtrace_entry *dev = filp->private_data;
> +
> +       if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
> +               return -EINVAL;
> +
> +       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +       if (io_remap_pfn_range(vma, vma->vm_start,
> +                              vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
> +                              size, vma->vm_page_prot))
> +               return -EAGAIN;
> +
> +       return 0;
> +}
> +
> +static const struct file_operations memtrace_fops = {
> +       .llseek = default_llseek,
> +       .read   = memtrace_read,
> +       .mmap   = memtrace_mmap,
> +       .open   = simple_open,
> +};
> +

> +static void flush_memory_region(u64 base, u64 size)
> +{
> +       unsigned long line_size = ppc64_caches.dline_size;
> +       u64 end = base + size;
> +       u64 addr;
> +
> +       base = round_down(base, line_size);
> +       end = round_up(end, line_size);
> +
> +       for (addr = base; addr < end; addr += line_size)
> +               asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
> +}

I think this is the same as flush_inval_dcache_range()?

> +
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> +       if (mem->state != MEM_ONLINE)
> +               return -1;
> +
> +       return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> +       unsigned long state = (unsigned long)arg;
> +
> +       mem->state = state;
> +       return 0;
> +}
> +
> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +{
> +       u64 end_pfn = start_pfn + nr_pages - 1;
> +
> +       if (walk_memory_range(start_pfn, end_pfn, NULL,
> +           check_memblock_online))
> +               return false;
> +
> +       walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> +                         change_memblock_state);
> +
> +       if (offline_pages(start_pfn, nr_pages)) {
> +               walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> +                                 change_memblock_state);
> +               return false;
> +       }
> +
> +       walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> +                         change_memblock_state);
> +

> +       /* RCU grace period? */
> +       flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
> +
> +       remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);

This concerns me slightly. We're relying on flush_memory_region() to
ensure the whole region is out of the cache, but until it's been
unmapped the processor can still do speculative loads from this
region. Given the memory will be re-mapped as cache inhibited this is
opening a window for cache paradoxes. It might not be an issue, but if
you encounter random checkstops while testing we should look into
hardening this.

> +
> +       return true;
> +}
> +
> +static u64 memtrace_alloc_node(u32 nid, u64 size)
> +{
> +       u64 start_pfn, end_pfn, nr_pages;
> +       u64 base_pfn;
> +
> +       if (!NODE_DATA(nid) || !node_spanned_pages(nid))
> +               return 0;
> +
> +       start_pfn = node_start_pfn(nid);
> +       end_pfn = node_end_pfn(nid);
> +       nr_pages = size >> PAGE_SHIFT;
> +
> +       /* Trace memory needs to be aligned to the size */
> +       start_pfn = round_up(start_pfn, nr_pages);
> +
> +       for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
> +               if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true)
> +                       return base_pfn << PAGE_SHIFT;
> +       }
> +
> +       return 0;
> +}
> +
> +static int memtrace_init_regions_runtime(u64 size)
> +{
> +       u64 m;
> +       u32 nid;
> +
> +       memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
> +                               num_online_nodes(), GFP_KERNEL);
> +       if (!memtrace_array) {
> +               pr_err("Failed to allocate memtrace_array\n");
> +               return -EINVAL;
> +       }
> +
> +       for_each_online_node(nid) {
> +               m = memtrace_alloc_node(nid, size);
> +               /*
> +                * A node might not have any local memory, so warn but
> +                * continue on.
> +                */
> +               if (!m) {
> +                       pr_err("Failed to allocate trace memory on node %d\n",
> +                                nid);
> +               } else {
> +                       pr_info("Allocated trace memory on node %d at "
> +                               "0x%016llx\n", nid, m);
> +
> +                       memtrace_array[memtrace_array_nr].start = m;
> +                       memtrace_array[memtrace_array_nr].size = size;
> +                       memtrace_array[memtrace_array_nr].nid = nid;
> +                       memtrace_array_nr++;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static struct dentry *memtrace_debugfs_dir;
> +
> +static int memtrace_init_debugfs(void)
> +{
> +       int ret = 0;
> +       int i;
> +
> +       for (i = 0; i < memtrace_array_nr; i++) {
> +               struct dentry *dir;
> +               struct memtrace_entry *ent = &memtrace_array[i];
> +
> +               ent->mem = ioremap(ent->start, ent->size);
> +               /* Warn but continue on */
> +               if (!ent->mem) {
> +                       pr_err("Failed to map trace memory at 0x%llx\n",
> +                                ent->start);
> +                       ret = -1;
> +                       continue;
> +               }
> +
> +               snprintf(ent->name, 16, "%08x", ent->nid);
> +               dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
> +               if (!dir)
> +                       return -1;
> +
> +               ent->dir = dir;
> +               debugfs_create_file("trace", S_IRUSR, dir, ent, &memtrace_fops);
> +               debugfs_create_x64("start", S_IRUSR, dir, &ent->start);
> +               debugfs_create_x64("size", S_IRUSR, dir, &ent->size);
> +               debugfs_create_u32("node", S_IRUSR, dir, &ent->nid);
> +       }
> +
> +       return ret;
> +}
> +
> +static u64 memtrace_size;
> +
> +static int memtrace_enable_set(void *data, u64 val)
> +{
> +       if (memtrace_size)
> +               return -EINVAL;
> +
> +       if (!val)
> +               return -EINVAL;
> +

> +       /* Make sure size is aligned to a memory block */
> +       if (val & (memory_block_size_bytes()-1))
> +               return -EINVAL;
Printing a warning here with the expected alignment would be helpful.

> +
> +       if (memtrace_init_regions_runtime(val))
> +               return -EINVAL;
> +
> +       if (memtrace_init_debugfs())
> +               return -EINVAL;
> +
> +       memtrace_size = val;
> +
> +       return 0;
> +}

I know this RFC is mostly about how the offlining path should be
handled, but can you sketch out the re-onlining path too?

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-02-26 23:54 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-22 21:39 [RFC] Remove memory from nodes for memtrace Rashmica Gupta
2017-02-23  3:38 ` Stewart Smith
2017-02-23  3:56 ` Nicholas Piggin
2017-02-23  4:01   ` Andrew Donnellan
2017-02-23  4:23     ` RashmicaGupta
2017-02-23  5:01       ` Andrew Donnellan
2017-02-23  4:29   ` RashmicaGupta
2017-02-23  6:19   ` RashmicaGupta
2017-02-24 23:47 ` Balbir Singh
2017-02-26 23:54 ` Oliver O'Halloran

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.