* [PATCH v3] powerpc: properly reserve in bootmem the lmb reserved regions that cross NUMA nodes
@ 2008-10-09 20:18 ` Jon Tollefson
  0 siblings, 0 replies; 11+ messages in thread
From: Jon Tollefson @ 2008-10-09 20:18 UTC (permalink / raw)
  To: linuxppc-dev, Linux Memory Management List,
	Linux Kernel Mailing List, Adam Litke, Kumar Gala,
	Benjamin Herrenschmidt, Paul Mackerras

If multiple memory blocks reserved via lmb_reserve() are at contiguous
addresses but on different NUMA nodes, we lose track of which address
ranges to reserve in bootmem on which node.  I discovered this when I
recently got to try 16GB huge pages on a system with more than 2 nodes.

When scanning the device tree in early boot we call lmb_reserve() with
the addresses of the 16G pages that we find so that the memory doesn't
get used for something else.  For example, the addresses of the pages
could be 4000000000, 4400000000, 4800000000, 4C00000000, etc. - 8 pages,
one on each of eight nodes.  After all the pages have been reserved,
the lmb will look something like the following:

lmb_dump_all:
    memory.cnt            = 0x2
    memory.size           = 0x3e80000000
    memory.region[0x0].base       = 0x0
                      .size     = 0x1e80000000
    memory.region[0x1].base       = 0x4000000000
                      .size     = 0x2000000000
    reserved.cnt          = 0x5
    reserved.size         = 0x3e80000000
    reserved.region[0x0].base       = 0x0
                      .size     = 0x7b5000
    reserved.region[0x1].base       = 0x2a00000
                      .size     = 0x78c000
    reserved.region[0x2].base       = 0x328c000
                      .size     = 0x43000
    reserved.region[0x3].base       = 0xf4e8000
                      .size     = 0xb18000
    reserved.region[0x4].base       = 0x4000000000
                      .size     = 0x2000000000


The reserved.region[0x4] entry contains the 16G pages.  In
arch/powerpc/mm/numa.c: do_init_bootmem() we loop through each of the
node numbers looking for the reserved regions that belong to that
particular node.  The code cannot identify region 0x4 as being part of
each of the 8 nodes; it assumes that a reserved region lies on a
single node.
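
To see why, here is the old per-node filter in simplified form (taken
from the removed lines in the diff below):

	/* old check: skip regions whose endpoints are on other nodes */
	if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
	    early_pfn_to_nid((physbase + size - 1) >> PAGE_SHIFT) != nid)
		continue;

For a region spanning nodes 0 through 7, both endpoints resolve to
nodes 0 and 7, so for nid 1..6 the region is skipped entirely and
nothing is reserved on those nodes.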

This patch moves the reserved-region loop out of the loop that goes
over each node.  For each reserved region it looks up the active region
containing the start of the reservation.  If the reservation extends
past that active region, it trims the size to the active region and
then gets the next active region containing the remainder.
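
In simplified form the new walk looks like this (a sketch only; the
actual code is in the diff below):

	get_node_active_region(start_pfn, &node_ar);
	while (start_pfn < end_pfn) {
		/* trim to the current active region if we cross it */
		if (end_pfn > node_ar.end_pfn)
			size = (node_ar.end_pfn - start_pfn) << PAGE_SHIFT;
		reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
				     size, BOOTMEM_DEFAULT);
		if (end_pfn <= node_ar.end_pfn)
			break;		/* reservation fully covered */
		/* advance to the next active region holding the rest */
		start_pfn = node_ar.end_pfn;
		physbase = start_pfn << PAGE_SHIFT;
		get_node_active_region(start_pfn, &node_ar);
	}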

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
---

Changes:
	v2:
	-style changes as suggested by Adam Litke
	v3:
	-moved helper function to powerpc code since it is the only user at present
	-made end_pfn consistently exclusive
	-other minor code cleanups

Please consider for 2.6.28.

 numa.c |  108 ++++++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index d9a1813..72447f1 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -89,6 +89,46 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
 	return 0;
 }
 
+/*
+ * get_active_region_work_fn - A helper function for get_node_active_region
+ *	Returns datax set to the start_pfn and end_pfn if they contain
+ *	the initial value of datax->start_pfn between them
+ * @start_pfn: start page(inclusive) of region to check
+ * @end_pfn: end page(exclusive) of region to check
+ * @datax: comes in with ->start_pfn set to value to search for and
+ *	goes out with active range if it contains it
+ * Returns 1 if search value is in range else 0
+ */
+static int __init get_active_region_work_fn(unsigned long start_pfn,
+					unsigned long end_pfn, void *datax)
+{
+	struct node_active_region *data;
+	data = (struct node_active_region *)datax;
+
+	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
+		data->start_pfn = start_pfn;
+		data->end_pfn = end_pfn;
+		return 1;
+	}
+	return 0;
+
+}
+
+/*
+ * get_node_active_region - Return active region containing start_pfn
+ * @start_pfn: The page to return the region for.
+ * @node_ar: Returned set to the active region containing start_pfn
+ */
+static void __init get_node_active_region(unsigned long start_pfn,
+		       struct node_active_region *node_ar)
+{
+	int nid = early_pfn_to_nid(start_pfn);
+
+	node_ar->nid = nid;
+	node_ar->start_pfn = start_pfn;
+	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
+}
+
 static void __cpuinit map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
@@ -837,38 +877,50 @@ void __init do_init_bootmem(void)
 				  start_pfn, end_pfn);
 
 		free_bootmem_with_active_regions(nid, end_pfn);
+	}
 
-		/* Mark reserved regions on this node */
-		for (i = 0; i < lmb.reserved.cnt; i++) {
-			unsigned long physbase = lmb.reserved.region[i].base;
-			unsigned long size = lmb.reserved.region[i].size;
-			unsigned long start_paddr = start_pfn << PAGE_SHIFT;
-			unsigned long end_paddr = end_pfn << PAGE_SHIFT;
-
-			if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid &&
-			    early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid)
-				continue;
-
-			if (physbase < end_paddr &&
-			    (physbase+size) > start_paddr) {
-				/* overlaps */
-				if (physbase < start_paddr) {
-					size -= start_paddr - physbase;
-					physbase = start_paddr;
-				}
-
-				if (size > end_paddr - physbase)
-					size = end_paddr - physbase;
-
-				dbg("reserve_bootmem %lx %lx\n", physbase,
-				    size);
-				reserve_bootmem_node(NODE_DATA(nid), physbase,
-						     size, BOOTMEM_DEFAULT);
-			}
+	/* Mark reserved regions */
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long physbase = lmb.reserved.region[i].base;
+		unsigned long size = lmb.reserved.region[i].size;
+		unsigned long start_pfn = physbase >> PAGE_SHIFT;
+		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+		struct node_active_region node_ar;
+
+		get_node_active_region(start_pfn, &node_ar);
+		while (start_pfn < end_pfn) {
+			/*
+			 * if reserved region extends past active region
+			 * then trim size to active region
+			 */
+			if (end_pfn > node_ar.end_pfn)
+				size = (node_ar.end_pfn << PAGE_SHIFT)
+					- (start_pfn << PAGE_SHIFT);
+			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size,
+				node_ar.nid);
+			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
+						size, BOOTMEM_DEFAULT);
+			/*
+			 * if reserved region is contained in the active region
+			 * then done.
+			 */
+			if (end_pfn <= node_ar.end_pfn)
+				break;
+
+			/*
+			 * reserved region extends past the active region
+			 *   get next active region that contains this
+			 *   reserved region
+			 */
+			start_pfn = node_ar.end_pfn;
+			physbase = start_pfn << PAGE_SHIFT;
+			get_node_active_region(start_pfn, &node_ar);
 		}
 
-		sparse_memory_present_with_active_regions(nid);
 	}
+
+	for_each_online_node(nid)
+		sparse_memory_present_with_active_regions(nid);
 }
 
 void __init paging_init(void)






* Re: [PATCH v3] powerpc: properly reserve in bootmem the lmb reserved regions that cross NUMA nodes
  2008-10-09 20:18 ` Jon Tollefson
@ 2008-10-10  4:55   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2008-10-10  4:55 UTC (permalink / raw)
  To: Jon Tollefson
  Cc: linuxppc-dev, Linux Memory Management List,
	Linux Kernel Mailing List, Adam Litke, Kumar Gala,
	Paul Mackerras

On Thu, 2008-10-09 at 15:18 -0500, Jon Tollefson wrote:
> If there are multiple reserved memory blocks via lmb_reserve() that are
> contiguous addresses and on different NUMA nodes we are losing track of which 
> address ranges to reserve in bootmem on which node.  I discovered this 
> when I recently got to try 16GB huge pages on a system with more then 2 nodes.

I'm going to apply it; however, could you double-check something for
me? A cursory glance at the new version makes me wonder: what if the
first call to get_node_active_region() ends up with the work_fn never
hitting the if () case? I think in that case node_ar->end_pfn never
gets initialized, right? Can that happen in practice? I suspect it
can't, but better safe than sorry...

If there's indeed a potential problem, please send a fixup patch.

Cheers,
Ben.




* Re: [PATCH v3] powerpc: properly reserve in bootmem the lmb reserved regions that cross NUMA nodes
  2008-10-10  4:55   ` Benjamin Herrenschmidt
@ 2008-10-17  4:59     ` Jon Tollefson
  0 siblings, 0 replies; 11+ messages in thread
From: Jon Tollefson @ 2008-10-17  4:59 UTC (permalink / raw)
  To: benh
  Cc: linuxppc-dev, Linux Memory Management List,
	Linux Kernel Mailing List, Adam Litke, Kumar Gala,
	Paul Mackerras

Benjamin Herrenschmidt wrote:
> On Thu, 2008-10-09 at 15:18 -0500, Jon Tollefson wrote:
>   
>> If there are multiple reserved memory blocks via lmb_reserve() that are
>> contiguous addresses and on different NUMA nodes we are losing track of which 
>> address ranges to reserve in bootmem on which node.  I discovered this 
>> when I recently got to try 16GB huge pages on a system with more then 2 nodes.
>>     
>
> I'm going to apply it, however, could you double check something for
> me ? A cursory glance of the new version makes me wonder, what if the
> first call to get_node_active_region() ends up with the work_fn never
> hitting the if () case ? I think in that case, node_ar->end_pfn never
> gets initialized right ? Can that happen in practice ? I suspect that
> isn't the case but better safe than sorry...
>   
I have tested this on a few machines and it hasn't been a problem.  But
I don't see anything in lmb_reserve() that would prevent reserving a
block outside of valid memory.  So to be safe I have attached a patch
that checks for an empty active range.

I also noticed that, for a reservation that spans nodes, the size to
reserve on subsequent nodes wasn't taking into account the amount
already reserved on previous nodes, so the patch addresses that too.
If you would prefer this be a separate patch, let me know.

> If there's indeed a potential problem, please send a fixup patch.
>
> Cheers,
> Ben.
>   
Adjust amount to reserve based on previous nodes for reserves spanning
multiple nodes. Check if the node active range is empty before attempting
to pass the reserve to bootmem.  In practice the range shouldn't be empty,
but to be sure we check.
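
As a worked example of the size accounting (hypothetical numbers,
4K pages assumed): a 10G reservation starting on node 0, whose active
region covers only the first 8G:

	/* physbase = 0x4000000000, size = 0x280000000 (10G);
	 * node 0's active region ends at pfn 0x4200000 (0x4200000000) */
	reserve_size = (0x4200000UL << PAGE_SHIFT) - 0x4000000000UL;
					/* 8G reserved on node 0 */
	size = size - reserve_size;	/* 2G left for node 1 */
	/* without this adjustment the stale 8G size would be passed
	 * to reserve_bootmem_node() again for node 1's 2G portion */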

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
---

 
 arch/powerpc/mm/numa.c |   15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)


diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 6cf5c71..195bfcd 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -116,6 +116,7 @@ static int __init get_active_region_work_fn(unsigned long start_pfn,
 
 /*
  * get_node_active_region - Return active region containing start_pfn
+ * Active range returned is empty if none found.
  * @start_pfn: The page to return the region for.
  * @node_ar: Returned set to the active region containing start_pfn
  */
@@ -126,6 +127,7 @@ static void __init get_node_active_region(unsigned long start_pfn,
 
 	node_ar->nid = nid;
 	node_ar->start_pfn = start_pfn;
+	node_ar->end_pfn = start_pfn;
 	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
 }
 
@@ -933,18 +935,20 @@ void __init do_init_bootmem(void)
 		struct node_active_region node_ar;
 
 		get_node_active_region(start_pfn, &node_ar);
-		while (start_pfn < end_pfn) {
+		while (start_pfn < end_pfn &&
+			node_ar.start_pfn < node_ar.end_pfn) {
+			unsigned long reserve_size = size;
 			/*
 			 * if reserved region extends past active region
 			 * then trim size to active region
 			 */
 			if (end_pfn > node_ar.end_pfn)
-				size = (node_ar.end_pfn << PAGE_SHIFT)
+				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
 					- (start_pfn << PAGE_SHIFT);
-			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size,
-				node_ar.nid);
+			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
+				reserve_size, node_ar.nid);
 			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
-						size, BOOTMEM_DEFAULT);
+						reserve_size, BOOTMEM_DEFAULT);
 			/*
 			 * if reserved region is contained in the active region
 			 * then done.
@@ -959,6 +963,7 @@ void __init do_init_bootmem(void)
 			 */
 			start_pfn = node_ar.end_pfn;
 			physbase = start_pfn << PAGE_SHIFT;
+			size = size - reserve_size;
 			get_node_active_region(start_pfn, &node_ar);
 		}
 






* problem with numa reserve bootmem
  2008-10-09 20:18 ` Jon Tollefson
@ 2009-02-11  3:17 ` Geoff Levand
  2009-02-11  3:55   ` Michael Ellerman
  2009-02-12 22:36   ` [patch] powerpc: fix numa reserve bootmem page selection Geoff Levand
  0 siblings, 2 replies; 11+ messages in thread
From: Geoff Levand @ 2009-02-11  3:17 UTC (permalink / raw)
  To: Jon Tollefson, Benjamin Herrenschmidt; +Cc: Linuxppc-dev

Hi Jon,

Jon Tollefson wrote:
> This patch takes out the reserved region loop from inside
> the loop that goes over each node.  It looks up the active region containing
> the start of the reserved region.  If it extends past that active region then
> it adjusts the size and gets the next active region containing it.
> 
>  numa.c |  108 ++++++++++++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 80 insertions(+), 28 deletions(-)

I had some problems with this numa change (commit 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb)
missing an lmb reserved region.

There have been some changes to this code since this patch was committed,
but the general problem still exists.

With the PS3 platform, the boot wrapper program puts the device tree
above the boot wrapper's _end symbol.  So with this there is a small
reserved bootmem section for the DT of about 0x270 bytes
(reserved.region[0x1]):

lmb_dump_all:
    memory.cnt            = 0x1
    memory.size           = 0x8000000
    memory.region[0x0].base       = 0x0
                      .size     = 0x8000000
    reserved.cnt          = 0x2
    reserved.size         = 0x8000000
    reserved.region[0x0].base       = 0x0
                      .size     = 0xcc8000
    reserved.region[0x1].base       = 0xce0300
                      .size     = 0x270

> +	/* Mark reserved regions */
> +	for (i = 0; i < lmb.reserved.cnt; i++) {
> +		unsigned long physbase = lmb.reserved.region[i].base;
> +		unsigned long size = lmb.reserved.region[i].size;
> +		unsigned long start_pfn = physbase >> PAGE_SHIFT;
> +		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);

With reserved.region[0x1] start_pfn and end_pfn are equal (0xce0) here.

> +		struct node_active_region node_ar;
> +
> +		get_node_active_region(start_pfn, &node_ar);
> +		while (start_pfn < end_pfn) {

And this while (start_pfn < end_pfn) test fails,

> +			/*
> +			 * if reserved region extends past active region
> +			 * then trim size to active region
> +			 */
> +			if (end_pfn > node_ar.end_pfn)
> +				size = (node_ar.end_pfn << PAGE_SHIFT)
> +					- (start_pfn << PAGE_SHIFT);
> +			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size,
> +				node_ar.nid);
> +			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
> +						size, BOOTMEM_DEFAULT);

And so this reserve_bootmem_node() is never called for the small region.

I'm not sure if the problem is the calculation of the end_pfn, or if we
need to test for equality in the while: (start_pfn <= end_pfn).  Please
let me know what you think.  I'll look at it some more tomorrow.
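
The arithmetic of the failure is easy to see (assuming 4K pages, which
matches the 0xce0 pfn above):

	start_pfn = 0xce0300 >> 12;		/* = 0xce0 */
	end_pfn   = (0xce0300 + 0x270) >> 12;	/* = 0xce0570 >> 12
						 * = 0xce0 */
	/* start_pfn == end_pfn, so the loop body never runs and the
	 * 0x270-byte region is never passed to reserve_bootmem_node() */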

-Geoff


* Re: problem with numa reserve bootmem
  2009-02-11  3:17 ` problem with numa reserve bootmem Geoff Levand
@ 2009-02-11  3:55   ` Michael Ellerman
  2009-02-12 22:36   ` [patch] powerpc: fix numa reserve bootmem page selection Geoff Levand
  1 sibling, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2009-02-11  3:55 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Jon Tollefson, Linuxppc-dev


On Tue, 2009-02-10 at 19:17 -0800, Geoff Levand wrote:
> Hi Jon,
> 
> Jon Tollefson wrote:
> > This patch takes out the reserved region loop from inside
> > the loop that goes over each node.  It looks up the active region containing
> > the start of the reserved region.  If it extends past that active region then
> > it adjusts the size and gets the next active region containing it.
> > 
> >  numa.c |  108 ++++++++++++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 80 insertions(+), 28 deletions(-)
> 
> I had some problems with this numa change (commit 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb)
> missing an lmb reserved region.
> 
> There have been some changes to this code since this patch was committed,
> but the general problem still exists.
> 
> With the PS3 platform, the boot wrapper program puts the device tree
> above the boot wrapper's _end symbol.  So with this there is a small
> reserved bootmem section for the DT of about 0x270 bytes
> (reserved.region[0x1]):
> 
> lmb_dump_all:
>     memory.cnt            = 0x1
>     memory.size           = 0x8000000
>     memory.region[0x0].base       = 0x0
>                       .size     = 0x8000000
>     reserved.cnt          = 0x2
>     reserved.size         = 0x8000000
>     reserved.region[0x0].base       = 0x0
>                       .size     = 0xcc8000
>     reserved.region[0x1].base       = 0xce0300
>                       .size     = 0x270
> 
> > +	/* Mark reserved regions */
> > +	for (i = 0; i < lmb.reserved.cnt; i++) {
> > +		unsigned long physbase = lmb.reserved.region[i].base;
> > +		unsigned long size = lmb.reserved.region[i].size;
> > +		unsigned long start_pfn = physbase >> PAGE_SHIFT;
> > +		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
> 
> With reserved.region[0x1] start_pfn and end_pfn are equal (0xce0) here.
> 
> > +		struct node_active_region node_ar;
> > +
> > +		get_node_active_region(start_pfn, &node_ar);
> > +		while (start_pfn < end_pfn) {
> 
> And this while (start_pfn < end_pfn) test fails,
> 
> > +			/*
> > +			 * if reserved region extends past active region
> > +			 * then trim size to active region
> > +			 */
> > +			if (end_pfn > node_ar.end_pfn)
> > +				size = (node_ar.end_pfn << PAGE_SHIFT)
> > +					- (start_pfn << PAGE_SHIFT);
> > +			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size,
> > +				node_ar.nid);
> > +			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
> > +						size, BOOTMEM_DEFAULT);
> 
> And so this reserve_bootmem_node() is never called for the small region.
> 
> I'm not sure if the problem is the calculation of the end_pfn, or if we
> need to test for equality in the while: (start_pfn <= end_pfn).  Please
> let me know what you think.  I'll look at it some more tomorrow.

Dave, you had a patch for this I think?

cheers

-- 
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person



* [patch] powerpc: fix numa reserve bootmem page selection
  2009-02-11  3:17 ` problem with numa reserve bootmem Geoff Levand
  2009-02-11  3:55   ` Michael Ellerman
@ 2009-02-12 22:36   ` Geoff Levand
  1 sibling, 0 replies; 11+ messages in thread
From: Geoff Levand @ 2009-02-12 22:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linuxppc-dev, Jon Tollefson, Johannes Weiner, Dave Hansen

From: Dave Hansen <dave@linux.vnet.ibm.com>

Fix the powerpc NUMA reserve bootmem page selection logic.

commit 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb (powerpc: Reserve
in bootmem lmb reserved regions that cross NUMA nodes) changed
the logic for how the powerpc LMB reserved regions were converted
to bootmem reserved regions.  As the following discussion reports,
the new logic was not correct.


mark_reserved_regions_for_nid() goes through each LMB on the
system that specifies a reserved area.  It searches for
active regions that intersect with that LMB and are on the
specified node.  It attempts to bootmem-reserve only the area
where the active region and the reserved LMB intersect.  We
can not reserve things on other nodes as they may not have
bootmem structures allocated, yet.

We base the size of the bootmem reservation on two possible
things.  Normally, we just make the reservation start and
stop exactly at the start and end of the LMB.

However, the LMB reservations are not aware of NUMA nodes and
on occasion a single LMB may cross into several adjacent
active regions.  Those may even be on different NUMA nodes
and will require separate calls to the bootmem reserve
functions.  So, the bootmem reservation must be trimmed to
fit inside the current active region.

That's all fine and dandy, but we trim the reservation
in a page-aligned fashion.  That's bad because we start the
reservation at a non-page-aligned address: physbase.

The reservation may only span 2 bytes, but those bytes
may span two pfns and cause a reserve_size of 2*PAGE_SIZE.

Take the case where you reserve 0x2 bytes at 0x0fff and
where the active region ends at 0x1000.  You'll jump into
that if() statement, but node_ar.end_pfn=0x1 and
start_pfn=0x0.  You'll end up with a reserve_size=0x1000,
and then call

  reserve_bootmem_node(node, physbase=0xfff, size=0x1000);

0x1000 may not be on the same node as 0xfff.  Oops.

In almost all the vm code, end_<anything> is not inclusive.
If you have an end_pfn of 0x1234, page 0x1234 is not
included in the range.  Using PFN_UP instead of the 
(>> PAGE_SHIFT) will make this consistent with the other VM
code.
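
For reference, PFN_UP() in include/linux/pfn.h rounds up to the next
page frame boundary:

	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
	/* with 4K pages: PFN_UP(0x1001) = 0x2 (exclusive end pfn),
	 * while 0x1001 >> PAGE_SHIFT = 0x1 would drop page 0x1 */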

We also need to do math for the reserved size with physbase
instead of start_pfn.  node_ar.end_pfn << PAGE_SHIFT is
*precisely* the end of the node.  However,
(start_pfn << PAGE_SHIFT) is *NOT* precisely the beginning
of the reserved area.  That is, of course, physbase.
If we don't use physbase here, the reserve_size can be
made too large.
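
With the numbers from the example above (4K pages):

	reserve_size = (node_ar.end_pfn << PAGE_SHIFT) - physbase;
			/* = (0x1 << 12) - 0xfff = 0x1 byte */
	/* the old math, (0x1 << 12) - (start_pfn << PAGE_SHIFT)
	 * = 0x1000 - 0x0 = 0x1000 bytes, spills past the node */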

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Tested-by: Geoff Levand <geoffrey.levand@am.sony.com>  Tested on PS3.
---

 linux-2.6.git-dave/arch/powerpc/mm/numa.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff -puN arch/powerpc/mm/numa.c~reserve-over-fix arch/powerpc/mm/numa.c
--- a/arch/powerpc/mm/numa.c~reserve-over-fix
+++ b/arch/powerpc/mm/numa.c
@@ -19,6 +19,7 @@
 #include <linux/notifier.h>
 #include <linux/lmb.h>
 #include <linux/of.h>
+#include <linux/pfn.h>
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/system.h>
@@ -882,7 +883,7 @@ static void mark_reserved_regions_for_ni
 		unsigned long physbase = lmb.reserved.region[i].base;
 		unsigned long size = lmb.reserved.region[i].size;
 		unsigned long start_pfn = physbase >> PAGE_SHIFT;
-		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+		unsigned long end_pfn = PFN_UP(physbase + size);
 		struct node_active_region node_ar;
 		unsigned long node_end_pfn = node->node_start_pfn +
 					     node->node_spanned_pages;
@@ -908,7 +909,7 @@ static void mark_reserved_regions_for_ni
 			 */
 			if (end_pfn > node_ar.end_pfn)
 				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
-					- (start_pfn << PAGE_SHIFT);
+					- physbase;
 			/*
 			 * Only worry about *this* node, others may not
 			 * yet have valid NODE_DATA().

