linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Andy Whitcroft <apw@shadowen.org>
To: gone@us.ibm.com
Cc: linux-kernel@vger.kernel.org, apw@shadowen.org
Subject: Re: [PATCH][RFC] Discontigmem support for the x440
Date: 18 Feb 2003 22:39:41 +0000	[thread overview]
Message-ID: <1045608022.22519.9.camel@voidhawk.shadowen.org> (raw)

[-- Attachment #1: Type: text/plain, Size: 2078 bytes --]

Pat,

Whilst looking at the Summit NUMA support I believe I have found a bug
in the memory hole handling.  Specifically, there appears to be a type
mismatch between get_zholes_size() returning a single long and
free_area_init_core() requiring a long array.  What I cannot adequately
explain is why this does not lead to a panic during boot.

Attached is a patch against 2.5.59-mjb6 which I believe should correct
this.  It has been tested in isolation and compile tested, but as I
don't have access to a test machine I cannot be sure it works.  I
believe some investigation is needed to understand why this bug does not
prevent booting, or lead to a large disparity in the zone free page
counts, perhaps the e820 map is helping here.

[gory details for the interested]
Under NUMA support, when constructing the memory map we call
free_area_init_node() to initialise the pglist_data and allocate the
memory map structures. As part of this we supply a per node, per memory
zone page count and a per node, per memory zone missing page count. 
These are used in free_area_init_core() to determine the true number of
pages per node, per zone.  In the existing summit code we parse the SRAT
in order to locate and size the inter-chunk gaps, on a per node basis. 
Later this is queried via get_zholes_size() from zone_sizes_init(). 
Unfortunately, get_zholes_size() is returning a single long representing
the per node total holes, whilst zone_sizes_init() requires an array of
longs, one per zone (long[MAX_NR_ZONES]). In the zero holes case this
is safe: if there are zero pages of hole then we pass an apparently
null pointer to free_area_init_node(), which is interpreted as
having no holes.  In the presence of any such holes a low-memory
reference would be passed, potentially leading to an oops.

The attached patch modifies the memory chunk hole scan such that each
hole is allocated to one or more zones using the calculated zone
boundaries, converting zholes_size[] from a per node count to a per node,
per zone count in a similar form to the associated zones[] array.

Cheers.

-apw


[-- Attachment #2: patch.mjb6-zholes --]
[-- Type: text/x-patch, Size: 5706 bytes --]

diff -X /home/apw/bin/makediff.excl -rupN linux-2.5.59-mjb6/arch/i386/kernel/srat.c linux-2.5.59-mjb6-zholes/arch/i386/kernel/srat.c
--- linux-2.5.59-mjb6/arch/i386/kernel/srat.c	2003-02-12 11:28:54.000000000 +0000
+++ linux-2.5.59-mjb6-zholes/arch/i386/kernel/srat.c	2003-02-13 13:17:08.000000000 +0000
@@ -50,7 +50,8 @@ static u8 pxm_bitmap[PXM_BITMAP_LEN];	/*
 struct node_memory_chunk_s node_memory_chunk[MAXCLUMPS];
 
 static int num_memory_chunks;		/* total number of memory chunks */
-static unsigned long zholes_size[MAX_NUMNODES];
+static int zholes_size_init;
+static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
 
 unsigned long node_start_pfn[MAX_NUMNODES];
 unsigned long node_end_pfn[MAX_NUMNODES];
@@ -151,6 +152,49 @@ static void __init parse_memory_affinity
 		 "enabled and removable" : "enabled" ) );
 }
 
+#if MAX_NR_ZONES != 3
+#error "MAX_NR_ZONES != 3, chunk_to_zone requires review"
+#endif
+/* Take a chunk of pages from page frame cstart to cend and count the number
+ * of pages in each zone, returned via zones[].
+ */
+static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, 
+		unsigned long *zones)
+{
+	unsigned long max_dma;
+	extern unsigned long max_low_pfn;
+
+	int z;
+	unsigned long rend;
+
+	/* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
+	 * similarly scoped information and should be handled in a consistant
+	 * manner.
+	 */
+	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+	/* Split the hole into the zones in which it falls.  Repeatedly
+	 * take the segment in which the remaining hole starts, round it
+	 * to the end of that zone.
+	 */
+	memset(zones, 0, MAX_NR_ZONES * sizeof(long));
+	while (cstart < cend) {
+		if (cstart < max_dma) {
+			z = ZONE_DMA;
+			rend = (cend < max_dma)? cend : max_dma;
+
+		} else if (cstart < max_low_pfn) {
+			z = ZONE_NORMAL;
+			rend = (cend < max_low_pfn)? cend : max_low_pfn;
+
+		} else {
+			z = ZONE_HIGHMEM;
+			rend = cend;
+		}
+		zones[z] += rend - cstart;
+		cstart = rend;
+	}
+}
 
 /* Parse the ACPI Static Resource Affinity Table */
 static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
@@ -242,10 +286,6 @@ static int __init acpi20_parse_srat(stru
 				} else { /* We've found another chunk of memory for the node */
 					if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
 						printk("found a another chunk on nid %d, chunk %d\n", nid, j);
-
-						zholes_size[nid] = zholes_size[nid] +
-							(node_memory_chunk[j].start_pfn
-							 - node_end_pfn[nid]);
 						node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
 					}
 				}
@@ -396,10 +436,53 @@ printk("Begin table scan....\n");
 	wbinvd();
 }
 
-unsigned long __init get_zholes_size(int nid)
+/* For each node run the memory list to determine whether there are
+ * any memory holes.  For each hole determine which ZONE they fall
+ * into.
+ *
+ * NOTE#1: this requires knowledge of the zone boundries and so
+ * _cannot_ be performed before those are calculated in setup_memory.
+ * 
+ * NOTE#2: we rely on the fact that the memory chunks are ordered by
+ * start pfn number during setup.
+ */
+static void __init get_zholes_init(void)
 {
+	int nid;
+	int c;
+	int first;
+	unsigned long end = 0;
+
+	for (nid = 0; nid < numnodes; nid++) {
+		first = 1;
+		for (c = 0; c < num_memory_chunks; c++){
+			if (node_memory_chunk[c].nid == nid) {
+				if (first) {
+					end = node_memory_chunk[c].end_pfn;
+					first = 0;
+
+				} else {
+					/* Record any gap between this chunk
+					 * and the previous chunk on this node
+					 * against the zones it spans.
+					 */
+					chunk_to_zones(end,
+						node_memory_chunk[c].start_pfn,
+						&zholes_size[nid * MAX_NR_ZONES]);
+				}
+			}
+		}
+	}
+}
+
+unsigned long * __init get_zholes_size(int nid)
+{
+	if (!zholes_size_init) {
+		zholes_size_init++;
+		get_zholes_init();
+	}
 	if((nid >= numnodes) | (nid >= MAX_NUMNODES))
 		printk("%s: nid = %d is invalid. numnodes = %d",
 		       __FUNCTION__, nid, numnodes);
-	return zholes_size[nid];
+	return &zholes_size[nid * MAX_NR_ZONES];
 }
diff -X /home/apw/bin/makediff.excl -rupN linux-2.5.59-mjb6/arch/i386/mm/discontig.c linux-2.5.59-mjb6-zholes/arch/i386/mm/discontig.c
--- linux-2.5.59-mjb6/arch/i386/mm/discontig.c	2003-02-12 11:28:54.000000000 +0000
+++ linux-2.5.59-mjb6-zholes/arch/i386/mm/discontig.c	2003-02-13 13:03:34.000000000 +0000
@@ -290,7 +290,7 @@ void __init zone_sizes_init(void)
 
 	for (nid = 0; nid < numnodes; nid++) {
 		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-		unsigned long zholes_size;
+		unsigned long *zholes_size;
 		unsigned int max_dma;
 
 		unsigned long low = max_low_pfn;
@@ -331,7 +331,7 @@ void __init zone_sizes_init(void)
 			lmem_map &= PAGE_MASK;
 			free_area_init_node(nid, NODE_DATA(nid), 
 				(struct page *)lmem_map, zones_size, 
-				start, (unsigned long *)zholes_size);
+				start, zholes_size);
 		}
 	}
 	return;
diff -X /home/apw/bin/makediff.excl -rupN linux-2.5.59-mjb6/include/asm-i386/srat.h linux-2.5.59-mjb6-zholes/include/asm-i386/srat.h
--- linux-2.5.59-mjb6/include/asm-i386/srat.h	2003-02-12 11:28:55.000000000 +0000
+++ linux-2.5.59-mjb6-zholes/include/asm-i386/srat.h	2003-02-13 11:02:43.000000000 +0000
@@ -34,7 +34,7 @@
 #define MAXCLUMPS		(MAX_CLUMPS_PER_NODE * MAX_NUMNODES)
 extern int pfn_to_nid(unsigned long);
 extern void get_memcfg_from_srat(void);
-extern unsigned long get_zholes_size(int);
+extern unsigned long *get_zholes_size(int);
 #define get_memcfg_numa() get_memcfg_from_srat()
 
 /*

             reply	other threads:[~2003-02-18 22:30 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-02-18 22:39 Andy Whitcroft [this message]
2003-02-18 22:55 ` [PATCH][RFC] Discontigmem support for the x440 Patricia Gaughen
  -- strict thread matches above, loose matches on Subject: below --
2003-02-06 20:16 Grover, Andrew
2003-02-06  7:10 Patricia Gaughen
2003-02-08 19:23 ` Martin J. Bligh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1045608022.22519.9.camel@voidhawk.shadowen.org \
    --to=apw@shadowen.org \
    --cc=gone@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).