* [patch 1/4] x86: cleanup and add missing log levels for k8
@ 2009-09-25 22:19 David Rientjes
  2009-09-25 22:20 ` [patch 2/4] x86: export k8 physical topology David Rientjes
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: David Rientjes @ 2009-09-25 22:19 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner
  Cc: Yinghai Lu, Balbir Singh, Ankita Garg, Len Brown, x86, linux-kernel

Convert all printk's in arch/x86/mm/k8topology_64.c to use pr_info() or
pr_err() appropriately.

Adds log levels for messages currently lacking them.
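
For reference, pr_info() and pr_err() are thin wrappers that bake the log
level into printk(); a rough sketch of the definitions (simplified from
include/linux/kernel.h, not the verbatim source):

        #define pr_fmt(fmt) fmt   /* per-file override point, default is a no-op */
        #define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
        #define pr_err(fmt, ...)  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)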

Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/x86/mm/k8topology_64.c |   39 +++++++++++++++++++--------------------
 1 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -91,14 +91,14 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (nb < 0)
 		return nb;
 
-	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+	pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
 
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
 		return -1;
 
-	printk(KERN_INFO "Number of nodes %d\n", numnodes);
+	pr_info("Number of nodes %d\n", numnodes);
 
 	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
@@ -111,28 +111,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
-				printk("Skipping disabled node %d\n", i);
+				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-			       base, limit);
+			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+				base, limit);
 			continue;
 		}
 
 		if (!limit) {
-			printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
-			       i, base);
+			pr_info("Skipping node entry %d (base %lx)\n",
+				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
-			       nodeid, (base>>8)&3, (limit>>8) & 3);
+			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -1;
 		}
 		if (node_isset(nodeid, node_possible_map)) {
-			printk(KERN_INFO "Node %d already present. Skipping\n",
-			       nodeid);
+			pr_info("Node %d already present, skipping\n",
+				nodeid);
 			continue;
 		}
 
@@ -154,24 +154,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		if (limit > end)
 			limit = end;
 		if (limit == base) {
-			printk(KERN_ERR "Empty node %d\n", nodeid);
+			pr_err("Empty node %d\n", nodeid);
 			continue;
 		}
 		if (limit < base) {
-			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %lx-%lx.\n",
 			       nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %lx,%lx\n",
 			       prevbase, base);
 			return -1;
 		}
 
-		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
-		       nodeid, base, limit);
+		pr_info("Node %d MemBase %016lx Limit %016lx\n",
+			nodeid, base, limit);
 
 		found++;
 
@@ -188,10 +188,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
-		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+		pr_err("No NUMA node hash function found. Contact maintainer\n");
 		return -1;
 	}
-	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+	pr_info("Using node hash shift of %d\n", memnode_shift);
 
 	/* use the coreid bits from early_identify_cpu */
 	bits = boot_cpu_data.x86_coreid_bits;
@@ -200,8 +200,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	/* need to get boot_cpu_id early for system with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
-		printk(KERN_INFO "BSP APIC ID: %02x\n",
-				 boot_cpu_physical_apicid);
+		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
 		apicid_base = boot_cpu_physical_apicid;
 	}
 


* [patch 2/4] x86: export k8 physical topology
  2009-09-25 22:19 [patch 1/4] x86: cleanup and add missing log levels for k8 David Rientjes
@ 2009-09-25 22:20 ` David Rientjes
  2009-10-12 21:31   ` [tip:x86/mm] x86: Export " tip-bot for David Rientjes
  2009-09-25 22:20 ` [patch 3/4] x86: export srat " David Rientjes
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: David Rientjes @ 2009-09-25 22:20 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner
  Cc: Yinghai Lu, Balbir Singh, Ankita Garg, Len Brown, x86, linux-kernel

To eventually interleave emulated nodes over physical nodes, we need to
know the physical topology of the machine without actually registering
it.  This does the k8 node setup in two parts: detection and
registration.  NUMA emulation can then use the physical topology
detected to set up the address ranges of emulated nodes accordingly.  If
emulation isn't used, the k8 nodes are registered as normal.

Two formals are added to the x86 NUMA setup functions: `acpi' and `k8'.
These represent whether ACPI or K8 NUMA has been detected; both cannot be
true at the same time.  This specifies to the NUMA emulation code whether
an underlying physical NUMA topology exists and which interface to use.

This patch deals solely with separating the k8 setup path into
Northbridge detection and registration steps and leaves the ACPI changes
for a subsequent patch.  The `acpi' formal is added here, however, to
avoid touching all the header files again in the next patch.

This approach also ensures emulated nodes will not span physical nodes so
the true memory latency is not misrepresented.

k8_get_nodes() may now be used to export the k8 physical topology of the
machine for NUMA emulation.
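
As a rough sketch of the intended call flow (fragments mirroring the hunks
below, not standalone compilable code):

        /* setup_arch(): detection only -- parse the Northbridge, register nothing */
        int k8 = 0;
        #ifdef CONFIG_K8_NUMA
        k8 = !k8_numa_init(0, max_pfn);
        #endif
        initmem_init(0, max_pfn, acpi, k8);

        /* 64-bit initmem_init(): register what was detected above */
        if (!numa_off && k8 && !k8_scan_nodes())
                return;

        /* or, for NUMA emulation, export the detected physical ranges instead */
        struct bootnode physnodes[8];
        int nr_phys = k8_get_nodes(physnodes);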

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/x86/include/asm/k8.h         |    4 ++-
 arch/x86/include/asm/page_types.h |    3 +-
 arch/x86/kernel/setup.c           |   10 ++++++-
 arch/x86/mm/init_32.c             |    4 +-
 arch/x86/mm/init_64.c             |    3 +-
 arch/x86/mm/k8topology_64.c       |   52 +++++++++++++++++++++++++++---------
 arch/x86/mm/numa_32.c             |    4 +-
 arch/x86/mm/numa_64.c             |    6 ++--
 8 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -10,7 +10,9 @@ extern struct pci_dev **k8_northbridges;
 extern int num_k8_northbridges;
 extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
+extern int k8_get_nodes(struct bootnode *nodes);
+extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8);
 extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
 #include <asm/percpu.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
+#include <asm/k8.h>
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
@@ -691,6 +692,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 
 void __init setup_arch(char **cmdline_p)
 {
+	int acpi = 0;
+	int k8 = 0;
+
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
@@ -937,7 +941,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_numa_init();
 #endif
 
-	initmem_init(0, max_pfn);
+#ifdef CONFIG_K8_NUMA
+	k8 = !k8_numa_init(0, max_pfn);
+#endif
+
+	initmem_init(0, max_pfn, acpi, k8);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn,
-				  unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 	unsigned long bootmap_size, bootmap;
 
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
 #include <asm/apic.h>
 #include <asm/k8.h>
 
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
 static __init int find_northbridge(void)
 {
 	int num;
@@ -76,12 +79,26 @@ static __init void early_get_boot_cpu_id(void)
 	early_init_lapic_mapping();
 }
 
-int __init k8_scan_nodes(unsigned long start, unsigned long end)
+int __init k8_get_nodes(struct bootnode *physnodes)
 {
-	unsigned numnodes, cores, bits, apicid_base;
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
+int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long start = PFN_PHYS(start_pfn);
+	unsigned long end = PFN_PHYS(end_pfn);
+	unsigned numnodes;
 	unsigned long prevbase;
-	struct bootnode nodes[8];
-	int i, j, nb, found = 0;
+	int i, nb, found = 0;
 	u32 nodeid, reg;
 
 	if (!early_pci_allowed())
@@ -98,9 +115,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (numnodes <= 1)
 		return -1;
 
-	pr_info("Number of nodes %d\n", numnodes);
+	pr_info("Number of physical nodes %d\n", numnodes);
 
-	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
 		unsigned long base, limit;
@@ -130,7 +146,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -1;
 		}
-		if (node_isset(nodeid, node_possible_map)) {
+		if (node_isset(nodeid, nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -141,8 +157,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit |= (1<<24)-1;
 		limit++;
 
-		if (limit > max_pfn << PAGE_SHIFT)
-			limit = max_pfn << PAGE_SHIFT;
+		if (limit > end)
+			limit = end;
 		if (limit <= base)
 			continue;
 
@@ -180,12 +196,23 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 		prevbase = base;
 
-		node_set(nodeid, node_possible_map);
+		node_set(nodeid, nodes_parsed);
 	}
 
 	if (!found)
 		return -1;
+	return 0;
+}
 
+int __init k8_scan_nodes(void)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base;
+	int i;
+
+	BUG_ON(nodes_empty(nodes_parsed));
+	node_possible_map = nodes_parsed;
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
 		pr_err("No NUMA node hash function found. Contact maintainer\n");
@@ -204,9 +231,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for (i = 0; i < 8; i++) {
-		if (nodes[i].start == nodes[i].end)
-			continue;
+	for_each_node_mask(i, node_possible_map) {
+		int j;
 
 		e820_register_active_regions(i,
 				nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(unsigned long start_pfn,
-				  unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 	int nid;
 	long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -524,7 +524,8 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+				int acpi, int k8)
 {
 	int i;
 
@@ -547,8 +548,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					last_pfn<<PAGE_SHIFT))
+	if (!numa_off && k8 && !k8_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);


* [patch 3/4] x86: export srat physical topology
  2009-09-25 22:19 [patch 1/4] x86: cleanup and add missing log levels for k8 David Rientjes
  2009-09-25 22:20 ` [patch 2/4] x86: export k8 physical topology David Rientjes
@ 2009-09-25 22:20 ` David Rientjes
  2009-10-12 21:32   ` [tip:x86/mm] x86: Export " tip-bot for David Rientjes
  2009-09-25 22:20 ` [patch 4/4] x86: interleave emulated nodes over physical nodes David Rientjes
  2009-10-12 21:31 ` [tip:x86/mm] x86: Clean up and add missing log levels for k8 tip-bot for David Rientjes
  3 siblings, 1 reply; 10+ messages in thread
From: David Rientjes @ 2009-09-25 22:20 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner
  Cc: Yinghai Lu, Balbir Singh, Ankita Garg, Len Brown, x86, linux-kernel

This is the counterpart to "x86: export k8 physical topology" for SRAT.
It is not as invasive because the acpi code already separates node setup
into detection and registration steps, with the exception of registering
e820 active regions in acpi_numa_memory_affinity_init().  This is now
moved to acpi_scan_nodes() if NUMA emulation is disabled or deferred.

acpi_numa_init() now returns a value which specifies whether an
underlying SRAT was located.  If so, that topology can be used by the
emulation code to interleave emulated nodes over physical nodes or to
register the nodes for ACPI.

acpi_get_nodes() may now be used to export the srat physical topology of
the machine for NUMA emulation.
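
The resulting call flow, sketched the same way (fragments mirroring the hunks
below, not standalone code; a nonzero return from acpi_numa_init() here means
SRAT memory affinity entries were parsed):

        /* setup_arch(): detection */
        acpi = acpi_numa_init();
        #ifdef CONFIG_K8_NUMA
        if (!acpi)
                k8 = !k8_numa_init(0, max_pfn);
        #endif
        initmem_init(0, max_pfn, acpi, k8);

        /* initmem_init(): registration, skipped when NUMA emulation takes over */
        if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                                  last_pfn << PAGE_SHIFT))
                return;

        /* the emulation code can instead pull the parsed SRAT ranges */
        struct bootnode physnodes[MAX_NUMNODES];
        int nr_phys = acpi_get_nodes(physnodes);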

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/x86/include/asm/acpi.h |    1 +
 arch/x86/kernel/setup.c     |    5 +++--
 arch/x86/mm/numa_64.c       |    4 ++--
 arch/x86/mm/srat_64.c       |   28 +++++++++++++++++++++-------
 drivers/acpi/numa.c         |   10 ++++++----
 5 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,6 +158,7 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
+extern int acpi_get_nodes(struct bootnode *physnodes);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -938,11 +938,12 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	acpi = acpi_numa_init();
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	k8 = !k8_numa_init(0, max_pfn);
+	if (!acpi)
+		k8 = !k8_numa_init(0, max_pfn);
 #endif
 
 	initmem_init(0, max_pfn, acpi, k8);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -540,8 +540,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-					  last_pfn << PAGE_SHIFT))
+	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+						  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
-	e820_register_active_regions(node, start >> PAGE_SHIFT,
-				     end >> PAGE_SHIFT);
 
 	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
 		update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
+int __init acpi_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
-	if (!nodes_cover_memory(nodes)) {
-		bad_srat();
-		return -1;
-	}
-
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	for_each_node_mask(i, nodes_parsed)
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+						nodes[i].end >> PAGE_SHIFT);
+	if (!nodes_cover_memory(nodes)) {
+		bad_srat();
+		return -1;
+	}
+
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -283,22 +283,24 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 
 int __init acpi_numa_init(void)
 {
+	int ret = 0;
+
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
 				      acpi_parse_x2apic_affinity, NR_CPUS);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
 				      acpi_parse_processor_affinity, NR_CPUS);
-		acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
-				      acpi_parse_memory_affinity,
-				      NR_NODE_MEMBLKS);
+		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+					    acpi_parse_memory_affinity,
+					    NR_NODE_MEMBLKS);
 	}
 
 	/* SLIT: System Locality Information Table */
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
 	acpi_numa_arch_fixup();
-	return 0;
+	return ret;
 }
 
 int acpi_get_pxm(acpi_handle h)


* [patch 4/4] x86: interleave emulated nodes over physical nodes
  2009-09-25 22:19 [patch 1/4] x86: cleanup and add missing log levels for k8 David Rientjes
  2009-09-25 22:20 ` [patch 2/4] x86: export k8 physical topology David Rientjes
  2009-09-25 22:20 ` [patch 3/4] x86: export srat " David Rientjes
@ 2009-09-25 22:20 ` David Rientjes
  2009-10-01  8:56   ` Ingo Molnar
  2009-10-12 21:32   ` [tip:x86/mm] x86: Interleave " tip-bot for David Rientjes
  2009-10-12 21:31 ` [tip:x86/mm] x86: Clean up and add missing log levels for k8 tip-bot for David Rientjes
  3 siblings, 2 replies; 10+ messages in thread
From: David Rientjes @ 2009-09-25 22:20 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner
  Cc: Yinghai Lu, Balbir Singh, Ankita Garg, Len Brown, x86, linux-kernel

Add interleaved NUMA emulation support

This patch interleaves emulated nodes over the system's physical nodes.
This is required for interleave optimizations since mempolicies, for
example, operate by iterating over a nodemask and act without knowledge
of node distances.  It can also be used for testing memory latencies and
NUMA bugs in the kernel.

There're a couple of ways to do this:

 - divide the number of emulated nodes by the number of physical nodes
   and allocate the result on each physical node, or

 - allocate each successive emulated node on a different physical node
   until all memory is exhausted.

The disadvantage of the first option is that, depending on the asymmetry
in node capacities of each physical node, emulated nodes may substantially
differ in size on a particular physical node compared to another.

The disadvantage of the second option is that, also depending on the
asymmetry in node capacities of each physical node, there may be more
emulated nodes allocated on a single physical node than on another.

This patch implements the second option; we accept the possibility of
having slightly more emulated nodes on a particular physical node
compared to another, in lieu of node size asymmetry.

 [ Note that "node capacity" of a physical node is not only a function of
   its addressable range, but also is affected by subtracting out the
   amount of reserved memory over that range.  NUMA emulation only deals
   with available, non-reserved memory quantities. ]

We ensure there is at least a minimal amount of available memory
allocated to each node.  We also make sure that at least this amount of
available memory is available in ZONE_DMA32 for any node that includes
both ZONE_DMA32 and ZONE_NORMAL.

This patch also cleans the emulation code up by no longer passing the
statically allocated struct bootnode array among the various functions.
This init.data array is declared at file scope rather than on the stack
since it may be very large.

The WARN_ON() for nodes_cover_memory() when faking proximity domains is
removed since it relies on successive nodes always having greater start
addresses than previous nodes; with interleaving this is no longer always
true.
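
The placement policy is easier to see with the e820 bookkeeping stripped
away; here is a toy userspace model of the round-robin interleave
(illustrative only: made-up node sizes, no memory holes, and none of the
FAKE_NODE_MIN_SIZE or ZONE_DMA32 handling that the real
split_nodes_interleave() does):

#include <stdio.h>

struct range { unsigned long long start, end; };

int main(void)
{
        /* two asymmetric physical nodes, sizes in MB (made-up numbers) */
        struct range phys[] = { { 0, 3072 }, { 3072, 5120 } };
        const int nr_phys = 2, nr_fake = 8;
        unsigned long long size = (phys[1].end - phys[0].start) / nr_fake;
        int remaining = nr_phys, ret = 0;

        /*
         * Round-robin: hand each successive fake node to the next physical
         * node that still has memory, until everything is given out.
         */
        while (remaining) {
                for (int i = 0; i < nr_phys; i++) {
                        unsigned long long end;

                        if (phys[i].start == phys[i].end)
                                continue;       /* this physical node is used up */
                        end = phys[i].start + size;
                        /* don't create more fake nodes than were asked for */
                        if (remaining + ret >= nr_fake || end > phys[i].end)
                                end = phys[i].end;
                        printf("fake node %d: %llu-%llu MB on phys %d\n",
                               ret++, phys[i].start, end, i);
                        phys[i].start = end;
                        if (phys[i].start == phys[i].end)
                                remaining--;
                }
        }
        return 0;
}

With a 3GB and a 2GB physical node and eight fake nodes requested, this prints
fake nodes 0, 2, 4 and 6 carved from the first physical node and 1, 3, 5 and 7
from the second, which is the layout the real code aims for.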

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 arch/x86/mm/numa_64.c |  211 ++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/mm/srat_64.c |    1 -
 2 files changed, 184 insertions(+), 28 deletions(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+					int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+						int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start.  The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 					break;
 				}
 			}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric.  Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+			unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+							num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 			if (size)
 				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+					if (setup_node_range(num_nodes, &addr,
+						size, max_addr) < 0)
 						goto done;
 			if (!*cmdline)
 				break;
@@ -473,7 +634,7 @@ done:
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 							 num_nodes, num);
 			goto out;
 		}
@@ -482,7 +643,7 @@ done:
 			/* Split remaining nodes into coeff chunks */
 			if (coeff <= 0)
 				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+			num_nodes += split_nodes_equally(&addr, max_addr,
 							 num_nodes, coeff);
 			break;
 		case ',':
@@ -490,8 +651,8 @@ done:
 			break;
 		default:
 			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
+			setup_node_range(num_nodes, &addr, max_addr - addr,
+					 max_addr);
 			num_nodes++;
 		}
 	}
@@ -505,14 +666,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
 			node_set(i, nodes_parsed);
-	WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)


* Re: [patch 4/4] x86: interleave emulated nodes over physical nodes
  2009-09-25 22:20 ` [patch 4/4] x86: interleave emulated nodes over physical nodes David Rientjes
@ 2009-10-01  8:56   ` Ingo Molnar
  2009-10-09  9:34     ` David Rientjes
  2009-10-12 21:32   ` [tip:x86/mm] x86: Interleave " tip-bot for David Rientjes
  1 sibling, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2009-10-01  8:56 UTC (permalink / raw)
  To: David Rientjes, H. Peter Anvin, Thomas Gleixner
  Cc: Ingo Molnar, Yinghai Lu, Balbir Singh, Ankita Garg, Len Brown,
	x86, linux-kernel


* David Rientjes <rientjes@google.com> wrote:

> Add interleaved NUMA emulation support
> 
> This patch interleaves emulated nodes over the system's physical 
> nodes. This is required for interleave optimizations since 
> mempolicies, for example, operate by iterating over a nodemask and act 
> without knowledge of node distances.  It can also be used for testing 
> memory latencies and NUMA bugs in the kernel.
> 
> There're a couple of ways to do this:
> 
>  - divide the number of emulated nodes by the number of physical nodes
>    and allocate the result on each physical node, or
> 
>  - allocate each successive emulated node on a different physical node
>    until all memory is exhausted.
> 
> The disadvantage of the first option is, depending on the asymmetry in 
> node capacities of each physical node, emulated nodes may 
> substantially differ in size on a particular physical node compared to 
> another.
> 
> The disadvantage of the second option is, also depending on the 
> asymmetry in node capacities of each physical node, there may be more 
> emulated nodes allocated on a single physical node as another.
> 
> This patch implements the second option; we sacrifice the possibility 
> that we may have slightly more emulated nodes on a particular physical 
> node compared to another in lieu of node size asymmetry.
> 
>  [ Note that "node capacity" of a physical node is not only a function of
>    its addressable range, but also is affected by subtracting out the
>    amount of reserved memory over that range.  NUMA emulation only deals
>    with available, non-reserved memory quantities. ]
> 
> We ensure there is at least a minimal amount of available memory
> allocated to each node.  We also make sure that at least this amount of
> available memory is available in ZONE_DMA32 for any node that includes
> both ZONE_DMA32 and ZONE_NORMAL.
> 
> This patch also cleans the emulation code up by no longer passing the
> statically allocated struct bootnode array among the various functions.
> This init.data array is not allocated on the stack since it may be very
> large and thus it may be accessed at file scope.
> 
> The WARN_ON() for nodes_cover_memory() when faking proximity domains is
> removed since it relies on successive nodes always having greater start
> addresses than previous nodes; with interleaving this is no longer always
> true.
> 
> Cc: Yinghai Lu <yinghai@kernel.org>
> Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
> Cc: Ankita Garg <ankita@in.ibm.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  arch/x86/mm/numa_64.c |  211 ++++++++++++++++++++++++++++++++++++++++++------
>  arch/x86/mm/srat_64.c |    1 -
>  2 files changed, 184 insertions(+), 28 deletions(-)

Looks very nice. Peter, Thomas, any objections against queueing this up 
in the x86 tree for more testing?

	Ingo


* Re: [patch 4/4] x86: interleave emulated nodes over physical nodes
  2009-10-01  8:56   ` Ingo Molnar
@ 2009-10-09  9:34     ` David Rientjes
  0 siblings, 0 replies; 10+ messages in thread
From: David Rientjes @ 2009-10-09  9:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: H. Peter Anvin, Thomas Gleixner, Ingo Molnar, Yinghai Lu,
	Balbir Singh, Ankita Garg, Len Brown, x86, linux-kernel

On Thu, 1 Oct 2009, Ingo Molnar wrote:

> > This patch interleaves emulated nodes over the system's physical 
> > nodes. This is required for interleave optimizations since 
> > mempolicies, for example, operate by iterating over a nodemask and act 
> > without knowledge of node distances.  It can also be used for testing 
> > memory latencies and NUMA bugs in the kernel.
> > 
> > There're a couple of ways to do this:
> > 
> >  - divide the number of emulated nodes by the number of physical nodes
> >    and allocate the result on each physical node, or
> > 
> >  - allocate each successive emulated node on a different physical node
> >    until all memory is exhausted.
> > 
> > The disadvantage of the first option is, depending on the asymmetry in 
> > node capacities of each physical node, emulated nodes may 
> > substantially differ in size on a particular physical node compared to 
> > another.
> > 
> > The disadvantage of the second option is, also depending on the 
> > asymmetry in node capacities of each physical node, there may be more 
> > emulated nodes allocated on a single physical node as another.
> > 
> > This patch implements the second option; we sacrifice the possibility 
> > that we may have slightly more emulated nodes on a particular physical 
> > node compared to another in lieu of node size asymmetry.
> > 
> >  [ Note that "node capacity" of a physical node is not only a function of
> >    its addressable range, but also is affected by subtracting out the
> >    amount of reserved memory over that range.  NUMA emulation only deals
> >    with available, non-reserved memory quantities. ]
> > 
> > We ensure there is at least a minimal amount of available memory
> > allocated to each node.  We also make sure that at least this amount of
> > available memory is available in ZONE_DMA32 for any node that includes
> > both ZONE_DMA32 and ZONE_NORMAL.
> > 
> > This patch also cleans the emulation code up by no longer passing the
> > statically allocated struct bootnode array among the various functions.
> > This init.data array is not allocated on the stack since it may be very
> > large and thus it may be accessed at file scope.
> > 
> > The WARN_ON() for nodes_cover_memory() when faking proximity domains is
> > removed since it relies on successive nodes always having greater start
> > addresses than previous nodes; with interleaving this is no longer always
> > true.
> > 
> > Cc: Yinghai Lu <yinghai@kernel.org>
> > Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
> > Cc: Ankita Garg <ankita@in.ibm.com>
> > Signed-off-by: David Rientjes <rientjes@google.com>
> > ---
> >  arch/x86/mm/numa_64.c |  211 ++++++++++++++++++++++++++++++++++++++++++------
> >  arch/x86/mm/srat_64.c |    1 -
> >  2 files changed, 184 insertions(+), 28 deletions(-)
> 
> Looks very nice. Peter, Thomas, any objections against queueing this up 
> in the x86 tree for more testing?
> 

Thanks!  Do you know when this will be merged?


* [tip:x86/mm] x86: Clean up and add missing log levels for k8
  2009-09-25 22:19 [patch 1/4] x86: cleanup and add missing log levels for k8 David Rientjes
                   ` (2 preceding siblings ...)
  2009-09-25 22:20 ` [patch 4/4] x86: interleave emulated nodes over physical nodes David Rientjes
@ 2009-10-12 21:31 ` tip-bot for David Rientjes
  3 siblings, 0 replies; 10+ messages in thread
From: tip-bot for David Rientjes @ 2009-10-12 21:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, yinghai, ankita, balbir, tglx,
	rientjes, mingo, len.brown

Commit-ID:  1af5ba514f0c2f2e2af965a4ffa5e8ab269271b9
Gitweb:     http://git.kernel.org/tip/1af5ba514f0c2f2e2af965a4ffa5e8ab269271b9
Author:     David Rientjes <rientjes@google.com>
AuthorDate: Fri, 25 Sep 2009 15:19:47 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 12 Oct 2009 22:56:45 +0200

x86: Clean up and add missing log levels for k8

Convert all printk's in arch/x86/mm/k8topology_64.c to use
pr_info() or pr_err() appropriately.

Adds log levels for messages currently lacking them.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251517440.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/k8topology_64.c |   39 +++++++++++++++++++--------------------
 1 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f825..a81561a 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -91,14 +91,14 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (nb < 0)
 		return nb;
 
-	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+	pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
 
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
 		return -1;
 
-	printk(KERN_INFO "Number of nodes %d\n", numnodes);
+	pr_info("Number of nodes %d\n", numnodes);
 
 	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
@@ -111,28 +111,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
-				printk("Skipping disabled node %d\n", i);
+				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-			       base, limit);
+			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+				base, limit);
 			continue;
 		}
 
 		if (!limit) {
-			printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
-			       i, base);
+			pr_info("Skipping node entry %d (base %lx)\n",
+				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
-			       nodeid, (base>>8)&3, (limit>>8) & 3);
+			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -1;
 		}
 		if (node_isset(nodeid, node_possible_map)) {
-			printk(KERN_INFO "Node %d already present. Skipping\n",
-			       nodeid);
+			pr_info("Node %d already present, skipping\n",
+				nodeid);
 			continue;
 		}
 
@@ -154,24 +154,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		if (limit > end)
 			limit = end;
 		if (limit == base) {
-			printk(KERN_ERR "Empty node %d\n", nodeid);
+			pr_err("Empty node %d\n", nodeid);
 			continue;
 		}
 		if (limit < base) {
-			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %lx-%lx.\n",
 			       nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %lx,%lx\n",
 			       prevbase, base);
 			return -1;
 		}
 
-		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
-		       nodeid, base, limit);
+		pr_info("Node %d MemBase %016lx Limit %016lx\n",
+			nodeid, base, limit);
 
 		found++;
 
@@ -188,10 +188,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
-		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+		pr_err("No NUMA node hash function found. Contact maintainer\n");
 		return -1;
 	}
-	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+	pr_info("Using node hash shift of %d\n", memnode_shift);
 
 	/* use the coreid bits from early_identify_cpu */
 	bits = boot_cpu_data.x86_coreid_bits;
@@ -200,8 +200,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	/* need to get boot_cpu_id early for system with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
-		printk(KERN_INFO "BSP APIC ID: %02x\n",
-				 boot_cpu_physical_apicid);
+		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
 		apicid_base = boot_cpu_physical_apicid;
 	}
 


* [tip:x86/mm] x86: Export k8 physical topology
  2009-09-25 22:20 ` [patch 2/4] x86: export k8 physical topology David Rientjes
@ 2009-10-12 21:31   ` tip-bot for David Rientjes
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for David Rientjes @ 2009-10-12 21:31 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, yinghai, andreas.herrmann3, ankita,
	balbir, tglx, rientjes, mingo, len.brown

Commit-ID:  8ee2debce32412118cf8c239e0026ace56ea1425
Gitweb:     http://git.kernel.org/tip/8ee2debce32412118cf8c239e0026ace56ea1425
Author:     David Rientjes <rientjes@google.com>
AuthorDate: Fri, 25 Sep 2009 15:20:00 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 12 Oct 2009 22:56:45 +0200

x86: Export k8 physical topology

To eventually interleave emulated nodes over physical nodes, we
need to know the physical topology of the machine without actually
registering it.  This does the k8 node setup in two parts:
detection and registration.  NUMA emulation can then use the
physical topology detected to set up the address ranges of emulated
nodes accordingly.  If emulation isn't used, the k8 nodes are
registered as normal.

Two formals are added to the x86 NUMA setup functions: `acpi' and
`k8'. These represent whether ACPI or K8 NUMA has been detected;
both cannot be true at the same time.  This specifies to the NUMA
emulation code whether an underlying physical NUMA topology exists
and which interface to use.

This patch deals solely with separating the k8 setup path into
Northbridge detection and registration steps and leaves the ACPI
changes for a subsequent patch.  The `acpi' formal is added here,
however, to avoid touching all the header files again in the next
patch.

This approach also ensures emulated nodes will not span physical
nodes so the true memory latency is not misrepresented.

k8_get_nodes() may now be used to export the k8 physical topology
of the machine for NUMA emulation.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251518400.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/k8.h         |    4 ++-
 arch/x86/include/asm/page_types.h |    3 +-
 arch/x86/kernel/setup.c           |   10 ++++++-
 arch/x86/mm/init_32.c             |    4 +-
 arch/x86/mm/init_64.c             |    3 +-
 arch/x86/mm/k8topology_64.c       |   52 +++++++++++++++++++++++++++---------
 arch/x86/mm/numa_32.c             |    4 +-
 arch/x86/mm/numa_64.c             |    6 ++--
 8 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b..c092f72 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -10,7 +10,9 @@ extern struct pci_dev **k8_northbridges;
 extern int num_k8_northbridges;
 extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
+extern int k8_get_nodes(struct bootnode *nodes);
+extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5c..642fe34 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8);
 extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2..fda0032 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
 #include <asm/percpu.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
+#include <asm/k8.h>
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
@@ -691,6 +692,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 
 void __init setup_arch(char **cmdline_p)
 {
+	int acpi = 0;
+	int k8 = 0;
+
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
@@ -937,7 +941,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_numa_init();
 #endif
 
-	initmem_init(0, max_pfn);
+#ifdef CONFIG_K8_NUMA
+	k8 = !k8_numa_init(0, max_pfn);
+#endif
+
+	initmem_init(0, max_pfn, acpi, k8);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1..5e32b07 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn,
-				  unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a..c20d30b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 	unsigned long bootmap_size, bootmap;
 
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a81561a..b9e2dbf 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
 #include <asm/apic.h>
 #include <asm/k8.h>
 
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
 static __init int find_northbridge(void)
 {
 	int num;
@@ -76,12 +79,26 @@ static __init void early_get_boot_cpu_id(void)
 	early_init_lapic_mapping();
 }
 
-int __init k8_scan_nodes(unsigned long start, unsigned long end)
+int __init k8_get_nodes(struct bootnode *physnodes)
 {
-	unsigned numnodes, cores, bits, apicid_base;
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
+int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long start = PFN_PHYS(start_pfn);
+	unsigned long end = PFN_PHYS(end_pfn);
+	unsigned numnodes;
 	unsigned long prevbase;
-	struct bootnode nodes[8];
-	int i, j, nb, found = 0;
+	int i, nb, found = 0;
 	u32 nodeid, reg;
 
 	if (!early_pci_allowed())
@@ -98,9 +115,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (numnodes <= 1)
 		return -1;
 
-	pr_info("Number of nodes %d\n", numnodes);
+	pr_info("Number of physical nodes %d\n", numnodes);
 
-	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
 		unsigned long base, limit;
@@ -130,7 +146,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -1;
 		}
-		if (node_isset(nodeid, node_possible_map)) {
+		if (node_isset(nodeid, nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -141,8 +157,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit |= (1<<24)-1;
 		limit++;
 
-		if (limit > max_pfn << PAGE_SHIFT)
-			limit = max_pfn << PAGE_SHIFT;
+		if (limit > end)
+			limit = end;
 		if (limit <= base)
 			continue;
 
@@ -180,12 +196,23 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 		prevbase = base;
 
-		node_set(nodeid, node_possible_map);
+		node_set(nodeid, nodes_parsed);
 	}
 
 	if (!found)
 		return -1;
+	return 0;
+}
 
+int __init k8_scan_nodes(void)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base;
+	int i;
+
+	BUG_ON(nodes_empty(nodes_parsed));
+	node_possible_map = nodes_parsed;
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
 		pr_err("No NUMA node hash function found. Contact maintainer\n");
@@ -204,9 +231,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for (i = 0; i < 8; i++) {
-		if (nodes[i].start == nodes[i].end)
-			continue;
+	for_each_node_mask(i, node_possible_map) {
+		int j;
 
 		e820_register_active_regions(i,
 				nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d253006..b20760c 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(unsigned long start_pfn,
-				  unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+				int acpi, int k8)
 {
 	int nid;
 	long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913b..dad5f42 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -524,7 +524,8 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+				int acpi, int k8)
 {
 	int i;
 
@@ -547,8 +548,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					last_pfn<<PAGE_SHIFT))
+	if (!numa_off && k8 && !k8_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);


* [tip:x86/mm] x86: Export srat physical topology
  2009-09-25 22:20 ` [patch 3/4] x86: export srat " David Rientjes
@ 2009-10-12 21:32   ` tip-bot for David Rientjes
  0 siblings, 0 replies; 10+ messages in thread
From: tip-bot for David Rientjes @ 2009-10-12 21:32 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, yinghai, andreas.herrmann3, ankita,
	balbir, tglx, rientjes, mingo, len.brown

Commit-ID:  8716273caef7f55f39fe4fc6c69c5f9f197f41f1
Gitweb:     http://git.kernel.org/tip/8716273caef7f55f39fe4fc6c69c5f9f197f41f1
Author:     David Rientjes <rientjes@google.com>
AuthorDate: Fri, 25 Sep 2009 15:20:04 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 12 Oct 2009 22:56:46 +0200

x86: Export srat physical topology

This is the counterpart to "x86: export k8 physical topology" for
SRAT. It is not as invasive because the acpi code already separates
node setup into detection and registration steps, with the
exception of registering e820 active regions in
acpi_numa_memory_affinity_init().  This is now moved to
acpi_scan_nodes() if NUMA emulation is disabled or deferred.

acpi_numa_init() now returns a value which specifies whether an
underlying SRAT was located.  If so, that topology can be used by
the emulation code to interleave emulated nodes over physical nodes
or to register the nodes for ACPI.

acpi_get_nodes() may now be used to export the srat physical
topology of the machine for NUMA emulation.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251518580.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/acpi.h |    1 +
 arch/x86/kernel/setup.c     |    5 +++--
 arch/x86/mm/numa_64.c       |    4 ++--
 arch/x86/mm/srat_64.c       |   28 +++++++++++++++++++++-------
 drivers/acpi/numa.c         |   10 ++++++----
 5 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc5..e3d4a0d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,6 +158,7 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
+extern int acpi_get_nodes(struct bootnode *physnodes);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fda0032..f891419 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -938,11 +938,12 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	acpi = acpi_numa_init();
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	k8 = !k8_numa_init(0, max_pfn);
+	if (!acpi)
+		k8 = !k8_numa_init(0, max_pfn);
 #endif
 
 	initmem_init(0, max_pfn, acpi, k8);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dad5f42..d1a3d94 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -540,8 +540,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-					  last_pfn << PAGE_SHIFT))
+	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+						  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381..891cbe6 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
-	e820_register_active_regions(node, start >> PAGE_SHIFT,
-				     end >> PAGE_SHIFT);
 
 	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
 		update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
+int __init acpi_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
-	if (!nodes_cover_memory(nodes)) {
-		bad_srat();
-		return -1;
-	}
-
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	for_each_node_mask(i, nodes_parsed)
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+						nodes[i].end >> PAGE_SHIFT);
+	if (!nodes_cover_memory(nodes)) {
+		bad_srat();
+		return -1;
+	}
+
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 202dd0c..2be2fb6 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -283,22 +283,24 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 
 int __init acpi_numa_init(void)
 {
+	int ret = 0;
+
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
 				      acpi_parse_x2apic_affinity, NR_CPUS);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
 				      acpi_parse_processor_affinity, NR_CPUS);
-		acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
-				      acpi_parse_memory_affinity,
-				      NR_NODE_MEMBLKS);
+		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+					    acpi_parse_memory_affinity,
+					    NR_NODE_MEMBLKS);
 	}
 
 	/* SLIT: System Locality Information Table */
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
 	acpi_numa_arch_fixup();
-	return 0;
+	return ret;
 }
 
 int acpi_get_pxm(acpi_handle h)


* [tip:x86/mm] x86: Interleave emulated nodes over physical nodes
  2009-09-25 22:20 ` [patch 4/4] x86: interleave emulated nodes over physical nodes David Rientjes
  2009-10-01  8:56   ` Ingo Molnar
@ 2009-10-12 21:32   ` tip-bot for David Rientjes
  1 sibling, 0 replies; 10+ messages in thread
From: tip-bot for David Rientjes @ 2009-10-12 21:32 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, yinghai, andreas.herrmann3, torvalds,
	ankita, balbir, tglx, rientjes, mingo, len.brown

Commit-ID:  adc1938994f7f1112d335d998b5218b0aa680ad6
Gitweb:     http://git.kernel.org/tip/adc1938994f7f1112d335d998b5218b0aa680ad6
Author:     David Rientjes <rientjes@google.com>
AuthorDate: Fri, 25 Sep 2009 15:20:09 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 12 Oct 2009 22:56:46 +0200

x86: Interleave emulated nodes over physical nodes

Add interleaved NUMA emulation support

This patch interleaves emulated nodes over the system's physical
nodes. This is required for interleave optimizations since
mempolicies, for example, operate by iterating over a nodemask and
act without knowledge of node distances.  It can also be used for
testing memory latencies and NUMA bugs in the kernel.
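
For example, booting a two-socket machine with numa=fake=8 would now
place roughly four emulated nodes on each socket, memory permitting,
instead of carving the eight nodes out purely by ascending address.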

There are a couple of ways to do this:

 - divide the number of emulated nodes by the number of physical
   nodes and allocate the result on each physical node, or

 - allocate each successive emulated node on a different physical
   node until all memory is exhausted.

The disadvantage of the first option is that, depending on the
asymmetry in capacity among the physical nodes, the emulated nodes
may differ substantially in size from one physical node to another.

The disadvantage of the second option is that, again depending on the
asymmetry in capacity among the physical nodes, more emulated nodes
may be allocated on one physical node than on another.

This patch implements the second option: we accept that one physical
node may end up with slightly more emulated nodes than another, in
exchange for avoiding asymmetry in the emulated node sizes.

 [ Note that "node capacity" of a physical node is not only a
   function of its addressable range, but also is affected by
   subtracting out the amount of reserved memory over that range.
   NUMA emulation only deals with available, non-reserved memory
   quantities. ]

We ensure there is at least a minimal amount of available memory
allocated to each node.  We also make sure that at least this
amount of available memory is available in ZONE_DMA32 for any node
that includes both ZONE_DMA32 and ZONE_NORMAL.
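
 [ A minimal sketch of the round-robin placement described above,
   assuming a hypothetical two-node machine with 4GB per physical
   node; it omits the e820 hole accounting, FAKE_NODE_MIN_SIZE
   rounding and ZONE_DMA32 handling of the real implementation
   below, and all names and sizes are illustrative only:

	#include <stdio.h>

	struct range { unsigned long long start, end; };

	int main(void)
	{
		/* hypothetical two-node machine, 4GB per physical node */
		struct range phys[] = {
			{ 0x000000000ULL, 0x100000000ULL },
			{ 0x100000000ULL, 0x200000000ULL },
		};
		int nr_phys = 2, nr_fake = 8, nid = 0, i;
		unsigned long long total = 0, size;

		for (i = 0; i < nr_phys; i++)
			total += phys[i].end - phys[i].start;
		size = total / nr_fake;		/* per fake node */

		/* successive fake nodes land on successive physical nodes */
		while (nid < nr_fake) {
			int progress = 0;

			for (i = 0; i < nr_phys && nid < nr_fake; i++) {
				unsigned long long end = phys[i].start + size;

				if (phys[i].start >= phys[i].end)
					continue;	/* node exhausted */
				if (end > phys[i].end)
					end = phys[i].end;
				printf("fake node %d: %#llx-%#llx on physical node %d\n",
				       nid++, phys[i].start, end, i);
				phys[i].start = end;
				progress = 1;
			}
			if (!progress)
				break;		/* all memory consumed */
		}
		return 0;
	}

   Running the sketch prints eight 1GB fake nodes alternating between
   the two physical nodes, which is the interleaving this patch
   produces on real topologies. ]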

This patch also cleans up the emulation code by no longer passing the
statically allocated struct bootnode array among the various
functions. Since this __initdata array may be very large it is not
allocated on the stack, so it is now accessed at file scope instead.

The WARN_ON() for nodes_cover_memory() when faking proximity
domains is removed since it relies on successive nodes always
having greater start addresses than previous nodes; with
interleaving this is no longer always true.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa_64.c |  211 ++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/mm/srat_64.c |    1 -
 2 files changed, 184 insertions(+), 28 deletions(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94..086f98a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+					int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+						int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start.  The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 					break;
 				}
 			}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric.  Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+			unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+							num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 			if (size)
 				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+					if (setup_node_range(num_nodes, &addr,
+						size, max_addr) < 0)
 						goto done;
 			if (!*cmdline)
 				break;
@@ -473,7 +634,7 @@ done:
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 							 num_nodes, num);
 			goto out;
 		}
@@ -482,7 +643,7 @@ done:
 			/* Split remaining nodes into coeff chunks */
 			if (coeff <= 0)
 				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+			num_nodes += split_nodes_equally(&addr, max_addr,
 							 num_nodes, coeff);
 			break;
 		case ',':
@@ -490,8 +651,8 @@ done:
 			break;
 		default:
 			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
+			setup_node_range(num_nodes, &addr, max_addr - addr,
+					 max_addr);
 			num_nodes++;
 		}
 	}
@@ -505,14 +666,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 891cbe6..34aa438 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
 			node_set(i, nodes_parsed);
-	WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)


end of thread, other threads:[~2009-10-12 21:34 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-25 22:19 [patch 1/4] x86: cleanup and add missing log levels for k8 David Rientjes
2009-09-25 22:20 ` [patch 2/4] x86: export k8 physical topology David Rientjes
2009-10-12 21:31   ` [tip:x86/mm] x86: Export " tip-bot for David Rientjes
2009-09-25 22:20 ` [patch 3/4] x86: export srat " David Rientjes
2009-10-12 21:32   ` [tip:x86/mm] x86: Export " tip-bot for David Rientjes
2009-09-25 22:20 ` [patch 4/4] x86: interleave emulated nodes over physical nodes David Rientjes
2009-10-01  8:56   ` Ingo Molnar
2009-10-09  9:34     ` David Rientjes
2009-10-12 21:32   ` [tip:x86/mm] x86: Interleave " tip-bot for David Rientjes
2009-10-12 21:31 ` [tip:x86/mm] x86: Clean up and add missing log levels for k8 tip-bot for David Rientjes
