All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] x86/apic: Rename MAX_LOCAL_APIC to MAX_LOCAL_APICID
@ 2015-10-02 19:12 Denys Vlasenko
  2015-10-02 19:12 ` [PATCH 2/3] x86/apic: Make apic_version[] smaller Denys Vlasenko
  2015-10-02 19:12 ` [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping Denys Vlasenko
  0 siblings, 2 replies; 14+ messages in thread
From: Denys Vlasenko @ 2015-10-02 19:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Denys Vlasenko, Daniel J Blueman, Jiang Liu, Thomas Gleixner,
	Len Brown, x86, linux-kernel

On 09/26/2015 08:08 AM, Ingo Molnar wrote:
>> # define MAX_IO_APICS 128
>> # define MAX_LOCAL_APIC 32768
>> #endif
>>
>> (It seems to be a bit of a misnomer, it's not a maximum
>> number of APICs we support, it's the highest APIC _id_
>> we support.)
>
> Gah, that's a pretty serious misnomer - so it should be fixed regardless of
> the outcome of the sizing discussion.

So renamed. No code changes.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: Daniel J Blueman <daniel@numascale.com>
CC: Jiang Liu <jiang.liu@linux.intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Len Brown <len.brown@intel.com>
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/apicdef.h |  4 ++--
 arch/x86/include/asm/mpspec.h  | 20 ++++++++++----------
 arch/x86/include/asm/numa.h    |  2 +-
 arch/x86/kernel/acpi/boot.c    |  8 ++++----
 arch/x86/kernel/apic/apic.c    |  2 +-
 arch/x86/kernel/cpu/amd.c      |  2 +-
 arch/x86/mm/numa.c             |  6 +++---
 arch/x86/mm/srat.c             |  4 ++--
 arch/x86/platform/sfi/sfi.c    |  4 ++--
 9 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index c46bb99..ccb3a04 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -149,10 +149,10 @@
 
 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
-# define MAX_LOCAL_APIC 256
+# define MAX_LOCAL_APICID 256
 #else
 # define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
+# define MAX_LOCAL_APICID 32768
 #endif
 
 /*
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..e84e542 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -87,7 +87,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
 
 int generic_processor_info(int apicid, int version);
 
-#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)
+#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APICID)
 
 struct physid_mask {
 	unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -102,31 +102,31 @@ typedef struct physid_mask physid_mask_t;
 	test_and_set_bit(physid, (map).mask)
 
 #define physids_and(dst, src1, src2)					\
-	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
+	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APICID)
 
 #define physids_or(dst, src1, src2)					\
-	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
+	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APICID)
 
 #define physids_clear(map)					\
-	bitmap_zero((map).mask, MAX_LOCAL_APIC)
+	bitmap_zero((map).mask, MAX_LOCAL_APICID)
 
 #define physids_complement(dst, src)				\
-	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
+	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APICID)
 
 #define physids_empty(map)					\
-	bitmap_empty((map).mask, MAX_LOCAL_APIC)
+	bitmap_empty((map).mask, MAX_LOCAL_APICID)
 
 #define physids_equal(map1, map2)				\
-	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
+	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APICID)
 
 #define physids_weight(map)					\
-	bitmap_weight((map).mask, MAX_LOCAL_APIC)
+	bitmap_weight((map).mask, MAX_LOCAL_APICID)
 
 #define physids_shift_right(d, s, n)				\
-	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
+	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APICID)
 
 #define physids_shift_left(d, s, n)				\
-	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
+	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APICID)
 
 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 01b493e..c2ecfd0 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -27,7 +27,7 @@ extern int numa_off;
  * should be accessed by the accessors - set_apicid_to_node() and
  * numa_cpu_node().
  */
-extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern s16 __apicid_to_node[MAX_LOCAL_APICID];
 extern nodemask_t numa_nodes_parsed __initdata;
 
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ded848c..2a0397b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -169,7 +169,7 @@ static int acpi_register_lapic(int id, u8 enabled)
 {
 	unsigned int ver = 0;
 
-	if (id >= MAX_LOCAL_APIC) {
+	if (id >= MAX_LOCAL_APICID) {
 		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
 		return -EINVAL;
 	}
@@ -996,13 +996,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
 	register_lapic_address(acpi_lapic_addr);
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-				      acpi_parse_sapic, MAX_LOCAL_APIC);
+				      acpi_parse_sapic, MAX_LOCAL_APICID);
 
 	if (!count) {
 		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-					acpi_parse_x2apic, MAX_LOCAL_APIC);
+					acpi_parse_x2apic, MAX_LOCAL_APICID);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-					acpi_parse_lapic, MAX_LOCAL_APIC);
+					acpi_parse_lapic, MAX_LOCAL_APICID);
 	}
 	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24e94ce..b08b447 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1798,7 +1798,7 @@ void __init register_lapic_address(unsigned long address)
 	}
 }
 
-int apic_version[MAX_LOCAL_APIC];
+int apic_version[MAX_LOCAL_APICID];
 
 /*
  * Local APIC interrupts
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a70fc6..57bc7dd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -281,7 +281,7 @@ static int nearby_node(int apicid)
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+	for (i = apicid + 1; i < MAX_LOCAL_APICID; i++) {
 		node = __apicid_to_node[i];
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c3b3f65..c80d43b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,8 +56,8 @@ early_param("numa", numa_setup);
 /*
  * apicid, cpu, node mappings
  */
-s16 __apicid_to_node[MAX_LOCAL_APIC] = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+s16 __apicid_to_node[MAX_LOCAL_APICID] = {
+	[0 ... MAX_LOCAL_APICID-1] = NUMA_NO_NODE
 };
 
 int numa_cpu_node(int cpu)
@@ -607,7 +607,7 @@ static int __init numa_init(int (*init_func)(void))
 	int i;
 	int ret;
 
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
+	for (i = 0; i < MAX_LOCAL_APICID; i++)
 		set_apicid_to_node(i, NUMA_NO_NODE);
 
 	nodes_clear(numa_nodes_parsed);
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index c2aea63..02896fc 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -98,7 +98,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 		return;
 	}
 
-	if (apic_id >= MAX_LOCAL_APIC) {
+	if (apic_id >= MAX_LOCAL_APICID) {
 		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
 		return;
 	}
@@ -139,7 +139,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	else
 		apic_id = pa->apic_id;
 
-	if (apic_id >= MAX_LOCAL_APIC) {
+	if (apic_id >= MAX_LOCAL_APICID) {
 		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
 		return;
 	}
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index 6c7111b..4c541ac 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -38,9 +38,9 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 /* All CPUs enumerated by SFI must be present and enabled */
 static void __init mp_sfi_register_lapic(u8 id)
 {
-	if (MAX_LOCAL_APIC - id <= 0) {
+	if (MAX_LOCAL_APICID - id <= 0) {
 		pr_warning("Processor #%d invalid (max %d)\n",
-			id, MAX_LOCAL_APIC);
+			id, MAX_LOCAL_APICID);
 		return;
 	}
 
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] x86/apic: Make apic_version[] smaller
  2015-10-02 19:12 [PATCH 1/3] x86/apic: Rename MAX_LOCAL_APIC to MAX_LOCAL_APICID Denys Vlasenko
@ 2015-10-02 19:12 ` Denys Vlasenko
  2015-10-02 19:12 ` [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping Denys Vlasenko
  1 sibling, 0 replies; 14+ messages in thread
From: Denys Vlasenko @ 2015-10-02 19:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Denys Vlasenko, Daniel J Blueman, Jiang Liu, Thomas Gleixner,
	Len Brown, x86, linux-kernel

apic_version[] array is changed from int to u8 -
APIC version values as of year 2015 are no larger than 0x1f
on all known CPUs.

A bit of code added to ensure that the statement
    apic_version[apicid] = version;
in generic_processor_info() is safe wrt bad values in both
'apicid' and 'version' variables.

This reduces apic_version[] from 128 kbytes to 32.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: Daniel J Blueman <daniel@numascale.com>
CC: Jiang Liu <jiang.liu@linux.intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Len Brown <len.brown@intel.com>
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/mpspec.h |  2 +-
 arch/x86/kernel/apic/apic.c   | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e84e542..83140ab 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,7 @@
 #include <asm/x86_init.h>
 #include <asm/apicdef.h>
 
-extern int apic_version[];
+extern u8 apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b08b447..d0f135c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1798,7 +1798,7 @@ void __init register_lapic_address(unsigned long address)
 	}
 }
 
-int apic_version[MAX_LOCAL_APICID];
+u8 apic_version[MAX_LOCAL_APICID];
 
 /*
  * Local APIC interrupts
@@ -2054,6 +2054,23 @@ int generic_processor_info(int apicid, int version)
 		return -EINVAL;
 	}
 
+	if ((unsigned)apicid >= ARRAY_SIZE(apic_version)) {
+		int thiscpu = max + disabled_cpus;
+		pr_warning("APIC: APIC id 0x%x is too large."
+			   " Processor %d ignored.\n",
+			   apicid, thiscpu);
+		disabled_cpus++;
+		return -EINVAL;
+	}
+	if ((unsigned)version > 255) {
+		int thiscpu = max + disabled_cpus;
+		pr_warning("APIC: APIC version 0x%x is too large."
+			   " Processor %d ignored.\n",
+			   version, thiscpu);
+		disabled_cpus++;
+		return -EINVAL;
+	}
+
 	num_processors++;
 	if (apicid == boot_cpu_physical_apicid) {
 		/*
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-02 19:12 [PATCH 1/3] x86/apic: Rename MAX_LOCAL_APIC to MAX_LOCAL_APICID Denys Vlasenko
  2015-10-02 19:12 ` [PATCH 2/3] x86/apic: Make apic_version[] smaller Denys Vlasenko
@ 2015-10-02 19:12 ` Denys Vlasenko
  2015-10-03  7:44   ` Ingo Molnar
  2015-10-09 15:35   ` [PATCH 3/3] " Jiang Liu
  1 sibling, 2 replies; 14+ messages in thread
From: Denys Vlasenko @ 2015-10-02 19:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Daniel J Blueman, Jiang Liu, Thomas Gleixner, Len Brown, x86,
	linux-kernel

From: Daniel J Blueman <daniel@numascale.com>

The Intel x2APIC spec states the upper 16-bits of APIC ID is the
cluster ID [1, p2-12], intended for future distributed systems. Beyond
the legacy 8-bit APIC ID, Numascale NumaConnect uses 4-bits for the
position of a server on each axis of a multi-dimension torus; SGI
NUMAlink also structures the APIC ID space.

Instead, define an array based on NR_CPUs to achieve a 1:1 mapping and
perform linear search; this addresses the binary bloat and the present
artificial APIC ID limits. With CONFIG_NR_CPUS=256:

$ size vmlinux vmlinux-patched
  text      data     bss      dec     hex filename
18232877 1849656 2281472 22364005 1553f65 vmlinux
18233034 1786168 2281472 22300674 1544802 vmlinux-patched

That is, ~64 kbytes less data.

Works peachy on a 256-core system with a 20-bit APIC ID space, and on a
48-core legacy 8-bit APIC ID system. If we care, I can make
numa_cpu_node O(1) lookup for typical cases.

Signed-off-by: Daniel J Blueman <daniel@numascale.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: Daniel J Blueman <daniel@numascale.com>
CC: Jiang Liu <jiang.liu@linux.intel.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Len Brown <len.brown@intel.com>
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org

[1]
http://www.intel.com/content/dam/doc/specification-update/64-architecture-x2apic-specification.pdf
---

I added forgotten change in arch/x86/mm/numa_emulation.c (Denys)

 arch/x86/include/asm/numa.h  | 13 +++++++------
 arch/x86/kernel/cpu/amd.c    |  8 ++++----
 arch/x86/mm/numa.c           | 31 +++++++++++++++++++++++--------
 arch/x86/mm/numa_emulation.c |  6 +++---
 4 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index c2ecfd0..33becb8 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -17,6 +17,11 @@
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
+struct apicid_to_node {
+	int apicid;
+	s16 node;
+};
+
 extern int numa_off;
 
 /*
@@ -27,17 +32,13 @@ extern int numa_off;
  * should be accessed by the accessors - set_apicid_to_node() and
  * numa_cpu_node().
  */
-extern s16 __apicid_to_node[MAX_LOCAL_APICID];
+extern struct apicid_to_node __apicid_to_node[NR_CPUS];
 extern nodemask_t numa_nodes_parsed __initdata;
 
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 extern void __init numa_set_distance(int from, int to, int distance);
 
-static inline void set_apicid_to_node(int apicid, s16 node)
-{
-	__apicid_to_node[apicid] = node;
-}
-
+extern void set_apicid_to_node(int apicid, s16 node);
 extern int numa_cpu_node(int cpu);
 
 #else	/* CONFIG_NUMA */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 57bc7dd..4699e7b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -277,12 +277,12 @@ static int nearby_node(int apicid)
 	int i, node;
 
 	for (i = apicid - 1; i >= 0; i--) {
-		node = __apicid_to_node[i];
+		node = __apicid_to_node[i].node;
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
 	for (i = apicid + 1; i < MAX_LOCAL_APICID; i++) {
-		node = __apicid_to_node[i];
+		node = __apicid_to_node[i].node;
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
@@ -422,8 +422,8 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		int ht_nodeid = c->initial_apicid;
 
 		if (ht_nodeid >= 0 &&
-		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = __apicid_to_node[ht_nodeid];
+		    __apicid_to_node[ht_nodeid].node != NUMA_NO_NODE)
+			node = __apicid_to_node[ht_nodeid].node;
 		/* Pick a nearby node */
 		if (!node_online(node))
 			node = nearby_node(apicid);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c80d43b..70f03a0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,16 +56,34 @@ early_param("numa", numa_setup);
 /*
  * apicid, cpu, node mappings
  */
-s16 __apicid_to_node[MAX_LOCAL_APICID] = {
-	[0 ... MAX_LOCAL_APICID-1] = NUMA_NO_NODE
+
+struct apicid_to_node __apicid_to_node[NR_CPUS] = {
+	[0 ... NR_CPUS-1] = {-1, NUMA_NO_NODE}
 };
 
+void set_apicid_to_node(int apicid, s16 node)
+{
+	static int ent;
+
+	/* Protect against small kernel on large system */
+	if (ent >= NR_CPUS)
+		return;
+
+	__apicid_to_node[ent].apicid = apicid;
+	__apicid_to_node[ent].node = node;
+	ent++;
+}
+
 int numa_cpu_node(int cpu)
 {
-	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	int ent, apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid == BAD_APICID)
+		return NUMA_NO_NODE;
+
+	for (ent = 0; ent < NR_CPUS; ent++)
+		if (__apicid_to_node[ent].apicid == apicid)
+			return __apicid_to_node[ent].node;
 
-	if (apicid != BAD_APICID)
-		return __apicid_to_node[apicid];
 	return NUMA_NO_NODE;
 }
 
@@ -607,9 +625,6 @@ static int __init numa_init(int (*init_func)(void))
 	int i;
 	int ret;
 
-	for (i = 0; i < MAX_LOCAL_APICID; i++)
-		set_apicid_to_node(i, NUMA_NO_NODE);
-
 	nodes_clear(numa_nodes_parsed);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce..1a0e112 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -399,12 +399,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	 * back to zero just in case.
 	 */
 	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
-		if (__apicid_to_node[i] == NUMA_NO_NODE)
+		if (__apicid_to_node[i].node == NUMA_NO_NODE)
 			continue;
 		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+			if (__apicid_to_node[i].node == emu_nid_to_phys[j])
 				break;
-		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+		__apicid_to_node[i].node = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
 	}
 
 	/* make sure all emulated nodes are mapped to a physical node */
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-02 19:12 ` [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping Denys Vlasenko
@ 2015-10-03  7:44   ` Ingo Molnar
  2015-10-03 20:26     ` Denys Vlasenko
  2015-10-05  4:32     ` [PATCH v2] " Daniel J Blueman
  2015-10-09 15:35   ` [PATCH 3/3] " Jiang Liu
  1 sibling, 2 replies; 14+ messages in thread
From: Ingo Molnar @ 2015-10-03  7:44 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Daniel J Blueman, Jiang Liu, Thomas Gleixner, Len Brown, x86,
	linux-kernel


* Denys Vlasenko <dvlasenk@redhat.com> wrote:

> @@ -56,16 +56,34 @@ early_param("numa", numa_setup);
>  /*
>   * apicid, cpu, node mappings
>   */
> -s16 __apicid_to_node[MAX_LOCAL_APICID] = {
> -	[0 ... MAX_LOCAL_APICID-1] = NUMA_NO_NODE
> +
> +struct apicid_to_node __apicid_to_node[NR_CPUS] = {
> +	[0 ... NR_CPUS-1] = {-1, NUMA_NO_NODE}
>  };
>  
> +void set_apicid_to_node(int apicid, s16 node)
> +{
> +	static int ent;

having such statics inside functions is really obscure and makes review harder.
I had to look twice to see it. Please move it outside and also name it 
appropriately.

> +	/* Protect against small kernel on large system */
> +	if (ent >= NR_CPUS)
> +		return;
> +
> +	__apicid_to_node[ent].apicid = apicid;
> +	__apicid_to_node[ent].node = node;
> +	ent++;
> +}

So what happens if we run a small kernel and run out of entries? We just silently 
seem to return, no warning, no nothing - the system will likely fail to boot in 
myserious ways, right?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-03  7:44   ` Ingo Molnar
@ 2015-10-03 20:26     ` Denys Vlasenko
  2015-10-05  4:32     ` [PATCH v2] " Daniel J Blueman
  1 sibling, 0 replies; 14+ messages in thread
From: Denys Vlasenko @ 2015-10-03 20:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Daniel J Blueman, Jiang Liu, Thomas Gleixner, Len Brown, x86,
	linux-kernel

On 10/03/2015 09:44 AM, Ingo Molnar wrote:
> 
> * Denys Vlasenko <dvlasenk@redhat.com> wrote:
> 
>> @@ -56,16 +56,34 @@ early_param("numa", numa_setup);
>>  /*
>>   * apicid, cpu, node mappings
>>   */
>> -s16 __apicid_to_node[MAX_LOCAL_APICID] = {
>> -	[0 ... MAX_LOCAL_APICID-1] = NUMA_NO_NODE
>> +
>> +struct apicid_to_node __apicid_to_node[NR_CPUS] = {
>> +	[0 ... NR_CPUS-1] = {-1, NUMA_NO_NODE}
>>  };
>>  
>> +void set_apicid_to_node(int apicid, s16 node)
>> +{
>> +	static int ent;
> 
> having such statics inside functions is really obscure and makes review harder.
> I had to look twice to see it. Please move it outside and also name it 
> appropriately.

Just to confirm: you want it to be a static data, but not inside a function?


>> +	/* Protect against small kernel on large system */
>> +	if (ent >= NR_CPUS)
>> +		return;
>> +
>> +	__apicid_to_node[ent].apicid = apicid;
>> +	__apicid_to_node[ent].node = node;
>> +	ent++;
>> +}
> 
> So what happens if we run a small kernel and run out of entries? We just silently 
> seem to return, no warning, no nothing - the system will likely fail to boot in 
> myserious ways, right?

Good question.

I tested this.

I built a NR_CPUS=8 kernel and booted it on 144 cpu and 240 cpu machines.
Both booted fine:

[    0.000000] ACPI: Local APIC address 0xfee00000
[    0.000000] ACPI: NR_CPUS/possible_cpus limit of 8 reached.  Processor 8/0x16 ignored.
...
[    0.000000] ACPI: NR_CPUS/possible_cpus limit of 8 reached.  Processor 143/0xf7 ignored.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-03  7:44   ` Ingo Molnar
  2015-10-03 20:26     ` Denys Vlasenko
@ 2015-10-05  4:32     ` Daniel J Blueman
  2015-10-09 14:15       ` Thomas Gleixner
  1 sibling, 1 reply; 14+ messages in thread
From: Daniel J Blueman @ 2015-10-05  4:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Daniel J Blueman, Denys Vlasenko, Thomas Gleixner, Jiang Liu,
	Len Brown, Steffen Persvold, linux-kernel, x86

The Intel x2APIC spec states the upper 16-bits of APIC ID is the
cluster ID [1, p2-12], intended for future distributed systems. Beyond
the legacy 8-bit APIC ID, Numascale NumaConnect uses 4-bits for the
position of a server on each axis of a multi-dimension torus; SGI
NUMAlink also structures the APIC ID space.

Instead, define an array based on NR_CPUs to achieve a 1:1 mapping and
perform linear search; we see "ACPI: NR_CPUS/possible_cpus limit of X
reached.  Processor 8/0x16 ignored." when config-limited. This addresses
the binary bloat and the present artificial APIC ID limits. With
CONFIG_NR_CPUS=256, we save ~64KB of vmlinux data:

$ size vmlinux vmlinux-patched
  text      data     bss      dec     hex filename
18232877 1849656 2281472 22364005 1553f65 vmlinux
18233034 1786168 2281472 22300674 1544802 vmlinux-patched

Tested on a 256-core system with a 20-bit APIC ID space, and on a
48-core legacy 8-bit APIC ID system with and without CONFIG_NUMA,
CONFIG_NUMA_EMU and CONFIG_AMD_NUMA.

v2: Improved readability by moving static variable out; integrated Denys's
numa emulation fix

Signed-off-by: Daniel J Blueman <daniel@numascale.com>
CC: Denys Vlasenko <dvlasenk@redhat.com>
CC: Ingo Molnar <mingo@kernel.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Jiang Liu <jiang.liu@linux.intel.com>
CC: Len Brown <len.brown@intel.com>
CC: Steffen Persvold <sp@numascale.com>
CC: linux-kernel@vger.kernel.org
CC: x86@kernel.org

[1] http://www.intel.com/content/dam/doc/specification-update/64-architecture-x2apic-specification.pdf
---
 arch/x86/include/asm/numa.h  | 13 +++++++------
 arch/x86/kernel/cpu/amd.c    | 11 ++++++-----
 arch/x86/mm/numa.c           | 29 +++++++++++++++++++++--------
 arch/x86/mm/numa_emulation.c |  6 +++---
 4 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 01b493e..33becb8 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -17,6 +17,11 @@
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
+struct apicid_to_node {
+	int apicid;
+	s16 node;
+};
+
 extern int numa_off;
 
 /*
@@ -27,17 +32,13 @@ extern int numa_off;
  * should be accessed by the accessors - set_apicid_to_node() and
  * numa_cpu_node().
  */
-extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern struct apicid_to_node __apicid_to_node[NR_CPUS];
 extern nodemask_t numa_nodes_parsed __initdata;
 
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 extern void __init numa_set_distance(int from, int to, int distance);
 
-static inline void set_apicid_to_node(int apicid, s16 node)
-{
-	__apicid_to_node[apicid] = node;
-}
-
+extern void set_apicid_to_node(int apicid, s16 node);
 extern int numa_cpu_node(int cpu);
 
 #else	/* CONFIG_NUMA */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a70fc6..9494f0e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -277,12 +277,13 @@ static int nearby_node(int apicid)
 	int i, node;
 
 	for (i = apicid - 1; i >= 0; i--) {
-		node = __apicid_to_node[i];
+		node = __apicid_to_node[i].node;
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = __apicid_to_node[i];
+	for (i = apicid + 1; i < NR_CPUS; i++) {
+		node = __apicid_to_node[i].node;
+
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
@@ -422,8 +423,8 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		int ht_nodeid = c->initial_apicid;
 
 		if (ht_nodeid >= 0 &&
-		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = __apicid_to_node[ht_nodeid];
+		    __apicid_to_node[ht_nodeid].node != NUMA_NO_NODE)
+			node = __apicid_to_node[ht_nodeid].node;
 		/* Pick a nearby node */
 		if (!node_online(node))
 			node = nearby_node(apicid);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c3b3f65..849a113 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,6 +26,7 @@ nodemask_t numa_nodes_parsed __initdata;
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
+static unsigned apicids;
 static struct numa_meminfo numa_meminfo
 #ifndef CONFIG_MEMORY_HOTPLUG
 __initdata
@@ -56,16 +57,31 @@ early_param("numa", numa_setup);
 /*
  * apicid, cpu, node mappings
  */
-s16 __apicid_to_node[MAX_LOCAL_APIC] = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+struct apicid_to_node __apicid_to_node[NR_CPUS] = {
+	[0 ... NR_CPUS-1] = {-1, NUMA_NO_NODE}
 };
 
+void set_apicid_to_node(int apicid, s16 node)
+{
+	/* Protect against small kernel on large system */
+	if (apicids >= NR_CPUS)
+		return;
+
+	__apicid_to_node[apicids].apicid = apicid;
+	__apicid_to_node[apicids].node = node;
+	apicids++;
+}
+
 int numa_cpu_node(int cpu)
 {
-	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	int ent, apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid == BAD_APICID)
+		return NUMA_NO_NODE;
+
+	for (ent = 0; ent < NR_CPUS; ent++)
+		if (__apicid_to_node[ent].apicid == apicid)
+			return __apicid_to_node[ent].node;
 
-	if (apicid != BAD_APICID)
-		return __apicid_to_node[apicid];
 	return NUMA_NO_NODE;
 }
 
@@ -607,9 +623,6 @@ static int __init numa_init(int (*init_func)(void))
 	int i;
 	int ret;
 
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		set_apicid_to_node(i, NUMA_NO_NODE);
-
 	nodes_clear(numa_nodes_parsed);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce..1a0e112 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -399,12 +399,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	 * back to zero just in case.
 	 */
 	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
-		if (__apicid_to_node[i] == NUMA_NO_NODE)
+		if (__apicid_to_node[i].node == NUMA_NO_NODE)
 			continue;
 		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+			if (__apicid_to_node[i].node == emu_nid_to_phys[j])
 				break;
-		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+		__apicid_to_node[i].node = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
 	}
 
 	/* make sure all emulated nodes are mapped to a physical node */
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-05  4:32     ` [PATCH v2] " Daniel J Blueman
@ 2015-10-09 14:15       ` Thomas Gleixner
  2015-10-09 15:16         ` Jiang Liu
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Gleixner @ 2015-10-09 14:15 UTC (permalink / raw)
  To: Daniel J Blueman
  Cc: Ingo Molnar, Denys Vlasenko, Jiang Liu, Len Brown,
	Steffen Persvold, linux-kernel, x86

On Mon, 5 Oct 2015, Daniel J Blueman wrote:
> +struct apicid_to_node {
> +	int apicid;
> +	s16 node;
> +};

Instead of having this array, why don't you use a radix tree and be
done with it?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-09 14:15       ` Thomas Gleixner
@ 2015-10-09 15:16         ` Jiang Liu
  2015-10-09 20:40           ` Thomas Gleixner
  0 siblings, 1 reply; 14+ messages in thread
From: Jiang Liu @ 2015-10-09 15:16 UTC (permalink / raw)
  To: Thomas Gleixner, Daniel J Blueman
  Cc: Ingo Molnar, Denys Vlasenko, Len Brown, Steffen Persvold,
	linux-kernel, x86

On 2015/10/9 22:15, Thomas Gleixner wrote:
> On Mon, 5 Oct 2015, Daniel J Blueman wrote:
>> +struct apicid_to_node {
>> +	int apicid;
>> +	s16 node;
>> +};
> 
> Instead of having this array, why don't you use a radix tree and be
> done with it?
Hi Thomas,
	It's in early booting stage and memory allocator isn't ready
yet.
Thanks,
Gerry

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-02 19:12 ` [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping Denys Vlasenko
  2015-10-03  7:44   ` Ingo Molnar
@ 2015-10-09 15:35   ` Jiang Liu
  2015-10-12 10:21     ` Daniel J Blueman
  1 sibling, 1 reply; 14+ messages in thread
From: Jiang Liu @ 2015-10-09 15:35 UTC (permalink / raw)
  To: Denys Vlasenko, Ingo Molnar
  Cc: Daniel J Blueman, Thomas Gleixner, Len Brown, x86, linux-kernel

On 2015/10/3 3:12, Denys Vlasenko wrote:
> From: Daniel J Blueman <daniel@numascale.com>
> 
> The Intel x2APIC spec states the upper 16-bits of APIC ID is the
> cluster ID [1, p2-12], intended for future distributed systems. Beyond
> the legacy 8-bit APIC ID, Numascale NumaConnect uses 4-bits for the
> position of a server on each axis of a multi-dimension torus; SGI
> NUMAlink also structures the APIC ID space.
> 
> Instead, define an array based on NR_CPUs to achieve a 1:1 mapping and
> perform linear search; this addresses the binary bloat and the present
> artificial APIC ID limits. With CONFIG_NR_CPUS=256:
> 
> $ size vmlinux vmlinux-patched
>   text      data     bss      dec     hex filename
> 18232877 1849656 2281472 22364005 1553f65 vmlinux
> 18233034 1786168 2281472 22300674 1544802 vmlinux-patched
> 
> That is, ~64 kbytes less data.
> 
> Works peachy on a 256-core system with a 20-bit APIC ID space, and on a
> 48-core legacy 8-bit APIC ID system. If we care, I can make
> numa_cpu_node O(1) lookup for typical cases.
> 
> Signed-off-by: Daniel J Blueman <daniel@numascale.com>
> CC: Ingo Molnar <mingo@kernel.org>
> CC: Daniel J Blueman <daniel@numascale.com>
> CC: Jiang Liu <jiang.liu@linux.intel.com>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: Len Brown <len.brown@intel.com>
> CC: x86@kernel.org
> CC: linux-kernel@vger.kernel.org
> 
> [1]
> http://www.intel.com/content/dam/doc/specification-update/64-architecture-x2apic-specification.pdf
> ---
> 
> I added forgotten change in arch/x86/mm/numa_emulation.c (Denys)
> 
>  arch/x86/include/asm/numa.h  | 13 +++++++------
>  arch/x86/kernel/cpu/amd.c    |  8 ++++----
>  arch/x86/mm/numa.c           | 31 +++++++++++++++++++++++--------
>  arch/x86/mm/numa_emulation.c |  6 +++---
>  4 files changed, 37 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
> index c2ecfd0..33becb8 100644
> --- a/arch/x86/include/asm/numa.h
> +++ b/arch/x86/include/asm/numa.h
> @@ -17,6 +17,11 @@
>   */
>  #define NODE_MIN_SIZE (4*1024*1024)
>  
> +struct apicid_to_node {
> +	int apicid;
> +	s16 node;
> +};
> +
>  extern int numa_off;
>  
>  /*
> @@ -27,17 +32,13 @@ extern int numa_off;
>   * should be accessed by the accessors - set_apicid_to_node() and
>   * numa_cpu_node().
>   */
> -extern s16 __apicid_to_node[MAX_LOCAL_APICID];
> +extern struct apicid_to_node __apicid_to_node[NR_CPUS];
Hi Denys and Daniel,
	I still have some concerns about limiting the array to NR_CPUS.
__apicid_to_node are populated according to the order that CPUs are
listed in ACPI SRAT table. And CPU IDs are allocated according to the
order that CPUs are listed in ACPI MADT(APIC) order. So it may cause
trouble if:
1) system has more than NR_CPUS CPUs
2) CPUs are listed in different order in SRAT and MADT tables.

<snit>
> @@ -607,9 +625,6 @@ static int __init numa_init(int (*init_func)(void))
>  	int i;
>  	int ret;
>  
> -	for (i = 0; i < MAX_LOCAL_APICID; i++)
> -		set_apicid_to_node(i, NUMA_NO_NODE);
> -
	Why remove above code? numa_init() may be called multiple times
so it needs to reset __apicid_to_node array on the second and following
calls. So we need another way to reset __apicid_to_node array instead
of simply deleting above code.
Thanks,
Gerry

>  	nodes_clear(numa_nodes_parsed);
>  	nodes_clear(node_possible_map);
>  	nodes_clear(node_online_map);
> diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
> index a8f90ce..1a0e112 100644
> --- a/arch/x86/mm/numa_emulation.c
> +++ b/arch/x86/mm/numa_emulation.c
> @@ -399,12 +399,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
>  	 * back to zero just in case.
>  	 */
>  	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
> -		if (__apicid_to_node[i] == NUMA_NO_NODE)
> +		if (__apicid_to_node[i].node == NUMA_NO_NODE)
>  			continue;
>  		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
> -			if (__apicid_to_node[i] == emu_nid_to_phys[j])
> +			if (__apicid_to_node[i].node == emu_nid_to_phys[j])
>  				break;
> -		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
> +		__apicid_to_node[i].node = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
>  	}
>  
>  	/* make sure all emulated nodes are mapped to a physical node */
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-09 15:16         ` Jiang Liu
@ 2015-10-09 20:40           ` Thomas Gleixner
  0 siblings, 0 replies; 14+ messages in thread
From: Thomas Gleixner @ 2015-10-09 20:40 UTC (permalink / raw)
  To: Jiang Liu
  Cc: Daniel J Blueman, Ingo Molnar, Denys Vlasenko, Len Brown,
	Steffen Persvold, linux-kernel, x86

On Fri, 9 Oct 2015, Jiang Liu wrote:
> On 2015/10/9 22:15, Thomas Gleixner wrote:
> > On Mon, 5 Oct 2015, Daniel J Blueman wrote:
> >> +struct apicid_to_node {
> >> +	int apicid;
> >> +	s16 node;
> >> +};
> > 
> > Instead of having this array, why don't you use a radix tree and be
> > done with it?
> Hi Thomas,
> 	It's in early booting stage and memory allocator isn't ready
> yet.

Why do we need that information so early?

Thanks,

	tglx



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-09 15:35   ` [PATCH 3/3] " Jiang Liu
@ 2015-10-12 10:21     ` Daniel J Blueman
  2015-10-12 10:25       ` Thomas Gleixner
  0 siblings, 1 reply; 14+ messages in thread
From: Daniel J Blueman @ 2015-10-12 10:21 UTC (permalink / raw)
  To: Jiang Liu, Thomas Gleixner, Denys Vlasenko, Ingo Molnar
  Cc: Len Brown, x86, linux-kernel

On Fri, Oct 9, 2015 at 11:35 PM, Jiang Liu <jiang.liu@linux.intel.com> 
wrote:
> On 2015/10/3 3:12, Denys Vlasenko wrote:
>>  From: Daniel J Blueman <daniel@numascale.com>
>> 
>>  The Intel x2APIC spec states the upper 16-bits of APIC ID is the
>>  cluster ID [1, p2-12], intended for future distributed systems. 
>> Beyond
>>  the legacy 8-bit APIC ID, Numascale NumaConnect uses 4-bits for the
>>  position of a server on each axis of a multi-dimension torus; SGI
>>  NUMAlink also structures the APIC ID space.
>> 
>>  Instead, define an array based on NR_CPUs to achieve a 1:1 mapping 
>> and
>>  perform linear search; this addresses the binary bloat and the 
>> present
>>  artificial APIC ID limits. With CONFIG_NR_CPUS=256:
>> 
>>  $ size vmlinux vmlinux-patched
>>    text      data     bss      dec     hex filename
>>  18232877 1849656 2281472 22364005 1553f65 vmlinux
>>  18233034 1786168 2281472 22300674 1544802 vmlinux-patched
>> 
>>  That is, ~64 kbytes less data.
>> 
>>  Works peachy on a 256-core system with a 20-bit APIC ID space, and 
>> on a
>>  48-core legacy 8-bit APIC ID system. If we care, I can make
>>  numa_cpu_node O(1) lookup for typical cases.
>> 
>>  Signed-off-by: Daniel J Blueman <daniel@numascale.com>
>>  CC: Ingo Molnar <mingo@kernel.org>
>>  CC: Daniel J Blueman <daniel@numascale.com>
>>  CC: Jiang Liu <jiang.liu@linux.intel.com>
>>  CC: Thomas Gleixner <tglx@linutronix.de>
>>  CC: Len Brown <len.brown@intel.com>
>>  CC: x86@kernel.org
>>  CC: linux-kernel@vger.kernel.org
>> 
>>  [1]
>>  
>> http://www.intel.com/content/dam/doc/specification-update/64-architecture-x2apic-specification.pdf
>>  ---
>> 
>>  I added forgotten change in arch/x86/mm/numa_emulation.c (Denys)
>> 
>>   arch/x86/include/asm/numa.h  | 13 +++++++------
>>   arch/x86/kernel/cpu/amd.c    |  8 ++++----
>>   arch/x86/mm/numa.c           | 31 +++++++++++++++++++++++--------
>>   arch/x86/mm/numa_emulation.c |  6 +++---
>>   4 files changed, 37 insertions(+), 21 deletions(-)
>> 
>>  diff --git a/arch/x86/include/asm/numa.h 
>> b/arch/x86/include/asm/numa.h
>>  index c2ecfd0..33becb8 100644
>>  --- a/arch/x86/include/asm/numa.h
>>  +++ b/arch/x86/include/asm/numa.h
>>  @@ -17,6 +17,11 @@
>>    */
>>   #define NODE_MIN_SIZE (4*1024*1024)
>> 
>>  +struct apicid_to_node {
>>  +	int apicid;
>>  +	s16 node;
>>  +};
>>  +
>>   extern int numa_off;
>> 
>>   /*
>>  @@ -27,17 +32,13 @@ extern int numa_off;
>>    * should be accessed by the accessors - set_apicid_to_node() and
>>    * numa_cpu_node().
>>    */
>>  -extern s16 __apicid_to_node[MAX_LOCAL_APICID];
>>  +extern struct apicid_to_node __apicid_to_node[NR_CPUS];
> Hi Denys and Daniel,
> 	I still have some concerns about limiting the array to NR_CPUS.
> __apicid_to_node are populated according to the order that CPUs are
> listed in ACPI SRAT table. And CPU IDs are allocated according to the
> order that CPUs are listed in ACPI MADT(APIC) order. So it may cause
> trouble if:
> 1) system has more than NR_CPUS CPUs
> 2) CPUs are listed in different order in SRAT and MADT tables.

Another approach which may be suitable without changing SRAT parsing to 
be after the memory allocator is up, is to exploit the associativity of 
the bottom APIC ID bits.

We'd have a searchable static array based on NUMA_SHIFT and use the 
bit-shift encoded in the MSRs. That said, this may run into the issue 
Jiang cited albeit with CONFIG_NUMA_SHIFT. Perhaps the constraints or 
risk of restructuring SRAT parsing aren't worth the payoff?

Finally, the only alternative is as the current mapping is initialised 
in numa_init, we can drop the static initialisation and move the 64KB 
to the BSS to avoid bloating the binary image, but this may not achieve 
the initial goal of runtime footprint reduction.

Daniel


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-12 10:21     ` Daniel J Blueman
@ 2015-10-12 10:25       ` Thomas Gleixner
  2015-10-13  9:32         ` Jiang Liu
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Gleixner @ 2015-10-12 10:25 UTC (permalink / raw)
  To: Daniel J Blueman
  Cc: Jiang Liu, Denys Vlasenko, Ingo Molnar, Len Brown, x86, linux-kernel

On Mon, 12 Oct 2015, Daniel J Blueman wrote:
> Another approach which may be suitable without changing SRAT parsing to be
> after the memory allocator is up, is to exploit the associativity of the
> bottom APIC ID bits.

What's the problem with moving (SRAT/ACPI/whatever) APIC parsing after
the memory allocator is up and available?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-12 10:25       ` Thomas Gleixner
@ 2015-10-13  9:32         ` Jiang Liu
  2015-10-13 12:55           ` Thomas Gleixner
  0 siblings, 1 reply; 14+ messages in thread
From: Jiang Liu @ 2015-10-13  9:32 UTC (permalink / raw)
  To: Thomas Gleixner, Daniel J Blueman
  Cc: Denys Vlasenko, Ingo Molnar, Len Brown, x86, linux-kernel

On 2015/10/12 18:25, Thomas Gleixner wrote:
> On Mon, 12 Oct 2015, Daniel J Blueman wrote:
>> Another approach which may be suitable without changing SRAT parsing to be
>> after the memory allocator is up, is to exploit the associativity of the
>> bottom APIC ID bits.
> 
> What's the problem with moving (SRAT/ACPI/whatever) APIC parsing after
> the memory allocator is up and available?
Hi Thomas,
	The work flow is as below at boot:
1) figure out memory NUMA topology info by walking ACPI table or probing
   AMD northbirdge.
2) initialize memory allocation based on memory NUMA topology.

And to make code simple, it also scan CPU NUMA topology in step 1, so
we could avoid walking ACPI tables twice. On the other hand, there are
several subsystems having code pattern as follow before booting APs.
up:
for_each_possible_cpu(cpu)
	alloc_page_node(size, cpu_to_node(cpu))
So it's a little hard to find a suitable hook point to delay CPU NUMA
topology scanning after memory allocator is ready.
Thanks,
Gerry

> 
> Thanks,
> 
> 	tglx
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping
  2015-10-13  9:32         ` Jiang Liu
@ 2015-10-13 12:55           ` Thomas Gleixner
  0 siblings, 0 replies; 14+ messages in thread
From: Thomas Gleixner @ 2015-10-13 12:55 UTC (permalink / raw)
  To: Jiang Liu
  Cc: Daniel J Blueman, Denys Vlasenko, Ingo Molnar, Len Brown, x86,
	linux-kernel

On Tue, 13 Oct 2015, Jiang Liu wrote:
> On 2015/10/12 18:25, Thomas Gleixner wrote:
> > On Mon, 12 Oct 2015, Daniel J Blueman wrote:
> >> Another approach which may be suitable without changing SRAT parsing to be
> >> after the memory allocator is up, is to exploit the associativity of the
> >> bottom APIC ID bits.
> > 
> > What's the problem with moving (SRAT/ACPI/whatever) APIC parsing after
> > the memory allocator is up and available?
> Hi Thomas,
> 	The work flow is as below at boot:
> 1) figure out memory NUMA topology info by walking ACPI table or probing
>    AMD northbirdge.
> 2) initialize memory allocation based on memory NUMA topology.
> 
> And to make code simple, it also scan CPU NUMA topology in step 1, so
> we could avoid walking ACPI tables twice. On the other hand, there are
> several subsystems having code pattern as follow before booting APs.
> up:
> for_each_possible_cpu(cpu)
> 	alloc_page_node(size, cpu_to_node(cpu))
> So it's a little hard to find a suitable hook point to delay CPU NUMA
> topology scanning after memory allocator is ready.

Not really. The memory allocator is available very early and long
before smp_prepare_cpus(). So we should try hard to move it after that
point, even if that means that we need to walk the tables twice.

Alternatively, use a bootmem allocation and convert it to a radix tree
when the allocator is up.

Thanks,

	tglx



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-10-13 12:56 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-02 19:12 [PATCH 1/3] x86/apic: Rename MAX_LOCAL_APIC to MAX_LOCAL_APICID Denys Vlasenko
2015-10-02 19:12 ` [PATCH 2/3] x86/apic: Make apic_version[] smaller Denys Vlasenko
2015-10-02 19:12 ` [PATCH 3/3] x86/apic: Use smaller array for __apicid_to_node[] mapping Denys Vlasenko
2015-10-03  7:44   ` Ingo Molnar
2015-10-03 20:26     ` Denys Vlasenko
2015-10-05  4:32     ` [PATCH v2] " Daniel J Blueman
2015-10-09 14:15       ` Thomas Gleixner
2015-10-09 15:16         ` Jiang Liu
2015-10-09 20:40           ` Thomas Gleixner
2015-10-09 15:35   ` [PATCH 3/3] " Jiang Liu
2015-10-12 10:21     ` Daniel J Blueman
2015-10-12 10:25       ` Thomas Gleixner
2015-10-13  9:32         ` Jiang Liu
2015-10-13 12:55           ` Thomas Gleixner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.