linuxppc-dev.lists.ozlabs.org archive mirror
* [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
@ 2020-07-31 11:19 Aneesh Kumar K.V
  2020-07-31 11:22 ` [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id Aneesh Kumar K.V
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-07-31 11:19 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V, Srikar Dronamraju

We use ibm,associativity and ibm,associativity-lookup-arrays to derive the NUMA
node numbers. These device tree properties are firmware-indicated groupings of
resources based on their hierarchy in the platform. These numbers (group ids) are
not sequential, and the hypervisor/firmware can follow different numbering schemes.
For example, on PowerNV platforms we group them in the below order:

 *     - CCM node ID
 *     - HW card ID
 *     - HW module ID
 *     - Chip ID
 *     - Core ID

Based on ibm,associativity-reference-points we use one of the above group ids as
the Linux NUMA node id (on the PowerNV platform the Chip ID is used). This results
in Linux reporting non-linear NUMA node ids, and can also result in Linux
reporting an empty NUMA node 0.

This can be resolved by mapping the firmware-provided group id to a logical Linux
NUMA id. In this patch, we do this only for pseries platforms, considering that the
firmware group id is a virtualized entity and users would not have drawn any
conclusion based on the Linux NUMA node id.

On the PowerNV platform, since we have historically mapped the Chip ID to the
Linux NUMA node id, we keep the existing Linux NUMA node id numbering.

Before Fix:
 # numactl -H
available: 2 nodes (0-1)
node 0 cpus:
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 1 size: 50912 MB
node 1 free: 45248 MB
node distances:
node   0   1
  0:  10  40
  1:  40  10

After fix:
 # numactl  -H
available: 1 nodes (0)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 0 size: 50912 MB
node 0 free: 49724 MB
node distances:
node   0
  0:  10

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/topology.h |  1 +
 arch/powerpc/mm/numa.c              | 49 ++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..15b0424a27a8 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -118,5 +118,6 @@ int get_physical_package_id(int cpu);
 #endif
 #endif
 
+int firmware_group_id_to_nid(int firmware_gid);
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index e437a9ac4956..6c659aada55b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
 	}
 }
 
+static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
+
+int firmware_group_id_to_nid(int firmware_gid)
+{
+	static int last_nid = 0;
+
+	/*
+	 * For PowerNV we don't change the node id. This helps to avoid
+	 * confusion w.r.t the expected node ids. On pseries, node numbers
+	 * are virtualized. Hence do logical node id for pseries.
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		return firmware_gid;
+
+	if (firmware_gid ==  -1)
+		return NUMA_NO_NODE;
+
+	if (nid_map[firmware_gid] == NUMA_NO_NODE)
+		nid_map[firmware_gid] = last_nid++;
+
+	return nid_map[firmware_gid];
+}
+
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
 static int associativity_to_nid(const __be32 *associativity)
 {
 	int nid = NUMA_NO_NODE;
+	int firmware_gid = -1;
 
 	if (!numa_enabled)
 		goto out;
 
 	if (of_read_number(associativity, 1) >= min_common_depth)
-		nid = of_read_number(&associativity[min_common_depth], 1);
+		firmware_gid = of_read_number(&associativity[min_common_depth], 1);
 
 	/* POWER4 LPAR uses 0xffff as invalid node */
-	if (nid == 0xffff || nid >= MAX_NUMNODES)
-		nid = NUMA_NO_NODE;
+	if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
+		firmware_gid = -1;
+
+	nid = firmware_group_id_to_nid(firmware_gid);
 
 	if (nid > 0 &&
-		of_read_number(associativity, 1) >= distance_ref_points_depth) {
+	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
 		/*
 		 * Skip the length field and send start of associativity array
 		 */
@@ -432,24 +458,25 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
 static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 {
 	struct assoc_arrays aa = { .arrays = NULL };
-	int default_nid = NUMA_NO_NODE;
-	int nid = default_nid;
+	int nid = NUMA_NO_NODE, firmware_gid;
 	int rc, index;
 
 	if ((min_common_depth < 0) || !numa_enabled)
-		return default_nid;
+		return NUMA_NO_NODE;
 
 	rc = of_get_assoc_arrays(&aa);
 	if (rc)
-		return default_nid;
+		return NUMA_NO_NODE;
 
 	if (min_common_depth <= aa.array_sz &&
 	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
 		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
-		nid = of_read_number(&aa.arrays[index], 1);
+		firmware_gid = of_read_number(&aa.arrays[index], 1);
 
-		if (nid == 0xffff || nid >= MAX_NUMNODES)
-			nid = default_nid;
+		if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
+			firmware_gid = -1;
+
+		nid = firmware_group_id_to_nid(firmware_gid);
 
 		if (nid > 0) {
 			index = lmb->aa_index * aa.array_sz;
-- 
2.26.2



* [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id
  2020-07-31 11:19 [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Aneesh Kumar K.V
@ 2020-07-31 11:22 ` Aneesh Kumar K.V
  2020-08-04  7:47   ` Gautham R Shenoy
  2020-08-01  5:20 ` [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Srikar Dronamraju
  2020-08-07  4:24 ` Nathan Lynch
  2 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-07-31 11:22 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Nathan Lynch, Aneesh Kumar K.V, Srikar Dronamraju

On PowerNV platforms we always have a 1:1 mapping between the chip ID and the
firmware group id. Use the helper to convert the firmware group id to a
node id instead of directly using the chip ID as the Linux node id.

NOTE: This doesn't have any functional change. On PowerNV platforms
we continue to have a 1:1 mapping between the firmware group id and the
Linux node id.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 8646eb197cd9..ca82b6ac0989 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -27,6 +27,7 @@
 #include <asm/reg.h>
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
+#include <asm/topology.h>
 #include <linux/timer.h>
 
 #define POWERNV_MAX_PSTATES_ORDER  8
@@ -1069,8 +1070,14 @@ static int init_chip_info(void)
 	}
 
 	for (i = 0; i < nr_chips; i++) {
+		unsigned int nid;
+
 		chips[i].id = chip[i];
-		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
+		/*
+		 * On PowerNV platforms the firmware group id is same as the chip id.
+		 */
+		nid = firmware_group_id_to_nid(chip[i]);
+		cpumask_copy(&chips[i].mask, cpumask_of_node(nid));
 		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
 		for_each_cpu(cpu, &chips[i].mask)
 			per_cpu(chip_info, cpu) =  &chips[i];
-- 
2.26.2



* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-07-31 11:19 [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Aneesh Kumar K.V
  2020-07-31 11:22 ` [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id Aneesh Kumar K.V
@ 2020-08-01  5:20 ` Srikar Dronamraju
  2020-08-02 14:21   ` Aneesh Kumar K.V
  2020-08-07  4:24 ` Nathan Lynch
  2 siblings, 1 reply; 14+ messages in thread
From: Srikar Dronamraju @ 2020-08-01  5:20 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev

* Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-07-31 16:49:14]:

> We use ibm,associativity and ibm,associativity-lookup-arrays to derive the numa
> node numbers. These device tree properties are firmware indicated grouping of
> resources based on their hierarchy in the platform. These numbers (group id) are
> not sequential and hypervisor/firmware can follow different numbering schemes.
> For ex: on powernv platforms, we group them in the below order.
> 
>  *     - CCM node ID
>  *     - HW card ID
>  *     - HW module ID
>  *     - Chip ID
>  *     - Core ID
> 
> Based on ibm,associativity-reference-points we use one of the above group ids as
> Linux NUMA node id. (On PowerNV platform Chip ID is used). This results
> in Linux reporting non-linear NUMA node id and which also results in Linux
> reporting empty node 0 NUMA nodes.
> 

If it's just to eliminate node 0, then we have 2 other, probably better,
solutions:
1. Don't mark node 0 as special (currently still in mm-tree and, as a result, in
linux-next)
2. powerpc specific: explicitly clear node 0 during numa bringup.

> This can  be resolved by mapping the firmware provided group id to a logical Linux
> NUMA id. In this patch, we do this only for pseries platforms considering the

On PowerVM, as you would know, the nid is already a logical or flattened
chip-id and not the actual hardware chip-id.

> firmware group id is a virtualized entity and users would not have drawn any
> conclusion based on the Linux Numa Node id.
> 
> On PowerNV platform since we have historically mapped Chip ID as Linux NUMA node
> id, we keep the existing Linux NUMA node id numbering.
> 
> Before Fix:
>  # numactl -H
> available: 2 nodes (0-1)
> node 0 cpus:
> node 0 size: 0 MB
> node 0 free: 0 MB
> node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
> node 1 size: 50912 MB
> node 1 free: 45248 MB
> node distances:
> node   0   1
>   0:  10  40
>   1:  40  10
> 
> after fix
>  # numactl  -H
> available: 1 nodes (0)
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
> node 0 size: 50912 MB
> node 0 free: 49724 MB
> node distances:
> node   0
>   0:  10
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/topology.h |  1 +
>  arch/powerpc/mm/numa.c              | 49 ++++++++++++++++++++++-------
>  2 files changed, 39 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index f0b6300e7dd3..15b0424a27a8 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -118,5 +118,6 @@ int get_physical_package_id(int cpu);
>  #endif
>  #endif
> 
> +int firmware_group_id_to_nid(int firmware_gid);
>  #endif /* __KERNEL__ */
>  #endif	/* _ASM_POWERPC_TOPOLOGY_H */
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index e437a9ac4956..6c659aada55b 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>  	}
>  }
> 
> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
> +
> +int firmware_group_id_to_nid(int firmware_gid)
> +{
> +	static int last_nid = 0;
> +
> +	/*
> +	 * For PowerNV we don't change the node id. This helps to avoid
> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
> +	 * are virtualized. Hence do logical node id for pseries.
> +	 */
> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> +		return firmware_gid;
> +
> +	if (firmware_gid ==  -1)
> +		return NUMA_NO_NODE;
> +
> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
> +		nid_map[firmware_gid] = last_nid++;

How do we ensure 2 simultaneous firmware_group_id_to_nid() calls don't end up
at this place in parallel?
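
i.e. something like this interleaving (purely illustrative):

	/*
	 * CPU A: nid_map[gid] == NUMA_NO_NODE  -> true
	 * CPU B: nid_map[gid] == NUMA_NO_NODE  -> true
	 * CPU A: nid_map[gid] = last_nid++;       (gid maps to 0)
	 * CPU B: nid_map[gid] = last_nid++;       (gid remapped to 1)
	 *
	 * last_nid is bumped twice for one gid, and depending on timing the
	 * two callers can return different nids for the same associativity.
	 */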

> +
> +	return nid_map[firmware_gid];
> +}
> +
>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>   * info is found.
>   */
>  static int associativity_to_nid(const __be32 *associativity)
>  {
>  	int nid = NUMA_NO_NODE;
> +	int firmware_gid = -1;
> 
>  	if (!numa_enabled)
>  		goto out;
> 
>  	if (of_read_number(associativity, 1) >= min_common_depth)
> -		nid = of_read_number(&associativity[min_common_depth], 1);
> +		firmware_gid = of_read_number(&associativity[min_common_depth], 1);
> 
>  	/* POWER4 LPAR uses 0xffff as invalid node */
> -	if (nid == 0xffff || nid >= MAX_NUMNODES)
> -		nid = NUMA_NO_NODE;
> +	if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
> +		firmware_gid = -1;

Let's assume two or more invocations of associativity_to_nid for the same
associativity end up with -1. In each case aren't we giving different
nids?


> +
> +	nid = firmware_group_id_to_nid(firmware_gid);
> 
>  	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
> +	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
>  		/*
>  		 * Skip the length field and send start of associativity array
>  		 */
> @@ -432,24 +458,25 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>  static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  {
>  	struct assoc_arrays aa = { .arrays = NULL };
> -	int default_nid = NUMA_NO_NODE;
> -	int nid = default_nid;
> +	int nid = NUMA_NO_NODE, firmware_gid;
>  	int rc, index;
> 
>  	if ((min_common_depth < 0) || !numa_enabled)
> -		return default_nid;
> +		return NUMA_NO_NODE;
> 
>  	rc = of_get_assoc_arrays(&aa);
>  	if (rc)
> -		return default_nid;
> +		return NUMA_NO_NODE;

https://lore.kernel.org/linuxppc-dev/87lfjc1b5f.fsf@linux.ibm.com/t/#u

> 
>  	if (min_common_depth <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
>  		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> -		nid = of_read_number(&aa.arrays[index], 1);
> +		firmware_gid = of_read_number(&aa.arrays[index], 1);
> 
> -		if (nid == 0xffff || nid >= MAX_NUMNODES)
> -			nid = default_nid;
> +		if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
> +			firmware_gid = -1;

Same case as above: how do we ensure that we return a unique nid for a
similar assoc_array?

> +
> +		nid = firmware_group_id_to_nid(firmware_gid);
> 
>  		if (nid > 0) {
>  			index = lmb->aa_index * aa.array_sz;
> -- 
> 2.26.2
> 

-- 
Thanks and Regards
Srikar Dronamraju


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-01  5:20 ` [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Srikar Dronamraju
@ 2020-08-02 14:21   ` Aneesh Kumar K.V
  2020-08-04  7:25     ` Srikar Dronamraju
  0 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-08-02 14:21 UTC (permalink / raw)
  To: Srikar Dronamraju; +Cc: Nathan Lynch, linuxppc-dev

Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:

> * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-07-31 16:49:14]:
>
>> We use ibm,associativity and ibm,associativity-lookup-arrays to derive the numa
>> node numbers. These device tree properties are firmware indicated grouping of
>> resources based on their hierarchy in the platform. These numbers (group id) are
>> not sequential and hypervisor/firmware can follow different numbering schemes.
>> For ex: on powernv platforms, we group them in the below order.
>> 
>>  *     - CCM node ID
>>  *     - HW card ID
>>  *     - HW module ID
>>  *     - Chip ID
>>  *     - Core ID
>> 
>> Based on ibm,associativity-reference-points we use one of the above group ids as
>> Linux NUMA node id. (On PowerNV platform Chip ID is used). This results
>> in Linux reporting non-linear NUMA node id and which also results in Linux
>> reporting empty node 0 NUMA nodes.
>> 
>
> If its just to eliminate node 0, then we have 2 other probably better
> solutions.
> 1. Dont mark node 0 as spl (currently still in mm-tree and a result in
> linux-next)
> 2. powerpc specific: explicitly clear node 0 during numa bringup.
>


I am not sure I consider them better. But yes, those patches are good
and also resolve the node 0 initialization when the firmware didn't
indicate the presence of such a node.

This patch in addition makes sure that we get the same topology report
across reboots on virtualized partitions as long as the cpu/memory
ratio per PowerVM domain remains the same. This should also help to
avoid confusion after an LPM migration once we start applying topology
updates.

>> This can  be resolved by mapping the firmware provided group id to a logical Linux
>> NUMA id. In this patch, we do this only for pseries platforms considering the
>
> On PowerVM, as you would know the nid is already a logical or a flattened
> chip-id and not the actual hardware chip-id.

Yes. But then they are derived based on PowerVM resources, AKA domains.
Now, based on the available resources on a system, we could end up with
different node numbers for the same topology across reboots. Making it
logical at the OS level prevents that.


>
>> firmware group id is a virtualized entity and users would not have drawn any
>> conclusion based on the Linux Numa Node id.
>> 
>> On PowerNV platform since we have historically mapped Chip ID as Linux NUMA node
>> id, we keep the existing Linux NUMA node id numbering.
>> 
>> Before Fix:
>>  # numactl -H
>> available: 2 nodes (0-1)
>> node 0 cpus:
>> node 0 size: 0 MB
>> node 0 free: 0 MB
>> node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
>> node 1 size: 50912 MB
>> node 1 free: 45248 MB
>> node distances:
>> node   0   1
>>   0:  10  40
>>   1:  40  10
>> 
>> after fix
>>  # numactl  -H
>> available: 1 nodes (0)
>> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
>> node 0 size: 50912 MB
>> node 0 free: 49724 MB
>> node distances:
>> node   0
>>   0:  10
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/include/asm/topology.h |  1 +
>>  arch/powerpc/mm/numa.c              | 49 ++++++++++++++++++++++-------
>>  2 files changed, 39 insertions(+), 11 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
>> index f0b6300e7dd3..15b0424a27a8 100644
>> --- a/arch/powerpc/include/asm/topology.h
>> +++ b/arch/powerpc/include/asm/topology.h
>> @@ -118,5 +118,6 @@ int get_physical_package_id(int cpu);
>>  #endif
>>  #endif
>> 
>> +int firmware_group_id_to_nid(int firmware_gid);
>>  #endif /* __KERNEL__ */
>>  #endif	/* _ASM_POWERPC_TOPOLOGY_H */
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index e437a9ac4956..6c659aada55b 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>  	}
>>  }
>> 
>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>> +
>> +int firmware_group_id_to_nid(int firmware_gid)
>> +{
>> +	static int last_nid = 0;
>> +
>> +	/*
>> +	 * For PowerNV we don't change the node id. This helps to avoid
>> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
>> +	 * are virtualized. Hence do logical node id for pseries.
>> +	 */
>> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
>> +		return firmware_gid;
>> +
>> +	if (firmware_gid ==  -1)
>> +		return NUMA_NO_NODE;
>> +
>> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
>> +		nid_map[firmware_gid] = last_nid++;
>
> How do we ensure 2 simultaneous firmware_group_id_to_nid() calls dont end up
> at this place in parallel?

Do we have a code path where we do that? All the node id init should
happen early and there should not be two cpus doing node init at the
same time. I might be mistaken. Can you point to the code path where you
expect this to be called in parallel?


>
>> +
>> +	return nid_map[firmware_gid];
>> +}
>> +
>>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>>   * info is found.
>>   */
>>  static int associativity_to_nid(const __be32 *associativity)
>>  {
>>  	int nid = NUMA_NO_NODE;
>> +	int firmware_gid = -1;
>> 
>>  	if (!numa_enabled)
>>  		goto out;
>> 
>>  	if (of_read_number(associativity, 1) >= min_common_depth)
>> -		nid = of_read_number(&associativity[min_common_depth], 1);
>> +		firmware_gid = of_read_number(&associativity[min_common_depth], 1);
>> 
>>  	/* POWER4 LPAR uses 0xffff as invalid node */
>> -	if (nid == 0xffff || nid >= MAX_NUMNODES)
>> -		nid = NUMA_NO_NODE;
>> +	if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
>> +		firmware_gid = -1;
>
> Lets assume two or more invocations of associativity_to_nid for the same
> associativity, end up with -1, In each case aren't giving different
> nids?


I didn't quite get the comment here. But I assume you are indicating the
same one you mentioned above?

>
>
>> +
>> +	nid = firmware_group_id_to_nid(firmware_gid);
>> 
>>  	if (nid > 0 &&
>> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
>> +	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
>>  		/*
>>  		 * Skip the length field and send start of associativity array
>>  		 */
>> @@ -432,24 +458,25 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>>  static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>>  {
>>  	struct assoc_arrays aa = { .arrays = NULL };
>> -	int default_nid = NUMA_NO_NODE;
>> -	int nid = default_nid;
>> +	int nid = NUMA_NO_NODE, firmware_gid;
>>  	int rc, index;
>> 
>>  	if ((min_common_depth < 0) || !numa_enabled)
>> -		return default_nid;
>> +		return NUMA_NO_NODE;
>> 
>>  	rc = of_get_assoc_arrays(&aa);
>>  	if (rc)
>> -		return default_nid;
>> +		return NUMA_NO_NODE;
>
> https://lore.kernel.org/linuxppc-dev/87lfjc1b5f.fsf@linux.ibm.com/t/#u

Not sure what I should conclude from that. I am changing the function here
and would like to make NUMA_NO_NODE the error return.

>
>> 
>>  	if (min_common_depth <= aa.array_sz &&
>>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
>>  		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
>> -		nid = of_read_number(&aa.arrays[index], 1);
>> +		firmware_gid = of_read_number(&aa.arrays[index], 1);
>> 
>> -		if (nid == 0xffff || nid >= MAX_NUMNODES)
>> -			nid = default_nid;
>> +		if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
>> +			firmware_gid = -1;
>
> Same case as above, How do we ensure that we return unique nid for a
> similar assoc_array?

Can you elaborate on this?

>
>> +
>> +		nid = firmware_group_id_to_nid(firmware_gid);
>> 
>>  		if (nid > 0) {
>>  			index = lmb->aa_index * aa.array_sz;
>> -- 
>> 2.26.2
>> 

-aneesh


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-02 14:21   ` Aneesh Kumar K.V
@ 2020-08-04  7:25     ` Srikar Dronamraju
  2020-08-06 10:44       ` Aneesh Kumar K.V
  0 siblings, 1 reply; 14+ messages in thread
From: Srikar Dronamraju @ 2020-08-04  7:25 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev

* Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-08-02 19:51:41]:
> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> > * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-07-31 16:49:14]:
> >
> >
> > If its just to eliminate node 0, then we have 2 other probably better
> > solutions.
> > 1. Dont mark node 0 as spl (currently still in mm-tree and a result in
> > linux-next)
> > 2. powerpc specific: explicitly clear node 0 during numa bringup.
> >
> 
> 
> I am not sure I consider them better. But yes, those patches are good
> and also resolves the node 0 initialization when the firmware didn't
> indicate the presence of such a node.
> 
> This patch in addition make sure that we get the same topolgy report
> across reboot on a virtualized partitions as longs as the cpu/memory
> ratio per powervm domains remain the same. This should also help to
> avoid confusion after an LPM migration once we start applying topology
> updates. 
> 

What do we mean by cpu/memory ratio? The topology across reboot would have
changed only if PowerVM had allocated resources differently by
scrambling/unscrambling. We no longer process topology updates at
runtime. As far as I know, after LPM, the source topology is maintained.

> >> This can  be resolved by mapping the firmware provided group id to a logical Linux
> >> NUMA id. In this patch, we do this only for pseries platforms considering the
> >
> > On PowerVM, as you would know the nid is already a logical or a flattened
> > chip-id and not the actual hardware chip-id.
> 
> Yes. But then they are derived based on PowerVM resources AKA domains.
> Now based on the available resource on a system, we could end up with
> different node numbers with same toplogy across reboots. Making it
> logical at OS level prevent that. 

The above statement kind of gives the impression that topology changes
across every reboot. We only end up with different node numbers if and only
if the underlying topology has changed, and that case is very rare. Or am I
missing something?

> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index e437a9ac4956..6c659aada55b 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
> >>  	}
> >>  }
> >> 
> >> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
> >> +
> >> +int firmware_group_id_to_nid(int firmware_gid)
> >> +{
> >> +	static int last_nid = 0;
> >> +
> >> +	/*
> >> +	 * For PowerNV we don't change the node id. This helps to avoid
> >> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
> >> +	 * are virtualized. Hence do logical node id for pseries.
> >> +	 */
> >> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> >> +		return firmware_gid;
> >> +
> >> +	if (firmware_gid ==  -1)
> >> +		return NUMA_NO_NODE;
> >> +
> >> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
> >> +		nid_map[firmware_gid] = last_nid++;
> >
> > How do we ensure 2 simultaneous firmware_group_id_to_nid() calls dont end up
> > at this place in parallel?
> 
> Do we have a code path where we do that? All the node id init should
> happen early and there should not be two cpus doing node init at the
> same time. I might be mistaken. Can you point to the code path where you
> expect this to be called in parallel?
> 

associativity_to_nid gets called the first time a cpu is being made present
from offline. So it need not be in the boot path. We may want to verify whether
cpu hotplug and dlpar operations are synchronized. For example, are a memory
hot-add and a cpu hotplug synchronized? I am not sure if they are synchronized
at this time.

> >
> >> +
> >> +	return nid_map[firmware_gid];
> >> +}
> >> +
> >>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
> >>   * info is found.
> >>   */
> >>  static int associativity_to_nid(const __be32 *associativity)
> >>  {
> >>  	int nid = NUMA_NO_NODE;
> >> +	int firmware_gid = -1;
> >> 
> >>  	if (!numa_enabled)
> >>  		goto out;
> >> 
> >>  	if (of_read_number(associativity, 1) >= min_common_depth)
> >> -		nid = of_read_number(&associativity[min_common_depth], 1);
> >> +		firmware_gid = of_read_number(&associativity[min_common_depth], 1);
> >> 
> >>  	/* POWER4 LPAR uses 0xffff as invalid node */
> >> -	if (nid == 0xffff || nid >= MAX_NUMNODES)
> >> -		nid = NUMA_NO_NODE;
> >> +	if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
> >> +		firmware_gid = -1;
> >
> > Lets assume two or more invocations of associativity_to_nid for the same
> > associativity, end up with -1, In each case aren't giving different
> > nids?
> 
> 
> I didn't quiet get the comment here. But I assume you are indicating the
> same one you mentioned above?
> 

No, it's not related to the above comment.
We are incrementing the nid_map table for every unique firmware_gid or for
every -1 (aka invalid associativity). If a sufficiently large number of
associativities end up returning invalid values, then don't we quickly
overflow the nid_map table? It's not only about the overflow: an 8 node
machine may soon look like an 80 node machine.


-- 
Thanks and Regards
Srikar Dronamraju


* Re: [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id
  2020-07-31 11:22 ` [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id Aneesh Kumar K.V
@ 2020-08-04  7:47   ` Gautham R Shenoy
  0 siblings, 0 replies; 14+ messages in thread
From: Gautham R Shenoy @ 2020-08-04  7:47 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev, Srikar Dronamraju

On Fri, Jul 31, 2020 at 04:52:05PM +0530, Aneesh Kumar K.V wrote:
> On PowerNV platforms we always have 1:1 mapping between chip ID and
> firmware group id. Use the helper to convert firmware group id to
> node id instead of directly using chip ID as Linux node id.
> 
> NOTE: This doesn't have any functional change. On PowerNV platforms
> we continue to have 1:1 mapping between firmware group id and
> Linux node id.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
> index 8646eb197cd9..ca82b6ac0989 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -27,6 +27,7 @@
>  #include <asm/reg.h>
>  #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
>  #include <asm/opal.h>
> +#include <asm/topology.h>
>  #include <linux/timer.h>
> 
>  #define POWERNV_MAX_PSTATES_ORDER  8
> @@ -1069,8 +1070,14 @@ static int init_chip_info(void)
>  	}
> 
>  	for (i = 0; i < nr_chips; i++) {
> +		unsigned int nid;
> +
>  		chips[i].id = chip[i];
> -		cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
> +		/*
> +		 * On powervn platforms firmware group id is same as chipd id.

But doesn't hurt to be safe :-)

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> +		 */
> +		nid = firmware_group_id_to_nid(chip[i]);
> +		cpumask_copy(&chips[i].mask, cpumask_of_node(nid));
>  		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
>  		for_each_cpu(cpu, &chips[i].mask)
>  			per_cpu(chip_info, cpu) =  &chips[i];
> -- 
> 2.26.2
> 


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-04  7:25     ` Srikar Dronamraju
@ 2020-08-06 10:44       ` Aneesh Kumar K.V
  2020-08-10  8:05         ` Srikar Dronamraju
  0 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-08-06 10:44 UTC (permalink / raw)
  To: Srikar Dronamraju; +Cc: Nathan Lynch, linuxppc-dev

Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:

> * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-08-02 19:51:41]:
>> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
>> > * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-07-31 16:49:14]:
>> >
>> >
>> > If its just to eliminate node 0, then we have 2 other probably better
>> > solutions.
>> > 1. Dont mark node 0 as spl (currently still in mm-tree and a result in
>> > linux-next)
>> > 2. powerpc specific: explicitly clear node 0 during numa bringup.
>> >
>> 
>> 
>> I am not sure I consider them better. But yes, those patches are good
>> and also resolves the node 0 initialization when the firmware didn't
>> indicate the presence of such a node.
>> 
>> This patch in addition make sure that we get the same topolgy report
>> across reboot on a virtualized partitions as longs as the cpu/memory
>> ratio per powervm domains remain the same. This should also help to
>> avoid confusion after an LPM migration once we start applying topology
>> updates. 
>> 
>
> What do we mean by cpu/memory ratio. The topology across reboot would have
> changed only if PowerVM would have allocated resources differently by
> scrambling/unscrambling. We are no more processing topology updates at
> runtime. As far as I know, after LPM, the source topology is maintained.

An LPAR running with one NUMA node and 10GB of memory on PowerVM domain
10 will report node 10 and 10GB of memory in the current scheme. After an LPM
migration or a CEC shutdown/reboot, if the domain from which the resources are
allocated becomes 11, then the LPAR will report node 11 and 10GB of memory.

Having a logical node number means that in both the above cases we report
node 0 with 10GB of memory.
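
To make that concrete (illustrative domain numbers only, using the nid_map
from this patch):

	/*
	 * Boot 1: the LPAR's resources come from PowerVM domain 10.
	 *   firmware_group_id_to_nid(10) -> nid_map[10] = 0, report node 0
	 *
	 * Boot 2 (after LPM / CEC reboot): resources now come from domain 11.
	 *   firmware_group_id_to_nid(11) -> nid_map[11] = 0, still node 0
	 */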


>
>> >> This can  be resolved by mapping the firmware provided group id to a logical Linux
>> >> NUMA id. In this patch, we do this only for pseries platforms considering the
>> >
>> > On PowerVM, as you would know the nid is already a logical or a flattened
>> > chip-id and not the actual hardware chip-id.
>> 
>> Yes. But then they are derived based on PowerVM resources AKA domains.
>> Now based on the available resource on a system, we could end up with
>> different node numbers with same toplogy across reboots. Making it
>> logical at OS level prevent that. 
>
> The above statement kind of gives an impression, that topology changes
> across every reboot.  We only end up with different node numbers if and only
> if the underlying topology has changed and that case is very rare. Or am I
> missing something?

IIUC it also depends on availability of resources within the
domain at the time of LPAR start.

>
>> 
>> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> >> index e437a9ac4956..6c659aada55b 100644
>> >> --- a/arch/powerpc/mm/numa.c
>> >> +++ b/arch/powerpc/mm/numa.c
>> >> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>> >>  	}
>> >>  }
>> >> 
>> >> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>> >> +
>> >> +int firmware_group_id_to_nid(int firmware_gid)
>> >> +{
>> >> +	static int last_nid = 0;
>> >> +
>> >> +	/*
>> >> +	 * For PowerNV we don't change the node id. This helps to avoid
>> >> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
>> >> +	 * are virtualized. Hence do logical node id for pseries.
>> >> +	 */
>> >> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
>> >> +		return firmware_gid;
>> >> +
>> >> +	if (firmware_gid ==  -1)
>> >> +		return NUMA_NO_NODE;
>> >> +
>> >> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
>> >> +		nid_map[firmware_gid] = last_nid++;
>> >
>> > How do we ensure 2 simultaneous firmware_group_id_to_nid() calls dont end up
>> > at this place in parallel?
>> 
>> Do we have a code path where we do that? All the node id init should
>> happen early and there should not be two cpus doing node init at the
>> same time. I might be mistaken. Can you point to the code path where you
>> expect this to be called in parallel?
>> 
>
> associativity_to_nid gets called the first time a cpu is being made present
> from offline. So it need not be in boot path. We may to verify if cpu
> hotplug, dlpar, operations are synchronized. For example a memory hotadd and
> cpu hotplug are they synchronized? I am not sure if they are synchronized at
> this time.

But you don't online a cpu or memory to a non-existent node post boot,
right? If the node exists, we have already initialized the nid_map.

However, I am not sure whether we do parallel initialization of devices, i.e.,
of_device_add getting called in parallel. If that can happen, then we need the
below?

@@ -226,6 +226,7 @@ static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
 int firmware_group_id_to_nid(int firmware_gid)
 {
        static int last_nid = 0;
+       static DEFINE_SPINLOCK(node_id_lock);
 
        /*
         * For PowerNV we don't change the node id. This helps to avoid
@@ -238,8 +239,13 @@ int firmware_group_id_to_nid(int firmware_gid)
        if (firmware_gid ==  -1)
                return NUMA_NO_NODE;
 
-       if (nid_map[firmware_gid] == NUMA_NO_NODE)
-               nid_map[firmware_gid] = last_nid++;
+       if (nid_map[firmware_gid] == NUMA_NO_NODE) {
+               spin_lock(&node_id_lock);
+               /*  recheck with lock held */
+               if (nid_map[firmware_gid] == NUMA_NO_NODE)
+                       nid_map[firmware_gid] = last_nid++;
+               spin_unlock(&node_id_lock);
+       }
 
        return nid_map[firmware_gid];
 }



>
>> >
>> >> +
>> >> +	return nid_map[firmware_gid];
>> >> +}
>> >> +
>> >>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>> >>   * info is found.
>> >>   */
>> >>  static int associativity_to_nid(const __be32 *associativity)
>> >>  {
>> >>  	int nid = NUMA_NO_NODE;
>> >> +	int firmware_gid = -1;
>> >> 
>> >>  	if (!numa_enabled)
>> >>  		goto out;
>> >> 
>> >>  	if (of_read_number(associativity, 1) >= min_common_depth)
>> >> -		nid = of_read_number(&associativity[min_common_depth], 1);
>> >> +		firmware_gid = of_read_number(&associativity[min_common_depth], 1);
>> >> 
>> >>  	/* POWER4 LPAR uses 0xffff as invalid node */
>> >> -	if (nid == 0xffff || nid >= MAX_NUMNODES)
>> >> -		nid = NUMA_NO_NODE;
>> >> +	if (firmware_gid == 0xffff || firmware_gid >= MAX_NUMNODES)
>> >> +		firmware_gid = -1;
>> >
>> > Lets assume two or more invocations of associativity_to_nid for the same
>> > associativity, end up with -1, In each case aren't giving different
>> > nids?
>> 
>> 
>> I didn't quiet get the comment here. But I assume you are indicating the
>> same one you mentioned above?
>> 
>
> No its not related to the above comment.
> We are incrementing the nid_map table for every unique firmware_gid or for
> every -1 (aka invalid associativities). If there are sufficiently large
> number of associativities that end up returning invalid associativities,
> then don't we quickly overflow the nid_map table? Not only about the
> overflow but a 8 node machine may soon look like a 80 node machine.

Not sure I follow. What does a large number of associativities imply? Are
you looking at ibm,associativity-lookup-arrays that have invalid entries?
Even there we are not parsing the full array; we look up only a specific
firmware_gid (in the case of lookup-arrays we use the aa_index value from
drmem_lmb).

I will also add a last_nid > MAX_NUMNODES check in
firmware_group_id_to_nid() to handle the case where we find more numa
nodes than MAX_NUMNODES in the device tree.
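
Roughly like the below (untested sketch; the exact placement of the check is
just what I have in mind for now):

	int firmware_group_id_to_nid(int firmware_gid)
	{
		static int last_nid = 0;

		if (!firmware_has_feature(FW_FEATURE_LPAR))
			return firmware_gid;

		if (firmware_gid == -1)
			return NUMA_NO_NODE;

		if (nid_map[firmware_gid] == NUMA_NO_NODE) {
			/* Don't hand out more logical ids than MAX_NUMNODES */
			if (last_nid >= MAX_NUMNODES)
				return NUMA_NO_NODE;
			nid_map[firmware_gid] = last_nid++;
		}

		return nid_map[firmware_gid];
	}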

-aneesh


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-07-31 11:19 [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Aneesh Kumar K.V
  2020-07-31 11:22 ` [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id Aneesh Kumar K.V
  2020-08-01  5:20 ` [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Srikar Dronamraju
@ 2020-08-07  4:24 ` Nathan Lynch
  2020-08-07  5:02   ` Aneesh Kumar K.V
  2 siblings, 1 reply; 14+ messages in thread
From: Nathan Lynch @ 2020-08-07  4:24 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, Srikar Dronamraju

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index e437a9ac4956..6c659aada55b 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>  	}
>  }
>  
> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};

It's odd to me to use MAX_NUMNODES for this array when it's going to be
indexed not by Linux's logical node IDs but by the platform-provided
domain number, which has no relation to MAX_NUMNODES.

> +
> +int firmware_group_id_to_nid(int firmware_gid)
> +{
> +	static int last_nid = 0;
> +
> +	/*
> +	 * For PowerNV we don't change the node id. This helps to avoid
> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
> +	 * are virtualized. Hence do logical node id for pseries.
> +	 */
> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> +		return firmware_gid;
> +
> +	if (firmware_gid ==  -1)
> +		return NUMA_NO_NODE;
> +
> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
> +		nid_map[firmware_gid] = last_nid++;

This should at least be bounds-checked in case of domain numbering in
excess of MAX_NUMNODES. Or a different data structure should be used?
Not sure.

I'd prefer Linux's logical node type not be easily interchangeable with
the firmware node/group id type. The firmware type could be something
like:

struct affinity_domain {
	u32 val;
};
typedef struct affinity_domain affinity_domain_t;

with appropriate accessors/APIs.

This can prevent a whole class of errors that is currently possible with
CPUs, since the logical and "hardware" ID types are both simple
integers.
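
For instance, accessors along these lines (just a sketch; the helper names
here are invented):

	static inline affinity_domain_t affinity_domain_from_assoc(const __be32 *assoc,
								    int index)
	{
		/* Wrap the raw firmware value so it can't be silently mixed
		 * up with a Linux logical nid.
		 */
		return (affinity_domain_t){ .val = of_read_number(&assoc[index], 1) };
	}

	static inline bool affinity_domain_is_valid(affinity_domain_t dom)
	{
		/* POWER4 LPAR uses 0xffff as an invalid domain */
		return dom.val != 0xffff;
	}

	int affinity_domain_to_nid(affinity_domain_t dom);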


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-07  4:24 ` Nathan Lynch
@ 2020-08-07  5:02   ` Aneesh Kumar K.V
  2020-08-07 20:45     ` Nathan Lynch
  0 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-08-07  5:02 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev, Srikar Dronamraju

On 8/7/20 9:54 AM, Nathan Lynch wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index e437a9ac4956..6c659aada55b 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>   	}
>>   }
>>   
>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
> 
> It's odd to me to use MAX_NUMNODES for this array when it's going to be
> indexed not by Linux's logical node IDs but by the platform-provided
> domain number, which has no relation to MAX_NUMNODES.


I didn't want to dynamically allocate this. We could fetch
"ibm,max-associativity-domains" to find the size for that. The current
code does assume the firmware group id does not exceed MAX_NUMNODES. Hence I
kept the array size at MAX_NUMNODES. I do agree that it is confusing. Maybe
we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?



> 
>> +
>> +int firmware_group_id_to_nid(int firmware_gid)
>> +{
>> +	static int last_nid = 0;
>> +
>> +	/*
>> +	 * For PowerNV we don't change the node id. This helps to avoid
>> +	 * confusion w.r.t the expected node ids. On pseries, node numbers
>> +	 * are virtualized. Hence do logical node id for pseries.
>> +	 */
>> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
>> +		return firmware_gid;
>> +
>> +	if (firmware_gid ==  -1)
>> +		return NUMA_NO_NODE;
>> +
>> +	if (nid_map[firmware_gid] == NUMA_NO_NODE)
>> +		nid_map[firmware_gid] = last_nid++;
> 
> This should at least be bounds-checked in case of domain numbering in
> excess of MAX_NUMNODES. Or a different data structure should be used?
> Not sure.
> 
> I'd prefer Linux's logical node type not be easily interchangeable with
> the firmware node/group id type. The firmware type could be something
> like:
> 
> struct affinity_domain {
> 	u32 val;
> };
> typedef struct affinity_domain affinity_domain_t;
> 
> with appropriate accessors/APIs.
> 

That is a good idea. Will use this.

-aneesh



* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-07  5:02   ` Aneesh Kumar K.V
@ 2020-08-07 20:45     ` Nathan Lynch
  2020-08-09 14:12       ` Aneesh Kumar K.V
  0 siblings, 1 reply; 14+ messages in thread
From: Nathan Lynch @ 2020-08-07 20:45 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, Srikar Dronamraju

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> On 8/7/20 9:54 AM, Nathan Lynch wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index e437a9ac4956..6c659aada55b 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>>   	}
>>>   }
>>>   
>>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>> 
>> It's odd to me to use MAX_NUMNODES for this array when it's going to be
>> indexed not by Linux's logical node IDs but by the platform-provided
>> domain number, which has no relation to MAX_NUMNODES.
>
>
> I didn't want to dynamically allocate this. We could fetch 
> "ibm,max-associativity-domains" to find the size for that. The current 
> code do assume  firmware group id to not exceed MAX_NUMNODES. Hence kept 
> the array size to be MAX_NUMNODEs. I do agree that it is confusing. May 
> be we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?

Well, consider:

- ibm,max-associativity-domains can change at runtime with LPM. This
  doesn't happen in practice yet, but we should probably start thinking
  about how to support that.
- The domain numbering isn't clearly specified to have any particular
  properties such as beginning at zero or a contiguous range.

While the current code likely contains assumptions contrary to these
points, a change such as this is an opportunity to think about whether
those assumptions can be reduced or removed. In particular I think it
would be good to gracefully degrade when the number of NUMA affinity
domains can exceed MAX_NUMNODES. Using the platform-supplied domain
numbers to directly index Linux data structures will make that
impossible.

So, maybe genradix or even xarray wouldn't actually be overengineering
here.
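
An untested illustration of what I mean (names made up, and the allocation
context of the earliest callers would need thought):

	static DEFINE_XARRAY(domain_nid_map);

	static int domain_to_nid(u32 domain)
	{
		static int last_nid;
		void *entry;

		/* The domain number is only an index into the xarray here,
		 * so it need not be contiguous or bounded by MAX_NUMNODES.
		 */
		entry = xa_load(&domain_nid_map, domain);
		if (entry)
			return xa_to_value(entry);

		if (last_nid >= MAX_NUMNODES)
			return NUMA_NO_NODE;

		xa_store(&domain_nid_map, domain, xa_mk_value(last_nid), GFP_KERNEL);
		return last_nid++;
	}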


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-07 20:45     ` Nathan Lynch
@ 2020-08-09 14:12       ` Aneesh Kumar K.V
  2020-08-09 18:40         ` Aneesh Kumar K.V
  0 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-08-09 14:12 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev, Srikar Dronamraju

On 8/8/20 2:15 AM, Nathan Lynch wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> On 8/7/20 9:54 AM, Nathan Lynch wrote:
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>> index e437a9ac4956..6c659aada55b 100644
>>>> --- a/arch/powerpc/mm/numa.c
>>>> +++ b/arch/powerpc/mm/numa.c
>>>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>>>    	}
>>>>    }
>>>>    
>>>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>>>
>>> It's odd to me to use MAX_NUMNODES for this array when it's going to be
>>> indexed not by Linux's logical node IDs but by the platform-provided
>>> domain number, which has no relation to MAX_NUMNODES.
>>
>>
>> I didn't want to dynamically allocate this. We could fetch
>> "ibm,max-associativity-domains" to find the size for that. The current
>> code do assume  firmware group id to not exceed MAX_NUMNODES. Hence kept
>> the array size to be MAX_NUMNODEs. I do agree that it is confusing. May
>> be we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?
> 
> Well, consider:
> 
> - ibm,max-associativity-domains can change at runtime with LPM. This
>    doesn't happen in practice yet, but we should probably start thinking
>    about how to support that.
> - The domain numbering isn't clearly specified to have any particular
>    properties such as beginning at zero or a contiguous range.
> 
> While the current code likely contains assumptions contrary to these
> points, a change such as this is an opportunity to think about whether
> those assumptions can be reduced or removed. In particular I think it
> would be good to gracefully degrade when the number of NUMA affinity
> domains can exceed MAX_NUMNODES. Using the platform-supplied domain
> numbers to directly index Linux data structures will make that
> impossible.
> 
> So, maybe genradix or even xarray wouldn't actually be overengineering
> here.
> 

One of the challenges with such a data structure is that we initialize
the nid_map before the slab is available. This means a memblock-based
allocation, and we would end up implementing such a sparse data structure
ourselves here.

As you mentioned above, since we know that the hypervisor as of now limits
the max affinity domain id to below ibm,max-associativity-domains, we are
good with the array-like nid_map we have here. This keeps the code simpler.

This will also allow us to switch to a more sparse data structure, as you
requested, in the future, because the main change that is pushed in
this series is the usage of firmware_group_id_to_nid(). The details of
the data structure we use to keep track of that mapping are pretty much
internal to that function.

-aneesh



* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-09 14:12       ` Aneesh Kumar K.V
@ 2020-08-09 18:40         ` Aneesh Kumar K.V
  2020-08-13 22:53           ` Nathan Lynch
  0 siblings, 1 reply; 14+ messages in thread
From: Aneesh Kumar K.V @ 2020-08-09 18:40 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: linuxppc-dev, Srikar Dronamraju

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> On 8/8/20 2:15 AM, Nathan Lynch wrote:
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>> On 8/7/20 9:54 AM, Nathan Lynch wrote:
>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>> index e437a9ac4956..6c659aada55b 100644
>>>>> --- a/arch/powerpc/mm/numa.c
>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>>>>    	}
>>>>>    }
>>>>>    
>>>>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>>>>
>>>> It's odd to me to use MAX_NUMNODES for this array when it's going to be
>>>> indexed not by Linux's logical node IDs but by the platform-provided
>>>> domain number, which has no relation to MAX_NUMNODES.
>>>
>>>
>>> I didn't want to dynamically allocate this. We could fetch
>>> "ibm,max-associativity-domains" to find the size for that. The current
>>> code do assume  firmware group id to not exceed MAX_NUMNODES. Hence kept
>>> the array size to be MAX_NUMNODEs. I do agree that it is confusing. May
>>> be we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?
>> 
>> Well, consider:
>> 
>> - ibm,max-associativity-domains can change at runtime with LPM. This
>>    doesn't happen in practice yet, but we should probably start thinking
>>    about how to support that.
>> - The domain numbering isn't clearly specified to have any particular
>>    properties such as beginning at zero or a contiguous range.
>> 
>> While the current code likely contains assumptions contrary to these
>> points, a change such as this is an opportunity to think about whether
>> those assumptions can be reduced or removed. In particular I think it
>> would be good to gracefully degrade when the number of NUMA affinity
>> domains can exceed MAX_NUMNODES. Using the platform-supplied domain
>> numbers to directly index Linux data structures will make that
>> impossible.
>> 
>> So, maybe genradix or even xarray wouldn't actually be overengineering
>> here.
>> 
>
> One of the challenges with such a data structure is that we initialize 
> the nid_map before the slab is available. This means a memblock based 
> allocation and we would end up implementing such a sparse data structure 
> ourselves here.
>
> As you mentioned above, since we know that hypervisor as of now limits 
> the max affinity domain id below ibm,max-associativity-domains we are 
> good with an array-like nid_map we have here. This keeps the code simpler.
>
> This will also allow us to switch to a more sparse data structure as you 
> requested here in the future because the main change that is pushed in 
> this series is the usage of firmare_group_id_to_nid(). The details of 
> the data structure we use to keep track of that mapping are pretty much 
> internal to that function.

How about this? This makes it not a direct index, but it does limit the
search to the max numa node on the system.

static int domain_id_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  -1 };

static int __affinity_domain_to_nid(int domain_id, int max_nid)
{
	int i;

	for (i = 0; i < max_nid; i++) {
		if (domain_id_map[i] == domain_id)
			return i;
	}
	return NUMA_NO_NODE;
}

int affinity_domain_to_nid(struct affinity_domain *domain)
{
	int nid, domain_id;
	static int last_nid = 0;
	static DEFINE_SPINLOCK(node_id_lock);

	domain_id = domain->id;
	/*
	 * For PowerNV we don't change the node id. This helps to avoid
	 * confusion w.r.t the expected node ids. On pseries, node numbers
	 * are virtualized. Hence do logical node id for pseries.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return domain_id;

	if (domain_id ==  -1 || last_nid == MAX_NUMNODES)
		return NUMA_NO_NODE;

	nid = __affinity_domain_to_nid(domain_id, last_nid);

	if (nid == NUMA_NO_NODE) {
		spin_lock(&node_id_lock);
		/*  recheck with lock held */
		nid = __affinity_domain_to_nid(domain_id, last_nid);
		if (nid == NUMA_NO_NODE) {
			nid = last_nid++;
			domain_id_map[nid] = domain_id;
		}
		spin_unlock(&node_id_lock);
	}

	return nid;
}


* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-06 10:44       ` Aneesh Kumar K.V
@ 2020-08-10  8:05         ` Srikar Dronamraju
  0 siblings, 0 replies; 14+ messages in thread
From: Srikar Dronamraju @ 2020-08-10  8:05 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev

* Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [2020-08-06 16:14:21]:

> >
> > associativity_to_nid gets called the first time a cpu is being made present
> > from offline. So it need not be in boot path. We may to verify if cpu
> > hotplug, dlpar, operations are synchronized. For example a memory hotadd and
> > cpu hotplug are they synchronized? I am not sure if they are synchronized at
> > this time.
> 
> But you don't online cpu or memory to a non existent node post boot
> right?. If the node is existent we have already initialized the nid_map.
> 

Not sure what you mean by existent and non-existent. Are you referring to
online / offline?

> However i am not sure whether we do a parallel initialization of devices. ie,
> of_device_add getting called in parallel. if it can then we need the
> below?
> 
> @@ -226,6 +226,7 @@ static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>  int firmware_group_id_to_nid(int firmware_gid)
>  {
>         static int last_nid = 0;
> +       static DEFINE_SPINLOCK(node_id_lock);
> 
>         /*
>          * For PowerNV we don't change the node id. This helps to avoid
> @@ -238,8 +239,13 @@ int firmware_group_id_to_nid(int firmware_gid)
>         if (firmware_gid ==  -1)
>                 return NUMA_NO_NODE;
> 
> -       if (nid_map[firmware_gid] == NUMA_NO_NODE)
> -               nid_map[firmware_gid] = last_nid++;
> +       if (nid_map[firmware_gid] == NUMA_NO_NODE) {
> +               spin_lock(&node_id_lock);
> +               /*  recheck with lock held */
> +               if (nid_map[firmware_gid] == NUMA_NO_NODE)
> +                       nid_map[firmware_gid] = last_nid++;
> +               spin_unlock(&node_id_lock);
> +       }
> 
>         return nid_map[firmware_gid];
>  }
> 

This should help.


> 
> I will also add a last_nid > MAX_NUMNODES check in
> firmware_group_id_to_nid() to handle the case where we find more numa
> nodes than MAX_NUMNODES in the device tree.
> 

Okay.
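
For reference, a rough sketch of how that bound check could sit alongside
the double-checked locking from the diff above. This is a sketch only,
combining the quoted diff with the proposed check; it is not code from the
posted patch.

	if (firmware_gid < 0 || firmware_gid >= MAX_NUMNODES)
		return NUMA_NO_NODE;

	if (nid_map[firmware_gid] == NUMA_NO_NODE) {
		spin_lock(&node_id_lock);
		/* recheck with lock held */
		if (nid_map[firmware_gid] == NUMA_NO_NODE &&
		    last_nid < MAX_NUMNODES)
			nid_map[firmware_gid] = last_nid++;
		spin_unlock(&node_id_lock);
	}

	return nid_map[firmware_gid];

With this, a firmware group id past the end of nid_map and any domain seen
after MAX_NUMNODES nodes have been allocated both degrade to NUMA_NO_NODE
instead of corrupting the map.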

What's your plan to handle the node distances?
Currently the node distances we compute from the device tree properties are
based on the distance from node 0. If a different node gets renumbered as
node 0, how do you plan to remap the node distances?

> -aneesh

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id
  2020-08-09 18:40         ` Aneesh Kumar K.V
@ 2020-08-13 22:53           ` Nathan Lynch
  0 siblings, 0 replies; 14+ messages in thread
From: Nathan Lynch @ 2020-08-13 22:53 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, Srikar Dronamraju

Hi Aneesh,

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>> On 8/8/20 2:15 AM, Nathan Lynch wrote:
>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>> On 8/7/20 9:54 AM, Nathan Lynch wrote:
>>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>> index e437a9ac4956..6c659aada55b 100644
>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>> @@ -221,25 +221,51 @@ static void initialize_distance_lookup_table(int nid,
>>>>>>    	}
>>>>>>    }
>>>>>>    
>>>>>> +static u32 nid_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  NUMA_NO_NODE};
>>>>>
>>>>> It's odd to me to use MAX_NUMNODES for this array when it's going to be
>>>>> indexed not by Linux's logical node IDs but by the platform-provided
>>>>> domain number, which has no relation to MAX_NUMNODES.
>>>>
>>>>
>>>> I didn't want to dynamically allocate this. We could fetch
>>>> "ibm,max-associativity-domains" to find the size for that. The current
>>>> code does assume the firmware group id will not exceed MAX_NUMNODES,
>>>> hence the array size of MAX_NUMNODES. I do agree that it is confusing.
>>>> Maybe we can do #define MAX_AFFINITY_DOMAIN MAX_NUMNODES?
>>> 
>>> Well, consider:
>>> 
>>> - ibm,max-associativity-domains can change at runtime with LPM. This
>>>    doesn't happen in practice yet, but we should probably start thinking
>>>    about how to support that.
>>> - The domain numbering isn't clearly specified to have any particular
>>>    properties such as beginning at zero or a contiguous range.
>>> 
>>> While the current code likely contains assumptions contrary to these
>>> points, a change such as this is an opportunity to think about whether
>>> those assumptions can be reduced or removed. In particular I think it
>>> would be good to gracefully degrade when the number of NUMA affinity
>>> domains can exceed MAX_NUMNODES. Using the platform-supplied domain
>>> numbers to directly index Linux data structures will make that
>>> impossible.
>>> 
>>> So, maybe genradix or even xarray wouldn't actually be overengineering
>>> here.
>>> 
>>
>> One of the challenges with such a data structure is that we initialize
>> the nid_map before the slab allocator is available. This means a
>> memblock-based allocation, and we would end up implementing such a sparse
>> data structure ourselves here.

Yes, good point.
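
As an illustration of that constraint, a rough sketch of what a
memblock-backed nid_map sized from the device tree might look like.
Everything here is hypothetical: the property index mirrors what
find_possible_nodes() does today, and none of it is code from the posted
patch.

static int *nid_map;
static int max_domain_ids;

static void __init alloc_nid_map(void)
{
	struct device_node *rtas;
	u32 count = MAX_NUMNODES;
	int i;

	rtas = of_find_node_by_path("/rtas");
	if (rtas) {
		/*
		 * Assumption: as in find_possible_nodes(), the entry at
		 * min_common_depth is the maximum number of affinity
		 * domains at the NUMA level.
		 */
		of_property_read_u32_index(rtas,
				"ibm,max-associativity-domains",
				min_common_depth, &count);
		of_node_put(rtas);
	}

	/* Too early for slab, so the array has to come from memblock. */
	nid_map = memblock_alloc(count * sizeof(*nid_map), SMP_CACHE_BYTES);
	if (!nid_map)
		panic("%s: failed to allocate nid_map\n", __func__);

	max_domain_ids = count;
	for (i = 0; i < max_domain_ids; i++)
		nid_map[i] = NUMA_NO_NODE;
}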


>> As you mentioned above, since we know that the hypervisor as of now limits
>> the max affinity domain id to below ibm,max-associativity-domains, we are
>> good with the array-like nid_map we have here. This keeps the code simpler.
>>
>> This will also allow us to switch to a sparser data structure in the
>> future, as you requested here, because the main change pushed in this
>> series is the usage of firmware_group_id_to_nid(). The details of the data
>> structure we use to keep track of that mapping are pretty much internal to
>> that function.
>
> How about this? It is no longer a direct index, but it does limit the
> search to the maximum numa node on the system.
>
> static int domain_id_map[MAX_NUMNODES] = {[0 ... MAX_NUMNODES - 1] =  -1 };
>
> static int __affinity_domain_to_nid(int domain_id, int max_nid)
> {
> 	int i;
>
> 	for (i = 0; i < max_nid; i++) {
> 		if (domain_id_map[i] == domain_id)
> 			return i;
> 	}
> 	return NUMA_NO_NODE;
> }

OK, this indexes the array by Linux's node id, good. I was wondering if
I could persuade you to flip it around like this :-)

Walking through the code below:

> int affinity_domain_to_nid(struct affinity_domain *domain)
> {
> 	int nid, domain_id;
> 	static int last_nid = 0;
> 	static DEFINE_SPINLOCK(node_id_lock);
>
> 	domain_id = domain->id;
> 	/*
> 	 * For PowerNV we don't change the node id. This helps to avoid
> 	 * confusion w.r.t the expected node ids. On pseries, node numbers
> 	 * are virtualized. Hence do logical node id for pseries.
> 	 */
> 	if (!firmware_has_feature(FW_FEATURE_LPAR))
> 		return domain_id;
>
> 	if (domain_id ==  -1 || last_nid == MAX_NUMNODES)
> 		return NUMA_NO_NODE;
>
> 	nid = __affinity_domain_to_nid(domain_id, last_nid);

So this is the pseries fast path: attempt to look up the Linux node for
the given domain, where last_nid is one past the highest-numbered node in
use so far. If the lookup succeeds we're done.

>
> 	if (nid == NUMA_NO_NODE) {
> 		spin_lock(&node_id_lock);

If the lookup fails, enter the critical section. As we discussed offline,
this is a precaution for potentially parallel device probing.

> 		/*  recheck with lock held */
> 		nid = __affinity_domain_to_nid(domain_id, last_nid);

Attempt the same lookup again. If it now succeeds, another thread has
just initialized the mapping for this domain and we're done.

> 		if (nid == NUMA_NO_NODE) {
> 			nid = last_nid++;
> 			domain_id_map[nid] = domain_id;
> 		}

If the lookup fails again, "allocate" the next unused Linux node
number. Otherwise use the result returned by the second call to
__affinity_domain_to_nid().

> 		spin_unlock(&node_id_lock);
> 	}
>
> 	return nid;
> }

Generally I agree with this approach. I don't quite get the locking. I
understand there could be a need for a lockless fast path, but as
written I don't think last_nid is appropriately protected - two
slow-path threads could cause an increment to be "lost" since last_nid
is loaded before taking the lock.
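
For illustration, one way to keep every read and update of last_nid under
the lock, at the cost of giving up the lockless fast path. This is a sketch
of a possible fix only, not something proposed in the thread:

int affinity_domain_to_nid(struct affinity_domain *domain)
{
	static int last_nid;
	static DEFINE_SPINLOCK(node_id_lock);
	int domain_id = domain->id;
	int nid;

	if (!firmware_has_feature(FW_FEATURE_LPAR))
		return domain_id;

	if (domain_id == -1)
		return NUMA_NO_NODE;

	spin_lock(&node_id_lock);
	/* Both the search bound and the increment now happen under the lock. */
	nid = __affinity_domain_to_nid(domain_id, last_nid);
	if (nid == NUMA_NO_NODE && last_nid < MAX_NUMNODES) {
		nid = last_nid++;
		domain_id_map[nid] = domain_id;
	}
	spin_unlock(&node_id_lock);

	return nid;
}

A lockless fast path could be layered back on top of this, but it would
need the domain_id_map store and the last_nid publication to be ordered
(for example with smp_store_release()/smp_load_acquire()) so a reader never
sees a slot that has not been filled in yet.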

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-08-13 22:56 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-31 11:19 [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Aneesh Kumar K.V
2020-07-31 11:22 ` [RFC PATCH 2/2] powerpc/powernv/cpufreq: Don't assume chip id is same as Linux node id Aneesh Kumar K.V
2020-08-04  7:47   ` Gautham R Shenoy
2020-08-01  5:20 ` [RFC PATCH 1/2] powerpc/numa: Introduce logical numa id Srikar Dronamraju
2020-08-02 14:21   ` Aneesh Kumar K.V
2020-08-04  7:25     ` Srikar Dronamraju
2020-08-06 10:44       ` Aneesh Kumar K.V
2020-08-10  8:05         ` Srikar Dronamraju
2020-08-07  4:24 ` Nathan Lynch
2020-08-07  5:02   ` Aneesh Kumar K.V
2020-08-07 20:45     ` Nathan Lynch
2020-08-09 14:12       ` Aneesh Kumar K.V
2020-08-09 18:40         ` Aneesh Kumar K.V
2020-08-13 22:53           ` Nathan Lynch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).