All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 0/8] Add support for FORM2 associativity
@ 2021-06-14 16:39 Aneesh Kumar K.V
  2021-06-14 16:39 ` [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
                   ` (8 more replies)
  0 siblings, 9 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:39 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. This also allows PAPR SCM device
to use better associativity when using the device as DAX KMEM
device. More details can be found in patch x

$ ndctl list -N -v
[
  {
    "dev":"namespace0.0",
    "mode":"devdax",
    "map":"dev",
    "size":1071644672,
    "uuid":"37dea198-ddb5-4e42-915a-99a915e24188",
    "raw_uuid":"148deeaa-4a2f-41d1-8d74-fd9a942d26ba",
    "daxregion":{
      "id":0,
      "size":1071644672,
      "devices":[
        {
          "chardev":"dax0.0",
          "size":1071644672,
          "target_node":4,
          "mode":"devdax"
        }
      ]
    },
    "align":2097152,
    "numa_node":1
  }
]

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  222  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$

After DAX KMEM
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  22  33  255 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  255 
  4:  255  255  255  255  10 
# 

The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=22 -numa dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=255 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=255 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=255 \
-object memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}  \
-device nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4,persistent-nodeid=1



Aneesh Kumar K.V (8):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: rename distance_ref_points_depth to max_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate DLPAR NUMA distance update
  powerpc/pseries: Consolidate NUMA distance update during boot
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity
  powerpc/papr_scm: Use FORM2 associativity details

 Documentation/powerpc/associativity.rst       | 139 ++++++
 arch/powerpc/include/asm/firmware.h           |   7 +-
 arch/powerpc/include/asm/prom.h               |   3 +-
 arch/powerpc/kernel/prom_init.c               |   3 +-
 arch/powerpc/mm/numa.c                        | 436 ++++++++++++++----
 arch/powerpc/platforms/pseries/firmware.c     |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c        |   2 +
 arch/powerpc/platforms/pseries/papr_scm.c     |  26 +-
 arch/powerpc/platforms/pseries/pseries.h      |   2 +
 10 files changed, 522 insertions(+), 101 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1


^ permalink raw reply	[flat|nested] 38+ messages in thread

* [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
@ 2021-06-14 16:39 ` Aneesh Kumar K.V
  2021-06-15  3:00   ` David Gibson
  2021-06-14 16:39 ` [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index Aneesh Kumar K.V
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:39 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
 	if (!numa_enabled)
 		goto out;
 
-	if (of_read_number(associativity, 1) >= min_common_depth)
-		nid = of_read_number(&associativity[min_common_depth], 1);
+	if (of_read_number(associativity, 1) >= primary_domain_index)
+		nid = of_read_number(&associativity[primary_domain_index], 1);
 
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-	int depth;
+	int index;
 	struct device_node *root;
 
 	if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
 	}
 
 	if (form1_affinity) {
-		depth = of_read_number(distance_ref_points, 1);
+		index = of_read_number(distance_ref_points, 1);
 	} else {
 		if (distance_ref_points_depth < 2) {
 			printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
 			goto err;
 		}
 
-		depth = of_read_number(&distance_ref_points[1], 1);
+		index = of_read_number(&distance_ref_points[1], 1);
 	}
 
 	/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
 	}
 
 	of_node_put(root);
-	return depth;
+	return index;
 
 err:
 	of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 	int nid = default_nid;
 	int rc, index;
 
-	if ((min_common_depth < 0) || !numa_enabled)
+	if ((primary_domain_index < 0) || !numa_enabled)
 		return default_nid;
 
 	rc = of_get_assoc_arrays(&aa);
 	if (rc)
 		return default_nid;
 
-	if (min_common_depth <= aa.array_sz &&
+	if (primary_domain_index <= aa.array_sz &&
 	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
-		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
 		nid = of_read_number(&aa.arrays[index], 1);
 
 		if (nid == 0xffff || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
 		return -1;
 	}
 
-	min_common_depth = find_min_common_depth();
+	primary_domain_index = find_primary_domain_index();
 
-	if (min_common_depth < 0) {
+	if (primary_domain_index < 0) {
 		/*
-		 * if we fail to parse min_common_depth from device tree
+		 * if we fail to parse primary_domain_index from device tree
 		 * mark the numa disabled, boot with numa disabled.
 		 */
 		numa_enabled = false;
-		return min_common_depth;
+		return primary_domain_index;
 	}
 
-	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+	dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
 
 	/*
 	 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
 			goto out;
 	}
 
-	max_nodes = of_read_number(&domains[min_common_depth], 1);
+	max_nodes = of_read_number(&domains[primary_domain_index], 1);
 	for (i = 0; i < max_nodes; i++) {
 		if (!node_possible(i))
 			node_set(i, node_possible_map);
 	}
 
 	prop_length /= sizeof(int);
-	if (prop_length > min_common_depth + 2)
+	if (prop_length > primary_domain_index + 2)
 		coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
 		goto out;
 
 	index = of_read_number(associativity, 1);
-	if (index > min_common_depth + 1)
+	if (index > primary_domain_index + 1)
 		return of_read_number(&associativity[index - 1], 1);
 
 out:
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
  2021-06-14 16:39 ` [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
@ 2021-06-14 16:39 ` Aneesh Kumar K.V
  2021-06-15  3:01   ` David Gibson
  2021-06-14 16:39 ` [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Aneesh Kumar K.V
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:39 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

No functional change in this patch

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..5941da201fa3 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+static int max_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
 	int i, index;
 
-	for (i = 0; i < distance_ref_points_depth; i++) {
+	for (i = 0; i < max_domain_index; i++) {
 		index = be32_to_cpu(distance_ref_points[i]);
 		if (cpu1_assoc[index] == cpu2_assoc[index])
 			break;
@@ -193,7 +193,7 @@ int __node_distance(int a, int b)
 	if (!form1_affinity)
 		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-	for (i = 0; i < distance_ref_points_depth; i++) {
+	for (i = 0; i < max_domain_index; i++) {
 		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
 			break;
 
@@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 	if (!form1_affinity)
 		return;
 
-	for (i = 0; i < distance_ref_points_depth; i++) {
+	for (i = 0; i < max_domain_index; i++) {
 		const __be32 *entry;
 
 		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
@@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
 		nid = NUMA_NO_NODE;
 
 	if (nid > 0 &&
-		of_read_number(associativity, 1) >= distance_ref_points_depth) {
+		of_read_number(associativity, 1) >= max_domain_index) {
 		/*
 		 * Skip the length field and send start of associativity array
 		 */
@@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
 	 */
 	distance_ref_points = of_get_property(root,
 					"ibm,associativity-reference-points",
-					&distance_ref_points_depth);
+					&max_domain_index);
 
 	if (!distance_ref_points) {
 		dbg("NUMA: ibm,associativity-reference-points not found.\n");
 		goto err;
 	}
 
-	distance_ref_points_depth /= sizeof(int);
+	max_domain_index /= sizeof(int);
 
 	if (firmware_has_feature(FW_FEATURE_OPAL) ||
 	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
@@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
 	if (form1_affinity) {
 		index = of_read_number(distance_ref_points, 1);
 	} else {
-		if (distance_ref_points_depth < 2) {
+		if (max_domain_index < 2) {
 			printk(KERN_WARNING "NUMA: "
 				"short ibm,associativity-reference-points\n");
 			goto err;
@@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
 	 * Warn and cap if the hardware supports more than
 	 * MAX_DISTANCE_REF_POINTS domains.
 	 */
-	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+	if (max_domain_index > MAX_DISTANCE_REF_POINTS) {
 		printk(KERN_WARNING "NUMA: distance array capped at "
 			"%d entries\n", MAX_DISTANCE_REF_POINTS);
-		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+		max_domain_index = MAX_DISTANCE_REF_POINTS;
 	}
 
 	of_node_put(root);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
  2021-06-14 16:39 ` [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
  2021-06-14 16:39 ` [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index Aneesh Kumar K.V
@ 2021-06-14 16:39 ` Aneesh Kumar K.V
  2021-06-15  3:04   ` David Gibson
  2021-06-14 16:39 ` [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update Aneesh Kumar K.V
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:39 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

Also make related code cleanup that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/firmware.h       |  4 +--
 arch/powerpc/include/asm/prom.h           |  2 +-
 arch/powerpc/kernel/prom_init.c           |  2 +-
 arch/powerpc/mm/numa.c                    | 35 ++++++++++++++---------
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPAL		ASM_CONST(0x0000000010000000)
 #define FW_FEATURE_SET_MODE	ASM_CONST(0x0000000040000000)
 #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000)
 #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
 #define FW_FEATURE_DRMEM_V2	ASM_CONST(0x0000000400000000)
 #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
@@ -69,7 +69,7 @@ enum {
 		FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
 		FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
 		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+		FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
 		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
 		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI			0x0201	/* PCIe/MSI support */
 #define OV5_CMO			0x0480	/* Cooperative Memory Overcommitment */
 #define OV5_XCMO		0x0440	/* Page Coalescing */
-#define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
 #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
 #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 41ed7e33d897..64b9593038a7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
 #else
 		0,
 #endif
-		.associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
+		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
 		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
 		.micro_checkpoint = 0,
 		.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5941da201fa3..192067991f8a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_domain_index;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
 	int i;
 	int distance = LOCAL_DISTANCE;
 
-	if (!form1_affinity)
+	if (affinity_form == FORM0_AFFINITY)
 		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
 	for (i = 0; i < max_domain_index; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
 	int i;
 
-	if (!form1_affinity)
+	if (affinity_form != FORM1_AFFINITY)
 		return;
 
 	for (i = 0; i < max_domain_index; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
 	int index;
 	struct device_node *root;
 
+	/*
+	 * Check for which form of affinity.
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPAL)) {
+		affinity_form = FORM1_AFFINITY;
+	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+		dbg("Using form 1 affinity\n");
+		affinity_form = FORM1_AFFINITY;
+	} else
+		affinity_form = FORM0_AFFINITY;
+
 	if (firmware_has_feature(FW_FEATURE_OPAL))
 		root = of_find_node_by_path("/ibm,opal");
 	else
@@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void)
 	}
 
 	max_domain_index /= sizeof(int);
-
-	if (firmware_has_feature(FW_FEATURE_OPAL) ||
-	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
-		dbg("Using form 1 affinity\n");
-		form1_affinity = 1;
-	}
-
-	if (form1_affinity) {
-		index = of_read_number(distance_ref_points, 1);
-	} else {
+	if (affinity_form == FORM0_AFFINITY) {
 		if (max_domain_index < 2) {
 			printk(KERN_WARNING "NUMA: "
-				"short ibm,associativity-reference-points\n");
+			       "short ibm,associativity-reference-points\n");
 			goto err;
 		}
 
 		index = of_read_number(&distance_ref_points[1], 1);
+	} else {
+		index = of_read_number(distance_ref_points, 1);
 	}
 
 	/*
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 4c7b7f5a2ebc..5d4c2bc20bba 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -119,7 +119,7 @@ struct vec5_fw_feature {
 
 static __initdata struct vec5_fw_feature
 vec5_fw_features_table[] = {
-	{FW_FEATURE_TYPE1_AFFINITY,	OV5_TYPE1_AFFINITY},
+	{FW_FEATURE_FORM1_AFFINITY,	OV5_FORM1_AFFINITY},
 	{FW_FEATURE_PRRN,		OV5_PRRN},
 	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
 	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (2 preceding siblings ...)
  2021-06-14 16:39 ` [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Aneesh Kumar K.V
@ 2021-06-14 16:39 ` Aneesh Kumar K.V
  2021-06-15  3:13   ` David Gibson
  2021-06-14 16:40 ` [RFC PATCH 5/8] powerpc/pseries: Consolidate NUMA distance update during boot Aneesh Kumar K.V
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:39 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

The associativity details of the newly added resourced are collected from
the hypervisor via "ibm,configure-connector" rtas call. Update the numa
distance details of the newly added numa node after the above call. In
later patch we will remove updating NUMA distance when we are looking
for node id from associativity array.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c                        | 41 +++++++++++++++++++
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
 .../platforms/pseries/hotplug-memory.c        |  2 +
 arch/powerpc/platforms/pseries/pseries.h      |  1 +
 4 files changed, 46 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 192067991f8a..fec47981c1ef 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+	int i, nid;
+
+	if (of_read_number(associativity, 1) >= primary_domain_index) {
+		nid = of_read_number(&associativity[primary_domain_index], 1);
+
+		for (i = 0; i < max_domain_index; i++) {
+			const __be32 *entry;
+
+			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+			distance_lookup_table[nid][i] = of_read_number(entry, 1);
+		}
+	}
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+	const __be32 *associativity;
+
+	associativity = of_get_associativity(node);
+	if (!associativity)
+		return;
+
+	__initialize_form1_numa_distance(associativity);
+	return;
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+	if (affinity_form == FORM0_AFFINITY)
+		return;
+	else if (affinity_form == FORM1_AFFINITY) {
+		initialize_form1_numa_distance(node);
+		return;
+	}
+}
+
 static int __init find_primary_domain_index(void)
 {
 	int index;
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 7e970f81d8ff..778b6ab35f0d 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
 		return saved_rc;
 	}
 
+	update_numa_distance(dn);
+
 	rc = dlpar_online_cpu(dn);
 	if (rc) {
 		saved_rc = rc;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..0e602c3b01ea 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
 		return -ENODEV;
 	}
 
+	update_numa_distance(lmb_node);
+
 	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (!dr_node) {
 		dlpar_free_cc_nodes(lmb_node);
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..663a0859cf13 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
 void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
+void update_numa_distance(struct device_node *node);
 #endif /* _PSERIES_PSERIES_H */
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 5/8] powerpc/pseries: Consolidate NUMA distance update during boot
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (3 preceding siblings ...)
  2021-06-14 16:39 ` [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update Aneesh Kumar K.V
@ 2021-06-14 16:40 ` Aneesh Kumar K.V
  2021-06-14 16:40 ` [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance Aneesh Kumar K.V
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:40 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

Instead of updating NUMA distance every time we lookup a node id
from the associativity property, add helpers that can be used
during boot which does this only once. Also remove the distance
update from node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c | 133 +++++++++++++++++++++++++++--------------
 1 file changed, 87 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index fec47981c1ef..64caaf07cf82 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-		const __be32 *associativity)
-{
-	int i;
-
-	if (affinity_form != FORM1_AFFINITY)
-		return;
-
-	for (i = 0; i < max_domain_index; i++) {
-		const __be32 *entry;
-
-		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-		distance_lookup_table[nid][i] = of_read_number(entry, 1);
-	}
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= nr_node_ids)
 		nid = NUMA_NO_NODE;
-
-	if (nid > 0 &&
-		of_read_number(associativity, 1) >= max_domain_index) {
-		/*
-		 * Skip the length field and send start of associativity array
-		 */
-		initialize_distance_lookup_table(nid, associativity + 1);
-	}
-
 out:
 	return nid;
 }
@@ -291,6 +266,9 @@ static void __initialize_form1_numa_distance(const __be32 *associativity)
 {
 	int i, nid;
 
+	if (affinity_form != FORM1_AFFINITY)
+		return;
+
 	if (of_read_number(associativity, 1) >= primary_domain_index) {
 		nid = of_read_number(&associativity[primary_domain_index], 1);
 
@@ -474,6 +452,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
 	return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+	struct assoc_arrays aa = { .arrays = NULL };
+	int default_nid = NUMA_NO_NODE;
+	int nid = default_nid;
+	int rc, index;
+
+	if ((primary_domain_index < 0) || !numa_enabled)
+		return default_nid;
+
+	rc = of_get_assoc_arrays(&aa);
+	if (rc)
+		return default_nid;
+
+	if (primary_domain_index <= aa.array_sz &&
+	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+		nid = of_read_number(&aa.arrays[index], 1);
+
+		if (nid == 0xffff || nid >= nr_node_ids)
+			nid = default_nid;
+		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+			int i;
+			const __be32 *associativity;
+
+			index = lmb->aa_index * aa.array_sz;
+			associativity = &aa.arrays[index];
+			/*
+			 * lookup array associativity entries have different format
+			 * There is no length of the array as the first element.
+			 */
+			for (i = 0; i < max_domain_index; i++) {
+				const __be32 *entry;
+
+				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
+				distance_lookup_table[nid][i] = of_read_number(entry, 1);
+			}
+		}
+	}
+	return nid;
+}
+
 /*
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
@@ -499,21 +519,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 
 		if (nid == 0xffff || nid >= nr_node_ids)
 			nid = default_nid;
-
-		if (nid > 0) {
-			index = lmb->aa_index * aa.array_sz;
-			initialize_distance_lookup_table(nid,
-							&aa.arrays[index]);
-		}
 	}
-
 	return nid;
 }
 
 #ifdef CONFIG_PPC_SPLPAR
-static int vphn_get_nid(long lcpu)
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
 {
-	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
 	long rc, hwid;
 
 	/*
@@ -533,10 +546,22 @@ static int vphn_get_nid(long lcpu)
 
 		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
 		if (rc == H_SUCCESS)
-			return associativity_to_nid(associativity);
+			return 0;
 	}
 
+	return -1;
+}
+
+static int vphn_get_nid(long lcpu)
+{
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+
+	if (!__vphn_get_associativity(lcpu, associativity))
+		return associativity_to_nid(associativity);
+
 	return NUMA_NO_NODE;
+
 }
 #else
 static int vphn_get_nid(long unused)
@@ -733,7 +758,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
 			size = read_n_cells(n_mem_size_cells, usm);
 		}
 
-		nid = of_drconf_to_nid_single(lmb);
+		nid = get_nid_and_numa_distance(lmb);
 		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
 					  &nid);
 		node_set_online(nid);
@@ -750,6 +775,7 @@ static int __init parse_numa_properties(void)
 	struct device_node *memory;
 	int default_nid = 0;
 	unsigned long i;
+	const __be32 *associativity;
 
 	if (numa_enabled == 0) {
 		printk(KERN_WARNING "NUMA disabled by user\n");
@@ -775,18 +801,30 @@ static int __init parse_numa_properties(void)
 	 * each node to be onlined must have NODE_DATA etc backing it.
 	 */
 	for_each_present_cpu(i) {
+		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
 		struct device_node *cpu;
-		int nid = vphn_get_nid(i);
+		int nid = NUMA_NO_NODE;
 
-		/*
-		 * Don't fall back to default_nid yet -- we will plug
-		 * cpus into nodes once the memory scan has discovered
-		 * the topology.
-		 */
-		if (nid == NUMA_NO_NODE) {
+		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
+
+		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+			nid = associativity_to_nid(vphn_assoc);
+			__initialize_form1_numa_distance(vphn_assoc);
+		} else {
+
+			/*
+			 * Don't fall back to default_nid yet -- we will plug
+			 * cpus into nodes once the memory scan has discovered
+			 * the topology.
+			 */
 			cpu = of_get_cpu_node(i, NULL);
 			BUG_ON(!cpu);
-			nid = of_node_to_nid_single(cpu);
+
+			associativity = of_get_associativity(cpu);
+			if (associativity) {
+				nid = associativity_to_nid(associativity);
+				__initialize_form1_numa_distance(associativity);
+			}
 			of_node_put(cpu);
 		}
 
@@ -822,8 +860,11 @@ static int __init parse_numa_properties(void)
 		 * have associativity properties.  If none, then
 		 * everything goes to default_nid.
 		 */
-		nid = of_node_to_nid_single(memory);
-		if (nid < 0)
+		associativity = of_get_associativity(memory);
+		if (associativity) {
+			nid = associativity_to_nid(associativity);
+			__initialize_form1_numa_distance(associativity);
+		} else
 			nid = default_nid;
 
 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (4 preceding siblings ...)
  2021-06-14 16:40 ` [RFC PATCH 5/8] powerpc/pseries: Consolidate NUMA distance update during boot Aneesh Kumar K.V
@ 2021-06-14 16:40 ` Aneesh Kumar K.V
  2021-06-15  3:21   ` David Gibson
  2021-06-14 16:40 ` [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:40 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

This helper is only used with the dispatch trace log collection.
A later patch will add Form2 affinity support and this change helps
in keeping that simpler. Also add a comment explaining we don't expect
the code to be called with FORM0

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 64caaf07cf82..696e5bfe1414 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
 	int dist = 0;
 
@@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 	return dist;
 }
 
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	/* We should not get called with FORM0 */
+	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+
+	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (5 preceding siblings ...)
  2021-06-14 16:40 ` [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance Aneesh Kumar K.V
@ 2021-06-14 16:40 ` Aneesh Kumar K.V
  2021-06-15  3:53   ` David Gibson
  2021-06-14 16:40 ` [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details Aneesh Kumar K.V
  2021-06-15  1:47 ` [RFC PATCH 0/8] Add support for FORM2 associativity Daniel Henrique Barboza
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:40 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=n, Size: 17240 bytes --]

Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
 arch/powerpc/include/asm/firmware.h       |   3 +-
 arch/powerpc/include/asm/prom.h           |   1 +
 arch/powerpc/kernel/prom_init.c           |   3 +-
 arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
 arch/powerpc/platforms/pseries/firmware.c |   1 +
 6 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
new file mode 100644
index 000000000000..58abedea81d7
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,139 @@
+============================
+NUMA resource associativity
+=============================
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resources subsets of a given domain that exhibit better
+performance relative to each other than relative to other resources subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux kernel.
+From the platform view, these groups are also referred to as domains.
+
+PAPR interface currently supports two different ways of communicating these resource
+grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
+Form 0 is the older format and is now considered deprecated.
+
+Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity.
+
+Form 0
+-----
+Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
+
+Form 1
+-----
+With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
+device tree properties are used to determine the NUMA distance between resource groups/domains. 
+
+The “ibm,associativity” property contains one or more lists of numbers (domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains one or more list of numbers
+(domain index) that represents the 1 based ordinal in the associativity lists of the most
+significant boundary, with subsequent entries indicating progressively less significant boundaries.
+
+Linux kernel uses the domain id of the most significant boundary (aka primary domain)
+as the NUMA node id. Linux kernel computes NUMA distance between two domains by
+recursively comparing if they belong to the same higher-level domains. For mismatch
+at every higher level of the resource group, the kernel doubles the NUMA distance between
+the comparing domains.
+
+Form 2
+-------
+Form 2 associativity format adds separate device tree properties representing NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows flexible primary
+domain numbering. With numa distance computation now detached from the index value of
+"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
+same domain index representing resource groups of different performance/latency characteristics.
+
+Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains one or more list numbers representing
+the domainIDs present in the system. The offset of the domainID in this property is considered
+the domainID index.
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
+N domainID encoded as with encode-int
+
+For ex:
+ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
+
+"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
+N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
+
+For ex:
+ibm,numa-lookup-index-table =  {3, 0, 8, 40}
+ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
+
+  | 0    8   40
+--|------------
+  |
+0 | 10   20  80
+  |
+8 | 20   10  160
+  |
+40| 80   160  10
+
+With Form2 "ibm,associativity" for resources is listed as below:
+
+"ibm,associativity" property for resources in node 0, 8 and 40
+{ 4, 6, 7, 0, 0}
+{ 4, 6, 9, 8, 8}
+{ 4, 6, 7, 0, 40}
+
+With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
+
+With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
+the kernel should use when using persistent memory devices. Persistent memory devices
+can also be used as regular memory using DAX KMEM driver and primary domainID indicates
+the numa node number OS should use when using these devices as regular memory. Secondary
+domainID is the numa node number that should be used when using this device as
+persistent memory. In the later case, we are interested in the locality of the
+device to an established numa node. In the above example, if the last row represents a
+persistent memory device/resource, NUMA node number 40 will be used when using the device
+as regular memory and NUMA node number 0 will be the device numa node when using it as
+a persistent memory device.
+
+Each resource (drcIndex) now also supports additional optional device tree properties.
+These properties are marked optional because the platform can choose not to export
+them and provide the system topology details using the earlier defined device tree
+properties alone. The optional device tree properties are used when adding new resources
+(DLPAR) and when the platform didn't provide the topology details of the domain which
+contains the newly added resource during boot.
+
+"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
+when building the NUMA distance of the numa node to which this resource belongs. The domain id
+of the new resource can be obtained from the existing "ibm,associativity" property. This
+can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
+The value is 1 based array index value.
+
+prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
+
+"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
+from this resource domain to other resources.
+
+prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
+N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
+
+For ex:
+ibm,associativity     = { 4, 5, 6, 7, 50}
+ibm,numa-lookup-index = { 4 }
+ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
+
+resulting in a new toplogy as below.
+  | 0    8   40   50
+--|------------------
+  |
+0 | 10   20  80   160
+  |
+8 | 20   10  160  320
+  |
+40| 80   160  10  80
+  |
+50| 160  320  80  10
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 60b631161360..97a3bd9ffeb9 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -53,6 +53,7 @@
 #define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)
 #define FW_FEATURE_STUFF_TCE	ASM_CONST(0x0000008000000000)
 #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)
+#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -73,7 +74,7 @@ enum {
 		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
 		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
-		FW_FEATURE_RPT_INVALIDATE,
+		FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
 	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index df9fec9d232c..5c80152e8f18 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_XCMO		0x0440	/* Page Coalescing */
 #define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
+#define OV5_FORM2_AFFINITY	0x0520	/* Form2 NUMA affinity */
 #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
 #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
 #define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 64b9593038a7..496fdac54c29 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1070,7 +1070,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
 #else
 		0,
 #endif
-		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
+		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) |
+		OV5_FEAT(OV5_FORM2_AFFINITY),
 		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
 		.micro_checkpoint = 0,
 		.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 696e5bfe1414..86cd2af014f7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells;
 
 #define FORM0_AFFINITY 0
 #define FORM1_AFFINITY 1
+#define FORM2_AFFINITY 2
 static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
+	[0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
+};
+static int numa_id_index_table[MAX_NUMNODES];
 
 /*
  * Allocate node_to_cpumask_map based on number of available nodes
@@ -166,6 +171,27 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
+/*
+ * With FORM2 if we are not using logical domain ids, we will find
+ * both primary and seconday domains with same value. Hence always
+ * start comparison from secondary domains
+ */
+static int __cpu_form2_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	int dist = 0;
+
+	int i, index;
+
+	for (i = 1; i < max_domain_index; i++) {
+		index = be32_to_cpu(distance_ref_points[i]);
+		if (cpu1_assoc[index] == cpu2_assoc[index])
+			break;
+		dist++;
+	}
+
+	return dist;
+}
+
 static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
 	int dist = 0;
@@ -178,7 +204,6 @@ static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 			break;
 		dist++;
 	}
-
 	return dist;
 }
 
@@ -186,8 +211,9 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
 	/* We should not get called with FORM0 */
 	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
-
-	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
+	if (affinity_form == FORM1_AFFINITY)
+		return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
+	return __cpu_form2_distance(cpu1_assoc, cpu2_assoc);
 }
 
 /* must hold reference to node during call */
@@ -201,7 +227,9 @@ int __node_distance(int a, int b)
 	int i;
 	int distance = LOCAL_DISTANCE;
 
-	if (affinity_form == FORM0_AFFINITY)
+	if (affinity_form == FORM2_AFFINITY)
+		return numa_distance_table[a][b];
+	else if (affinity_form == FORM0_AFFINITY)
 		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
 	for (i = 0; i < max_domain_index; i++) {
@@ -303,15 +331,116 @@ static void initialize_form1_numa_distance(struct device_node *node)
 
 /*
  * Used to update distance information w.r.t newly added node.
+ * ibm,numa-lookup-index -> 4
+ * ibm,numa-distance -> {5, 20, 40, 60, 80, 10 }
  */
 void update_numa_distance(struct device_node *node)
 {
+	int i, nid, other_nid, other_nid_index = 0;
+	const __be32 *numa_indexp;
+	const __u8  *numa_distancep;
+	int numa_index, max_numa_index, numa_distance;
+
 	if (affinity_form == FORM0_AFFINITY)
 		return;
 	else if (affinity_form == FORM1_AFFINITY) {
 		initialize_form1_numa_distance(node);
 		return;
 	}
+	/* FORM2 affinity  */
+
+	nid = of_node_to_nid_single(node);
+	if (nid == NUMA_NO_NODE)
+		return;
+
+	/* Already initialized */
+	if (numa_distance_table[nid][nid] != -1)
+		return;
+	/*
+	 * update node distance if not already populated.
+	 */
+	numa_distancep = of_get_property(node, "ibm,numa-distance", NULL);
+	if (!numa_distancep)
+		return;
+
+	numa_indexp = of_get_property(node, "ibm,numa-lookup-index", NULL);
+	if (!numa_indexp)
+		return;
+
+	numa_index = of_read_number(numa_indexp, 1);
+	/*
+	 * update the numa_id_index_table. Device tree look at index table as
+	 * 1 based array indexing.
+	 */
+	numa_id_index_table[numa_index - 1] = nid;
+
+	max_numa_index = of_read_number((const __be32 *)numa_distancep, 1);
+	VM_WARN_ON(max_numa_index != 2 * numa_index);
+	/* Skip the size which is encoded int */
+	numa_distancep += sizeof(__be32);
+
+	/*
+	 * First fill the distance information from other node to this node.
+	 */
+	other_nid_index = 0;
+	for (i = 0; i < numa_index; i++) {
+		numa_distance = numa_distancep[i];
+		other_nid = numa_id_index_table[other_nid_index++];
+		numa_distance_table[other_nid][nid] = numa_distance;
+	}
+
+	other_nid_index = 0;
+	for (; i < max_numa_index; i++) {
+		numa_distance = numa_distancep[i];
+		other_nid = numa_id_index_table[other_nid_index++];
+		numa_distance_table[nid][other_nid] = numa_distance;
+	}
+}
+
+/*
+ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
+ */
+static void initialize_form2_numa_distance_lookup_table(struct device_node *root)
+{
+	const __u8 *numa_dist_table;
+	const __be32 *numa_lookup_index;
+	int numa_dist_table_length;
+	int max_numa_index, distance_index;
+	int i, curr_row = 0, curr_column = 0;
+
+	numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
+	max_numa_index = of_read_number(&numa_lookup_index[0], 1);
+
+	/* first element of the array is the size and is encode-int */
+	numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
+	numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
+	/* Skip the size which is encoded int */
+	numa_dist_table += sizeof(__be32);
+
+	pr_debug(" numa_dist_table_len = %d, numa_dist_indexes_len = %d \n",
+		 numa_dist_table_length, max_numa_index);
+
+	for (i = 0; i < max_numa_index; i++)
+		/* +1 skip the max_numa_index in the property */
+		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
+
+
+	VM_WARN_ON(numa_dist_table_length != max_numa_index * max_numa_index);
+
+	for (distance_index = 0; distance_index < numa_dist_table_length; distance_index++) {
+		int nodeA = numa_id_index_table[curr_row];
+		int nodeB = numa_id_index_table[curr_column++];
+
+		numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index];
+
+		pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
+		if (curr_column >= max_numa_index) {
+			curr_row++;
+			/* reset the column */
+			curr_column = 0;
+		}
+	}
 }
 
 static int __init find_primary_domain_index(void)
@@ -324,6 +453,9 @@ static int __init find_primary_domain_index(void)
 	 */
 	if (firmware_has_feature(FW_FEATURE_OPAL)) {
 		affinity_form = FORM1_AFFINITY;
+	} else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
+		dbg("Using form 2 affinity\n");
+		affinity_form = FORM2_AFFINITY;
 	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
 		dbg("Using form 1 affinity\n");
 		affinity_form = FORM1_AFFINITY;
@@ -368,8 +500,17 @@ static int __init find_primary_domain_index(void)
 
 		index = of_read_number(&distance_ref_points[1], 1);
 	} else {
+		/*
+		 * Both FORM1 and FORM2 affinity find the primary domain details
+		 * at the same offset.
+		 */
 		index = of_read_number(distance_ref_points, 1);
 	}
+	/*
+	 * If it is FORM2 also initialize the distance table here.
+	 */
+	if (affinity_form == FORM2_AFFINITY)
+		initialize_form2_numa_distance_lookup_table(root);
 
 	/*
 	 * Warn and cap if the hardware supports more than
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 5d4c2bc20bba..f162156b7b68 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
 	{FW_FEATURE_PRRN,		OV5_PRRN},
 	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
 	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},
+	{FW_FEATURE_FORM2_AFFINITY,	OV5_FORM2_AFFINITY},
 };
 
 static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (6 preceding siblings ...)
  2021-06-14 16:40 ` [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity Aneesh Kumar K.V
@ 2021-06-14 16:40 ` Aneesh Kumar K.V
  2021-06-15  3:55   ` David Gibson
  2021-06-15  1:47 ` [RFC PATCH 0/8] Add support for FORM2 associativity Daniel Henrique Barboza
  8 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-14 16:40 UTC (permalink / raw)
  To: linuxppc-dev, mpe
  Cc: Nathan Lynch, Aneesh Kumar K.V, Daniel Henrique Barboza, David Gibson

FORM2 introduce a concept of secondary domain which is identical to the
conceept of FORM1 primary domain. Use secondary domain as the numa node
when using persistent memory device. For DAX kmem use the logical domain
id introduced in FORM2. This new numa node

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
 arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
 arch/powerpc/platforms/pseries/pseries.h  |  1 +
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 86cd2af014f7..b9ac6d02e944 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
 	return nid;
 }
 
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
+{
+	int secondary_index;
+	const __be32 *associativity;
+
+	if (!numa_enabled) {
+		*primary = NUMA_NO_NODE;
+		*secondary = NUMA_NO_NODE;
+		return 0;
+	}
+
+	associativity = of_get_associativity(node);
+	if (!associativity)
+		return -ENODEV;
+
+	if (of_read_number(associativity, 1) >= primary_domain_index) {
+		*primary = of_read_number(&associativity[primary_domain_index], 1);
+		secondary_index = of_read_number(&distance_ref_points[1], 1);
+		*secondary = of_read_number(&associativity[secondary_index], 1);
+	}
+	if (*primary == 0xffff || *primary >= nr_node_ids)
+		*primary = NUMA_NO_NODE;
+
+	if (*secondary == 0xffff || *secondary >= nr_node_ids)
+		*secondary = NUMA_NO_NODE;
+	return 0;
+}
+
 /* Returns the nid associated with the given device tree node,
  * or -1 if not found.
  */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index ef26fe40efb0..9bf2f1f3ddc5 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -18,6 +18,7 @@
 #include <asm/plpar_wrappers.h>
 #include <asm/papr_pdsm.h>
 #include <asm/mce.h>
+#include "pseries.h"
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
 struct papr_scm_priv {
 	struct platform_device *pdev;
 	struct device_node *dn;
+	int numa_node;
+	int target_node;
 	uint32_t drc_index;
 	uint64_t blocks;
 	uint64_t block_size;
@@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	struct nd_mapping_desc mapping;
 	struct nd_region_desc ndr_desc;
 	unsigned long dimm_flags;
-	int target_nid, online_nid;
 	ssize_t stat_size;
 
 	p->bus_desc.ndctl = papr_scm_ndctl;
@@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
 
 	memset(&ndr_desc, 0, sizeof(ndr_desc));
-	target_nid = dev_to_node(&p->pdev->dev);
-	online_nid = numa_map_to_online_node(target_nid);
-	ndr_desc.numa_node = online_nid;
-	ndr_desc.target_node = target_nid;
+	ndr_desc.numa_node = p->numa_node;
+	ndr_desc.target_node = p->target_node;
 	ndr_desc.res = &p->res;
 	ndr_desc.of_node = p->dn;
 	ndr_desc.provider_data = p;
@@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 				ndr_desc.res, p->dn);
 		goto err;
 	}
-	if (target_nid != online_nid)
-		dev_info(dev, "Region registered with target node %d and online node %d",
-			 target_nid, online_nid);
 
 	mutex_lock(&papr_ndr_lock);
 	list_add_tail(&p->region_list, &papr_nd_regions);
@@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
 	struct papr_scm_priv *p;
 	const char *uuid_str;
 	u64 uuid[2];
-	int rc;
+	int rc, numa_node;
 
 	/* check we have all the required DT properties */
 	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
@@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
 		return -ENODEV;
 	}
 
-
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
+	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
+		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
+		rc = -ENODEV;
+		goto err;
+	}
+	p->numa_node = numa_map_to_online_node(numa_node);
+	if (numa_node != p->numa_node)
+		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
+			 p->numa_node, numa_node);
+
 	/* Initialize the dimm mutex */
 	mutex_init(&p->health_mutex);
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 663a0859cf13..9c2a1fc9ded1 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
 
 void update_numa_distance(struct device_node *node);
+int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
 #endif /* _PSERIES_PSERIES_H */
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 0/8] Add support for FORM2 associativity
  2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
                   ` (7 preceding siblings ...)
  2021-06-14 16:40 ` [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details Aneesh Kumar K.V
@ 2021-06-15  1:47 ` Daniel Henrique Barboza
  8 siblings, 0 replies; 38+ messages in thread
From: Daniel Henrique Barboza @ 2021-06-15  1:47 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe; +Cc: Nathan Lynch, David Gibson



On 6/14/21 1:39 PM, Aneesh Kumar K.V wrote:
> Form2 associativity adds a much more flexible NUMA topology layout
> than what is provided by Form1. This also allows PAPR SCM device
> to use better associativity when using the device as DAX KMEM
> device. More details can be found in patch x
> 
> $ ndctl list -N -v
> [
>    {
>      "dev":"namespace0.0",
>      "mode":"devdax",
>      "map":"dev",
>      "size":1071644672,
>      "uuid":"37dea198-ddb5-4e42-915a-99a915e24188",
>      "raw_uuid":"148deeaa-4a2f-41d1-8d74-fd9a942d26ba",
>      "daxregion":{
>        "id":0,
>        "size":1071644672,
>        "devices":[
>          {
>            "chardev":"dax0.0",
>            "size":1071644672,
>            "target_node":4,
>            "mode":"devdax"
>          }
>        ]
>      },
>      "align":2097152,
>      "numa_node":1
>    }
> ]
> 
> $ numactl -H
> ...
> node distances:
> node   0   1   2   3
>    0:  10  11  222  33
>    1:  44  10  55  66
>    2:  77  88  10  99
>    3:  101  121  132  10
> $
> 
> After DAX KMEM
> # numactl -H
> available: 5 nodes (0-4)
> ...
> node distances:
> node   0   1   2   3   4
>    0:  10  11  22  33  255
>    1:  44  10  55  66  255
>    2:  77  88  10  99  255
>    3:  101  121  132  10  255
>    4:  255  255  255  255  10
> #
> 
> The above output is with a Qemu command line


For reference, this QEMU:


https://github.com/danielhb/qemu/tree/form2_affinity_v1

https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg03617.html


but ...

> 
> -numa node,nodeid=4 \
> -numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=22 -numa dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=255 \
> -numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
> -numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
> -numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=255 \
> -numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=255 \
> -object memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}  \
> -device nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4,persistent-nodeid=1


with 'device-node=1' instead of 'persistent=nodeid=1' in the nvdimm parameter
up here.


> 
> 
> 
> Aneesh Kumar K.V (8):
>    powerpc/pseries: rename min_common_depth to primary_domain_index
>    powerpc/pseries: rename distance_ref_points_depth to max_domain_index
>    powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
>    powerpc/pseries: Consolidate DLPAR NUMA distance update
>    powerpc/pseries: Consolidate NUMA distance update during boot
>    powerpc/pseries: Add a helper for form1 cpu distance
>    powerpc/pseries: Add support for FORM2 associativity
>    powerpc/papr_scm: Use FORM2 associativity details


Series:


Tested-by: Daniel Henrique Barboza <danielhb413@gmail.com>



> 
>   Documentation/powerpc/associativity.rst       | 139 ++++++
>   arch/powerpc/include/asm/firmware.h           |   7 +-
>   arch/powerpc/include/asm/prom.h               |   3 +-
>   arch/powerpc/kernel/prom_init.c               |   3 +-
>   arch/powerpc/mm/numa.c                        | 436 ++++++++++++++----
>   arch/powerpc/platforms/pseries/firmware.c     |   3 +-
>   arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>   .../platforms/pseries/hotplug-memory.c        |   2 +
>   arch/powerpc/platforms/pseries/papr_scm.c     |  26 +-
>   arch/powerpc/platforms/pseries/pseries.h      |   2 +
>   10 files changed, 522 insertions(+), 101 deletions(-)
>   create mode 100644 Documentation/powerpc/associativity.rst
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index
  2021-06-14 16:39 ` [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
@ 2021-06-15  3:00   ` David Gibson
  2021-06-15  8:21     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:00 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5080 bytes --]

On Mon, Jun 14, 2021 at 10:09:56PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch.

I think this needs a rationale as to why 'primary_domain_index' is a
better name than 'min_common_depth'.  The meaning isn't obvious to me
from either name.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..8365b298ec48 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
>  EXPORT_SYMBOL(node_to_cpumask_map);
>  EXPORT_SYMBOL(node_data);
>  
> -static int min_common_depth;
> +static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
> @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
>  	if (!numa_enabled)
>  		goto out;
>  
> -	if (of_read_number(associativity, 1) >= min_common_depth)
> -		nid = of_read_number(&associativity[min_common_depth], 1);
> +	if (of_read_number(associativity, 1) >= primary_domain_index)
> +		nid = of_read_number(&associativity[primary_domain_index], 1);
>  
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
> @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static int __init find_min_common_depth(void)
> +static int __init find_primary_domain_index(void)
>  {
> -	int depth;
> +	int index;
>  	struct device_node *root;
>  
>  	if (firmware_has_feature(FW_FEATURE_OPAL))
> @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
>  	}
>  
>  	if (form1_affinity) {
> -		depth = of_read_number(distance_ref_points, 1);
> +		index = of_read_number(distance_ref_points, 1);
>  	} else {
>  		if (distance_ref_points_depth < 2) {
>  			printk(KERN_WARNING "NUMA: "
> @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
>  			goto err;
>  		}
>  
> -		depth = of_read_number(&distance_ref_points[1], 1);
> +		index = of_read_number(&distance_ref_points[1], 1);
>  	}
>  
>  	/*
> @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
>  	}
>  
>  	of_node_put(root);
> -	return depth;
> +	return index;
>  
>  err:
>  	of_node_put(root);
> @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  	int nid = default_nid;
>  	int rc, index;
>  
> -	if ((min_common_depth < 0) || !numa_enabled)
> +	if ((primary_domain_index < 0) || !numa_enabled)
>  		return default_nid;
>  
>  	rc = of_get_assoc_arrays(&aa);
>  	if (rc)
>  		return default_nid;
>  
> -	if (min_common_depth <= aa.array_sz &&
> +	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>  		nid = of_read_number(&aa.arrays[index], 1);
>  
>  		if (nid == 0xffff || nid >= nr_node_ids)
> @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
>  		return -1;
>  	}
>  
> -	min_common_depth = find_min_common_depth();
> +	primary_domain_index = find_primary_domain_index();
>  
> -	if (min_common_depth < 0) {
> +	if (primary_domain_index < 0) {
>  		/*
> -		 * if we fail to parse min_common_depth from device tree
> +		 * if we fail to parse primary_domain_index from device tree
>  		 * mark the numa disabled, boot with numa disabled.
>  		 */
>  		numa_enabled = false;
> -		return min_common_depth;
> +		return primary_domain_index;
>  	}
>  
> -	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> +	dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
>  
>  	/*
>  	 * Even though we connect cpus to numa domains later in SMP
> @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
>  			goto out;
>  	}
>  
> -	max_nodes = of_read_number(&domains[min_common_depth], 1);
> +	max_nodes = of_read_number(&domains[primary_domain_index], 1);
>  	for (i = 0; i < max_nodes; i++) {
>  		if (!node_possible(i))
>  			node_set(i, node_possible_map);
>  	}
>  
>  	prop_length /= sizeof(int);
> -	if (prop_length > min_common_depth + 2)
> +	if (prop_length > primary_domain_index + 2)
>  		coregroup_enabled = 1;
>  
>  out:
> @@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
>  		goto out;
>  
>  	index = of_read_number(associativity, 1);
> -	if (index > min_common_depth + 1)
> +	if (index > primary_domain_index + 1)
>  		return of_read_number(&associativity[index - 1], 1);
>  
>  out:

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index
  2021-06-14 16:39 ` [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index Aneesh Kumar K.V
@ 2021-06-15  3:01   ` David Gibson
  2021-06-15  8:22     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:01 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 3960 bytes --]

On Mon, Jun 14, 2021 at 10:09:57PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch

As with 1/8 an explanation of what this actually means and therefore
why this is a better name would be very helpful.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 20 ++++++++++----------
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..5941da201fa3 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>  	int i, index;
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		index = be32_to_cpu(distance_ref_points[i]);
>  		if (cpu1_assoc[index] == cpu2_assoc[index])
>  			break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>  	if (!form1_affinity)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>  			break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>  	if (!form1_affinity)
>  		return;
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		const __be32 *entry;
>  
>  		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
>  		nid = NUMA_NO_NODE;
>  
>  	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
> +		of_read_number(associativity, 1) >= max_domain_index) {
>  		/*
>  		 * Skip the length field and send start of associativity array
>  		 */
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>  	 */
>  	distance_ref_points = of_get_property(root,
>  					"ibm,associativity-reference-points",
> -					&distance_ref_points_depth);
> +					&max_domain_index);
>  
>  	if (!distance_ref_points) {
>  		dbg("NUMA: ibm,associativity-reference-points not found.\n");
>  		goto err;
>  	}
>  
> -	distance_ref_points_depth /= sizeof(int);
> +	max_domain_index /= sizeof(int);
>  
>  	if (firmware_has_feature(FW_FEATURE_OPAL) ||
>  	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>  	if (form1_affinity) {
>  		index = of_read_number(distance_ref_points, 1);
>  	} else {
> -		if (distance_ref_points_depth < 2) {
> +		if (max_domain_index < 2) {
>  			printk(KERN_WARNING "NUMA: "
>  				"short ibm,associativity-reference-points\n");
>  			goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>  	 * Warn and cap if the hardware supports more than
>  	 * MAX_DISTANCE_REF_POINTS domains.
>  	 */
> -	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> +	if (max_domain_index > MAX_DISTANCE_REF_POINTS) {
>  		printk(KERN_WARNING "NUMA: distance array capped at "
>  			"%d entries\n", MAX_DISTANCE_REF_POINTS);
> -		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> +		max_domain_index = MAX_DISTANCE_REF_POINTS;
>  	}
>  
>  	of_node_put(root);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  2021-06-14 16:39 ` [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Aneesh Kumar K.V
@ 2021-06-15  3:04   ` David Gibson
  0 siblings, 0 replies; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:04 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 6496 bytes --]

On Mon, Jun 14, 2021 at 10:09:58PM +0530, Aneesh Kumar K.V wrote:
> Also make related code cleanup that will allow adding FORM2_AFFINITY in
> later patches. No functional change in this patch.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/firmware.h       |  4 +--
>  arch/powerpc/include/asm/prom.h           |  2 +-
>  arch/powerpc/kernel/prom_init.c           |  2 +-
>  arch/powerpc/mm/numa.c                    | 35 ++++++++++++++---------
>  arch/powerpc/platforms/pseries/firmware.c |  2 +-
>  5 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
> index 7604673787d6..60b631161360 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -44,7 +44,7 @@
>  #define FW_FEATURE_OPAL		ASM_CONST(0x0000000010000000)
>  #define FW_FEATURE_SET_MODE	ASM_CONST(0x0000000040000000)
>  #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
> -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
> +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000)
>  #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
>  #define FW_FEATURE_DRMEM_V2	ASM_CONST(0x0000000400000000)
>  #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
> @@ -69,7 +69,7 @@ enum {
>  		FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
>  		FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
>  		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
> -		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
> +		FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
>  		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
>  		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
>  		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index 324a13351749..df9fec9d232c 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
>  #define OV5_MSI			0x0201	/* PCIe/MSI support */
>  #define OV5_CMO			0x0480	/* Cooperative Memory Overcommitment */
>  #define OV5_XCMO		0x0440	/* Page Coalescing */
> -#define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
> +#define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
>  #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
>  #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
>  #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 41ed7e33d897..64b9593038a7 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
>  #else
>  		0,
>  #endif
> -		.associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
> +		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
>  		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
>  		.micro_checkpoint = 0,
>  		.reserved0 = 0,
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 5941da201fa3..192067991f8a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
>  
>  static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
> -static int form1_affinity;
> +
> +#define FORM0_AFFINITY 0
> +#define FORM1_AFFINITY 1
> +static int affinity_form;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
>  static int max_domain_index;
> @@ -190,7 +193,7 @@ int __node_distance(int a, int b)
>  	int i;
>  	int distance = LOCAL_DISTANCE;
>  
> -	if (!form1_affinity)
> +	if (affinity_form == FORM0_AFFINITY)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>  {
>  	int i;
>  
> -	if (!form1_affinity)
> +	if (affinity_form != FORM1_AFFINITY)
>  		return;
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
>  	int index;
>  	struct device_node *root;
>  
> +	/*
> +	 * Check for which form of affinity.
> +	 */
> +	if (firmware_has_feature(FW_FEATURE_OPAL)) {
> +		affinity_form = FORM1_AFFINITY;
> +	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
> +		dbg("Using form 1 affinity\n");
> +		affinity_form = FORM1_AFFINITY;
> +	} else
> +		affinity_form = FORM0_AFFINITY;
> +
>  	if (firmware_has_feature(FW_FEATURE_OPAL))
>  		root = of_find_node_by_path("/ibm,opal");
>  	else
> @@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void)
>  	}
>  
>  	max_domain_index /= sizeof(int);
> -
> -	if (firmware_has_feature(FW_FEATURE_OPAL) ||
> -	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> -		dbg("Using form 1 affinity\n");
> -		form1_affinity = 1;
> -	}
> -
> -	if (form1_affinity) {
> -		index = of_read_number(distance_ref_points, 1);
> -	} else {
> +	if (affinity_form == FORM0_AFFINITY) {
>  		if (max_domain_index < 2) {
>  			printk(KERN_WARNING "NUMA: "
> -				"short ibm,associativity-reference-points\n");
> +			       "short ibm,associativity-reference-points\n");
>  			goto err;
>  		}
>  
>  		index = of_read_number(&distance_ref_points[1], 1);
> +	} else {
> +		index = of_read_number(distance_ref_points, 1);
>  	}
>  
>  	/*
> diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
> index 4c7b7f5a2ebc..5d4c2bc20bba 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -119,7 +119,7 @@ struct vec5_fw_feature {
>  
>  static __initdata struct vec5_fw_feature
>  vec5_fw_features_table[] = {
> -	{FW_FEATURE_TYPE1_AFFINITY,	OV5_TYPE1_AFFINITY},
> +	{FW_FEATURE_FORM1_AFFINITY,	OV5_FORM1_AFFINITY},
>  	{FW_FEATURE_PRRN,		OV5_PRRN},
>  	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
>  	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update
  2021-06-14 16:39 ` [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update Aneesh Kumar K.V
@ 2021-06-15  3:13   ` David Gibson
  2021-06-15  8:26     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:13 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 4204 bytes --]

On Mon, Jun 14, 2021 at 10:09:59PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resourced are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call. In
> later patch we will remove updating NUMA distance when we are looking
> for node id from associativity array.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                        | 41 +++++++++++++++++++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
>  .../platforms/pseries/hotplug-memory.c        |  2 +
>  arch/powerpc/platforms/pseries/pseries.h      |  1 +
>  4 files changed, 46 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 192067991f8a..fec47981c1ef 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> +	int i, nid;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		nid = of_read_number(&associativity[primary_domain_index], 1);
> +
> +		for (i = 0; i < max_domain_index; i++) {
> +			const __be32 *entry;
> +
> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
> +		}
> +	}
> +}

This logic is almost identicaly to initialize_distance_lookup_table()
- it would be good if they could be consolidated, so it's clear that
coldplugged and hotplugged nodes are parsing the NUMA information in
the same way.

> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> +	const __be32 *associativity;
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return;
> +
> +	__initialize_form1_numa_distance(associativity);
> +	return;
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> +	if (affinity_form == FORM0_AFFINITY)
> +		return;
> +	else if (affinity_form == FORM1_AFFINITY) {
> +		initialize_form1_numa_distance(node);
> +		return;
> +	}
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>  	int index;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>  		return saved_rc;
>  	}
>  
> +	update_numa_distance(dn);
> +
>  	rc = dlpar_online_cpu(dn);
>  	if (rc) {
>  		saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..0e602c3b01ea 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>  		return -ENODEV;
>  	}
>  
> +	update_numa_distance(lmb_node);
> +
>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>  	if (!dr_node) {
>  		dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance
  2021-06-14 16:40 ` [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance Aneesh Kumar K.V
@ 2021-06-15  3:21   ` David Gibson
  0 siblings, 0 replies; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:21 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 1701 bytes --]

On Mon, Jun 14, 2021 at 10:10:01PM +0530, Aneesh Kumar K.V wrote:
> This helper is only used with the dispatch trace log collection.
> A later patch will add Form2 affinity support and this change helps
> in keeping that simpler. Also add a comment explaining we don't expect
> the code to be called with FORM0
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/mm/numa.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 64caaf07cf82..696e5bfe1414 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	int dist = 0;
>  
> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  	return dist;
>  }
>  
> +int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> +	/* We should not get called with FORM0 */
> +	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> +
> +	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +}
> +
>  /* must hold reference to node during call */
>  static const __be32 *of_get_associativity(struct device_node *dev)
>  {

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-14 16:40 ` [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity Aneesh Kumar K.V
@ 2021-06-15  3:53   ` David Gibson
  2021-06-15  5:28     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:53 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 20184 bytes --]

On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
>  arch/powerpc/include/asm/firmware.h       |   3 +-
>  arch/powerpc/include/asm/prom.h           |   1 +
>  arch/powerpc/kernel/prom_init.c           |   3 +-
>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>  6 files changed, 290 insertions(+), 6 deletions(-)
>  create mode 100644 Documentation/powerpc/associativity.rst
> 
> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
> new file mode 100644
> index 000000000000..58abedea81d7
> --- /dev/null
> +++ b/Documentation/powerpc/associativity.rst
> @@ -0,0 +1,139 @@
> +============================
> +NUMA resource associativity
> +=============================
> +
> +Associativity represents the groupings of the various platform resources into
> +domains of substantially similar mean performance relative to resources outside
> +of that domain. Resources subsets of a given domain that exhibit better
> +performance relative to each other than relative to other resources subsets
> +are represented as being members of a sub-grouping domain. This performance
> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
> +From the platform view, these groups are also referred to as domains.
> +
> +PAPR interface currently supports two different ways of communicating these resource

You describe form 2 below as well, which contradicts this.

> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
> +Form 0 is the older format and is now considered deprecated.
> +
> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> +A value of 1 indicates the usage of Form 1 associativity.
> +
> +Form 0
> +-----
> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> +
> +Form 1
> +-----
> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
> +
> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
> +representing the resource’s platform grouping domains.
> +
> +The “ibm,associativity-reference-points” property contains one or more list of numbers
> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
> +
> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)

I thought we used the *least* significant boundary (the smallest
grouping, not the largest).  That is, the last index, not the first.

Actually... come to think of it, I'm not even sure how to interpret
"most significant".  Does that mean a change in grouping at that "most
significant" level results in the largest perfomance difference?

> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> +recursively comparing if they belong to the same higher-level domains. For mismatch
> +at every higher level of the resource group, the kernel doubles the NUMA distance between
> +the comparing domains.
> +
> +Form 2
> +-------
> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
> +domain numbering. With numa distance computation now detached from the index value of
> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
> +same domain index representing resource groups of different
> performance/latency characteristics.

The meaning of "domain index" is not clear to me here.

> +
> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
> +"ibm,architecture-vec-5" property.
> +
> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
> +the domainIDs present in the system. The offset of the domainID in this property is considered
> +the domainID index.

You haven't really introduced the term "domainID".  Is "domainID
index" the same as "domain index" above?  It's not clear to me.

The distinction between "domain index" and "primary domain id" is also
not clear to me.

> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
> +N domainID encoded as with encode-int
> +
> +For ex:
> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.

Above you say "Form 2 allows a large number of primary domain ids at
the same domain index", but this encoding doesn't appear to permit
that.

> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
> +distance between resource groups/domains present in the system.
> +
> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> +
> +For ex:
> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
> +
> +  | 0    8   40
> +--|------------
> +  |
> +0 | 10   20  80
> +  |
> +8 | 20   10  160
> +  |
> +40| 80   160  10

What's the reason for multiplying the values by 10 in the expanded
table version?

> +
> +With Form2 "ibm,associativity" for resources is listed as below:
> +
> +"ibm,associativity" property for resources in node 0, 8 and 40
> +{ 4, 6, 7, 0, 0}
> +{ 4, 6, 9, 8, 8}
> +{ 4, 6, 7, 0, 40}
> +
> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
> +
> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes

What the heck is the secondary domainID?

> +the kernel should use when using persistent memory devices. Persistent memory devices
> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> +the numa node number OS should use when using these devices as regular memory. Secondary
> +domainID is the numa node number that should be used when using this device as
> +persistent memory. In the later case, we are interested in the locality of the
> +device to an established numa node. In the above example, if the last row represents a
> +persistent memory device/resource, NUMA node number 40 will be used when using the device
> +as regular memory and NUMA node number 0 will be the device numa node when using it as
> +a persistent memory device.
> +
> +Each resource (drcIndex) now also supports additional optional device tree properties.
> +These properties are marked optional because the platform can choose not to export
> +them and provide the system topology details using the earlier defined device tree
> +properties alone. The optional device tree properties are used when adding new resources
> +(DLPAR) and when the platform didn't provide the topology details of the domain which
> +contains the newly added resource during boot.
> +
> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
> +of the new resource can be obtained from the existing "ibm,associativity" property. This
> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
> +The value is 1 based array index value.

Am I correct in thinking that if we have an entirely form2 world, we'd
only need this and the ibm,associativity properties could be dropped?

> +
> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
> +
> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
> +from this resource domain to other resources.

IIUC this is about extending the global distance table with
information for a new node.  Is that correct?

The global distance table allows for the possibility of asymmetric
distances between nodes, but this does not.  Is that intentional?

> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> +
> +For ex:
> +ibm,associativity     = { 4, 5, 6, 7, 50}
> +ibm,numa-lookup-index = { 4 }
> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
> +
> +resulting in a new toplogy as below.
> +  | 0    8   40   50
> +--|------------------
> +  |
> +0 | 10   20  80   160
> +  |
> +8 | 20   10  160  320
> +  |
> +40| 80   160  10  80
> +  |
> +50| 160  320  80  10
> diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
> index 60b631161360..97a3bd9ffeb9 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -53,6 +53,7 @@
>  #define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)
>  #define FW_FEATURE_STUFF_TCE	ASM_CONST(0x0000008000000000)
>  #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)
> +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000)
>  
>  #ifndef __ASSEMBLY__
>  
> @@ -73,7 +74,7 @@ enum {
>  		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
>  		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
>  		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
> -		FW_FEATURE_RPT_INVALIDATE,
> +		FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY,
>  	FW_FEATURE_PSERIES_ALWAYS = 0,
>  	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
>  	FW_FEATURE_POWERNV_ALWAYS = 0,
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index df9fec9d232c..5c80152e8f18 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop,
>  #define OV5_XCMO		0x0440	/* Page Coalescing */
>  #define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
>  #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
> +#define OV5_FORM2_AFFINITY	0x0520	/* Form2 NUMA affinity */
>  #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
>  #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
>  #define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 64b9593038a7..496fdac54c29 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1070,7 +1070,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
>  #else
>  		0,
>  #endif
> -		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
> +		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) |
> +		OV5_FEAT(OV5_FORM2_AFFINITY),
>  		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
>  		.micro_checkpoint = 0,
>  		.reserved0 = 0,
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 696e5bfe1414..86cd2af014f7 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  
>  #define FORM0_AFFINITY 0
>  #define FORM1_AFFINITY 1
> +#define FORM2_AFFINITY 2
>  static int affinity_form;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
>  static int max_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
> +	[0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
> +};
> +static int numa_id_index_table[MAX_NUMNODES];
>  
>  /*
>   * Allocate node_to_cpumask_map based on number of available nodes
> @@ -166,6 +171,27 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> +/*
> + * With FORM2 if we are not using logical domain ids, we will find
> + * both primary and seconday domains with same value. Hence always
> + * start comparison from secondary domains
> + */
> +static int __cpu_form2_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> +	int dist = 0;
> +
> +	int i, index;
> +
> +	for (i = 1; i < max_domain_index; i++) {
> +		index = be32_to_cpu(distance_ref_points[i]);
> +		if (cpu1_assoc[index] == cpu2_assoc[index])
> +			break;
> +		dist++;
> +	}
> +
> +	return dist;
> +}
> +
>  static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	int dist = 0;
> @@ -178,7 +204,6 @@ static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  			break;
>  		dist++;
>  	}
> -
>  	return dist;
>  }
>  
> @@ -186,8 +211,9 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	/* We should not get called with FORM0 */
>  	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> -
> -	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +	if (affinity_form == FORM1_AFFINITY)
> +		return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +	return __cpu_form2_distance(cpu1_assoc, cpu2_assoc);
>  }
>  
>  /* must hold reference to node during call */
> @@ -201,7 +227,9 @@ int __node_distance(int a, int b)
>  	int i;
>  	int distance = LOCAL_DISTANCE;
>  
> -	if (affinity_form == FORM0_AFFINITY)
> +	if (affinity_form == FORM2_AFFINITY)
> +		return numa_distance_table[a][b];
> +	else if (affinity_form == FORM0_AFFINITY)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -303,15 +331,116 @@ static void initialize_form1_numa_distance(struct device_node *node)
>  
>  /*
>   * Used to update distance information w.r.t newly added node.
> + * ibm,numa-lookup-index -> 4
> + * ibm,numa-distance -> {5, 20, 40, 60, 80, 10 }
>   */
>  void update_numa_distance(struct device_node *node)
>  {
> +	int i, nid, other_nid, other_nid_index = 0;
> +	const __be32 *numa_indexp;
> +	const __u8  *numa_distancep;
> +	int numa_index, max_numa_index, numa_distance;
> +
>  	if (affinity_form == FORM0_AFFINITY)
>  		return;
>  	else if (affinity_form == FORM1_AFFINITY) {
>  		initialize_form1_numa_distance(node);
>  		return;
>  	}
> +	/* FORM2 affinity  */
> +
> +	nid = of_node_to_nid_single(node);
> +	if (nid == NUMA_NO_NODE)
> +		return;
> +
> +	/* Already initialized */
> +	if (numa_distance_table[nid][nid] != -1)
> +		return;
> +	/*
> +	 * update node distance if not already populated.
> +	 */
> +	numa_distancep = of_get_property(node, "ibm,numa-distance", NULL);
> +	if (!numa_distancep)
> +		return;
> +
> +	numa_indexp = of_get_property(node, "ibm,numa-lookup-index", NULL);
> +	if (!numa_indexp)
> +		return;
> +
> +	numa_index = of_read_number(numa_indexp, 1);
> +	/*
> +	 * update the numa_id_index_table. Device tree look at index table as
> +	 * 1 based array indexing.
> +	 */
> +	numa_id_index_table[numa_index - 1] = nid;
> +
> +	max_numa_index = of_read_number((const __be32 *)numa_distancep, 1);
> +	VM_WARN_ON(max_numa_index != 2 * numa_index);
> +	/* Skip the size which is encoded int */
> +	numa_distancep += sizeof(__be32);
> +
> +	/*
> +	 * First fill the distance information from other node to this node.
> +	 */
> +	other_nid_index = 0;
> +	for (i = 0; i < numa_index; i++) {
> +		numa_distance = numa_distancep[i];
> +		other_nid = numa_id_index_table[other_nid_index++];
> +		numa_distance_table[other_nid][nid] = numa_distance;
> +	}
> +
> +	other_nid_index = 0;
> +	for (; i < max_numa_index; i++) {
> +		numa_distance = numa_distancep[i];
> +		other_nid = numa_id_index_table[other_nid_index++];
> +		numa_distance_table[nid][other_nid] = numa_distance;
> +	}
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
> + */
> +static void initialize_form2_numa_distance_lookup_table(struct device_node *root)
> +{
> +	const __u8 *numa_dist_table;
> +	const __be32 *numa_lookup_index;
> +	int numa_dist_table_length;
> +	int max_numa_index, distance_index;
> +	int i, curr_row = 0, curr_column = 0;
> +
> +	numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
> +	max_numa_index = of_read_number(&numa_lookup_index[0], 1);
> +
> +	/* first element of the array is the size and is encode-int */
> +	numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
> +	numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
> +	/* Skip the size which is encoded int */
> +	numa_dist_table += sizeof(__be32);
> +
> +	pr_debug(" numa_dist_table_len = %d, numa_dist_indexes_len = %d \n",
> +		 numa_dist_table_length, max_numa_index);
> +
> +	for (i = 0; i < max_numa_index; i++)
> +		/* +1 skip the max_numa_index in the property */
> +		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
> +
> +
> +	VM_WARN_ON(numa_dist_table_length != max_numa_index * max_numa_index);
> +
> +	for (distance_index = 0; distance_index < numa_dist_table_length; distance_index++) {
> +		int nodeA = numa_id_index_table[curr_row];
> +		int nodeB = numa_id_index_table[curr_column++];
> +
> +		numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index];
> +
> +		pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
> +		if (curr_column >= max_numa_index) {
> +			curr_row++;
> +			/* reset the column */
> +			curr_column = 0;
> +		}
> +	}
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -324,6 +453,9 @@ static int __init find_primary_domain_index(void)
>  	 */
>  	if (firmware_has_feature(FW_FEATURE_OPAL)) {
>  		affinity_form = FORM1_AFFINITY;
> +	} else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> +		dbg("Using form 2 affinity\n");
> +		affinity_form = FORM2_AFFINITY;
>  	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>  		dbg("Using form 1 affinity\n");
>  		affinity_form = FORM1_AFFINITY;
> @@ -368,8 +500,17 @@ static int __init find_primary_domain_index(void)
>  
>  		index = of_read_number(&distance_ref_points[1], 1);
>  	} else {
> +		/*
> +		 * Both FORM1 and FORM2 affinity find the primary domain details
> +		 * at the same offset.
> +		 */
>  		index = of_read_number(distance_ref_points, 1);
>  	}
> +	/*
> +	 * If it is FORM2 also initialize the distance table here.
> +	 */
> +	if (affinity_form == FORM2_AFFINITY)
> +		initialize_form2_numa_distance_lookup_table(root);
>  
>  	/*
>  	 * Warn and cap if the hardware supports more than
> diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>  	{FW_FEATURE_PRRN,		OV5_PRRN},
>  	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
>  	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},
> +	{FW_FEATURE_FORM2_AFFINITY,	OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-14 16:40 ` [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details Aneesh Kumar K.V
@ 2021-06-15  3:55   ` David Gibson
  2021-06-15  5:57     ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  3:55 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5763 bytes --]

On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> FORM2 introduce a concept of secondary domain which is identical to the
> conceept of FORM1 primary domain. Use secondary domain as the numa node
> when using persistent memory device. For DAX kmem use the logical domain
> id introduced in FORM2. This new numa node
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>  3 files changed, 45 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 86cd2af014f7..b9ac6d02e944 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>  	return nid;
>  }
>  
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> +{
> +	int secondary_index;
> +	const __be32 *associativity;
> +
> +	if (!numa_enabled) {
> +		*primary = NUMA_NO_NODE;
> +		*secondary = NUMA_NO_NODE;
> +		return 0;
> +	}
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return -ENODEV;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> +		secondary_index = of_read_number(&distance_ref_points[1], 1);

Secondary ID is always the second reference point, but primary depends
on the length of resources?  That seems very weird.

> +		*secondary = of_read_number(&associativity[secondary_index], 1);
> +	}
> +	if (*primary == 0xffff || *primary >= nr_node_ids)
> +		*primary = NUMA_NO_NODE;
> +
> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
> +		*secondary = NUMA_NO_NODE;
> +	return 0;
> +}
> +
>  /* Returns the nid associated with the given device tree node,
>   * or -1 if not found.
>   */
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index ef26fe40efb0..9bf2f1f3ddc5 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -18,6 +18,7 @@
>  #include <asm/plpar_wrappers.h>
>  #include <asm/papr_pdsm.h>
>  #include <asm/mce.h>
> +#include "pseries.h"
>  
>  #define BIND_ANY_ADDR (~0ul)
>  
> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>  struct papr_scm_priv {
>  	struct platform_device *pdev;
>  	struct device_node *dn;
> +	int numa_node;
> +	int target_node;
>  	uint32_t drc_index;
>  	uint64_t blocks;
>  	uint64_t block_size;
> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	struct nd_mapping_desc mapping;
>  	struct nd_region_desc ndr_desc;
>  	unsigned long dimm_flags;
> -	int target_nid, online_nid;
>  	ssize_t stat_size;
>  
>  	p->bus_desc.ndctl = papr_scm_ndctl;
> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>  
>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
> -	target_nid = dev_to_node(&p->pdev->dev);
> -	online_nid = numa_map_to_online_node(target_nid);
> -	ndr_desc.numa_node = online_nid;
> -	ndr_desc.target_node = target_nid;
> +	ndr_desc.numa_node = p->numa_node;
> +	ndr_desc.target_node = p->target_node;
>  	ndr_desc.res = &p->res;
>  	ndr_desc.of_node = p->dn;
>  	ndr_desc.provider_data = p;
> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  				ndr_desc.res, p->dn);
>  		goto err;
>  	}
> -	if (target_nid != online_nid)
> -		dev_info(dev, "Region registered with target node %d and online node %d",
> -			 target_nid, online_nid);
>  
>  	mutex_lock(&papr_ndr_lock);
>  	list_add_tail(&p->region_list, &papr_nd_regions);
> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>  	struct papr_scm_priv *p;
>  	const char *uuid_str;
>  	u64 uuid[2];
> -	int rc;
> +	int rc, numa_node;
>  
>  	/* check we have all the required DT properties */
>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>  		return -ENODEV;
>  	}
>  
> -
>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>  	if (!p)
>  		return -ENOMEM;
>  
> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
> +		rc = -ENODEV;
> +		goto err;
> +	}
> +	p->numa_node = numa_map_to_online_node(numa_node);
> +	if (numa_node != p->numa_node)
> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
> +			 p->numa_node, numa_node);
> +
>  	/* Initialize the dimm mutex */
>  	mutex_init(&p->health_mutex);
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 663a0859cf13..9c2a1fc9ded1 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
>  void update_numa_distance(struct device_node *node);
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-15  3:53   ` David Gibson
@ 2021-06-15  5:28     ` Aneesh Kumar K.V
  2021-06-15  6:25       ` David Gibson
  0 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  5:28 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
>>  arch/powerpc/include/asm/firmware.h       |   3 +-
>>  arch/powerpc/include/asm/prom.h           |   1 +
>>  arch/powerpc/kernel/prom_init.c           |   3 +-
>>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
>>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>>  6 files changed, 290 insertions(+), 6 deletions(-)
>>  create mode 100644 Documentation/powerpc/associativity.rst
>> 
>> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
>> new file mode 100644
>> index 000000000000..58abedea81d7
>> --- /dev/null
>> +++ b/Documentation/powerpc/associativity.rst
>> @@ -0,0 +1,139 @@
>> +============================
>> +NUMA resource associativity
>> +=============================
>> +
>> +Associativity represents the groupings of the various platform resources into
>> +domains of substantially similar mean performance relative to resources outside
>> +of that domain. Resources subsets of a given domain that exhibit better
>> +performance relative to each other than relative to other resources subsets
>> +are represented as being members of a sub-grouping domain. This performance
>> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
>> +From the platform view, these groups are also referred to as domains.
>> +
>> +PAPR interface currently supports two different ways of communicating these resource
>
> You describe form 2 below as well, which contradicts this.

Fixed as below.

PAPR interface currently supports different ways of communicating these resource
grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
associativity grouping. Form 0 is the older format and is now considered deprecated.

Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.



>
>> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
>> +Form 0 is the older format and is now considered deprecated.
>> +
>> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
>> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
>> +A value of 1 indicates the usage of Form 1 associativity.
>> +
>> +Form 0
>> +-----
>> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
>> +
>> +Form 1
>> +-----
>> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
>> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
>> +
>> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
>> +representing the resource’s platform grouping domains.
>> +
>> +The “ibm,associativity-reference-points” property contains one or more list of numbers
>> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
>> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
>> +
>> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
>
> I thought we used the *least* significant boundary (the smallest
> grouping, not the largest).  That is, the last index, not the first.
>
> Actually... come to think of it, I'm not even sure how to interpret
> "most significant".  Does that mean a change in grouping at that "most
> significant" level results in the largest perfomance difference?

PAPR defines "most significant" as below

When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
ity-reference-points” property indicates boundaries between associativity domains presented by the
“ibm,associativity” property containing “near” and “far” resources. The
first such boundary in the list represents the 1 based ordinal in the
associativity lists of the most significant boundary, with subsequent
entries indicating progressively less significant boundaries

I would interpret it as the boundary where we start defining NUMA nodes.

>
>> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
>> +recursively comparing if they belong to the same higher-level domains. For mismatch
>> +at every higher level of the resource group, the kernel doubles the NUMA distance between
>> +the comparing domains.
>> +
>> +Form 2
>> +-------
>> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
>> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
>> +domain numbering. With numa distance computation now detached from the index value of
>> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
>> +same domain index representing resource groups of different
>> performance/latency characteristics.
>
> The meaning of "domain index" is not clear to me here.

Sorry for the confusion there. domain index is the index where domainID
is appearing. W.r.t "ibm,associativity"  we have 

The “ibm,associativity” property contains one or more lists of numbers (domainID)
representing the resource’s platform grouping domains. If we can look at
an example property.

{ 4, 6, 7, 0, 0}
{ 4, 6, 7, 0, 40}

With Form 1 both NUMA node 0 and 40 will appear at the same distance.
They both are at domain index 4. With Form 2 we can represent them with
different NUMA distance values. 


>
>> +
>> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
>> +"ibm,architecture-vec-5" property.
>> +
>> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
>> +the domainIDs present in the system. The offset of the domainID in this property is considered
>> +the domainID index.
>
> You haven't really introduced the term "domainID".  Is "domainID
> index" the same as "domain index" above?  It's not clear to me.

The earlier part of the documented said 

The “ibm,associativity” property contains one or more lists of numbers (domainID)
representing the resource’s platform grouping domains.

I will update domain index to domainID index. 

>
> The distinction between "domain index" and "primary domain id" is also
> not clear to me.

primary domain id is the domainID appearing in the primary domainID
index. Linux kenrel also use that as the NUMA node number.
Primary domainID index is defined by ibm,associativity-reference-points
and we consider that as the most significant resource group boundary.

ibm,associativity-reference-points can be looked at as
{ primary domainID index, secondary domainID index, tertiary domainID index.. }


>
>> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
>> +N domainID encoded as with encode-int
>> +
>> +For ex:
>> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
>
> Above you say "Form 2 allows a large number of primary domain ids at
> the same domain index", but this encoding doesn't appear to permit
> that.

I didn't follow that question.

>
>> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
>> +distance between resource groups/domains present in the system.
>> +
>> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> +
>> +For ex:
>> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
>> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
>> +
>> +  | 0    8   40
>> +--|------------
>> +  |
>> +0 | 10   20  80
>> +  |
>> +8 | 20   10  160
>> +  |
>> +40| 80   160  10
>
> What's the reason for multiplying the values by 10 in the expanded
> table version?

That was me missing a document update. Since we used 8 bits to encode
distance at some point we were looking at a SCALE factor. But later
realized other architectures also restrict distance to 8 bits. I will
update ibm,numa-distance-table in the document.

>
>> +
>> +With Form2 "ibm,associativity" for resources is listed as below:
>> +
>> +"ibm,associativity" property for resources in node 0, 8 and 40
>> +{ 4, 6, 7, 0, 0}
>> +{ 4, 6, 9, 8, 8}
>> +{ 4, 6, 7, 0, 40}
>> +
>> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
>> +
>> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>
> What the heck is the secondary domainID

domainID appearing the secondary domainID index.
ibm,associativity-reference-points gives an indication of different
hierachy of resource grouping as below.

ibm,associativity-reference-points can be looked at as
{ primary domainID index, secondary domainID index, tertiary domainID index.. }

>
>> +the kernel should use when using persistent memory devices. Persistent memory devices
>> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>> +the numa node number OS should use when using these devices as regular memory. Secondary
>> +domainID is the numa node number that should be used when using this device as
>> +persistent memory. In the later case, we are interested in the locality of the
>> +device to an established numa node. In the above example, if the last row represents a
>> +persistent memory device/resource, NUMA node number 40 will be used when using the device
>> +as regular memory and NUMA node number 0 will be the device numa node when using it as
>> +a persistent memory device.
>> +
>> +Each resource (drcIndex) now also supports additional optional device tree properties.
>> +These properties are marked optional because the platform can choose not to export
>> +them and provide the system topology details using the earlier defined device tree
>> +properties alone. The optional device tree properties are used when adding new resources
>> +(DLPAR) and when the platform didn't provide the topology details of the domain which
>> +contains the newly added resource during boot.
>> +
>> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
>> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
>> +of the new resource can be obtained from the existing "ibm,associativity" property. This
>> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
>> +The value is 1 based array index value.
>
> Am I correct in thinking that if we have an entirely form2 world, we'd
> only need this and the ibm,associativity properties could be dropped?

Not really. ibm,numa-lookup-index-table was added to have a concise
representation of numa distance via ibm,numa-distance-table. 

For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
numa distance. Instead ibm,numa-lookup-index-table allows us to present
the same in a 3x3 matrix  distance[index0][index1] is the  distance
between NUMA node 0 and 4 and distance[index0][index2] is the distance
between NUMA node 0 and 5


>
>> +
>> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
>> +
>> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
>> +from this resource domain to other resources.
>
> IIUC this is about extending the global distance table with
> information for a new node.  Is that correct?

correct.

>
> The global distance table allows for the possibility of asymmetric
> distances between nodes, but this does not.  Is that intentional?

This also does, For example with 3 nodes currently present and 4 node
getting added ibm,numa-distance have 8 elements enabling us to have
distance[Node0][Node50] being different from distance[Node50][Node0]
as shown below.

>
>> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> +
>> +For ex:
>> +ibm,associativity     = { 4, 5, 6, 7, 50}
>> +ibm,numa-lookup-index = { 4 }
>> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
>> +
>> +resulting in a new toplogy as below.
>> +  | 0    8   40   50
>> +--|------------------
>> +  |
>> +0 | 10   20  80   160
>> +  |
>> +8 | 20   10  160  320
>> +  |
>> +40| 80   160  10  80
>> +  |
>> +50| 160  320  80  10

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-15  3:55   ` David Gibson
@ 2021-06-15  5:57     ` Aneesh Kumar K.V
  2021-06-15  6:34       ` David Gibson
  0 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  5:57 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>> FORM2 introduce a concept of secondary domain which is identical to the
>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>> when using persistent memory device. For DAX kmem use the logical domain
>> id introduced in FORM2. This new numa node
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>  3 files changed, 45 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 86cd2af014f7..b9ac6d02e944 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>  	return nid;
>>  }
>>  
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>> +{
>> +	int secondary_index;
>> +	const __be32 *associativity;
>> +
>> +	if (!numa_enabled) {
>> +		*primary = NUMA_NO_NODE;
>> +		*secondary = NUMA_NO_NODE;
>> +		return 0;
>> +	}
>> +
>> +	associativity = of_get_associativity(node);
>> +	if (!associativity)
>> +		return -ENODEV;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>
> Secondary ID is always the second reference point, but primary depends
> on the length of resources?  That seems very weird.

primary_domain_index is distance_ref_point[0]. With Form2 we would find
both primary and secondary domain ID same for all resources other than
persistent memory device. The usage w.r.t. persistent memory is
explained in patch 7.

With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
the kernel should use when using persistent memory devices. Persistent memory devices
can also be used as regular memory using DAX KMEM driver and primary domainID indicates
the numa node number OS should use when using these devices as regular memory. Secondary
domainID is the numa node number that should be used when using this device as
persistent memory. In the later case, we are interested in the locality of the
device to an established numa node. In the above example, if the last row represents a
persistent memory device/resource, NUMA node number 40 will be used when using the device
as regular memory and NUMA node number 0 will be the device numa node when using it as
a persistent memory device.


>
>> +		*secondary = of_read_number(&associativity[secondary_index], 1);
>> +	}
>> +	if (*primary == 0xffff || *primary >= nr_node_ids)
>> +		*primary = NUMA_NO_NODE;
>> +
>> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
>> +		*secondary = NUMA_NO_NODE;
>> +	return 0;
>> +}
>> +
>>  /* Returns the nid associated with the given device tree node,
>>   * or -1 if not found.
>>   */
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index ef26fe40efb0..9bf2f1f3ddc5 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -18,6 +18,7 @@
>>  #include <asm/plpar_wrappers.h>
>>  #include <asm/papr_pdsm.h>
>>  #include <asm/mce.h>
>> +#include "pseries.h"
>>  
>>  #define BIND_ANY_ADDR (~0ul)
>>  
>> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>>  struct papr_scm_priv {
>>  	struct platform_device *pdev;
>>  	struct device_node *dn;
>> +	int numa_node;
>> +	int target_node;
>>  	uint32_t drc_index;
>>  	uint64_t blocks;
>>  	uint64_t block_size;
>> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	struct nd_mapping_desc mapping;
>>  	struct nd_region_desc ndr_desc;
>>  	unsigned long dimm_flags;
>> -	int target_nid, online_nid;
>>  	ssize_t stat_size;
>>  
>>  	p->bus_desc.ndctl = papr_scm_ndctl;
>> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>>  
>>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
>> -	target_nid = dev_to_node(&p->pdev->dev);
>> -	online_nid = numa_map_to_online_node(target_nid);
>> -	ndr_desc.numa_node = online_nid;
>> -	ndr_desc.target_node = target_nid;
>> +	ndr_desc.numa_node = p->numa_node;
>> +	ndr_desc.target_node = p->target_node;
>>  	ndr_desc.res = &p->res;
>>  	ndr_desc.of_node = p->dn;
>>  	ndr_desc.provider_data = p;
>> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  				ndr_desc.res, p->dn);
>>  		goto err;
>>  	}
>> -	if (target_nid != online_nid)
>> -		dev_info(dev, "Region registered with target node %d and online node %d",
>> -			 target_nid, online_nid);
>>  
>>  	mutex_lock(&papr_ndr_lock);
>>  	list_add_tail(&p->region_list, &papr_nd_regions);
>> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  	struct papr_scm_priv *p;
>>  	const char *uuid_str;
>>  	u64 uuid[2];
>> -	int rc;
>> +	int rc, numa_node;
>>  
>>  	/* check we have all the required DT properties */
>>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
>> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  		return -ENODEV;
>>  	}
>>  
>> -
>>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>>  	if (!p)
>>  		return -ENOMEM;
>>  
>> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
>> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
>> +		rc = -ENODEV;
>> +		goto err;
>> +	}
>> +	p->numa_node = numa_map_to_online_node(numa_node);
>> +	if (numa_node != p->numa_node)
>> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
>> +			 p->numa_node, numa_node);
>> +
>>  	/* Initialize the dimm mutex */
>>  	mutex_init(&p->health_mutex);
>>  
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 663a0859cf13..9c2a1fc9ded1 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>>  void pseries_lpar_read_hblkrm_characteristics(void);
>>  
>>  void update_numa_distance(struct device_node *node);
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>>  #endif /* _PSERIES_PSERIES_H */
>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-15  5:28     ` Aneesh Kumar K.V
@ 2021-06-15  6:25       ` David Gibson
  2021-06-15  7:40         ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  6:25 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 16363 bytes --]

On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
> >>  arch/powerpc/include/asm/firmware.h       |   3 +-
> >>  arch/powerpc/include/asm/prom.h           |   1 +
> >>  arch/powerpc/kernel/prom_init.c           |   3 +-
> >>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index 000000000000..58abedea81d7
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,139 @@
> >> +============================
> >> +NUMA resource associativity
> >> +=============================
> >> +
> >> +Associativity represents the groupings of the various platform resources into
> >> +domains of substantially similar mean performance relative to resources outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources subsets
> >> +are represented as being members of a sub-grouping domain. This performance
> >> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >> +
> >> +PAPR interface currently supports two different ways of communicating these resource
> >
> > You describe form 2 below as well, which contradicts this.
> 
> Fixed as below.
> 
> PAPR interface currently supports different ways of communicating these resource
> grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> associativity grouping. Form 0 is the older format and is now considered deprecated.
> 
> Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.

LGTM.

> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
> >> +Form 0 is the older format and is now considered deprecated.
> >> +
> >> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> +
> >> +Form 0
> >> +-----
> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-----
> >> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
> >> +
> >> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains one or more list of numbers
> >> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
> >> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
> >> +
> >> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
> >
> > I thought we used the *least* significant boundary (the smallest
> > grouping, not the largest).  That is, the last index, not the first.
> >
> > Actually... come to think of it, I'm not even sure how to interpret
> > "most significant".  Does that mean a change in grouping at that "most
> > significant" level results in the largest perfomance difference?
> 
> PAPR defines "most significant" as below
> 
> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
> ity-reference-points” property indicates boundaries between associativity domains presented by the
> “ibm,associativity” property containing “near” and “far” resources. The
> first such boundary in the list represents the 1 based ordinal in the
> associativity lists of the most significant boundary, with subsequent
> entries indicating progressively less significant boundaries

No... that's not a definition.  Like your draft PAPR uses the term
while entirely failing to define it.  From what I can tell about how
it is used the "most significant" boundary corresponds to what Linux
simply thinks of as the node id.  But intuitively, I'd think of that
as the "least significant" boundary, since that's basically the
smallest granularity at which we care about NUMA distances.


> I would interpret it as the boundary where we start defining NUMA
> nodes.

That isn't any clearer to me.

> >> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> >> +recursively comparing if they belong to the same higher-level domains. For mismatch
> >> +at every higher level of the resource group, the kernel doubles the NUMA distance between
> >> +the comparing domains.
> >> +
> >> +Form 2
> >> +-------
> >> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
> >> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
> >> +domain numbering. With numa distance computation now detached from the index value of
> >> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
> >> +same domain index representing resource groups of different
> >> performance/latency characteristics.
> >
> > The meaning of "domain index" is not clear to me here.
> 
> Sorry for the confusion there. domain index is the index where domainID
> is appearing. W.r.t "ibm,associativity"  we have

Ok, I think I eventually deduced that.  We should start out clearly
defining both domainID and index here.

Also.. I think we need to find more distinct terms, because "index" is
being used for both where the ID appears in an associativity array,
and also when an ID appears in the Form2 "lookup-index-table" and the
two usages are totally unconnected.

> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> representing the resource’s platform grouping domains. If we can look at
> an example property.
> 
> { 4, 6, 7, 0, 0}
> { 4, 6, 7, 0, 40}
> 
> With Form 1 both NUMA node 0 and 40 will appear at the same distance.
> They both are at domain index 4. With Form 2 we can represent them with
> different NUMA distance values.

Ok.  Note that PAPR was never clear about what space domain IDs need
to be unique within: do they need to be (a) globally unique (not true
in practice), (b) unique at their index level or (c) unique only
within their "parent" node at the previous index level.

We should take the opportunity with Form2 to make that clearer.

My understanding is that with Form2 it should be entirely feasible to
built a dt have associativity arrays that are always of length 1.  Is
that correct?

> >> +
> >> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
> >> +"ibm,architecture-vec-5" property.
> >> +
> >> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
> >> +the domainIDs present in the system. The offset of the domainID in this property is considered
> >> +the domainID index.
> >
> > You haven't really introduced the term "domainID".  Is "domainID
> > index" the same as "domain index" above?  It's not clear to me.
> 
> The earlier part of the documented said 
> 
> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> representing the resource’s platform grouping domains.
> 
> I will update domain index to domainID index. 
> 
> >
> > The distinction between "domain index" and "primary domain id" is also
> > not clear to me.
> 
> primary domain id is the domainID appearing in the primary domainID
> index. Linux kenrel also use that as the NUMA node number.

nit s/kenrel/kernel/

> Primary domainID index is defined by ibm,associativity-reference-points
> and we consider that as the most significant resource group boundary.
> 
> ibm,associativity-reference-points can be looked at as
> { primary domainID index, secondary domainID index, tertiary domainID index.. }

Ok, explicitly stating that in the doc would help a lot.

> >
> >> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
> >> +N domainID encoded as with encode-int
> >> +
> >> +For ex:
> >> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
> >
> > Above you say "Form 2 allows a large number of primary domain ids at
> > the same domain index", but this encoding doesn't appear to permit
> > that.
> 
> I didn't follow that question.

Ah, that's because I was thinking of index here as the index within
the lookup-index-table, not the index within the associativity
arrays.

> >
> >> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
> >> +distance between resource groups/domains present in the system.
> >> +
> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> >> +
> >> +For ex:
> >> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
> >> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
> >> +
> >> +  | 0    8   40
> >> +--|------------
> >> +  |
> >> +0 | 10   20  80
> >> +  |
> >> +8 | 20   10  160
> >> +  |
> >> +40| 80   160  10
> >
> > What's the reason for multiplying the values by 10 in the expanded
> > table version?
> 
> That was me missing a document update. Since we used 8 bits to encode
> distance at some point we were looking at a SCALE factor. But later
> realized other architectures also restrict distance to 8 bits. I will
> update ibm,numa-distance-table in the document.

Ok.

> >> +
> >> +With Form2 "ibm,associativity" for resources is listed as below:
> >> +
> >> +"ibm,associativity" property for resources in node 0, 8 and 40
> >> +{ 4, 6, 7, 0, 0}
> >> +{ 4, 6, 9, 8, 8}
> >> +{ 4, 6, 7, 0, 40}
> >> +
> >> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
> >> +
> >> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> >
> > What the heck is the secondary domainID
> 
> domainID appearing the secondary domainID index.

I understand that from the clarifications you've made about, but
second domainID index wasn't any more defined in the original draft.

> ibm,associativity-reference-points gives an indication of different
> hierachy of resource grouping as below.
> 
> ibm,associativity-reference-points can be looked at as
> { primary domainID index, secondary domainID index, tertiary domainID index.. }
> 
> >
> >> +the kernel should use when using persistent memory devices. Persistent memory devices
> >> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> >> +the numa node number OS should use when using these devices as regular memory. Secondary
> >> +domainID is the numa node number that should be used when using this device as
> >> +persistent memory. In the later case, we are interested in the locality of the
> >> +device to an established numa node. In the above example, if the last row represents a
> >> +persistent memory device/resource, NUMA node number 40 will be used when using the device
> >> +as regular memory and NUMA node number 0 will be the device numa node when using it as
> >> +a persistent memory device.
> >> +
> >> +Each resource (drcIndex) now also supports additional optional device tree properties.
> >> +These properties are marked optional because the platform can choose not to export
> >> +them and provide the system topology details using the earlier defined device tree
> >> +properties alone. The optional device tree properties are used when adding new resources
> >> +(DLPAR) and when the platform didn't provide the topology details of the domain which
> >> +contains the newly added resource during boot.
> >> +
> >> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
> >> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
> >> +of the new resource can be obtained from the existing "ibm,associativity" property. This
> >> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
> >> +The value is 1 based array index value.
> >
> > Am I correct in thinking that if we have an entirely form2 world, we'd
> > only need this and the ibm,associativity properties could be dropped?
> 
> Not really. ibm,numa-lookup-index-table was added to have a concise
> representation of numa distance via ibm,numa-distance-table. 
> 
> For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
> numa distance. Instead ibm,numa-lookup-index-table allows us to present
> the same in a 3x3 matrix  distance[index0][index1] is the  distance
> between NUMA node 0 and 4 and distance[index0][index2] is the distance
> between NUMA node 0 and 5

Right, I get the purpose of it, and I realized I misphrashed my
question.  My point is that in a Form2 world, the *only* thing the
associativity array is used for is to deduce its position in
lookup-index-table.  Once you have have that for each resource, you
have everything you need, yes?

> 
> 
> >
> >> +
> >> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
> >> +
> >> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
> >> +from this resource domain to other resources.
> >
> > IIUC this is about extending the global distance table with
> > information for a new node.  Is that correct?
> 
> correct.
> 
> >
> > The global distance table allows for the possibility of asymmetric
> > distances between nodes, but this does not.  Is that intentional?
> 
> This also does, For example with 3 nodes currently present and 4 node
> getting added ibm,numa-distance have 8 elements enabling us to have
> distance[Node0][Node50] being different from distance[Node50][Node0]
> as shown below.

Ok that's not clear from the above.  Rather than "one or more lists of
numbers" I think you want to explicitly give two options.  Either one
list, which gives symmetric distances, or two which gives distances
to, then distance from.

> 
> >
> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> >> +
> >> +For ex:
> >> +ibm,associativity     = { 4, 5, 6, 7, 50}
> >> +ibm,numa-lookup-index = { 4 }
> >> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
> >> +
> >> +resulting in a new toplogy as below.
> >> +  | 0    8   40   50
> >> +--|------------------
> >> +  |
> >> +0 | 10   20  80   160
> >> +  |
> >> +8 | 20   10  160  320
> >> +  |
> >> +40| 80   160  10  80
> >> +  |
> >> +50| 160  320  80  10
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-15  5:57     ` Aneesh Kumar K.V
@ 2021-06-15  6:34       ` David Gibson
  2021-06-15  7:05         ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-15  6:34 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 3883 bytes --]

On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> FORM2 introduce a concept of secondary domain which is identical to the
> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
> >> when using persistent memory device. For DAX kmem use the logical domain
> >> id introduced in FORM2. This new numa node
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
> >>  	return nid;
> >>  }
> >>  
> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> >> +{
> >> +	int secondary_index;
> >> +	const __be32 *associativity;
> >> +
> >> +	if (!numa_enabled) {
> >> +		*primary = NUMA_NO_NODE;
> >> +		*secondary = NUMA_NO_NODE;
> >> +		return 0;
> >> +	}
> >> +
> >> +	associativity = of_get_associativity(node);
> >> +	if (!associativity)
> >> +		return -ENODEV;
> >> +
> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >
> > Secondary ID is always the second reference point, but primary depends
> > on the length of resources?  That seems very weird.
> 
> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> both primary and secondary domain ID same for all resources other than
> persistent memory device. The usage w.r.t. persistent memory is
> explained in patch 7.

Right, I misunderstood

> 
> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> the kernel should use when using persistent memory devices.

This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of heirarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.

> Persistent memory devices
> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> the numa node number OS should use when using these devices as regular memory. Secondary
> domainID is the numa node number that should be used when using this device as
> persistent memory.

It's weird to me that you'd want to consider them in different nodes
for those different purposes.

> In the later case, we are interested in the locality of the
> device to an established numa node. In the above example, if the last row represents a
> persistent memory device/resource, NUMA node number 40 will be used when using the device
> as regular memory and NUMA node number 0 will be the device numa node when using it as
> a persistent memory device.

I don't really get what you mean by "locality of the device to an
established numa node".  Or at least how that's different from
anything else we're handling here.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-15  6:34       ` David Gibson
@ 2021-06-15  7:05         ` Aneesh Kumar K.V
  2021-06-17  7:46           ` David Gibson
  0 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  7:05 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>> >> FORM2 introduce a concept of secondary domain which is identical to the
>> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
>> >> when using persistent memory device. For DAX kmem use the logical domain
>> >> id introduced in FORM2. This new numa node
>> >> 
>> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> >> ---
>> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>> >>  3 files changed, 45 insertions(+), 10 deletions(-)
>> >> 
>> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> >> index 86cd2af014f7..b9ac6d02e944 100644
>> >> --- a/arch/powerpc/mm/numa.c
>> >> +++ b/arch/powerpc/mm/numa.c
>> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>> >>  	return nid;
>> >>  }
>> >>  
>> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>> >> +{
>> >> +	int secondary_index;
>> >> +	const __be32 *associativity;
>> >> +
>> >> +	if (!numa_enabled) {
>> >> +		*primary = NUMA_NO_NODE;
>> >> +		*secondary = NUMA_NO_NODE;
>> >> +		return 0;
>> >> +	}
>> >> +
>> >> +	associativity = of_get_associativity(node);
>> >> +	if (!associativity)
>> >> +		return -ENODEV;
>> >> +
>> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>> >
>> > Secondary ID is always the second reference point, but primary depends
>> > on the length of resources?  That seems very weird.
>> 
>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>> both primary and secondary domain ID same for all resources other than
>> persistent memory device. The usage w.r.t. persistent memory is
>> explained in patch 7.
>
> Right, I misunderstood
>
>> 
>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>> the kernel should use when using persistent memory devices.
>
> This seems kind of bogus.  With Form1, the primary/secondary ID are a
> sort of heirarchy of distance (things with same primary ID are very
> close, things with same secondary are kinda-close, etc.).  With Form2,
> it's referring to their effective node for different purposes.
>
> Using the same terms for different meanings seems unnecessarily
> confusing.

They are essentially domainIDs. The interpretation of them are different
between Form1 and Form2. Hence I kept referring to them as primary and
secondary domainID. Any suggestion on what to name them with Form2?

>
>> Persistent memory devices
>> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>> the numa node number OS should use when using these devices as regular memory. Secondary
>> domainID is the numa node number that should be used when using this device as
>> persistent memory.
>
> It's weird to me that you'd want to consider them in different nodes
> for those different purposes.


   --------------------------------------
  |                            NUMA node0 |
  |    ProcA -----> MEMA                  |
  |     |                                 |
  |	|                                 |
  |	-------------------> PMEMB        |
  |                                       |
   ---------------------------------------

   ---------------------------------------
  |                            NUMA node1 |
  |                                       |
  |    ProcB -------> MEMC                |
  |	|                                 |
  |	-------------------> PMEMD        |
  |                                       |
  |                                       |
   ---------------------------------------
 

For a topology like the above application running of ProcA wants to find out
persistent memory mount local to its NUMA node. Hence when using it as
pmem fsdax mount or devdax device we want PMEMB to have associativity
of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
we want to use it as memory using dax kmem driver, we want both PMEMB
and PMEMD to appear as memory only NUMA node at a distance that is
derived based on the latency of the media. 

>
>> In the later case, we are interested in the locality of the
>> device to an established numa node. In the above example, if the last row represents a
>> persistent memory device/resource, NUMA node number 40 will be used when using the device
>> as regular memory and NUMA node number 0 will be the device numa node when using it as
>> a persistent memory device.
>
> I don't really get what you mean by "locality of the device to an
> established numa node".  Or at least how that's different from
> anything else we're handling here.


-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-15  6:25       ` David Gibson
@ 2021-06-15  7:40         ` Aneesh Kumar K.V
  2021-06-17  7:50           ` David Gibson
  0 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  7:40 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
>> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
>> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> >> ---
>> >>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
>> >>  arch/powerpc/include/asm/firmware.h       |   3 +-
>> >>  arch/powerpc/include/asm/prom.h           |   1 +
>> >>  arch/powerpc/kernel/prom_init.c           |   3 +-
>> >>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
>> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>> >>  6 files changed, 290 insertions(+), 6 deletions(-)
>> >>  create mode 100644 Documentation/powerpc/associativity.rst
>> >> 
>> >> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
>> >> new file mode 100644
>> >> index 000000000000..58abedea81d7
>> >> --- /dev/null
>> >> +++ b/Documentation/powerpc/associativity.rst
>> >> @@ -0,0 +1,139 @@
>> >> +============================
>> >> +NUMA resource associativity
>> >> +=============================
>> >> +
>> >> +Associativity represents the groupings of the various platform resources into
>> >> +domains of substantially similar mean performance relative to resources outside
>> >> +of that domain. Resources subsets of a given domain that exhibit better
>> >> +performance relative to each other than relative to other resources subsets
>> >> +are represented as being members of a sub-grouping domain. This performance
>> >> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
>> >> +From the platform view, these groups are also referred to as domains.
>> >> +
>> >> +PAPR interface currently supports two different ways of communicating these resource
>> >
>> > You describe form 2 below as well, which contradicts this.
>> 
>> Fixed as below.
>> 
>> PAPR interface currently supports different ways of communicating these resource
>> grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
>> associativity grouping. Form 0 is the older format and is now considered deprecated.
>> 
>> Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
>> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
>> A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
>> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
>
> LGTM.
>
>> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
>> >> +Form 0 is the older format and is now considered deprecated.
>> >> +
>> >> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
>> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
>> >> +A value of 1 indicates the usage of Form 1 associativity.
>> >> +
>> >> +Form 0
>> >> +-----
>> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
>> >> +
>> >> +Form 1
>> >> +-----
>> >> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
>> >> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
>> >> +
>> >> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
>> >> +representing the resource’s platform grouping domains.
>> >> +
>> >> +The “ibm,associativity-reference-points” property contains one or more list of numbers
>> >> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
>> >> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
>> >> +
>> >> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
>> >
>> > I thought we used the *least* significant boundary (the smallest
>> > grouping, not the largest).  That is, the last index, not the first.
>> >
>> > Actually... come to think of it, I'm not even sure how to interpret
>> > "most significant".  Does that mean a change in grouping at that "most
>> > significant" level results in the largest perfomance difference?
>> 
>> PAPR defines "most significant" as below
>> 
>> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
>> ity-reference-points” property indicates boundaries between associativity domains presented by the
>> “ibm,associativity” property containing “near” and “far” resources. The
>> first such boundary in the list represents the 1 based ordinal in the
>> associativity lists of the most significant boundary, with subsequent
>> entries indicating progressively less significant boundaries
>
> No... that's not a definition.  Like your draft PAPR uses the term
> while entirely failing to define it.  From what I can tell about how
> it is used the "most significant" boundary corresponds to what Linux
> simply thinks of as the node id.  But intuitively, I'd think of that
> as the "least significant" boundary, since that's basically the
> smallest granularity at which we care about NUMA distances.
>
>
>> I would interpret it as the boundary where we start defining NUMA
>> nodes.
>
> That isn't any clearer to me.

How about calling it least significant boundary then? 

The “ibm,associativity-reference-points” property contains one or more list of numbers
(domainID index) that represents the 1 based ordinal in the associativity lists of the
least significant boundary, with subsequent entries indicating progressively higher
significant boundaries.

ex:
{ primary domainID index, secondary domainID index, tertiary domainID index.. }

Linux kernel uses the domainID of the least significant boundary (aka primary domain)
as the NUMA node id. Linux kernel computes NUMA distance between two domains by
recursively comparing if they belong to the same higher-level domains. For mismatch
at every higher level of the resource group, the kernel doubles the NUMA distance between
the comparing domains.

>
>> >> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
>> >> +recursively comparing if they belong to the same higher-level domains. For mismatch
>> >> +at every higher level of the resource group, the kernel doubles the NUMA distance between
>> >> +the comparing domains.
>> >> +
>> >> +Form 2
>> >> +-------
>> >> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
>> >> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
>> >> +domain numbering. With numa distance computation now detached from the index value of
>> >> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
>> >> +same domain index representing resource groups of different
>> >> performance/latency characteristics.
>> >
>> > The meaning of "domain index" is not clear to me here.
>> 
>> Sorry for the confusion there. domain index is the index where domainID
>> is appearing. W.r.t "ibm,associativity"  we have
>
> Ok, I think I eventually deduced that.  We should start out clearly
> defining both domainID and index here.
>
> Also.. I think we need to find more distinct terms, because "index" is
> being used for both where the ID appears in an associativity array,
> and also when an ID appears in the Form2 "lookup-index-table" and the
> two usages are totally unconnected.
>
>> The “ibm,associativity” property contains one or more lists of numbers (domainID)
>> representing the resource’s platform grouping domains. If we can look at
>> an example property.
>> 
>> { 4, 6, 7, 0, 0}
>> { 4, 6, 7, 0, 40}
>> 
>> With Form 1 both NUMA node 0 and 40 will appear at the same distance.
>> They both are at domain index 4. With Form 2 we can represent them with
>> different NUMA distance values.
>
> Ok.  Note that PAPR was never clear about what space domain IDs need
> to be unique within: do they need to be (a) globally unique (not true
> in practice), (b) unique at their index level or (c) unique only
> within their "parent" node at the previous index level.
>
> We should take the opportunity with Form2 to make that clearer.
>
> My understanding is that with Form2 it should be entirely feasible to
> built a dt have associativity arrays that are always of length 1.  Is
> that correct?

Correct, unless you have persistent memory device attached in which case
you need two entries.

>
>> >> +
>> >> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
>> >> +"ibm,architecture-vec-5" property.
>> >> +
>> >> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
>> >> +the domainIDs present in the system. The offset of the domainID in this property is considered
>> >> +the domainID index.
>> >
>> > You haven't really introduced the term "domainID".  Is "domainID
>> > index" the same as "domain index" above?  It's not clear to me.
>> 
>> The earlier part of the documented said 
>> 
>> The “ibm,associativity” property contains one or more lists of numbers (domainID)
>> representing the resource’s platform grouping domains.
>> 
>> I will update domain index to domainID index. 
>> 
>> >
>> > The distinction between "domain index" and "primary domain id" is also
>> > not clear to me.
>> 
>> primary domain id is the domainID appearing in the primary domainID
>> index. Linux kenrel also use that as the NUMA node number.
>
> nit s/kenrel/kernel/
>
>> Primary domainID index is defined by ibm,associativity-reference-points
>> and we consider that as the most significant resource group boundary.
>> 
>> ibm,associativity-reference-points can be looked at as
>> { primary domainID index, secondary domainID index, tertiary domainID index.. }
>
> Ok, explicitly stating that in the doc would help a lot.
>
>> >
>> >> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
>> >> +N domainID encoded as with encode-int
>> >> +
>> >> +For ex:
>> >> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
>> >
>> > Above you say "Form 2 allows a large number of primary domain ids at
>> > the same domain index", but this encoding doesn't appear to permit
>> > that.
>> 
>> I didn't follow that question.
>
> Ah, that's because I was thinking of index here as the index within
> the lookup-index-table, not the index within the associativity
> arrays.
>
>> >
>> >> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
>> >> +distance between resource groups/domains present in the system.
>> >> +
>> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> >> +
>> >> +For ex:
>> >> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
>> >> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
>> >> +
>> >> +  | 0    8   40
>> >> +--|------------
>> >> +  |
>> >> +0 | 10   20  80
>> >> +  |
>> >> +8 | 20   10  160
>> >> +  |
>> >> +40| 80   160  10
>> >
>> > What's the reason for multiplying the values by 10 in the expanded
>> > table version?
>> 
>> That was me missing a document update. Since we used 8 bits to encode
>> distance at some point we were looking at a SCALE factor. But later
>> realized other architectures also restrict distance to 8 bits. I will
>> update ibm,numa-distance-table in the document.
>
> Ok.
>
>> >> +
>> >> +With Form2 "ibm,associativity" for resources is listed as below:
>> >> +
>> >> +"ibm,associativity" property for resources in node 0, 8 and 40
>> >> +{ 4, 6, 7, 0, 0}
>> >> +{ 4, 6, 9, 8, 8}
>> >> +{ 4, 6, 7, 0, 40}
>> >> +
>> >> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
>> >> +
>> >> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>> >
>> > What the heck is the secondary domainID
>> 
>> domainID appearing the secondary domainID index.
>
> I understand that from the clarifications you've made about, but
> second domainID index wasn't any more defined in the original draft.
>
>> ibm,associativity-reference-points gives an indication of different
>> hierachy of resource grouping as below.
>> 
>> ibm,associativity-reference-points can be looked at as
>> { primary domainID index, secondary domainID index, tertiary domainID index.. }
>> 
>> >
>> >> +the kernel should use when using persistent memory devices. Persistent memory devices
>> >> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>> >> +the numa node number OS should use when using these devices as regular memory. Secondary
>> >> +domainID is the numa node number that should be used when using this device as
>> >> +persistent memory. In the later case, we are interested in the locality of the
>> >> +device to an established numa node. In the above example, if the last row represents a
>> >> +persistent memory device/resource, NUMA node number 40 will be used when using the device
>> >> +as regular memory and NUMA node number 0 will be the device numa node when using it as
>> >> +a persistent memory device.
>> >> +
>> >> +Each resource (drcIndex) now also supports additional optional device tree properties.
>> >> +These properties are marked optional because the platform can choose not to export
>> >> +them and provide the system topology details using the earlier defined device tree
>> >> +properties alone. The optional device tree properties are used when adding new resources
>> >> +(DLPAR) and when the platform didn't provide the topology details of the domain which
>> >> +contains the newly added resource during boot.
>> >> +
>> >> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
>> >> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
>> >> +of the new resource can be obtained from the existing "ibm,associativity" property. This
>> >> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
>> >> +The value is 1 based array index value.
>> >
>> > Am I correct in thinking that if we have an entirely form2 world, we'd
>> > only need this and the ibm,associativity properties could be dropped?
>> 
>> Not really. ibm,numa-lookup-index-table was added to have a concise
>> representation of numa distance via ibm,numa-distance-table. 
>> 
>> For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
>> numa distance. Instead ibm,numa-lookup-index-table allows us to present
>> the same in a 3x3 matrix  distance[index0][index1] is the  distance
>> between NUMA node 0 and 4 and distance[index0][index2] is the distance
>> between NUMA node 0 and 5
>
> Right, I get the purpose of it, and I realized I misphrashed my
> question.  My point is that in a Form2 world, the *only* thing the
> associativity array is used for is to deduce its position in
> lookup-index-table.  Once you have have that for each resource, you
> have everything you need, yes?


ibm,associativity is used find the domainID/NUMA node id of the
resource.

ibm,lookup-index-table is used compute the distance information between
NUMA nodes using ibm,numa-distance-table.


>
>> 
>> 
>> >
>> >> +
>> >> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
>> >> +
>> >> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
>> >> +from this resource domain to other resources.
>> >
>> > IIUC this is about extending the global distance table with
>> > information for a new node.  Is that correct?
>> 
>> correct.
>> 
>> >
>> > The global distance table allows for the possibility of asymmetric
>> > distances between nodes, but this does not.  Is that intentional?
>> 
>> This also does, For example with 3 nodes currently present and 4 node
>> getting added ibm,numa-distance have 8 elements enabling us to have
>> distance[Node0][Node50] being different from distance[Node50][Node0]
>> as shown below.
>
> Ok that's not clear from the above.  Rather than "one or more lists of
> numbers" I think you want to explicitly give two options.  Either one
> list, which gives symmetric distances, or two which gives distances
> to, then distance from.
>
>> 
>> >
>> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> >> +
>> >> +For ex:
>> >> +ibm,associativity     = { 4, 5, 6, 7, 50}
>> >> +ibm,numa-lookup-index = { 4 }
>> >> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
>> >> +
>> >> +resulting in a new toplogy as below.
>> >> +  | 0    8   40   50
>> >> +--|------------------
>> >> +  |
>> >> +0 | 10   20  80   160
>> >> +  |
>> >> +8 | 20   10  160  320
>> >> +  |
>> >> +40| 80   160  10  80
>> >> +  |
>> >> +50| 160  320  80  10
>> 
>

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index
  2021-06-15  3:00   ` David Gibson
@ 2021-06-15  8:21     ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  8:21 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

On 6/15/21 8:30 AM, David Gibson wrote:
> On Mon, Jun 14, 2021 at 10:09:56PM +0530, Aneesh Kumar K.V wrote:
>> No functional change in this patch.
> 
> I think this needs a rationale as to why 'primary_domain_index' is a
> better name than 'min_common_depth'.  The meaning isn't obvious to me
> from either name.
> 
>

The documentation added in patch 7 explains the name to some extent. Is 
that sufficient?

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index
  2021-06-15  3:01   ` David Gibson
@ 2021-06-15  8:22     ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  8:22 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

On 6/15/21 8:31 AM, David Gibson wrote:
> On Mon, Jun 14, 2021 at 10:09:57PM +0530, Aneesh Kumar K.V wrote:
>> No functional change in this patch
> 
> As with 1/8 an explanation of what this actually means and therefore
> why this is a better name would be very helpful.
> 
>>
>>

how about max_associativity_domain_index? The documentation added in 
patch 7 explains the details.  Let me know if we need to add more in 
this patch.

-aneesh


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update
  2021-06-15  3:13   ` David Gibson
@ 2021-06-15  8:26     ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-15  8:26 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

On 6/15/21 8:43 AM, David Gibson wrote:
> On Mon, Jun 14, 2021 at 10:09:59PM +0530, Aneesh Kumar K.V wrote:
>> The associativity details of the newly added resourced are collected from
>> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
>> distance details of the newly added numa node after the above call. In
>> later patch we will remove updating NUMA distance when we are looking
>> for node id from associativity array.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   arch/powerpc/mm/numa.c                        | 41 +++++++++++++++++++
>>   arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
>>   .../platforms/pseries/hotplug-memory.c        |  2 +
>>   arch/powerpc/platforms/pseries/pseries.h      |  1 +
>>   4 files changed, 46 insertions(+)
>>
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 192067991f8a..fec47981c1ef 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
>>   }
>>   EXPORT_SYMBOL(of_node_to_nid);
>>   
>> +static void __initialize_form1_numa_distance(const __be32 *associativity)
>> +{
>> +	int i, nid;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		nid = of_read_number(&associativity[primary_domain_index], 1);
>> +
>> +		for (i = 0; i < max_domain_index; i++) {
>> +			const __be32 *entry;
>> +
>> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
>> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
>> +		}
>> +	}
>> +}
> 
> This logic is almost identicaly to initialize_distance_lookup_table()
> - it would be good if they could be consolidated, so it's clear that
> coldplugged and hotplugged nodes are parsing the NUMA information in
> the same way.

initialize_distance_lookup_table() gets removed in the next patch.

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-15  7:05         ` Aneesh Kumar K.V
@ 2021-06-17  7:46           ` David Gibson
  2021-06-17 10:53             ` Daniel Henrique Barboza
                               ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: David Gibson @ 2021-06-17  7:46 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5562 bytes --]

On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson <david@gibson.dropbear.id.au> writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> >> FORM2 introduce a concept of secondary domain which is identical to the
> >> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
> >> >> when using persistent memory device. For DAX kmem use the logical domain
> >> >> id introduced in FORM2. This new numa node
> >> >> 
> >> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> >> ---
> >> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
> >> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
> >> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> >> 
> >> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> >> --- a/arch/powerpc/mm/numa.c
> >> >> +++ b/arch/powerpc/mm/numa.c
> >> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
> >> >>  	return nid;
> >> >>  }
> >> >>  
> >> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> >> >> +{
> >> >> +	int secondary_index;
> >> >> +	const __be32 *associativity;
> >> >> +
> >> >> +	if (!numa_enabled) {
> >> >> +		*primary = NUMA_NO_NODE;
> >> >> +		*secondary = NUMA_NO_NODE;
> >> >> +		return 0;
> >> >> +	}
> >> >> +
> >> >> +	associativity = of_get_associativity(node);
> >> >> +	if (!associativity)
> >> >> +		return -ENODEV;
> >> >> +
> >> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >> >
> >> > Secondary ID is always the second reference point, but primary depends
> >> > on the length of resources?  That seems very weird.
> >> 
> >> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> >> both primary and secondary domain ID same for all resources other than
> >> persistent memory device. The usage w.r.t. persistent memory is
> >> explained in patch 7.
> >
> > Right, I misunderstood
> >
> >> 
> >> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> >> the kernel should use when using persistent memory devices.
> >
> > This seems kind of bogus.  With Form1, the primary/secondary ID are a
> > sort of heirarchy of distance (things with same primary ID are very
> > close, things with same secondary are kinda-close, etc.).  With Form2,
> > it's referring to their effective node for different purposes.
> >
> > Using the same terms for different meanings seems unnecessarily
> > confusing.
> 
> They are essentially domainIDs. The interpretation of them are different
> between Form1 and Form2. Hence I kept referring to them as primary and
> secondary domainID. Any suggestion on what to name them with Form2?

My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.

> >> Persistent memory devices
> >> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> >> the numa node number OS should use when using these devices as regular memory. Secondary
> >> domainID is the numa node number that should be used when using this device as
> >> persistent memory.
> >
> > It's weird to me that you'd want to consider them in different nodes
> > for those different purposes.
> 
> 
>    --------------------------------------
>   |                            NUMA node0 |
>   |    ProcA -----> MEMA                  |
>   |     |                                 |
>   |	|                                 |
>   |	-------------------> PMEMB        |
>   |                                       |
>    ---------------------------------------
> 
>    ---------------------------------------
>   |                            NUMA node1 |
>   |                                       |
>   |    ProcB -------> MEMC                |
>   |	|                                 |
>   |	-------------------> PMEMD        |
>   |                                       |
>   |                                       |
>    ---------------------------------------
>  
> 
> For a topology like the above application running of ProcA wants to find out
> persistent memory mount local to its NUMA node. Hence when using it as
> pmem fsdax mount or devdax device we want PMEMB to have associativity
> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> we want to use it as memory using dax kmem driver, we want both PMEMB
> and PMEMD to appear as memory only NUMA node at a distance that is
> derived based on the latency of the media.

I'm still not understanding why the latency we care about is different
in the two cases.  Can you give an example of when this would result
in different actual node assignments for the two different cases?

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-15  7:40         ` Aneesh Kumar K.V
@ 2021-06-17  7:50           ` David Gibson
  2021-06-17 10:46             ` Aneesh Kumar K.V
  0 siblings, 1 reply; 38+ messages in thread
From: David Gibson @ 2021-06-17  7:50 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 17413 bytes --]

On Tue, Jun 15, 2021 at 01:10:27PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson <david@gibson.dropbear.id.au> writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> >> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> >> ---
> >> >>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
> >> >>  arch/powerpc/include/asm/firmware.h       |   3 +-
> >> >>  arch/powerpc/include/asm/prom.h           |   1 +
> >> >>  arch/powerpc/kernel/prom_init.c           |   3 +-
> >> >>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
> >> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> >> 
> >> >> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
> >> >> new file mode 100644
> >> >> index 000000000000..58abedea81d7
> >> >> --- /dev/null
> >> >> +++ b/Documentation/powerpc/associativity.rst
> >> >> @@ -0,0 +1,139 @@
> >> >> +============================
> >> >> +NUMA resource associativity
> >> >> +=============================
> >> >> +
> >> >> +Associativity represents the groupings of the various platform resources into
> >> >> +domains of substantially similar mean performance relative to resources outside
> >> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> >> +performance relative to each other than relative to other resources subsets
> >> >> +are represented as being members of a sub-grouping domain. This performance
> >> >> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
> >> >> +From the platform view, these groups are also referred to as domains.
> >> >> +
> >> >> +PAPR interface currently supports two different ways of communicating these resource
> >> >
> >> > You describe form 2 below as well, which contradicts this.
> >> 
> >> Fixed as below.
> >> 
> >> PAPR interface currently supports different ways of communicating these resource
> >> grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> >> associativity grouping. Form 0 is the older format and is now considered deprecated.
> >> 
> >> Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> >> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> >> A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
> >> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >
> > LGTM.
> >
> >> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
> >> >> +Form 0 is the older format and is now considered deprecated.
> >> >> +
> >> >> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> >> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> >> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> >> +
> >> >> +Form 0
> >> >> +-----
> >> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> >> >> +
> >> >> +Form 1
> >> >> +-----
> >> >> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
> >> >> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
> >> >> +
> >> >> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
> >> >> +representing the resource’s platform grouping domains.
> >> >> +
> >> >> +The “ibm,associativity-reference-points” property contains one or more list of numbers
> >> >> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
> >> >> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
> >> >> +
> >> >> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
> >> >
> >> > I thought we used the *least* significant boundary (the smallest
> >> > grouping, not the largest).  That is, the last index, not the first.
> >> >
> >> > Actually... come to think of it, I'm not even sure how to interpret
> >> > "most significant".  Does that mean a change in grouping at that "most
> >> > significant" level results in the largest perfomance difference?
> >> 
> >> PAPR defines "most significant" as below
> >> 
> >> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
> >> ity-reference-points” property indicates boundaries between associativity domains presented by the
> >> “ibm,associativity” property containing “near” and “far” resources. The
> >> first such boundary in the list represents the 1 based ordinal in the
> >> associativity lists of the most significant boundary, with subsequent
> >> entries indicating progressively less significant boundaries
> >
> > No... that's not a definition.  Like your draft PAPR uses the term
> > while entirely failing to define it.  From what I can tell about how
> > it is used the "most significant" boundary corresponds to what Linux
> > simply thinks of as the node id.  But intuitively, I'd think of that
> > as the "least significant" boundary, since that's basically the
> > smallest granularity at which we care about NUMA distances.
> >
> >
> >> I would interpret it as the boundary where we start defining NUMA
> >> nodes.
> >
> > That isn't any clearer to me.
> 
> How about calling it least significant boundary then?

Heck, no.  My whole point here is that the meaning is unclear: my
first guess at the meaning is different from whoever wrote that text.
We need to come up with a way of describing it that's clearer.

> The “ibm,associativity-reference-points” property contains one or more list of numbers
> (domainID index) that represents the 1 based ordinal in the associativity lists of the
> least significant boundary, with subsequent entries indicating progressively higher
> significant boundaries.
> 
> ex:
> { primary domainID index, secondary domainID index, tertiary domainID index.. }
> 
> Linux kernel uses the domainID of the least significant boundary (aka primary domain)
> as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> recursively comparing if they belong to the same higher-level domains. For mismatch
> at every higher level of the resource group, the kernel doubles the NUMA distance between
> the comparing domains.
> 
> >
> >> >> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> >> >> +recursively comparing if they belong to the same higher-level domains. For mismatch
> >> >> +at every higher level of the resource group, the kernel doubles the NUMA distance between
> >> >> +the comparing domains.
> >> >> +
> >> >> +Form 2
> >> >> +-------
> >> >> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
> >> >> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
> >> >> +domain numbering. With numa distance computation now detached from the index value of
> >> >> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
> >> >> +same domain index representing resource groups of different
> >> >> performance/latency characteristics.
> >> >
> >> > The meaning of "domain index" is not clear to me here.
> >> 
> >> Sorry for the confusion there. domain index is the index where domainID
> >> is appearing. W.r.t "ibm,associativity"  we have
> >
> > Ok, I think I eventually deduced that.  We should start out clearly
> > defining both domainID and index here.
> >
> > Also.. I think we need to find more distinct terms, because "index" is
> > being used for both where the ID appears in an associativity array,
> > and also when an ID appears in the Form2 "lookup-index-table" and the
> > two usages are totally unconnected.
> >
> >> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> >> representing the resource’s platform grouping domains. If we can look at
> >> an example property.
> >> 
> >> { 4, 6, 7, 0, 0}
> >> { 4, 6, 7, 0, 40}
> >> 
> >> With Form 1 both NUMA node 0 and 40 will appear at the same distance.
> >> They both are at domain index 4. With Form 2 we can represent them with
> >> different NUMA distance values.
> >
> > Ok.  Note that PAPR was never clear about what space domain IDs need
> > to be unique within: do they need to be (a) globally unique (not true
> > in practice), (b) unique at their index level or (c) unique only
> > within their "parent" node at the previous index level.
> >
> > We should take the opportunity with Form2 to make that clearer.
> >
> > My understanding is that with Form2 it should be entirely feasible to
> > built a dt have associativity arrays that are always of length 1.  Is
> > that correct?
> 
> Correct, unless you have persistent memory device attached in which case
> you need two entries.
> 
> >
> >> >> +
> >> >> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
> >> >> +"ibm,architecture-vec-5" property.
> >> >> +
> >> >> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
> >> >> +the domainIDs present in the system. The offset of the domainID in this property is considered
> >> >> +the domainID index.
> >> >
> >> > You haven't really introduced the term "domainID".  Is "domainID
> >> > index" the same as "domain index" above?  It's not clear to me.
> >> 
> >> The earlier part of the documented said 
> >> 
> >> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> >> representing the resource’s platform grouping domains.
> >> 
> >> I will update domain index to domainID index. 
> >> 
> >> >
> >> > The distinction between "domain index" and "primary domain id" is also
> >> > not clear to me.
> >> 
> >> primary domain id is the domainID appearing in the primary domainID
> >> index. Linux kenrel also use that as the NUMA node number.
> >
> > nit s/kenrel/kernel/
> >
> >> Primary domainID index is defined by ibm,associativity-reference-points
> >> and we consider that as the most significant resource group boundary.
> >> 
> >> ibm,associativity-reference-points can be looked at as
> >> { primary domainID index, secondary domainID index, tertiary domainID index.. }
> >
> > Ok, explicitly stating that in the doc would help a lot.
> >
> >> >
> >> >> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
> >> >> +N domainID encoded as with encode-int
> >> >> +
> >> >> +For ex:
> >> >> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
> >> >
> >> > Above you say "Form 2 allows a large number of primary domain ids at
> >> > the same domain index", but this encoding doesn't appear to permit
> >> > that.
> >> 
> >> I didn't follow that question.
> >
> > Ah, that's because I was thinking of index here as the index within
> > the lookup-index-table, not the index within the associativity
> > arrays.
> >
> >> >
> >> >> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
> >> >> +distance between resource groups/domains present in the system.
> >> >> +
> >> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> >> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> >> >> +
> >> >> +For ex:
> >> >> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
> >> >> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
> >> >> +
> >> >> +  | 0    8   40
> >> >> +--|------------
> >> >> +  |
> >> >> +0 | 10   20  80
> >> >> +  |
> >> >> +8 | 20   10  160
> >> >> +  |
> >> >> +40| 80   160  10
> >> >
> >> > What's the reason for multiplying the values by 10 in the expanded
> >> > table version?
> >> 
> >> That was me missing a document update. Since we used 8 bits to encode
> >> distance at some point we were looking at a SCALE factor. But later
> >> realized other architectures also restrict distance to 8 bits. I will
> >> update ibm,numa-distance-table in the document.
> >
> > Ok.
> >
> >> >> +
> >> >> +With Form2 "ibm,associativity" for resources is listed as below:
> >> >> +
> >> >> +"ibm,associativity" property for resources in node 0, 8 and 40
> >> >> +{ 4, 6, 7, 0, 0}
> >> >> +{ 4, 6, 9, 8, 8}
> >> >> +{ 4, 6, 7, 0, 40}
> >> >> +
> >> >> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
> >> >> +
> >> >> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> >> >
> >> > What the heck is the secondary domainID
> >> 
> >> domainID appearing the secondary domainID index.
> >
> > I understand that from the clarifications you've made about, but
> > second domainID index wasn't any more defined in the original draft.
> >
> >> ibm,associativity-reference-points gives an indication of different
> >> hierachy of resource grouping as below.
> >> 
> >> ibm,associativity-reference-points can be looked at as
> >> { primary domainID index, secondary domainID index, tertiary domainID index.. }
> >> 
> >> >
> >> >> +the kernel should use when using persistent memory devices. Persistent memory devices
> >> >> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> >> >> +the numa node number OS should use when using these devices as regular memory. Secondary
> >> >> +domainID is the numa node number that should be used when using this device as
> >> >> +persistent memory. In the later case, we are interested in the locality of the
> >> >> +device to an established numa node. In the above example, if the last row represents a
> >> >> +persistent memory device/resource, NUMA node number 40 will be used when using the device
> >> >> +as regular memory and NUMA node number 0 will be the device numa node when using it as
> >> >> +a persistent memory device.
> >> >> +
> >> >> +Each resource (drcIndex) now also supports additional optional device tree properties.
> >> >> +These properties are marked optional because the platform can choose not to export
> >> >> +them and provide the system topology details using the earlier defined device tree
> >> >> +properties alone. The optional device tree properties are used when adding new resources
> >> >> +(DLPAR) and when the platform didn't provide the topology details of the domain which
> >> >> +contains the newly added resource during boot.
> >> >> +
> >> >> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
> >> >> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
> >> >> +of the new resource can be obtained from the existing "ibm,associativity" property. This
> >> >> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
> >> >> +The value is 1 based array index value.
> >> >
> >> > Am I correct in thinking that if we have an entirely form2 world, we'd
> >> > only need this and the ibm,associativity properties could be dropped?
> >> 
> >> Not really. ibm,numa-lookup-index-table was added to have a concise
> >> representation of numa distance via ibm,numa-distance-table. 
> >> 
> >> For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
> >> numa distance. Instead ibm,numa-lookup-index-table allows us to present
> >> the same in a 3x3 matrix  distance[index0][index1] is the  distance
> >> between NUMA node 0 and 4 and distance[index0][index2] is the distance
> >> between NUMA node 0 and 5
> >
> > Right, I get the purpose of it, and I realized I misphrashed my
> > question.  My point is that in a Form2 world, the *only* thing the
> > associativity array is used for is to deduce its position in
> > lookup-index-table.  Once you have have that for each resource, you
> > have everything you need, yes?
> 
> 
> ibm,associativity is used find the domainID/NUMA node id of the
> resource.
> 
> ibm,lookup-index-table is used compute the distance information between
> NUMA nodes using ibm,numa-distance-table.

I get that you need to use lookup-index-table to work out how to
interpret numa-distance-table.  My point is that IIUC once you've done
the lookup in lookup-index-table once for each associativity array
value, the number you get out (which just a compacted version of the
node id) should be all you need ever again.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
  2021-06-17  7:50           ` David Gibson
@ 2021-06-17 10:46             ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 10:46 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

On 6/17/21 1:20 PM, David Gibson wrote:
> On Tue, Jun 15, 2021 at 01:10:27PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:


....

>>>> PAPR defines "most significant" as below
>>>>
>>>> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
>>>> ity-reference-points” property indicates boundaries between associativity domains presented by the
>>>> “ibm,associativity” property containing “near” and “far” resources. The
>>>> first such boundary in the list represents the 1 based ordinal in the
>>>> associativity lists of the most significant boundary, with subsequent
>>>> entries indicating progressively less significant boundaries
>>>
>>> No... that's not a definition.  Like your draft PAPR uses the term
>>> while entirely failing to define it.  From what I can tell about how
>>> it is used the "most significant" boundary corresponds to what Linux
>>> simply thinks of as the node id.  But intuitively, I'd think of that
>>> as the "least significant" boundary, since that's basically the
>>> smallest granularity at which we care about NUMA distances.
>>>
>>>
>>>> I would interpret it as the boundary where we start defining NUMA
>>>> nodes.
>>>
>>> That isn't any clearer to me.
>>
>> How about calling it least significant boundary then?
> 
> Heck, no.  My whole point here is that the meaning is unclear: my
> first guess at the meaning is different from whoever wrote that text.
> We need to come up with a way of describing it that's clearer.
> 
>> The “ibm,associativity-reference-points” property contains one or more list of numbers
>> (domainID index) that represents the 1 based ordinal in the associativity lists of the
>> least significant boundary, with subsequent entries indicating progressively higher
>> significant boundaries.
>>
>> ex:
>> { primary domainID index, secondary domainID index, tertiary domainID index.. }
>>
>> Linux kernel uses the domainID of the least significant boundary (aka primary domain)
>> as the NUMA node id. Linux kernel computes NUMA distance between two domains by
>> recursively comparing if they belong to the same higher-level domains. For mismatch
>> at every higher level of the resource group, the kernel doubles the NUMA distance between
>> the comparing domains.
>>
>

Any suggestion on how to reword the above section then? We could say
associativity-reference-points is list of domainID index representing 
increasing hierarchy of resource group. I am not sure that explains it 
any better?

....
>>>> For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
>>>> numa distance. Instead ibm,numa-lookup-index-table allows us to present
>>>> the same in a 3x3 matrix  distance[index0][index1] is the  distance
>>>> between NUMA node 0 and 4 and distance[index0][index2] is the distance
>>>> between NUMA node 0 and 5
>>>
>>> Right, I get the purpose of it, and I realized I misphrashed my
>>> question.  My point is that in a Form2 world, the *only* thing the
>>> associativity array is used for is to deduce its position in
>>> lookup-index-table.  Once you have have that for each resource, you
>>> have everything you need, yes?
>>
>>
>> ibm,associativity is used find the domainID/NUMA node id of the
>> resource.
>>
>> ibm,lookup-index-table is used compute the distance information between
>> NUMA nodes using ibm,numa-distance-table.
> 
> I get that you need to use lookup-index-table to work out how to
> interpret numa-distance-table.  My point is that IIUC once you've done
> the lookup in lookup-index-table once for each associativity array
> value, the number you get out (which just a compacted version of the
> node id) should be all you need ever again.
> 

That is correct. We will continue to use the index to nodeid map during 
DLPAR, if such an operation adds a new numa node. update_numa_distance() 
shows the detail.

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17  7:46           ` David Gibson
@ 2021-06-17 10:53             ` Daniel Henrique Barboza
  2021-06-17 11:11               ` Aneesh Kumar K.V
  2021-06-17 10:59             ` Aneesh Kumar K.V
  2021-06-17 13:55             ` Aneesh Kumar K.V
  2 siblings, 1 reply; 38+ messages in thread
From: Daniel Henrique Barboza @ 2021-06-17 10:53 UTC (permalink / raw)
  To: David Gibson, Aneesh Kumar K.V; +Cc: Nathan Lynch, linuxppc-dev



On 6/17/21 4:46 AM, David Gibson wrote:
> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>
>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>> id introduced in FORM2. This new numa node
>>>>>>
>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>> ---
>>>>>>   arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>   arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>   arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>   3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>   	return nid;
>>>>>>   }
>>>>>>   
>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>> +{
>>>>>> +	int secondary_index;
>>>>>> +	const __be32 *associativity;
>>>>>> +
>>>>>> +	if (!numa_enabled) {
>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +
>>>>>> +	associativity = of_get_associativity(node);
>>>>>> +	if (!associativity)
>>>>>> +		return -ENODEV;
>>>>>> +
>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>
>>>>> Secondary ID is always the second reference point, but primary depends
>>>>> on the length of resources?  That seems very weird.
>>>>
>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>> both primary and secondary domain ID same for all resources other than
>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>> explained in patch 7.
>>>
>>> Right, I misunderstood
>>>
>>>>
>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>> the kernel should use when using persistent memory devices.
>>>
>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>> sort of heirarchy of distance (things with same primary ID are very
>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>> it's referring to their effective node for different purposes.
>>>
>>> Using the same terms for different meanings seems unnecessarily
>>> confusing.
>>
>> They are essentially domainIDs. The interpretation of them are different
>> between Form1 and Form2. Hence I kept referring to them as primary and
>> secondary domainID. Any suggestion on what to name them with Form2?
> 
> My point is that reusing associativity-reference-points for something
> with completely unrelated semantics seems like a very poor choice.


I agree that this reuse can be confusing. I could argue that there is
precedent for that in PAPR - FORM0 puts a different spin on the same
property as well - but there is no need to keep following existing PAPR
practices in new spec (and some might argue it's best not to).

As far as QEMU goes, renaming this property to "numa-associativity-mode"
(just an example) is a quick change to do since we separated FORM1 and FORM2
code over there.

Doing such a rename can also help with the issue of having to describe new
FORM2 semantics using "least significant boundary" or "primary domain" or
any FORM0|FORM1 related terminology.


Thanks,


Daniel



> 
>>>> Persistent memory devices
>>>> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>>>> the numa node number OS should use when using these devices as regular memory. Secondary
>>>> domainID is the numa node number that should be used when using this device as
>>>> persistent memory.
>>>
>>> It's weird to me that you'd want to consider them in different nodes
>>> for those different purposes.
>>
>>
>>     --------------------------------------
>>    |                            NUMA node0 |
>>    |    ProcA -----> MEMA                  |
>>    |     |                                 |
>>    |	|                                 |
>>    |	-------------------> PMEMB        |
>>    |                                       |
>>     ---------------------------------------
>>
>>     ---------------------------------------
>>    |                            NUMA node1 |
>>    |                                       |
>>    |    ProcB -------> MEMC                |
>>    |	|                                 |
>>    |	-------------------> PMEMD        |
>>    |                                       |
>>    |                                       |
>>     ---------------------------------------
>>   
>>
>> For a topology like the above application running of ProcA wants to find out
>> persistent memory mount local to its NUMA node. Hence when using it as
>> pmem fsdax mount or devdax device we want PMEMB to have associativity
>> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
>> we want to use it as memory using dax kmem driver, we want both PMEMB
>> and PMEMD to appear as memory only NUMA node at a distance that is
>> derived based on the latency of the media.
> 
> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17  7:46           ` David Gibson
  2021-06-17 10:53             ` Daniel Henrique Barboza
@ 2021-06-17 10:59             ` Aneesh Kumar K.V
  2021-06-24  3:16               ` David Gibson
  2021-06-17 13:55             ` Aneesh Kumar K.V
  2 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 10:59 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

On 6/17/21 1:16 PM, David Gibson wrote:
> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>
>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:

...

>>> It's weird to me that you'd want to consider them in different nodes
>>> for those different purposes.
>>
>>
>>     --------------------------------------
>>    |                            NUMA node0 |
>>    |    ProcA -----> MEMA                  |
>>    |     |                                 |
>>    |	|                                 |
>>    |	-------------------> PMEMB        |
>>    |                                       |
>>     ---------------------------------------
>>
>>     ---------------------------------------
>>    |                            NUMA node1 |
>>    |                                       |
>>    |    ProcB -------> MEMC                |
>>    |	|                                 |
>>    |	-------------------> PMEMD        |
>>    |                                       |
>>    |                                       |
>>     ---------------------------------------
>>   
>>
>> For a topology like the above application running of ProcA wants to find out
>> persistent memory mount local to its NUMA node. Hence when using it as
>> pmem fsdax mount or devdax device we want PMEMB to have associativity
>> of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
>> we want to use it as memory using dax kmem driver, we want both PMEMB
>> and PMEMD to appear as memory only NUMA node at a distance that is
>> derived based on the latency of the media.
> 
> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?
> 

In the above example in order allow use of PMEMB and PMEMD as memory 
only NUMA nodes
we need platform to represent them in its own domainID. Let's assume that
platform assigned id 40 and 41 and hence both PMEMB and PMEMD will have 
associativity array like below

{ 4, 6, 0}  -> PROCA/MEMA
{ 4, 6, 40} -> PMEMB
{ 4, 6, 41} -> PMEMD
{ 4, 6, 1} ->  PROCB/MEMB

When we want to use this device PMEMB and PMEMD as fsdax/devdax devices, 
we essentially look for the first nearest online node. Which means both 
PMEMB and PMEMD will appear as devices attached to node0. That is not 
ideal for for many applications.

using secondary domainID index as explained here helps to associate
each PMEM device to the right group. On a non virtualized config or hard 
partitioned config such a device tree representation can be looked at as 
a hint to identify which socket the actual device is connected to.

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 10:53             ` Daniel Henrique Barboza
@ 2021-06-17 11:11               ` Aneesh Kumar K.V
  2021-06-17 11:46                 ` Aneesh Kumar K.V
  2021-06-17 20:00                 ` Daniel Henrique Barboza
  0 siblings, 2 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 11:11 UTC (permalink / raw)
  To: Daniel Henrique Barboza, David Gibson; +Cc: Nathan Lynch, linuxppc-dev

Daniel Henrique Barboza <danielhb413@gmail.com> writes:

> On 6/17/21 4:46 AM, David Gibson wrote:
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>
>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>
>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>
>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>> ---
>>>>>>>   arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>   arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>   arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>   3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>   	return nid;
>>>>>>>   }
>>>>>>>   
>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>> +{
>>>>>>> +	int secondary_index;
>>>>>>> +	const __be32 *associativity;
>>>>>>> +
>>>>>>> +	if (!numa_enabled) {
>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>> +		return 0;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>> +	if (!associativity)
>>>>>>> +		return -ENODEV;
>>>>>>> +
>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>
>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>> on the length of resources?  That seems very weird.
>>>>>
>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>> both primary and secondary domain ID same for all resources other than
>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>> explained in patch 7.
>>>>
>>>> Right, I misunderstood
>>>>
>>>>>
>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>> the kernel should use when using persistent memory devices.
>>>>
>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>> sort of heirarchy of distance (things with same primary ID are very
>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>> it's referring to their effective node for different purposes.
>>>>
>>>> Using the same terms for different meanings seems unnecessarily
>>>> confusing.
>>>
>>> They are essentially domainIDs. The interpretation of them are different
>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>> secondary domainID. Any suggestion on what to name them with Form2?
>> 
>> My point is that reusing associativity-reference-points for something
>> with completely unrelated semantics seems like a very poor choice.
>
>
> I agree that this reuse can be confusing. I could argue that there is
> precedent for that in PAPR - FORM0 puts a different spin on the same
> property as well - but there is no need to keep following existing PAPR
> practices in new spec (and some might argue it's best not to).
>
> As far as QEMU goes, renaming this property to "numa-associativity-mode"
> (just an example) is a quick change to do since we separated FORM1 and FORM2
> code over there.
>
> Doing such a rename can also help with the issue of having to describe new
> FORM2 semantics using "least significant boundary" or "primary domain" or
> any FORM0|FORM1 related terminology.
>

It is not just changing the name, we will then have to explain the
meaning of ibm,associativity-reference-points with FORM2 right?

With FORM2 we want to represent the topology better

 --------------------------------------------------------------------------------
|                                                         domainID 20            |
|   ---------------------------------------                                      |
|  |                            NUMA node1 |                                     |
|  |                                       |            --------------------     |
|  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
|  |	|                                  |           |                    |    |
|  |	---------------------------------- |-------->  |  PMEMD             |    |
|  |                                       |            --------------------     |
|  |                                       |                                     |
|   ---------------------------------------                                      |
 --------------------------------------------------------------------------------

ibm,associativity:
        { 20, 1, 40}  -> PMEMD
        { 20, 1, 1}  -> PROCB/MEMC

is the suggested FORM2 representation.

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 11:11               ` Aneesh Kumar K.V
@ 2021-06-17 11:46                 ` Aneesh Kumar K.V
  2021-06-17 20:00                 ` Daniel Henrique Barboza
  1 sibling, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 11:46 UTC (permalink / raw)
  To: Daniel Henrique Barboza, David Gibson; +Cc: Nathan Lynch, linuxppc-dev

On 6/17/21 4:41 PM, Aneesh Kumar K.V wrote:
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 6/17/21 4:46 AM, David Gibson wrote:
>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>>
>>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>>
>>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>>> ---
>>>>>>>>    arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>>    arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>>    arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>>    3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>>    	return nid;
>>>>>>>>    }
>>>>>>>>    
>>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>>> +{
>>>>>>>> +	int secondary_index;
>>>>>>>> +	const __be32 *associativity;
>>>>>>>> +
>>>>>>>> +	if (!numa_enabled) {
>>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>>> +		return 0;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>>> +	if (!associativity)
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>>
>>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>>> on the length of resources?  That seems very weird.
>>>>>>
>>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>>> both primary and secondary domain ID same for all resources other than
>>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>>> explained in patch 7.
>>>>>
>>>>> Right, I misunderstood
>>>>>
>>>>>>
>>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>>> the kernel should use when using persistent memory devices.
>>>>>
>>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>>> sort of heirarchy of distance (things with same primary ID are very
>>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>>> it's referring to their effective node for different purposes.
>>>>>
>>>>> Using the same terms for different meanings seems unnecessarily
>>>>> confusing.
>>>>
>>>> They are essentially domainIDs. The interpretation of them are different
>>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>>> secondary domainID. Any suggestion on what to name them with Form2?
>>>
>>> My point is that reusing associativity-reference-points for something
>>> with completely unrelated semantics seems like a very poor choice.
>>
>>
>> I agree that this reuse can be confusing. I could argue that there is
>> precedent for that in PAPR - FORM0 puts a different spin on the same
>> property as well - but there is no need to keep following existing PAPR
>> practices in new spec (and some might argue it's best not to).
>>
>> As far as QEMU goes, renaming this property to "numa-associativity-mode"
>> (just an example) is a quick change to do since we separated FORM1 and FORM2
>> code over there.
>>
>> Doing such a rename can also help with the issue of having to describe new
>> FORM2 semantics using "least significant boundary" or "primary domain" or
>> any FORM0|FORM1 related terminology.
>>
> 
> It is not just changing the name, we will then have to explain the
> meaning of ibm,associativity-reference-points with FORM2 right?
> 
> With FORM2 we want to represent the topology better
> 
>   --------------------------------------------------------------------------------
> |                                                         domainID 20            |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMD             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
>   --------------------------------------------------------------------------------
> 
> ibm,associativity:
>          { 20, 1, 40}  -> PMEMD
>          { 20, 1, 1}  -> PROCB/MEMC
> 
> is the suggested FORM2 representation.
> 
>

We can simplify this as below too

ibm,associativity:
         { 20, 1, 40}  -> PMEMD
         { 20, 1 }  -> PROCB/MEMC

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17  7:46           ` David Gibson
  2021-06-17 10:53             ` Daniel Henrique Barboza
  2021-06-17 10:59             ` Aneesh Kumar K.V
@ 2021-06-17 13:55             ` Aneesh Kumar K.V
  2021-06-17 14:04               ` Aneesh Kumar K.V
  2 siblings, 1 reply; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 13:55 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>> >> David Gibson <david@gibson.dropbear.id.au> writes:
>> >> 
>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
.....

> I'm still not understanding why the latency we care about is different
> in the two cases.  Can you give an example of when this would result
> in different actual node assignments for the two different cases?

How about the below update?

With Form2 "ibm,associativity" for resources is listed as below:

"ibm,associativity" property for resources in node 0, 8 and 40
{ 3, 6, 7, 0 }
{ 3, 6, 9, 8 }
{ 4, 6, 7, 0, 40}

With "ibm,associativity-reference-points"  { 0x3, 0x2 }

Form2 adds additional property which can be used with devices like persistence
memory devices which would also like to be presented as memory-only NUMA nodes.

"ibm,associativity-memory-node-reference-point" property contains a number
representing the domainID index to be used to find the domainID that should be used
when using the resource as memory only NUMA node. The NUMA distance information
w.r.t this domainID will take into consideration the latency of the media. A
high latency memory device will have a large NUMA distance value assigned w.r.t
the domainID found at at "ibm,associativity-memory-node-reference-point" domainID index.

prop-encoded-array: An integer encoded as with encode-int specifying the domainID index

In the above example:
"ibm,associativity-memory-node-reference-point"  { 0x4 }

ex:

   --------------------------------------
  |                            NUMA node0 |
  |    ProcA -----> MEMA                  |
  |     |                                 |
  |	|                                 |
  |	-------------------> PMEMB        |
  |                                       |
   ---------------------------------------

   ---------------------------------------
  |                            NUMA node1 |
  |                                       |
  |    ProcB -------> MEMC                |
  |	|                                 |
  |	-------------------> PMEMD        |
  |                                       |
  |                                       |
   ---------------------------------------

 --------------------------------------------------------------------------------
|                                                      domainID 20               |
|   ---------------------------------------                                      |
|  |                            NUMA node0 |                                     |
|  |                                       |            --------------------     |
|  |    ProcA -------> MEMA                |           |        NUMA node40 |    |
|  |	|                                  |           |                    |    |
|  |	---------------------------------- |-------->  |  PMEMB             |    |
|  |                                       |            --------------------     |
|  |                                       |                                     |
|   ---------------------------------------                                      |
|                                                                                |
|   ---------------------------------------                                      |
|  |                            NUMA node1 |                                     |
|  |                                       |                                     |
|  |    ProcB -------> MEMC                |           -------------------       |
|  |	|                                  |          |       NUMA node41 |      |
|  |	--------------------------------------------> | PMEMD             |      |
|  |                                       |           -------------------       |
|  |                                       |                                     |
|   ---------------------------------------                                      |
|                                                                                |
 --------------------------------------------------------------------------------

For a topology like the above application running of ProcA wants to find out
persistent memory mount local to its NUMA node. Hence when using it as
pmem fsdax mount or devdax device we want PMEMB to have associativity
of NUMA node0 and PMEMD to have associativity of NUMA node1. But when
we want to use it as memory using dax kmem driver, we want both PMEMB
and PMEMD to appear as memory only NUMA node at a distance that is
derived based on the latency of the media.

"ibm,associativity":
PROCA/MEMA -> { 2, 20, 0 } 
PROCB/MEMC -> { 2, 20, 1 } 
PMEMB      -> { 3, 20, 0, 40}
PMEMB      -> { 3, 20, 1, 41}

"ibm,associativity-reference-points" -> { 2, 1 }
"ibm,associativity-memory-node-reference-points" -> { 3 }

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 13:55             ` Aneesh Kumar K.V
@ 2021-06-17 14:04               ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-17 14:04 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:

> David Gibson <david@gibson.dropbear.id.au> writes:
>
>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>> 
>>> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>> >> David Gibson <david@gibson.dropbear.id.au> writes:
>>> >> 
>>> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> .....
>
>> I'm still not understanding why the latency we care about is different
>> in the two cases.  Can you give an example of when this would result
>> in different actual node assignments for the two different cases?
>
> How about the below update?
>
> With Form2 "ibm,associativity" for resources is listed as below:
>
> "ibm,associativity" property for resources in node 0, 8 and 40
> { 3, 6, 7, 0 }
> { 3, 6, 9, 8 }
> { 4, 6, 7, 0, 40}
>
> With "ibm,associativity-reference-points"  { 0x3, 0x2 }
>
> Form2 adds additional property which can be used with devices like persistence
> memory devices which would also like to be presented as memory-only NUMA nodes.
>
> "ibm,associativity-memory-node-reference-point" property contains a number
> representing the domainID index to be used to find the domainID that should be used
> when using the resource as memory only NUMA node. The NUMA distance information
> w.r.t this domainID will take into consideration the latency of the media. A
> high latency memory device will have a large NUMA distance value assigned w.r.t
> the domainID found at at "ibm,associativity-memory-node-reference-point" domainID index.
>
> prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
>
> In the above example:
> "ibm,associativity-memory-node-reference-point"  { 0x4 }
>
> ex:
>
>    --------------------------------------
>   |                            NUMA node0 |
>   |    ProcA -----> MEMA                  |
>   |     |                                 |
>   |	|                                 |
>   |	-------------------> PMEMB        |
>   |                                       |
>    ---------------------------------------
>
>    ---------------------------------------
>   |                            NUMA node1 |
>   |                                       |
>   |    ProcB -------> MEMC                |
>   |	|                                 |
>   |	-------------------> PMEMD        |
>   |                                       |
>   |                                       |
>    ---------------------------------------
>
>  --------------------------------------------------------------------------------
> |                                                      domainID 20               |
> |   ---------------------------------------                                      |
> |  |                            NUMA node0 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcA -------> MEMA                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMB             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
> |                                                                                |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |                                     |
> |  |    ProcB -------> MEMC                |           -------------------       |
> |  |	|                                  |          |       NUMA node41 |      |
> |  |	--------------------------------------------> | PMEMD             |      |
> |  |                                       |           -------------------       |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
> |                                                                                |
>  --------------------------------------------------------------------------------
>
> For a topology like the above application running of ProcA wants to find out
> persistent memory mount local to its NUMA node. Hence when using it as
> pmem fsdax mount or devdax device we want PMEMB to have associativity
> of NUMA node0 and PMEMD to have associativity of NUMA node1. But when
> we want to use it as memory using dax kmem driver, we want both PMEMB
> and PMEMD to appear as memory only NUMA node at a distance that is
> derived based on the latency of the media.
>
> "ibm,associativity":
> PROCA/MEMA -> { 2, 20, 0 } 
> PROCB/MEMC -> { 2, 20, 1 } 
> PMEMB      -> { 3, 20, 0, 40}
> PMEMB      -> { 3, 20, 1, 41}
>
> "ibm,associativity-reference-points" -> { 2, 1 }
> "ibm,associativity-memory-node-reference-points" -> { 3 }

Another option is to make sure that numa-distance-value is populated
such that PMEMB distance indicates it is closer to node0 when compared
to node1. ie, node_distance[40][0] < node_distance[40][1]. One could
possibly infer the grouping based on the distance value and not deepend
on ibm,associativity for that purpose.

-aneesh

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 11:11               ` Aneesh Kumar K.V
  2021-06-17 11:46                 ` Aneesh Kumar K.V
@ 2021-06-17 20:00                 ` Daniel Henrique Barboza
  2021-06-18  3:18                   ` Aneesh Kumar K.V
  1 sibling, 1 reply; 38+ messages in thread
From: Daniel Henrique Barboza @ 2021-06-17 20:00 UTC (permalink / raw)
  To: Aneesh Kumar K.V, David Gibson; +Cc: Nathan Lynch, linuxppc-dev



On 6/17/21 8:11 AM, Aneesh Kumar K.V wrote:
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 6/17/21 4:46 AM, David Gibson wrote:
>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>
>>>>> On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
>>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>>>>>
>>>>>>> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>>>>>>>> FORM2 introduce a concept of secondary domain which is identical to the
>>>>>>>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>>>>>>>> when using persistent memory device. For DAX kmem use the logical domain
>>>>>>>> id introduced in FORM2. This new numa node
>>>>>>>>
>>>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>>>> ---
>>>>>>>>    arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>>>>>>>    arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>>>>>>>    arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>>>>>>>    3 files changed, 45 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>>>>>>> index 86cd2af014f7..b9ac6d02e944 100644
>>>>>>>> --- a/arch/powerpc/mm/numa.c
>>>>>>>> +++ b/arch/powerpc/mm/numa.c
>>>>>>>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>>>>>>>    	return nid;
>>>>>>>>    }
>>>>>>>>    
>>>>>>>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>>>>>>>> +{
>>>>>>>> +	int secondary_index;
>>>>>>>> +	const __be32 *associativity;
>>>>>>>> +
>>>>>>>> +	if (!numa_enabled) {
>>>>>>>> +		*primary = NUMA_NO_NODE;
>>>>>>>> +		*secondary = NUMA_NO_NODE;
>>>>>>>> +		return 0;
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	associativity = of_get_associativity(node);
>>>>>>>> +	if (!associativity)
>>>>>>>> +		return -ENODEV;
>>>>>>>> +
>>>>>>>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>>>>>>>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>>>>>>>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>>>>>>>
>>>>>>> Secondary ID is always the second reference point, but primary depends
>>>>>>> on the length of resources?  That seems very weird.
>>>>>>
>>>>>> primary_domain_index is distance_ref_point[0]. With Form2 we would find
>>>>>> both primary and secondary domain ID same for all resources other than
>>>>>> persistent memory device. The usage w.r.t. persistent memory is
>>>>>> explained in patch 7.
>>>>>
>>>>> Right, I misunderstood
>>>>>
>>>>>>
>>>>>> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>>>>>> the kernel should use when using persistent memory devices.
>>>>>
>>>>> This seems kind of bogus.  With Form1, the primary/secondary ID are a
>>>>> sort of heirarchy of distance (things with same primary ID are very
>>>>> close, things with same secondary are kinda-close, etc.).  With Form2,
>>>>> it's referring to their effective node for different purposes.
>>>>>
>>>>> Using the same terms for different meanings seems unnecessarily
>>>>> confusing.
>>>>
>>>> They are essentially domainIDs. The interpretation of them are different
>>>> between Form1 and Form2. Hence I kept referring to them as primary and
>>>> secondary domainID. Any suggestion on what to name them with Form2?
>>>
>>> My point is that reusing associativity-reference-points for something
>>> with completely unrelated semantics seems like a very poor choice.
>>
>>
>> I agree that this reuse can be confusing. I could argue that there is
>> precedent for that in PAPR - FORM0 puts a different spin on the same
>> property as well - but there is no need to keep following existing PAPR
>> practices in new spec (and some might argue it's best not to).
>>
>> As far as QEMU goes, renaming this property to "numa-associativity-mode"
>> (just an example) is a quick change to do since we separated FORM1 and FORM2
>> code over there.
>>
>> Doing such a rename can also help with the issue of having to describe new
>> FORM2 semantics using "least significant boundary" or "primary domain" or
>> any FORM0|FORM1 related terminology.
>>
> 
> It is not just changing the name, we will then have to explain the
> meaning of ibm,associativity-reference-points with FORM2 right?

Hmmmm why? My idea over there was to add a new property that indicates that
resource might have a different NUMA affinity based on the mode of operation
(like PMEM), and get rid of ibm,associativity-reference-points altogether.

The NUMA distances already express the topology. Closer distances indicates
closer proximity, larger distances indicates otherwise. Having
"associativity-reference-points" to reflect a  associativity domain
relationship, when you already have all the distances from each node, is
somewhat redundant.

The concept of 'associativity domain' was necessary in FORM1 because we had no
other way of telling distance between NUMA nodes. We needed to rely on these
overly complex and convoluted subdomain abstractions to say that "nodeA belongs
to the same third-level domain as node B, and in the second-level domain with
node C". The kernel would read that and calculate that each level is doubling
the distance from the level before and local_distance is 10, so:

distAA = 10  distAB= 20 distAC = 40

With FORM2, if this information is already explicit in ibm,numa-distance-table,
why bother calculating associativity domains? If you want to know whether
PROCA is closer to PROCB or PROCX, just look at the NUMA distance table and
see which one is closer.

  

> 
> With FORM2 we want to represent the topology better
> 
>   --------------------------------------------------------------------------------
> |                                                         domainID 20            |
> |   ---------------------------------------                                      |
> |  |                            NUMA node1 |                                     |
> |  |                                       |            --------------------     |
> |  |    ProcB -------> MEMC                |           |        NUMA node40 |    |
> |  |	|                                  |           |                    |    |
> |  |	---------------------------------- |-------->  |  PMEMD             |    |
> |  |                                       |            --------------------     |
> |  |                                       |                                     |
> |   ---------------------------------------                                      |
>   --------------------------------------------------------------------------------
> 
> ibm,associativity:
>          { 20, 1, 40}  -> PMEMD
>          { 20, 1, 1}  -> PROCB/MEMC
> 
> is the suggested FORM2 representation.


The way I see it, the '20' over there is not needed at all. What utility it
brings? And why create an associativity domain '1' in the MEMC associativity
at 0x3?

What the current QEMU FORM2 implementation is doing would be this:

           { 0, 0, 1, 40}  -> PMEMD
           { 0, 0, 0, 1}  -> PROCB/MEMC


PMEMD has a pointer to the NUMA node in which it would run as persistent
memory, node 1. All the memory/cpu nodes of node1 would be oblivious
to what PMEMD is doing.

I don't see the need of creating an associativity domain between node1
and node40 in 0x3. Besides, if a device_add operation of a PMEM that wants
to use nodeN as the node for persistent memory would trigger a massive
ibm,associativity update, on all LMBs that belongs to nodeN, because then
everyone needs to have the same third level associativity domain as the
hotplugged PMEM. To avoid that, if the idea is to 'just duplicate the
logical_domain_id in 0x3 for all non-PMEM devices' then what's the
difference of looking into the logical_numa_id at 0x4 in the first
place?



In fact, the more I speak about this PMEM scenario the more I wonder:
why doesn't the PMEM driver, when switching from persistent to regular
memory and vice-versa, take care of all the necessary updates in the
numa-distance-table and kernel internals to reflect the current distances
of its current mode? Is this a technical limitation?



Thanks


Daniel


> 
> -aneesh
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 20:00                 ` Daniel Henrique Barboza
@ 2021-06-18  3:18                   ` Aneesh Kumar K.V
  0 siblings, 0 replies; 38+ messages in thread
From: Aneesh Kumar K.V @ 2021-06-18  3:18 UTC (permalink / raw)
  To: Daniel Henrique Barboza, David Gibson; +Cc: Nathan Lynch, linuxppc-dev

On 6/18/21 1:30 AM, Daniel Henrique Barboza wrote:
> 
> 
> On 6/17/21 8:11 AM, Aneesh Kumar K.V wrote:
>> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
>>
>>> On 6/17/21 4:46 AM, David Gibson wrote:
>>>> On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
>>>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>

> 
> 
> In fact, the more I speak about this PMEM scenario the more I wonder:
> why doesn't the PMEM driver, when switching from persistent to regular
> memory and vice-versa, take care of all the necessary updates in the
> numa-distance-table and kernel internals to reflect the current distances
> of its current mode? Is this a technical limitation?
> 
> 

I sent v4 doing something similar to this .

-aneesh


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
  2021-06-17 10:59             ` Aneesh Kumar K.V
@ 2021-06-24  3:16               ` David Gibson
  0 siblings, 0 replies; 38+ messages in thread
From: David Gibson @ 2021-06-24  3:16 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 3286 bytes --]

On Thu, Jun 17, 2021 at 04:29:01PM +0530, Aneesh Kumar K.V wrote:
> On 6/17/21 1:16 PM, David Gibson wrote:
> > On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> > > David Gibson <david@gibson.dropbear.id.au> writes:
> > > 
> > > > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> > > > > David Gibson <david@gibson.dropbear.id.au> writes:
> 
> ...
> 
> > > > It's weird to me that you'd want to consider them in different nodes
> > > > for those different purposes.
> > > 
> > > 
> > >     --------------------------------------
> > >    |                            NUMA node0 |
> > >    |    ProcA -----> MEMA                  |
> > >    |     |                                 |
> > >    |	|                                 |
> > >    |	-------------------> PMEMB        |
> > >    |                                       |
> > >     ---------------------------------------
> > > 
> > >     ---------------------------------------
> > >    |                            NUMA node1 |
> > >    |                                       |
> > >    |    ProcB -------> MEMC                |
> > >    |	|                                 |
> > >    |	-------------------> PMEMD        |
> > >    |                                       |
> > >    |                                       |
> > >     ---------------------------------------
> > > 
> > > For a topology like the above application running of ProcA wants to find out
> > > persistent memory mount local to its NUMA node. Hence when using it as
> > > pmem fsdax mount or devdax device we want PMEMB to have associativity
> > > of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> > > we want to use it as memory using dax kmem driver, we want both PMEMB
> > > and PMEMD to appear as memory only NUMA node at a distance that is
> > > derived based on the latency of the media.
> > 
> > I'm still not understanding why the latency we care about is different
> > in the two cases.  Can you give an example of when this would result
> > in different actual node assignments for the two different cases?
> > 
> 
> In the above example in order allow use of PMEMB and PMEMD as memory only
> NUMA nodes
> we need platform to represent them in its own domainID. Let's assume that
> platform assigned id 40 and 41 and hence both PMEMB and PMEMD will have
> associativity array like below
> 
> { 4, 6, 0}  -> PROCA/MEMA
> { 4, 6, 40} -> PMEMB
> { 4, 6, 41} -> PMEMD
> { 4, 6, 1} ->  PROCB/MEMB
> 
> When we want to use this device PMEMB and PMEMD as fsdax/devdax devices, we
> essentially look for the first nearest online node. Which means both PMEMB
> and PMEMD will appear as devices attached to node0. That is not ideal for
> for many applications.

Not if you actually look at the distance table which tells you that
PMEMB is closer to node0 and PMEMD is closer to node1.  That's exactly
what the distance table is for - making this information explicit,
rather than intuited from a confusing set of nested domains.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2021-06-24  3:19 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-14 16:39 [RFC PATCH 0/8] Add support for FORM2 associativity Aneesh Kumar K.V
2021-06-14 16:39 ` [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index Aneesh Kumar K.V
2021-06-15  3:00   ` David Gibson
2021-06-15  8:21     ` Aneesh Kumar K.V
2021-06-14 16:39 ` [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index Aneesh Kumar K.V
2021-06-15  3:01   ` David Gibson
2021-06-15  8:22     ` Aneesh Kumar K.V
2021-06-14 16:39 ` [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Aneesh Kumar K.V
2021-06-15  3:04   ` David Gibson
2021-06-14 16:39 ` [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update Aneesh Kumar K.V
2021-06-15  3:13   ` David Gibson
2021-06-15  8:26     ` Aneesh Kumar K.V
2021-06-14 16:40 ` [RFC PATCH 5/8] powerpc/pseries: Consolidate NUMA distance update during boot Aneesh Kumar K.V
2021-06-14 16:40 ` [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance Aneesh Kumar K.V
2021-06-15  3:21   ` David Gibson
2021-06-14 16:40 ` [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity Aneesh Kumar K.V
2021-06-15  3:53   ` David Gibson
2021-06-15  5:28     ` Aneesh Kumar K.V
2021-06-15  6:25       ` David Gibson
2021-06-15  7:40         ` Aneesh Kumar K.V
2021-06-17  7:50           ` David Gibson
2021-06-17 10:46             ` Aneesh Kumar K.V
2021-06-14 16:40 ` [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details Aneesh Kumar K.V
2021-06-15  3:55   ` David Gibson
2021-06-15  5:57     ` Aneesh Kumar K.V
2021-06-15  6:34       ` David Gibson
2021-06-15  7:05         ` Aneesh Kumar K.V
2021-06-17  7:46           ` David Gibson
2021-06-17 10:53             ` Daniel Henrique Barboza
2021-06-17 11:11               ` Aneesh Kumar K.V
2021-06-17 11:46                 ` Aneesh Kumar K.V
2021-06-17 20:00                 ` Daniel Henrique Barboza
2021-06-18  3:18                   ` Aneesh Kumar K.V
2021-06-17 10:59             ` Aneesh Kumar K.V
2021-06-24  3:16               ` David Gibson
2021-06-17 13:55             ` Aneesh Kumar K.V
2021-06-17 14:04               ` Aneesh Kumar K.V
2021-06-15  1:47 ` [RFC PATCH 0/8] Add support for FORM2 associativity Daniel Henrique Barboza

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.