linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Anton Blanchard <anton@samba.org>
To: benh@kernel.crashing.org
Cc: linuxppc-dev@lists.ozlabs.org
Subject: [PATCH 3/3] powerpc: Use form 1 affinity to setup node distance
Date: Fri, 30 Apr 2010 14:43:56 +1000	[thread overview]
Message-ID: <20100430044356.GD4622@kryten> (raw)
In-Reply-To: <20100430043407.GC4622@kryten>


Form 1 affinity allows multiple entries in ibm,associativity-reference-points
which represent affinity domains in decreasing order of importance. The
Linux concept of a node is always the first entry, but using the other
values as an input to node_distance() allows the memory allocator to make
better decisions on which node to go first when local memory has been
exhausted.

We keep things simple and create an array indexed by NUMA node, capped at
4 entries. Each time we lookup an associativity property we initialise
the array which is overkill, but since we should only hit this path during
boot it didn't seem worth adding a per node valid bit.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: linux-2.6/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/topology.h	2010-04-29 15:58:58.000000000 +1000
+++ linux-2.6/arch/powerpc/include/asm/topology.h	2010-04-29 15:59:00.000000000 +1000
@@ -77,6 +77,9 @@ static inline int pcibus_to_node(struct 
 	.balance_interval	= 1,					\
 }
 
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+
 extern void __init dump_numa_cpu_topology(void);
 
 extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
Index: linux-2.6/arch/powerpc/mm/numa.c
===================================================================
--- linux-2.6.orig/arch/powerpc/mm/numa.c	2010-04-29 15:58:59.000000000 +1000
+++ linux-2.6/arch/powerpc/mm/numa.c	2010-04-29 22:05:24.000000000 +1000
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);
 
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
+
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const unsigned int *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
 static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
 						unsigned int *nid)
@@ -179,6 +185,39 @@ static const u32 *of_get_usable_memory(s
 	return prop;
 }
 
+int __node_distance(int a, int b)
+{
+	int i;
+	int distance = LOCAL_DISTANCE;
+
+	if (!form1_affinity)
+		return distance;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+			break;
+
+		/* Double the distance for each NUMA level */
+		distance *= 2;
+	}
+
+	return distance;
+}
+
+static void initialize_distance_lookup_table(int nid,
+		const unsigned int *associativity)
+{
+	int i;
+
+	if (!form1_affinity)
+		return;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		distance_lookup_table[nid][i] =
+			associativity[distance_ref_points[i]];
+	}
+}
+
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
@@ -200,6 +239,10 @@ static int of_node_to_nid_single(struct 
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= MAX_NUMNODES)
 		nid = -1;
+
+	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
+		initialize_distance_lookup_table(nid, tmp);
+
 out:
 	return nid;
 }
@@ -226,26 +269,10 @@ int of_node_to_nid(struct device_node *d
 }
 EXPORT_SYMBOL_GPL(of_node_to_nid);
 
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine.  This resource then has different associativity
- * characteristics relative to its multiple connections.  We ignore
- * this for now.  We also assume that all cpu and memory sets have
- * their distances represented at a common level.  This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
 static int __init find_min_common_depth(void)
 {
-	int depth, index;
-	const unsigned int *ref_points;
+	int depth;
 	struct device_node *rtas_root;
-	unsigned int len;
 	struct device_node *options;
 
 	rtas_root = of_find_node_by_path("/rtas");
@@ -254,35 +281,62 @@ static int __init find_min_common_depth(
 		return -1;
 
 	/*
-	 * this property is 2 32-bit integers, each representing a level of
-	 * depth in the associativity nodes.  The first is for an SMP
-	 * configuration (should be all 0's) and the second is for a normal
-	 * NUMA configuration.
+	 * This property is a set of 32-bit integers, each representing
+	 * an index into the ibm,associativity nodes.
+	 *
+	 * With form 0 affinity the first integer is for an SMP configuration
+	 * (should be all 0's) and the second is for a normal NUMA
+	 * configuration. We have only one level of NUMA.
+	 *
+	 * With form 1 affinity the first integer is the most significant
+	 * NUMA boundary and the following are progressively less significant
+	 * boundaries. There can be more than one level of NUMA.
 	 */
-	index = 1;
-	ref_points = of_get_property(rtas_root,
-			"ibm,associativity-reference-points", &len);
+	distance_ref_points = of_get_property(rtas_root,
+			"ibm,associativity-reference-points",
+			&distance_ref_points_depth);
+
+	if (!distance_ref_points)
+		goto err;
+
+	distance_ref_points_depth /= sizeof(int);
 
-	/*
-	 * For type 1 affinity information we want the first field
-	 */
 	options = of_find_node_by_path("/options");
 	if (options) {
 		const char *str;
 		str = of_get_property(options, "ibm,associativity-form", NULL);
 		if (str && !strcmp(str, "1"))
-                        index = 0;
+			form1_affinity = 1;
 	}
 
-	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
-		depth = ref_points[index];
+	if (form1_affinity) {
+		depth = distance_ref_points[0];
 	} else {
-		dbg("NUMA: ibm,associativity-reference-points not found.\n");
-		depth = -1;
+		if (distance_ref_points_depth < 2)
+			goto err;
+
+		depth = distance_ref_points[1];
 	}
+
+	/*
+	 * Warn and cap if the hardware supports more than
+	 * MAX_DISTANCE_REF_POINTS domains.
+	 */
+	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+		printk(KERN_WARNING
+		       "NUMA: distance array capped at %d entries\n",
+			MAX_DISTANCE_REF_POINTS);
+		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+	}
+
 	of_node_put(rtas_root);
 
 	return depth;
+
+err:
+	dbg("NUMA: ibm,associativity-reference-points not found.\n");
+	of_node_put(rtas_root);
+	return -1;
 }
 
 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)

  reply	other threads:[~2010-04-30  4:43 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-30  4:33 [PATCH 1/3] powerpc: Set a smaller value for RECLAIM_DISTANCE to enable zone reclaim Anton Blanchard
2010-04-30  4:34 ` [PATCH 2/3] powerpc: Add form 1 NUMA affinity Anton Blanchard
2010-04-30  4:43   ` Anton Blanchard [this message]
2010-05-06  6:50     ` [PATCH 3/3] powerpc: Use form 1 affinity to setup node distance Benjamin Herrenschmidt
2010-04-30  7:24   ` [PATCH 2/3] powerpc: Add form 1 NUMA affinity Benjamin Herrenschmidt
2010-04-30  7:33     ` Anton Blanchard

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100430044356.GD4622@kryten \
    --to=anton@samba.org \
    --cc=benh@kernel.crashing.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).