* [PATCH] sched,topology: Update sched topology atomically
From: Huang Ying @ 2022-04-06 7:01 UTC
To: Peter Zijlstra
Cc: linux-kernel, Huang Ying, Valentin Schneider, Ingo Molnar,
Mel Gorman, Rik van Riel, Srikar Dronamraju
When Peter Zijlstra reviewed commit 0fb3978b0aac ("sched/numa: Fix
NUMA topology for systems with CPU-less nodes") [1], he pointed out
that sched_domains_numa_distance and sched_domains_numa_masks were
made two separate RCU variables. That could go sideways if some
function used both, although none does for now.

So update sched_domains_numa_distance, sched_domains_numa_masks, and
some other related sched topology parameters atomically, by moving
them into a single RCU-protected structure, to address the potential
issue.
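To illustrate the hazard, here is a reader-side sketch (illustrative
only, not code from the tree) that dereferences both variables and can
mix two topology updates:

	int *distances;
	struct cpumask ***masks;

	rcu_read_lock();
	/* may observe the pointer published by update N ... */
	distances = rcu_dereference(sched_domains_numa_distance);
	/* ... while an updater publishes both pointers for update N+1 ... */
	masks = rcu_dereference(sched_domains_numa_masks);
	/* ... leaving distances and masks from two different updates */
	rcu_read_unlock();

After this patch, all of this state is published with a single
rcu_assign_pointer() of one struct sched_numa_topology, so readers
always see a consistent snapshot.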
[1] https://lkml.kernel.org/r/20220214121553.582248-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
kernel/sched/fair.c | 47 ++++++---
kernel/sched/sched.h | 14 ++-
kernel/sched/topology.c | 223 +++++++++++++++++++---------------------
3 files changed, 151 insertions(+), 133 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bd299d67ab..546e7347fafc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1294,17 +1294,22 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int lim_dist, bool task)
{
unsigned long score = 0;
- int node, max_dist;
+ int node, max_dist, type;
+ struct sched_numa_topology *topology;
+
+ rcu_read_lock();
+ topology = rcu_dereference(sched_numa_topology);
+ type = topology->type;
+ max_dist = topology->max_distance;
+ rcu_read_unlock();
/*
* All nodes are directly connected, and the same distance
* from each other. No need for fancy placement algorithms.
*/
- if (sched_numa_topology_type == NUMA_DIRECT)
+ if (type == NUMA_DIRECT)
return 0;
- /* sched_max_numa_distance may be changed in parallel. */
- max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1327,7 +1332,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
+ if (type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1344,7 +1349,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The further away a node is, the less the faults count.
* This seems to result in good task placement.
*/
- if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ if (type == NUMA_GLUELESS_MESH) {
faults *= (max_dist - dist);
faults /= (max_dist - LOCAL_DISTANCE);
}
@@ -1971,7 +1976,13 @@ static int task_numa_migrate(struct task_struct *p)
long taskimp, groupimp;
struct numa_group *ng;
struct rq *best_rq;
- int nid, ret, dist;
+ int nid, ret, dist, type;
+ struct sched_numa_topology *topology;
+
+ rcu_read_lock();
+ topology = rcu_dereference(sched_numa_topology);
+ type = topology->type;
+ rcu_read_unlock();
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -2026,8 +2037,7 @@ static int task_numa_migrate(struct task_struct *p)
continue;
dist = node_distance(env.src_nid, env.dst_nid);
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist != env.dist) {
+ if (type == NUMA_BACKPLANE && dist != env.dist) {
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
}
@@ -2262,10 +2272,17 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
static int preferred_group_nid(struct task_struct *p, int nid)
{
nodemask_t nodes;
- int dist;
+ int dist, max_dist, type;
+ struct sched_numa_topology *topology;
+
+ rcu_read_lock();
+ topology = rcu_dereference(sched_numa_topology);
+ type = topology->type;
+ max_dist = topology->max_distance;
+ rcu_read_unlock();
/* Direct connections between all NUMA nodes. */
- if (sched_numa_topology_type == NUMA_DIRECT)
+ if (type == NUMA_DIRECT)
return nid;
/*
@@ -2273,14 +2290,12 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* scores nodes according to the number of NUMA hinting faults on
* both the node itself, and on nearby nodes.
*/
- if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ if (type == NUMA_GLUELESS_MESH) {
unsigned long score, max_score = 0;
int node, max_node = nid;
- dist = sched_max_numa_distance;
-
for_each_node_state(node, N_CPU) {
- score = group_weight(p, node, dist);
+ score = group_weight(p, node, max_dist);
if (score > max_score) {
max_score = score;
max_node = node;
@@ -2299,7 +2314,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* keep the complexity of the search down.
*/
nodes = node_states[N_CPU];
- for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ for (dist = max_dist; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
int a, b;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..21a7aea01286 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1644,8 +1644,18 @@ enum numa_topology_type {
NUMA_GLUELESS_MESH,
NUMA_BACKPLANE,
};
-extern enum numa_topology_type sched_numa_topology_type;
-extern int sched_max_numa_distance;
+
+struct sched_numa_topology {
+ enum numa_topology_type type;
+ int nr_levels;
+ int max_distance;
+ int *distances;
+ struct cpumask ***masks;
+ struct rcu_head rcu;
+};
+
+extern struct sched_numa_topology *sched_numa_topology;
+
extern bool find_numa_distance(int distance);
extern void sched_init_numa(int offline_node);
extern void sched_update_numa(int cpu, bool online);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 810750e62118..19523b23034f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1483,14 +1483,10 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
}
#ifdef CONFIG_NUMA
-enum numa_topology_type sched_numa_topology_type;
+struct sched_numa_topology default_numa_topology;
+struct sched_numa_topology *sched_numa_topology = &default_numa_topology;
-static int sched_domains_numa_levels;
static int sched_domains_curr_level;
-
-int sched_max_numa_distance;
-static int *sched_domains_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
#endif
/*
@@ -1601,7 +1597,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
+ if (sched_numa_topology->distances[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
@@ -1648,7 +1644,8 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
-static struct sched_domain_topology_level *sched_domain_topology_saved;
+static struct sched_domain_topology_level *sched_domain_topology_default =
+ default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
@@ -1659,14 +1656,14 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
return;
sched_domain_topology = tl;
- sched_domain_topology_saved = NULL;
+ sched_domain_topology_default = tl;
}
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_numa_topology->masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -1697,22 +1694,20 @@ static void sched_numa_warn(const char *str)
bool find_numa_distance(int distance)
{
bool found = false;
- int i, *distances;
+ int i;
+ struct sched_numa_topology *topology;
if (distance == node_distance(0, 0))
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
- if (!distances)
- goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (distances[i] == distance) {
+ topology = rcu_dereference(sched_numa_topology);
+ for (i = 0; i < topology->nr_levels; i++) {
+ if (topology->distances[i] == distance) {
found = true;
break;
}
}
-unlock:
rcu_read_unlock();
return found;
@@ -1743,14 +1738,15 @@ bool find_numa_distance(int distance)
* there is an intermediary node C, which is < N hops away from both
* nodes A and B, the system is a glueless mesh.
*/
-static void init_numa_topology_type(int offline_node)
+static void init_numa_topology_type(int offline_node,
+ struct sched_numa_topology *topology)
{
int a, b, c, n;
- n = sched_max_numa_distance;
+ n = topology->max_distance;
- if (sched_domains_numa_levels <= 2) {
- sched_numa_topology_type = NUMA_DIRECT;
+ if (topology->nr_levels <= 2) {
+ topology->type = NUMA_DIRECT;
return;
}
@@ -1764,32 +1760,77 @@ static void init_numa_topology_type(int offline_node)
for_each_cpu_node_but(c, offline_node) {
if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
- sched_numa_topology_type =
- NUMA_GLUELESS_MESH;
+ topology->type = NUMA_GLUELESS_MESH;
return;
}
}
- sched_numa_topology_type = NUMA_BACKPLANE;
+ topology->type = NUMA_BACKPLANE;
return;
}
}
pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
- sched_numa_topology_type = NUMA_DIRECT;
+ topology->type = NUMA_DIRECT;
+}
+
+static void sched_numa_topology_free(struct sched_numa_topology *topology)
+{
+ int i, j;
+
+ if (!topology)
+ return;
+
+ kfree(topology->distances);
+ for (i = 0; i < topology->nr_levels && topology->masks; i++) {
+ if (!topology->masks[i])
+ continue;
+ for_each_node(j)
+ kfree(topology->masks[i][j]);
+ kfree(topology->masks[i]);
+ }
+ kfree(topology->masks);
+ kfree(topology);
}
+static void sched_numa_topology_free_rcu(struct rcu_head *head)
+{
+ struct sched_numa_topology *topology;
+
+ topology = container_of(head, struct sched_numa_topology, rcu);
+ sched_numa_topology_free(topology);
+}
+
+static void sched_numa_topology_switch(struct sched_numa_topology *topology,
+ struct sched_domain_topology_level *tl)
+{
+ struct sched_numa_topology *old_topology = sched_numa_topology;
+ struct sched_domain_topology_level *old_tl = sched_domain_topology;
+
+ rcu_assign_pointer(sched_numa_topology, topology);
+ sched_domain_topology = tl;
+
+ if (old_topology == &default_numa_topology)
+ return;
+
+ init_rcu_head(&old_topology->rcu);
+ call_rcu(&old_topology->rcu, sched_numa_topology_free_rcu);
+ kfree(old_tl);
+}
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
void sched_init_numa(int offline_node)
{
struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ unsigned long *distance_map = NULL;
int nr_levels = 0;
int i, j;
- int *distances;
- struct cpumask ***masks;
+ struct sched_numa_topology *topology;
+
+ topology = kzalloc(sizeof(*topology), GFP_KERNEL);
+ if (!topology)
+ goto err;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
@@ -1797,7 +1838,7 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
- return;
+ goto err;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
@@ -1806,8 +1847,7 @@ void sched_init_numa(int offline_node)
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
- bitmap_free(distance_map);
- return;
+ goto err;
}
bitmap_set(distance_map, distance, 1);
@@ -1818,20 +1858,19 @@ void sched_init_numa(int offline_node)
* allocate memory accordingly.
*/
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+ topology->nr_levels = nr_levels;
- distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!distances) {
- bitmap_free(distance_map);
- return;
- }
+ topology->distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!topology->distances)
+ goto err;
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
- distances[i] = j;
+ topology->distances[i] = j;
}
- rcu_assign_pointer(sched_domains_numa_distance, distances);
bitmap_free(distance_map);
+ distance_map = NULL;
/*
* 'nr_levels' contains the number of unique distances
@@ -1840,51 +1879,39 @@ void sched_init_numa(int offline_node)
* numbers.
*/
- /*
- * Here, we should temporarily reset sched_domains_numa_levels to 0.
- * If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'nr_levels' members. This could be
- * dangerous when we use it to iterate array sched_domains_numa_masks[][]
- * in other functions.
- *
- * We reset it to 'nr_levels' at the end of this function.
- */
- sched_domains_numa_levels = 0;
-
- masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
- if (!masks)
- return;
+ topology->masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!topology->masks)
+ goto err;
/*
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
for (i = 0; i < nr_levels; i++) {
- masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!masks[i])
- return;
+ topology->masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!topology->masks[i])
+ goto err;
for_each_cpu_node_but(j, offline_node) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
int k;
if (!mask)
- return;
+ goto err;
- masks[i][j] = mask;
+ topology->masks[i][j] = mask;
for_each_cpu_node_but(k, offline_node) {
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
- if (node_distance(j, k) > sched_domains_numa_distance[i])
+ if (node_distance(j, k) > topology->distances[i])
continue;
cpumask_or(mask, mask, cpumask_of_node(k));
}
}
}
- rcu_assign_pointer(sched_domains_numa_masks, masks);
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
@@ -1892,7 +1919,7 @@ void sched_init_numa(int offline_node)
tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
- return;
+ goto err;
/*
* Copy the default topology bits..
@@ -1922,48 +1949,17 @@ void sched_init_numa(int offline_node)
};
}
- sched_domain_topology_saved = sched_domain_topology;
- sched_domain_topology = tl;
+ topology->max_distance = topology->distances[nr_levels - 1];
+ init_numa_topology_type(offline_node, topology);
- sched_domains_numa_levels = nr_levels;
- WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
+ sched_numa_topology_switch(topology, tl);
- init_numa_topology_type(offline_node);
-}
-
-
-static void sched_reset_numa(void)
-{
- int nr_levels, *distances;
- struct cpumask ***masks;
-
- nr_levels = sched_domains_numa_levels;
- sched_domains_numa_levels = 0;
- sched_max_numa_distance = 0;
- sched_numa_topology_type = NUMA_DIRECT;
- distances = sched_domains_numa_distance;
- rcu_assign_pointer(sched_domains_numa_distance, NULL);
- masks = sched_domains_numa_masks;
- rcu_assign_pointer(sched_domains_numa_masks, NULL);
- if (distances || masks) {
- int i, j;
-
- synchronize_rcu();
- kfree(distances);
- for (i = 0; i < nr_levels && masks; i++) {
- if (!masks[i])
- continue;
- for_each_node(j)
- kfree(masks[i][j]);
- kfree(masks[i]);
- }
- kfree(masks);
- }
- if (sched_domain_topology_saved) {
- kfree(sched_domain_topology);
- sched_domain_topology = sched_domain_topology_saved;
- sched_domain_topology_saved = NULL;
- }
+ return;
+err:
+ bitmap_free(distance_map);
+ sched_numa_topology_free(topology);
+ sched_numa_topology_switch(&default_numa_topology,
+ sched_domain_topology_default);
}
/*
@@ -1981,7 +1977,6 @@ void sched_update_numa(int cpu, bool online)
if (cpumask_weight(cpumask_of_node(node)) != 1)
return;
- sched_reset_numa();
sched_init_numa(online ? NUMA_NO_NODE : node);
}
@@ -1990,14 +1985,15 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_topology->nr_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
if (!node_state(j, N_CPU))
continue;
/* Set ourselves in the remote node's masks */
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
- cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ if (node_distance(j, node) <=
+ sched_numa_topology->distances[i])
+ cpumask_set_cpu(cpu, sched_numa_topology->masks[i][j]);
}
}
}
@@ -2006,10 +2002,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
{
int i, j;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_topology->nr_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
- if (sched_domains_numa_masks[i][j])
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ if (sched_numa_topology->masks[i][j])
+ cpumask_clear_cpu(cpu, sched_numa_topology->masks[i][j]);
}
}
}
@@ -2025,22 +2021,19 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
- struct cpumask ***masks;
+ struct sched_numa_topology *topology;
rcu_read_lock();
- masks = rcu_dereference(sched_domains_numa_masks);
- if (!masks)
- goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (!masks[i][j])
+ topology = rcu_dereference(sched_numa_topology);
+ for (i = 0; i < topology->nr_levels; i++) {
+ if (!topology->masks[i][j])
break;
- cpu = cpumask_any_and(cpus, masks[i][j]);
+ cpu = cpumask_any_and(cpus, topology->masks[i][j]);
if (cpu < nr_cpu_ids) {
found = cpu;
break;
}
}
-unlock:
rcu_read_unlock();
return found;
--
2.30.2
* Re: [PATCH] sched,topology: Update sched topology atomically
From: Valentin Schneider @ 2022-04-08 16:52 UTC
To: Huang Ying, Peter Zijlstra
Cc: linux-kernel, Huang Ying, Valentin Schneider, Ingo Molnar,
Mel Gorman, Rik van Riel, Srikar Dronamraju
Hi,
On 06/04/22 15:01, Huang Ying wrote:
> When Peter Zijlstra reviewed commit 0fb3978b0aac ("sched/numa: Fix
> NUMA topology for systems with CPU-less nodes") [1], he pointed out
> that sched_domains_numa_distance and sched_domains_numa_masks were
> made two separate RCU variables. That could go sideways if some
> function used both, although none does for now.
>
> So update sched_domains_numa_distance, sched_domains_numa_masks, and
> some other related sched topology parameters atomically, by moving
> them into a single RCU-protected structure, to address the potential
> issue.
>
> [1] https://lkml.kernel.org/r/20220214121553.582248-1-ying.huang@intel.com
>
I had to re-acquaint myself with [1], but AFAICT this looks mostly
OK. I have some nits/comments below; I haven't actually tested this yet as
I'm still setting stuff up over here...
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Cc: Valentin Schneider <valentin.schneider@arm.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Rik van Riel <riel@surriel.com>
> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
> kernel/sched/fair.c | 47 ++++++---
> kernel/sched/sched.h | 14 ++-
> kernel/sched/topology.c | 223 +++++++++++++++++++---------------------
> 3 files changed, 151 insertions(+), 133 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d4bd299d67ab..546e7347fafc 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1294,17 +1294,22 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
> int lim_dist, bool task)
> {
> unsigned long score = 0;
> - int node, max_dist;
> + int node, max_dist, type;
> + struct sched_numa_topology *topology;
> +
> + rcu_read_lock();
> + topology = rcu_dereference(sched_numa_topology);
> + type = topology->type;
> + max_dist = topology->max_distance;
> + rcu_read_unlock();
>
One thing worth pointing out, the NUMA topology type *wasn't* RCU protected
previously, which I think we've been wanting since
0083242c9375 ("sched/topology: Skip updating masks for non-online nodes")
(before then the topology was init'd once-and-for-all).
Another thing: this pattern is now repeated a handful of times in fair.c;
what about bundling it up? As a bonus it lets us keep the definition of
struct sched_numa_topology internal to topology.c:
void sched_fetch_numa_topology(int *type, int *max_distance)
{
struct sched_numa_topology *topology;
rcu_read_lock();
topology = rcu_dereference(sched_numa_topology);
if (type)
*type = topology->type;
if (max_distance)
*max_distance = topology->max_distance;
rcu_read_unlock();
}
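Call sites would then shrink to something like:

	int type, max_dist;

	sched_fetch_numa_topology(&type, &max_dist);

and fair.c wouldn't need to know the struct layout at all.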
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 810750e62118..19523b23034f 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1483,14 +1483,10 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
> }
>
> #ifdef CONFIG_NUMA
> -enum numa_topology_type sched_numa_topology_type;
> +struct sched_numa_topology default_numa_topology;
Shouldn't this be static?
> +struct sched_numa_topology *sched_numa_topology = &default_numa_topology;
This should also be __rcu (sparse should complain about this)
>
> -static int sched_domains_numa_levels;
> static int sched_domains_curr_level;
> -
> -int sched_max_numa_distance;
> -static int *sched_domains_numa_distance;
> -static struct cpumask ***sched_domains_numa_masks;
> #endif
>
> /*
> +static void sched_numa_topology_switch(struct sched_numa_topology *topology,
> + struct sched_domain_topology_level *tl)
> +{
> + struct sched_numa_topology *old_topology = sched_numa_topology;
> + struct sched_domain_topology_level *old_tl = sched_domain_topology;
> +
> + rcu_assign_pointer(sched_numa_topology, topology);
> + sched_domain_topology = tl;
> +
> + if (old_topology == &default_numa_topology)
> + return;
> +
> + init_rcu_head(&old_topology->rcu);
I'm not familiar with this and I can find only a handful of users; should
we be doing that for struct sched_domain?
> + call_rcu(&old_topology->rcu, sched_numa_topology_free_rcu);
> + kfree(old_tl);
> +}
>
> #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>
> void sched_init_numa(int offline_node)
> {
> struct sched_domain_topology_level *tl;
> - unsigned long *distance_map;
> + unsigned long *distance_map = NULL;
> int nr_levels = 0;
> int i, j;
> - int *distances;
> - struct cpumask ***masks;
> + struct sched_numa_topology *topology;
> +
> + topology = kzalloc(sizeof(*topology), GFP_KERNEL);
> + if (!topology)
> + goto err;
>
> /*
> * O(nr_nodes^2) deduplicating selection sort -- in order to find the
> + return;
> +err:
I was expecting the "usual" multi-label setup, but this seems to be working
too - though not in the most obvious way ;)
> + bitmap_free(distance_map);
> + sched_numa_topology_free(topology);
> + sched_numa_topology_switch(&default_numa_topology,
> + sched_domain_topology_default);
> }
>
> /*
> @@ -1981,7 +1977,6 @@ void sched_update_numa(int cpu, bool online)
> if (cpumask_weight(cpumask_of_node(node)) != 1)
> return;
>
> - sched_reset_numa();
> sched_init_numa(online ? NUMA_NO_NODE : node);
> }
>
> @@ -1990,14 +1985,15 @@ void sched_domains_numa_masks_set(unsigned int cpu)
> int node = cpu_to_node(cpu);
> int i, j;
>
> - for (i = 0; i < sched_domains_numa_levels; i++) {
> + for (i = 0; i < sched_numa_topology->nr_levels; i++) {
> for (j = 0; j < nr_node_ids; j++) {
> if (!node_state(j, N_CPU))
> continue;
>
> /* Set ourselves in the remote node's masks */
> - if (node_distance(j, node) <= sched_domains_numa_distance[i])
> - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
> + if (node_distance(j, node) <=
> + sched_numa_topology->distances[i])
> + cpumask_set_cpu(cpu, sched_numa_topology->masks[i][j]);
I'm guessing the assumption here is that this is done in
sched_cpu_{de,}activate(), so we have the hotplug lock and so the masks are
stable... Should we sprinkle in a lockdep_assert_cpus_held() for good
measure?
Having a second look at this, I think the same applies to your change to
sd_numa_mask() - those end up used in the domain rebuild which itself might
be deferred to cpuset_hotplug_work, but that should still hold the hotplug
lock...
> }
> }
> }
> @@ -2006,10 +2002,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
> {
> int i, j;
>
> - for (i = 0; i < sched_domains_numa_levels; i++) {
> + for (i = 0; i < sched_numa_topology->nr_levels; i++) {
> for (j = 0; j < nr_node_ids; j++) {
> - if (sched_domains_numa_masks[i][j])
> - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
> + if (sched_numa_topology->masks[i][j])
> + cpumask_clear_cpu(cpu, sched_numa_topology->masks[i][j]);
> }
> }
> }
> @@ -2025,22 +2021,19 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
> int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
> {
> int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
> - struct cpumask ***masks;
> + struct sched_numa_topology *topology;
>
> rcu_read_lock();
> - masks = rcu_dereference(sched_domains_numa_masks);
> - if (!masks)
> - goto unlock;
> - for (i = 0; i < sched_domains_numa_levels; i++) {
> - if (!masks[i][j])
> + topology = rcu_dereference(sched_numa_topology);
> + for (i = 0; i < topology->nr_levels; i++) {
> + if (!topology->masks[i][j])
If I got default_numa_topology right and it's meant to be static,
sched_numa_topology->masks can still be NULL, but here this would be gated
by nr_levels being 0... Did I get that right?
> break;
> - cpu = cpumask_any_and(cpus, masks[i][j]);
> + cpu = cpumask_any_and(cpus, topology->masks[i][j]);
> if (cpu < nr_cpu_ids) {
> found = cpu;
> break;
> }
> }
> -unlock:
> rcu_read_unlock();
>
> return found;
> --
> 2.30.2
* Re: [PATCH] sched,topology: Update sched topology atomically
From: Valentin Schneider @ 2022-04-11 8:22 UTC
To: Huang Ying, Peter Zijlstra
Cc: linux-kernel, Huang Ying, Valentin Schneider, Ingo Molnar,
Mel Gorman, Rik van Riel, Srikar Dronamraju
On 08/04/22 17:52, Valentin Schneider wrote:
>> + topology = rcu_dereference(sched_numa_topology);
>> + for (i = 0; i < topology->nr_levels; i++) {
>> + if (!topology->masks[i][j])
>
> If I got default_numa_topology right and it's meant to be static,
> sched_numa_topology->masks can still be NULL, but here this would be gated
> by nr_levels being 0... Did I get that right?
>
Being a file-global variable, that's already init'd as 0 (I'll use
my longish break as an excuse for forgetting that one), but the rest still
applies.
>> break;
>> - cpu = cpumask_any_and(cpus, masks[i][j]);
>> + cpu = cpumask_any_and(cpus, topology->masks[i][j]);
>> if (cpu < nr_cpu_ids) {
>> found = cpu;
>> break;
>> }
>> }
>> -unlock:
>> rcu_read_unlock();
>>
>> return found;
>> --
>> 2.30.2
* Re: [PATCH] sched,topology: Update sched topology atomically
From: ying.huang @ 2022-04-11 9:01 UTC
To: Valentin Schneider, Peter Zijlstra, Paul E. McKenney
Cc: linux-kernel, Valentin Schneider, Ingo Molnar, Mel Gorman,
Rik van Riel, Srikar Dronamraju
Hi, Valentin,
Thanks a lot for the review!
On Fri, 2022-04-08 at 17:52 +0100, Valentin Schneider wrote:
>
> Hi,
>
> On 06/04/22 15:01, Huang Ying wrote:
> > When Peter Zijlstra reviewed commit 0fb3978b0aac ("sched/numa: Fix
> > NUMA topology for systems with CPU-less nodes") [1], he pointed out
> > that sched_domains_numa_distance and sched_domains_numa_masks were
> > made two separate RCU variables. That could go sideways if some
> > function used both, although none does for now.
> >
> > So update sched_domains_numa_distance, sched_domains_numa_masks, and
> > some other related sched topology parameters atomically, by moving
> > them into a single RCU-protected structure, to address the potential
> > issue.
> >
> > [1] https://lkml.kernel.org/r/20220214121553.582248-1-ying.huang@intel.com
> >
>
> I had to re-acquaint myself with [1], but AFAICT this looks mostly
> OK. I have some nits/comments below; I haven't actually tested this yet as
> I'm still setting stuff up over here...
>
> > Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > Cc: Valentin Schneider <valentin.schneider@arm.com>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Rik van Riel <riel@surriel.com>
> > Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> > ---
> > kernel/sched/fair.c | 47 ++++++---
> > kernel/sched/sched.h | 14 ++-
> > kernel/sched/topology.c | 223 +++++++++++++++++++---------------------
> > 3 files changed, 151 insertions(+), 133 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d4bd299d67ab..546e7347fafc 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1294,17 +1294,22 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
> > int lim_dist, bool task)
> > {
> > unsigned long score = 0;
> > - int node, max_dist;
> > + int node, max_dist, type;
> > + struct sched_numa_topology *topology;
> > +
> > + rcu_read_lock();
> > + topology = rcu_dereference(sched_numa_topology);
> > + type = topology->type;
> > + max_dist = topology->max_distance;
> > + rcu_read_unlock();
> >
>
> One thing worth pointing out, the NUMA topology type *wasn't* RCU protected
> previously, which I think we've been wanting since
>
> 0083242c9375 ("sched/topology: Skip updating masks for non-online nodes")
>
> (before then the topology was init'd once-and-for-all).
>
> Another thing: this pattern is now repeated a handful of times in fair.c;
> what about bundling it up? As a bonus it lets us keep the definition of
> struct sched_numa_topology internal to topology.c:
>
> > void sched_fetch_numa_topology(int *type, int *max_distance)
> {
> > struct sched_numa_topology *topology;
>
> rcu_read_lock();
> topology = rcu_dereference(sched_numa_topology);
>
> if (type)
> *type = topology->type;
>
> if (max_distance)
> *max_distance = topology->max_distance;
>
>
> rcu_read_unlock();
> }
>
This looks good. Will use it in the next version. Thanks!
> > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> > index 810750e62118..19523b23034f 100644
> > --- a/kernel/sched/topology.c
> > +++ b/kernel/sched/topology.c
> > @@ -1483,14 +1483,10 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
> > }
> >
> > #ifdef CONFIG_NUMA
> > -enum numa_topology_type sched_numa_topology_type;
> > +struct sched_numa_topology default_numa_topology;
>
> Shouldn't this be static?
Yes. Will change it in the next version.
> > +struct sched_numa_topology *sched_numa_topology = &default_numa_topology;
>
> This should also be __rcu (sparse should complain about this)
Yes. Will change it in the next version.
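Something like this, I think (sketch; RCU_INITIALIZER() should keep
sparse happy about statically initializing the __rcu pointer):

	static struct sched_numa_topology default_numa_topology;
	struct sched_numa_topology __rcu *sched_numa_topology =
		RCU_INITIALIZER(&default_numa_topology);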
> >
> > -static int sched_domains_numa_levels;
> > static int sched_domains_curr_level;
> > -
> > -int sched_max_numa_distance;
> > -static int *sched_domains_numa_distance;
> > -static struct cpumask ***sched_domains_numa_masks;
> > #endif
> >
> > /*
>
> > +static void sched_numa_topology_switch(struct sched_numa_topology *topology,
> > + struct sched_domain_topology_level *tl)
> > +{
> > + struct sched_numa_topology *old_topology = sched_numa_topology;
> > + struct sched_domain_topology_level *old_tl = sched_domain_topology;
> > +
> > + rcu_assign_pointer(sched_numa_topology, topology);
> > + sched_domain_topology = tl;
> > +
> > + if (old_topology == &default_numa_topology)
> > + return;
> > +
> > + init_rcu_head(&old_topology->rcu);
>
> I'm not familiar with this and I can find only a handful of users; should
> we be doing that for struct sched_domain?
Per my understanding, this is for object debugging (it pairs with
CONFIG_DEBUG_OBJECTS_RCU_HEAD) and should be called before call_rcu().
Copying Paul for confirmation.
>
> > + call_rcu(&old_topology->rcu, sched_numa_topology_free_rcu);
> > + kfree(old_tl);
> > +}
> >
> > #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
> >
> > void sched_init_numa(int offline_node)
> > {
> > struct sched_domain_topology_level *tl;
> > - unsigned long *distance_map;
> > + unsigned long *distance_map = NULL;
> > int nr_levels = 0;
> > int i, j;
> > - int *distances;
> > - struct cpumask ***masks;
> > + struct sched_numa_topology *topology;
> > +
> > + topology = kzalloc(sizeof(*topology), GFP_KERNEL);
> > + if (!topology)
> > + goto err;
> >
> > /*
> > * O(nr_nodes^2) deduplicating selection sort -- in order to find the
>
> > + return;
> > +err:
>
> I was expecting the "usual" multi-label setup, but this seems to be working
> too - though not in the most obvious way ;)
OK. Will try to convert it to multi-label.
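Perhaps something like the following sketch (label names are
illustrative; the unwind order follows the allocation order, and
kfree()/bitmap_free() of NULL are no-ops):

	return;

err_masks:
	/* free any masks[i][j] and masks[i] allocated so far */
err_distances:
	kfree(topology->distances);
	bitmap_free(distance_map);
err_topology:
	kfree(topology);
	sched_numa_topology_switch(&default_numa_topology,
				   sched_domain_topology_default);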
>
> > + bitmap_free(distance_map);
> > + sched_numa_topology_free(topology);
> > + sched_numa_topology_switch(&default_numa_topology,
> > + sched_domain_topology_default);
> > }
> >
> > /*
> > @@ -1981,7 +1977,6 @@ void sched_update_numa(int cpu, bool online)
> > if (cpumask_weight(cpumask_of_node(node)) != 1)
> > return;
> >
> > - sched_reset_numa();
> > sched_init_numa(online ? NUMA_NO_NODE : node);
> > }
> >
> > @@ -1990,14 +1985,15 @@ void sched_domains_numa_masks_set(unsigned int cpu)
> > int node = cpu_to_node(cpu);
> > int i, j;
> >
> > - for (i = 0; i < sched_domains_numa_levels; i++) {
> > + for (i = 0; i < sched_numa_topology->nr_levels; i++) {
> > for (j = 0; j < nr_node_ids; j++) {
> > if (!node_state(j, N_CPU))
> > continue;
> >
> > /* Set ourselves in the remote node's masks */
> > - if (node_distance(j, node) <= sched_domains_numa_distance[i])
> > - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
> > + if (node_distance(j, node) <=
> > + sched_numa_topology->distances[i])
> > + cpumask_set_cpu(cpu, sched_numa_topology->masks[i][j]);
> I'm guessing the assumption here is that this is done in
> sched_cpu_{de,}activate(), so we have the hotplug lock and so the masks are
> stable... Should we sprinkle in a lockdep_assert_cpus_held() for good
> measure?
>
> Having a second look at this, I think the same applies to your change to
> sd_numa_mask() - those end up used in the domain rebuild which itself might
> be deferred to cpuset_hotplug_work, but that should still hold the hotplug
> lock...
Yes, the hotplug lock needs to be held in these places. Will add
lockdep_assert_cpus_held() where necessary.
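For example (sketch of what the next version would add):

	void sched_domains_numa_masks_set(unsigned int cpu)
	{
		int node = cpu_to_node(cpu);
		int i, j;

		/* Callers run under the hotplug lock, so the masks are stable. */
		lockdep_assert_cpus_held();
		...
	}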
> > }
> > }
> > }
> > @@ -2006,10 +2002,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
> > {
> > int i, j;
> >
> > - for (i = 0; i < sched_domains_numa_levels; i++) {
> > + for (i = 0; i < sched_numa_topology->nr_levels; i++) {
> > for (j = 0; j < nr_node_ids; j++) {
> > - if (sched_domains_numa_masks[i][j])
> > - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
> > + if (sched_numa_topology->masks[i][j])
> > + cpumask_clear_cpu(cpu, sched_numa_topology->masks[i][j]);
> > }
> > }
> > }
> > @@ -2025,22 +2021,19 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
> > int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
> > {
> > int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
> > - struct cpumask ***masks;
> > + struct sched_numa_topology *topology;
> >
> > rcu_read_lock();
> > - masks = rcu_dereference(sched_domains_numa_masks);
> > - if (!masks)
> > - goto unlock;
> > - for (i = 0; i < sched_domains_numa_levels; i++) {
> > - if (!masks[i][j])
> > + topology = rcu_dereference(sched_numa_topology);
> > + for (i = 0; i < topology->nr_levels; i++) {
> > + if (!topology->masks[i][j])
>
> If I got default_numa_topology right and it's meant to be static,
> sched_numa_topology->masks can still be NULL, but here this would be gated
> by nr_levels being 0... Did I get that right?
Yes. nr_levels in default_numa_topology is 0.
>
> > break;
> > - cpu = cpumask_any_and(cpus, masks[i][j]);
> > + cpu = cpumask_any_and(cpus, topology->masks[i][j]);
> > if (cpu < nr_cpu_ids) {
> > found = cpu;
> > break;
> > }
> > }
> > -unlock:
> > rcu_read_unlock();
> >
> > return found;
> > --
> > 2.30.2
>
Best Regards,
Huang, Ying