From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: clm@fb.com, matt@codeblueprint.co.uk, mgalbraith@suse.de,
	tglx@linutronix.de, fweisbec@gmail.com, peterz@infradead.org
Subject: [RFC][PATCH 3/7] sched: Introduce struct sched_domain_shared
Date: Mon, 09 May 2016 12:48:10 +0200	[thread overview]
Message-ID: <20160509105210.568400596@infradead.org> (raw)
In-Reply-To: 20160509104807.284575300@infradead.org

[-- Attachment #1: peterz-sched-kill-sd_busy-2.patch --]
[-- Type: text/plain, Size: 5187 bytes --]

Since struct sched_domain is strictly per CPU, introduce a structure
that is shared between all 'identical' sched_domains.

Limit it to SD_SHARE_PKG_RESOURCES domains for now, as we'll only use
it for shared cache state; if another use comes up later we can easily
relax this.
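
To make "shared cache state" concrete, here is a minimal sketch (not
part of this patch) of the kind of field a later patch in this series
moves into the new structure; the nr_busy_cpus member is illustrative
only:

	struct sched_domain_shared {
		atomic_t	ref;
		atomic_t	nr_busy_cpus;	/* illustrative; added by a later patch */
	};

Because every CPU below one such domain resolves to the same instance,
a single atomic update of such a field is visible to all of its
siblings, instead of having to replicate the state per CPU.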

While sched_groups are normally shared between CPUs, they are not a
natural fit when we need shared state at the domain level -- that
would require the domain to have a parent, which is not a given.
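
How two 'identical' domains end up with the same instance is worth
spelling out; the following is a distilled sketch of the sd_init()
change below, not additional code. The shared object is looked up in
the per-cpu slot of the first CPU of the domain's span, so every CPU
whose domain covers that same span picks up the same pointer:

	/* distilled from sd_init() below */
	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sched_domain_span(sd));

	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
	atomic_inc(&sd->shared->ref);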

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    6 ++++++
 kernel/sched/core.c   |   40 ++++++++++++++++++++++++++++++++++------
 2 files changed, 40 insertions(+), 6 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,6 +1057,10 @@ extern int sched_domain_level_max;
 
 struct sched_group;
 
+struct sched_domain_shared {
+	atomic_t	ref;
+};
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
@@ -1125,6 +1129,7 @@ struct sched_domain {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
 	};
+	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
 	/*
@@ -1158,6 +1163,7 @@ typedef int (*sched_domain_flags_f)(void
 
 struct sd_data {
 	struct sched_domain **__percpu sd;
+	struct sched_domain_shared **__percpu sds;
 	struct sched_group **__percpu sg;
 	struct sched_group_capacity **__percpu sgc;
 };
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5836,6 +5836,8 @@ static void free_sched_domain(struct sch
 		kfree(sd->groups->sgc);
 		kfree(sd->groups);
 	}
+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+		kfree(sd->shared);
 	kfree(sd);
 }
 
@@ -6270,6 +6272,9 @@ static void claim_allocations(int cpu, s
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
+	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+		*per_cpu_ptr(sdd->sds, cpu) = NULL;
+
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
 
@@ -6305,10 +6310,12 @@ static int sched_domains_curr_level;
 	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+	const struct cpumask *cpu_map, int cpu)
 {
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int sd_weight, sd_flags = 0;
+	struct sd_data *sdd = &tl->data;
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	int sd_id, sd_weight, sd_flags = 0;
 
 #ifdef CONFIG_NUMA
 	/*
@@ -6362,6 +6369,9 @@ sd_init(struct sched_domain_topology_lev
 #endif
 	};
 
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	sd_id = cpumask_first(sched_domain_span(sd));
+
 	/*
 	 * Convert topological properties into behaviour.
 	 */
@@ -6376,6 +6386,9 @@ sd_init(struct sched_domain_topology_lev
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
 
+		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+		atomic_inc(&sd->shared->ref);
+
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
@@ -6397,7 +6410,7 @@ sd_init(struct sched_domain_topology_lev
 		sd->idle_idx = 1;
 	}
 
-	sd->private = &tl->data;
+	sd->private = sdd;
 
 	return sd;
 }
@@ -6704,6 +6717,10 @@ static int __sdt_alloc(const struct cpum
 		if (!sdd->sd)
 			return -ENOMEM;
 
+		sdd->sds = alloc_percpu(struct sched_domain_shared *);
+		if (!sdd->sds)
+			return -ENOMEM;
+
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;
@@ -6714,6 +6731,7 @@ static int __sdt_alloc(const struct cpum
 
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
@@ -6724,6 +6742,13 @@ static int __sdt_alloc(const struct cpum
 
 			*per_cpu_ptr(sdd->sd, j) = sd;
 
+			sds = kzalloc_node(sizeof(struct sched_domain_shared),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sds)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sds, j) = sds;
+
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)
@@ -6763,6 +6788,8 @@ static void __sdt_free(const struct cpum
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
 
+			if (sdd->sds)
+				kfree(*per_cpu_ptr(sdd->sds, j));
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgc)
@@ -6770,6 +6797,8 @@ static void __sdt_free(const struct cpum
 		}
 		free_percpu(sdd->sd);
 		sdd->sd = NULL;
+		free_percpu(sdd->sds);
+		sdd->sds = NULL;
 		free_percpu(sdd->sg);
 		sdd->sg = NULL;
 		free_percpu(sdd->sgc);
@@ -6781,11 +6810,10 @@ struct sched_domain *build_sched_domain(
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = sd_init(tl, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu_map, cpu);
 	if (!sd)
 		return child;
 
-	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
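
For completeness, a throwaway debug sketch (not part of the patch; the
helper name is hypothetical and it assumes it lives somewhere in
kernel/sched/ where sd_llc is visible) that can be used to confirm
that all CPUs below one LLC observe the same shared instance:

	static void check_llc_shared(void)
	{
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));

			if (sd)
				pr_info("cpu%d: llc shared=%p\n", cpu, sd->shared);
		}
		rcu_read_unlock();
	}

CPUs that share a cache should print the same pointer; CPUs in
different packages should not.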

Thread overview: 39+ messages
2016-05-09 10:48 [RFC][PATCH 0/7] sched: select_idle_siblings rewrite Peter Zijlstra
2016-05-09 10:48 ` [RFC][PATCH 1/7] sched: Remove unused @cpu argument from destroy_sched_domain*() Peter Zijlstra
2016-05-09 10:48 ` [RFC][PATCH 2/7] sched: Restructure destroy_sched_domain() Peter Zijlstra
2016-05-09 14:46   ` Peter Zijlstra
2016-05-09 10:48 ` Peter Zijlstra [this message]
2016-05-09 10:48 ` [RFC][PATCH 4/7] sched: Replace sd_busy/nr_busy_cpus with sched_domain_shared Peter Zijlstra
2016-05-11 11:55   ` Matt Fleming
2016-05-11 12:33     ` Peter Zijlstra
2016-05-11 18:11       ` Peter Zijlstra
2016-05-11 18:24       ` Peter Zijlstra
2016-05-12  2:05         ` Michael Neuling
2016-05-12  5:07           ` Peter Zijlstra
2016-05-12 11:07             ` Michael Neuling
2016-05-12 11:33               ` Peter Zijlstra
2016-05-13  0:12                 ` Michael Neuling
2016-05-16 14:00                   ` Peter Zijlstra
2016-05-17 10:20                     ` Peter Zijlstra
2016-05-17 10:52                       ` Srikar Dronamraju
2016-05-17 11:15                         ` Peter Zijlstra
2016-05-11 17:37     ` Peter Zijlstra
2016-05-11 18:04       ` Matt Fleming
2016-05-16 15:31   ` Dietmar Eggemann
2016-05-16 17:02     ` Peter Zijlstra
2016-05-16 17:26       ` Dietmar Eggemann
2016-05-09 10:48 ` [RFC][PATCH 5/7] sched: Rewrite select_idle_siblings() Peter Zijlstra
2016-05-10 21:05   ` Yuyang Du
2016-05-11  7:00     ` Peter Zijlstra
2016-05-10 23:42       ` Yuyang Du
2016-05-11  7:43         ` Mike Galbraith
2016-05-09 10:48 ` [RFC][PATCH 6/7] sched: Optimize SCHED_SMT Peter Zijlstra
2016-05-09 10:48 ` [RFC][PATCH 7/7] sched: debug muck -- not for merging Peter Zijlstra
2016-05-10  0:50 ` [RFC][PATCH 0/7] sched: select_idle_siblings rewrite Chris Mason
2016-05-11 14:19 ` Chris Mason
2016-05-18  5:51 ` [RFC][PATCH 8/7] sched/fair: Use utilization distance to filter affine sync wakeups Mike Galbraith
2016-05-19 21:43   ` Rik van Riel
2016-05-20  2:52     ` Mike Galbraith
2016-05-25 14:51 ` [RFC][PATCH 0/7] sched: select_idle_siblings rewrite Chris Mason
2016-05-25 16:24   ` Peter Zijlstra
2016-05-25 17:11     ` Chris Mason