From: Peter Zijlstra <peterz@infradead.org>
To: Lauro Venancio <lvenanci@redhat.com>
Cc: lwang@redhat.com, riel@redhat.com, Mike Galbraith <efault@gmx.de>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@kernel.org>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 4/4] sched/topology: the group balance cpu must be a cpu where the group is installed
Date: Tue, 25 Apr 2017 18:26:36 +0200
Message-ID: <20170425162636.3jvmiys6ej5gtsxx@hirez.programming.kicks-ass.net>
In-Reply-To: <91317113-f1a7-a1c6-812e-cbda5284d404@redhat.com>

On Tue, Apr 25, 2017 at 12:56:23PM -0300, Lauro Venancio wrote:

> > Another thing I've been thinking about: I think we can do away with the
> > kzalloc() in build_group_from_child_sched_domain() and use the sdd->sg
> > storage.
> I considered this too. I decided not to change this because I was not
> sure whether the kzalloc() was there for performance reasons. Currently,
> all groups are allocated on the NUMA node where they are used.
> If we use the sdd->sg storage, we may have groups allocated on one NUMA
> node being used on another node.

Right.. I cannot remember :/

/me once again kicks himself for not writing more comments
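
(For reference, the tradeoff in code; a simplified sketch of the two
approaches, not the patch itself:)

	/* Today: one group per (domain, cpu), allocated on the node of
	 * the cpu that will use it. */
	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			GFP_KERNEL, cpu_to_node(cpu));

	/* With sdd->sg storage: reuse the per-cpu instance set up by
	 * __sdt_alloc() and take a reference; it stays on whatever node
	 * it was originally allocated on, possibly remote. */
	sg = *per_cpu_ptr(sdd->sg, cpu);
	atomic_inc(&sg->ref);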

It does save a few lines.. and I suspect that if we do this, we could
actually get rid of sched_group_capacity completely, since it's now
always the same as the group (again), which should remove more lines
still.
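
(For context, the two structures in question; field lists abridged and
from memory, so approximate:)

	struct sched_group {
		struct sched_group	*next;	/* Must be a circular list */
		atomic_t		ref;
		unsigned int		group_weight;
		struct sched_group_capacity *sgc;
		unsigned long		cpumask[0];	/* the group's span */
	};

	struct sched_group_capacity {
		atomic_t	ref;
		unsigned long	capacity;
		unsigned long	min_capacity;
		unsigned long	next_update;
		int		imbalance;
		unsigned long	cpumask[0];	/* the balance mask */
	};

With every instance of a group sharing one sgc, the second refcount and
the ->sgc indirection stop buying us anything.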

But I'll shelve this patch for now.. we've got enough changes as is.

I still need to write a changelog for the new #2, which has become ugly
again, because it needs a second sched_domains_tmpmask.
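
(That second mask would presumably sit next to the existing one in
kernel/sched/topology.c and get allocated alongside it at boot; a
sketch, not verified against the tree:)

	/* Both serialized by sched_domains_mutex. */
	static cpumask_var_t sched_domains_tmpmask;
	static cpumask_var_t sched_domains_tmpmask2;

	/* in sched_init(): */
	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);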

(compile tested only)

---
 kernel/sched/topology.c |   76 ++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 47 deletions(-)

--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -501,10 +501,8 @@ enum s_alloc {
  * balancing.
  */
 static void
-build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+build_group_mask(struct sd_data *sdd, struct cpumask *sg_span, struct cpumask *mask)
 {
-	const struct cpumask *sg_span = sched_group_cpus(sg);
-	struct sd_data *sdd = sd->private;
 	struct sched_domain *sibling;
 	int i;
 
@@ -542,49 +540,34 @@ int group_balance_cpu(struct sched_group
 }
 
 static struct sched_group *
-build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
+get_overlap_group(struct sd_data *sdd, int cpu)
 {
-	struct sched_group *sg;
-	struct cpumask *sg_span;
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	struct sched_domain *child = sd->child;
+	struct sched_group *group;
+	struct cpumask *mask = sched_domains_tmpmask2;
 
-	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-			GFP_KERNEL, cpu_to_node(cpu));
+	/*
+	 * SD_OVERLAP domains must have !SD_OVERLAP children; and we run
+	 * before degenerate domains are culled, so sd->child must exist.
+	 */
+	BUG_ON(!sd->child);
 
-	if (!sg)
-		return NULL;
+	build_group_mask(sdd, sched_domain_span(child), mask);
+	cpu = cpumask_first_and(sched_domain_span(child), mask);
 
-	sg_span = sched_group_cpus(sg);
-	if (sd->child)
-		cpumask_copy(sg_span, sched_domain_span(sd->child));
-	else
-		cpumask_copy(sg_span, sched_domain_span(sd));
+	BUG_ON(cpu >= nr_cpu_ids);
 
-	return sg;
-}
+	group = *per_cpu_ptr(sdd->sg, cpu);
+	group->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 
-static void init_overlap_sched_group(struct sched_domain *sd,
-				     struct sched_group *sg)
-{
-	struct cpumask *mask = sched_domains_tmpmask2;
-	struct sd_data *sdd = sd->private;
-	struct cpumask *sg_span;
-	int cpu;
+	atomic_inc(&group->ref);
+	atomic_inc(&group->sgc->ref);
 
-	build_group_mask(sd, sg, mask);
-	cpu = cpumask_first_and(sched_group_cpus(sg), mask);
+	cpumask_copy(sched_group_cpus(group), sched_domain_span(child));
+	cpumask_copy(sched_group_mask(group), mask);
 
-	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
-	if (atomic_inc_return(&sg->sgc->ref) == 1)
-		cpumask_copy(sched_group_mask(sg), mask);
-
-	/*
-	 * Initialize sgc->capacity such that even if we mess up the
-	 * domains and no possible iteration will get us here, we won't
-	 * die on a /0 trap.
-	 */
-	sg_span = sched_group_cpus(sg);
-	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+	return group;
 }
 
 static int
@@ -620,14 +603,18 @@ build_overlap_sched_groups(struct sched_
 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
 
-		sg = build_group_from_child_sched_domain(sibling, cpu);
-		if (!sg)
-			goto fail;
+		sg = get_overlap_group(sdd, i);
 
 		sg_span = sched_group_cpus(sg);
 		cpumask_or(covered, covered, sg_span);
 
-		init_overlap_sched_group(sd, sg);
+		/*
+		 * Initialize sgc->capacity such that even if we mess up the
+		 * domains and no possible iteration will get us here, we won't
+		 * die on a /0 trap.
+		 */
+		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+		sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
 
 		if (!first)
 			first = sg;
@@ -639,11 +626,6 @@ build_overlap_sched_groups(struct sched_
 	sd->groups = first;
 
 	return 0;
-
-fail:
-	free_sched_groups(first, 0);
-
-	return -ENOMEM;
 }
 
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
