From: Peter Zijlstra <peterz@infradead.org>
To: Tejun Heo <tj@kernel.org>
Cc: "Ingo Molnar" <mingo@redhat.com>,
	"“linux-kernel@vger.kernel.org”" <linux-kernel@vger.kernel.org>,
	"Linus Torvalds" <torvalds@linux-foundation.org>,
	"Mike Galbraith" <efault@gmx.de>, "Paul Turner" <pjt@google.com>,
	"Chris Mason" <clm@fb.com>,
	"“kernel-team@fb.com”" <kernel-team@fb.com>,
	"Paul McKenney" <paulmck@linux.vnet.ibm.com>
Subject: Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs
Date: Mon, 1 May 2017 18:59:39 +0200
Message-ID: <20170501165939.ujfhr5oi35bozsnr@hirez.programming.kicks-ass.net>
In-Reply-To: <20170426004039.GA3222@wtj.duckdns.org>

On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
>  void unregister_fair_sched_group(struct task_group *tg)
>  {
>  	unsigned long flags;
> -	struct rq *rq;
>  	int cpu;
>  
>  	for_each_possible_cpu(cpu) {
> +		struct rq *rq = cpu_rq(cpu);
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> +
>  		if (tg->se[cpu])
>  			remove_entity_load_avg(tg->se[cpu]);
>  
> -		/*
> -		 * Only empty task groups can be destroyed; so we can speculatively
> -		 * check on_list without danger of it being re-added.
> -		 */
> -		if (!tg->cfs_rq[cpu]->on_list)
> -			continue;
> -
> -		rq = cpu_rq(cpu);
> -
>  		raw_spin_lock_irqsave(&rq->lock, flags);
> -		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> +		list_del_leaf_cfs_rq(cfs_rq);
> +		cfs_rq->online = 0;
>  		raw_spin_unlock_irqrestore(&rq->lock, flags);
>  	}
>  }

It would be nice to retain that on_list check and avoid taking all
those rq->locks.

Since, per the previous mail, those online/offline loops are protected
by rq->lock, all we need is to ensure an rcu_sched grace period passes
between here and sched_free_group().

I think we can do that differently; see below. Then we don't need the
->online flag and can keep using the ->on_list check.
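
For reference, the grace-period snapshot API this builds on already
expresses that pattern; a minimal sketch (struct foo and
example_teardown() are made up for illustration, not part of the
patch):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		struct list_head node;	/* on some RCU-protected list */
	};

	static void example_teardown(struct foo *f)
	{
		unsigned long stamp;

		/* ... unlink @f from all RCU-protected lists here ... */

		/* Snapshot GP state once no new reader can find @f. */
		stamp = get_state_synchronize_sched();

		/*
		 * Possibly much later: waits only if a full rcu_sched
		 * grace period has not already elapsed since the
		 * snapshot; otherwise this is a no-op.
		 */
		cond_synchronize_sched(stamp);
		kfree(f);
	}

The should_synchronize_sched() helper added below is the non-blocking
variant of that final check, so sched_free_group() can either free
immediately or defer through call_rcu_sched() instead of sleeping.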


> @@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
>  #ifdef CONFIG_SCHED_DEBUG
>  void print_cfs_stats(struct seq_file *m, int cpu)
>  {
> -	struct cfs_rq *cfs_rq;
> +	struct task_group *tg;
>  
>  	rcu_read_lock();
> -	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> -		print_cfs_rq(m, cpu, cfs_rq);
> +	list_for_each_entry_rcu(tg, &task_groups, list)
> +		print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
>  	rcu_read_unlock();
>  }

If you have a gazillion empty groups, do we really want to print them?
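
One way to skip them, as a sketch (assuming cfs_rq->nr_running is an
adequate emptiness test for the stats output):

	void print_cfs_stats(struct seq_file *m, int cpu)
	{
		struct task_group *tg;

		rcu_read_lock();
		list_for_each_entry_rcu(tg, &task_groups, list) {
			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

			/* Nothing queued on this CPU; don't print it. */
			if (!cfs_rq->nr_running)
				continue;

			print_cfs_rq(m, cpu, cfs_rq);
		}
		rcu_read_unlock();
	}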



diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..8bad2c371b18 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -58,6 +58,11 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline bool should_synchronize_sched(unsigned long oldstate)
+{
+	return true;
+}
+
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..9918b632db27 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -78,6 +78,7 @@ unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
 void cond_synchronize_sched(unsigned long oldstate);
+bool should_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91fff49d5869..59eb0d115217 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,6 +3423,18 @@ void cond_synchronize_sched(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_sched);
 
+bool should_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	return ULONG_CMP_GE(oldstate, newstate);
+}
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecad8e2d2a29..d7630b34d197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6362,7 +6362,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void sched_free_group(struct task_group *tg)
+static void __sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -6370,6 +6370,22 @@ static void sched_free_group(struct task_group *tg)
 	kmem_cache_free(task_group_cache, tg);
 }
 
+static void sched_free_group_rcu_sched(struct rcu_head *rhp)
+{
+	/* Now it should be safe to free those cfs_rqs: */
+	__sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_free_group(struct task_group *tg)
+{
+	if (should_synchronize_sched(tg->rcu_stamp)) {
+		call_rcu_sched(&tg->rcu, sched_free_group_rcu_sched);
+		return;
+	}
+
+	__sched_free_group(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
@@ -6434,6 +6450,8 @@ void sched_offline_group(struct task_group *tg)
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
+
+	tg->rcu_stamp = get_state_synchronize_sched();
 }
 
 static void sched_change_group(struct task_struct *tsk, int type)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2e5e9e64c86..1dbab563e798 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -302,6 +302,7 @@ struct task_group {
 #endif
 
 	struct rcu_head rcu;
+	unsigned long rcu_stamp;
 	struct list_head list;
 
 	struct task_group *parent;

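The net effect: unregister_fair_sched_group() can keep the lockless
->on_list test (only empty groups are destroyed, so the cfs_rq cannot
be re-added under us, per the comment being removed above), while the
GP stamp taken in sched_offline_group() ensures a full rcu_sched grace
period separates the last RCU-visible unlink from __sched_free_group()
releasing the memory.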