All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] psi: Optimize task switch inside shared cgroups
@ 2021-02-09  7:14 Chengming Zhou
  2021-02-16 20:00 ` Johannes Weiner
  0 siblings, 1 reply; 3+ messages in thread
From: Chengming Zhou @ 2021-02-09  7:14 UTC (permalink / raw)
  To: hannes, mingo, peterz, juri.lelli, vincent.guittot, rostedt
  Cc: linux-kernel, songmuchun, zhouchengming

The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
cgroups") only update cgroups whose state actually changes during a
task switch only in task preempt case, not in task sleep case.

We actually don't need to clear and set TSK_ONCPU state for common cgroups
of next and prev task in sleep case, that can save many psi_group_change
especially when most activity comes from one leaf cgroup.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/sched/psi.c   | 27 +++++++++++++++++----------
 kernel/sched/stats.h | 17 +++--------------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 6e46d9eb279b..6061e87089ac 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		}
 	}
 
-	/*
-	 * If this is a voluntary sleep, dequeue will have taken care
-	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-	 * only need to deal with it during preemption.
-	 */
-	if (sleep)
-		return;
-
 	if (prev->pid) {
-		psi_flags_change(prev, TSK_ONCPU, 0);
+		int clear = 0, set = 0;
+
+		if (sleep) {
+			clear |= TSK_RUNNING;
+			if (prev->in_iowait)
+				set |= TSK_IOWAIT;
+		}
+
+		psi_flags_change(prev, clear | TSK_ONCPU, set);
 
 		iter = NULL;
 		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+			psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
+
+		if (sleep) {
+			while (group) {
+				psi_group_change(group, cpu, clear, set, true);
+				group = iterate_groups(prev, &iter);
+			}
+		}
 	}
 }
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 9e4e67a94731..2d92c8467678 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
-	int clear = TSK_RUNNING, set = 0;
-
 	if (static_branch_likely(&psi_disabled))
 		return;
 
 	if (!sleep) {
+		int clear = TSK_RUNNING;
+
 		if (p->in_memstall)
 			clear |= TSK_MEMSTALL;
-	} else {
-		/*
-		 * When a task sleeps, schedule() dequeues it before
-		 * switching to the next one. Merge the clearing of
-		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
-		 * psi_task_change() call in psi_sched_switch().
-		 */
-		clear |= TSK_ONCPU;
 
-		if (p->in_iowait)
-			set |= TSK_IOWAIT;
+		psi_task_change(p, clear, 0);
 	}
-
-	psi_task_change(p, clear, set);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] psi: Optimize task switch inside shared cgroups
  2021-02-09  7:14 [PATCH] psi: Optimize task switch inside shared cgroups Chengming Zhou
@ 2021-02-16 20:00 ` Johannes Weiner
  2021-02-17  5:08   ` [External] " Chengming Zhou
  0 siblings, 1 reply; 3+ messages in thread
From: Johannes Weiner @ 2021-02-16 20:00 UTC (permalink / raw)
  To: Chengming Zhou
  Cc: mingo, peterz, juri.lelli, vincent.guittot, rostedt,
	linux-kernel, songmuchun

Hello Chengming,

This patch looks useful to me. A couple of comments below:

On Tue, Feb 09, 2021 at 03:14:13PM +0800, Chengming Zhou wrote:
> The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
> cgroups") only update cgroups whose state actually changes during a
> task switch only in task preempt case, not in task sleep case.
> 
> We actually don't need to clear and set TSK_ONCPU state for common cgroups
> of next and prev task in sleep case, that can save many psi_group_change
> especially when most activity comes from one leaf cgroup.

Can you please make this a bit more concrete? Maybe include this:

sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))			# all ancestors
    psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))			# all ancestors
    psi_group_change(next, .set=TSK_ONCPU)

sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))			# until (prev & next)
    psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))			# all ancestors
    psi_group_change(prev, .clear = common ? TSK_RUNNING : TSK_RUNNING|TSK_ONCPU)

When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.

> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  kernel/sched/psi.c   | 27 +++++++++++++++++----------
>  kernel/sched/stats.h | 17 +++--------------
>  2 files changed, 20 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 6e46d9eb279b..6061e87089ac 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		}
>  	}
>  
> -	/*
> -	 * If this is a voluntary sleep, dequeue will have taken care
> -	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
> -	 * only need to deal with it during preemption.
> -	 */
> -	if (sleep)
> -		return;
> -
>  	if (prev->pid) {
> -		psi_flags_change(prev, TSK_ONCPU, 0);
> +		int clear = 0, set = 0;
> +
> +		if (sleep) {
> +			clear |= TSK_RUNNING;
> +			if (prev->in_iowait)
> +				set |= TSK_IOWAIT;
> +		}

This needs a comment why it's doing psi_dequeue()'s job. How about this?

		/*
		 * When we're going to sleep, psi_dequeue() lets us handle
		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
		 * with TSK_ONCPU and save walking common ancestors twice.
		 */
		if (sleep) {
			...

> +		psi_flags_change(prev, clear | TSK_ONCPU, set);
>  
>  		iter = NULL;
>  		while ((group = iterate_groups(prev, &iter)) && group != common)
> -			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
> +			psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
> +
> +		if (sleep) {
> +			while (group) {
> +				psi_group_change(group, cpu, clear, set, true);
> +				group = iterate_groups(prev, &iter);
> +			}
> +		}

This function is *primarily* about handling TSK_ONCPU and secondarily
optimizes the dequeue. It would be a bit clearer to do:

	int clear = TSK_ONCPU, set = 0;

	...

	/*
	 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
	 * with dequeuing too, finish that for the rest of the hierarchy.
	 */
	if (sleep) {
		clear &= TSK_ONCPU;
		for (; group; group = iterate_groups(prev, &iter))
			psi_group_change(group, cpu, clear, set, true);			
	}


> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 9e4e67a94731..2d92c8467678 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>  
>  static inline void psi_dequeue(struct task_struct *p, bool sleep)
>  {
> -	int clear = TSK_RUNNING, set = 0;
> -
>  	if (static_branch_likely(&psi_disabled))
>  		return;
>  
>  	if (!sleep) {
> +		int clear = TSK_RUNNING;
> +
>  		if (p->in_memstall)
>  			clear |= TSK_MEMSTALL;
> -	} else {
> -		/*
> -		 * When a task sleeps, schedule() dequeues it before
> -		 * switching to the next one. Merge the clearing of
> -		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
> -		 * psi_task_change() call in psi_sched_switch().
> -		 */
> -		clear |= TSK_ONCPU;
>  
> -		if (p->in_iowait)
> -			set |= TSK_IOWAIT;
> +		psi_task_change(p, clear, 0);
>  	}

Likewise, this really should have a comment for why it's not handling
TSK_RUNNING to match psi_enqueue()!

	int clear = TSK_RUNNING;

	/*
	 * A voluntary sleep is a dequeue followed by a task switch. To
	 * avoid walking all ancestors twice, psi_task_switch() handles
	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
	 * Do nothing here.
	 */
	 if (sleep)
		return;

	if (p->in_memstall)
		clear |= TSK_MEMSTALL;

	psi_task_change(p, clear, 0);
	
Thanks

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [External] Re: [PATCH] psi: Optimize task switch inside shared cgroups
  2021-02-16 20:00 ` Johannes Weiner
@ 2021-02-17  5:08   ` Chengming Zhou
  0 siblings, 0 replies; 3+ messages in thread
From: Chengming Zhou @ 2021-02-17  5:08 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: mingo, peterz, juri.lelli, vincent.guittot, rostedt,
	linux-kernel, songmuchun

Hello Johannes,

在 2021/2/17 上午4:00, Johannes Weiner 写道:
> Hello Chengming,
>
> This patch looks useful to me. A couple of comments below:
>
> On Tue, Feb 09, 2021 at 03:14:13PM +0800, Chengming Zhou wrote:
>> The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
>> cgroups") only update cgroups whose state actually changes during a
>> task switch only in task preempt case, not in task sleep case.
>>
>> We actually don't need to clear and set TSK_ONCPU state for common cgroups
>> of next and prev task in sleep case, that can save many psi_group_change
>> especially when most activity comes from one leaf cgroup.
> Can you please make this a bit more concrete? Maybe include this:
>
> sleep before:
> psi_dequeue()
>   while ((group = iterate_groups(prev)))			# all ancestors
>     psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
> psi_task_switch()
>   while ((group = iterate_groups(next)))			# all ancestors
>     psi_group_change(next, .set=TSK_ONCPU)
>
> sleep after:
> psi_dequeue()
>   nop
> psi_task_switch()
>   while ((group = iterate_groups(next)))			# until (prev & next)
>     psi_group_change(next, .set=TSK_ONCPU)
>   while ((group = iterate_groups(prev)))			# all ancestors
>     psi_group_change(prev, .clear = common ? TSK_RUNNING : TSK_RUNNING|TSK_ONCPU)
>
> When a voluntary sleep switches to another task, we remove one call of
> psi_group_change() for every common cgroup ancestor of the two tasks.
Thank you for the very beautiful and detailed comments, must be included : )
>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  kernel/sched/psi.c   | 27 +++++++++++++++++----------
>>  kernel/sched/stats.h | 17 +++--------------
>>  2 files changed, 20 insertions(+), 24 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 6e46d9eb279b..6061e87089ac 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>>  		}
>>  	}
>>  
>> -	/*
>> -	 * If this is a voluntary sleep, dequeue will have taken care
>> -	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
>> -	 * only need to deal with it during preemption.
>> -	 */
>> -	if (sleep)
>> -		return;
>> -
>>  	if (prev->pid) {
>> -		psi_flags_change(prev, TSK_ONCPU, 0);
>> +		int clear = 0, set = 0;
>> +
>> +		if (sleep) {
>> +			clear |= TSK_RUNNING;
>> +			if (prev->in_iowait)
>> +				set |= TSK_IOWAIT;
>> +		}
> This needs a comment why it's doing psi_dequeue()'s job. How about this?
>
> 		/*
> 		 * When we're going to sleep, psi_dequeue() lets us handle
> 		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
> 		 * with TSK_ONCPU and save walking common ancestors twice.
> 		 */
> 		if (sleep) {
> 			...
Make sense, will be added.
>> +		psi_flags_change(prev, clear | TSK_ONCPU, set);
>>  
>>  		iter = NULL;
>>  		while ((group = iterate_groups(prev, &iter)) && group != common)
>> -			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
>> +			psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
>> +
>> +		if (sleep) {
>> +			while (group) {
>> +				psi_group_change(group, cpu, clear, set, true);
>> +				group = iterate_groups(prev, &iter);
>> +			}
>> +		}
> This function is *primarily* about handling TSK_ONCPU and secondarily
> optimizes the dequeue. It would be a bit clearer to do:
>
> 	int clear = TSK_ONCPU, set = 0;
>
> 	...
>
> 	/*
> 	 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
> 	 * with dequeuing too, finish that for the rest of the hierarchy.
> 	 */
> 	if (sleep) {
> 		clear &= TSK_ONCPU;
> 		for (; group; group = iterate_groups(prev, &iter))
> 			psi_group_change(group, cpu, clear, set, true);			
> 	}
>
Make sense, I will modify the patch and send a patch-v2.

BTW there is a typo above: clear &= ~TSK_ONCPU;

Thanks.

>> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
>> index 9e4e67a94731..2d92c8467678 100644
>> --- a/kernel/sched/stats.h
>> +++ b/kernel/sched/stats.h
>> @@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>>  
>>  static inline void psi_dequeue(struct task_struct *p, bool sleep)
>>  {
>> -	int clear = TSK_RUNNING, set = 0;
>> -
>>  	if (static_branch_likely(&psi_disabled))
>>  		return;
>>  
>>  	if (!sleep) {
>> +		int clear = TSK_RUNNING;
>> +
>>  		if (p->in_memstall)
>>  			clear |= TSK_MEMSTALL;
>> -	} else {
>> -		/*
>> -		 * When a task sleeps, schedule() dequeues it before
>> -		 * switching to the next one. Merge the clearing of
>> -		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
>> -		 * psi_task_change() call in psi_sched_switch().
>> -		 */
>> -		clear |= TSK_ONCPU;
>>  
>> -		if (p->in_iowait)
>> -			set |= TSK_IOWAIT;
>> +		psi_task_change(p, clear, 0);
>>  	}
> Likewise, this really should have a comment for why it's not handling
> TSK_RUNNING to match psi_enqueue()!
>
> 	int clear = TSK_RUNNING;
>
> 	/*
> 	 * A voluntary sleep is a dequeue followed by a task switch. To
> 	 * avoid walking all ancestors twice, psi_task_switch() handles
> 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
> 	 * Do nothing here.
> 	 */
> 	 if (sleep)
> 		return;
>
> 	if (p->in_memstall)
> 		clear |= TSK_MEMSTALL;
>
> 	psi_task_change(p, clear, 0);
> 	
> Thanks

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-02-17  5:09 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-09  7:14 [PATCH] psi: Optimize task switch inside shared cgroups Chengming Zhou
2021-02-16 20:00 ` Johannes Weiner
2021-02-17  5:08   ` [External] " Chengming Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.