All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] psi: Remove the redundant psi_task_tick
@ 2021-02-09  7:10 Chengming Zhou
  2021-02-09 15:48 ` Johannes Weiner
  0 siblings, 1 reply; 3+ messages in thread
From: Chengming Zhou @ 2021-02-09  7:10 UTC (permalink / raw)
  To: hannes, mingo, peterz, juri.lelli, vincent.guittot, rostedt
  Cc: linux-kernel, songmuchun, zhouchengming

When the current task in a cgroup is in_memstall, the corresponding groupc
on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
redundant psi_task_tick from scheduler_tick to save this periodic cost.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 include/linux/psi.h  |  1 -
 kernel/sched/core.c  |  1 -
 kernel/sched/psi.c   | 49 ++++++++++++++-----------------------------------
 kernel/sched/stats.h |  9 ---------
 4 files changed, 14 insertions(+), 46 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023f3fdd..65eb1476ac70 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15d2562118d1..31788a9b335b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4533,7 +4533,6 @@ void scheduler_tick(void)
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
 	calc_global_load_tick(rq);
-	psi_task_tick(rq);
 
 	rq_unlock(rq, &rf);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2293c45d289d..6e46d9eb279b 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
 	u32 delta;
 	u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_MEM_SOME] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
 	}
 
 	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu, false);
+	record_times(groupc, cpu);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (test_state(groupc->tasks, s))
 			state_mask |= (1 << s);
 	}
+
+	/*
+	 * Since we care about lost potential, a memstall is FULL
+	 * when there are no other working tasks, but also when
+	 * the CPU is actively reclaiming and nothing productive
+	 * could run even if it were runnable. So when the current
+	 * task in a cgroup is in_memstall, the corresponding groupc
+	 * on that cpu is in PSI_MEM_FULL state.
+	 */
+	if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
+		state_mask |= (1 << PSI_MEM_FULL);
+
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
@@ -859,21 +853,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	}
 }
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
-
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
-
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
-	}
-}
-
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..9e4e67a94731 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
 	psi_task_switch(prev, next, sleep);
 }
 
-static inline void psi_task_tick(struct rq *rq)
-{
-	if (static_branch_likely(&psi_disabled))
-		return;
-
-	if (unlikely(rq->curr->in_memstall))
-		psi_memstall_tick(rq->curr, cpu_of(rq));
-}
 #else /* CONFIG_PSI */
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +151,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] psi: Remove the redundant psi_task_tick
  2021-02-09  7:10 [PATCH v2] psi: Remove the redundant psi_task_tick Chengming Zhou
@ 2021-02-09 15:48 ` Johannes Weiner
  2021-02-10  2:46   ` [External] " Chengming Zhou
  0 siblings, 1 reply; 3+ messages in thread
From: Johannes Weiner @ 2021-02-09 15:48 UTC (permalink / raw)
  To: Chengming Zhou
  Cc: mingo, peterz, juri.lelli, vincent.guittot, rostedt,
	linux-kernel, songmuchun

Hello Chengming,

On Tue, Feb 09, 2021 at 03:10:33PM +0800, Chengming Zhou wrote:
> When the current task in a cgroup is in_memstall, the corresponding groupc
> on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
> redundant psi_task_tick from scheduler_tick to save this periodic cost.

Can you please update the patch name and the changelog to the new
version of the patch? It's not removing the redundant tick, it's
moving the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state.

> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  include/linux/psi.h  |  1 -
>  kernel/sched/core.c  |  1 -
>  kernel/sched/psi.c   | 49 ++++++++++++++-----------------------------------
>  kernel/sched/stats.h |  9 ---------
>  4 files changed, 14 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/psi.h b/include/linux/psi.h
> index 7361023f3fdd..65eb1476ac70 100644
> --- a/include/linux/psi.h
> +++ b/include/linux/psi.h
> @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>  		     bool sleep);
>  
> -void psi_memstall_tick(struct task_struct *task, int cpu);
>  void psi_memstall_enter(unsigned long *flags);
>  void psi_memstall_leave(unsigned long *flags);
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 15d2562118d1..31788a9b335b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4533,7 +4533,6 @@ void scheduler_tick(void)
>  	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>  	curr->sched_class->task_tick(rq, curr, 0);
>  	calc_global_load_tick(rq);
> -	psi_task_tick(rq);
>  
>  	rq_unlock(rq, &rf);
>  
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2293c45d289d..6e46d9eb279b 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
>  	wake_up_interruptible(&group->poll_wait);
>  }
>  
> -static void record_times(struct psi_group_cpu *groupc, int cpu,
> -			 bool memstall_tick)
> +static void record_times(struct psi_group_cpu *groupc, int cpu)
>  {
>  	u32 delta;
>  	u64 now;
> @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
>  		groupc->times[PSI_MEM_SOME] += delta;
>  		if (groupc->state_mask & (1 << PSI_MEM_FULL))
>  			groupc->times[PSI_MEM_FULL] += delta;
> -		else if (memstall_tick) {
> -			u32 sample;
> -			/*
> -			 * Since we care about lost potential, a
> -			 * memstall is FULL when there are no other
> -			 * working tasks, but also when the CPU is
> -			 * actively reclaiming and nothing productive
> -			 * could run even if it were runnable.
> -			 *
> -			 * When the timer tick sees a reclaiming CPU,
> -			 * regardless of runnable tasks, sample a FULL
> -			 * tick (or less if it hasn't been a full tick
> -			 * since the last state change).
> -			 */
> -			sample = min(delta, (u32)jiffies_to_nsecs(1));
> -			groupc->times[PSI_MEM_FULL] += sample;
> -		}
>  	}
>  
>  	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
> @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
>  	 */
>  	write_seqcount_begin(&groupc->seq);
>  
> -	record_times(groupc, cpu, false);
> +	record_times(groupc, cpu);
>  
>  	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
>  		if (!(m & (1 << t)))
> @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
>  		if (test_state(groupc->tasks, s))
>  			state_mask |= (1 << s);
>  	}
> +
> +	/*
> +	 * Since we care about lost potential, a memstall is FULL
> +	 * when there are no other working tasks, but also when
> +	 * the CPU is actively reclaiming and nothing productive
> +	 * could run even if it were runnable. So when the current
> +	 * task in a cgroup is in_memstall, the corresponding groupc
> +	 * on that cpu is in PSI_MEM_FULL state.
> +	 */
> +	if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
> +		state_mask |= (1 << PSI_MEM_FULL);

This doesn't really work with the psi_task_switch() optimization. If
we switch between two tasks inside a leaf group, where one is memstall
and the other is not, we don't update the parents properly. So you
need another branch in there as well for checking memstall. At which
point the timer tick implementation is likely cheaper and simpler...

> @@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
>  	psi_task_switch(prev, next, sleep);
>  }
>  
> -static inline void psi_task_tick(struct rq *rq)
> -{
> -	if (static_branch_likely(&psi_disabled))
> -		return;
> -
> -	if (unlikely(rq->curr->in_memstall))
> -		psi_memstall_tick(rq->curr, cpu_of(rq));
> -}
>  #else /* CONFIG_PSI */
>  static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
>  static inline void psi_dequeue(struct task_struct *p, bool sleep) {}

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [External] Re: [PATCH v2] psi: Remove the redundant psi_task_tick
  2021-02-09 15:48 ` Johannes Weiner
@ 2021-02-10  2:46   ` Chengming Zhou
  0 siblings, 0 replies; 3+ messages in thread
From: Chengming Zhou @ 2021-02-10  2:46 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: mingo, peterz, juri.lelli, vincent.guittot, rostedt,
	linux-kernel, songmuchun

Hello Johannes,

在 2021/2/9 下午11:48, Johannes Weiner 写道:
> Hello Chengming,
>
> On Tue, Feb 09, 2021 at 03:10:33PM +0800, Chengming Zhou wrote:
>> When the current task in a cgroup is in_memstall, the corresponding groupc
>> on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
>> redundant psi_task_tick from scheduler_tick to save this periodic cost.
> Can you please update the patch name and the changelog to the new
> version of the patch? It's not removing the redundant tick, it's
> moving the reclaim detection from the timer tick to the task state
> tracking machinery using the recently added ONCPU state.

Yes, I will change the name and changelog, it will be clearer for this patch : )

>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  include/linux/psi.h  |  1 -
>>  kernel/sched/core.c  |  1 -
>>  kernel/sched/psi.c   | 49 ++++++++++++++-----------------------------------
>>  kernel/sched/stats.h |  9 ---------
>>  4 files changed, 14 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/psi.h b/include/linux/psi.h
>> index 7361023f3fdd..65eb1476ac70 100644
>> --- a/include/linux/psi.h
>> +++ b/include/linux/psi.h
>> @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
>>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>>  		     bool sleep);
>>  
>> -void psi_memstall_tick(struct task_struct *task, int cpu);
>>  void psi_memstall_enter(unsigned long *flags);
>>  void psi_memstall_leave(unsigned long *flags);
>>  
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 15d2562118d1..31788a9b335b 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -4533,7 +4533,6 @@ void scheduler_tick(void)
>>  	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>>  	curr->sched_class->task_tick(rq, curr, 0);
>>  	calc_global_load_tick(rq);
>> -	psi_task_tick(rq);
>>  
>>  	rq_unlock(rq, &rf);
>>  
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 2293c45d289d..6e46d9eb279b 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
>>  	wake_up_interruptible(&group->poll_wait);
>>  }
>>  
>> -static void record_times(struct psi_group_cpu *groupc, int cpu,
>> -			 bool memstall_tick)
>> +static void record_times(struct psi_group_cpu *groupc, int cpu)
>>  {
>>  	u32 delta;
>>  	u64 now;
>> @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
>>  		groupc->times[PSI_MEM_SOME] += delta;
>>  		if (groupc->state_mask & (1 << PSI_MEM_FULL))
>>  			groupc->times[PSI_MEM_FULL] += delta;
>> -		else if (memstall_tick) {
>> -			u32 sample;
>> -			/*
>> -			 * Since we care about lost potential, a
>> -			 * memstall is FULL when there are no other
>> -			 * working tasks, but also when the CPU is
>> -			 * actively reclaiming and nothing productive
>> -			 * could run even if it were runnable.
>> -			 *
>> -			 * When the timer tick sees a reclaiming CPU,
>> -			 * regardless of runnable tasks, sample a FULL
>> -			 * tick (or less if it hasn't been a full tick
>> -			 * since the last state change).
>> -			 */
>> -			sample = min(delta, (u32)jiffies_to_nsecs(1));
>> -			groupc->times[PSI_MEM_FULL] += sample;
>> -		}
>>  	}
>>  
>>  	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
>> @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
>>  	 */
>>  	write_seqcount_begin(&groupc->seq);
>>  
>> -	record_times(groupc, cpu, false);
>> +	record_times(groupc, cpu);
>>  
>>  	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
>>  		if (!(m & (1 << t)))
>> @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
>>  		if (test_state(groupc->tasks, s))
>>  			state_mask |= (1 << s);
>>  	}
>> +
>> +	/*
>> +	 * Since we care about lost potential, a memstall is FULL
>> +	 * when there are no other working tasks, but also when
>> +	 * the CPU is actively reclaiming and nothing productive
>> +	 * could run even if it were runnable. So when the current
>> +	 * task in a cgroup is in_memstall, the corresponding groupc
>> +	 * on that cpu is in PSI_MEM_FULL state.
>> +	 */
>> +	if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
>> +		state_mask |= (1 << PSI_MEM_FULL);
> This doesn't really work with the psi_task_switch() optimization. If
> we switch between two tasks inside a leaf group, where one is memstall
> and the other is not, we don't update the parents properly. So you
> need another branch in there as well for checking memstall. At which
> point the timer tick implementation is likely cheaper and simpler...

Thank you for pointing out this. I will add another branch there to check memstall to update

the parents properly and send a patch-v2.

Thanks.

>> @@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
>>  	psi_task_switch(prev, next, sleep);
>>  }
>>  
>> -static inline void psi_task_tick(struct rq *rq)
>> -{
>> -	if (static_branch_likely(&psi_disabled))
>> -		return;
>> -
>> -	if (unlikely(rq->curr->in_memstall))
>> -		psi_memstall_tick(rq->curr, cpu_of(rq));
>> -}
>>  #else /* CONFIG_PSI */
>>  static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
>>  static inline void psi_dequeue(struct task_struct *p, bool sleep) {}

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-02-10  2:49 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-09  7:10 [PATCH v2] psi: Remove the redundant psi_task_tick Chengming Zhou
2021-02-09 15:48 ` Johannes Weiner
2021-02-10  2:46   ` [External] " Chengming Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.