* [PATCH v2] sched/psi: Fix avgs_work re-arm in psi_avgs_work()
From: Chengming Zhou @ 2022-10-14 11:05 UTC
  To: hannes, surenb, quic_pkondeti
  Cc: peterz, quic_charante, linux-kernel, Chengming Zhou

Pavan reported that PSI avgs_work idle shutoff is not working at all:
the PSI_NONIDLE condition is always observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times(), even when
the only task running on the CPU is the kworker executing avgs_work.

Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker goes to sleep,
psi_avgs_work() still always re-arms avgs_work, so the shutoff never
takes effect.

This patch introduces PSI_STATE_RESCHEDULE to flag in
get_recent_times() whether avgs_work needs to be re-armed. For the
current CPU, avgs_work is re-armed only when (NR_RUNNING > 1 ||
NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other CPUs it is enough to check
the PSI_NONIDLE delta. The new flag is only meaningful for
psi_avgs_work(), so get_recent_times() sets it only when current_work()
is avgs_work.
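
For clarity, here is a condensed sketch of the two hunks below (an
illustration of the decision described above, not separate code):

  /* In get_recent_times(), only when called from the avgs_work aggregator: */
  if (current_work() == &group->avgs_work.work) {
          bool reschedule;

          if (cpu == current_cpu)
                  /* Discount the avgs_work kworker itself (one NR_RUNNING). */
                  reschedule = tasks[NR_RUNNING] +
                               tasks[NR_IOWAIT] +
                               tasks[NR_MEMSTALL] > 1;
          else
                  /* Remote CPUs: any non-idle delta counts as activity. */
                  reschedule = *pchanged_states & (1 << PSI_NONIDLE);

          if (reschedule)
                  *pchanged_states |= PSI_STATE_RESCHEDULE;
  }

  /* ... and in psi_avgs_work(), re-arm only when that flag came back set: */
  if (changed_states & PSI_STATE_RESCHEDULE)
          schedule_delayed_work(dwork, nsecs_to_jiffies(
                          group->avg_next_update - now) + 1);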

One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will be
stranded in the per-cpu buckets until avgs_work runs the next time. The
buckets can hold 4s worth of time, and future activity will wake
avgs_work with a 2s delay, so at most 2s worth of data is left behind
when avgs_work is shut off. This could only become a problem if the
kworker runs other work after avgs_work has been shut off and the CPU
then sees no scheduler activity for 2s.

Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
v2:
 - also check NR_IOWAIT and NR_MEMSTALL of the current CPU, per Pavan.
 - copy groupc->tasks[] so it can be used outside the retry loop, per Suren.
 - use the new flag PSI_STATE_RESCHEDULE instead of PSI_NONIDLE, per Suren.
 - check that current_work() is avgs_work so that PSI_STATE_RESCHEDULE is
   only used for re-arming avgs_work, per Johannes.
---
 include/linux/psi_types.h |  3 +++
 kernel/sched/psi.c        | 30 +++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 6e4372735068..325981833587 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -72,6 +72,9 @@ enum psi_states {
 /* Use one bit in the state mask to track TSK_ONCPU */
 #define PSI_ONCPU	(1 << NR_PSI_STATES)
 
+/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
+#define PSI_STATE_RESCHEDULE	(1 << (NR_PSI_STATES + 1))
+
 enum psi_aggregators {
 	PSI_AVGS = 0,
 	PSI_POLL,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee2ecc081422..e347e06fdd68 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	int current_cpu = raw_smp_processor_id();
+	unsigned int tasks[NR_PSI_TASK_COUNTS];
 	u64 now, state_start;
 	enum psi_states s;
 	unsigned int seq;
@@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
+		if (cpu == current_cpu)
+			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
 	} while (read_seqcount_retry(&groupc->seq, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +284,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		if (delta)
 			*pchanged_states |= (1 << s);
 	}
+
+	/*
+	 * When collect_percpu_times() from the avgs_work, we don't want to
+	 * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+	 * this avgs_work is never IDLE, so avgs_work can never be shut off.
+	 * So for the current CPU, we need to re-arm avgs_work only when
+	 * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+	 * we can just check PSI_NONIDLE delta.
+	 */
+	if (current_work() == &group->avgs_work.work) {
+		bool reschedule;
+
+		if (cpu == current_cpu)
+			reschedule = tasks[NR_RUNNING] +
+				     tasks[NR_IOWAIT] +
+				     tasks[NR_MEMSTALL] > 1;
+		else
+			reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+		if (reschedule)
+			*pchanged_states |= PSI_STATE_RESCHEDULE;
+	}
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -415,7 +441,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool nonidle;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -426,7 +451,6 @@ static void psi_avgs_work(struct work_struct *work)
 	now = sched_clock();
 
 	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -437,7 +461,7 @@ static void psi_avgs_work(struct work_struct *work)
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 
-	if (nonidle) {
+	if (changed_states & PSI_STATE_RESCHEDULE) {
 		schedule_delayed_work(dwork, nsecs_to_jiffies(
 				group->avg_next_update - now) + 1);
 	}
-- 
2.37.2



* Re: [PATCH v2] sched/psi: Fix avgs_work re-arm in psi_avgs_work()
From: Johannes Weiner @ 2022-10-14 14:18 UTC
  To: Chengming Zhou; +Cc: surenb, quic_pkondeti, peterz, quic_charante, linux-kernel

On Fri, Oct 14, 2022 at 07:05:51PM +0800, Chengming Zhou wrote:
> [...]
> 
> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>


* Re: [PATCH v2] sched/psi: Fix avgs_work re-arm in psi_avgs_work()
From: Suren Baghdasaryan @ 2022-10-14 16:41 UTC
  To: Johannes Weiner
  Cc: Chengming Zhou, quic_pkondeti, peterz, quic_charante, linux-kernel

On Fri, Oct 14, 2022 at 7:18 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Oct 14, 2022 at 07:05:51PM +0800, Chengming Zhou wrote:
> > [...]
> >
> > Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
> > Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>

Please make sure to test the final version. With that done,

Acked-by: Suren Baghdasaryan <surenb@google.com>


* Re: [PATCH v2] sched/psi: Fix avgs_work re-arm in psi_avgs_work()
From: Chengming Zhou @ 2022-10-15  9:43 UTC
  To: peterz, Suren Baghdasaryan, Johannes Weiner
  Cc: quic_pkondeti, quic_charante, linux-kernel

On 2022/10/15 00:41, Suren Baghdasaryan wrote:
> On Fri, Oct 14, 2022 at 7:18 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>>
>> On Fri, Oct 14, 2022 at 07:05:51PM +0800, Chengming Zhou wrote:
>>> [...]
>>>
>>> Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>
>> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> Please make sure to test the final version. With that done,
> 
> Acked-by: Suren Baghdasaryan <surenb@google.com>

Thanks! Yes, I tested this version in a VM yesterday, so please add:

Tested-by: Chengming Zhou <zhouchengming@bytedance.com>



* [tip: sched/core] sched/psi: Fix avgs_work re-arm in psi_avgs_work()
From: tip-bot2 for Chengming Zhou @ 2022-10-31 10:53 UTC
  To: linux-tip-commits
  Cc: Pavan Kondeti, Chengming Zhou, Peter Zijlstra (Intel),
	Johannes Weiner, Suren Baghdasaryan, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2fcd7bbae90a6d844da8660a9d27079281dfbba2
Gitweb:        https://git.kernel.org/tip/2fcd7bbae90a6d844da8660a9d27079281dfbba2
Author:        Chengming Zhou <zhouchengming@bytedance.com>
AuthorDate:    Fri, 14 Oct 2022 19:05:51 +08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sun, 30 Oct 2022 10:12:14 +01:00

sched/psi: Fix avgs_work re-arm in psi_avgs_work()

Pavan reported that PSI avgs_work idle shutoff is not working at all:
the PSI_NONIDLE condition is always observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times(), even when
the only task running on the CPU is the kworker executing avgs_work.

Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker goes to sleep,
psi_avgs_work() still always re-arms avgs_work, so the shutoff never
takes effect.

This patch introduces PSI_STATE_RESCHEDULE to flag in
get_recent_times() whether avgs_work needs to be re-armed. For the
current CPU, avgs_work is re-armed only when (NR_RUNNING > 1 ||
NR_IOWAIT > 0 || NR_MEMSTALL > 0); for other CPUs it is enough to check
the PSI_NONIDLE delta. The new flag is only meaningful for
psi_avgs_work(), so get_recent_times() sets it only when current_work()
is avgs_work.

One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will be
stranded in the per-cpu buckets until avgs_work runs the next time. The
buckets can hold 4s worth of time, and future activity will wake
avgs_work with a 2s delay, so at most 2s worth of data is left behind
when avgs_work is shut off. This could only become a problem if the
kworker runs other work after avgs_work has been shut off and the CPU
then sees no scheduler activity for 2s.

Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@bytedance.com
---
 include/linux/psi_types.h |  3 +++
 kernel/sched/psi.c        | 30 +++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 6e43727..3259818 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -72,6 +72,9 @@ enum psi_states {
 /* Use one bit in the state mask to track TSK_ONCPU */
 #define PSI_ONCPU	(1 << NR_PSI_STATES)
 
+/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
+#define PSI_STATE_RESCHEDULE	(1 << (NR_PSI_STATES + 1))
+
 enum psi_aggregators {
 	PSI_AVGS = 0,
 	PSI_POLL,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7f40d87..a4348af 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -242,6 +242,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	int current_cpu = raw_smp_processor_id();
+	unsigned int tasks[NR_PSI_TASK_COUNTS];
 	u64 now, state_start;
 	enum psi_states s;
 	unsigned int seq;
@@ -256,6 +258,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
+		if (cpu == current_cpu)
+			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
 	} while (read_seqcount_retry(&groupc->seq, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +284,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		if (delta)
 			*pchanged_states |= (1 << s);
 	}
+
+	/*
+	 * When collect_percpu_times() from the avgs_work, we don't want to
+	 * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+	 * this avgs_work is never IDLE, so avgs_work can never be shut off.
+	 * So for the current CPU, we need to re-arm avgs_work only when
+	 * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+	 * we can just check PSI_NONIDLE delta.
+	 */
+	if (current_work() == &group->avgs_work.work) {
+		bool reschedule;
+
+		if (cpu == current_cpu)
+			reschedule = tasks[NR_RUNNING] +
+				     tasks[NR_IOWAIT] +
+				     tasks[NR_MEMSTALL] > 1;
+		else
+			reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+		if (reschedule)
+			*pchanged_states |= PSI_STATE_RESCHEDULE;
+	}
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -415,7 +441,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool nonidle;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -426,7 +451,6 @@ static void psi_avgs_work(struct work_struct *work)
 	now = sched_clock();
 
 	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -437,7 +461,7 @@ static void psi_avgs_work(struct work_struct *work)
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 
-	if (nonidle) {
+	if (changed_states & PSI_STATE_RESCHEDULE) {
 		schedule_delayed_work(dwork, nsecs_to_jiffies(
 				group->avg_next_update - now) + 1);
 	}
