linux-kernel.vger.kernel.org archive mirror
* [PATCH] psi: fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
From: Brian Chen @ 2021-11-10 21:33 UTC
  To: hannes
  Cc: brianchen118, brianc118, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, open list

We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single-threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.
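
For illustration, a minimal reproducer along these lines (a sketch,
not part of the patch; the allocation size and loop shape are
assumptions) could be:

        /*
         * Leak and touch memory from a single thread. Run inside a
         * cgroup with swap, memory.high=10M and cpu.max at 5%, then
         * watch the some/full lines of the cgroup's memory.pressure
         * file diverge (before this fix) or track each other (after).
         */
        #include <stdlib.h>
        #include <string.h>

        int main(void)
        {
                for (;;) {
                        char *p = malloc(1 << 20); /* leak 1M per iteration */

                        if (p)
                                memset(p, 1, 1 << 20); /* touch to force real pages */
                }
                return 0;
        }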

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.
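
To make the before/after concrete, here is a small userspace model of
the two checks (a sketch: the counter layout mirrors the patched
psi_types.h, and mem_full_old()/mem_full_new() are hypothetical
stand-ins for the two versions of test_state()):

        #include <stdio.h>
        #include <stdbool.h>

        enum psi_task_count {
                NR_IOWAIT,
                NR_MEMSTALL,
                NR_RUNNING,
                NR_ONCPU,
                NR_MEMSTALL_RUNNING,
                NR_PSI_TASK_COUNTS = 5,
        };

        /* old check: FULL only when nothing is runnable at all */
        static bool mem_full_old(const unsigned int *tasks)
        {
                return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
        }

        /* new check: FULL when every runnable task is in memstall */
        static bool mem_full_new(const unsigned int *tasks)
        {
                return tasks[NR_MEMSTALL] &&
                        tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
        }

        int main(void)
        {
                /* the lone reclaimer: in memstall and on the runqueue */
                unsigned int tasks[NR_PSI_TASK_COUNTS] = {
                        [NR_MEMSTALL] = 1,
                        [NR_RUNNING] = 1,
                        [NR_MEMSTALL_RUNNING] = 1,
                };

                /* prints "old=0 new=1": the stall is now counted as FULL */
                printf("old=%d new=%d\n", mem_full_old(tasks),
                       mem_full_new(tasks));
                return 0;
        }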

Signed-off-by: Brian Chen <brianchen118@gmail.com>
---
 include/linux/psi_types.h | 13 ++++++++++-
 kernel/sched/psi.c        | 45 ++++++++++++++++++++++++---------------
 kernel/sched/stats.h      |  5 ++++-
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 0a23300d49af..0819c82dba92 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -21,7 +21,17 @@ enum psi_task_count {
 	 * don't have to special case any state tracking for it.
 	 */
 	NR_ONCPU,
-	NR_PSI_TASK_COUNTS = 4,
+	/*
+	 * For IO and CPU stalls the presence of running/oncpu tasks
+	 * in the domain means a partial rather than a full stall.
+	 * For memory it's not so simple because of page reclaimers:
+	 * they are running/oncpu while representing a stall. To tell
+	 * whether a domain has productivity left or not, we need to
+	 * distinguish between regular running (i.e. productive)
+	 * threads and memstall ones.
+	 */
+	NR_MEMSTALL_RUNNING,
+	NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
@@ -29,6 +39,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 #define TSK_ONCPU	(1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..69b19d3af690 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,13 +34,19 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *	SOME = nr_delayed_tasks != 0
- *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +87,13 @@
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
  *	   SOME = min(nr_delayed_tasks / threads, 1)
- *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *	threads = min(257, 256)
  *	   SOME = min(1 / 256, 1)             = 0.4%
- *	   FULL = (256 - min(257, 256)) / 256 = 0%
+ *	   FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
@@ -112,7 +118,7 @@
  * For each runqueue, we track:
  *
  *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -233,7 +239,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_SOME:
 		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
 		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
 	case PSI_CPU_FULL:
@@ -710,10 +717,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (groupc->tasks[t]) {
 			groupc->tasks[t]--;
 		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], clear, set);
+					groupc->tasks[3], groupc->tasks[4],
+					clear, set);
 			psi_bug = 1;
 		}
 	}
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		int clear = TSK_ONCPU, set = 0;
 
 		/*
-		 * When we're going to sleep, psi_dequeue() lets us handle
-		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-		 * with TSK_ONCPU and save walking common ancestors twice.
+		 * When we're going to sleep, psi_dequeue() lets us
+		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+		 * TSK_IOWAIT here, where we can combine it with
+		 * TSK_ONCPU and save walking common ancestors twice.
 		 */
 		if (sleep) {
 			clear |= TSK_RUNNING;
+			if (prev->in_memstall)
+				clear |= TSK_MEMSTALL_RUNNING;
 			if (prev->in_iowait)
 				set |= TSK_IOWAIT;
 		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 1;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 0;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
 	rq_unlock_irq(rq, &rf);
 }
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cfb0893a83d4..3a3c826dd83a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	if (p->in_memstall)
+		set |= TSK_MEMSTALL_RUNNING;
+
 	if (!wakeup || p->sched_psi_wake_requeue) {
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		return;
 
 	if (p->in_memstall)
-		clear |= TSK_MEMSTALL;
+		clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	psi_task_change(p, clear, 0);
 }
-- 
2.30.2



* Re: [PATCH] psi: fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
From: Johannes Weiner @ 2021-11-12 16:53 UTC
  To: Brian Chen
  Cc: brianc118, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, open list

On Wed, Nov 10, 2021 at 09:33:12PM +0000, Brian Chen wrote:
> We've noticed cases where tasks in a cgroup are stalled on memory but
> there is little memory FULL pressure since tasks stay on the runqueue
> in reclaim.
> 
> A simple example involves a single-threaded program that keeps leaking
> and touching large amounts of memory. It runs in a cgroup with swap
> enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
> there is significant CPU pressure and memory SOME, there is barely any
> memory FULL since the task enters reclaim and stays on the runqueue.
> However, this memory-bound task is effectively stalled on memory and
> we expect memory FULL to match memory SOME in this scenario.
> 
> The code is confused about memstall && running, thinking there is a
> stalled task and a productive task when there's only one task: a
> reclaimer that's counted as both. To fix this, we redefine the
> condition for PSI_MEM_FULL to check that all running tasks are in an
> active memstall instead of checking that there are no running tasks.
> 
>         case PSI_MEM_FULL:
> -               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
> +               return unlikely(tasks[NR_MEMSTALL] &&
> +                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
> 
> This will capture reclaimers. It will also capture tasks that called
> psi_memstall_enter() and are about to sleep, but this should be
> negligible noise.
> 
> Signed-off-by: Brian Chen <brianchen118@gmail.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

This bug essentially causes us to count memory-some in walltime and
memory-full in tasktime, which can be quite confusing and misleading
in combined CPU and memory pressure situations. For example, a lone
runnable reclaimer accrues SOME for every second of wallclock stall
but, under the old definition, no FULL at all.

The fix looks good to me, thanks Brian.

The bug's been there since the initial psi commit, so I don't think a
stable backport is warranted.

Peter, absent objections, can you please pick this up through -tip?


* Re: [PATCH] psi: fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
From: Peter Zijlstra @ 2021-11-13 7:39 UTC
  To: Johannes Weiner
  Cc: Brian Chen, brianc118, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, open list

On Fri, Nov 12, 2021 at 11:53:20AM -0500, Johannes Weiner wrote:
> On Wed, Nov 10, 2021 at 09:33:12PM +0000, Brian Chen wrote:
> > We've noticed cases where tasks in a cgroup are stalled on memory but
> > there is little memory FULL pressure since tasks stay on the runqueue
> > in reclaim.
> > 
> > A simple example involves a single-threaded program that keeps leaking
> > and touching large amounts of memory. It runs in a cgroup with swap
> > enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
> > there is significant CPU pressure and memory SOME, there is barely any
> > memory FULL since the task enters reclaim and stays on the runqueue.
> > However, this memory-bound task is effectively stalled on memory and
> > we expect memory FULL to match memory SOME in this scenario.
> > 
> > The code is confused about memstall && running, thinking there is a
> > stalled task and a productive task when there's only one task: a
> > reclaimer that's counted as both. To fix this, we redefine the
> > condition for PSI_MEM_FULL to check that all running tasks are in an
> > active memstall instead of checking that there are no running tasks.
> > 
> >         case PSI_MEM_FULL:
> > -               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
> > +               return unlikely(tasks[NR_MEMSTALL] &&
> > +                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
> > 
> > This will capture reclaimers. It will also capture tasks that called
> > psi_memstall_enter() and are about to sleep, but this should be
> > negligible noise.
> > 
> > Signed-off-by: Brian Chen <brianchen118@gmail.com>
> 
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> This bug essentially causes us to count memory-some in walltime and
> memory-full in tasktime, which can be quite confusing and misleading
> in combined CPU and memory pressure situations.
> 
> The fix looks good to me, thanks Brian.
> 
> The bug's been there since the initial psi commit, so I don't think a
> stable backport is warranted.
> 
> Peter, absent objections, can you please pick this up through -tip?

Yep can do. Note that our psi_group_cpu data structure is now completely
filled (the extra tasks state filled the last hole):

struct psi_group_cpu {
	seqcount_t                 seq __attribute__((__aligned__(64))); /*     0     4 */
	unsigned int               tasks[5];             /*     4    20 */
	u32                        state_mask;           /*    24     4 */
	u32                        times[7];             /*    28    28 */
	u64                        state_start;          /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        times_prev[2][7] __attribute__((__aligned__(64))); /*    64    56 */

	/* size: 128, cachelines: 2, members: 6 */
	/* padding: 8 */
	/* forced alignments: 2 */
} __attribute__((__aligned__(64)));
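
Spelling out the first cacheline from the dump (my arithmetic, not
part of the original message):

        4 (seq) + 20 (tasks[5]) + 4 (state_mask) + 28 (times[7])
          + 8 (state_start) = 64 bytes

With tasks[4] the sum was 60, and the 8-byte alignment of state_start
left a 4-byte hole at offsets 52-56.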



* [tip: sched/core] psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
From: tip-bot2 for Brian Chen @ 2021-11-17 13:59 UTC
  To: linux-tip-commits
  Cc: Brian Chen, Peter Zijlstra (Intel), Johannes Weiner, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf
Gitweb:        https://git.kernel.org/tip/cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf
Author:        Brian Chen <brianchen118@gmail.com>
AuthorDate:    Wed, 10 Nov 2021 21:33:12 
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Wed, 17 Nov 2021 14:49:00 +01:00

psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim

We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single-threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
---
 include/linux/psi_types.h | 13 ++++++++++-
 kernel/sched/psi.c        | 45 +++++++++++++++++++++++---------------
 kernel/sched/stats.h      |  5 +++-
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index bf50068..516c0fe 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -22,7 +22,17 @@ enum psi_task_count {
 	 * don't have to special case any state tracking for it.
 	 */
 	NR_ONCPU,
-	NR_PSI_TASK_COUNTS = 4,
+	/*
+	 * For IO and CPU stalls the presence of running/oncpu tasks
+	 * in the domain means a partial rather than a full stall.
+	 * For memory it's not so simple because of page reclaimers:
+	 * they are running/oncpu while representing a stall. To tell
+	 * whether a domain has productivity left or not, we need to
+	 * distinguish between regular running (i.e. productive)
+	 * threads and memstall ones.
+	 */
+	NR_MEMSTALL_RUNNING,
+	NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
@@ -30,6 +40,7 @@ enum psi_task_count {
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
 #define TSK_ONCPU	(1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 3397fa0..a679613 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -35,13 +35,19 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
  *	SOME = nr_delayed_tasks != 0
- *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
  *
  * The percentage of wallclock time spent in those compound stall
  * states gives pressure numbers between 0 and 100 for each resource,
@@ -82,13 +88,13 @@
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
  *	   SOME = min(nr_delayed_tasks / threads, 1)
- *	   FULL = (threads - min(nr_running_tasks, threads)) / threads
+ *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
  *
  * For the 257 number crunchers on 256 CPUs, this yields:
  *
  *	threads = min(257, 256)
  *	   SOME = min(1 / 256, 1)             = 0.4%
- *	   FULL = (256 - min(257, 256)) / 256 = 0%
+ *	   FULL = (256 - min(256, 256)) / 256 = 0%
  *
  * For the 1 out of 4 memory-delayed tasks, this yields:
  *
@@ -113,7 +119,7 @@
  * For each runqueue, we track:
  *
  *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
  *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
  *
  * and then periodically aggregate:
@@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_SOME:
 		return unlikely(tasks[NR_MEMSTALL]);
 	case PSI_MEM_FULL:
-		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+		return unlikely(tasks[NR_MEMSTALL] &&
+			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
 		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
 	case PSI_CPU_FULL:
@@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (groupc->tasks[t]) {
 			groupc->tasks[t]--;
 		} else if (!psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					groupc->tasks[3], clear, set);
+					groupc->tasks[3], groupc->tasks[4],
+					clear, set);
 			psi_bug = 1;
 		}
 	}
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		int clear = TSK_ONCPU, set = 0;
 
 		/*
-		 * When we're going to sleep, psi_dequeue() lets us handle
-		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
-		 * with TSK_ONCPU and save walking common ancestors twice.
+		 * When we're going to sleep, psi_dequeue() lets us
+		 * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+		 * TSK_IOWAIT here, where we can combine it with
+		 * TSK_ONCPU and save walking common ancestors twice.
 		 */
 		if (sleep) {
 			clear |= TSK_RUNNING;
+			if (prev->in_memstall)
+				clear |= TSK_MEMSTALL_RUNNING;
 			if (prev->in_iowait)
 				set |= TSK_IOWAIT;
 		}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 1;
-	psi_task_change(current, 0, TSK_MEMSTALL);
+	psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	rq_unlock_irq(rq, &rf);
 }
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
 	rq = this_rq_lock_irq(&rf);
 
 	current->in_memstall = 0;
-	psi_task_change(current, TSK_MEMSTALL, 0);
+	psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
 
 	rq_unlock_irq(rq, &rf);
 }
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cfb0893..3a3c826 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	if (p->in_memstall)
+		set |= TSK_MEMSTALL_RUNNING;
+
 	if (!wakeup || p->sched_psi_wake_requeue) {
 		if (p->in_memstall)
 			set |= TSK_MEMSTALL;
@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		return;
 
 	if (p->in_memstall)
-		clear |= TSK_MEMSTALL;
+		clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
 
 	psi_task_change(p, clear, 0);
 }

