* [PATCH 0/2] Cleanup and Fix for wrong accounting of migrated cache hot tasks
@ 2023-06-14 10:22 Swapnil Sapkal
  2023-06-14 10:22 ` [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat Swapnil Sapkal
  2023-06-14 10:22 ` [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability Swapnil Sapkal
  0 siblings, 2 replies; 7+ messages in thread
From: Swapnil Sapkal @ 2023-06-14 10:22 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	iamjoonsoo.kim, linux-kernel, gautham.shenoy, kprateek.nayak,
	wyes.karny, Swapnil Sapkal

In /proc/schedstat, lb_hot_gained reports the number of times cache-hot
tasks were migrated as part of load balancing. This value is incremented
in can_migrate_task() if the task is cache hot and migratable. However,
after the increment, the load balancer can still decide not to migrate
the task, leaving the count wrong. Fix this by incrementing the counter
in detach_task() instead.
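
For reference, lb_hot_gained is one of the per-domain load_balance()
counters in /proc/schedstat. Below is a minimal user-space sketch for
dumping it; it assumes the version 15 field layout described in
Documentation/scheduler/sched-stats.rst (eight load_balance() counters
per idle type, with lb_hot_gained sixth in each group), so check the
version reported on the first line of /proc/schedstat before relying on
these offsets.

/*
 * Minimal sketch (assumes schedstat version 15 layout; verify against
 * Documentation/scheduler/sched-stats.rst for your kernel). Each
 * "domainN" line carries a name, a cpumask, then eight load_balance()
 * counters per idle type in the order idle, busy, newly idle;
 * lb_hot_gained is the sixth counter of each group.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/proc/schedstat", "r");
	char line[4096], cpu[64] = "?";

	if (!fp) {
		perror("/proc/schedstat");
		return 1;
	}

	while (fgets(line, sizeof(line), fp)) {
		char name[64], mask[256];
		unsigned long long f[24] = {0};
		int n;

		if (!strncmp(line, "cpu", 3)) {
			sscanf(line, "%63s", cpu);
			continue;
		}
		if (strncmp(line, "domain", 6))
			continue;

		/* parse the first 24 load_balance() counters after the cpumask */
		n = sscanf(line,
			   "%63s %255s %llu %llu %llu %llu %llu %llu %llu %llu"
			   " %llu %llu %llu %llu %llu %llu %llu %llu"
			   " %llu %llu %llu %llu %llu %llu %llu %llu",
			   name, mask,
			   &f[0], &f[1], &f[2], &f[3], &f[4], &f[5], &f[6], &f[7],
			   &f[8], &f[9], &f[10], &f[11], &f[12], &f[13], &f[14], &f[15],
			   &f[16], &f[17], &f[18], &f[19], &f[20], &f[21], &f[22], &f[23]);
		if (n < 26)
			continue;

		printf("%s %s: lb_hot_gained idle=%llu busy=%llu newidle=%llu\n",
		       cpu, name, f[5], f[13], f[21]);
	}
	fclose(fp);
	return 0;
}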

While at it, clean up migrate_degrades_locality() by making it return
an enum instead of {-1, 0, 1} to improve the readability of
can_migrate_task().

Swapnil Sapkal (2):
  sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat
  sched/fair: Cleanup in migrate_degrades_locality() to improve
    readability

 kernel/sched/fair.c | 100 +++++++++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 34 deletions(-)

-- 
2.34.1



* [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat
  2023-06-14 10:22 [PATCH 0/2] Cleanup and Fix for wrong accounting of migrated cache hot tasks Swapnil Sapkal
@ 2023-06-14 10:22 ` Swapnil Sapkal
  2023-06-19  9:22   ` Peter Zijlstra
  2023-06-14 10:22 ` [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability Swapnil Sapkal
  1 sibling, 1 reply; 7+ messages in thread
From: Swapnil Sapkal @ 2023-06-14 10:22 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	iamjoonsoo.kim, linux-kernel, gautham.shenoy, kprateek.nayak,
	wyes.karny, Swapnil Sapkal

In /proc/schedstat, lb_hot_gained reports the number of hot tasks pulled
during load balance. This value is incremented in can_migrate_task()
if the task is migratable and hot. After incrementing the value, the
load balancer can still decide not to migrate this task, leading to wrong
accounting. Fix this by incrementing the stat when hot tasks are detached.
This issue only exists in detach_tasks(), where we can decide not to
migrate a hot task even if it is migratable. In detach_one_task(), however,
we migrate it unconditionally.

Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead")
Reported-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
---
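To see the skew outside the kernel, here is a simplified standalone model
(illustrative only, not kernel code): a counter bumped when the migration
check approves a hot task keeps counting tasks that the detach loop later
decides to leave in place, while a counter bumped at detach time counts
only tasks that actually move.

/*
 * Toy model of the accounting problem (illustrative only, not kernel
 * code). "hot_at_check" mimics bumping lb_hot_gained in
 * can_migrate_task(); "hot_at_detach" mimics bumping it in
 * detach_task(). When the balancer later skips a migratable hot task,
 * the two counters diverge.
 */
#include <stdio.h>
#include <stdbool.h>

struct task {
	bool hot;
	bool migratable;
	bool balancer_skips;	/* detach_tasks() decides not to move it */
};

int main(void)
{
	struct task tasks[] = {
		{ .hot = true,  .migratable = true,  .balancer_skips = false },
		{ .hot = true,  .migratable = true,  .balancer_skips = true  },
		{ .hot = false, .migratable = true,  .balancer_skips = false },
		{ .hot = true,  .migratable = false, .balancer_skips = false },
	};
	unsigned int hot_at_check = 0, hot_at_detach = 0;

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct task *p = &tasks[i];

		if (!p->migratable)
			continue;

		/* old scheme: count the hot task as soon as it passes the check */
		if (p->hot)
			hot_at_check++;

		if (p->balancer_skips)
			continue;	/* task is never detached */

		/* fixed scheme: count it only when it is actually detached */
		if (p->hot)
			hot_at_detach++;
	}

	printf("counted in can_migrate_task(): %u\n", hot_at_check);	/* 2 */
	printf("counted in detach_task():      %u\n", hot_at_detach);	/* 1 */
	return 0;
}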
 kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..9a8e5dcbe7e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8507,9 +8507,9 @@ static inline int migrate_degrades_locality(struct task_struct *p,
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static
-int can_migrate_task(struct task_struct *p, struct lb_env *env)
+int can_migrate_task(struct task_struct *p, struct lb_env *env, int *tsk_cache_hot)
 {
-	int tsk_cache_hot;
+	int degrades_locality;
 
 	lockdep_assert_rq_held(env->src_rq);
 
@@ -8578,18 +8578,19 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (env->flags & LBF_ACTIVE_LB)
 		return 1;
 
-	tsk_cache_hot = migrate_degrades_locality(p, env);
-	if (tsk_cache_hot == -1)
-		tsk_cache_hot = task_hot(p, env);
+	degrades_locality = migrate_degrades_locality(p, env);
+	if (degrades_locality == -1)
+		*tsk_cache_hot = task_hot(p, env);
+	else
+		*tsk_cache_hot = degrades_locality;
 
-	if (tsk_cache_hot <= 0 ||
-	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-		if (tsk_cache_hot == 1) {
-			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
-			schedstat_inc(p->stats.nr_forced_migrations);
-		}
+	/*
+	 * Can migrate a hot task only after the attempts to reach balance
+	 * without the task have exceeded the cache_nice_tries threshold.
+	 */
+	if (!(*tsk_cache_hot) ||
+		env->sd->nr_balance_failed > env->sd->cache_nice_tries)
 		return 1;
-	}
 
 	schedstat_inc(p->stats.nr_failed_migrations_hot);
 	return 0;
@@ -8598,10 +8599,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 /*
  * detach_task() -- detach the task for the migration specified in env
  */
-static void detach_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env, int tsk_cache_hot)
 {
 	lockdep_assert_rq_held(env->src_rq);
 
+	if (tsk_cache_hot == 1) {
+		schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+		schedstat_inc(p->stats.nr_forced_migrations);
+	}
+
 	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
 	set_task_cpu(p, env->dst_cpu);
 }
@@ -8620,10 +8626,12 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 
 	list_for_each_entry_reverse(p,
 			&env->src_rq->cfs_tasks, se.group_node) {
-		if (!can_migrate_task(p, env))
+		int tsk_cache_hot = 0;
+
+		if (!can_migrate_task(p, env, &tsk_cache_hot))
 			continue;
 
-		detach_task(p, env);
+		detach_task(p, env, tsk_cache_hot);
 
 		/*
 		 * Right now, this is only the second place where
@@ -8665,6 +8673,8 @@ static int detach_tasks(struct lb_env *env)
 		return 0;
 
 	while (!list_empty(tasks)) {
+		int tsk_cache_hot = 0;
+
 		/*
 		 * We don't want to steal all, otherwise we may be treated likewise,
 		 * which could at worst lead to a livelock crash.
@@ -8690,7 +8700,7 @@ static int detach_tasks(struct lb_env *env)
 
 		p = list_last_entry(tasks, struct task_struct, se.group_node);
 
-		if (!can_migrate_task(p, env))
+		if (!can_migrate_task(p, env, &tsk_cache_hot))
 			goto next;
 
 		switch (env->migration_type) {
@@ -8742,7 +8752,7 @@ static int detach_tasks(struct lb_env *env)
 			break;
 		}
 
-		detach_task(p, env);
+		detach_task(p, env, tsk_cache_hot);
 		list_add(&p->se.group_node, &env->tasks);
 
 		detached++;
@@ -8766,6 +8776,9 @@ static int detach_tasks(struct lb_env *env)
 
 		continue;
 next:
+		if (tsk_cache_hot == 1)
+			schedstat_inc(p->stats.nr_failed_migrations_hot);
+
 		list_move(&p->se.group_node, tasks);
 	}
 
-- 
2.34.1



* [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability
  2023-06-14 10:22 [PATCH 0/2] Cleanup and Fix for wrong accounting of migrated cache hot tasks Swapnil Sapkal
  2023-06-14 10:22 ` [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat Swapnil Sapkal
@ 2023-06-14 10:22 ` Swapnil Sapkal
  2023-06-19  9:45   ` Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Swapnil Sapkal @ 2023-06-14 10:22 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	iamjoonsoo.kim, linux-kernel, gautham.shenoy, kprateek.nayak,
	wyes.karny, Swapnil Sapkal

migrate_degrades_locality() returns a tristate value indicating whether
the migration will improve locality, degrade locality, or have no
impact. Handle these return values with an enum to improve the
readability.

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
---
 kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a8e5dcbe7e6..06813ce5356e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8443,45 +8443,52 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
+enum migration_impact {
+	/* if task migration is not affected by locality */
+	no_change = -1,
+
+	/* if task migration improves locality i.e migration preferred */
+	improves_locality = 0,
+
+	/* if task migration degrades locality */
+	degrades_locality = 1
+};
+
 #ifdef CONFIG_NUMA_BALANCING
-/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
- */
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static enum migration_impact
+migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	unsigned long src_weight, dst_weight;
 	int src_nid, dst_nid, dist;
 
 	if (!static_branch_likely(&sched_numa_balancing))
-		return -1;
+		return no_change;
 
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
-		return -1;
+		return no_change;
 
 	src_nid = cpu_to_node(env->src_cpu);
 	dst_nid = cpu_to_node(env->dst_cpu);
 
 	if (src_nid == dst_nid)
-		return -1;
+		return no_change;
 
 	/* Migrating away from the preferred node is always bad. */
 	if (src_nid == p->numa_preferred_nid) {
 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
-			return 1;
+			return degrades_locality;
 		else
-			return -1;
+			return no_change;
 	}
 
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
-		return 0;
+		return improves_locality;
 
 	/* Leaving a core idle is often worse than degrading locality. */
 	if (env->idle == CPU_IDLE)
-		return -1;
+		return no_change;
 
 	dist = node_distance(src_nid, dst_nid);
 	if (numa_group) {
@@ -8492,14 +8499,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 		dst_weight = task_weight(p, dst_nid, dist);
 	}
 
-	return dst_weight < src_weight;
+	return (dst_weight < src_weight) ? degrades_locality : improves_locality;
 }
 
 #else
-static inline int migrate_degrades_locality(struct task_struct *p,
-					     struct lb_env *env)
+static inline enum migration_impact
+migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
-	return -1;
+	return no_change;
 }
 #endif
 
@@ -8509,7 +8516,7 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env, int *tsk_cache_hot)
 {
-	int degrades_locality;
+	enum migration_impact migration_impact;
 
 	lockdep_assert_rq_held(env->src_rq);
 
@@ -8578,18 +8585,30 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env, int *tsk_cache_h
 	if (env->flags & LBF_ACTIVE_LB)
 		return 1;
 
-	degrades_locality = migrate_degrades_locality(p, env);
-	if (degrades_locality == -1)
+	migration_impact = migrate_degrades_locality(p, env);
+
+	switch (migration_impact) {
+	case no_change:
 		*tsk_cache_hot = task_hot(p, env);
-	else
-		*tsk_cache_hot = degrades_locality;
+		break;
+
+	case degrades_locality:
+		*tsk_cache_hot = 1;
+		break;
+
+	case improves_locality:
+		*tsk_cache_hot = 0;
+		break;
+	}
+
+	if (!(*tsk_cache_hot))
+		return 1;
 
 	/*
-	 * Can migrate a hot task only after the attempts to reach balance
+	 * Can migrate a task only after the attempts to reach balance
 	 * without the task have exceeded the cache_nice_tries threshold.
 	 */
-	if (!(*tsk_cache_hot) ||
-		env->sd->nr_balance_failed > env->sd->cache_nice_tries)
+	if (env->sd->nr_balance_failed > env->sd->cache_nice_tries)
 		return 1;
 
 	schedstat_inc(p->stats.nr_failed_migrations_hot);
-- 
2.34.1



* Re: [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat
  2023-06-14 10:22 ` [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat Swapnil Sapkal
@ 2023-06-19  9:22   ` Peter Zijlstra
  2023-06-21  4:38     ` Swapnil Sapkal
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2023-06-19  9:22 UTC (permalink / raw)
  To: Swapnil Sapkal
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, iamjoonsoo.kim,
	linux-kernel, gautham.shenoy, kprateek.nayak, wyes.karny

On Wed, Jun 14, 2023 at 10:22:23AM +0000, Swapnil Sapkal wrote:
> In /proc/schedstat, lb_hot_gained reports the number of hot tasks pulled
> during load balance. This value is incremented in can_migrate_task()
> if the task is migratable and hot. After incrementing the value, the
> load balancer can still decide not to migrate this task, leading to wrong
> accounting. Fix this by incrementing the stat when hot tasks are detached.
> This issue only exists in detach_tasks(), where we can decide not to
> migrate a hot task even if it is migratable. In detach_one_task(), however,
> we migrate it unconditionally.
> 
> Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead")
> Reported-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
> Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
> ---
>  kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++----------------
>  1 file changed, 30 insertions(+), 17 deletions(-)

All this for just a number hardly anybody looks at :-(

Does this also work?

Please double check the order of the task_struct::sched_bitfield thing,
I've not had much wake-up juice.

---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 14 ++++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1292d38d66cc..eba0a78ac2a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -887,6 +887,7 @@ struct task_struct {
 	unsigned			sched_reset_on_fork:1;
 	unsigned			sched_contributes_to_load:1;
 	unsigned			sched_migrated:1;
+	unsigned			sched_task_hot:1;
 
 	/* Force alignment to the next boundary: */
 	unsigned			:0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6189d1a45635..a88577132b20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8569,6 +8569,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	int tsk_cache_hot;
 
 	lockdep_assert_rq_held(env->src_rq);
+	if (p->sched_task_hot)
+		p->sched_task_hot = 0;
 
 	/*
 	 * We do not migrate tasks that are:
@@ -8641,10 +8643,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 	if (tsk_cache_hot <= 0 ||
 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-		if (tsk_cache_hot == 1) {
-			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
-			schedstat_inc(p->stats.nr_forced_migrations);
-		}
+		if (tsk_cache_hot == 1)
+			p->sched_task_hot = 1;
 		return 1;
 	}
 
@@ -8659,6 +8659,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_rq_held(env->src_rq);
 
+	if (p->sched_task_hot) {
+		p->sched_task_hot = 0;
+		schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+		schedstat_inc(p->stats.nr_forced_migrations);
+	}
+
 	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
 	set_task_cpu(p, env->dst_cpu);
 }


* Re: [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability
  2023-06-14 10:22 ` [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability Swapnil Sapkal
@ 2023-06-19  9:45   ` Peter Zijlstra
  2023-06-21  4:44     ` Swapnil Sapkal
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2023-06-19  9:45 UTC (permalink / raw)
  To: Swapnil Sapkal
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, iamjoonsoo.kim,
	linux-kernel, gautham.shenoy, kprateek.nayak, wyes.karny

On Wed, Jun 14, 2023 at 10:22:24AM +0000, Swapnil Sapkal wrote:
> migrate_degrades_locality() returns a tristate value indicating whether
> the migration will improve locality, degrade locality, or have no
> impact. Handle these return values with an enum to improve the
> readability.

I can see how you ended up there, that tristate is weird, but perhaps
don't make it more complicated than it should be?

---
 kernel/sched/fair.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..a8449f594348 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8446,42 +8446,42 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns -1, if task migration improves locality i.e migration preferred.
  */
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	unsigned long src_weight, dst_weight;
 	int src_nid, dst_nid, dist;
 
 	if (!static_branch_likely(&sched_numa_balancing))
-		return -1;
+		return 0;
 
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
-		return -1;
+		return 0;
 
 	src_nid = cpu_to_node(env->src_cpu);
 	dst_nid = cpu_to_node(env->dst_cpu);
 
 	if (src_nid == dst_nid)
-		return -1;
+		return 0;
 
 	/* Migrating away from the preferred node is always bad. */
 	if (src_nid == p->numa_preferred_nid) {
 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
 			return 1;
 		else
-			return -1;
+			return 0;
 	}
 
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
-		return 0;
+		return -1;
 
 	/* Leaving a core idle is often worse than degrading locality. */
 	if (env->idle == CPU_IDLE)
-		return -1;
+		return 0;
 
 	dist = node_distance(src_nid, dst_nid);
 	if (numa_group) {
@@ -8492,14 +8492,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 		dst_weight = task_weight(p, dst_nid, dist);
 	}
 
-	return dst_weight < src_weight;
+	return src_weight - dst_weight;
 }
 
 #else
-static inline int migrate_degrades_locality(struct task_struct *p,
+static inline long migrate_degrades_locality(struct task_struct *p,
 					     struct lb_env *env)
 {
-	return -1;
+	return 0;
 }
 #endif
 
@@ -8509,7 +8509,7 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
-	int tsk_cache_hot;
+	long degrades, hot;
 
 	lockdep_assert_rq_held(env->src_rq);
 
@@ -8578,13 +8578,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (env->flags & LBF_ACTIVE_LB)
 		return 1;
 
-	tsk_cache_hot = migrate_degrades_locality(p, env);
-	if (tsk_cache_hot == -1)
-		tsk_cache_hot = task_hot(p, env);
+	degrades = migrate_degrades_locality(p, env);
+	if (!degrades)
+		hot = task_hot(p, env);
+	else
+		hot = degrades > 0;
 
-	if (tsk_cache_hot <= 0 ||
-	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-		if (tsk_cache_hot == 1) {
+	if (env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+		if (hot) {
 			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
 			schedstat_inc(p->stats.nr_forced_migrations);
 		}


* Re: [PATCH 1/2] sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat
  2023-06-19  9:22   ` Peter Zijlstra
@ 2023-06-21  4:38     ` Swapnil Sapkal
  0 siblings, 0 replies; 7+ messages in thread
From: Swapnil Sapkal @ 2023-06-21  4:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, iamjoonsoo.kim,
	linux-kernel, gautham.shenoy, kprateek.nayak, wyes.karny

Hello Peter,

On 6/19/2023 2:52 PM, Peter Zijlstra wrote:
> On Wed, Jun 14, 2023 at 10:22:23AM +0000, Swapnil Sapkal wrote:
>> In /proc/schedstat, lb_hot_gained reports the number of hot tasks pulled
>> during load balance. This value is incremented in can_migrate_task()
>> if the task is migratable and hot. After incrementing the value, the
>> load balancer can still decide not to migrate this task, leading to wrong
>> accounting. Fix this by incrementing the stat when hot tasks are detached.
>> This issue only exists in detach_tasks(), where we can decide not to
>> migrate a hot task even if it is migratable. In detach_one_task(), however,
>> we migrate it unconditionally.
>>
>> Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead")
>> Reported-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
>> Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
>> ---
>>   kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++----------------
>>   1 file changed, 30 insertions(+), 17 deletions(-)
> 
> All this for just a number hardly anybody looks at :-(
> 
> Does this also work?

Thank you for this patch, which looks much simpler. This will work. In
addition, we need to handle the following case: when the task is hot and
migratable but detach_tasks() decides not to migrate it, we should
increment nr_failed_migrations_hot. I am appending this hunk at the end
of this mail.

> 
> Please double check the order of the task_struct::sched_bitfield thing,
> I've not had much wake-up juice
> 
> ---
>   include/linux/sched.h |  1 +
>   kernel/sched/fair.c   | 14 ++++++++++----
>   2 files changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1292d38d66cc..eba0a78ac2a9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -887,6 +887,7 @@ struct task_struct {
>   	unsigned			sched_reset_on_fork:1;
>   	unsigned			sched_contributes_to_load:1;
>   	unsigned			sched_migrated:1;
> +	unsigned			sched_task_hot:1;
>   
>   	/* Force alignment to the next boundary: */
>   	unsigned			:0;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6189d1a45635..a88577132b20 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8569,6 +8569,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>   	int tsk_cache_hot;
>   
>   	lockdep_assert_rq_held(env->src_rq);
> +	if (p->sched_task_hot)
> +		p->sched_task_hot = 0;
>   
>   	/*
>   	 * We do not migrate tasks that are:
> @@ -8641,10 +8643,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>   
>   	if (tsk_cache_hot <= 0 ||
>   	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
> -		if (tsk_cache_hot == 1) {
> -			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
> -			schedstat_inc(p->stats.nr_forced_migrations);
> -		}
> +		if (tsk_cache_hot == 1)
> +			p->sched_task_hot = 1;
>   		return 1;
>   	}
>   
> @@ -8659,6 +8659,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>   {
>   	lockdep_assert_rq_held(env->src_rq);
>   
> +	if (p->sched_task_hot) {
> +		p->sched_task_hot = 0;
> +		schedstat_inc(env->sd->lb_hot_gained[env->idle]);
> +		schedstat_inc(p->stats.nr_forced_migrations);
> +	}
> +
>   	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
>   	set_task_cpu(p, env->dst_cpu);
>   }
---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b6a738514047..eb836629560f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8772,6 +8772,9 @@ static int detach_tasks(struct lb_env *env)

 		continue;
 next:
+		if (p->sched_task_hot)
+			schedstat_inc(p->stats.nr_failed_migrations_hot);
+
 		list_move(&p->se.group_node, tasks);
 	}


* Re: [PATCH 2/2] sched/fair: Cleanup in migrate_degrades_locality() to improve readability
  2023-06-19  9:45   ` Peter Zijlstra
@ 2023-06-21  4:44     ` Swapnil Sapkal
  0 siblings, 0 replies; 7+ messages in thread
From: Swapnil Sapkal @ 2023-06-21  4:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, iamjoonsoo.kim,
	linux-kernel, gautham.shenoy, kprateek.nayak, wyes.karny

Hello Peter,

On 6/19/2023 3:15 PM, Peter Zijlstra wrote:
> On Wed, Jun 14, 2023 at 10:22:24AM +0000, Swapnil Sapkal wrote:
>> migrate_degrades_locality() returns a tristate value indicating whether
>> the migration will improve locality, degrade locality, or have no
>> impact. Handle these return values with an enum to improve the
>> readability.
> 
> I can see how you ended up there, that tristate is weird, but perhaps
> don't make it more complicated than it should be?
> 
> ---
>   kernel/sched/fair.c | 39 ++++++++++++++++++++-------------------
>   1 file changed, 20 insertions(+), 19 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 373ff5f55884..a8449f594348 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8446,42 +8446,42 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   #ifdef CONFIG_NUMA_BALANCING
>   /*
>    * Returns 1, if task migration degrades locality
> - * Returns 0, if task migration improves locality i.e migration preferred.
> - * Returns -1, if task migration is not affected by locality.
> + * Returns 0, if task migration is not affected by locality.
> + * Returns -1, if task migration improves locality i.e migration preferred.
>    */
Because of the following hunk:

> @@ -8492,14 +8492,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   		dst_weight = task_weight(p, dst_nid, dist);
>   	}
>   
> -	return dst_weight < src_weight;
> +	return src_weight - dst_weight;
>   }
>   

I suppose we should also change the comment to:
   /*
    * Returns a positive value, if task migration degrades locality
    * Returns 0, if task migration is not affected by locality.
    * Returns a negative value, if task migration improves locality i.e migration preferred.
    */

Do I need to resend v2 with your changes for this patchset?

> -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   {
>   	struct numa_group *numa_group = rcu_dereference(p->numa_group);
>   	unsigned long src_weight, dst_weight;
>   	int src_nid, dst_nid, dist;
>   
>   	if (!static_branch_likely(&sched_numa_balancing))
> -		return -1;
> +		return 0;
>   
>   	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
> -		return -1;
> +		return 0;
>   
>   	src_nid = cpu_to_node(env->src_cpu);
>   	dst_nid = cpu_to_node(env->dst_cpu);
>   
>   	if (src_nid == dst_nid)
> -		return -1;
> +		return 0;
>   
>   	/* Migrating away from the preferred node is always bad. */
>   	if (src_nid == p->numa_preferred_nid) {
>   		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
>   			return 1;
>   		else
> -			return -1;
> +			return 0;
>   	}
>   
>   	/* Encourage migration to the preferred node. */
>   	if (dst_nid == p->numa_preferred_nid)
> -		return 0;
> +		return -1;
>   
>   	/* Leaving a core idle is often worse than degrading locality. */
>   	if (env->idle == CPU_IDLE)
> -		return -1;
> +		return 0;
>   
>   	dist = node_distance(src_nid, dst_nid);
>   	if (numa_group) {
> @@ -8492,14 +8492,14 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>   		dst_weight = task_weight(p, dst_nid, dist);
>   	}
>   
> -	return dst_weight < src_weight;
> +	return src_weight - dst_weight;
>   }
--
Thanks and regards,
Swapnil


