linux-kernel.vger.kernel.org archive mirror
* [PATCH] v2 sched: Fix tg->load when offlining a CPU
@ 2023-12-23 11:15 Vincent Guittot
  2023-12-29 12:29 ` [tip: sched/urgent] sched/fair: " tip-bot2 for Vincent Guittot
  2023-12-30 17:56 ` [PATCH] v2 sched: " Dietmar Eggemann
  0 siblings, 2 replies; 3+ messages in thread
From: Vincent Guittot @ 2023-12-23 11:15 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vschneid, imran.f.khan, aaron.lu, linux-kernel
  Cc: Vincent Guittot

When a CPU is taken offline, the contribution of its cfs_rqs to
task_groups' load may remain and negatively impact the calculation of
the share of the online CPUs.

Clear the contribution of an offlining CPU to task groups' load and skip
its contribution while it is inactive.

Reported-by: Imran Khan <imran.f.khan@oracle.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
---

- Fix !CONFIG_FAIR_GROUP_SCHED case

 kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..4b09237d24b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4100,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/* rq has been offline and doesn't contribute anymore to the share */
+	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+		return;
+
 	/*
 	 * For migration heavy workloads, access to tg->load_avg can be
 	 * unbound. Limit the update rate to at most once per ms.
@@ -4116,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+	long delta;
+	u64 now;
+
+	/*
+	 * No need to update load_avg for root_task_group as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* cpu offline callback */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+	struct task_group *tg;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The rq clock has already been updated in the
+	 * set_rq_offline(), so we should skip updating
+	 * the rq clock again in unthrottle_cfs_rq().
+	 */
+	rq_clock_start_loop_update(rq);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		clear_tg_load_avg(cfs_rq);
+	}
+	rcu_read_unlock();
+
+	rq_clock_stop_loop_update(rq);
+}
+
 /*
  * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4412,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
@@ -12446,6 +12495,9 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
+
+	/* Ensure that we remove rq contribution to group share */
+	clear_tg_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
-- 
2.34.1



* [tip: sched/urgent] sched/fair: Fix tg->load when offlining a CPU
  2023-12-23 11:15 [PATCH] v2 sched: Fix tg->load when offlining a CPU Vincent Guittot
@ 2023-12-29 12:29 ` tip-bot2 for Vincent Guittot
  2023-12-30 17:56 ` [PATCH] v2 sched: " Dietmar Eggemann
  1 sibling, 0 replies; 3+ messages in thread
From: tip-bot2 for Vincent Guittot @ 2023-12-29 12:29 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Imran Khan, Aaron Lu, Vincent Guittot, Ingo Molnar,
	Peter Zijlstra, Borislav Petkov, x86, linux-kernel

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     f60a631ab9ed5df15e446269ea515f2b8948ba0c
Gitweb:        https://git.kernel.org/tip/f60a631ab9ed5df15e446269ea515f2b8948ba0c
Author:        Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate:    Thu, 21 Dec 2023 17:40:14 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 29 Dec 2023 13:22:03 +01:00

sched/fair: Fix tg->load when offlining a CPU

When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
load may remain and will negatively impact the calculation of the share of
the online CPUs.

To fix this bug, clear the contribution of an offlining CPU to task groups'
load and skip its contribution while it is inactive.

Here's the reproducer of the anomaly, by Imran Khan:

	"So far I have encountered only one rather lengthy way of reproducing this issue,
	which is as follows:

	1. Take a KVM guest (booted with 4 CPUs, which can be scaled up to 124 CPUs) and
	   create 2 custom cgroups: /sys/fs/cgroup/cpu/test_group_1 and
	   /sys/fs/cgroup/cpu/test_group_2

	2. Assign a CPU intensive workload to each of these cgroups and start the
	   workload.

	For my tests I am using the following app:

	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char *argv[])
	{
		unsigned long count, i, val;
		if (argc != 2) {
		      printf("usage: ./a.out <number of random nums to generate> \n");
		      return 0;
		}

		count = strtoul(argv[1], NULL, 10);

		printf("Generating %lu random numbers \n", count);
		for (i = 0; i < count; i++) {
			val = rand();
			val = val % 2;
			//usleep(1);
		}
		printf("Generated %lu random numbers \n", count);
		return 0;
	}

	Also, since the system is booted with 4 CPUs, in order to completely load the
	system I am also launching 4 instances of the same test app under:

	   /sys/fs/cgroup/cpu/

	3. We can see that both of the cgroups get similar CPU time:

        # systemd-cgtop --depth 1
	Path                                 Tasks    %CPU  Memory  Input/s    Output/s
	/                                      659      -     5.5G        -        -
	/system.slice                            -      -     5.7G        -        -
	/test_group_1                            4      -        -        -        -
	/test_group_2                            3      -        -        -        -
	/user.slice                             31      -    56.5M        -        -

	Path                                 Tasks   %CPU   Memory  Input/s    Output/s
	/                                      659  394.6     5.5G        -        -
	/test_group_2                            3   65.7        -        -        -
	/user.slice                             29   55.1    48.0M        -        -
	/test_group_1                            4   47.3        -        -        -
	/system.slice                            -    2.2     5.7G        -        -

	Path                                 Tasks  %CPU    Memory  Input/s    Output/s
	/                                      659  394.8     5.5G        -        -
	/test_group_1                            4   62.9        -        -        -
	/user.slice                             28   44.9    54.2M        -        -
	/test_group_2                            3   44.7        -        -        -
	/system.slice                            -    0.9     5.7G        -        -

	Path                                 Tasks  %CPU    Memory  Input/s     Output/s
	/                                      659  394.4     5.5G        -        -
	/test_group_2                            3   58.8        -        -        -
	/test_group_1                            4   51.9        -        -        -
	/user.slice                              30   39.3    59.6M        -        -
	/system.slice                            -    1.9     5.7G        -        -

	Path                                 Tasks  %CPU     Memory  Input/s    Output/s
	/                                      659  394.7     5.5G        -        -
	/test_group_1                            4   60.9        -        -        -
	/test_group_2                            3   57.9        -        -        -
	/user.slice                             28   43.5    36.9M        -        -
	/system.slice                            -    3.0     5.7G        -        -

	Path                                 Tasks  %CPU     Memory  Input/s     Output/s
	/                                      659  395.0     5.5G        -        -
	/test_group_1                            4   66.8        -        -        -
	/test_group_2                            3   56.3        -        -        -
	/user.slice                             29   43.1    51.8M        -        -
	/system.slice                            -    0.7     5.7G        -        -

	4. Now move systemd-udevd to one of these test groups, say test_group_1, and
	   perform scale up to 124 CPUs followed by scale down back to 4 CPUs from the
	   host side.

	5. Run the same workload, i.e. 4 instances of the CPU hogger under /sys/fs/cgroup/cpu
	   and one instance of the CPU hogger each in /sys/fs/cgroup/cpu/test_group_1 and
	   /sys/fs/cgroup/cpu/test_group_2.

	It can be seen that test_group_1 (the one where systemd-udevd was moved) is getting
	much less CPU time than test_group_2, even though at this point both of these
	groups only have the CPU hogger running:

        # systemd-cgtop --depth 1
	Path                                   Tasks   %CPU   Memory  Input/s   Output/s
	/                                      1219     -     5.4G        -        -
	/system.slice                           -       -     5.6G        -        -
	/test_group_1                           4       -        -        -        -
	/test_group_2                           3       -        -        -        -
	/user.slice                            26       -    91.3M        -        -

	Path                                   Tasks  %CPU     Memory  Input/s   Output/s
	/                                      1221  394.3     5.4G        -        -
	/test_group_2                             3   82.7        -        -        -
	/test_group_1                             4   14.3        -        -        -
	/system.slice                             -    0.8     5.6G        -        -
	/user.slice                              26    0.4    91.2M        -        -

	Path                                   Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  394.6     5.4G        -        -
	/test_group_2                             3   67.4        -        -        -
	/system.slice                             -   24.6     5.6G        -        -
	/test_group_1                             4   12.5        -        -        -
	/user.slice                              26    0.4    91.2M        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                     1221  395.2     5.4G        -        -
	/test_group_2                            3   60.9        -        -        -
	/system.slice                            -   27.9     5.6G        -        -
	/test_group_1                            4   12.2        -        -        -
	/user.slice                             26    0.4    91.2M        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                     1221  395.2     5.4G        -        -
	/test_group_2                            3   69.4        -        -        -
	/test_group_1                            4   13.9        -        -        -
	/user.slice                             28    1.6    92.0M        -        -
	/system.slice                            -    1.0     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.6     5.4G        -        -
	/test_group_2                             3   59.3        -        -        -
	/test_group_1                             4   14.1        -        -        -
	/user.slice                              28    1.3    92.2M        -        -
	/system.slice                             -    0.7     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.5     5.4G        -        -
	/test_group_2                            3   67.2        -        -        -
	/test_group_1                            4   11.5        -        -        -
	/user.slice                             28    1.3    92.5M        -        -
	/system.slice                            -    0.6     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.1     5.4G        -        -
	/test_group_2                             3   76.8        -        -        -
	/test_group_1                             4   12.9        -        -        -
	/user.slice                              28    1.3    92.8M        -        -
	/system.slice                             -    1.2     5.6G        -        -

	From the sched_debug data it can be seen that in the bad case the load.weight of
	the per-CPU sched entities corresponding to test_group_1 has reduced significantly,
	and the load_avg of test_group_1 remains much higher than that of test_group_2,
	even though systemd-udevd stopped running long ago and at this point both cgroups
	just have the CPU hogger app as their running entity."

[ mingo: Added details from the original discussion, plus minor edits to the patch. ]

Reported-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Imran Khan <imran.f.khan@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lore.kernel.org/r/20231223111545.62135-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63..43c1216 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4096,6 +4096,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/* rq has been offline and doesn't contribute to the share anymore: */
+	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+		return;
+
 	/*
 	 * For migration heavy workloads, access to tg->load_avg can be
 	 * unbound. Limit the update rate to at most once per ms.
@@ -4112,6 +4116,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+	long delta;
+	u64 now;
+
+	/*
+	 * No need to update load_avg for root_task_group, as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+	struct task_group *tg;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The rq clock has already been updated in
+	 * set_rq_offline(), so we should skip updating
+	 * the rq clock again in unthrottle_cfs_rq().
+	 */
+	rq_clock_start_loop_update(rq);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		clear_tg_load_avg(cfs_rq);
+	}
+	rcu_read_unlock();
+
+	rq_clock_stop_loop_update(rq);
+}
+
 /*
  * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4408,6 +4455,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
@@ -12413,6 +12462,9 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
+
+	/* Ensure that we remove rq contribution to group share: */
+	clear_tg_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */


* Re: [PATCH] v2 sched: Fix tg->load when offlining a CPU
  2023-12-23 11:15 [PATCH] v2 sched: Fix tg->load when offlining a CPU Vincent Guittot
  2023-12-29 12:29 ` [tip: sched/urgent] sched/fair: " tip-bot2 for Vincent Guittot
@ 2023-12-30 17:56 ` Dietmar Eggemann
  1 sibling, 0 replies; 3+ messages in thread
From: Dietmar Eggemann @ 2023-12-30 17:56 UTC (permalink / raw)
  To: Vincent Guittot, mingo, peterz, juri.lelli, rostedt, bsegall,
	mgorman, bristot, vschneid, imran.f.khan, aaron.lu, linux-kernel

On 23/12/2023 12:15, Vincent Guittot wrote:
> When a CPU is taken offline, the contribution of its cfs_rqs to
> task_groups' load may remain and negatively impact the calculation of
> the share of the online CPUs.
>
> Clear the contribution of an offlining CPU to task groups' load and skip
> its contribution while it is inactive.

The patch got applied to tip/sched/core a couple of hours ago, so this
might be too late ...

Isn't this fix in rq_offline_fair() also affecting all the other CPUs
via cpuset_hotplug_workfn() -> rebuild_sched_domains_locked() ->
partition_sched_domains_locked() -> cpu_attach_domain() ->
rq_attach_root() -> set_rq_offline() ?

> 
> Reported-by: Imran Khan <imran.f.khan@oracle.com>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
> ---
> 
> - Fix !CONFIG_FAIR_GROUP_SCHED case
> 
>  kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bcea3d55d95d..4b09237d24b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4100,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  	if (cfs_rq->tg == &root_task_group)
>  		return;
>  
> +	/* rq has been offline and doesn't contribute anymore to the share */
> +	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
> +		return;
> +
>  	/*
>  	 * For migration heavy workloads, access to tg->load_avg can be
>  	 * unbound. Limit the update rate to at most once per ms.
> @@ -4116,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  	}
>  }
>  
> +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
> +{
> +	long delta;
> +	u64 now;
> +
> +	/*
> +	 * No need to update load_avg for root_task_group as it is not used.
> +	 */
> +	if (cfs_rq->tg == &root_task_group)
> +		return;
> +
> +	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
> +	delta = 0 - cfs_rq->tg_load_avg_contrib;
> +	atomic_long_add(delta, &cfs_rq->tg->load_avg);

Why not:

atomic_long_sub(cfs_rq->tg_load_avg_contrib, &cfs_rq->tg->load_avg) w/o
the local `long delta`?
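
Something like this minimal sketch, untested and only meant to illustrate the
suggestion (field and helper names taken from the hunk above):

	static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
	{
		/* root_task_group's load_avg is not used, nothing to clear. */
		if (cfs_rq->tg == &root_task_group)
			return;

		/* Remove this cfs_rq's remaining contribution in one atomic op. */
		atomic_long_sub(cfs_rq->tg_load_avg_contrib, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = 0;
		cfs_rq->last_update_tg_load_avg = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
	}

That would keep the clearing logic identical while dropping the intermediate
variable.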

> +	cfs_rq->tg_load_avg_contrib = 0;
> +	cfs_rq->last_update_tg_load_avg = now;
> +}
> +
> +/* cpu offline callback */
> +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
> +{
> +	struct task_group *tg;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * The rq clock has already been updated in the
> +	 * set_rq_offline(), so we should skip updating
> +	 * the rq clock again in unthrottle_cfs_rq().

This comment seems misleading here, since it looks like it is tailored
to unthrottle_offline_cfs_rqs(), the other CPU offline callback.

> +	 */
> +	rq_clock_start_loop_update(rq);
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(tg, &task_groups, list) {
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> +		clear_tg_load_avg(cfs_rq);
> +	}
> +	rcu_read_unlock();
> +
> +	rq_clock_stop_loop_update(rq);
> +}
> +
>  /*
>   * Called within set_task_rq() right before setting a task's CPU. The
>   * caller only guarantees p->pi_lock is held; no other assumptions,
> @@ -4412,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
>  
>  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
>  
> +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
> +
>  static inline int propagate_entity_load_avg(struct sched_entity *se)
>  {
>  	return 0;
> @@ -12446,6 +12495,9 @@ static void rq_offline_fair(struct rq *rq)
>  
>  	/* Ensure any throttled groups are reachable by pick_next_task */
>  	unthrottle_offline_cfs_rqs(rq);
> +
> +	/* Ensure that we remove rq contribution to group share */
> +	clear_tg_offline_cfs_rqs(rq);
>  }
>  
>  #endif /* CONFIG_SMP */



