linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
@ 2022-08-10  1:56 zhangsong
  2022-08-10  4:02 ` Abel Wu
  0 siblings, 1 reply; 13+ messages in thread
From: zhangsong @ 2022-08-10  1:56 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, zhangsong, kernel test robot

When NORMAL and IDLE tasks are co-located and CFS triggers load balance,
it is reasonable to prefer migrating NORMAL (latency-sensitive) tasks from
the busy src CPU to the dst CPU first, and to migrate IDLE tasks last.

This is very important for reducing interference from IDLE tasks.
So the CFS load balance can be optimized as below:

1. The `cfs_tasks` list of a CPU rq holds NORMAL tasks.
2. A new `cfs_idle_tasks` list of a CPU rq holds IDLE tasks.
3. Prefer to migrate NORMAL tasks from cfs_tasks to the dst CPU.
4. Migrate IDLE tasks from cfs_idle_tasks to the dst CPU last.

This was tested with the following reproduction:
- a small number of NORMAL tasks colocated with a large number of IDLE tasks

With this patch, the latency of NORMAL tasks can be reduced by
about 5~10% compared with the current behavior.

Signed-off-by: zhangsong <zhangsong34@huawei.com>
Reported-by: kernel test robot <lkp@intel.com>
---
V1->V2:
- fix build test error
---
 kernel/sched/core.c  |  1 +
 kernel/sched/fair.c  | 45 ++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h |  1 +
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..7325c6e552d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9733,6 +9733,7 @@ void __init sched_init(void)
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
+		INIT_LIST_HEAD(&rq->cfs_idle_tasks);
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..b62bec5b1eb9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
 
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SMP
+static void
+adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
+	struct rq *rq,
+	struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
+		(*list_op)(&se->group_node, &rq->cfs_idle_tasks);
+	else
+		(*list_op)(&se->group_node, &rq->cfs_tasks);
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		struct rq *rq = rq_of(cfs_rq);
 
 		account_numa_enqueue(rq, task_of(se));
-		list_add(&se->group_node, &rq->cfs_tasks);
+		adjust_rq_cfs_tasks(list_add, rq, se);
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -7465,7 +7480,7 @@ done: __maybe_unused;
 	 * the list, so our cfs_tasks list becomes MRU
 	 * one.
 	 */
-	list_move(&p->se.group_node, &rq->cfs_tasks);
+	adjust_rq_cfs_tasks(list_move, rq, &p->se);
 #endif
 
 	if (hrtick_enabled_fair(rq))
@@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 	if (unlikely(task_has_idle_policy(p)))
 		return 0;
 
+	if (tg_is_idle(cfs_rq_of(&p->se)->tg))
+		return 0;
+
 	/* SMT siblings share cache */
 	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
 		return 0;
@@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 			 &p->se == cfs_rq_of(&p->se)->last))
 		return 1;
 
+	/* Preempt sched idle cpu do not consider migration cost */
+	if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
+	    sched_idle_cpu(env->dst_cpu))
+		return 0;
+
 	if (sysctl_sched_migration_cost == -1)
 		return 1;
 
@@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	int loop = 0;
 
 	lockdep_assert_rq_held(env->src_rq);
 
+again:
 	list_for_each_entry_reverse(p,
-			&env->src_rq->cfs_tasks, se.group_node) {
+			tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
@@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 		schedstat_inc(env->sd->lb_gained[env->idle]);
 		return p;
 	}
+	if (++loop == 1) {
+		tasks = &env->src_rq->cfs_idle_tasks;
+		goto again;
+	}
 	return NULL;
 }
 
@@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
 	unsigned long util, load;
 	struct task_struct *p;
 	int detached = 0;
+	int loop = 0;
 
 	lockdep_assert_rq_held(env->src_rq);
 
@@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
 	if (env->imbalance <= 0)
 		return 0;
 
+again:
 	while (!list_empty(tasks)) {
 		/*
 		 * We don't want to steal all, otherwise we may be treated likewise,
@@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
 		list_move(&p->se.group_node, tasks);
 	}
 
+	if (env->imbalance > 0 && ++loop == 1) {
+		tasks = &env->src_rq->cfs_idle_tasks;
+		goto again;
+	}
+
 	/*
 	 * Right now, this is one of only two places we collect this stat
 	 * so we can safely collect detach_one_task() stats here rather
@@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 		 * Move the next running task to the front of the list, so our
 		 * cfs_tasks list becomes MRU one.
 		 */
-		list_move(&se->group_node, &rq->cfs_tasks);
+		adjust_rq_cfs_tasks(list_move, rq, se);
 	}
 #endif
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..accb4eea9769 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1068,6 +1068,7 @@ struct rq {
 	int			online;
 
 	struct list_head cfs_tasks;
+	struct list_head cfs_idle_tasks;
 
 	struct sched_avg	avg_rt;
 	struct sched_avg	avg_dl;
-- 
2.27.0



* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-10  1:56 [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks zhangsong
@ 2022-08-10  4:02 ` Abel Wu
  2022-08-10  7:34   ` zhangsong (J)
  0 siblings, 1 reply; 13+ messages in thread
From: Abel Wu @ 2022-08-10  4:02 UTC (permalink / raw)
  To: zhangsong, mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kernel test robot

Hi Zhang Song,

On 8/10/22 9:56 AM, zhangsong Wrote:
> For co-location with NORMAL and IDLE tasks, when CFS trigger load balance,
> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks from
> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.

Considering the large weight difference between normal and idle tasks,
does the re-ordering really change things? It would be helpful if you
can offer more detailed info.
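
(For a rough sense of scale, assuming the default weights: a nice-0 task
weighs 1024 while a SCHED_IDLE task weighs 3, so 10 normal tasks plus
1000 idle tasks sharing one CPU still leave about 10*1024 / (10*1024 +
1000*3) ~= 77% of that CPU to the normal tasks once they are runnable
there.)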

> 
> This is very important for reducing interference from IDLE tasks.
> So the CFS load balance can be optimized to below:
> 
> 1.`cfs_tasks` list of CPU rq is owned by NORMAL tasks.
> 2.`cfs_idle_tasks` list of CPU rq which is owned by IDLE tasks.
> 3.Prefer to migrate NORMAL tasks of cfs_tasks to dst CPU.
> 4.Lastly migrate IDLE tasks of cfs_idle_tasks to dst CPU.
> 
> This was tested with the following reproduction:
> - small number of NORMAL tasks colocated with a large number of IDLE tasks
> 
> With this patch, NORMAL tasks latency can be reduced
> about 5~10% compared with current.
> 
> Signed-off-by: zhangsong <zhangsong34@huawei.com>
> Reported-by: kernel test robot <lkp@intel.com>

The Reported-by tag is usually used for reporting a bug in the mainline
kernel, and build error of your patch is not one of them :)

> ---
> V1->V2:
> - fix build test error
> ---
>   kernel/sched/core.c  |  1 +
>   kernel/sched/fair.c  | 45 ++++++++++++++++++++++++++++++++++++++++----
>   kernel/sched/sched.h |  1 +
>   3 files changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ee28253c9ac0..7325c6e552d8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9733,6 +9733,7 @@ void __init sched_init(void)
>   		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
>   
>   		INIT_LIST_HEAD(&rq->cfs_tasks);
> +		INIT_LIST_HEAD(&rq->cfs_idle_tasks);
>   
>   		rq_attach_root(rq, &def_root_domain);
>   #ifdef CONFIG_NO_HZ_COMMON
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 914096c5b1ae..b62bec5b1eb9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>   
>   #endif /* CONFIG_NUMA_BALANCING */
>   
> +#ifdef CONFIG_SMP
> +static void
> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
> +	struct rq *rq,
> +	struct sched_entity *se)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +	if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))

The tg_is_idle() doesn't have hierarchical judgement on parent task
groups, while rq->cfs{,_idle}_tasks is rq wide. Say A->B where tgA
is idle and tgB isn't, a task from B will be added to the non-idle
list, is this what you want?

> +		(*list_op)(&se->group_node, &rq->cfs_idle_tasks);
> +	else
> +		(*list_op)(&se->group_node, &rq->cfs_tasks);
> +}
> +#endif
> +
>   static void
>   account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   {
> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   		struct rq *rq = rq_of(cfs_rq);
>   
>   		account_numa_enqueue(rq, task_of(se));
> -		list_add(&se->group_node, &rq->cfs_tasks);
> +		adjust_rq_cfs_tasks(list_add, rq, se);
>   	}
>   #endif
>   	cfs_rq->nr_running++;
> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
>   	 * the list, so our cfs_tasks list becomes MRU
>   	 * one.
>   	 */
> -	list_move(&p->se.group_node, &rq->cfs_tasks);
> +	adjust_rq_cfs_tasks(list_move, rq, &p->se);
>   #endif
>   
>   	if (hrtick_enabled_fair(rq))
> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   	if (unlikely(task_has_idle_policy(p)))
>   		return 0;
>   
> +	if (tg_is_idle(cfs_rq_of(&p->se)->tg))
> +		return 0;
> +

Same as above. But I am not sure this is the right way to do it. We
still want to maintain policy behavior inside an idle task group.

>   	/* SMT siblings share cache */
>   	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
>   		return 0;
> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   			 &p->se == cfs_rq_of(&p->se)->last))
>   		return 1;
>   
> +	/* Preempt sched idle cpu do not consider migration cost */
> +	if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
> +	    sched_idle_cpu(env->dst_cpu))
> +		return 0;
> +
>   	if (sysctl_sched_migration_cost == -1)
>   		return 1;
>   
> @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>   static struct task_struct *detach_one_task(struct lb_env *env)
>   {
>   	struct task_struct *p;
> +	struct list_head *tasks = &env->src_rq->cfs_tasks;
> +	int loop = 0;

Maybe a boolean variable is enough (and more readable)?

Thanks,
Abel

>   
>   	lockdep_assert_rq_held(env->src_rq);
>   
> +again:
>   	list_for_each_entry_reverse(p,
> -			&env->src_rq->cfs_tasks, se.group_node) {
> +			tasks, se.group_node) {
>   		if (!can_migrate_task(p, env))
>   			continue;
>   
> @@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
>   		schedstat_inc(env->sd->lb_gained[env->idle]);
>   		return p;
>   	}
> +	if (++loop == 1) {
> +		tasks = &env->src_rq->cfs_idle_tasks;
> +		goto again;
> +	}
>   	return NULL;
>   }
>   
> @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
>   	unsigned long util, load;
>   	struct task_struct *p;
>   	int detached = 0;
> +	int loop = 0;
>   
>   	lockdep_assert_rq_held(env->src_rq);
>   
> @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
>   	if (env->imbalance <= 0)
>   		return 0;
>   
> +again:
>   	while (!list_empty(tasks)) {
>   		/*
>   		 * We don't want to steal all, otherwise we may be treated likewise,
> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
>   		list_move(&p->se.group_node, tasks);
>   	}
>   
> +	if (env->imbalance > 0 && ++loop == 1) {
> +		tasks = &env->src_rq->cfs_idle_tasks;
> +		goto again;
> +	}
> +
>   	/*
>   	 * Right now, this is one of only two places we collect this stat
>   	 * so we can safely collect detach_one_task() stats here rather
> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>   		 * Move the next running task to the front of the list, so our
>   		 * cfs_tasks list becomes MRU one.
>   		 */
> -		list_move(&se->group_node, &rq->cfs_tasks);
> +		adjust_rq_cfs_tasks(list_move, rq, se);
>   	}
>   #endif
>   
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e26688d387ae..accb4eea9769 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1068,6 +1068,7 @@ struct rq {
>   	int			online;
>   
>   	struct list_head cfs_tasks;
> +	struct list_head cfs_idle_tasks;
>   
>   	struct sched_avg	avg_rt;
>   	struct sched_avg	avg_dl;


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-10  4:02 ` Abel Wu
@ 2022-08-10  7:34   ` zhangsong (J)
  2022-08-15 11:03     ` Abel Wu
  0 siblings, 1 reply; 13+ messages in thread
From: zhangsong (J) @ 2022-08-10  7:34 UTC (permalink / raw)
  To: Abel Wu, mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kernel test robot

Thanks for your reply !

On 2022/8/10 12:02, Abel Wu wrote:
> Hi Zhang Song,
>
> On 8/10/22 9:56 AM, zhangsong Wrote:
>> For co-location with NORMAL and IDLE tasks, when CFS trigger load 
>> balance,
>> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks 
>> from
>> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.
>
> Considering the large weight difference between normal and idle tasks,
> does the re-ordering really change things? It would be helpful if you
> can offer more detailed info.
Please consider the situation where CPU A has several normal tasks and
hundreds of idle tasks while CPU B is idle, and CPU B needs to pull some
tasks from CPU A. The cfs_tasks list on CPU A is not ordered by priority,
and the max number of tasks that can be pulled depends on env->loop_max,
whose value is sysctl_sched_nr_migrate, i.e. 32.
We therefore cannot guarantee that CPU B pulls a certain number of normal
tasks instead of idle tasks from the runqueue of CPU A, so it is necessary
to split cfs_tasks into two different lists and ensure that tasks on the
non-idle list are migrated first (see the sketch below).
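
As a rough standalone illustration of that limit (plain userspace C, not
kernel code; the numbers and layout are made up to match the example
above):

#include <stdio.h>

#define NR_TASKS 1010
#define LOOP_MAX 32		/* models the sysctl_sched_nr_migrate default */

int main(void)
{
	int is_normal[NR_TASKS] = { 0 };
	int i, seen = 0;

	/* Scatter 10 normal tasks evenly among 1000 idle-policy tasks. */
	for (i = 0; i < 10; i++)
		is_normal[50 + i * 101] = 1;

	/* A detach_tasks()-like scan looks at no more than LOOP_MAX entries. */
	for (i = 0; i < LOOP_MAX; i++)
		seen += is_normal[i];

	/* With this layout the capped scan finds none of the normal tasks. */
	printf("normal tasks seen in a %d-entry scan: %d of 10\n",
	       LOOP_MAX, seen);
	return 0;
}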
>
>>
>> This is very important for reducing interference from IDLE tasks.
>> So the CFS load balance can be optimized to below:
>>
>> 1.`cfs_tasks` list of CPU rq is owned by NORMAL tasks.
>> 2.`cfs_idle_tasks` list of CPU rq which is owned by IDLE tasks.
>> 3.Prefer to migrate NORMAL tasks of cfs_tasks to dst CPU.
>> 4.Lastly migrate IDLE tasks of cfs_idle_tasks to dst CPU.
>>
>> This was tested with the following reproduction:
>> - small number of NORMAL tasks colocated with a large number of IDLE 
>> tasks
>>
>> With this patch, NORMAL tasks latency can be reduced
>> about 5~10% compared with current.
>>
>> Signed-off-by: zhangsong <zhangsong34@huawei.com>
>> Reported-by: kernel test robot <lkp@intel.com>
>
> The Reported-by tag is usually used for reporting a bug in the mainline
> kernel, and build error of your patch is not one of them :)
OK, I will remove this tag later.
>
>> ---
>> V1->V2:
>> - fix build test error
>> ---
>>   kernel/sched/core.c  |  1 +
>>   kernel/sched/fair.c  | 45 ++++++++++++++++++++++++++++++++++++++++----
>>   kernel/sched/sched.h |  1 +
>>   3 files changed, 43 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index ee28253c9ac0..7325c6e552d8 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -9733,6 +9733,7 @@ void __init sched_init(void)
>>           rq->max_idle_balance_cost = sysctl_sched_migration_cost;
>>             INIT_LIST_HEAD(&rq->cfs_tasks);
>> +        INIT_LIST_HEAD(&rq->cfs_idle_tasks);
>>             rq_attach_root(rq, &def_root_domain);
>>   #ifdef CONFIG_NO_HZ_COMMON
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 914096c5b1ae..b62bec5b1eb9 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct 
>> task_struct *p, int new_cpu)
>>     #endif /* CONFIG_NUMA_BALANCING */
>>   +#ifdef CONFIG_SMP
>> +static void
>> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct 
>> list_head *),
>> +    struct rq *rq,
>> +    struct sched_entity *se)
>> +{
>> +    struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> +
>> +    if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
>
> The tg_is_idle() doesn't have hierarchical judgement on parent task
> groups, while rq->cfs{,_idle}_tasks is rq wide. Say A->B where tgA
> is idle and tgB isn't, a task from B will be added to the non-idle
> list, is this what you want?
tg_is_idle() is used to check whether the task group of se is idle.
I think it is unlikely that the parent group is idle but the child group
is non-idle.
Even if that happens, the current load balance policy is not affected.
>
>> + (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
>> +    else
>> +        (*list_op)(&se->group_node, &rq->cfs_tasks);
>> +}
>> +#endif
>> +
>>   static void
>>   account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>   {
>> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, 
>> struct sched_entity *se)
>>           struct rq *rq = rq_of(cfs_rq);
>>             account_numa_enqueue(rq, task_of(se));
>> -        list_add(&se->group_node, &rq->cfs_tasks);
>> +        adjust_rq_cfs_tasks(list_add, rq, se);
>>       }
>>   #endif
>>       cfs_rq->nr_running++;
>> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
>>        * the list, so our cfs_tasks list becomes MRU
>>        * one.
>>        */
>> -    list_move(&p->se.group_node, &rq->cfs_tasks);
>> +    adjust_rq_cfs_tasks(list_move, rq, &p->se);
>>   #endif
>>         if (hrtick_enabled_fair(rq))
>> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, 
>> struct lb_env *env)
>>       if (unlikely(task_has_idle_policy(p)))
>>           return 0;
>>   +    if (tg_is_idle(cfs_rq_of(&p->se)->tg))
>> +        return 0;
>> +
>
> Same as above. But I am not sure this is the right way to do it. We
> still want to maintain policy behavior inside an idle task group.
Actually, we do not change the policy behavior of the idle task group.
We only want to ensure the migration priority when load balance is
pulling/pushing tasks between CPUs.
>
>>       /* SMT siblings share cache */
>>       if (env->sd->flags & SD_SHARE_CPUCAPACITY)
>>           return 0;
>> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, 
>> struct lb_env *env)
>>                &p->se == cfs_rq_of(&p->se)->last))
>>           return 1;
>>   +    /* Preempt sched idle cpu do not consider migration cost */
>> +    if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
>> +        sched_idle_cpu(env->dst_cpu))
>> +        return 0;
>> +
>>       if (sysctl_sched_migration_cost == -1)
>>           return 1;
>>   @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct 
>> *p, struct lb_env *env)
>>   static struct task_struct *detach_one_task(struct lb_env *env)
>>   {
>>       struct task_struct *p;
>> +    struct list_head *tasks = &env->src_rq->cfs_tasks;
>> +    int loop = 0;
>
> Maybe a boolean variable is enough (and more readable)?
>
> Thanks,
> Abel
OK, let me think more about how to define an appropriate variable for it.
>
>> lockdep_assert_rq_held(env->src_rq);
>>   +again:
>>       list_for_each_entry_reverse(p,
>> -            &env->src_rq->cfs_tasks, se.group_node) {
>> +            tasks, se.group_node) {
>>           if (!can_migrate_task(p, env))
>>               continue;
>>   @@ -8009,6 +8035,10 @@ static struct task_struct 
>> *detach_one_task(struct lb_env *env)
>>           schedstat_inc(env->sd->lb_gained[env->idle]);
>>           return p;
>>       }
>> +    if (++loop == 1) {
>> +        tasks = &env->src_rq->cfs_idle_tasks;
>> +        goto again;
>> +    }
>>       return NULL;
>>   }
>>   @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
>>       unsigned long util, load;
>>       struct task_struct *p;
>>       int detached = 0;
>> +    int loop = 0;
>>         lockdep_assert_rq_held(env->src_rq);
>>   @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
>>       if (env->imbalance <= 0)
>>           return 0;
>>   +again:
>>       while (!list_empty(tasks)) {
>>           /*
>>            * We don't want to steal all, otherwise we may be treated 
>> likewise,
>> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
>>           list_move(&p->se.group_node, tasks);
>>       }
>>   +    if (env->imbalance > 0 && ++loop == 1) {
>> +        tasks = &env->src_rq->cfs_idle_tasks;
>> +        goto again;
>> +    }
>> +
>>       /*
>>        * Right now, this is one of only two places we collect this stat
>>        * so we can safely collect detach_one_task() stats here rather
>> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, 
>> struct task_struct *p, bool first)
>>            * Move the next running task to the front of the list, so our
>>            * cfs_tasks list becomes MRU one.
>>            */
>> -        list_move(&se->group_node, &rq->cfs_tasks);
>> +        adjust_rq_cfs_tasks(list_move, rq, se);
>>       }
>>   #endif
>>   diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index e26688d387ae..accb4eea9769 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -1068,6 +1068,7 @@ struct rq {
>>       int            online;
>>         struct list_head cfs_tasks;
>> +    struct list_head cfs_idle_tasks;
>>         struct sched_avg    avg_rt;
>>       struct sched_avg    avg_dl;
> .


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-10  7:34   ` zhangsong (J)
@ 2022-08-15 11:03     ` Abel Wu
       [not found]       ` <6ae319c0-e6ed-4aad-64b8-d3f6cbea688d@huawei.com>
  0 siblings, 1 reply; 13+ messages in thread
From: Abel Wu @ 2022-08-15 11:03 UTC (permalink / raw)
  To: zhangsong (J), mingo, peterz, juri.lelli, vincent.guittot
  Cc: dietmar.eggemann, rostedt, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kernel test robot

On 8/10/22 3:34 PM, zhangsong (J) Wrote:
> Thanks for your reply !
> 
> On 2022/8/10 12:02, Abel Wu wrote:
>> Hi Zhang Song,
>>
>> On 8/10/22 9:56 AM, zhangsong Wrote:
>>> For co-location with NORMAL and IDLE tasks, when CFS trigger load 
>>> balance,
>>> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks 
>>> from
>>> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.
>>
>> Considering the large weight difference between normal and idle tasks,
>> does the re-ordering really change things? It would be helpful if you
>> can offer more detailed info.
> Please consider the situation that CPU A has several normal tasks and 
> hundreds of idle tasks
> while CPU B is idle, and CPU B needs to pull some tasks from CPU A, but 
> the cfs_tasks in CPU A
> are not in order of priority, and the max number of pulling tasks 
> depends on env->loop_max,
> which value is sysctl_sched_nr_migrate, i.e. 32.

The case you elaborated above is really rare; the only possibility I
can imagine is that all these tasks are affined to one single cpu and
the affinity constraint is suddenly removed. Otherwise, the load
balancing, including the wakeup cpu selection logic, will make things
right.

> We now cannot guarantee that CPU B can pull a certain number of normal 
> tasks instead of idle tasks
> from the waiting queue of CPU A, so It is necessary to divide cfs_tasks 
> into two different lists
> and ensure that tasks in none-idle list can be migrated first.

Yes, there is no guarantee. And I think your problem is valid (although
rare); would turning cfs_tasks into a weight-sorted list be helpful?
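
To illustrate the idea, here is a minimal standalone sketch (not kernel
code, all names made up) of keeping such a list ordered by descending load
weight so that a bounded scan from the head sees high-weight tasks first:

#include <stdio.h>

struct task {
	const char *name;
	int weight;		/* e.g. 1024 for nice-0, 3 for SCHED_IDLE */
	struct task *next;
};

/* Insert @t so that the list stays sorted by descending weight. */
static void enqueue_sorted(struct task **head, struct task *t)
{
	struct task **pos = head;

	while (*pos && (*pos)->weight >= t->weight)
		pos = &(*pos)->next;
	t->next = *pos;
	*pos = t;
}

int main(void)
{
	struct task idle1 = { "idle1", 3, NULL };
	struct task norm  = { "norm", 1024, NULL };
	struct task idle2 = { "idle2", 3, NULL };
	struct task *rq = NULL, *t;

	enqueue_sorted(&rq, &idle1);
	enqueue_sorted(&rq, &norm);
	enqueue_sorted(&rq, &idle2);

	/* A scan from the head now meets the normal task first. */
	for (t = rq; t; t = t->next)
		printf("%s (weight %d)\n", t->name, t->weight);
	return 0;
}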

>>
>>> ...
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct 
>>> task_struct *p, int new_cpu)
>>>     #endif /* CONFIG_NUMA_BALANCING */
>>>   +#ifdef CONFIG_SMP
>>> +static void
>>> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct 
>>> list_head *),
>>> +    struct rq *rq,
>>> +    struct sched_entity *se)
>>> +{
>>> +    struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>> +
>>> +    if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
>>
>> The tg_is_idle() doesn't have hierarchical judgement on parent task
>> groups, while rq->cfs{,_idle}_tasks is rq wide. Say A->B where tgA
>> is idle and tgB isn't, a task from B will be added to the non-idle
>> list, is this what you want?
> The tg_is_idle is used to check whether the task group of se is idle.
> I think it is unlikely that the parent group is idle but the child group 
> is none-idle.
> If it happens, the current load balance policy yet not be affected.

It's quite common that an idle group contains several non-idle groups,
and it is so designed. In this case with your patch, tgB's tasks will
be wrongly put to the rq->cfs_tasks list.

>>
>>> + (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
>>> +    else
>>> +        (*list_op)(&se->group_node, &rq->cfs_tasks);
>>> +}
>>> +#endif
>>> +
>>>   static void
>>>   account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>>   {
>>> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, 
>>> struct sched_entity *se)
>>>           struct rq *rq = rq_of(cfs_rq);
>>>             account_numa_enqueue(rq, task_of(se));
>>> -        list_add(&se->group_node, &rq->cfs_tasks);
>>> +        adjust_rq_cfs_tasks(list_add, rq, se);
>>>       }
>>>   #endif
>>>       cfs_rq->nr_running++;
>>> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
>>>        * the list, so our cfs_tasks list becomes MRU
>>>        * one.
>>>        */
>>> -    list_move(&p->se.group_node, &rq->cfs_tasks);
>>> +    adjust_rq_cfs_tasks(list_move, rq, &p->se);
>>>   #endif
>>>         if (hrtick_enabled_fair(rq))
>>> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, 
>>> struct lb_env *env)
>>>       if (unlikely(task_has_idle_policy(p)))
>>>           return 0;
>>>   +    if (tg_is_idle(cfs_rq_of(&p->se)->tg))
>>> +        return 0;
>>> +
>>
>> Same as above. But I am not sure this is the right way to do it. We
>> still want to maintain policy behavior inside an idle task group.
> Actually, we do not change the policy behavior of idle task group.
> We only want to ensure migration prioritily when load balance
> pulling/pushing tasks between CPUs.

But the tasks from idle tg could still be SCHED_NORMAL.

>>
>>>       /* SMT siblings share cache */
>>>       if (env->sd->flags & SD_SHARE_CPUCAPACITY)
>>>           return 0;
>>> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, 
>>> struct lb_env *env)
>>>                &p->se == cfs_rq_of(&p->se)->last))
>>>           return 1;
>>>   +    /* Preempt sched idle cpu do not consider migration cost */
>>> +    if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
>>> +        sched_idle_cpu(env->dst_cpu))
>>> +        return 0;
>>> +
>>>       if (sysctl_sched_migration_cost == -1)
>>>           return 1;
>>>   @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct 
>>> *p, struct lb_env *env)
>>>   static struct task_struct *detach_one_task(struct lb_env *env)
>>>   {
>>>       struct task_struct *p;
>>> +    struct list_head *tasks = &env->src_rq->cfs_tasks;
>>> +    int loop = 0;
>>
>> Maybe a boolean variable is enough (and more readable)?
>>
>> Thanks,
>> Abel
> OK, let me think more about how to define it an appropriate variable.
>>
>>> lockdep_assert_rq_held(env->src_rq);
>>>   +again:
>>>       list_for_each_entry_reverse(p,
>>> -            &env->src_rq->cfs_tasks, se.group_node) {
>>> +            tasks, se.group_node) {
>>>           if (!can_migrate_task(p, env))
>>>               continue;
>>>   @@ -8009,6 +8035,10 @@ static struct task_struct 
>>> *detach_one_task(struct lb_env *env)
>>>           schedstat_inc(env->sd->lb_gained[env->idle]);
>>>           return p;
>>>       }
>>> +    if (++loop == 1) {
>>> +        tasks = &env->src_rq->cfs_idle_tasks;
>>> +        goto again;
>>> +    }
>>>       return NULL;
>>>   }
>>>   @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
>>>       unsigned long util, load;
>>>       struct task_struct *p;
>>>       int detached = 0;
>>> +    int loop = 0;
>>>         lockdep_assert_rq_held(env->src_rq);
>>>   @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
>>>       if (env->imbalance <= 0)
>>>           return 0;
>>>   +again:
>>>       while (!list_empty(tasks)) {
>>>           /*
>>>            * We don't want to steal all, otherwise we may be treated 
>>> likewise,
>>> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
>>>           list_move(&p->se.group_node, tasks);
>>>       }
>>>   +    if (env->imbalance > 0 && ++loop == 1) {
>>> +        tasks = &env->src_rq->cfs_idle_tasks;
>>> +        goto again;
>>> +    }
>>> +
>>>       /*
>>>        * Right now, this is one of only two places we collect this stat
>>>        * so we can safely collect detach_one_task() stats here rather
>>> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, 
>>> struct task_struct *p, bool first)
>>>            * Move the next running task to the front of the list, so our
>>>            * cfs_tasks list becomes MRU one.
>>>            */
>>> -        list_move(&se->group_node, &rq->cfs_tasks);
>>> +        adjust_rq_cfs_tasks(list_move, rq, se);
>>>       }
>>>   #endif
>>>   diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index e26688d387ae..accb4eea9769 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -1068,6 +1068,7 @@ struct rq {
>>>       int            online;
>>>         struct list_head cfs_tasks;
>>> +    struct list_head cfs_idle_tasks;
>>>         struct sched_avg    avg_rt;
>>>       struct sched_avg    avg_dl;
>> .


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
       [not found]       ` <6ae319c0-e6ed-4aad-64b8-d3f6cbea688d@huawei.com>
@ 2022-08-17 12:58         ` Vincent Guittot
  2022-08-18  2:46           ` Abel Wu
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2022-08-17 12:58 UTC (permalink / raw)
  To: zhangsong (J)
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot

On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
>
>
> On 2022/8/15 19:03, Abel Wu wrote:
>
> On 8/10/22 3:34 PM, zhangsong (J) Wrote:
>
> Thanks for your reply !
>
> On 2022/8/10 12:02, Abel Wu wrote:
>
> Hi Zhang Song,
>
> On 8/10/22 9:56 AM, zhangsong Wrote:
>
> For co-location with NORMAL and IDLE tasks, when CFS trigger load balance,
> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks from
> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.
>
>
> Considering the large weight difference between normal and idle tasks,
> does the re-ordering really change things? It would be helpful if you
> can offer more detailed info.
>
> Please consider the situation that CPU A has several normal tasks and hundreds of idle tasks
> while CPU B is idle, and CPU B needs to pull some tasks from CPU A, but the cfs_tasks in CPU A
> are not in order of priority, and the max number of pulling tasks depends on env->loop_max,
> which value is sysctl_sched_nr_migrate, i.e. 32.
>
>
> The case you elaborated above is really rare, the only possibility I
> can imagine is that all these tasks are affined to one single cpu and
> suddenly remove the affinity constrain. Otherwise, the load balancing
> including wakeup cpu selection logic will make things right.
>
>
> Yes, this is usually a corner case, but suppose that some non-idle tasks
> are bound to CPU 1-2 and idle tasks are bound to CPU 0-1, so CPU 1 may
> have many idle tasks and some non-idle tasks while the idle tasks on
> CPU 1 cannot be pulled to CPU 2. When load balance triggers and CPU 2
> should pull some tasks from CPU 1, the bad result is that the idle tasks
> of CPU 1 cannot be migrated and the non-idle tasks also cannot be
> migrated because of the env->loop_max constraint.

env->loop_max adds a break but load_balance will continue with next
tasks so it also tries to pull your non idle task at the end after
several breaks.

>
> This will cause the non-idle tasks to not achieve more CPU utilization.

Your problem is not linked to IDLE vs NORMAL tasks but to the large
number of pinned tasks that can't migrate on CPU2. You can end with
the same behavior without using IDLE tasks but only NORMAL tasks.

>
> The testcase is below:
>
> 1) 1000 idle tasks bounds to CPU 0-1 and 10 non-idle tasks bounds to CPU 1-2
>
> 2) Get non-idle tasks performance metrics with perf tool
>
>
> Without this patch:
>
>           6,930.88 msec cpu-clock                 #    1.385 CPUs utilized
>             21,458      context-switches          #    0.003 M/sec
>                  1      cpu-migrations            #    0.000 K/sec
>                  0      page-faults               #    0.000 K/sec
>     13,761,631,912      cycles                    #    1.986 GHz
>      6,646,844,169      instructions              #    0.48  insn per cycle
>      2,188,053,035      branches                  #  315.696 M/sec
>          1,690,507      branch-misses             #    0.08% of all branches
>
>        5.002701004 seconds time elapsed
>
>
> With this patch:
>
>           8,103.47 msec cpu-clock                 #    1.620 CPUs utilized
>             25,358      context-switches          #    0.003 M/sec
>                318      cpu-migrations            #    0.039 K/sec
>                  0      page-faults               #    0.000 K/sec
>     15,707,367,741      cycles                    #    1.938 GHz
>      7,618,354,166      instructions              #    0.49  insn per cycle
>      2,506,951,511      branches                  #  309.367 M/sec
>          2,017,301      branch-misses             #    0.08% of all branches
>
>        5.002151474 seconds time elapsed
>
>
>
> From the above test result comparison, we can see that the CPU utilization of non-idle tasks can improve by 10%+.
>
>
>
> We now cannot guarantee that CPU B can pull a certain number of normal tasks instead of idle tasks
> from the waiting queue of CPU A, so It is necessary to divide cfs_tasks into two different lists
> and ensure that tasks in none-idle list can be migrated first.
>
>
> Yes, there is no guarantee. And I think your problem is valid (although
> rare), while turning cfs_tasks into weight-sorted might be helpful?
>
>
> I think turning cfs_tasks into a weight-sorted list may be more complex;
> this will change the current data structure of cfs_tasks and be less
> helpful.
>
> Thanks a lot.
>
>
>
>
> ...
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>     #endif /* CONFIG_NUMA_BALANCING */
>   +#ifdef CONFIG_SMP
> +static void
> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
> +    struct rq *rq,
> +    struct sched_entity *se)
> +{
> +    struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +    if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
>
>
> The tg_is_idle() doesn't have hierarchical judgement on parent task
> groups, while rq->cfs{,_idle}_tasks is rq wide. Say A->B where tgA
> is idle and tgB isn't, a task from B will be added to the non-idle
> list, is this what you want?
>
> The tg_is_idle is used to check whether the task group of se is idle.
> I think it is unlikely that the parent group is idle but the child group is none-idle.
> If it happens, the current load balance policy yet not be affected.
>
>
> It's quite common that an idle group contains several non-idle groups,
> and it is so designed. In this case with your patch, tgB's tasks will
> be wrongly put to the rq->cfs_tasks list.
>
>
> OK, I agree. The task idle judgement may be modified as below:
>
> static int task_h_idle(struct task_struct *p)
> {
>     struct sched_entity *se = &p->se;
>
>     if (task_has_idle_policy(p))
>         return 1;
>
>     for_each_sched_entity(se)
>         if (cfs_rq_is_idle(cfs_rq_of(se)))
>             return 1;
>
>     return 0;
> }
>
>
> What do you think of the above modification?
>
>
>
>
> + (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
> +    else
> +        (*list_op)(&se->group_node, &rq->cfs_tasks);
> +}
> +#endif
> +
>   static void
>   account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   {
> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>           struct rq *rq = rq_of(cfs_rq);
>             account_numa_enqueue(rq, task_of(se));
> -        list_add(&se->group_node, &rq->cfs_tasks);
> +        adjust_rq_cfs_tasks(list_add, rq, se);
>       }
>   #endif
>       cfs_rq->nr_running++;
> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
>        * the list, so our cfs_tasks list becomes MRU
>        * one.
>        */
> -    list_move(&p->se.group_node, &rq->cfs_tasks);
> +    adjust_rq_cfs_tasks(list_move, rq, &p->se);
>   #endif
>         if (hrtick_enabled_fair(rq))
> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>       if (unlikely(task_has_idle_policy(p)))
>           return 0;
>   +    if (tg_is_idle(cfs_rq_of(&p->se)->tg))
> +        return 0;
> +
>
>
> Same as above. But I am not sure this is the right way to do it. We
> still want to maintain policy behavior inside an idle task group.
>
> Actually, we do not change the policy behavior of idle task group.
> We only want to ensure migration prioritily when load balance
> pulling/pushing tasks between CPUs.
>
>
> But the tasks from idle tg could still be SCHED_NORMAL.
>
>
> Yes you are right, please see above reply.
>
>
>
>
>       /* SMT siblings share cache */
>       if (env->sd->flags & SD_SHARE_CPUCAPACITY)
>           return 0;
> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>                &p->se == cfs_rq_of(&p->se)->last))
>           return 1;
>   +    /* Preempt sched idle cpu do not consider migration cost */
> +    if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
> +        sched_idle_cpu(env->dst_cpu))
> +        return 0;
> +
>       if (sysctl_sched_migration_cost == -1)
>           return 1;
>   @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>   static struct task_struct *detach_one_task(struct lb_env *env)
>   {
>       struct task_struct *p;
> +    struct list_head *tasks = &env->src_rq->cfs_tasks;
> +    int loop = 0;
>
>
> Maybe a boolean variable is enough (and more readable)?
>
> Thanks,
> Abel
>
> OK, let me think more about how to define it an appropriate variable.
>
>
> lockdep_assert_rq_held(env->src_rq);
>   +again:
>       list_for_each_entry_reverse(p,
> -            &env->src_rq->cfs_tasks, se.group_node) {
> +            tasks, se.group_node) {
>           if (!can_migrate_task(p, env))
>               continue;
>   @@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
>           schedstat_inc(env->sd->lb_gained[env->idle]);
>           return p;
>       }
> +    if (++loop == 1) {
> +        tasks = &env->src_rq->cfs_idle_tasks;
> +        goto again;
> +    }
>       return NULL;
>   }
>   @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
>       unsigned long util, load;
>       struct task_struct *p;
>       int detached = 0;
> +    int loop = 0;
>         lockdep_assert_rq_held(env->src_rq);
>   @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
>       if (env->imbalance <= 0)
>           return 0;
>   +again:
>       while (!list_empty(tasks)) {
>           /*
>            * We don't want to steal all, otherwise we may be treated likewise,
> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
>           list_move(&p->se.group_node, tasks);
>       }
>   +    if (env->imbalance > 0 && ++loop == 1) {
> +        tasks = &env->src_rq->cfs_idle_tasks;
> +        goto again;
> +    }
> +
>       /*
>        * Right now, this is one of only two places we collect this stat
>        * so we can safely collect detach_one_task() stats here rather
> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>            * Move the next running task to the front of the list, so our
>            * cfs_tasks list becomes MRU one.
>            */
> -        list_move(&se->group_node, &rq->cfs_tasks);
> +        adjust_rq_cfs_tasks(list_move, rq, se);
>       }
>   #endif
>   diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e26688d387ae..accb4eea9769 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1068,6 +1068,7 @@ struct rq {
>       int            online;
>         struct list_head cfs_tasks;
> +    struct list_head cfs_idle_tasks;
>         struct sched_avg    avg_rt;
>       struct sched_avg    avg_dl;
>
> .
>
> .


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-17 12:58         ` Vincent Guittot
@ 2022-08-18  2:46           ` Abel Wu
  2022-08-18  8:31             ` Vincent Guittot
  0 siblings, 1 reply; 13+ messages in thread
From: Abel Wu @ 2022-08-18  2:46 UTC (permalink / raw)
  To: Vincent Guittot, zhangsong (J)
  Cc: mingo, peterz, juri.lelli, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vschneid, linux-kernel, kernel test robot

On 8/17/22 8:58 PM, Vincent Guittot Wrote:
> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>
>> For co-location with NORMAL and IDLE tasks, when CFS trigger load balance,
>> it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks from
>> the busy src CPU to dst CPU, and migrating IDLE tasks lastly.
>>
>>
>> Considering the large weight difference between normal and idle tasks,
>> does the re-ordering really change things? It would be helpful if you
>> can offer more detailed info.
>>
>> Please consider the situation that CPU A has several normal tasks and hundreds of idle tasks
>> while CPU B is idle, and CPU B needs to pull some tasks from CPU A, but the cfs_tasks in CPU A
>> are not in order of priority, and the max number of pulling tasks depends on env->loop_max,
>> which value is sysctl_sched_nr_migrate, i.e. 32.
>>
>>
>> The case you elaborated above is really rare, the only possibility I
>> can imagine is that all these tasks are affined to one single cpu and
>> suddenly remove the affinity constrain. Otherwise, the load balancing
>> including wakeup cpu selection logic will make things right.
>>
>>
>> Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
>>
>> and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
>>
>> tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
>>
>> CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
>>
>> migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
> 
> env->loop_max adds a break but load_balance will continue with next
> tasks so it also tries to pull your non idle task at the end after
> several breaks.

The loop will be terminated without LBF_NEED_BREAK if it exceeds loop_max :)

> 
>>
>> This will cause non-idle  tasks cannot achieve  more CPU utilization.
> 
> Your problem is not linked to IDLE vs NORMAL tasks but to the large
> number of pinned tasks that can't migrate on CPU2. You can end with
> the same behavior without using IDLE tasks but only NORMAL tasks.

I feel the same thing.

Best,
Abel


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-18  2:46           ` Abel Wu
@ 2022-08-18  8:31             ` Vincent Guittot
  2022-08-19 10:54               ` zhangsong (J)
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2022-08-18  8:31 UTC (permalink / raw)
  To: Abel Wu
  Cc: zhangsong (J),
	mingo, peterz, juri.lelli, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vschneid, linux-kernel, kernel test robot

Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
> > On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
> > > 
> > > 

...

> > > Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
> > > 
> > > and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
> > > 
> > > tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
> > > 
> > > CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
> > > 
> > > migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
> > 
> > env->loop_max adds a break but load_balance will continue with next
> > tasks so it also tries to pull your non idle task at the end after
> > several breaks.
> 
> Loop will be terminated without LBF_NEED_BREAK if exceeds loop_max :)

Argh yes, my brain is not yet back from vacation
I have been confused by loop_max and loop_break being set to the same value 32

Zhang Song, Could you try the patch below ? If it works, I will prepare a
clean patch with all tags



sched/fair: make sure to try to detach at least one movable task

During load balance we try at most env->loop_max times to move a task. But
it can happen that the LRU tasks (ie the tail of the cfs_tasks list) can't
be moved to dst_cpu because of affinity. In this case, loop in the list
until we find at least one.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da388657d5ac..02b7b808e186 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8052,8 +8052,12 @@ static int detach_tasks(struct lb_env *env)
 		p = list_last_entry(tasks, struct task_struct, se.group_node);

 		env->loop++;
-		/* We've more or less seen every task there is, call it quits */
-		if (env->loop > env->loop_max)
+		/*
+		 * We've more or less seen every task there is, call it quits
+		 * unless we haven't found any movable task yet.
+		 */
+		if (env->loop > env->loop_max &&
+		    !(env->flags & LBF_ALL_PINNED))
 			break;

 		/* take a breather every nr_migrate tasks */
@@ -10182,7 +10186,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,

 		if (env.flags & LBF_NEED_BREAK) {
 			env.flags &= ~LBF_NEED_BREAK;
-			goto more_balance;
+			/* Stop if we tried all running tasks */
+			if (env.loop < busiest->nr_running)
+				goto more_balance;
 		}

 		/*
--
2.17.1

> 
> > 
> > > 
> > > This will cause non-idle  tasks cannot achieve  more CPU utilization.
> > 
> > Your problem is not linked to IDLE vs NORMAL tasks but to the large
> > number of pinned tasks that can't migrate on CPU2. You can end with
> > the same behavior without using IDLE tasks but only NORMAL tasks.
> 
> I feel the same thing.
> 
> Best,
> Abel


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-18  8:31             ` Vincent Guittot
@ 2022-08-19 10:54               ` zhangsong (J)
  2022-08-19 12:35                 ` Vincent Guittot
  0 siblings, 1 reply; 13+ messages in thread
From: zhangsong (J) @ 2022-08-19 10:54 UTC (permalink / raw)
  To: Vincent Guittot, Abel Wu
  Cc: mingo, peterz, juri.lelli, dietmar.eggemann, rostedt, bsegall,
	mgorman, bristot, vschneid, linux-kernel, kernel test robot


On 2022/8/18 16:31, Vincent Guittot wrote:
> Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
>> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
>>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>>>
> ...
>
>>>> Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
>>>>
>>>> and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
>>>>
>>>> tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
>>>>
>>>> CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
>>>>
>>>> migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
>>> env->loop_max adds a break but load_balance will continue with next
>>> tasks so it also tries to pull your non idle task at the end after
>>> several breaks.
>> Loop will be terminated without LBF_NEED_BREAK if exceeds loop_max :)
> Argh yes, my brain is not yet back from vacation
> I have been confused by loop_max and loop_break being set to the same value 32
>
> Zhang Song, Could you try the patch below ? If it works, I will prepare a
> clean patch with all tags
>
>
>
> sched/fair: make sure to try to detach at least one movable task
>
> During load balance we try at most env->loop_max time to move a task. But
> it can happen that the LRU tasks (ie tail of the cfs_tasks list) can't
> be moved to dst_cpu because of affinity. In this case, loop in the list
> until we found at least one.
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>   kernel/sched/fair.c | 12 +++++++++---
>   1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index da388657d5ac..02b7b808e186 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8052,8 +8052,12 @@ static int detach_tasks(struct lb_env *env)
>   		p = list_last_entry(tasks, struct task_struct, se.group_node);
>
>   		env->loop++;
> -		/* We've more or less seen every task there is, call it quits */
> -		if (env->loop > env->loop_max)
> +		/*
> +		 * We've more or less seen every task there is, call it quits
> +		 * unless we haven't found any movable task yet.
> +		 */
> +		if (env->loop > env->loop_max &&
> +		    !(env->flags & LBF_ALL_PINNED))
>   			break;
>
>   		/* take a breather every nr_migrate tasks */
> @@ -10182,7 +10186,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>
>   		if (env.flags & LBF_NEED_BREAK) {
>   			env.flags &= ~LBF_NEED_BREAK;
> -			goto more_balance;
> +			/* Stop if we tried all running tasks */
> +			if (env.loop < busiest->nr_running)
> +				goto more_balance;
>   		}
>
>   		/*
> --
> 2.17.1

Thanks for your reply.
I have tried your patch and ran a test to compare with it; it seems that
the patch you provided makes no difference.
The test result is below (1000 idle tasks bound to CPU 0-1 and 10 normal
tasks bound to CPU 1-2):

=================================================================

Without patch:


           6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
             20,812      context-switches          #    0.003 M/sec
                  0      cpu-migrations            #    0.000 K/sec
                  0      page-faults               #    0.000 K/sec
     13,333,983,148      cycles                    #    1.967 GHz
      6,457,930,305      instructions              #    0.48  insn per cycle
      2,125,644,649      branches                  #  313.639 M/sec
          1,690,587      branch-misses             #    0.08% of all branches
       5.001931983 seconds time elapsed

With your patch:


           6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
             20,996      context-switches          #    0.003 M/sec
                  0      cpu-migrations            #    0.000 K/sec
                  0      page-faults               #    0.000 K/sec
     13,467,573,052      cycles                    #    1.983 GHz
      6,516,989,062      instructions              #    0.48  insn per cycle
      2,145,139,220      branches                  #  315.858 M/sec
           1,751,454      branch-misses             #    0.08% of all branches

        5.002274267 seconds time elapsed

With my patch:


           7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
             23,176      context-switches          #    0.003 M/sec
                309      cpu-migrations            #    0.041 K/sec
                  0      page-faults               #    0.000 K/sec
     14,849,083,489      cycles                    #    1.981 GHz
      7,180,832,268      instructions              #    0.48  insn per cycle
      2,363,300,644      branches                  #  315.311 M/sec
           1,964,169      branch-misses             #    0.08% of all branches

        5.001713352 seconds time elapsed
===============================================================

Obviously, when your patch is applied, the cpu-migrations count of the
normal tasks is still 0 and their CPU utilization shows no improvement
compared with no patch applied.
When my patch is applied, both the cpu-migrations and the CPU utilization
of the normal tasks improve.
I cannot explain the result with your patch; you can also test it
yourself.
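
For reference, the setup above can be reproduced with something like the
sketch below (plain userspace C, not the exact harness used for the
numbers above): it pins SCHED_IDLE busy loops to CPU 0-1 and normal busy
loops to CPU 1-2, and the normal tasks can then be observed with perf stat.

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

/* Fork @nr CPU hogs with the given policy, pinned to CPUs first..last. */
static void spawn(int nr, int policy, int first_cpu, int last_cpu)
{
	struct sched_param sp = { .sched_priority = 0 };
	cpu_set_t set;
	int i, cpu;

	CPU_ZERO(&set);
	for (cpu = first_cpu; cpu <= last_cpu; cpu++)
		CPU_SET(cpu, &set);

	for (i = 0; i < nr; i++) {
		if (fork() == 0) {
			sched_setaffinity(0, sizeof(set), &set);
			sched_setscheduler(0, policy, &sp);
			for (;;)
				;	/* pure CPU hog, runs until killed */
		}
	}
}

int main(void)
{
	spawn(1000, SCHED_IDLE, 0, 1);	/* idle-policy background tasks */
	spawn(10, SCHED_OTHER, 1, 2);	/* "normal" tasks to be measured */
	pause();			/* keep the parent around for perf */
	return 0;
}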

Best,
Zhang Song

>
>>>> This will cause non-idle  tasks cannot achieve  more CPU utilization.
>>> Your problem is not linked to IDLE vs NORMAL tasks but to the large
>>> number of pinned tasks that can't migrate on CPU2. You can end with
>>> the same behavior without using IDLE tasks but only NORMAL tasks.
>> I feel the same thing.
>>
>> Best,
>> Abel
> .


* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-19 10:54               ` zhangsong (J)
@ 2022-08-19 12:35                 ` Vincent Guittot
  2022-08-19 16:04                   ` Vincent Guittot
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2022-08-19 12:35 UTC (permalink / raw)
  To: zhangsong (J)
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot

Hi Zhang,

On Fri, 19 Aug 2022 at 12:54, zhangsong (J) <zhangsong34@huawei.com> wrote:
>
>
> On 2022/8/18 16:31, Vincent Guittot wrote:
> > Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
> >> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
> >>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
> >>>>
> > ...
> >
> >>>> Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
> >>>>
> >>>> and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
> >>>>
> >>>> tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
> >>>>
> >>>> CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
> >>>>
> >>>> migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
> >>> env->loop_max adds a break but load_balance will continue with next
> >>> tasks so it also tries to pull your non idle task at the end after
> >>> several breaks.
> >> Loop will be terminated without LBF_NEED_BREAK if exceeds loop_max :)
> > Argh yes, my brain is not yet back from vacation
> > I have been confused by loop_max and loop_break being set to the same value 32
> >
> > Zhang Song, Could you try the patch below ? If it works, I will prepare a
> > clean patch with all tags
> >
> >
> >
> > sched/fair: make sure to try to detach at least one movable task
> >
> > During load balance we try at most env->loop_max time to move a task. But
> > it can happen that the LRU tasks (ie tail of the cfs_tasks list) can't
> > be moved to dst_cpu because of affinity. In this case, loop in the list
> > until we found at least one.
> >
> > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > ---
> >   kernel/sched/fair.c | 12 +++++++++---
> >   1 file changed, 9 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index da388657d5ac..02b7b808e186 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -8052,8 +8052,12 @@ static int detach_tasks(struct lb_env *env)
> >               p = list_last_entry(tasks, struct task_struct, se.group_node);
> >
> >               env->loop++;
> > -             /* We've more or less seen every task there is, call it quits */
> > -             if (env->loop > env->loop_max)
> > +             /*
> > +              * We've more or less seen every task there is, call it quits
> > +              * unless we haven't found any movable task yet.
> > +              */
> > +             if (env->loop > env->loop_max &&
> > +                 !(env->flags & LBF_ALL_PINNED))
> >                       break;
> >
> >               /* take a breather every nr_migrate tasks */
> > @@ -10182,7 +10186,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> >
> >               if (env.flags & LBF_NEED_BREAK) {
> >                       env.flags &= ~LBF_NEED_BREAK;
> > -                     goto more_balance;
> > +                     /* Stop if we tried all running tasks */
> > +                     if (env.loop < busiest->nr_running)
> > +                             goto more_balance;
> >               }
> >
> >               /*
> > --
> > 2.17.1
>
> Thanks for your reply.
> I have tried your patch and run test compared with it, it seems that the
> patch you provide makes no sense.
> The test result is below(1000 idle tasks bounds to CPU 0-1 and 10 normal
> tasks bounds to CPU 1-2):
>
> =================================================================
>
> Without patch:
>
>
>            6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
>              20,812      context-switches          #    0.003 M/sec
>                   0      cpu-migrations            #    0.000 K/sec
>                   0      page-faults               #    0.000 K/sec
>      13,333,983,148      cycles                    #    1.967 GHz
>       6,457,930,305      instructions              #    0.48  insn per cycle
>       2,125,644,649      branches                  #  313.639 M/sec
>           1,690,587      branch-misses             #    0.08% of all
> branches
>        5.001931983 seconds time elapsed
>
> With your patch:
>
>
>            6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
>              20,996      context-switches          #    0.003 M/sec
>                   0      cpu-migrations            #    0.000 K/sec
>                   0      page-faults               #    0.000 K/sec
>      13,467,573,052      cycles                    #    1.983 GHz
>       6,516,989,062      instructions              #    0.48  insn per cycle
>       2,145,139,220      branches                  #  315.858 M/sec
>           1,751,454      branch-misses             #    0.08% of all
> branches
>
>         5.002274267 seconds time elapsed
>
> With my patch:
>
>
>            7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
>              23,176      context-switches          #    0.003 M/sec
>                 309      cpu-migrations            #    0.041 K/sec
>                   0      page-faults               #    0.000 K/sec
>      14,849,083,489      cycles                    #    1.981 GHz
>       7,180,832,268      instructions              #    0.48  insn per cycle
>       2,363,300,644      branches                  #  315.311 M/sec
>           1,964,169      branch-misses             #    0.08% of all
> branches
>
>         5.001713352 seconds time elapsed
> ===============================================================
>
> Obviously,  when your patch is applied, the cpu-migrations of normal
> tasks is still 0 and the
> CPU ulization of normal tasks have no improvement compared with no patch
> applied.
> When apply my patch, the cpu-migrations and CPU ulization of normal
> tasks can both improve.
> I cannot explain the result with your patch, you also can test it by
> yourself.

Do you have more details about the test that you are running?

Do CPU 0-2 share their cache?
Which kind of tasks are the normal and idle tasks? Always-running tasks?

I'm going to try to reproduce your problem locally

Regards,
Vincent

>
> Best,
> Zhang Song
>
> >
> >>>> This will cause non-idle  tasks cannot achieve  more CPU utilization.
> >>> Your problem is not linked to IDLE vs NORMAL tasks but to the large
> >>> number of pinned tasks that can't migrate on CPU2. You can end with
> >>> the same behavior without using IDLE tasks but only NORMAL tasks.
> >> I feel the same thing.
> >>
> >> Best,
> >> Abel
> > .

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-19 12:35                 ` Vincent Guittot
@ 2022-08-19 16:04                   ` Vincent Guittot
  2022-08-22  6:49                     ` zhangsong (J)
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2022-08-19 16:04 UTC (permalink / raw)
  To: zhangsong (J)
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot

On Fri, 19 Aug 2022 at 14:35, Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> Hi Zhang,
>
> On Fri, 19 Aug 2022 at 12:54, zhangsong (J) <zhangsong34@huawei.com> wrote:
> >
> >
> > On 2022/8/18 16:31, Vincent Guittot wrote:
> > > Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
> > >> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
> > >>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
> > >>>>
> > > ...
> > >
> > >>>> Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
> > >>>>
> > >>>> and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
> > >>>>
> > >>>> tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
> > >>>>
> > >>>> CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
> > >>>>
> > >>>> migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
> > >>> env->loop_max adds a break but load_balance will continue with next
> > >>> tasks so it also tries to pull your non idle task at the end after
> > >>> several breaks.
> > >> Loop will be terminated without LBF_NEED_BREAK if exceeds loop_max :)
> > > Argh yes, my brain is not yet back from vacation
> > > I have been confused by loop_max and loop_break being set to the same value 32
> > >
> > > Zhang Song, Could you try the patch below ? If it works, I will prepare a
> > > clean patch with all tags
> > >
> > >
> > >
> > > sched/fair: make sure to try to detach at least one movable task
> > >
> > > During load balance we try at most env->loop_max time to move a task. But
> > > it can happen that the LRU tasks (ie tail of the cfs_tasks list) can't
> > > be moved to dst_cpu because of affinity. In this case, loop in the list
> > > until we found at least one.
> > >
> > > Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> > > ---
> > >   kernel/sched/fair.c | 12 +++++++++---
> > >   1 file changed, 9 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index da388657d5ac..02b7b808e186 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -8052,8 +8052,12 @@ static int detach_tasks(struct lb_env *env)
> > >               p = list_last_entry(tasks, struct task_struct, se.group_node);
> > >
> > >               env->loop++;
> > > -             /* We've more or less seen every task there is, call it quits */
> > > -             if (env->loop > env->loop_max)
> > > +             /*
> > > +              * We've more or less seen every task there is, call it quits
> > > +              * unless we haven't found any movable task yet.
> > > +              */
> > > +             if (env->loop > env->loop_max &&
> > > +                 !(env->flags & LBF_ALL_PINNED))
> > >                       break;
> > >
> > >               /* take a breather every nr_migrate tasks */
> > > @@ -10182,7 +10186,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> > >
> > >               if (env.flags & LBF_NEED_BREAK) {
> > >                       env.flags &= ~LBF_NEED_BREAK;
> > > -                     goto more_balance;
> > > +                     /* Stop if we tried all running tasks */
> > > +                     if (env.loop < busiest->nr_running)
> > > +                             goto more_balance;
> > >               }
> > >
> > >               /*
> > > --
> > > 2.17.1
> >
> > Thanks for your reply.
> > I have tried your patch and run test compared with it, it seems that the
> > patch you provide makes no sense.
> > The test result is below(1000 idle tasks bounds to CPU 0-1 and 10 normal
> > tasks bounds to CPU 1-2):
> >
> > =================================================================
> >
> > Without patch:
> >
> >
> >            6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
> >              20,812      context-switches          #    0.003 M/sec
> >                   0      cpu-migrations            #    0.000 K/sec
> >                   0      page-faults               #    0.000 K/sec
> >      13,333,983,148      cycles                    #    1.967 GHz
> >       6,457,930,305      instructions              #    0.48  insn per cycle
> >       2,125,644,649      branches                  #  313.639 M/sec
> >           1,690,587      branch-misses             #    0.08% of all
> > branches
> >        5.001931983 seconds time elapsed
> >
> > With your patch:
> >
> >
> >            6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
> >              20,996      context-switches          #    0.003 M/sec
> >                   0      cpu-migrations            #    0.000 K/sec
> >                   0      page-faults               #    0.000 K/sec
> >      13,467,573,052      cycles                    #    1.983 GHz
> >       6,516,989,062      instructions              #    0.48  insn per cycle
> >       2,145,139,220      branches                  #  315.858 M/sec
> >           1,751,454      branch-misses             #    0.08% of all
> > branches
> >
> >         5.002274267 seconds time elapsed
> >
> > With my patch:
> >
> >
> >            7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
> >              23,176      context-switches          #    0.003 M/sec
> >                 309      cpu-migrations            #    0.041 K/sec
> >                   0      page-faults               #    0.000 K/sec
> >      14,849,083,489      cycles                    #    1.981 GHz
> >       7,180,832,268      instructions              #    0.48  insn per cycle
> >       2,363,300,644      branches                  #  315.311 M/sec
> >           1,964,169      branch-misses             #    0.08% of all
> > branches
> >
> >         5.001713352 seconds time elapsed
> > ===============================================================
> >
> > Obviously,  when your patch is applied, the cpu-migrations of normal
> > tasks is still 0 and the
> > CPU ulization of normal tasks have no improvement compared with no patch
> > applied.
> > When apply my patch, the cpu-migrations and CPU ulization of normal
> > tasks can both improve.
> > I cannot explain the result with your patch, you also can test it by
> > yourself.
>
> Do you have more details about the test that your are running ?
>
> Do cpu0-2 share their cache ?
> Which kingd of task are the normal and idle tasks ? always running tasks ?
>
> I'm going to try to reproduce your problem locally

Some details of your use case are missing. I have tried to reproduce
your example above:
    1000 idle tasks bound to CPU 0-1 and 10 normal tasks bound to CPU 1-2

Let's assume that, for any reason, the 10 normal tasks wake up on CPU 1.
Then the thousand idle tasks are moved to CPU0 by load balance and only
the normal tasks stay on CPU1. Then load balance will move some normal
tasks to CPU2.

My only way to reproduce something similar to your example is to pin the
1000 idle tasks on CPU1 so they can't move to CPU0. Then I can see that
load balance reaches the loop_max limit and has a hard time moving
normal tasks to CPU2. But in this latter case, my patch helps to move
normal tasks to CPU2. Something is missing in the description of your
use case.
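
The pinning described above is roughly the following (only a sketch;
./test stands for any busy-loop binary):

# pin 1000 SCHED_IDLE busy loops on CPU1 so load balance cannot move
# them away to CPU0
for ((i = 0; i < 1000; i++))
do
        taskset -c 1 ./test 200 &
        chrt -i -p 0 $!
done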

Sidenote: I have the same kind of problem with 1000 normal tasks with
low priority, so it's not a matter of idle vs normal tasks.

Regards,
Vincent

>
> Regards,
> Vincent
>
> >
> > Best,
> > Zhang Song
> >
> > >
> > >>>> This will cause non-idle  tasks cannot achieve  more CPU utilization.
> > >>> Your problem is not linked to IDLE vs NORMAL tasks but to the large
> > >>> number of pinned tasks that can't migrate on CPU2. You can end with
> > >>> the same behavior without using IDLE tasks but only NORMAL tasks.
> > >> I feel the same thing.
> > >>
> > >> Best,
> > >> Abel
> > > .

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-19 16:04                   ` Vincent Guittot
@ 2022-08-22  6:49                     ` zhangsong (J)
  2022-08-23 13:19                       ` Vincent Guittot
  0 siblings, 1 reply; 13+ messages in thread
From: zhangsong (J) @ 2022-08-22  6:49 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot

Hi, Vincent,

On 2022/8/20 0:04, Vincent Guittot wrote:
> On Fri, 19 Aug 2022 at 14:35, Vincent Guittot
> <vincent.guittot@linaro.org> wrote:
>>
>> Hi Zhang,
>>
>> On Fri, 19 Aug 2022 at 12:54, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>>
>>>
>>> On 2022/8/18 16:31, Vincent Guittot wrote:
>>>> Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
>>>>> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
>>>>>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>>>>>>
>>>> ...
>>>>
>>>>>>> Yes, this is usually a corner case, but suppose that some non-idle tasks bounds to CPU 1-2
>>>>>>>
>>>>>>> and idle tasks bounds to CPU 0-1, so CPU 1 may has many idle tasks and some non-idle
>>>>>>>
>>>>>>> tasks while idle tasks on CPU 1 can not be pulled to CPU 2, when trigger load balance if
>>>>>>>
>>>>>>> CPU 2 should pull some tasks from CPU 1, the bad result is idle tasks of CPU 1 cannot be
>>>>>>>
>>>>>>> migrated and non-idle tasks also cannot be migrated in case of env->loop_max constraint.
>>>>>> env->loop_max adds a break but load_balance will continue with next
>>>>>> tasks so it also tries to pull your non idle task at the end after
>>>>>> several breaks.
>>>>> Loop will be terminated without LBF_NEED_BREAK if exceeds loop_max :)
>>>> Argh yes, my brain is not yet back from vacation
>>>> I have been confused by loop_max and loop_break being set to the same value 32
>>>>
>>>> Zhang Song, Could you try the patch below ? If it works, I will prepare a
>>>> clean patch with all tags
>>>>
>>>>
>>>>
>>>> sched/fair: make sure to try to detach at least one movable task
>>>>
>>>> During load balance we try at most env->loop_max time to move a task. But
>>>> it can happen that the LRU tasks (ie tail of the cfs_tasks list) can't
>>>> be moved to dst_cpu because of affinity. In this case, loop in the list
>>>> until we found at least one.
>>>>
>>>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>>>> ---
>>>>    kernel/sched/fair.c | 12 +++++++++---
>>>>    1 file changed, 9 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index da388657d5ac..02b7b808e186 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -8052,8 +8052,12 @@ static int detach_tasks(struct lb_env *env)
>>>>                p = list_last_entry(tasks, struct task_struct, se.group_node);
>>>>
>>>>                env->loop++;
>>>> -             /* We've more or less seen every task there is, call it quits */
>>>> -             if (env->loop > env->loop_max)
>>>> +             /*
>>>> +              * We've more or less seen every task there is, call it quits
>>>> +              * unless we haven't found any movable task yet.
>>>> +              */
>>>> +             if (env->loop > env->loop_max &&
>>>> +                 !(env->flags & LBF_ALL_PINNED))
>>>>                        break;
>>>>
>>>>                /* take a breather every nr_migrate tasks */
>>>> @@ -10182,7 +10186,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>>>
>>>>                if (env.flags & LBF_NEED_BREAK) {
>>>>                        env.flags &= ~LBF_NEED_BREAK;
>>>> -                     goto more_balance;
>>>> +                     /* Stop if we tried all running tasks */
>>>> +                     if (env.loop < busiest->nr_running)
>>>> +                             goto more_balance;
>>>>                }
>>>>
>>>>                /*
>>>> --
>>>> 2.17.1
>>>
>>> Thanks for your reply.
>>> I have tried your patch and run test compared with it, it seems that the
>>> patch you provide makes no sense.
>>> The test result is below(1000 idle tasks bounds to CPU 0-1 and 10 normal
>>> tasks bounds to CPU 1-2):
>>>
>>> =================================================================
>>>
>>> Without patch:
>>>
>>>
>>>             6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
>>>               20,812      context-switches          #    0.003 M/sec
>>>                    0      cpu-migrations            #    0.000 K/sec
>>>                    0      page-faults               #    0.000 K/sec
>>>       13,333,983,148      cycles                    #    1.967 GHz
>>>        6,457,930,305      instructions              #    0.48  insn per cycle
>>>        2,125,644,649      branches                  #  313.639 M/sec
>>>            1,690,587      branch-misses             #    0.08% of all
>>> branches
>>>         5.001931983 seconds time elapsed
>>>
>>> With your patch:
>>>
>>>
>>>             6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
>>>               20,996      context-switches          #    0.003 M/sec
>>>                    0      cpu-migrations            #    0.000 K/sec
>>>                    0      page-faults               #    0.000 K/sec
>>>       13,467,573,052      cycles                    #    1.983 GHz
>>>        6,516,989,062      instructions              #    0.48  insn per cycle
>>>        2,145,139,220      branches                  #  315.858 M/sec
>>>            1,751,454      branch-misses             #    0.08% of all
>>> branches
>>>
>>>          5.002274267 seconds time elapsed
>>>
>>> With my patch:
>>>
>>>
>>>             7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
>>>               23,176      context-switches          #    0.003 M/sec
>>>                  309      cpu-migrations            #    0.041 K/sec
>>>                    0      page-faults               #    0.000 K/sec
>>>       14,849,083,489      cycles                    #    1.981 GHz
>>>        7,180,832,268      instructions              #    0.48  insn per cycle
>>>        2,363,300,644      branches                  #  315.311 M/sec
>>>            1,964,169      branch-misses             #    0.08% of all
>>> branches
>>>
>>>          5.001713352 seconds time elapsed
>>> ===============================================================
>>>
>>> Obviously,  when your patch is applied, the cpu-migrations of normal
>>> tasks is still 0 and the
>>> CPU ulization of normal tasks have no improvement compared with no patch
>>> applied.
>>> When apply my patch, the cpu-migrations and CPU ulization of normal
>>> tasks can both improve.
>>> I cannot explain the result with your patch, you also can test it by
>>> yourself.
>>
>> Do you have more details about the test that your are running ?
>>
>> Do cpu0-2 share their cache ?
>> Which kingd of task are the normal and idle tasks ? always running tasks ?
>>
>> I'm going to try to reproduce your problem locally
> 
> Some details of your UC are missing. I have tried to reproduce your
> example above:
>      1000 idle tasks bounds to CPU 0-1 and 10 normal tasks bounds to CPU 1-2
> 
> Let assume that for any reason, the 10 normal tasks wake up on CPU 1.
> Then, the thousand of idle tasks are moved to CPU0 by load balance and
> only normal tasks stay on CPU1. Then load balance will move some
> normal tasks to CPU2.
> 
> My only way to reproduce something similar to your example, is to pin
> the 1000 idle tasks on CPU1 so they can't move to CPU0. Then I can see
> that load balance reaches loop_max limit and gets hard time moving
> normal tasks on CPU2. But in this later case, my patch helps to move
> normal tasks on CPU2. Something is missing in the description of your
> UC.
> 
> Sidenote, I have the same kind of problem with 1000 normal task with
> low priority so it's not a matter of idle vs normal tasks
> 
> Regards,
> Vincent
> 

Sorry for my slow reply.

I have found a test case which can illustrate this problem more 
accurately. The test case is below.

1. An endless busy-loop process, run as a normal or an idle task
$ cat test.c
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int i = 0;
        int duration = atoi(argv[1]);

        /* sleep a bit, then spin, forever */
        while (1) {
                usleep(duration);
                for (i = 0; i < 100000; i++) {}
        }
}
$ gcc -o test test.c

2. First spawn 500 idle tasks bound to CPU 0-2
3. Then spawn 10 normal tasks, also bound to CPU 0-2
4. Lastly spawn 500 idle tasks bound to CPU 0 only
5. perf stat the normal tasks and collect their CPU utilization and cpu-migrations


Below is the whole test script.
$ cat test.sh
#!/bin/bash

# create normal and idle cgroup path
mkdir /sys/fs/cgroup/cpu/normal/
mkdir /sys/fs/cgroup/cpu/idle/

# spawn 500 idle tasks and bind tasks to CPU 0-2
for ((i = 0; i < 500; i++))
do
		taskset -c 0-2 ./test 200 &
		pid=$!
		# change to SCHED_IDLE policy
		chrt -i -p 0 $pid
		echo $pid > /sys/fs/cgroup/cpu/idle/tasks
done

# spawn 10 normal tasks and bind tasks to CPU 0-2
normal_tasks=
for ((i = 0; i < 10; i++))
do
		taskset -c 0-2 ./test 500 &
		pid=$!
		normal_tasks+=$pid,
		echo $pid > /sys/fs/cgroup/cpu/normal/tasks
done

# spawn 500 idle tasks and bind tasks to CPU 0 only
for ((i = 0; i < 500; i++))
do
		taskset -c 0 ./test 200 &
		pid=$!
		# change to SCHED_IDLE policy
		chrt -i -p 0 $pid
		echo $pid > /sys/fs/cgroup/cpu/idle/tasks
done

# perf stat normal tasks
perf stat -a -p $normal_tasks sleep 5
pkill -f test
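
As a side check, the CPU placement of the normal tasks can be sampled
during the run, for example by adding a line like this right before the
perf stat call (the psr column of ps shows the CPU each task is
currently on):

ps -o pid,psr,comm -p ${normal_tasks%,}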


You can try the above case and test it with your patch.

Regards,
Zhang Song

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-22  6:49                     ` zhangsong (J)
@ 2022-08-23 13:19                       ` Vincent Guittot
  2022-08-25  1:57                         ` zhangsong (J)
  0 siblings, 1 reply; 13+ messages in thread
From: Vincent Guittot @ 2022-08-23 13:19 UTC (permalink / raw)
  To: zhangsong (J)
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot

Hi Zhang,

On Mon, 22 Aug 2022 at 08:49, zhangsong (J) <zhangsong34@huawei.com> wrote:
>
> Hi, Vincent,
>
> On 2022/8/20 0:04, Vincent Guittot wrote:
> > On Fri, 19 Aug 2022 at 14:35, Vincent Guittot
> > <vincent.guittot@linaro.org> wrote:
> >>
> >> Hi Zhang,
> >>
> >> On Fri, 19 Aug 2022 at 12:54, zhangsong (J) <zhangsong34@huawei.com> wrote:
> >>>
> >>>
> >>> On 2022/8/18 16:31, Vincent Guittot wrote:
> >>>> Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
> >>>>> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
> >>>>>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
> >>>>>>>
> >>>> ...

...

> >>>
> >>> Thanks for your reply.
> >>> I have tried your patch and run test compared with it, it seems that the
> >>> patch you provide makes no sense.
> >>> The test result is below(1000 idle tasks bounds to CPU 0-1 and 10 normal
> >>> tasks bounds to CPU 1-2):
> >>>
> >>> =================================================================
> >>>
> >>> Without patch:
> >>>
> >>>
> >>>             6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
> >>>               20,812      context-switches          #    0.003 M/sec
> >>>                    0      cpu-migrations            #    0.000 K/sec
> >>>                    0      page-faults               #    0.000 K/sec
> >>>       13,333,983,148      cycles                    #    1.967 GHz
> >>>        6,457,930,305      instructions              #    0.48  insn per cycle
> >>>        2,125,644,649      branches                  #  313.639 M/sec
> >>>            1,690,587      branch-misses             #    0.08% of all
> >>> branches
> >>>         5.001931983 seconds time elapsed
> >>>
> >>> With your patch:
> >>>
> >>>
> >>>             6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
> >>>               20,996      context-switches          #    0.003 M/sec
> >>>                    0      cpu-migrations            #    0.000 K/sec
> >>>                    0      page-faults               #    0.000 K/sec
> >>>       13,467,573,052      cycles                    #    1.983 GHz
> >>>        6,516,989,062      instructions              #    0.48  insn per cycle
> >>>        2,145,139,220      branches                  #  315.858 M/sec
> >>>            1,751,454      branch-misses             #    0.08% of all
> >>> branches
> >>>
> >>>          5.002274267 seconds time elapsed
> >>>
> >>> With my patch:
> >>>
> >>>
> >>>             7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
> >>>               23,176      context-switches          #    0.003 M/sec
> >>>                  309      cpu-migrations            #    0.041 K/sec
> >>>                    0      page-faults               #    0.000 K/sec
> >>>       14,849,083,489      cycles                    #    1.981 GHz
> >>>        7,180,832,268      instructions              #    0.48  insn per cycle
> >>>        2,363,300,644      branches                  #  315.311 M/sec
> >>>            1,964,169      branch-misses             #    0.08% of all
> >>> branches
> >>>
> >>>          5.001713352 seconds time elapsed
> >>> ===============================================================
> >>>
> >>> Obviously,  when your patch is applied, the cpu-migrations of normal
> >>> tasks is still 0 and the
> >>> CPU ulization of normal tasks have no improvement compared with no patch
> >>> applied.
> >>> When apply my patch, the cpu-migrations and CPU ulization of normal
> >>> tasks can both improve.
> >>> I cannot explain the result with your patch, you also can test it by
> >>> yourself.
> >>
> >> Do you have more details about the test that your are running ?
> >>
> >> Do cpu0-2 share their cache ?
> >> Which kingd of task are the normal and idle tasks ? always running tasks ?
> >>
> >> I'm going to try to reproduce your problem locally
> >
> > Some details of your UC are missing. I have tried to reproduce your
> > example above:
> >      1000 idle tasks bounds to CPU 0-1 and 10 normal tasks bounds to CPU 1-2
> >
> > Let assume that for any reason, the 10 normal tasks wake up on CPU 1.
> > Then, the thousand of idle tasks are moved to CPU0 by load balance and
> > only normal tasks stay on CPU1. Then load balance will move some
> > normal tasks to CPU2.
> >
> > My only way to reproduce something similar to your example, is to pin
> > the 1000 idle tasks on CPU1 so they can't move to CPU0. Then I can see
> > that load balance reaches loop_max limit and gets hard time moving
> > normal tasks on CPU2. But in this later case, my patch helps to move
> > normal tasks on CPU2. Something is missing in the description of your
> > UC.
> >
> > Sidenote, I have the same kind of problem with 1000 normal task with
> > low priority so it's not a matter of idle vs normal tasks
> >
> > Regards,
> > Vincent
> >
>
> Sorry for my slow reply.
>
> I have found a test case which can more illustrate this problem
> accurately. The test case is below.
>
> 1. a dead foreach loop process run as normal or idle task
> $ cat test.c
> int main(int argc, char **argv)
> {
>          int i = 0;
>          int duration = atoi(argv[1]);
>
>          while(1) {
>                  usleep(duration);
>                  for(i = 0; i < 100000; i++) {}
>          }
> }
> $ gcc -o test test.c
>
> 2. firstly spawn 500 idle tasks which bounds to CPU 0-2
> 3. secondly spawn 10 normal tasks which also bounds to CPU 0-2
> 4. lastly spawn 500 idle tasks which bounds to CPU 0 only
> 5. perf stat normal tasks and get CPU utilization and cpu-migrations
>
>
> Below is the whole test script.
> $ cat test.sh
> #!/bin/bash
>
> # create normal and idle cgroup path
> mkdir /sys/fs/cgroup/cpu/normal/
> mkdir /sys/fs/cgroup/cpu/idle/

So you put the "idle" tasks in one task group and the normal tasks in
another one. But both groups have the default weight/share, so you lose
the idle weight for the idle tasks and each group will get half of the
CPU capacity, i.e. 1.5 CPUs per group. And this is what I get with the
current kernel, with your patch and with my patch.

In fact, there are enough tasks (510) not pinned to cpu0 to make the
system balance correctly.

With this group hierarchy, the idle tasks will have the same weight
priority as the normal tasks. Is that really what you want?

tip kernel:
          7 673,15 msec task-clock                #    1,534 CPUs utilized
            12 662      context-switches          #    1,650 K/sec
                 0      cpu-migrations            #    0,000 /sec
       5,003493176 seconds time elapsed

your patch:
          7 488,35 msec task-clock                #    1,497 CPUs utilized
            12 338      context-switches          #    1,648 K/sec
                 3      cpu-migrations            #    0,401 /sec
       5,003406005 seconds time elapsed

my patch
          7 569,57 msec task-clock                #    1,513 CPUs utilized
            12 460      context-switches          #    1,646 K/sec
                 0      cpu-migrations            #    0,000 /sec
       5,003437278 seconds time elapsed
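
As a side note, if the idle tasks are really meant to run at idle
weight, the idle group itself also has to be de-weighted. A minimal
sketch for the cgroup v1 hierarchy used in the script (2 is the minimum
cpu.shares value):

# give the idle group the smallest possible weight so it no longer
# competes on an equal footing with the normal group
echo 2 > /sys/fs/cgroup/cpu/idle/cpu.shares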


>
> # spawn 500 idle tasks and bind tasks to CPU 0-2
> for ((i = 0; i < 500; i++))
> do
>                 taskset -c 0-2 ./test 200 &
>                 pid=$!
>                 # change to SCHED_IDLE policy
>                 chrt -i -p 0 $pid
>                 echo $pid > /sys/fs/cgroup/cpu/idle/tasks
> done
>
> # spawn 10 normal tasks and bind tasks to CPU 0-2
> normal_tasks=
> for ((i = 0; i < 10; i++))
> do
>                 taskset -c 0-2 ./test 500 &
>                 pid=$!
>                 normal_tasks+=$pid,
>                 echo $pid > /sys/fs/cgroup/cpu/normal/tasks
> done
>
> # spawn 500 idle tasks and bind tasks to CPU 0 only
> for ((i = 0; i < 500; i++))
> do
>                 taskset -c 0 ./test 200 &
>                 pid=$!
>                 # change to SCHED_IDLE policy
>                 chrt -i -p 0 $pid
>                 echo $pid > /sys/fs/cgroup/cpu/idle/tasks
> done
>
> # perf stat normal tasks
> perf stat -a -p $normal_tasks sleep 5
> pkill -f test
>
>
> You can try the above case and test it with your patch.
>
> Regards,
> Zhang Song

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
  2022-08-23 13:19                       ` Vincent Guittot
@ 2022-08-25  1:57                         ` zhangsong (J)
  0 siblings, 0 replies; 13+ messages in thread
From: zhangsong (J) @ 2022-08-25  1:57 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Abel Wu, mingo, peterz, juri.lelli, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, vschneid, linux-kernel,
	kernel test robot



On 2022/8/23 21:19, Vincent Guittot wrote:
> Hi Zhang,
> 
> On Mon, 22 Aug 2022 at 08:49, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>
>> Hi, Vincent,
>>
>> On 2022/8/20 0:04, Vincent Guittot wrote:
>>> On Fri, 19 Aug 2022 at 14:35, Vincent Guittot
>>> <vincent.guittot@linaro.org> wrote:
>>>>
>>>> Hi Zhang,
>>>>
>>>> On Fri, 19 Aug 2022 at 12:54, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>>>>
>>>>>
>>>>> On 2022/8/18 16:31, Vincent Guittot wrote:
>>>>>> Le jeudi 18 août 2022 à 10:46:55 (+0800), Abel Wu a écrit :
>>>>>>> On 8/17/22 8:58 PM, Vincent Guittot Wrote:
>>>>>>>> On Tue, 16 Aug 2022 at 04:53, zhangsong (J) <zhangsong34@huawei.com> wrote:
>>>>>>>>>
>>>>>> ...
> 
> ...
> 
>>>>>
>>>>> Thanks for your reply.
>>>>> I have tried your patch and run test compared with it, it seems that the
>>>>> patch you provide makes no sense.
>>>>> The test result is below(1000 idle tasks bounds to CPU 0-1 and 10 normal
>>>>> tasks bounds to CPU 1-2):
>>>>>
>>>>> =================================================================
>>>>>
>>>>> Without patch:
>>>>>
>>>>>
>>>>>              6,777.37 msec cpu-clock                 #    1.355 CPUs utilized
>>>>>                20,812      context-switches          #    0.003 M/sec
>>>>>                     0      cpu-migrations            #    0.000 K/sec
>>>>>                     0      page-faults               #    0.000 K/sec
>>>>>        13,333,983,148      cycles                    #    1.967 GHz
>>>>>         6,457,930,305      instructions              #    0.48  insn per cycle
>>>>>         2,125,644,649      branches                  #  313.639 M/sec
>>>>>             1,690,587      branch-misses             #    0.08% of all
>>>>> branches
>>>>>          5.001931983 seconds time elapsed
>>>>>
>>>>> With your patch:
>>>>>
>>>>>
>>>>>              6,791.46 msec cpu-clock                 #    1.358 CPUs utilized
>>>>>                20,996      context-switches          #    0.003 M/sec
>>>>>                     0      cpu-migrations            #    0.000 K/sec
>>>>>                     0      page-faults               #    0.000 K/sec
>>>>>        13,467,573,052      cycles                    #    1.983 GHz
>>>>>         6,516,989,062      instructions              #    0.48  insn per cycle
>>>>>         2,145,139,220      branches                  #  315.858 M/sec
>>>>>             1,751,454      branch-misses             #    0.08% of all
>>>>> branches
>>>>>
>>>>>           5.002274267 seconds time elapsed
>>>>>
>>>>> With my patch:
>>>>>
>>>>>
>>>>>              7,495.14 msec cpu-clock                 #    1.499 CPUs utilized
>>>>>                23,176      context-switches          #    0.003 M/sec
>>>>>                   309      cpu-migrations            #    0.041 K/sec
>>>>>                     0      page-faults               #    0.000 K/sec
>>>>>        14,849,083,489      cycles                    #    1.981 GHz
>>>>>         7,180,832,268      instructions              #    0.48  insn per cycle
>>>>>         2,363,300,644      branches                  #  315.311 M/sec
>>>>>             1,964,169      branch-misses             #    0.08% of all
>>>>> branches
>>>>>
>>>>>           5.001713352 seconds time elapsed
>>>>> ===============================================================
>>>>>
>>>>> Obviously,  when your patch is applied, the cpu-migrations of normal
>>>>> tasks is still 0 and the
>>>>> CPU ulization of normal tasks have no improvement compared with no patch
>>>>> applied.
>>>>> When apply my patch, the cpu-migrations and CPU ulization of normal
>>>>> tasks can both improve.
>>>>> I cannot explain the result with your patch, you also can test it by
>>>>> yourself.
>>>>
>>>> Do you have more details about the test that your are running ?
>>>>
>>>> Do cpu0-2 share their cache ?
>>>> Which kingd of task are the normal and idle tasks ? always running tasks ?
>>>>
>>>> I'm going to try to reproduce your problem locally
>>>
>>> Some details of your UC are missing. I have tried to reproduce your
>>> example above:
>>>       1000 idle tasks bounds to CPU 0-1 and 10 normal tasks bounds to CPU 1-2
>>>
>>> Let assume that for any reason, the 10 normal tasks wake up on CPU 1.
>>> Then, the thousand of idle tasks are moved to CPU0 by load balance and
>>> only normal tasks stay on CPU1. Then load balance will move some
>>> normal tasks to CPU2.
>>>
>>> My only way to reproduce something similar to your example, is to pin
>>> the 1000 idle tasks on CPU1 so they can't move to CPU0. Then I can see
>>> that load balance reaches loop_max limit and gets hard time moving
>>> normal tasks on CPU2. But in this later case, my patch helps to move
>>> normal tasks on CPU2. Something is missing in the description of your
>>> UC.
>>>
>>> Sidenote, I have the same kind of problem with 1000 normal task with
>>> low priority so it's not a matter of idle vs normal tasks
>>>
>>> Regards,
>>> Vincent
>>>
>>
>> Sorry for my slow reply.
>>
>> I have found a test case which can more illustrate this problem
>> accurately. The test case is below.
>>
>> 1. a dead foreach loop process run as normal or idle task
>> $ cat test.c
>> int main(int argc, char **argv)
>> {
>>           int i = 0;
>>           int duration = atoi(argv[1]);
>>
>>           while(1) {
>>                   usleep(duration);
>>                   for(i = 0; i < 100000; i++) {}
>>           }
>> }
>> $ gcc -o test test.c
>>
>> 2. firstly spawn 500 idle tasks which bounds to CPU 0-2
>> 3. secondly spawn 10 normal tasks which also bounds to CPU 0-2
>> 4. lastly spawn 500 idle tasks which bounds to CPU 0 only
>> 5. perf stat normal tasks and get CPU utilization and cpu-migrations
>>
>>
>> Below is the whole test script.
>> $ cat test.sh
>> #!/bin/bash
>>
>> # create normal and idle cgroup path
>> mkdir /sys/fs/cgroup/cpu/normal/
>> mkdir /sys/fs/cgroup/cpu/idle/
> 
> so you put "idle" tasks in a task group and normal tasks in another
> one. But both groups have default weight/share so you lose the idle
> weight for idle tasks and each group will have half the cpus capacity,
> i.e. 1.5 cpus per group. And this is  what I have with the current
> kernel, with your patch or with my patch.
> 
> In fact, there is enough tasks (510) not pinned to cpu0 to make the
> system balance correctly
> 
> With this group hierarchy, the idle tasks will have the same weight
> priority as normal tasks. Is it really what you want ?
> 
> tip kernel:
>            7 673,15 msec task-clock                #    1,534 CPUs
> utilized
>              12 662      context-switches          #    1,650 K/sec
>                   0      cpu-migrations            #    0,000 /sec
>         5,003493176 seconds time elapsed
> 
> your patch:
>            7 488,35 msec task-clock                #    1,497 CPUs
> utilized
>              12 338      context-switches          #    1,648 K/sec
>                   3      cpu-migrations            #    0,401 /sec
>         5,003406005 seconds time elapsed
> 
> my patch
>            7 569,57 msec task-clock                #    1,513 CPUs
> utilized
>              12 460      context-switches          #    1,646 K/sec
>                   0      cpu-migrations            #    0,000 /sec
>         5,003437278 seconds time elapsed
> 
> 

I may have only considered the SCHED_IDLE policy of the idle tasks
without changing their weight. It is true that if the weight of the
idle tasks is changed to the minimum value, my patch will not bring any
benefit.

OK, I need to think about it more.
Thanks a lot.

>>
>> # spawn 500 idle tasks and bind tasks to CPU 0-2
>> for ((i = 0; i < 500; i++))
>> do
>>                  taskset -c 0-2 ./test 200 &
>>                  pid=$!
>>                  # change to SCHED_IDLE policy
>>                  chrt -i -p 0 $pid
>>                  echo $pid > /sys/fs/cgroup/cpu/idle/tasks
>> done
>>
>> # spawn 10 normal tasks and bind tasks to CPU 0-2
>> normal_tasks=
>> for ((i = 0; i < 10; i++))
>> do
>>                  taskset -c 0-2 ./test 500 &
>>                  pid=$!
>>                  normal_tasks+=$pid,
>>                  echo $pid > /sys/fs/cgroup/cpu/normal/tasks
>> done
>>
>> # spawn 500 idle tasks and bind tasks to CPU 0 only
>> for ((i = 0; i < 500; i++))
>> do
>>                  taskset -c 0 ./test 200 &
>>                  pid=$!
>>                  # change to SCHED_IDLE policy
>>                  chrt -i -p 0 $pid
>>                  echo $pid > /sys/fs/cgroup/cpu/idle/tasks
>> done
>>
>> # perf stat normal tasks
>> perf stat -a -p $normal_tasks sleep 5
>> pkill -f test
>>
>>
>> You can try the above case and test it with your patch.
>>
>> Regards,
>> Zhang Song
> .

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-08-25  2:01 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-10  1:56 [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks zhangsong
2022-08-10  4:02 ` Abel Wu
2022-08-10  7:34   ` zhangsong (J)
2022-08-15 11:03     ` Abel Wu
     [not found]       ` <6ae319c0-e6ed-4aad-64b8-d3f6cbea688d@huawei.com>
2022-08-17 12:58         ` Vincent Guittot
2022-08-18  2:46           ` Abel Wu
2022-08-18  8:31             ` Vincent Guittot
2022-08-19 10:54               ` zhangsong (J)
2022-08-19 12:35                 ` Vincent Guittot
2022-08-19 16:04                   ` Vincent Guittot
2022-08-22  6:49                     ` zhangsong (J)
2022-08-23 13:19                       ` Vincent Guittot
2022-08-25  1:57                         ` zhangsong (J)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).