From: Abel Wu <wuyun.abel@bytedance.com>
To: zhangsong <zhangsong34@huawei.com>,
	mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org
Cc: dietmar.eggemann@arm.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, bristot@redhat.com,
	vschneid@redhat.com, linux-kernel@vger.kernel.org,
	kernel test robot <lkp@intel.com>
Subject: Re: [PATCH v2] sched/fair: Introduce priority load balance to reduce interference from IDLE tasks
Date: Wed, 10 Aug 2022 12:02:05 +0800	[thread overview]
Message-ID: <b62804cb-2b60-a534-5096-56785a1940bd@bytedance.com> (raw)
In-Reply-To: <20220810015636.3865248-1-zhangsong34@huawei.com>

Hi Zhang Song,

On 8/10/22 9:56 AM, zhangsong wrote:
> When NORMAL and IDLE tasks are co-located and CFS triggers load
> balance, it is reasonable to prefer migrating NORMAL
> (latency-sensitive) tasks from the busy src CPU to the dst CPU,
> and to migrate IDLE tasks last.

Considering the large weight difference between normal and idle tasks,
does the re-ordering really change things? It would be helpful if you
could offer more detailed info.
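
To put rough numbers on that weight gap, here is a back-of-envelope
userspace sketch (using the default values of sched_prio_to_weight[]
and WEIGHT_IDLEPRIO from kernel/sched/core.c):

	#include <stdio.h>

	int main(void)
	{
		unsigned long nice0_weight = 1024;	/* sched_prio_to_weight[20] */
		unsigned long idle_weight = 3;		/* WEIGHT_IDLEPRIO */

		/* one nice-0 task weighs as much as ~341 SCHED_IDLE tasks */
		printf("ratio = %lu\n", nice0_weight / idle_weight);
		return 0;
	}

Each SCHED_IDLE task contributes very little to env->imbalance, so it
is not obvious that the scan order is what hurts; data showing where
the current behavior actually falls short would strengthen the
changelog.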

> 
> This is very important for reducing interference from IDLE tasks.
> So CFS load balancing can be optimized as below:
> 
> 1. The `cfs_tasks` list of a CPU rq holds its NORMAL tasks.
> 2. A new `cfs_idle_tasks` list of a CPU rq holds its IDLE tasks.
> 3. Prefer to migrate NORMAL tasks from cfs_tasks to the dst CPU.
> 4. Migrate IDLE tasks from cfs_idle_tasks to the dst CPU last.
> 
> This was tested with the following reproduction:
> - a small number of NORMAL tasks co-located with a large number of
>   IDLE tasks
> 
> With this patch, NORMAL task latency can be reduced by about
> 5~10% compared with the current behavior.
> 
> Signed-off-by: zhangsong <zhangsong34@huawei.com>
> Reported-by: kernel test robot <lkp@intel.com>

The Reported-by tag is usually used for reporting a bug in the
mainline kernel, and a build error in your own patch is not one of
those :)

> ---
> V1->V2:
> - fix build test error
> ---
>   kernel/sched/core.c  |  1 +
>   kernel/sched/fair.c  | 45 ++++++++++++++++++++++++++++++++++++++++----
>   kernel/sched/sched.h |  1 +
>   3 files changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ee28253c9ac0..7325c6e552d8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9733,6 +9733,7 @@ void __init sched_init(void)
>   		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
>   
>   		INIT_LIST_HEAD(&rq->cfs_tasks);
> +		INIT_LIST_HEAD(&rq->cfs_idle_tasks);
>   
>   		rq_attach_root(rq, &def_root_domain);
>   #ifdef CONFIG_NO_HZ_COMMON
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 914096c5b1ae..b62bec5b1eb9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
>   
>   #endif /* CONFIG_NUMA_BALANCING */
>   
> +#ifdef CONFIG_SMP
> +static void
> +adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
> +	struct rq *rq,
> +	struct sched_entity *se)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +	if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))

The tg_is_idle() check does not make a hierarchical judgement on
parent task groups, while rq->cfs{,_idle}_tasks is rq-wide. Say A->B
where tgA is idle and tgB isn't: a task from B will be added to the
non-idle list. Is this what you want?
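
If hierarchical semantics is what you want, an ancestor walk along
tg->parent would catch the A->B case. Untested sketch, and the helper
name is made up:

	/* sketch: treat a tg as idle if it or any ancestor is idle */
	static bool tg_is_idle_hierarchy(struct task_group *tg)
	{
		for (; tg; tg = tg->parent) {
			if (tg_is_idle(tg))
				return true;
		}
		return false;
	}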

> +		(*list_op)(&se->group_node, &rq->cfs_idle_tasks);
> +	else
> +		(*list_op)(&se->group_node, &rq->cfs_tasks);
> +}
> +#endif
> +
>   static void
>   account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   {
> @@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   		struct rq *rq = rq_of(cfs_rq);
>   
>   		account_numa_enqueue(rq, task_of(se));
> -		list_add(&se->group_node, &rq->cfs_tasks);
> +		adjust_rq_cfs_tasks(list_add, rq, se);
>   	}
>   #endif
>   	cfs_rq->nr_running++;
> @@ -7465,7 +7480,7 @@ done: __maybe_unused;
>   	 * the list, so our cfs_tasks list becomes MRU
>   	 * one.
>   	 */
> -	list_move(&p->se.group_node, &rq->cfs_tasks);
> +	adjust_rq_cfs_tasks(list_move, rq, &p->se);
>   #endif
>   
>   	if (hrtick_enabled_fair(rq))
> @@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   	if (unlikely(task_has_idle_policy(p)))
>   		return 0;
>   
> +	if (tg_is_idle(cfs_rq_of(&p->se)->tg))
> +		return 0;
> +

Same as above. But I am not sure this is the right way to do it. We
still want to maintain policy behavior inside an idle task group: a
SCHED_NORMAL task there should arguably still be considered hotter
than its SCHED_IDLE siblings, but this check makes them all equally
cache-cold.

>   	/* SMT siblings share cache */
>   	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
>   		return 0;
> @@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>   			 &p->se == cfs_rq_of(&p->se)->last))
>   		return 1;
>   
> +	/* Preempt sched idle cpu do not consider migration cost */
> +	if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
> +	    sched_idle_cpu(env->dst_cpu))
> +		return 0;
> +
>   	if (sysctl_sched_migration_cost == -1)
>   		return 1;
>   
> @@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
>   static struct task_struct *detach_one_task(struct lb_env *env)
>   {
>   	struct task_struct *p;
> +	struct list_head *tasks = &env->src_rq->cfs_tasks;
> +	int loop = 0;

Maybe a boolean variable is enough (and more readable)?
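
Something like the below (sketch only, same control flow as your
patch, with a made-up name for the flag):

	struct list_head *tasks = &env->src_rq->cfs_tasks;
	bool scanned_idle = false;
	...
	if (!scanned_idle) {
		scanned_idle = true;
		tasks = &env->src_rq->cfs_idle_tasks;
		goto again;
	}
	return NULL;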

Thanks,
Abel

>   
>   	lockdep_assert_rq_held(env->src_rq);
>   
> +again:
>   	list_for_each_entry_reverse(p,
> -			&env->src_rq->cfs_tasks, se.group_node) {
> +			tasks, se.group_node) {
>   		if (!can_migrate_task(p, env))
>   			continue;
>   
> @@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
>   		schedstat_inc(env->sd->lb_gained[env->idle]);
>   		return p;
>   	}
> +	if (++loop == 1) {
> +		tasks = &env->src_rq->cfs_idle_tasks;
> +		goto again;
> +	}
>   	return NULL;
>   }
>   
> @@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
>   	unsigned long util, load;
>   	struct task_struct *p;
>   	int detached = 0;
> +	int loop = 0;
>   
>   	lockdep_assert_rq_held(env->src_rq);
>   
> @@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
>   	if (env->imbalance <= 0)
>   		return 0;
>   
> +again:
>   	while (!list_empty(tasks)) {
>   		/*
>   		 * We don't want to steal all, otherwise we may be treated likewise,
> @@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
>   		list_move(&p->se.group_node, tasks);
>   	}
>   
> +	if (env->imbalance > 0 && ++loop == 1) {
> +		tasks = &env->src_rq->cfs_idle_tasks;
> +		goto again;
> +	}
> +
>   	/*
>   	 * Right now, this is one of only two places we collect this stat
>   	 * so we can safely collect detach_one_task() stats here rather
> @@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>   		 * Move the next running task to the front of the list, so our
>   		 * cfs_tasks list becomes MRU one.
>   		 */
> -		list_move(&se->group_node, &rq->cfs_tasks);
> +		adjust_rq_cfs_tasks(list_move, rq, se);
>   	}
>   #endif
>   
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e26688d387ae..accb4eea9769 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1068,6 +1068,7 @@ struct rq {
>   	int			online;
>   
>   	struct list_head cfs_tasks;
> +	struct list_head cfs_idle_tasks;
>   
>   	struct sched_avg	avg_rt;
>   	struct sched_avg	avg_dl;
