All of lore.kernel.org
 help / color / mirror / Atom feed
From: Joel Fernandes <joel@joelfernandes.org>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Nishanth Aravamudan <naravamudan@digitalocean.com>,
	Julien Desfossez <jdesfossez@digitalocean.com>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Vineeth Pillai <viremana@linux.microsoft.com>,
	Aaron Lu <aaron.lwe@gmail.com>,
	Aubrey Li <aubrey.intel@gmail.com>,
	tglx@linutronix.de, linux-kernel@vger.kernel.org,
	mingo@kernel.org, keescook@chromium.org, kerrnel@google.com,
	Phil Auld <pauld@redhat.com>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	vineeth@bitbyteword.org, Chen Yu <yu.c.chen@intel.com>,
	Christian Brauner <christian.brauner@ubuntu.com>,
	Agata Gruza <agata.gruza@intel.com>,
	Antonio Gomez Iglesias <antonio.gomez.iglesias@intel.com>,
	graf@amazon.com, konrad.wilk@oracle.com, dfaggioli@suse.com,
	pjt@google.com, rostedt@goodmis.org, derkling@google.com,
	benbjiang@tencent.com,
	Alexandre Chartre <alexandre.chartre@oracle.com>,
	James.Bottomley@hansenpartnership.com, OWeisse@umich.edu,
	Dhaval Giani <dhaval.giani@oracle.com>,
	Junaid Shahid <junaids@google.com>,
	jsbarnes@google.com, chris.hyser@oracle.com,
	Aubrey Li <aubrey.li@linux.intel.com>,
	Tim Chen <tim.c.chen@intel.com>
Subject: Re: [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle
Date: Thu, 29 Oct 2020 14:24:29 -0400	[thread overview]
Message-ID: <20201029182429.GA1844482@google.com> (raw)
In-Reply-To: <20201026124724.GT2611@hirez.programming.kicks-ass.net>

On Mon, Oct 26, 2020 at 01:47:24PM +0100, Peter Zijlstra wrote:
[..] 
> How's something like this?
> 
>  - after each pick, such that the pick itself sees the divergence (see
>    above); either:
> 
>     - pull the vruntime_fi forward, when !fi
>     - freeze the vruntime_fi, when newly fi    (A)
> 
>  - either way, update vruntime_fi for each cfs_rq in the active
>    hierarchy.
> 
>  - when comparing, and fi, update the vruntime_fi hierarchy until we
>    encounter a mark from (A), per doing it during the pick, but before
>    runtime, this guarantees it hasn't moved since (A).
> 
> XXX, still buggered on SMT>2, imagine having {ta, tb, fi, i} on an SMT4,
> then when comparing any two tasks that do not involve the fi, we should
> (probably) have pulled them fwd -- but we can't actually pull them,
> because then the fi thing would break, mooo.
> 
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -115,19 +115,8 @@ static inline bool prio_less(struct task
>  	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
>  		return !dl_time_before(a->dl.deadline, b->dl.deadline);
>  
> -	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
> -		u64 vruntime = b->se.vruntime;
> -
> -		/*
> -		 * Normalize the vruntime if tasks are in different cpus.
> -		 */
> -		if (task_cpu(a) != task_cpu(b)) {
> -			vruntime -= task_cfs_rq(b)->min_vruntime;
> -			vruntime += task_cfs_rq(a)->min_vruntime;
> -		}
> -
> -		return !((s64)(a->se.vruntime - vruntime) <= 0);
> -	}
> +	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
> +		return cfs_prio_less(a, b);
>  
>  	return false;
>  }
> @@ -4642,12 +4631,15 @@ pick_task(struct rq *rq, const struct sc
>  	return cookie_pick;
>  }
>  
> +extern void task_vruntime_update(struct rq *rq, struct task_struct *p);
> +
>  static struct task_struct *
>  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  {
>  	struct task_struct *next, *max = NULL;
>  	const struct sched_class *class;
>  	const struct cpumask *smt_mask;
> +	bool fi_before = false;
>  	bool need_sync;
>  	int i, j, cpu;
>  
> @@ -4707,6 +4699,7 @@ pick_next_task(struct rq *rq, struct tas
>  	need_sync = !!rq->core->core_cookie;
>  	if (rq->core->core_forceidle) {
>  		need_sync = true;
> +		fi_before = true;
>  		rq->core->core_forceidle = false;
>  	}
>  
> @@ -4757,6 +4750,11 @@ pick_next_task(struct rq *rq, struct tas
>  				continue;
>  
>  			rq_i->core_pick = p;
> +			if (rq_i->idle == p && rq_i->nr_running) {
> +				rq->core->core_forceidle = true;
> +				if (!fi_before)
> +					rq->core->core_forceidle_seq++;
> +			}
>  
>  			/*
>  			 * If this new candidate is of higher priority than the
> @@ -4775,6 +4773,7 @@ pick_next_task(struct rq *rq, struct tas
>  				max = p;
>  
>  				if (old_max) {
> +					rq->core->core_forceidle = false;
>  					for_each_cpu(j, smt_mask) {
>  						if (j == i)
>  							continue;
> @@ -4823,10 +4822,8 @@ pick_next_task(struct rq *rq, struct tas
>  		if (!rq_i->core_pick)
>  			continue;
>  
> -		if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
> -		    !rq_i->core->core_forceidle) {
> -			rq_i->core->core_forceidle = true;
> -		}
> +		if (!(fi_before && rq->core->core_forceidle))
> +			task_vruntime_update(rq_i, rq_i->core_pick);

Shouldn't this be:

	if (!fi_before && rq->core->core_forceidle)
			task_vruntime_update(rq_i, rq_i->core_pick);

?

>  
>  		if (i == cpu) {
>  			rq_i->core_pick = NULL;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10686,6 +10686,67 @@ static inline void task_tick_core(struct
>  	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
>  		resched_curr(rq);
>  }
> +
> +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
> +{
> +	for_each_sched_entity(se) {
> +		struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +		if (forceidle) {
> +			if (cfs_rq->forceidle_seq == fi_seq)
> +				break;
> +			cfs_rq->forceidle_seq = fi_seq;
> +		}
> +
> +		cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
> +	}
> +}
> +
> +void task_vruntime_update(struct rq *rq, struct task_struct *p)
> +{
> +	struct sched_entity *se = &p->se;
> +
> +	if (p->sched_class != &fair_sched_class)
> +		return;
> +
> +	se_fi_update(se, rq->core->core_forceidle_seq, rq->core->core_forceidle);
> +}
> +
> +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> +	struct rq *rq = task_rq(a);
> +	struct sched_entity *sea = &a->se;
> +	struct sched_entity *seb = &b->se;
> +	struct cfs_rq *cfs_rqa;
> +	struct cfs_rq *cfs_rqb;
> +	s64 delta;
> +
> +	SCHED_WARN_ON(task_rq(b)->core != rq->core);
> +
> +	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
> +		int sea_depth = sea->depth;
> +		int seb_depth = seb->depth;
> +
> +		if (sea_depth >= seb_depth)
> +			sea = parent_entity(sea);
> +		if (sea_depth <= seb_depth)
> +			seb = parent_entity(seb);
> +	}
> +
> +	if (rq->core->core_forceidle) {
> +		se_fi_update(sea, rq->core->core_forceidle_seq, true);
> +		se_fi_update(seb, rq->core->core_forceidle_seq, true);
> +	}

As we chatted on IRC you mentioned the reason for the sync here is:

 say we have 2 cgroups (a,b) under root, and we go force-idle in a, then we
 update a and root. Then we pick and end up in b, but b hasn't been updated
 yet.

One thing I was wondering about that: if the pick of 'b' happens much
later than that of 'a', then the snapshot might be happening too late, right?

Maybe the snapshot should happen on all cfs_rqs on all siblings in
pick_next_task() itself? That way everything gets updated at the instant the
force-idle started. Though that may be a bit slower.

thanks,

 - Joel


  parent reply	other threads:[~2020-10-29 18:24 UTC|newest]

Thread overview: 98+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-20  1:43 [PATCH v8 -tip 00/26] Core scheduling Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 01/26] sched: Wrap rq::lock access Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 02/26] sched: Introduce sched_class::pick_task() Joel Fernandes (Google)
2020-10-22  7:59   ` Li, Aubrey
2020-10-22 15:25     ` Joel Fernandes
2020-10-23  5:25       ` Li, Aubrey
2020-10-23 21:47         ` Joel Fernandes
2020-10-24  2:48           ` Li, Aubrey
2020-10-24 11:10             ` Vineeth Pillai
2020-10-24 12:27               ` Vineeth Pillai
2020-10-24 23:48                 ` Li, Aubrey
2020-10-26  9:01                 ` Peter Zijlstra
2020-10-27  3:17                   ` Li, Aubrey
2020-10-27 14:19                   ` Joel Fernandes
2020-10-27 15:23                     ` Joel Fernandes
2020-10-27 14:14                 ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 03/26] sched: Core-wide rq->lock Joel Fernandes (Google)
2020-10-26 11:59   ` Peter Zijlstra
2020-10-27 16:27     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 04/26] sched/fair: Add a few assertions Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 05/26] sched: Basic tracking of matching tasks Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 06/26] sched: Add core wide task selection and scheduling Joel Fernandes (Google)
2020-10-23 13:51   ` Peter Zijlstra
2020-10-23 13:54     ` Peter Zijlstra
2020-10-23 17:57       ` Joel Fernandes
2020-10-23 19:26         ` Peter Zijlstra
2020-10-23 21:31           ` Joel Fernandes
2020-10-26  8:28             ` Peter Zijlstra
2020-10-27 16:58               ` Joel Fernandes
2020-10-26  9:31             ` Peter Zijlstra
2020-11-05 18:50               ` Joel Fernandes
2020-11-05 22:07                 ` Joel Fernandes
2020-10-23 15:05   ` Peter Zijlstra
2020-10-23 17:59     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 07/26] sched/fair: Fix forced idle sibling starvation corner case Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle Joel Fernandes (Google)
2020-10-26 12:47   ` Peter Zijlstra
2020-10-28 15:29     ` Joel Fernandes
2020-10-28 18:39     ` Joel Fernandes
2020-10-29 16:59     ` Joel Fernandes
2020-10-29 18:24     ` Joel Fernandes [this message]
2020-10-29 18:59       ` Peter Zijlstra
2020-10-30  2:36         ` Joel Fernandes
2020-10-30  2:42           ` Joel Fernandes
2020-10-30  8:41             ` Peter Zijlstra
2020-10-31 21:41               ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 09/26] sched: Trivial forced-newidle balancer Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 10/26] sched: migration changes for core scheduling Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 11/26] irq_work: Cleanup Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 12/26] arch/x86: Add a new TIF flag for untrusted tasks Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode Joel Fernandes (Google)
2020-10-20  3:41   ` Randy Dunlap
2020-11-03  0:20     ` Joel Fernandes
2020-10-22  5:48   ` Li, Aubrey
2020-11-03  0:50     ` Joel Fernandes
2020-10-30 10:29   ` Alexandre Chartre
2020-11-03  1:20     ` Joel Fernandes
2020-11-06 16:57       ` Alexandre Chartre
2020-11-06 17:43         ` Joel Fernandes
2020-11-06 18:07           ` Alexandre Chartre
2020-11-10  9:35       ` Alexandre Chartre
2020-11-10 22:42         ` Joel Fernandes
2020-11-16 10:08           ` Alexandre Chartre
2020-11-16 14:50             ` Joel Fernandes
2020-11-16 15:43               ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 14/26] entry/idle: Enter and exit kernel protection during idle entry and exit Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 15/26] entry/kvm: Protect the kernel when entering from guest Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 16/26] sched: cgroup tagging interface for core scheduling Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 17/26] sched: Split the cookie and setup per-task cookie on fork Joel Fernandes (Google)
2020-11-04 22:30   ` chris hyser
2020-11-05 14:49     ` Joel Fernandes
2020-11-09 23:30     ` chris hyser
2020-10-20  1:43 ` [PATCH v8 -tip 18/26] sched: Add a per-thread core scheduling interface Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 19/26] sched: Add a second-level tag for nested CGroup usecase Joel Fernandes (Google)
2020-10-31  0:42   ` Josh Don
2020-11-03  2:54     ` Joel Fernandes
     [not found]   ` <6c07e70d-52f2-69ff-e1fa-690cd2c97f3d@linux.intel.com>
2020-11-05 15:52     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 20/26] sched: Release references to the per-task cookie on exit Joel Fernandes (Google)
2020-11-04 21:50   ` chris hyser
2020-11-05 15:46     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 21/26] sched: Handle task addition to CGroup Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 22/26] sched/debug: Add CGroup node for printing group cookie if SCHED_DEBUG Joel Fernandes (Google)
2020-10-20  1:43 ` [PATCH v8 -tip 23/26] kselftest: Add tests for core-sched interface Joel Fernandes (Google)
2020-10-30 22:20   ` [PATCH] sched: Change all 4 space tabs to actual tabs John B. Wyatt IV
2020-10-20  1:43 ` [PATCH v8 -tip 24/26] sched: Move core-scheduler interfacing code to a new file Joel Fernandes (Google)
2020-10-26  1:05   ` Li, Aubrey
2020-11-03  2:58     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 25/26] Documentation: Add core scheduling documentation Joel Fernandes (Google)
2020-10-20  3:36   ` Randy Dunlap
2020-11-12 16:11     ` Joel Fernandes
2020-10-20  1:43 ` [PATCH v8 -tip 26/26] sched: Debug bits Joel Fernandes (Google)
2020-10-30 13:26 ` [PATCH v8 -tip 00/26] Core scheduling Ning, Hongyu
2020-11-06  2:58   ` Li, Aubrey
2020-11-06 17:54     ` Joel Fernandes
2020-11-09  6:04       ` Li, Aubrey
2020-11-06 20:55 ` [RFT for v9] (Was Re: [PATCH v8 -tip 00/26] Core scheduling) Joel Fernandes
2020-11-13  9:22   ` Ning, Hongyu
2020-11-13 10:01     ` Ning, Hongyu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201029182429.GA1844482@google.com \
    --to=joel@joelfernandes.org \
    --cc=James.Bottomley@hansenpartnership.com \
    --cc=OWeisse@umich.edu \
    --cc=aaron.lwe@gmail.com \
    --cc=agata.gruza@intel.com \
    --cc=alexandre.chartre@oracle.com \
    --cc=antonio.gomez.iglesias@intel.com \
    --cc=aubrey.intel@gmail.com \
    --cc=aubrey.li@linux.intel.com \
    --cc=benbjiang@tencent.com \
    --cc=chris.hyser@oracle.com \
    --cc=christian.brauner@ubuntu.com \
    --cc=derkling@google.com \
    --cc=dfaggioli@suse.com \
    --cc=dhaval.giani@oracle.com \
    --cc=graf@amazon.com \
    --cc=jdesfossez@digitalocean.com \
    --cc=jsbarnes@google.com \
    --cc=junaids@google.com \
    --cc=keescook@chromium.org \
    --cc=kerrnel@google.com \
    --cc=konrad.wilk@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@techsingularity.net \
    --cc=mingo@kernel.org \
    --cc=naravamudan@digitalocean.com \
    --cc=pauld@redhat.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pjt@google.com \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@intel.com \
    --cc=tim.c.chen@linux.intel.com \
    --cc=valentin.schneider@arm.com \
    --cc=vineeth@bitbyteword.org \
    --cc=viremana@linux.microsoft.com \
    --cc=yu.c.chen@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.