Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current entity during wakeup preemption

From: K Prateek Nayak <kprateek.nayak@amd.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Valentin Schneider <vschneid@redhat.com>,
	linux-kernel@vger.kernel.org,
	Tobias Huschle <huschle@linux.ibm.com>,
	Luis Machado <luis.machado@arm.com>,
	Chen Yu <yu.c.chen@intel.com>, Abel Wu <wuyun.abel@bytedance.com>,
	Tianchen Ding <dtcccc@linux.alibaba.com>,
	Youssef Esmat <youssefesmat@chromium.org>,
	Xuewen Yan <xuewen.yan94@gmail.com>,
	"Gautham R. Shenoy" <gautham.shenoy@amd.com>
Subject: Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current entity during wakeup preemption
Date: Thu, 25 Apr 2024 09:04:02 +0530	[thread overview]
Message-ID: <8c4de476-bbd9-840f-4c23-4b84f3c7986a@amd.com> (raw)
In-Reply-To: <20240424150721.GQ30852@noisy.programming.kicks-ass.net>

Hello Peter,

On 4/24/2024 8:37 PM, Peter Zijlstra wrote:
> On Mon, Mar 25, 2024 at 11:32:26AM +0530, K Prateek Nayak wrote:
>> With the curr entity's eligibility check, a wakeup preemption is very
>> likely when an entity with positive lag joins the runqueue pushing the
>> avg_vruntime of the runqueue backwards, making the vruntime of the
>> current entity ineligible. This leads to aggressive wakeup preemption
>> which was previously guarded by wakeup_granularity_ns in legacy CFS.
>> Below figure depicts one such aggressive preemption scenario with EEVDF
>> in DeathStarBench [1]:
>>
>> 				deadline for Nginx
>> 		   |
>> 	+-------+  |			|
>>     /-- | Nginx | -|------------------> |
>>     |	+-------+  |			|
>>     |		   |
>>     |	-----------|-------------------------------> vruntime timeline
>>     |		   \--> rq->avg_vruntime
>>     |
>>     | 	wakes service on the same runqueue since system is busy
>>     |
>>     |	+---------+|
>>     \-->| Service || (service has +ve lag pushes avg_vruntime backwards)
>> 	+---------+|
>> 	  |	   |
>>    wakeup |	+--|-----+			 |
>>  preempts \---->| N|ginx | --------------------> | {deadline for Nginx}
>> 		+--|-----+  			 |
>> 	   (Nginx ineligible)
>> 	-----------|-------------------------------> vruntime timeline
>> 		   \--> rq->avg_vruntime
> 
> This graph is really hard to interpret. If you want to illustrate
> avg_vruntime moves back, you should not align it. That's really
> disorienting.
> 
> In both (upper and lower) nginx has the same vruntime thus *that* should
> be aligned. The lower will have service placed left and with that
> avg_vruntime also moves left, rendering nginx in-eligible.

Sorry about that. I'll keep this in mind for for future illustrations.

> 
> 
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> ---
>>  kernel/sched/fair.c     | 24 ++++++++++++++++++++----
>>  kernel/sched/features.h |  1 +
>>  2 files changed, 21 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 6a16129f9a5c..a9b145a4eab0 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -875,7 +875,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>>   *
>>   * Which allows tree pruning through eligibility.
>>   */
>> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool wakeup_preempt)
>>  {
>>  	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>>  	struct sched_entity *se = __pick_first_entity(cfs_rq);
>> @@ -889,7 +889,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>>  	if (cfs_rq->nr_running == 1)
>>  		return curr && curr->on_rq ? curr : se;
>>  
>> -	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
>> +	if (curr && !curr->on_rq)
>> +		curr = NULL;
>> +
>> +	/*
>> +	 * When an entity with positive lag wakes up, it pushes the
>> +	 * avg_vruntime of the runqueue backwards. This may causes the
>> +	 * current entity to be ineligible soon into its run leading to
>> +	 * wakeup preemption.
>> +	 *
>> +	 * To prevent such aggressive preemption of the current running
>> +	 * entity during task wakeups, skip the eligibility check if the
>> +	 * slice promised to the entity since its selection has not yet
>> +	 * elapsed.
>> +	 */
>> +	if (curr &&
>> +	    !(sched_feat(RUN_TO_PARITY_WAKEUP) && wakeup_preempt && curr->vlag == curr->deadline) &&
>> +	    !entity_eligible(cfs_rq, curr))
>>  		curr = NULL;
>>  
>>  	/*
> 
> So I see what you want to do, but this is highly unreadable.
> 
> I'll try something like the below on top of queue/sched/eevdf, but I
> should probably first look at fixing those reported crashes on that tree
> :/

I'll give this a try since Mike's suggestion seems to have fixed the
crash I was observing :) Thank you for suggesting this alternative.
--
Thanks and Regards,
Prateek

> 
> ---
>  kernel/sched/fair.c     | 60 ++++++++++++++++++++++++++++++++++---------------
>  kernel/sched/features.h | 11 +++++----
>  2 files changed, 49 insertions(+), 22 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a9206d8532f..23977ed1cb2c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -855,6 +855,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>  	return __node_2_se(left);
>  }
>  
> +static inline bool pick_curr(struct cfs_rq *cfs_rq,
> +			     struct sched_entity *curr, struct sched_entity *wakee)
> +{
> +	/*
> +	 * Nothing to preserve...
> +	 */
> +	if (!curr || !sched_feat(RESPECT_SLICE))
> +		return false;
> +
> +	/*
> +	 * Allow preemption at the 0-lag point -- even if not all of the slice
> +	 * is consumed. Note: placement of positive lag can push V left and render
> +	 * @curr instantly ineligible irrespective the time on-cpu.
> +	 */
> +	if (sched_feat(RUN_TO_PARITY) && !entity_eligible(cfs_rq, curr))
> +		return false;
> +
> +	/*
> +	 * Don't preserve @curr when the @wakee has a shorter slice and earlier
> +	 * deadline. IOW, explicitly allow preemption.
> +	 */
> +	if (sched_feat(PREEMPT_SHORT) && wakee &&
> +	    wakee->slice < curr->slice &&
> +	    (s64)(wakee->deadline - curr->deadline) < 0)
> +		return false;
> +
> +	/*
> +	 * Preserve @curr to allow it to finish its first slice.
> +	 * See the HACK in set_next_entity().
> +	 */
> +	return curr->vlag == curr->deadline;
> +}
> +
>  /*
>   * Earliest Eligible Virtual Deadline First
>   *
> @@ -874,28 +907,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>   *
>   * Which allows tree pruning through eligibility.
>   */
> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *wakee)
>  {
>  	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>  	struct sched_entity *se = __pick_first_entity(cfs_rq);
>  	struct sched_entity *curr = cfs_rq->curr;
>  	struct sched_entity *best = NULL;
>  
> +	if (curr && !curr->on_rq)
> +		curr = NULL;
> +
>  	/*
>  	 * We can safely skip eligibility check if there is only one entity
>  	 * in this cfs_rq, saving some cycles.
>  	 */
>  	if (cfs_rq->nr_running == 1)
> -		return curr && curr->on_rq ? curr : se;
> -
> -	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> -		curr = NULL;
> +		return curr ?: se;
>  
>  	/*
> -	 * Once selected, run a task until it either becomes non-eligible or
> -	 * until it gets a new slice. See the HACK in set_next_entity().
> +	 * Preserve @curr to let it finish its slice.
>  	 */
> -	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
> +	if (pick_curr(cfs_rq, curr, wakee))
>  		return curr;
>  
>  	/* Pick the leftmost entity if it's eligible */
> @@ -5507,7 +5539,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
>  		return cfs_rq->next;
>  	}
>  
> -	struct sched_entity *se = pick_eevdf(cfs_rq);
> +	struct sched_entity *se = pick_eevdf(cfs_rq, NULL);
>  	if (se->sched_delayed) {
>  		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
>  		SCHED_WARN_ON(se->sched_delayed);
> @@ -8548,15 +8580,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	cfs_rq = cfs_rq_of(se);
>  	update_curr(cfs_rq);
>  
> -	if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice &&
> -	    entity_eligible(cfs_rq, pse) &&
> -	    (s64)(pse->deadline - se->deadline) < 0 &&
> -	    se->vlag == se->deadline) {
> -		/* negate RUN_TO_PARITY */
> -		se->vlag = se->deadline - 1;
> -	}
> -
> -	if (pick_eevdf(cfs_rq) == pse)
> +	if (pick_eevdf(cfs_rq, pse) == pse)
>  		goto preempt;
>  
>  	return;
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 64ce99cf04ec..2285dc30294c 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -10,12 +10,15 @@ SCHED_FEAT(PLACE_LAG, true)
>   */
>  SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
>  /*
> - * Inhibit (wakeup) preemption until the current task has either matched the
> - * 0-lag point or until is has exhausted it's slice.
> + * Inhibit (wakeup) preemption until the current task has exhausted its slice.
>   */
> -SCHED_FEAT(RUN_TO_PARITY, true)
> +SCHED_FEAT(RESPECT_SLICE, true)
>  /*
> - * Allow tasks with a shorter slice to disregard RUN_TO_PARITY
> + * Relax RESPECT_SLICE to allow preemption once current has reached 0-lag.
> + */
> +SCHED_FEAT(RUN_TO_PARITY, false)
> +/*
> + * Allow tasks with a shorter slice to disregard RESPECT_SLICE
>   */
>  SCHED_FEAT(PREEMPT_SHORT, true)
>