From: bsegall@google.com
To: Peter Zijlstra <peterz@infradead.org>
Cc: tglx@linutronix.de, mingo@kernel.org,
	linux-kernel@vger.kernel.org, Paul Turner <pjt@google.com>,
	Roman Gushchin <klamm@yandex-team.ru>
Subject: Re: [PATCH 2/3] sched: Cleanup bandwidth timers
Date: Thu, 16 Apr 2015 13:03:33 -0700
Message-ID: <xm26a8y796y2.fsf@sword-of-the-dawn.mtv.corp.google.com>
In-Reply-To: <20150415095011.804589208@infradead.org> (Peter Zijlstra's message of "Wed, 15 Apr 2015 11:41:57 +0200")

Peter Zijlstra <peterz@infradead.org> writes:

> Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().
>
> The more I look at that code the more I'm convinced it's crack; that
> entire __start_cfs_bandwidth() thing is brain melting. We don't need
> to cancel a timer before starting it: *hrtimer_start*() will happily
> remove the timer for you if it's still enqueued.
>
> Removing that removes a big part of the problem; no more ugly cancel
> loop to get stuck in.
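
(For anyone reading along: this means the old cancel-then-restart dance
reduces to a bare restart. A minimal sketch, assuming the caller
already holds cfs_b->lock:

	/* no hrtimer_try_to_cancel() loop needed; a still-queued
	 * timer is simply dequeued and requeued at its expiry */
	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
)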
>
> So now, if I understand things right, the entire reason you have this
> cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
> accidentally loose the timer.
>
> It appears to me that it should be possible to guarantee the same by
> unconditionally (re)starting the timer when !queued: regardless of
> what hrtimer::function returns, if we beat it to (re)enqueueing the
> timer, it doesn't matter.
>
> Now, because hrtimers don't come with any serialization guarantees,
> we must ensure that the handler and the (re)start loop serialize
> their access to the hrtimer, to avoid both trying to forward the
> timer at the same time.
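
(To spell the locking pattern out: a minimal sketch, with struct
my_bandwidth and the function names invented, but using the same calls
the patch does. The handler and the (re)start path take the same lock
around hrtimer_forward_now(), so only one of them ever advances the
expiry:

	struct my_bandwidth {
		raw_spinlock_t	lock;
		ktime_t		period;
		struct hrtimer	period_timer;
	};

	static enum hrtimer_restart my_period_timer(struct hrtimer *timer)
	{
		struct my_bandwidth *b =
			container_of(timer, struct my_bandwidth, period_timer);
		int idle = 0;
		int overrun;

		raw_spin_lock(&b->lock);
		for (;;) {
			/* serialized against my_start_bandwidth() by b->lock */
			overrun = hrtimer_forward_now(timer, b->period);
			if (!overrun)
				break;
			/* ... do the per-period work, set idle ... */
		}
		raw_spin_unlock(&b->lock);

		return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
	}

	/* caller must hold b->lock */
	static void my_start_bandwidth(struct my_bandwidth *b)
	{
		/*
		 * Only forward an inactive timer: forwarding an active
		 * one would eat an overrun the handler has yet to see.
		 */
		if (!hrtimer_active(&b->period_timer))
			hrtimer_forward_now(&b->period_timer, b->period);

		hrtimer_start_expires(&b->period_timer,
				      HRTIMER_MODE_ABS_PINNED);
	}
)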
>
> Update the rt bandwidth timer to match.
>
> This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
> tg_set_cfs_bandwidth() deadlock on rq->lock").
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Paul Turner <pjt@google.com>
> Cc: Ben Segall <bsegall@google.com>
> Reported-by: Roman Gushchin <klamm@yandex-team.ru>

Reviewed-by: Ben Segall <bsegall@google.com>

So much nicer. Docs/comment issue only: s/loose/lose/
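
To spell out the overrun that comment protects, with invented numbers:
say cfs_b->period is 100ms and the timer's expiry is 250ms in the past.
Then:

	/*
	 * hrtimer_forward_now() advances the expiry in whole periods
	 * until it is in the future: here by 300ms, returning
	 * overrun = 3. If a (re)start path had already forwarded the
	 * active timer, the handler would see overrun == 0 and skip
	 * those three refills.
	 */
	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);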


> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c  |   15 ++++++------
>  kernel/sched/fair.c  |   59 ++++++++++++---------------------------------------
>  kernel/sched/rt.c    |   14 +++++-------
>  kernel/sched/sched.h |    4 +--
>  4 files changed, 31 insertions(+), 61 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -92,10 +92,13 @@
>  
>  void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
>  {
> -	if (hrtimer_active(period_timer))
> -		return;
> +	/*
> +	 * Do not forward the expiration time of active timers;
> +	 * we do not want to loose an overrun.
> +	 */
> +	if (!hrtimer_active(period_timer))
> +		hrtimer_forward_now(period_timer, period);
>  
> -	hrtimer_forward_now(period_timer, period);
>  	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
>  }
>  
> @@ -8098,10 +8101,8 @@ static int tg_set_cfs_bandwidth(struct t
>  
>  	__refill_cfs_bandwidth_runtime(cfs_b);
>  	/* restart the period timer (if active) to handle new period expiry */
> -	if (runtime_enabled && cfs_b->timer_active) {
> -		/* force a reprogram */
> -		__start_cfs_bandwidth(cfs_b, true);
> -	}
> +	if (runtime_enabled)
> +		start_cfs_bandwidth(cfs_b);
>  	raw_spin_unlock_irq(&cfs_b->lock);
>  
>  	for_each_online_cpu(i) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct
>  	if (cfs_b->quota == RUNTIME_INF)
>  		amount = min_amount;
>  	else {
> -		/*
> -		 * If the bandwidth pool has become inactive, then at least one
> -		 * period must have elapsed since the last consumption.
> -		 * Refresh the global state and ensure bandwidth timer becomes
> -		 * active.
> -		 */
> -		if (!cfs_b->timer_active) {
> -			__refill_cfs_bandwidth_runtime(cfs_b);
> -			__start_cfs_bandwidth(cfs_b, false);
> -		}
> +		start_cfs_bandwidth(cfs_b);
>  
>  		if (cfs_b->runtime > 0) {
>  			amount = min(cfs_b->runtime, min_amount);
> @@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_r
>  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>  	struct sched_entity *se;
>  	long task_delta, dequeue = 1;
> +	bool empty;
>  
>  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>  
> @@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_r
>  	cfs_rq->throttled = 1;
>  	cfs_rq->throttled_clock = rq_clock(rq);
>  	raw_spin_lock(&cfs_b->lock);
> +	empty = list_empty(&cfs_rq->throttled_list);
> +
>  	/*
>  	 * Add to the _head_ of the list, so that an already-started
>  	 * distribute_cfs_runtime will not see us
>  	 */
>  	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -	if (!cfs_b->timer_active)
> -		__start_cfs_bandwidth(cfs_b, false);
> +
> +	/*
> +	 * If we're the first throttled task, make sure the bandwidth
> +	 * timer is running.
> +	 */
> +	if (empty)
> +		start_cfs_bandwidth(cfs_b);
> +
>  	raw_spin_unlock(&cfs_b->lock);
>  }
>  
> @@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(str
>  	if (cfs_b->idle && !throttled)
>  		goto out_deactivate;
>  
> -	/*
> -	 * if we have relooped after returning idle once, we need to update our
> -	 * status as actually running, so that other cpus doing
> -	 * __start_cfs_bandwidth will stop trying to cancel us.
> -	 */
> -	cfs_b->timer_active = 1;
> -
>  	__refill_cfs_bandwidth_runtime(cfs_b);
>  
>  	if (!throttled) {
> @@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(str
>  	return 0;
>  
>  out_deactivate:
> -	cfs_b->timer_active = 0;
>  	return 1;
>  }
>  
> @@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_sl
>  {
>  	struct cfs_bandwidth *cfs_b =
>  		container_of(timer, struct cfs_bandwidth, slack_timer);
> +
>  	do_sched_cfs_slack_timer(cfs_b);
>  
>  	return HRTIMER_NORESTART;
> @@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_pe
>  {
>  	struct cfs_bandwidth *cfs_b =
>  		container_of(timer, struct cfs_bandwidth, period_timer);
> -	ktime_t now;
>  	int overrun;
>  	int idle = 0;
>  
>  	raw_spin_lock(&cfs_b->lock);
>  	for (;;) {
> -		now = hrtimer_cb_get_time(timer);
> -		overrun = hrtimer_forward(timer, now, cfs_b->period);
> -
> +		overrun = hrtimer_forward_now(timer, cfs_b->period);
>  		if (!overrun)
>  			break;
>  
> @@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct c
>  	INIT_LIST_HEAD(&cfs_rq->throttled_list);
>  }
>  
> -/* requires cfs_b->lock, may release to reprogram timer */
> -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
> +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>  {
> -	/*
> -	 * The timer may be active because we're trying to set a new bandwidth
> -	 * period or because we're racing with the tear-down path
> -	 * (timer_active==0 becomes visible before the hrtimer call-back
> -	 * terminates).  In either case we ensure that it's re-programmed
> -	 */
> -	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
> -	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
> -		/* bounce the lock to allow do_sched_cfs_period_timer to run */
> -		raw_spin_unlock(&cfs_b->lock);
> -		cpu_relax();
> -		raw_spin_lock(&cfs_b->lock);
> -		/* if someone else restarted the timer then we're done */
> -		if (!force && cfs_b->timer_active)
> -			return;
> -	}
> -
> -	cfs_b->timer_active = 1;
>  	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
>  }
>  
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_per
>  {
>  	struct rt_bandwidth *rt_b =
>  		container_of(timer, struct rt_bandwidth, rt_period_timer);
> -	ktime_t now;
> -	int overrun;
>  	int idle = 0;
> +	int overrun;
>  
> +	raw_spin_lock(&rt_b->rt_runtime_lock);
>  	for (;;) {
> -		now = hrtimer_cb_get_time(timer);
> -		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
> -
> +		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
>  		if (!overrun)
>  			break;
>  
> +		raw_spin_unlock(&rt_b->rt_runtime_lock);
>  		idle = do_sched_rt_period_timer(rt_b, overrun);
> +		raw_spin_lock(&rt_b->rt_runtime_lock);
>  	}
> +	raw_spin_unlock(&rt_b->rt_runtime_lock);
>  
>  	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
>  }
> @@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt
>  	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
>  		return;
>  
> -	if (hrtimer_active(&rt_b->rt_period_timer))
> -		return;
> -
>  	raw_spin_lock(&rt_b->rt_runtime_lock);
>  	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
>  	raw_spin_unlock(&rt_b->rt_runtime_lock);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -215,7 +215,7 @@ struct cfs_bandwidth {
>  	s64 hierarchical_quota;
>  	u64 runtime_expires;
>  
> -	int idle, timer_active;
> +	int idle;
>  	struct hrtimer period_timer, slack_timer;
>  	struct list_head throttled_cfs_rq;
>  
> @@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cf
>  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>  
>  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
> -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
> +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
>  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
>  
>  extern void free_rt_sched_group(struct task_group *tg);

