All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] sched/fair: Fix bandwidth timer clock drift condition
@ 2018-06-18  9:16 Xunlei Pang
  2018-06-18 18:44 ` bsegall
  0 siblings, 1 reply; 3+ messages in thread
From: Xunlei Pang @ 2018-06-18  9:16 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Ben Segall; +Cc: linux-kernel

The current condition used to detect clock drift in expire_cfs_rq_runtime()
is wrong: the two runtime_expires values are actually the same when clock
drift happens, so this condition can never hit. The original design was
correctly done by commit a9cf55b28610 ("sched: Expire invalid runtime"),
but was changed to the current one due to its locking issue.

This patch introduces another approach: it adds a new field to both struct
cfs_rq and struct cfs_bandwidth to record the expiration update sequence, and
uses them to figure out whether clock drift has happened (true if they are
equal).

This fix is also needed by the following patch.

Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
Cc: Ben Segall <bsegall@google.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 kernel/sched/fair.c  | 14 ++++++++------
 kernel/sched/sched.h |  6 ++++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e497c05aab7f..9f384264e832 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 	now = sched_clock_cpu(smp_processor_id());
 	cfs_b->runtime = cfs_b->quota;
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount, expires;
+	int expires_seq;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4629,6 +4631,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 		}
 	}
 	expires = cfs_b->runtime_expires;
+	expires_seq = cfs_b->expires_seq;
 	raw_spin_unlock(&cfs_b->lock);
 
 	cfs_rq->runtime_remaining += amount;
@@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 * spread between our sched_clock and the one on which runtime was
 	 * issued.
 	 */
-	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0) {
 		cfs_rq->runtime_expires = expires;
+		cfs_rq->expires_seq = expires_seq;
+	}
 
 	return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 * has not truly expired.
 	 *
 	 * Fortunately we can check determine whether this the case by checking
-	 * whether the global deadline has advanced. It is valid to compare
-	 * cfs_b->runtime_expires without any locks since we only care about
-	 * exact equality, so a partial write will still work.
+	 * whether the global deadline(cfs_b->expires_seq) has advanced.
 	 */
-
-	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
 		/* extend local deadline, drift is bounded above by 2 ticks */
 		cfs_rq->runtime_expires += TICK_NSEC;
 	} else {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6601baf2361c..e977e04f8daf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
 	u64			runtime;
 	s64			hierarchical_quota;
 	u64			runtime_expires;
+	int			expires_seq;
 
-	int			idle;
-	int			period_active;
+	short			idle;
+	short			period_active;
 	struct hrtimer		period_timer;
 	struct hrtimer		slack_timer;
 	struct list_head	throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	int			expires_seq;
 	u64			runtime_expires;
 	s64			runtime_remaining;
 
-- 
2.14.1.40.g8e62ba1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH 1/2] sched/fair: Fix bandwidth timer clock drift condition
  2018-06-18  9:16 [PATCH 1/2] sched/fair: Fix bandwidth timer clock drift condition Xunlei Pang
@ 2018-06-18 18:44 ` bsegall
  2018-06-19  2:57   ` Xunlei Pang
  0 siblings, 1 reply; 3+ messages in thread
From: bsegall @ 2018-06-18 18:44 UTC (permalink / raw)
  To: Xunlei Pang; +Cc: Peter Zijlstra, Ingo Molnar, Ben Segall, linux-kernel

Xunlei Pang <xlpang@linux.alibaba.com> writes:

> The current condition to judge clock drift in expire_cfs_rq_runtime()
> is wrong, the two runtime_expires are actually the same when clock
> drift happens, so this condition can never hit. The original design was
> correctly done by commit a9cf55b28610 ("sched: Expire invalid runtime"),
> but was changed to be the current one due to its locking issue.
>
> This patch introduces another way, it adds a new field in both structure
> cfs_rq and cfs_bandwidth to record the expiration update sequence, and
> use them to figure out if clock drift happens(true if they equal).

It might just be simplest to revert the comparison change - if we read a
torn value, the worst that happens is we extend incorrectly, and that
is exactly what happens if we just read the old value.

An extra int isn't exactly the worst thing though, so whichever.

>
> This fix is also needed by the following patch.
>
> Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
> Cc: Ben Segall <bsegall@google.com>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
> ---
>  kernel/sched/fair.c  | 14 ++++++++------
>  kernel/sched/sched.h |  6 ++++--
>  2 files changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e497c05aab7f..9f384264e832 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
>  	now = sched_clock_cpu(smp_processor_id());
>  	cfs_b->runtime = cfs_b->quota;
>  	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
> +	cfs_b->expires_seq++;
>  }
>  
>  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
> @@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  	struct task_group *tg = cfs_rq->tg;
>  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
>  	u64 amount = 0, min_amount, expires;
> +	int expires_seq;
>  
>  	/* note: this is a positive sum as runtime_remaining <= 0 */
>  	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
> @@ -4629,6 +4631,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  		}
>  	}
>  	expires = cfs_b->runtime_expires;
> +	expires_seq = cfs_b->expires_seq;
>  	raw_spin_unlock(&cfs_b->lock);
>  
>  	cfs_rq->runtime_remaining += amount;
> @@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  	 * spread between our sched_clock and the one on which runtime was
>  	 * issued.
>  	 */
> -	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
> +	if ((s64)(expires - cfs_rq->runtime_expires) > 0) {
>  		cfs_rq->runtime_expires = expires;
> +		cfs_rq->expires_seq = expires_seq;
> +	}
>  
>  	return cfs_rq->runtime_remaining > 0;
>  }
> @@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  	 * has not truly expired.
>  	 *
>  	 * Fortunately we can check determine whether this the case by checking
> -	 * whether the global deadline has advanced. It is valid to compare
> -	 * cfs_b->runtime_expires without any locks since we only care about
> -	 * exact equality, so a partial write will still work.
> +	 * whether the global deadline(cfs_b->expires_seq) has advanced.
>  	 */
> -
> -	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
> +	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
>  		/* extend local deadline, drift is bounded above by 2 ticks */
>  		cfs_rq->runtime_expires += TICK_NSEC;
>  	} else {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6601baf2361c..e977e04f8daf 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -334,9 +334,10 @@ struct cfs_bandwidth {
>  	u64			runtime;
>  	s64			hierarchical_quota;
>  	u64			runtime_expires;
> +	int			expires_seq;
>  
> -	int			idle;
> -	int			period_active;
> +	short			idle;
> +	short			period_active;
>  	struct hrtimer		period_timer;
>  	struct hrtimer		slack_timer;
>  	struct list_head	throttled_cfs_rq;
> @@ -551,6 +552,7 @@ struct cfs_rq {
>  
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	int			runtime_enabled;
> +	int			expires_seq;
>  	u64			runtime_expires;
>  	s64			runtime_remaining;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH 1/2] sched/fair: Fix bandwidth timer clock drift condition
  2018-06-18 18:44 ` bsegall
@ 2018-06-19  2:57   ` Xunlei Pang
  0 siblings, 0 replies; 3+ messages in thread
From: Xunlei Pang @ 2018-06-19  2:57 UTC (permalink / raw)
  To: bsegall; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel

On 6/19/18 2:44 AM, bsegall@google.com wrote:
> Xunlei Pang <xlpang@linux.alibaba.com> writes:
> 
>> The current condition to judge clock drift in expire_cfs_rq_runtime()
>> is wrong, the two runtime_expires are actually the same when clock
>> drift happens, so this condition can never hit. The original design was
>> correctly done by commit a9cf55b28610 ("sched: Expire invalid runtime"),
>> but was changed to be the current one due to its locking issue.
>>
>> This patch introduces another way, it adds a new field in both structure
>> cfs_rq and cfs_bandwidth to record the expiration update sequence, and
>> use them to figure out if clock drift happens(true if they equal).
> 
> It might just be simplest to revert the comparison change - if we read a
> torn value, the worst that happens is we extend incorrectly, and that
> is exactly what happens if we just read the old value.
> 
> An extra int isn't exactly the worst thing though, so whichever.

I tried that, but it might still consume the old runtime in the worst case.
I chose this approach considering the additional cfs_b->runtime_expires
changes in the 2nd patch.

The extra fields guarantee correct control, and they do not increase the
total size of the two structures.

Thanks,
Xunlei

> 
>>
>> This fix is also needed by the following patch.
>>
>> Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
>> Cc: Ben Segall <bsegall@google.com>
>> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
>> ---
>>  kernel/sched/fair.c  | 14 ++++++++------
>>  kernel/sched/sched.h |  6 ++++--
>>  2 files changed, 12 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e497c05aab7f..9f384264e832 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
>>  	now = sched_clock_cpu(smp_processor_id());
>>  	cfs_b->runtime = cfs_b->quota;
>>  	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
>> +	cfs_b->expires_seq++;
>>  }
>>  
>>  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
>> @@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  	struct task_group *tg = cfs_rq->tg;
>>  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
>>  	u64 amount = 0, min_amount, expires;
>> +	int expires_seq;
>>  
>>  	/* note: this is a positive sum as runtime_remaining <= 0 */
>>  	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
>> @@ -4629,6 +4631,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  		}
>>  	}
>>  	expires = cfs_b->runtime_expires;
>> +	expires_seq = cfs_b->expires_seq;
>>  	raw_spin_unlock(&cfs_b->lock);
>>  
>>  	cfs_rq->runtime_remaining += amount;
>> @@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  	 * spread between our sched_clock and the one on which runtime was
>>  	 * issued.
>>  	 */
>> -	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
>> +	if ((s64)(expires - cfs_rq->runtime_expires) > 0) {
>>  		cfs_rq->runtime_expires = expires;
>> +		cfs_rq->expires_seq = expires_seq;
>> +	}
>>  
>>  	return cfs_rq->runtime_remaining > 0;
>>  }
>> @@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>  	 * has not truly expired.
>>  	 *
>>  	 * Fortunately we can check determine whether this the case by checking
>> -	 * whether the global deadline has advanced. It is valid to compare
>> -	 * cfs_b->runtime_expires without any locks since we only care about
>> -	 * exact equality, so a partial write will still work.
>> +	 * whether the global deadline(cfs_b->expires_seq) has advanced.
>>  	 */
>> -
>> -	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
>> +	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
>>  		/* extend local deadline, drift is bounded above by 2 ticks */
>>  		cfs_rq->runtime_expires += TICK_NSEC;
>>  	} else {
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 6601baf2361c..e977e04f8daf 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -334,9 +334,10 @@ struct cfs_bandwidth {
>>  	u64			runtime;
>>  	s64			hierarchical_quota;
>>  	u64			runtime_expires;
>> +	int			expires_seq;
>>  
>> -	int			idle;
>> -	int			period_active;
>> +	short			idle;
>> +	short			period_active;
>>  	struct hrtimer		period_timer;
>>  	struct hrtimer		slack_timer;
>>  	struct list_head	throttled_cfs_rq;
>> @@ -551,6 +552,7 @@ struct cfs_rq {
>>  
>>  #ifdef CONFIG_CFS_BANDWIDTH
>>  	int			runtime_enabled;
>> +	int			expires_seq;
>>  	u64			runtime_expires;
>>  	s64			runtime_remaining;

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2018-06-19  2:57 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-18  9:16 [PATCH 1/2] sched/fair: Fix bandwidth timer clock drift condition Xunlei Pang
2018-06-18 18:44 ` bsegall
2018-06-19  2:57   ` Xunlei Pang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.