* [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
@ 2018-10-08 14:36 Phil Auld
  2018-10-09  8:32 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Phil Auld @ 2018-10-08 14:36 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar; +Cc: linux-kernel, stable

From: "Phil Auld" <pauld@redhat.com>

sched/fair: Avoid throttle_list starvation with low cfs quota

With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
distribute_cfs_runtime may not empty the throttled_list before it runs 
out of runtime to distribute. In that case, due to the change from 
c06f04c7048 to put throttled entries at the head of the list, later entries 
on the list will starve.  Essentially, the same X processes will get pulled 
off the list, given CPU time and then, when expired, get put back on the 
head of the list where distribute_cfs_runtime will give runtime to the same 
set of processes leaving the rest.

Fix the issue by setting a bit in struct cfs_bandwidth when 
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
decide to put the throttled entry on the tail or the head of the list.  The 
bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
cfs_bandwidth->lock. 

Signed-off-by: Phil Auld <pauld@redhat.com>
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
---

This is easy to reproduce with a handful of cpu consumers. I use crash on 
the live system. In some cases you can simply look at the throttled list and 
see the later entries are not changing:

crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
  1     ffff90b56cb2d200  -976050
  2     ffff90b56cb2cc00  -484925
  3     ffff90b56cb2bc00  -658814
  4     ffff90b56cb2ba00  -275365
  5     ffff90b166a45600  -135138
  6     ffff90b56cb2da00  -282505
  7     ffff90b56cb2e000  -148065
  8     ffff90b56cb2fa00  -872591
  9     ffff90b56cb2c000  -84687
 10     ffff90b56cb2f000  -87237
 11     ffff90b166a40a00  -164582
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
  1     ffff90b56cb2d200  -994147
  2     ffff90b56cb2cc00  -306051
  3     ffff90b56cb2bc00  -961321
  4     ffff90b56cb2ba00  -24490
  5     ffff90b166a45600  -135138
  6     ffff90b56cb2da00  -282505
  7     ffff90b56cb2e000  -148065
  8     ffff90b56cb2fa00  -872591
  9     ffff90b56cb2c000  -84687
 10     ffff90b56cb2f000  -87237
 11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and looking 
at the sched_info:

crash> task ffff8eb765994500 sched_info
PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
  sched_info = {
    pcount = 8, 
    run_delay = 697094208, 
    last_arrival = 240260125039, 
    last_queued = 240260327513
  }, 
crash> task ffff8eb765994500 sched_info
PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
  sched_info = {
    pcount = 8, 
    run_delay = 697094208, 
    last_arrival = 240260125039, 
    last_queued = 240260327513
  }, 
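
The cpu consumers referred to above can be as simple as a busy loop. The actual
reproducer is not included in this thread, so the following is only an
illustrative sketch; the "throttle-test" group name and the cgroup v1 cpu
controller paths in the comments are assumptions:

  /*
   * Stand-in for a "cputest"-style consumer.  Build with, e.g.:
   *   gcc -O2 -o cputest cputest.c
   * then run a handful of copies in a group with the minimum quota
   * (cgroup v1 layout assumed, cpu.cfs_period_us left at its default):
   *   mkdir /sys/fs/cgroup/cpu/throttle-test
   *   echo 1000 > /sys/fs/cgroup/cpu/throttle-test/cpu.cfs_quota_us
   *   for i in $(seq 1 16); do
   *           ./cputest & echo $! > /sys/fs/cgroup/cpu/throttle-test/tasks
   *   done
   * and watch the throttled list / sched_info as shown above.
   */
  int main(void)
  {
  	volatile unsigned long spin = 0;

  	for (;;)	/* burn CPU so the group keeps hitting its tiny quota */
  		spin++;

  	return 0;
  }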


 fair.c  |   22 +++++++++++++++++++---
 sched.h |    2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool                    distribute_running;
 #endif
 };
 


-- 


* Re: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
  2018-10-08 14:36 [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota Phil Auld
@ 2018-10-09  8:32 ` Ingo Molnar
  2018-10-09 12:44   ` Phil Auld
  2018-10-10 17:49   ` bsegall
  2018-10-11 10:39 ` [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota tip-bot for Phil Auld
  2018-10-11 11:12 ` tip-bot for Phil Auld
  2 siblings, 2 replies; 8+ messages in thread
From: Ingo Molnar @ 2018-10-09  8:32 UTC (permalink / raw)
  To: Phil Auld, Ben Segall, Joel Fernandes, Steve Muckle, Paul Turner,
	Vincent Guittot, Morten Rasmussen
  Cc: Peter Zijlstra, linux-kernel, stable


I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the discussion. 
Patch quoted below.

Looks like a real bug that needs to be fixed - and at first sight the quota of 1000 looks very 
low - could we improve the arithmetics perhaps?

A low quota of 1000 is used because there's many VMs or containers provisioned on the system 
that is triggering the bug, right?

Thanks,

	Ingo

* Phil Auld <pauld@redhat.com> wrote:

> From: "Phil Auld" <pauld@redhat.com>
> 
> sched/fair: Avoid throttle_list starvation with low cfs quota
> 
> With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
> distribute_cfs_runtime may not empty the throttled_list before it runs 
> out of runtime to distribute. In that case, due to the change from 
> c06f04c7048 to put throttled entries at the head of the list, later entries 
> on the list will starve.  Essentially, the same X processes will get pulled 
> off the list, given CPU time and then, when expired, get put back on the 
> head of the list where distribute_cfs_runtime will give runtime to the same 
> set of processes leaving the rest.
> 
> Fix the issue by setting a bit in struct cfs_bandwidth when 
> distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
> decide to put the throttled entry on the tail or the head of the list.  The 
> bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
> cfs_bandwidth->lock. 
> 
> Signed-off-by: Phil Auld <pauld@redhat.com>
> Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: stable@vger.kernel.org
> ---
> 
> This is easy to reproduce with a handful of cpu consumers. I use crash on 
> the live system. In some cases you can simply look at the throttled list and 
> see the later entries are not changing:
> 
> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>   1     ffff90b56cb2d200  -976050
>   2     ffff90b56cb2cc00  -484925
>   3     ffff90b56cb2bc00  -658814
>   4     ffff90b56cb2ba00  -275365
>   5     ffff90b166a45600  -135138
>   6     ffff90b56cb2da00  -282505
>   7     ffff90b56cb2e000  -148065
>   8     ffff90b56cb2fa00  -872591
>   9     ffff90b56cb2c000  -84687
>  10     ffff90b56cb2f000  -87237
>  11     ffff90b166a40a00  -164582
> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>   1     ffff90b56cb2d200  -994147
>   2     ffff90b56cb2cc00  -306051
>   3     ffff90b56cb2bc00  -961321
>   4     ffff90b56cb2ba00  -24490
>   5     ffff90b166a45600  -135138
>   6     ffff90b56cb2da00  -282505
>   7     ffff90b56cb2e000  -148065
>   8     ffff90b56cb2fa00  -872591
>   9     ffff90b56cb2c000  -84687
>  10     ffff90b56cb2f000  -87237
>  11     ffff90b166a40a00  -164582
> 
> Sometimes it is easier to see by finding a process getting starved and looking 
> at the sched_info:
> 
> crash> task ffff8eb765994500 sched_info
> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>   sched_info = {
>     pcount = 8, 
>     run_delay = 697094208, 
>     last_arrival = 240260125039, 
>     last_queued = 240260327513
>   }, 
> crash> task ffff8eb765994500 sched_info
> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>   sched_info = {
>     pcount = 8, 
>     run_delay = 697094208, 
>     last_arrival = 240260125039, 
>     last_queued = 240260327513
>   }, 
> 
> 
>  fair.c  |   22 +++++++++++++++++++---
>  sched.h |    2 ++
>  2 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7fc4a371bdd2..f88e00705b55 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  
>  	/*
>  	 * Add to the _head_ of the list, so that an already-started
> -	 * distribute_cfs_runtime will not see us
> +	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
> +	 * not running, add to the tail so that later runqueues don't get starved.
>  	 */
> -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> +	if (cfs_b->distribute_running)
> +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> +	else
> +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>  
>  	/*
>  	 * If we're the first throttled task, make sure the bandwidth
> @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>  	 * in us over-using our runtime if it is all used during this loop, but
>  	 * only by limited amounts in that extreme case.
>  	 */
> -	while (throttled && cfs_b->runtime > 0) {
> +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
>  		runtime = cfs_b->runtime;
> +		cfs_b->distribute_running = 1;
>  		raw_spin_unlock(&cfs_b->lock);
>  		/* we can't nest cfs_b->lock while distributing bandwidth */
>  		runtime = distribute_cfs_runtime(cfs_b, runtime,
>  						 runtime_expires);
>  		raw_spin_lock(&cfs_b->lock);
>  
> +		cfs_b->distribute_running = 0;
>  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
>  
>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>  
>  	/* confirm we're still not at a refresh boundary */
>  	raw_spin_lock(&cfs_b->lock);
> +	if (cfs_b->distribute_running) {
> +		raw_spin_unlock(&cfs_b->lock);
> +		return;
> +	}
> +
>  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
>  		raw_spin_unlock(&cfs_b->lock);
>  		return;
> @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>  		runtime = cfs_b->runtime;
>  
>  	expires = cfs_b->runtime_expires;
> +	if (runtime)
> +		cfs_b->distribute_running = 1;
> +
>  	raw_spin_unlock(&cfs_b->lock);
>  
>  	if (!runtime)
> @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>  	raw_spin_lock(&cfs_b->lock);
>  	if (expires == cfs_b->runtime_expires)
>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> +	cfs_b->distribute_running = 0;
>  	raw_spin_unlock(&cfs_b->lock);
>  }
>  
> @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>  	cfs_b->period_timer.function = sched_cfs_period_timer;
>  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>  	cfs_b->slack_timer.function = sched_cfs_slack_timer;
> +	cfs_b->distribute_running = 0;
>  }
>  
>  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 455fa330de04..9683f458aec7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -346,6 +346,8 @@ struct cfs_bandwidth {
>  	int			nr_periods;
>  	int			nr_throttled;
>  	u64			throttled_time;
> +
> +	bool                    distribute_running;
>  #endif
>  };
>  
> 
> 
> -- 


* Re: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
  2018-10-09  8:32 ` Ingo Molnar
@ 2018-10-09 12:44   ` Phil Auld
  2018-10-10 17:49   ` bsegall
  1 sibling, 0 replies; 8+ messages in thread
From: Phil Auld @ 2018-10-09 12:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Ben Segall, Joel Fernandes, Steve Muckle, Paul Turner,
	Vincent Guittot, Morten Rasmussen, Peter Zijlstra, linux-kernel,
	stable


Hi,

On Tue, Oct 09, 2018 at 10:32:44AM +0200 Ingo Molnar wrote:
> 
> I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the discussion. 
> Patch quoted below.
> 
> Looks like a real bug that needs to be fixed - and at first sight the quota of 1000 looks very 
> low - could we improve the arithmetics perhaps?
> 
> A low quota of 1000 is used because there's many VMs or containers provisioned on the system 
> that is triggering the bug, right?
> 


Thanks for taking a look.


The quota is extremely low. I believe that's a different issue, though.  The kernel 
allows this setting and should handle it better than it currently does.  The proposed 
patch fixes it so that all the tasks make progress (even if not much progress) rather 
than having some starve at the back of the list. 


Cheers,
Phil


> Thanks,
> 
> 	Ingo
> 
> * Phil Auld <pauld@redhat.com> wrote:
> 
> > From: "Phil Auld" <pauld@redhat.com>
> > 
> > sched/fair: Avoid throttle_list starvation with low cfs quota
> > 
> > With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
> > distribute_cfs_runtime may not empty the throttled_list before it runs 
> > out of runtime to distribute. In that case, due to the change from 
> > c06f04c7048 to put throttled entries at the head of the list, later entries 
> > on the list will starve.  Essentially, the same X processes will get pulled 
> > off the list, given CPU time and then, when expired, get put back on the 
> > head of the list where distribute_cfs_runtime will give runtime to the same 
> > set of processes leaving the rest.
> > 
> > Fix the issue by setting a bit in struct cfs_bandwidth when 
> > distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
> > decide to put the throttled entry on the tail or the head of the list.  The 
> > bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
> > cfs_bandwidth->lock. 
> > 
> > Signed-off-by: Phil Auld <pauld@redhat.com>
> > Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: stable@vger.kernel.org
> > ---
> > 
> > This is easy to reproduce with a handful of cpu consumers. I use crash on 
> > the live system. In some cases you can simply look at the throttled list and 
> > see the later entries are not changing:
> > 
> > crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
> >   1     ffff90b56cb2d200  -976050
> >   2     ffff90b56cb2cc00  -484925
> >   3     ffff90b56cb2bc00  -658814
> >   4     ffff90b56cb2ba00  -275365
> >   5     ffff90b166a45600  -135138
> >   6     ffff90b56cb2da00  -282505
> >   7     ffff90b56cb2e000  -148065
> >   8     ffff90b56cb2fa00  -872591
> >   9     ffff90b56cb2c000  -84687
> >  10     ffff90b56cb2f000  -87237
> >  11     ffff90b166a40a00  -164582
> > crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
> >   1     ffff90b56cb2d200  -994147
> >   2     ffff90b56cb2cc00  -306051
> >   3     ffff90b56cb2bc00  -961321
> >   4     ffff90b56cb2ba00  -24490
> >   5     ffff90b166a45600  -135138
> >   6     ffff90b56cb2da00  -282505
> >   7     ffff90b56cb2e000  -148065
> >   8     ffff90b56cb2fa00  -872591
> >   9     ffff90b56cb2c000  -84687
> >  10     ffff90b56cb2f000  -87237
> >  11     ffff90b166a40a00  -164582
> > 
> > Sometimes it is easier to see by finding a process getting starved and looking 
> > at the sched_info:
> > 
> > crash> task ffff8eb765994500 sched_info
> > PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
> >   sched_info = {
> >     pcount = 8, 
> >     run_delay = 697094208, 
> >     last_arrival = 240260125039, 
> >     last_queued = 240260327513
> >   }, 
> > crash> task ffff8eb765994500 sched_info
> > PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
> >   sched_info = {
> >     pcount = 8, 
> >     run_delay = 697094208, 
> >     last_arrival = 240260125039, 
> >     last_queued = 240260327513
> >   }, 
> > 
> > 
> >  fair.c  |   22 +++++++++++++++++++---
> >  sched.h |    2 ++
> >  2 files changed, 21 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 7fc4a371bdd2..f88e00705b55 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> >  
> >  	/*
> >  	 * Add to the _head_ of the list, so that an already-started
> > -	 * distribute_cfs_runtime will not see us
> > +	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
> > +	 * not running, add to the tail so that later runqueues don't get starved.
> >  	 */
> > -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> > +	if (cfs_b->distribute_running)
> > +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> > +	else
> > +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> >  
> >  	/*
> >  	 * If we're the first throttled task, make sure the bandwidth
> > @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> >  	 * in us over-using our runtime if it is all used during this loop, but
> >  	 * only by limited amounts in that extreme case.
> >  	 */
> > -	while (throttled && cfs_b->runtime > 0) {
> > +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
> >  		runtime = cfs_b->runtime;
> > +		cfs_b->distribute_running = 1;
> >  		raw_spin_unlock(&cfs_b->lock);
> >  		/* we can't nest cfs_b->lock while distributing bandwidth */
> >  		runtime = distribute_cfs_runtime(cfs_b, runtime,
> >  						 runtime_expires);
> >  		raw_spin_lock(&cfs_b->lock);
> >  
> > +		cfs_b->distribute_running = 0;
> >  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
> >  
> >  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> > @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >  
> >  	/* confirm we're still not at a refresh boundary */
> >  	raw_spin_lock(&cfs_b->lock);
> > +	if (cfs_b->distribute_running) {
> > +		raw_spin_unlock(&cfs_b->lock);
> > +		return;
> > +	}
> > +
> >  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
> >  		raw_spin_unlock(&cfs_b->lock);
> >  		return;
> > @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >  		runtime = cfs_b->runtime;
> >  
> >  	expires = cfs_b->runtime_expires;
> > +	if (runtime)
> > +		cfs_b->distribute_running = 1;
> > +
> >  	raw_spin_unlock(&cfs_b->lock);
> >  
> >  	if (!runtime)
> > @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >  	raw_spin_lock(&cfs_b->lock);
> >  	if (expires == cfs_b->runtime_expires)
> >  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> > +	cfs_b->distribute_running = 0;
> >  	raw_spin_unlock(&cfs_b->lock);
> >  }
> >  
> > @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> >  	cfs_b->period_timer.function = sched_cfs_period_timer;
> >  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> >  	cfs_b->slack_timer.function = sched_cfs_slack_timer;
> > +	cfs_b->distribute_running = 0;
> >  }
> >  
> >  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 455fa330de04..9683f458aec7 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -346,6 +346,8 @@ struct cfs_bandwidth {
> >  	int			nr_periods;
> >  	int			nr_throttled;
> >  	u64			throttled_time;
> > +
> > +	bool                    distribute_running;
> >  #endif
> >  };
> >  
> > 
> > 
> > -- 

-- 


* Re: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
  2018-10-09  8:32 ` Ingo Molnar
  2018-10-09 12:44   ` Phil Auld
@ 2018-10-10 17:49   ` bsegall
  2018-10-10 18:37     ` Phil Auld
  1 sibling, 1 reply; 8+ messages in thread
From: bsegall @ 2018-10-10 17:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Phil Auld, Ben Segall, Joel Fernandes, Steve Muckle, Paul Turner,
	Vincent Guittot, Morten Rasmussen, Peter Zijlstra, linux-kernel,
	stable

Ingo Molnar <mingo@kernel.org> writes:

> I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the discussion. 
> Patch quoted below.
>
> Looks like a real bug that needs to be fixed - and at first sight the quota of 1000 looks very 
> low - could we improve the arithmetics perhaps?
>
> A low quota of 1000 is used because there's many VMs or containers provisioned on the system 
> that is triggering the bug, right?
>
> Thanks,
>
> 	Ingo
>
> * Phil Auld <pauld@redhat.com> wrote:
>
>> From: "Phil Auld" <pauld@redhat.com>
>> 
>> sched/fair: Avoid throttle_list starvation with low cfs quota
>> 
>> With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
>> distribute_cfs_runtime may not empty the throttled_list before it runs 
>> out of runtime to distribute. In that case, due to the change from 
>> c06f04c7048 to put throttled entries at the head of the list, later entries 
>> on the list will starve.  Essentially, the same X processes will get pulled 
>> off the list, given CPU time and then, when expired, get put back on the 
>> head of the list where distribute_cfs_runtime will give runtime to the same 
>> set of processes leaving the rest.
>> 
>> Fix the issue by setting a bit in struct cfs_bandwidth when 
>> distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
>> decide to put the throttled entry on the tail or the head of the list.  The 
>> bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
>> cfs_bandwidth->lock. 
>> 
>> Signed-off-by: Phil Auld <pauld@redhat.com>
>> Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Ingo Molnar <mingo@kernel.org>
>> Cc: stable@vger.kernel.org

Reviewed-by: Ben Segall <bsegall@google.com>


In theory this does mean the unfairness could still happen if distribute is still
running, but while a tiny quota makes it more likely, the fact that
we're not getting through much of the list makes it not really a worry.
If you wanted to be even more careful there could be some generation
counter or something, but it doesn't seem necessary.
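
For illustration, such a generation counter might look roughly like the
fragment below. The two fields and both hunks are hypothetical, sketched
against the existing cfs_b->lock rules; this is not from any posted patch and
is not meant to build as-is:

  /* hypothetical extra state */
  struct cfs_bandwidth { /* ... */ u64 distribute_generation; /* ... */ };
  struct cfs_rq        { /* ... */ u64 throttle_generation;   /* ... */ };

  /* throttle_cfs_rq(), under cfs_b->lock: always add to the tail, but tag
   * the entry with the distribute pass that may currently be running */
  cfs_rq->throttle_generation = cfs_b->distribute_generation;
  list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

  /* callers bump cfs_b->distribute_generation under cfs_b->lock before each
   * distribute pass and remember it in 'gen'; distribute_cfs_runtime() then
   * skips entries throttled during its own pass, so a just-rethrottled
   * cfs_rq cannot be handed runtime twice in the same pass */
  if (cfs_rq->throttle_generation == gen)
  	continue;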


>> ---
>> 
>> This is easy to reproduce with a handful of cpu consumers. I use crash on 
>> the live system. In some cases you can simply look at the throttled list and 
>> see the later entries are not changing:
>> 
>> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>>   1     ffff90b56cb2d200  -976050
>>   2     ffff90b56cb2cc00  -484925
>>   3     ffff90b56cb2bc00  -658814
>>   4     ffff90b56cb2ba00  -275365
>>   5     ffff90b166a45600  -135138
>>   6     ffff90b56cb2da00  -282505
>>   7     ffff90b56cb2e000  -148065
>>   8     ffff90b56cb2fa00  -872591
>>   9     ffff90b56cb2c000  -84687
>>  10     ffff90b56cb2f000  -87237
>>  11     ffff90b166a40a00  -164582
>> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>>   1     ffff90b56cb2d200  -994147
>>   2     ffff90b56cb2cc00  -306051
>>   3     ffff90b56cb2bc00  -961321
>>   4     ffff90b56cb2ba00  -24490
>>   5     ffff90b166a45600  -135138
>>   6     ffff90b56cb2da00  -282505
>>   7     ffff90b56cb2e000  -148065
>>   8     ffff90b56cb2fa00  -872591
>>   9     ffff90b56cb2c000  -84687
>>  10     ffff90b56cb2f000  -87237
>>  11     ffff90b166a40a00  -164582
>> 
>> Sometimes it is easier to see by finding a process getting starved and looking 
>> at the sched_info:
>> 
>> crash> task ffff8eb765994500 sched_info
>> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>>   sched_info = {
>>     pcount = 8, 
>>     run_delay = 697094208, 
>>     last_arrival = 240260125039, 
>>     last_queued = 240260327513
>>   }, 
>> crash> task ffff8eb765994500 sched_info
>> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>>   sched_info = {
>>     pcount = 8, 
>>     run_delay = 697094208, 
>>     last_arrival = 240260125039, 
>>     last_queued = 240260327513
>>   }, 
>> 
>> 
>>  fair.c  |   22 +++++++++++++++++++---
>>  sched.h |    2 ++
>>  2 files changed, 21 insertions(+), 3 deletions(-)
>> 
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 7fc4a371bdd2..f88e00705b55 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>>  
>>  	/*
>>  	 * Add to the _head_ of the list, so that an already-started
>> -	 * distribute_cfs_runtime will not see us
>> +	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
>> +	 * not running, add to the tail so that later runqueues don't get starved.
>>  	 */
>> -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>> +	if (cfs_b->distribute_running)
>> +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>> +	else
>> +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>>
>>  	/*
>>  	 * If we're the first throttled task, make sure the bandwidth
>> @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>>  	 * in us over-using our runtime if it is all used during this loop, but
>>  	 * only by limited amounts in that extreme case.
>>  	 */
>> -	while (throttled && cfs_b->runtime > 0) {
>> +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
>>  		runtime = cfs_b->runtime;
>> +		cfs_b->distribute_running = 1;
>>  		raw_spin_unlock(&cfs_b->lock);
>>  		/* we can't nest cfs_b->lock while distributing bandwidth */
>>  		runtime = distribute_cfs_runtime(cfs_b, runtime,
>>  						 runtime_expires);
>>  		raw_spin_lock(&cfs_b->lock);
>>  
>> +		cfs_b->distribute_running = 0;
>>  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
>>  
>>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
>> @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>  
>>  	/* confirm we're still not at a refresh boundary */
>>  	raw_spin_lock(&cfs_b->lock);
>> +	if (cfs_b->distribute_running) {
>> +		raw_spin_unlock(&cfs_b->lock);
>> +		return;
>> +	}
>> +
>>  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
>>  		raw_spin_unlock(&cfs_b->lock);
>>  		return;
>> @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>  		runtime = cfs_b->runtime;
>>  
>>  	expires = cfs_b->runtime_expires;
>> +	if (runtime)
>> +		cfs_b->distribute_running = 1;
>> +
>>  	raw_spin_unlock(&cfs_b->lock);
>>  
>>  	if (!runtime)
>> @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>  	raw_spin_lock(&cfs_b->lock);
>>  	if (expires == cfs_b->runtime_expires)
>>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
>> +	cfs_b->distribute_running = 0;
>>  	raw_spin_unlock(&cfs_b->lock);
>>  }
>>  
>> @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>>  	cfs_b->period_timer.function = sched_cfs_period_timer;
>>  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>>  	cfs_b->slack_timer.function = sched_cfs_slack_timer;
>> +	cfs_b->distribute_running = 0;
>>  }
>>  
>>  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 455fa330de04..9683f458aec7 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -346,6 +346,8 @@ struct cfs_bandwidth {
>>  	int			nr_periods;
>>  	int			nr_throttled;
>>  	u64			throttled_time;
>> +
>> +	bool                    distribute_running;
>>  #endif
>>  };
>>  
>> 
>> 
>> -- 


* Re: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
  2018-10-10 17:49   ` bsegall
@ 2018-10-10 18:37     ` Phil Auld
  2018-11-11  9:20       ` Konstantin Khlebnikov
  0 siblings, 1 reply; 8+ messages in thread
From: Phil Auld @ 2018-10-10 18:37 UTC (permalink / raw)
  To: bsegall
  Cc: Ingo Molnar, Joel Fernandes, Steve Muckle, Paul Turner,
	Vincent Guittot, Morten Rasmussen, Peter Zijlstra, linux-kernel,
	stable

On Wed, Oct 10, 2018 at 10:49:25AM -0700 bsegall@google.com wrote:
> Ingo Molnar <mingo@kernel.org> writes:
> 
> > I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the discussion. 
> > Patch quoted below.
> >
> > Looks like a real bug that needs to be fixed - and at first sight the quota of 1000 looks very 
> > low - could we improve the arithmetics perhaps?
> >
> > A low quota of 1000 is used because there's many VMs or containers provisioned on the system 
> > that is triggering the bug, right?
> >
> > Thanks,
> >
> > 	Ingo
> >
> > * Phil Auld <pauld@redhat.com> wrote:
> >
> >> From: "Phil Auld" <pauld@redhat.com>
> >> 
> >> sched/fair: Avoid throttle_list starvation with low cfs quota
> >> 
> >> With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
> >> distribute_cfs_runtime may not empty the throttled_list before it runs 
> >> out of runtime to distribute. In that case, due to the change from 
> >> c06f04c7048 to put throttled entries at the head of the list, later entries 
> >> on the list will starve.  Essentially, the same X processes will get pulled 
> >> off the list, given CPU time and then, when expired, get put back on the 
> >> head of the list where distribute_cfs_runtime will give runtime to the same 
> >> set of processes leaving the rest.
> >> 
> >> Fix the issue by setting a bit in struct cfs_bandwidth when 
> >> distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
> >> decide to put the throttled entry on the tail or the head of the list.  The 
> >> bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
> >> cfs_bandwidth->lock. 
> >> 
> >> Signed-off-by: Phil Auld <pauld@redhat.com>
> >> Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
> >> Cc: Peter Zijlstra <peterz@infradead.org>
> >> Cc: Ingo Molnar <mingo@kernel.org>
> >> Cc: stable@vger.kernel.org
> 
> Reviewed-by: Ben Segall <bsegall@google.com>
> 
> 
> In theory this does mean the unfairness could still happen if distribute is still
> running, but while a tiny quota makes it more likely, the fact that
> we're not getting through much of the list makes it not really a worry.
> If you wanted to be even more careful there could be some generation
> counter or something, but it doesn't seem necessary.
> 

Thanks for the review.  Yeah, I thought about a few other approaches, not explicitly
what you suggested, but they all complicated things. This one seemed the closest to 
"obviously correct".


Cheers,
Phil


> 
> >> ---
> >> 
> >> This is easy to reproduce with a handful of cpu consumers. I use crash on 
> >> the live system. In some cases you can simply look at the throttled list and 
> >> see the later entries are not changing:
> >> 
> >> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
> >>   1     ffff90b56cb2d200  -976050
> >>   2     ffff90b56cb2cc00  -484925
> >>   3     ffff90b56cb2bc00  -658814
> >>   4     ffff90b56cb2ba00  -275365
> >>   5     ffff90b166a45600  -135138
> >>   6     ffff90b56cb2da00  -282505
> >>   7     ffff90b56cb2e000  -148065
> >>   8     ffff90b56cb2fa00  -872591
> >>   9     ffff90b56cb2c000  -84687
> >>  10     ffff90b56cb2f000  -87237
> >>  11     ffff90b166a40a00  -164582
> >> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
> >>   1     ffff90b56cb2d200  -994147
> >>   2     ffff90b56cb2cc00  -306051
> >>   3     ffff90b56cb2bc00  -961321
> >>   4     ffff90b56cb2ba00  -24490
> >>   5     ffff90b166a45600  -135138
> >>   6     ffff90b56cb2da00  -282505
> >>   7     ffff90b56cb2e000  -148065
> >>   8     ffff90b56cb2fa00  -872591
> >>   9     ffff90b56cb2c000  -84687
> >>  10     ffff90b56cb2f000  -87237
> >>  11     ffff90b166a40a00  -164582
> >> 
> >> Sometimes it is easier to see by finding a process getting starved and looking 
> >> at the sched_info:
> >> 
> >> crash> task ffff8eb765994500 sched_info
> >> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
> >>   sched_info = {
> >>     pcount = 8, 
> >>     run_delay = 697094208, 
> >>     last_arrival = 240260125039, 
> >>     last_queued = 240260327513
> >>   }, 
> >> crash> task ffff8eb765994500 sched_info
> >> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
> >>   sched_info = {
> >>     pcount = 8, 
> >>     run_delay = 697094208, 
> >>     last_arrival = 240260125039, 
> >>     last_queued = 240260327513
> >>   }, 
> >> 
> >> 
> >>  fair.c  |   22 +++++++++++++++++++---
> >>  sched.h |    2 ++
> >>  2 files changed, 21 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 7fc4a371bdd2..f88e00705b55 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> >>  
> >>  	/*
> >>  	 * Add to the _head_ of the list, so that an already-started
> >> -	 * distribute_cfs_runtime will not see us
> >> +	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
> >> +	 * not running, add to the tail so that later runqueues don't get starved.
> >>  	 */
> >> -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> >> +	if (cfs_b->distribute_running)
> >> +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> >> +	else
> >> +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> >>
> >>  	/*
> >>  	 * If we're the first throttled task, make sure the bandwidth
> >> @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> >>  	 * in us over-using our runtime if it is all used during this loop, but
> >>  	 * only by limited amounts in that extreme case.
> >>  	 */
> >> -	while (throttled && cfs_b->runtime > 0) {
> >> +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
> >>  		runtime = cfs_b->runtime;
> >> +		cfs_b->distribute_running = 1;
> >>  		raw_spin_unlock(&cfs_b->lock);
> >>  		/* we can't nest cfs_b->lock while distributing bandwidth */
> >>  		runtime = distribute_cfs_runtime(cfs_b, runtime,
> >>  						 runtime_expires);
> >>  		raw_spin_lock(&cfs_b->lock);
> >>  
> >> +		cfs_b->distribute_running = 0;
> >>  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
> >>  
> >>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> >> @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >>  
> >>  	/* confirm we're still not at a refresh boundary */
> >>  	raw_spin_lock(&cfs_b->lock);
> >> +	if (cfs_b->distribute_running) {
> >> +		raw_spin_unlock(&cfs_b->lock);
> >> +		return;
> >> +	}
> >> +
> >>  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
> >>  		raw_spin_unlock(&cfs_b->lock);
> >>  		return;
> >> @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >>  		runtime = cfs_b->runtime;
> >>  
> >>  	expires = cfs_b->runtime_expires;
> >> +	if (runtime)
> >> +		cfs_b->distribute_running = 1;
> >> +
> >>  	raw_spin_unlock(&cfs_b->lock);
> >>  
> >>  	if (!runtime)
> >> @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
> >>  	raw_spin_lock(&cfs_b->lock);
> >>  	if (expires == cfs_b->runtime_expires)
> >>  		cfs_b->runtime -= min(runtime, cfs_b->runtime);
> >> +	cfs_b->distribute_running = 0;
> >>  	raw_spin_unlock(&cfs_b->lock);
> >>  }
> >>  
> >> @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> >>  	cfs_b->period_timer.function = sched_cfs_period_timer;
> >>  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> >>  	cfs_b->slack_timer.function = sched_cfs_slack_timer;
> >> +	cfs_b->distribute_running = 0;
> >>  }
> >>  
> >>  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >> index 455fa330de04..9683f458aec7 100644
> >> --- a/kernel/sched/sched.h
> >> +++ b/kernel/sched/sched.h
> >> @@ -346,6 +346,8 @@ struct cfs_bandwidth {
> >>  	int			nr_periods;
> >>  	int			nr_throttled;
> >>  	u64			throttled_time;
> >> +
> >> +	bool                    distribute_running;
> >>  #endif
> >>  };
> >>  
> >> 
> >> 
> >> -- 

-- 


* [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota
  2018-10-08 14:36 [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota Phil Auld
  2018-10-09  8:32 ` Ingo Molnar
@ 2018-10-11 10:39 ` tip-bot for Phil Auld
  2018-10-11 11:12 ` tip-bot for Phil Auld
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Phil Auld @ 2018-10-11 10:39 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: torvalds, mingo, linux-kernel, hpa, peterz, tglx, pauld

Commit-ID:  8b48300108248e950cde0bdc5708039fc3836623
Gitweb:     https://git.kernel.org/tip/8b48300108248e950cde0bdc5708039fc3836623
Author:     Phil Auld <pauld@redhat.com>
AuthorDate: Mon, 8 Oct 2018 10:36:40 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 11 Oct 2018 11:18:32 +0200

sched/fair: Fix throttle_list starvation with low CFS quota

With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change from
c06f04c7048 to put throttled entries at the head of the list, later entries
on the list will starve.  Essentially, the same X processes will get pulled
off the list, given CPU time and then, when expired, get put back on the
head of the list where distribute_cfs_runtime will give runtime to the same
set of processes leaving the rest.

Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can
decide to put the throttled entry on the tail or the head of the list.  The
bit is set/cleared by the callers of distribute_cfs_runtime while they hold
cfs_bandwidth->lock.

This is easy to reproduce with a handful of CPU consumers. I use 'crash' on
the live system. In some cases you can simply look at the throttled list and
see the later entries are not changing:

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -976050
    2     ffff90b56cb2cc00  -484925
    3     ffff90b56cb2bc00  -658814
    4     ffff90b56cb2ba00  -275365
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -994147
    2     ffff90b56cb2cc00  -306051
    3     ffff90b56cb2bc00  -961321
    4     ffff90b56cb2ba00  -24490
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and looking
at the sched_info:

  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },
  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },

Signed-off-by: Phil Auld <pauld@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Link: http://lkml.kernel.org/r/20181008143639.GA4019@pauld.bos.csb
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 22 +++++++++++++++++++---
 kernel/sched/sched.h |  2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool                    distribute_running;
 #endif
 };
 


* [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota
  2018-10-08 14:36 [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota Phil Auld
  2018-10-09  8:32 ` Ingo Molnar
  2018-10-11 10:39 ` [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota tip-bot for Phil Auld
@ 2018-10-11 11:12 ` tip-bot for Phil Auld
  2 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Phil Auld @ 2018-10-11 11:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, bsegall, mingo, tglx, peterz, torvalds, pauld, hpa

Commit-ID:  baa9be4ffb55876923dc9716abc0a448e510ba30
Gitweb:     https://git.kernel.org/tip/baa9be4ffb55876923dc9716abc0a448e510ba30
Author:     Phil Auld <pauld@redhat.com>
AuthorDate: Mon, 8 Oct 2018 10:36:40 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 11 Oct 2018 13:10:18 +0200

sched/fair: Fix throttle_list starvation with low CFS quota

With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change from
c06f04c7048 to put throttled entries at the head of the list, later entries
on the list will starve.  Essentially, the same X processes will get pulled
off the list, given CPU time and then, when expired, get put back on the
head of the list where distribute_cfs_runtime will give runtime to the same
set of processes leaving the rest.

Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can
decide to put the throttled entry on the tail or the head of the list.  The
bit is set/cleared by the callers of distribute_cfs_runtime while they hold
cfs_bandwidth->lock.

This is easy to reproduce with a handful of CPU consumers. I use 'crash' on
the live system. In some cases you can simply look at the throttled list and
see the later entries are not changing:

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -976050
    2     ffff90b56cb2cc00  -484925
    3     ffff90b56cb2bc00  -658814
    4     ffff90b56cb2ba00  -275365
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -994147
    2     ffff90b56cb2cc00  -306051
    3     ffff90b56cb2bc00  -961321
    4     ffff90b56cb2ba00  -24490
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and looking
at the sched_info:

  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },
  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },

Signed-off-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Link: http://lkml.kernel.org/r/20181008143639.GA4019@pauld.bos.csb
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 22 +++++++++++++++++++---
 kernel/sched/sched.h |  2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool                    distribute_running;
 #endif
 };
 


* Re: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
  2018-10-10 18:37     ` Phil Auld
@ 2018-11-11  9:20       ` Konstantin Khlebnikov
  0 siblings, 0 replies; 8+ messages in thread
From: Konstantin Khlebnikov @ 2018-11-11  9:20 UTC (permalink / raw)
  To: Phil Auld, bsegall
  Cc: Ingo Molnar, Joel Fernandes, Steve Muckle, Paul Turner,
	Vincent Guittot, Morten Rasmussen, Peter Zijlstra, linux-kernel,
	stable

On 10.10.2018 21:37, Phil Auld wrote:
> On Wed, Oct 10, 2018 at 10:49:25AM -0700 bsegall@google.com wrote:
>> Ingo Molnar <mingo@kernel.org> writes:
>>
>>> I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the discussion.
>>> Patch quoted below.
>>>
>>> Looks like a real bug that needs to be fixed - and at first sight the quota of 1000 looks very
>>> low - could we improve the arithmetics perhaps?
>>>
>>> A low quota of 1000 is used because there's many VMs or containers provisioned on the system
>>> that is triggering the bug, right?
>>>
>>> Thanks,
>>>
>>> 	Ingo
>>>
>>> * Phil Auld <pauld@redhat.com> wrote:
>>>
>>>> From: "Phil Auld" <pauld@redhat.com>
>>>>
>>>> sched/fair: Avoid throttle_list starvation with low cfs quota
>>>>
>>>> With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
>>>> distribute_cfs_runtime may not empty the throttled_list before it runs
>>>> out of runtime to distribute. In that case, due to the change from
>>>> c06f04c7048 to put throttled entries at the head of the list, later entries
>>>> on the list will starve.  Essentially, the same X processes will get pulled
>>>> off the list, given CPU time and then, when expired, get put back on the
>>>> head of the list where distribute_cfs_runtime will give runtime to the same
>>>> set of processes leaving the rest.
>>>>
>>>> Fix the issue by setting a bit in struct cfs_bandwidth when
>>>> distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can
>>>> decide to put the throttled entry on the tail or the head of the list.  The
>>>> bit is set/cleared by the callers of distribute_cfs_runtime while they hold
>>>> cfs_bandwidth->lock.
>>>>
>>>> Signed-off-by: Phil Auld <pauld@redhat.com>
>>>> Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
>>>> Cc: Peter Zijlstra <peterz@infradead.org>
>>>> Cc: Ingo Molnar <mingo@kernel.org>
>>>> Cc: stable@vger.kernel.org
>>
>> Reviewed-by: Ben Segall <bsegall@google.com>
>>
>>
>> In theory this does mean the unfairness could still happen if distribute is still
>> running, but while a tiny quota makes it more likely, the fact that
>> we're not getting through much of the list makes it not really a worry.
>> If you wanted to be even more careful there could be some generation
>> counter or something, but it doesn't seem necessary.
>>
> 
> Thanks for the review.  Yeah, I thought about a few other approaches, not explicitly
> what you suggested, but they all complicated things. This one seemed the closest to
> "obviously correct".

I sent a patch about the same problem a couple of years ago:
https://lore.kernel.org/patchwork/patch/750523/

I think my approach is closer to FIFO and more fair.
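
To make the comparison concrete, here is a rough sketch of the generation-counter
variant Ben floated above, which would keep pure tail (FIFO) insertion: the pass
bumps a counter under cfs_b->lock before walking the list, each cfs_rq records the
counter value when it is throttled, and the walk skips anything throttled during
the current pass. This is purely illustrative, with made-up names; it is not the
code from this patch or from the patch linked above.

#include <stdint.h>
#include <stdbool.h>

struct bw_gen {
	uint64_t distribute_gen;	/* bumped under cfs_b->lock at the start of a pass */
};

struct rq_gen {
	uint64_t throttled_gen;		/* value of distribute_gen at throttle time */
};

/* Throttle path: always add to the tail, just remember the generation. */
static void throttle_gen(struct bw_gen *b, struct rq_gen *rq)
{
	rq->throttled_gen = b->distribute_gen;
	/* ...add to the tail of the throttled list as usual... */
}

/* Distribute path: entries throttled during this pass wait for the next one. */
static bool skip_this_pass(const struct bw_gen *b, const struct rq_gen *rq)
{
	return rq->throttled_gen == b->distribute_gen;
}

The cost would be an extra field per cfs_rq and a comparison in the loop, against
the single flag used by the patch above.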

> 
> 
> Cheers,
> Phil
> 
> 
>>
>>>> ---
>>>>
>>>> This is easy to reproduce with a handful of cpu consumers. I use crash on
>>>> the live system. In some cases you can simply look at the throttled list and
>>>> see the later entries are not changing:
>>>>
>>>> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>>>>    1     ffff90b56cb2d200  -976050
>>>>    2     ffff90b56cb2cc00  -484925
>>>>    3     ffff90b56cb2bc00  -658814
>>>>    4     ffff90b56cb2ba00  -275365
>>>>    5     ffff90b166a45600  -135138
>>>>    6     ffff90b56cb2da00  -282505
>>>>    7     ffff90b56cb2e000  -148065
>>>>    8     ffff90b56cb2fa00  -872591
>>>>    9     ffff90b56cb2c000  -84687
>>>>   10     ffff90b56cb2f000  -87237
>>>>   11     ffff90b166a40a00  -164582
>>>> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>>>>    1     ffff90b56cb2d200  -994147
>>>>    2     ffff90b56cb2cc00  -306051
>>>>    3     ffff90b56cb2bc00  -961321
>>>>    4     ffff90b56cb2ba00  -24490
>>>>    5     ffff90b166a45600  -135138
>>>>    6     ffff90b56cb2da00  -282505
>>>>    7     ffff90b56cb2e000  -148065
>>>>    8     ffff90b56cb2fa00  -872591
>>>>    9     ffff90b56cb2c000  -84687
>>>>   10     ffff90b56cb2f000  -87237
>>>>   11     ffff90b166a40a00  -164582
>>>>
>>>> Sometimes it is easier to see by finding a process getting starved and looking
>>>> at the sched_info:
>>>>
>>>> crash> task ffff8eb765994500 sched_info
>>>> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>>>>    sched_info = {
>>>>      pcount = 8,
>>>>      run_delay = 697094208,
>>>>      last_arrival = 240260125039,
>>>>      last_queued = 240260327513
>>>>    },
>>>> crash> task ffff8eb765994500 sched_info
>>>> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>>>>    sched_info = {
>>>>      pcount = 8,
>>>>      run_delay = 697094208,
>>>>      last_arrival = 240260125039,
>>>>      last_queued = 240260327513
>>>>    },
>>>>
>>>>
>>>>   fair.c  |   22 +++++++++++++++++++---
>>>>   sched.h |    2 ++
>>>>   2 files changed, 21 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 7fc4a371bdd2..f88e00705b55 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>>>>   
>>>>   	/*
>>>>   	 * Add to the _head_ of the list, so that an already-started
>>>> -	 * distribute_cfs_runtime will not see us
>>>> +	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
>>>> +	 * not running, add to the tail so that later runqueues don't get starved.
>>>>   	 */
>>>> -	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>>>> +	if (cfs_b->distribute_running)
>>>> +		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>>>> +	else
>>>> +		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>>>>
>>>>   	/*
>>>>   	 * If we're the first throttled task, make sure the bandwidth
>>>> @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>>>>   	 * in us over-using our runtime if it is all used during this loop, but
>>>>   	 * only by limited amounts in that extreme case.
>>>>   	 */
>>>> -	while (throttled && cfs_b->runtime > 0) {
>>>> +	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
>>>>   		runtime = cfs_b->runtime;
>>>> +		cfs_b->distribute_running = 1;
>>>>   		raw_spin_unlock(&cfs_b->lock);
>>>>   		/* we can't nest cfs_b->lock while distributing bandwidth */
>>>>   		runtime = distribute_cfs_runtime(cfs_b, runtime,
>>>>   						 runtime_expires);
>>>>   		raw_spin_lock(&cfs_b->lock);
>>>>   
>>>> +		cfs_b->distribute_running = 0;
>>>>   		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
>>>>   
>>>>   		cfs_b->runtime -= min(runtime, cfs_b->runtime);
>>>> @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>>>   
>>>>   	/* confirm we're still not at a refresh boundary */
>>>>   	raw_spin_lock(&cfs_b->lock);
>>>> +	if (cfs_b->distribute_running) {
>>>> +		raw_spin_unlock(&cfs_b->lock);
>>>> +		return;
>>>> +	}
>>>> +
>>>>   	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
>>>>   		raw_spin_unlock(&cfs_b->lock);
>>>>   		return;
>>>> @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>>>   		runtime = cfs_b->runtime;
>>>>   
>>>>   	expires = cfs_b->runtime_expires;
>>>> +	if (runtime)
>>>> +		cfs_b->distribute_running = 1;
>>>> +
>>>>   	raw_spin_unlock(&cfs_b->lock);
>>>>   
>>>>   	if (!runtime)
>>>> @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
>>>>   	raw_spin_lock(&cfs_b->lock);
>>>>   	if (expires == cfs_b->runtime_expires)
>>>>   		cfs_b->runtime -= min(runtime, cfs_b->runtime);
>>>> +	cfs_b->distribute_running = 0;
>>>>   	raw_spin_unlock(&cfs_b->lock);
>>>>   }
>>>>   
>>>> @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>>>>   	cfs_b->period_timer.function = sched_cfs_period_timer;
>>>>   	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>>>>   	cfs_b->slack_timer.function = sched_cfs_slack_timer;
>>>> +	cfs_b->distribute_running = 0;
>>>>   }
>>>>   
>>>>   static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>>> index 455fa330de04..9683f458aec7 100644
>>>> --- a/kernel/sched/sched.h
>>>> +++ b/kernel/sched/sched.h
>>>> @@ -346,6 +346,8 @@ struct cfs_bandwidth {
>>>>   	int			nr_periods;
>>>>   	int			nr_throttled;
>>>>   	u64			throttled_time;
>>>> +
>>>> +	bool                    distribute_running;
>>>>   #endif
>>>>   };
>>>>   
>>>>
>>>>
>>>> -- 
> 



end of thread, other threads:[~2018-11-11  9:20 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-08 14:36 [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota Phil Auld
2018-10-09  8:32 ` Ingo Molnar
2018-10-09 12:44   ` Phil Auld
2018-10-10 17:49   ` bsegall
2018-10-10 18:37     ` Phil Auld
2018-11-11  9:20       ` Konstantin Khlebnikov
2018-10-11 10:39 ` [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota tip-bot for Phil Auld
2018-10-11 11:12 ` tip-bot for Phil Auld
