LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Phil Auld <pauld@redhat.com>
To: Peter Zijlstra <peterz@infradead.org>, Ingo Molnar <mingo@kernel.org>
Cc: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Subject: [Patch] sched/fair: Avoid throttle_list starvation with low cfs quota
Date: Mon, 8 Oct 2018 10:36:40 -0400
Message-ID: <20181008143639.GA4019@pauld.bos.csb> (raw)

From: "Phil Auld" <pauld@redhat.com>

sched/fair: Avoid throttle_list starvation with low cfs quota

With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
distribute_cfs_runtime may not empty the throttled_list before it runs 
out of runtime to distribute. In that case, due to the change from 
c06f04c70489 to put throttled entries at the head of the list, later entries 
on the list will starve.  Essentially, the same X processes will get pulled 
off the list, given CPU time and then, when expired, get put back on the 
head of the list where distribute_cfs_runtime will give runtime to the same 
set of processes leaving the rest.

Fix the issue by setting a bit in struct cfs_bandwidth when 
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
decide to put the throttled entry on the tail or the head of the list.  The 
bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
cfs_bandwidth->lock. 

Signed-off-by: Phil Auld <pauld@redhat.com>
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
---

This is easy to reproduce with a handful of cpu consumers. I use crash on 
the live system. In some cases you can simply look at the throttled list and 
see the later entries are not changing:

crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
  1     ffff90b56cb2d200  -976050
  2     ffff90b56cb2cc00  -484925
  3     ffff90b56cb2bc00  -658814
  4     ffff90b56cb2ba00  -275365
  5     ffff90b166a45600  -135138
  6     ffff90b56cb2da00  -282505
  7     ffff90b56cb2e000  -148065
  8     ffff90b56cb2fa00  -872591
  9     ffff90b56cb2c000  -84687
 10     ffff90b56cb2f000  -87237
 11     ffff90b166a40a00  -164582
crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
  1     ffff90b56cb2d200  -994147
  2     ffff90b56cb2cc00  -306051
  3     ffff90b56cb2bc00  -961321
  4     ffff90b56cb2ba00  -24490
  5     ffff90b166a45600  -135138
  6     ffff90b56cb2da00  -282505
  7     ffff90b56cb2e000  -148065
  8     ffff90b56cb2fa00  -872591
  9     ffff90b56cb2c000  -84687
 10     ffff90b56cb2f000  -87237
 11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and looking 
at the sched_info:

crash> task ffff8eb765994500 sched_info
PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
  sched_info = {
    pcount = 8, 
    run_delay = 697094208, 
    last_arrival = 240260125039, 
    last_queued = 240260327513
  }, 
crash> task ffff8eb765994500 sched_info
PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
  sched_info = {
    pcount = 8, 
    run_delay = 697094208, 
    last_arrival = 240260125039, 
    last_queued = 240260327513
  }, 


 fair.c  |   22 +++++++++++++++++++---
 sched.h |    2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool                    distribute_running;
 #endif
 };
 


-- 

             reply index

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-08 14:36 Phil Auld [this message]
2018-10-09  8:32 ` Ingo Molnar
2018-10-09 12:44   ` Phil Auld
2018-10-10 17:49   ` bsegall
2018-10-10 18:37     ` Phil Auld
2018-11-11  9:20       ` Konstantin Khlebnikov
2018-10-11 10:39 ` [tip:sched/urgent] sched/fair: Fix throttle_list starvation with low CFS quota tip-bot for Phil Auld
2018-10-11 11:12 ` tip-bot for Phil Auld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181008143639.GA4019@pauld.bos.csb \
    --to=pauld@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git