All of lore.kernel.org
 help / color / mirror / Atom feed
From: Shaohua Li <shli@fb.com>
To: <linux-kernel@vger.kernel.org>, <linux-block@vger.kernel.org>
Cc: <axboe@kernel.dk>, <tj@kernel.org>,
	Vivek Goyal <vgoyal@redhat.com>, <jmoyer@redhat.com>,
	<Kernel-team@fb.com>
Subject: [PATCH V7 06/18] blk-throttle: add upgrade logic for LIMIT_LOW state
Date: Mon, 27 Mar 2017 10:51:34 -0700	[thread overview]
Message-ID: <8efed3f0170b56b3e040b089879d0b0178e54330.1490634565.git.shli@fb.com> (raw)
In-Reply-To: <cover.1490634565.git.shli@fb.com>

When the queue is in the LIMIT_LOW state and all cgroups with a low limit
have crossed their bps/iops limit, we upgrade the queue's state to
LIMIT_MAX. To determine whether a cgroup exceeds its limit, we check whether
the cgroup has pending requests. Since a cgroup is throttled according to
its limit, a pending request means the cgroup has reached the limit.

If a cgroup has limits set for both read and write, we consider the
combination of them for upgrade. The reason is that read IO and write IO can
interfere with each other. If we did the upgrade based on IO in one
direction only, IO in the other direction could be severely harmed.

For a cgroup hierarchy, there are two cases. In the first, the children have
a lower low limit than the parent, so the parent's low limit is meaningless:
if the children's bps/iops cross their low limit, we can upgrade the queue
state. In the other case, the children have a higher low limit than the
parent, so the children's low limit is meaningless: as long as the parent's
bps/iops (which is the sum of the children's bps/iops) cross the low limit,
we can upgrade the queue state.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1fade50..dd382d8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -457,6 +457,7 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
 	td->limit_valid[LIMIT_LOW] = low_valid;
 }
 
+static void throtl_upgrade_state(struct throtl_data *td);
 static void throtl_pd_offline(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
@@ -468,9 +469,8 @@ static void throtl_pd_offline(struct blkg_policy_data *pd)
 
 	blk_throtl_update_limit_valid(tg->td);
 
-	if (tg->td->limit_index == LIMIT_LOW &&
-	    !tg->td->limit_valid[LIMIT_LOW])
-		tg->td->limit_index = LIMIT_MAX;
+	if (!tg->td->limit_valid[tg->td->limit_index])
+		throtl_upgrade_state(tg->td);
 }
 
 static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -1081,6 +1081,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 	return nr_disp;
 }
 
+static bool throtl_can_upgrade(struct throtl_data *td,
+	struct throtl_grp *this_tg);
 /**
  * throtl_pending_timer_fn - timer function for service_queue->pending_timer
  * @arg: the throtl_service_queue being serviced
@@ -1107,6 +1109,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
 	int ret;
 
 	spin_lock_irq(q->queue_lock);
+	if (throtl_can_upgrade(td, NULL))
+		throtl_upgrade_state(td);
+
 again:
 	parent_sq = sq->parent_sq;
 	dispatched = false;
@@ -1522,6 +1527,87 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *sq = &tg->service_queue;
+	bool read_limit, write_limit;
+
+	/*
+	 * if cgroup reaches low limit (if low limit is 0, the cgroup always
+	 * reaches), it's ok to upgrade to next limit
+	 */
+	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+	if (!read_limit && !write_limit)
+		return true;
+	if (read_limit && sq->nr_queued[READ] &&
+	    (!write_limit || sq->nr_queued[WRITE]))
+		return true;
+	if (write_limit && sq->nr_queued[WRITE] &&
+	    (!read_limit || sq->nr_queued[READ]))
+		return true;
+	return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+	while (true) {
+		if (throtl_tg_can_upgrade(tg))
+			return true;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+		if (!tg || !tg_to_blkg(tg)->parent)
+			return false;
+	}
+	return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+	struct throtl_grp *this_tg)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	if (td->limit_index != LIMIT_LOW)
+		return false;
+
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (tg == this_tg)
+			continue;
+		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+			continue;
+		if (!throtl_hierarchy_can_upgrade(tg)) {
+			rcu_read_unlock();
+			return false;
+		}
+	}
+	rcu_read_unlock();
+	return true;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
+
+	td->limit_index = LIMIT_MAX;
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+		struct throtl_service_queue *sq = &tg->service_queue;
+
+		tg->disptime = jiffies - 1;
+		throtl_select_dispatch(sq);
+		throtl_schedule_next_dispatch(sq, false);
+	}
+	rcu_read_unlock();
+	throtl_select_dispatch(&td->service_queue);
+	throtl_schedule_next_dispatch(&td->service_queue, false);
+	queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1544,14 +1630,20 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	sq = &tg->service_queue;
 
+again:
 	while (true) {
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
 
 		/* if above limits, break to queue */
-		if (!tg_may_dispatch(tg, bio, NULL))
+		if (!tg_may_dispatch(tg, bio, NULL)) {
+			if (throtl_can_upgrade(tg->td, tg)) {
+				throtl_upgrade_state(tg->td);
+				goto again;
+			}
 			break;
+		}
 
 		/* within limits, let's charge and dispatch directly */
 		throtl_charge_bio(tg, bio);
-- 
2.9.3

  parent reply	other threads:[~2017-03-27 17:51 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-03-27 17:51 [PATCH V7 00/18] blk-throttle: add .low limit Shaohua Li
2017-03-27 17:51 ` [PATCH V7 01/18] blk-throttle: use U64_MAX/UINT_MAX to replace -1 Shaohua Li
2017-03-27 17:51 ` [PATCH V7 02/18] blk-throttle: prepare support multiple limits Shaohua Li
2017-03-27 17:51 ` [PATCH V7 03/18] blk-throttle: add configure option for new .low interface Shaohua Li
2017-03-27 17:51 ` [PATCH V7 04/18] blk-throttle: add " Shaohua Li
2017-03-27 17:51 ` [PATCH V7 05/18] blk-throttle: configure bps/iops limit for cgroup in low limit Shaohua Li
2017-03-27 17:51 ` Shaohua Li [this message]
2017-03-27 17:51 ` [PATCH V7 07/18] blk-throttle: add downgrade logic Shaohua Li
2017-03-27 17:51 ` [PATCH V7 08/18] blk-throttle: make sure expire time isn't too big Shaohua Li
2017-03-27 17:51 ` [PATCH V7 09/18] blk-throttle: make throtl_slice tunable Shaohua Li
2017-03-27 17:51 ` [PATCH V7 10/18] blk-throttle: choose a small throtl_slice for SSD Shaohua Li
2017-03-27 17:51 ` [PATCH V7 11/18] blk-throttle: detect completed idle cgroup Shaohua Li
2017-03-27 17:51 ` [PATCH V7 12/18] blk-throttle: make bandwidth change smooth Shaohua Li
2017-03-27 17:51 ` [PATCH V7 13/18] blk-throttle: add a simple idle detection Shaohua Li
2017-03-27 17:51 ` [PATCH V7 14/18] blk-throttle: add interface to configure idle time threshold Shaohua Li
2017-03-27 17:51 ` [PATCH V7 15/18] blk-throttle: ignore idle cgroup limit Shaohua Li
2017-03-27 17:51 ` [PATCH V7 16/18] blk-throttle: add interface for per-cgroup target latency Shaohua Li
2017-03-27 17:51 ` [PATCH V7 17/18] blk-throttle: add a mechanism to estimate IO latency Shaohua Li
2017-03-27 17:51 ` [PATCH V7 18/18] blk-throttle: add latency target support Shaohua Li
2017-03-27 18:15 ` [PATCH V7 00/18] blk-throttle: add .low limit Jens Axboe
2017-03-27 19:00   ` Shaohua Li
2017-03-27 19:11     ` Jens Axboe
2017-03-27 22:19       ` [PATCH 0/3] blk-throttle: add .low limit fix Shaohua Li
2017-03-27 22:19         ` [PATCH 1/3] block: track request size in blk_issue_stat Shaohua Li
2017-03-27 22:19         ` [PATCH 2/3] blk-throttle: add a mechanism to estimate IO latency Shaohua Li
2017-03-27 22:19         ` [PATCH 3/3] blk-throttle: add latency target support Shaohua Li
2017-03-28 15:58         ` [PATCH 0/3] blk-throttle: add .low limit fix Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8efed3f0170b56b3e040b089879d0b0178e54330.1490634565.git.shli@fb.com \
    --to=shli@fb.com \
    --cc=Kernel-team@fb.com \
    --cc=axboe@kernel.dk \
    --cc=jmoyer@redhat.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.