linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Shaohua Li <shli@fb.com>
To: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Cc: <kernel-team@fb.com>, <axboe@fb.com>, <tj@kernel.org>,
	<vgoyal@redhat.com>
Subject: [PATCH V5 06/17] blk-throttle: add downgrade logic
Date: Thu, 15 Dec 2016 12:32:57 -0800	[thread overview]
Message-ID: <5c22098d912d73388bc214c1d7e404f25cfdad13.1481833017.git.shli@fb.com> (raw)
In-Reply-To: <cover.1481833017.git.shli@fb.com>

When queue state machine is in LIMIT_MAX state, but a cgroup is below
its low limit for some time, the queue should be downgraded to lower
state as one cgroup's low limit isn't met.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cfd74cfc..0f65fce 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -140,6 +140,13 @@ struct throtl_grp {
 	/* Number of bio's dispatched in current slice */
 	unsigned int io_disp[2];
 
+	unsigned long last_low_overflow_time[2];
+
+	uint64_t last_bytes_disp[2];
+	unsigned int last_io_disp[2];
+
+	unsigned long last_check_time;
+
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -159,6 +166,9 @@ struct throtl_data
 	struct work_struct dispatch_work;
 	unsigned int limit_index;
 	bool limit_valid[LIMIT_CNT];
+
+	unsigned long low_upgrade_time;
+	unsigned long low_downgrade_time;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -896,6 +906,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
 	tg->io_disp[rw]++;
+	tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+	tg->last_io_disp[rw]++;
 
 	/*
 	 * BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -1511,6 +1523,36 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long rtime = -1, wtime = -1;
+
+	if (tg->bps[READ][LIMIT_LOW] != U64_MAX ||
+	    tg->iops[READ][LIMIT_LOW] != UINT_MAX)
+		rtime = tg->last_low_overflow_time[READ];
+	if (tg->bps[WRITE][LIMIT_LOW] != U64_MAX ||
+	    tg->iops[WRITE][LIMIT_LOW] != UINT_MAX)
+		wtime = tg->last_low_overflow_time[WRITE];
+	return min(rtime, wtime) == -1 ? 0 : min(rtime, wtime);
+}
+
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *parent_sq;
+	struct throtl_grp *parent = tg;
+	unsigned long ret = __tg_last_low_overflow_time(tg);
+
+	while (true) {
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+		if (!parent)
+			break;
+		if (time_after(__tg_last_low_overflow_time(parent), ret))
+			ret = __tg_last_low_overflow_time(parent);
+	}
+	return ret;
+}
+
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1555,6 +1597,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
 	if (td->limit_index != LIMIT_LOW)
 		return false;
 
+	if (time_before(jiffies, td->low_downgrade_time + throtl_slice))
+		return false;
+
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1578,6 +1623,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	struct blkcg_gq *blkg;
 
 	td->limit_index = LIMIT_MAX;
+	td->low_upgrade_time = jiffies;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1593,6 +1639,100 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->limit_index = new;
+	td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+	struct throtl_data *td = tg->td;
+	unsigned long now = jiffies;
+
+	/*
+	 * If cgroup is below low limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (time_after_eq(now, td->low_upgrade_time + throtl_slice) &&
+	    time_after_eq(now, tg_last_low_overflow_time(tg) + throtl_slice))
+		return true;
+	return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+	while (true) {
+		if (!throtl_tg_can_downgrade(tg))
+			return false;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+		if (!tg || (cgroup_subsys_on_dfl(io_cgrp_subsys) &&
+			    !tg_to_blkg(tg)->parent))
+			break;
+	}
+	return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	uint64_t bps;
+	unsigned int iops;
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_LOW])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + throtl_slice, now))
+		return;
+
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	if (time_before(now, tg_last_low_overflow_time(tg) + throtl_slice))
+		return;
+
+	if (tg->bps[READ][LIMIT_LOW] != U64_MAX) {
+		bps = tg->last_bytes_disp[READ] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[READ][LIMIT_LOW])
+			tg->last_low_overflow_time[READ] = now;
+	}
+
+	if (tg->bps[WRITE][LIMIT_LOW] != U64_MAX) {
+		bps = tg->last_bytes_disp[WRITE] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[WRITE][LIMIT_LOW])
+			tg->last_low_overflow_time[WRITE] = now;
+	}
+
+	if (tg->iops[READ][LIMIT_LOW] != UINT_MAX) {
+		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+		if (iops >= tg->iops[READ][LIMIT_LOW])
+			tg->last_low_overflow_time[READ] = now;
+	}
+
+	if (tg->iops[WRITE][LIMIT_LOW] != UINT_MAX) {
+		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+		if (iops >= tg->iops[WRITE][LIMIT_LOW])
+			tg->last_low_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * If cgroup is below low limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (throtl_hierarchy_can_downgrade(tg))
+		throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1617,12 +1757,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 again:
 	while (true) {
+		if (tg->last_low_overflow_time[rw] == 0)
+			tg->last_low_overflow_time[rw] = jiffies;
+		throtl_downgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL)) {
+			tg->last_low_overflow_time[rw] = jiffies;
 			if (throtl_can_upgrade(tg->td, tg)) {
 				throtl_upgrade_state(tg->td);
 				goto again;
@@ -1666,6 +1810,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		   tg->io_disp[rw], tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
+	tg->last_low_overflow_time[rw] = jiffies;
+
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
@@ -1776,6 +1922,8 @@ int blk_throtl_init(struct request_queue *q)
 
 	td->limit_valid[LIMIT_MAX] = true;
 	td->limit_index = LIMIT_MAX;
+	td->low_upgrade_time = jiffies;
+	td->low_downgrade_time = jiffies;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
-- 
2.9.3

  parent reply	other threads:[~2016-12-15 20:33 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-15 20:32 [PATCH V5 00/17] blk-throttle: add .low limit Shaohua Li
2016-12-15 20:32 ` [PATCH V5 01/17] blk-throttle: use U64_MAX/UINT_MAX to replace -1 Shaohua Li
2016-12-15 20:32 ` [PATCH V5 02/17] blk-throttle: prepare support multiple limits Shaohua Li
2016-12-15 20:32 ` [PATCH V5 03/17] blk-throttle: add .low interface Shaohua Li
2017-01-09 16:35   ` Tejun Heo
2016-12-15 20:32 ` [PATCH V5 04/17] blk-throttle: configure bps/iops limit for cgroup in low limit Shaohua Li
2017-01-09 17:35   ` Tejun Heo
2016-12-15 20:32 ` [PATCH V5 05/17] blk-throttle: add upgrade logic for LIMIT_LOW state Shaohua Li
2017-01-09 18:40   ` Tejun Heo
2017-01-09 19:46     ` Tejun Heo
2016-12-15 20:32 ` Shaohua Li [this message]
2016-12-15 20:32 ` [PATCH V5 07/17] blk-throttle: make sure expire time isn't too big Shaohua Li
2017-01-09 19:54   ` Tejun Heo
2016-12-15 20:32 ` [PATCH V5 08/17] blk-throttle: make throtl_slice tunable Shaohua Li
2017-01-09 20:08   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 09/17] blk-throttle: detect completed idle cgroup Shaohua Li
2017-01-09 20:13   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 10/17] blk-throttle: make bandwidth change smooth Shaohua Li
2017-01-09 20:28   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 11/17] blk-throttle: add a simple idle detection Shaohua Li
2017-01-09 20:56   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 12/17] blk-throttle: add interface to configure idle time threshold Shaohua Li
2017-01-09 20:58   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 13/17] blk-throttle: ignore idle cgroup limit Shaohua Li
2017-01-09 21:01   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 14/17] blk-throttle: add interface for per-cgroup target latency Shaohua Li
2017-01-09 21:14   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 15/17] block: track request size in blk_issue_stat Shaohua Li
2016-12-16  2:01   ` kbuild test robot
2017-01-09 21:17   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 16/17] blk-throttle: add a mechanism to estimate IO latency Shaohua Li
2017-01-09 21:39   ` Tejun Heo
2016-12-15 20:33 ` [PATCH V5 17/17] blk-throttle: add latency target support Shaohua Li
2017-01-09 21:46 ` [PATCH V5 00/17] blk-throttle: add .low limit Tejun Heo
2017-01-09 22:27   ` Shaohua Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5c22098d912d73388bc214c1d7e404f25cfdad13.1481833017.git.shli@fb.com \
    --to=shli@fb.com \
    --cc=axboe@fb.com \
    --cc=kernel-team@fb.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).