All of lore.kernel.org
 help / color / mirror / Atom feed
From: Shaohua Li <shli@fb.com>
To: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Cc: <axboe@fb.com>, <Kernel-team@fb.com>, <tj@kernel.org>,
	<jmoyer@redhat.com>, <vgoyal@redhat.com>
Subject: [PATCH v3 05/11] block-throttle: add downgrade logic
Date: Mon, 3 Oct 2016 14:20:24 -0700	[thread overview]
Message-ID: <ec1ea93a29a885c823da34217fb0ad7b83a982ab.1475529372.git.shli@fb.com> (raw)
In-Reply-To: <cover.1475529372.git.shli@fb.com>

When queue state machine is in LIMIT_MAX state, but a cgroup is below
its high limit for some time, the queue should be downgraded to lower
state as one cgroup's high limit isn't met.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index db2ea15..41b75a3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -136,6 +136,13 @@ struct throtl_grp {
 	/* Number of bio's dispatched in current slice */
 	unsigned int io_disp[2];
 
+	unsigned long last_high_overflow_time[2];
+
+	uint64_t last_bytes_disp[2];
+	unsigned int last_io_disp[2];
+
+	unsigned long last_check_time;
+
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -155,6 +162,9 @@ struct throtl_data
 	struct work_struct dispatch_work;
 	unsigned int limit_index;
 	bool limit_valid[LIMIT_CNT];
+
+	unsigned long high_upgrade_time;
+	unsigned long high_downgrade_time;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -896,6 +906,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
 	tg->io_disp[rw]++;
+	tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+	tg->last_io_disp[rw]++;
 
 	/*
 	 * REQ_THROTTLED is used to prevent the same bio to be throttled
@@ -1510,6 +1522,64 @@ static struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+static unsigned long __tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long rtime = -1, wtime = -1;
+	if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+	    tg->iops[READ][LIMIT_HIGH] != -1 ||
+	    tg->bps[READ][LIMIT_MAX] != -1 ||
+	    tg->iops[READ][LIMIT_MAX] != -1)
+		rtime = tg->last_high_overflow_time[READ];
+	if (tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->bps[WRITE][LIMIT_MAX] != -1 ||
+	    tg->iops[WRITE][LIMIT_MAX] != -1)
+		wtime = tg->last_high_overflow_time[WRITE];
+	return min(rtime, wtime) == -1 ? 0 : min(rtime, wtime);
+}
+
+static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *parent_sq;
+	struct throtl_grp *parent = tg;
+	unsigned long ret = __tg_last_high_overflow_time(tg);
+
+	while (true) {
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+		if (!parent)
+			break;
+		if (((parent->bps[READ][LIMIT_HIGH] != -1 &&
+		      parent->bps[READ][LIMIT_HIGH] >=
+		       tg->bps[READ][LIMIT_HIGH]) ||
+		     (parent->bps[READ][LIMIT_HIGH] == -1 &&
+		      parent->bps[READ][LIMIT_MAX] >=
+		       tg->bps[READ][LIMIT_HIGH])) &&
+		    ((parent->bps[WRITE][LIMIT_HIGH] != -1 &&
+		      parent->bps[WRITE][LIMIT_HIGH] >=
+		       tg->bps[WRITE][LIMIT_HIGH]) ||
+		     (parent->bps[WRITE][LIMIT_HIGH] == -1 &&
+		      parent->bps[WRITE][LIMIT_MAX] >=
+		       tg->bps[WRITE][LIMIT_HIGH])) &&
+		    ((parent->iops[READ][LIMIT_HIGH] != -1 &&
+		      parent->iops[READ][LIMIT_HIGH] >=
+		       tg->iops[READ][LIMIT_HIGH]) ||
+		     (parent->iops[READ][LIMIT_HIGH] == -1 &&
+		      parent->iops[READ][LIMIT_MAX] >=
+		       tg->iops[READ][LIMIT_HIGH])) &&
+		    ((parent->iops[WRITE][LIMIT_HIGH] != -1 &&
+		      parent->iops[WRITE][LIMIT_HIGH] >=
+		       tg->iops[WRITE][LIMIT_HIGH]) ||
+		     (parent->iops[WRITE][LIMIT_HIGH] == -1 &&
+		      parent->iops[WRITE][LIMIT_MAX] >=
+		       tg->iops[WRITE][LIMIT_HIGH])))
+			break;
+		if (time_after(__tg_last_high_overflow_time(parent), ret))
+			ret = __tg_last_high_overflow_time(parent);
+	}
+	return ret;
+}
+
 static bool throtl_upgrade_check_one(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1557,6 +1627,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
 	if (td->limit_index != LIMIT_HIGH)
 		return false;
 
+	if (time_before(jiffies, td->high_downgrade_time + throtl_slice))
+		return false;
+
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1580,6 +1653,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	struct blkcg_gq *blkg;
 
 	td->limit_index = LIMIT_MAX;
+	td->high_upgrade_time = jiffies;
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1595,6 +1669,111 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->limit_index = new;
+	td->high_downgrade_time = jiffies;
+}
+
+static bool throtl_downgrade_check_one(struct throtl_grp *tg)
+{
+	struct throtl_data *td = tg->td;
+	unsigned long now = jiffies;
+
+	/*
+	 * If cgroup is below high limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (time_after_eq(now, td->high_upgrade_time + throtl_slice) &&
+	    time_after_eq(now, tg_last_high_overflow_time(tg) + throtl_slice))
+		return true;
+	return false;
+}
+
+static bool throtl_downgrade_check_hierarchy(struct throtl_grp *tg)
+{
+	if (!throtl_downgrade_check_one(tg))
+		return false;
+	while (true) {
+		if (!tg || (cgroup_subsys_on_dfl(io_cgrp_subsys) &&
+			    !tg_to_blkg(tg)->parent))
+			break;
+
+		if (!throtl_downgrade_check_one(tg))
+			return false;
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+	}
+	return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	uint64_t bps;
+	unsigned int iops;
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_HIGH])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + throtl_slice, now))
+		return;
+
+	if (time_before(now, tg_last_high_overflow_time(tg) + throtl_slice))
+		return;
+
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+	    tg->bps[READ][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[READ] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[READ][LIMIT_HIGH] ||
+		    bps >= tg->bps[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->bps[WRITE][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[WRITE] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[WRITE][LIMIT_HIGH] ||
+		    bps >= tg->bps[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	if (tg->iops[READ][LIMIT_HIGH] != -1 ||
+	    tg->iops[READ][LIMIT_MAX] != -1) {
+		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+		if (iops >= tg->iops[READ][LIMIT_HIGH] ||
+		    iops >= tg->iops[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->iops[WRITE][LIMIT_MAX] != -1) {
+		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+		if (iops >= tg->iops[WRITE][LIMIT_HIGH] ||
+		    iops >= tg->iops[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * If cgroup is below high limit, consider downgrade and throttle other
+	 * cgroups
+	 */
+	if (throtl_downgrade_check_hierarchy(tg))
+		throtl_downgrade_state(tg->td, LIMIT_HIGH);
+
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1619,12 +1798,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 again:
 	while (true) {
+		if (tg->last_high_overflow_time[rw] == 0)
+			tg->last_high_overflow_time[rw] = jiffies;
+		throtl_downgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[rw])
 			break;
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL)) {
+			tg->last_high_overflow_time[rw] = jiffies;
 			if (throtl_can_upgrade(tg->td, tg)) {
 				throtl_upgrade_state(tg->td);
 				goto again;
@@ -1667,6 +1850,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		   tg->io_disp[rw], tg_iops_limit(tg, rw),
 		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
+	tg->last_high_overflow_time[rw] = jiffies;
+
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
@@ -1778,6 +1963,8 @@ int blk_throtl_init(struct request_queue *q)
 	td->limit_valid[LIMIT_HIGH] = false;
 	td->limit_valid[LIMIT_MAX] = true;
 	td->limit_index = LIMIT_MAX;
+	td->high_upgrade_time = jiffies;
+	td->high_downgrade_time = jiffies;
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
-- 
2.9.3


  parent reply	other threads:[~2016-10-03 21:20 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-03 21:20 [PATCH V3 00/11] block-throttle: add .high limit Shaohua Li
2016-10-03 21:20 ` [PATCH v3 01/11] block-throttle: prepare support multiple limits Shaohua Li
2016-10-03 21:20 ` [PATCH v3 02/11] block-throttle: add .high interface Shaohua Li
2016-10-03 21:20 ` [PATCH v3 03/11] block-throttle: configure bps/iops limit for cgroup in high limit Shaohua Li
2016-10-03 21:20 ` [PATCH v3 04/11] block-throttle: add upgrade logic for LIMIT_HIGH state Shaohua Li
2016-10-03 21:20 ` Shaohua Li [this message]
2016-10-03 21:20 ` [PATCH v3 06/11] blk-throttle: make sure expire time isn't too big Shaohua Li
2016-10-03 21:20 ` [PATCH v3 07/11] blk-throttle: make throtl_slice tunable Shaohua Li
2016-10-03 21:20 ` [PATCH v3 08/11] blk-throttle: detect completed idle cgroup Shaohua Li
2016-10-03 21:20 ` [PATCH v3 09/11] block-throttle: make bandwidth change smooth Shaohua Li
2016-10-03 21:20 ` [PATCH v3 10/11] block-throttle: add a simple idle detection Shaohua Li
2016-10-03 21:20 ` [PATCH v3 11/11] blk-throttle: ignore idle cgroup limit Shaohua Li
2016-10-04 13:28 ` [PATCH V3 00/11] block-throttle: add .high limit Vivek Goyal
2016-10-04 15:56   ` Tejun Heo
2016-10-04 16:22     ` Paolo Valente
2016-10-04 16:27       ` Tejun Heo
2016-10-04 17:01         ` Paolo Valente
2016-10-04 17:28           ` Shaohua Li
2016-10-04 17:43             ` Paolo Valente
2016-10-04 18:28               ` Shaohua Li
2016-10-04 19:49                 ` Paolo Valente
2016-10-04 18:54               ` Tejun Heo
2016-10-04 19:02                 ` Paolo Valente
2016-10-04 19:14                   ` Tejun Heo
2016-10-04 19:29                     ` Paolo Valente
2016-10-04 20:27                       ` Tejun Heo
2016-10-05 12:37                         ` Paolo Valente
2016-10-05 13:12                           ` Vivek Goyal
2016-10-05 14:04                             ` Paolo Valente
2016-10-05 14:49                           ` Tejun Heo
2016-10-05 18:30                             ` Shaohua Li
2016-10-05 19:08                               ` Shaohua Li
2016-10-05 19:57                                 ` Paolo Valente
2016-10-05 20:36                                   ` Shaohua Li
2016-10-06  7:22                                     ` Paolo Valente
2016-10-05 19:47                               ` Paolo Valente
2016-10-05 20:07                                 ` Paolo Valente
2016-10-05 20:46                                 ` Shaohua Li
2016-10-06  7:58                                   ` Paolo Valente
2016-10-06 13:15                                     ` Paolo Valente
2016-10-06 17:49                                       ` Vivek Goyal
2016-10-06 18:01                                         ` Paolo Valente
2016-10-06 18:32                                           ` Vivek Goyal
2016-10-06 20:51                                             ` Paolo Valente
2016-10-06 19:44                                         ` Mark Brown
2016-10-06 19:57                                     ` Shaohua Li
2016-10-06 22:24                                       ` Paolo Valente
     [not found]                         ` <CACsaVZ+AqSXHTRdpdrQQp6PuynEPeB-5YOyweWsenjvuKsD12w@mail.gmail.com>
2016-10-09  1:15                           ` Fwd: " Kyle Sanderson
2016-10-14 16:40                             ` Tejun Heo
2016-10-14 17:13                               ` Paolo Valente
2016-10-14 18:35                                 ` Tejun Heo
2016-10-16 19:02                                   ` Paolo Valente
2016-10-18  5:15                                     ` Kyle Sanderson
2016-10-06  8:04                     ` Linus Walleij
2016-10-06 11:03                       ` Mark Brown
2016-10-06 11:57                         ` Austin S. Hemmelgarn
2016-10-06 12:50                           ` Paolo Valente
2016-10-06 13:52                             ` Austin S. Hemmelgarn
2016-10-06 15:05                               ` Paolo Valente
2016-10-06 15:10                                 ` Austin S. Hemmelgarn
2016-10-08 10:46                       ` Heinz Diehl
2016-10-04 18:12     ` Vivek Goyal
2016-10-04 18:50       ` Tejun Heo
2016-10-04 18:56         ` Paolo Valente
2016-10-04 17:08   ` Shaohua Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ec1ea93a29a885c823da34217fb0ad7b83a982ab.1475529372.git.shli@fb.com \
    --to=shli@fb.com \
    --cc=Kernel-team@fb.com \
    --cc=axboe@fb.com \
    --cc=jmoyer@redhat.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.