All of lore.kernel.org
 help / color / mirror / Atom feed
From: Shaohua Li <shli@fb.com>
To: <linux-kernel@vger.kernel.org>, <linux-block@vger.kernel.org>
Cc: <axboe@kernel.dk>, <tj@kernel.org>,
	Vivek Goyal <vgoyal@redhat.com>,
	"jmoyer @ redhat . com" <jmoyer@redhat.com>, <Kernel-team@fb.com>
Subject: [PATCH V2 13/13] blk-throttle: detect wrong shrink
Date: Mon, 22 Feb 2016 14:01:28 -0800	[thread overview]
Message-ID: <3bacf5bc1eb353cdb0e690098cb9c3441d21eb98.1456178093.git.shli@fb.com> (raw)
In-Reply-To: <cover.1456178093.git.shli@fb.com>

The last patch detects a wrong shrink in some cases, but not all. Let's
look at an example: cg1 gets a 20% share, cg2 gets an 80% share.

disk max bps 100, cgroup can only dispatch 80 bps
cg1 bps = 20, cg2 bps = 80

This should be a stable state, but we don't know that. As we assign
extra bps to the queue, we will find cg2 doesn't use its full share. So
we try to adjust it, for example, shrink cg2's bps by 5. cg1 will then
get 25 bps. The problem is that the total disk bps is 100, so cg2 can't
dispatch enough IO and its bps will drop to 75. From time to time,
cg2's share will be adjusted. Finally cg1 and cg2 will get the same
bps. The shrink is wrong.

To detect this situation, we record history when a cgroup's share is
shrunk. If we find the cgroup's bps drops, we consider the shrink
wrong and restore the cgroup's share. To avoid fluctuation, we also
disable shrinking for a while after a wrong shrink is detected. If the
shrink was actually right, we will redo it after a while.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7db7d8e..3af41bc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -122,7 +122,15 @@ struct throtl_io_cost {
 
 	uint64_t target_bps;
 	unsigned int target_iops;
+
+	uint64_t last_bps;
+	unsigned int last_iops;
+	unsigned int last_weight;
+	unsigned long last_shrink_time;
+	unsigned long last_wrong_shrink_time;
 };
+#define SHRINK_STABLE_TIME (2 * CGCHECK_TIME)
+#define WRONG_SHRINK_STABLE_TIME (16 * CGCHECK_TIME)
 
 /* per cgroup per device data */
 struct throtl_grp {
@@ -1239,6 +1247,76 @@ static void precheck_tg_activity(struct throtl_grp *tg, unsigned long elapsed_ti
 	}
 }
 
+static bool detect_wrong_shrink(struct throtl_grp *tg, unsigned long elapsed_time)
+{
+	struct throtl_service_queue *sq = &tg->service_queue;
+	struct throtl_service_queue *psq = tg->service_queue.parent_sq;
+	struct throtl_grp *ptg = sq_to_tg(psq);
+	struct blkcg_gq *blkg;
+	struct cgroup_subsys_state *pos_css;
+	uint64_t actual_bps;
+	unsigned int actual_iops;
+	bool ret = false;
+	unsigned long now = jiffies;
+	unsigned int new_weight;
+	bool bandwidth_mode = tg->td->mode == MODE_WEIGHT_BANDWIDTH;
+
+	blkg_for_each_child(blkg, pos_css, tg_to_blkg(ptg)) {
+		struct throtl_grp *child;
+
+		child = blkg_to_tg(blkg);
+		sq = &child->service_queue;
+
+		actual_bps = child->io_cost.act_bps;
+		actual_iops = child->io_cost.act_iops;
+		if (child->io_cost.last_shrink_time) {
+			if (time_before_eq(now, child->io_cost.last_shrink_time +
+			     SHRINK_STABLE_TIME) &&
+			   ((bandwidth_mode && actual_bps < child->io_cost.last_bps &&
+			     child->io_cost.last_bps - actual_bps >
+			      (child->io_cost.last_bps >> 5)) ||
+			    (!bandwidth_mode && actual_iops < child->io_cost.last_iops &&
+			      child->io_cost.last_iops - actual_iops >
+			       (child->io_cost.last_iops >> 5)))) {
+
+				ret = true;
+				/* the cgroup will get 1/4 more share */
+				if (4 * psq->children_weight > 5 * sq->acting_weight) {
+					new_weight = sq->acting_weight *
+					  psq->children_weight / (4 * psq->children_weight -
+					  5 * sq->acting_weight) + sq->acting_weight;
+					if (new_weight > sq->weight)
+						new_weight = sq->weight;
+				} else
+					new_weight = sq->weight;
+
+				psq->children_weight += new_weight -
+					sq->acting_weight;
+				sq->acting_weight = new_weight;
+
+				child->io_cost.last_shrink_time = 0;
+				child->io_cost.last_wrong_shrink_time = now;
+			} else if (time_after(now,
+				   child->io_cost.last_shrink_time + SHRINK_STABLE_TIME))
+				child->io_cost.last_shrink_time = 0;
+		}
+		if (child->io_cost.last_wrong_shrink_time &&
+		    time_after(now, child->io_cost.last_wrong_shrink_time +
+		     WRONG_SHRINK_STABLE_TIME))
+			child->io_cost.last_wrong_shrink_time = 0;
+	}
+	if (!ret)
+		return ret;
+
+	blkg_for_each_child(blkg, pos_css, tg_to_blkg(ptg)) {
+		struct throtl_grp *child;
+
+		child = blkg_to_tg(blkg);
+		child->io_cost.check_weight = false;
+	}
+	return true;
+}
+
 static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_time)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1268,6 +1346,9 @@ static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_
 	    sq->acting_weight == psq->children_weight)
 		return false;
 
+	if (detect_wrong_shrink(tg, elapsed_time))
+		return true;
+
 	blkg_for_each_child(blkg, pos_css, tg_to_blkg(ptg)) {
 		child = blkg_to_tg(blkg);
 		sq = &child->service_queue;
@@ -1277,9 +1358,18 @@ static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_
 		if (sq->acting_weight == sq->weight)
 			none_scaled_weight += sq->acting_weight;
 
+		/* wait it stable */
+		if ((child->io_cost.last_shrink_time && time_before_eq(jiffies,
+		     child->io_cost.last_shrink_time + SHRINK_STABLE_TIME)) ||
+		    (child->io_cost.last_wrong_shrink_time && time_before_eq(jiffies,
+		      child->io_cost.last_wrong_shrink_time +
+		      WRONG_SHRINK_STABLE_TIME)))
+			continue;
+
 		if (bandwidth_mode) {
 			if (child->io_cost.bps[0] == -1)
 				continue;
+
 			if (child->io_cost.act_bps +
 			    (child->io_cost.act_bps >> 3) >= child->io_cost.bps[0])
 				continue;
@@ -1290,6 +1380,8 @@ static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_
 			adjusted_bps = tmp_bps >> 2;
 			child->io_cost.target_bps = child->io_cost.bps[0] -
 				adjusted_bps;
+
+			child->io_cost.last_bps = child->io_cost.act_bps;
 		} else {
 			if (child->io_cost.iops[0] == -1)
 				continue;
@@ -1305,6 +1397,7 @@ static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_
 				adjusted_iops;
 		}
 
+		child->io_cost.last_weight = sq->acting_weight;
 		adjusted_weight += sq->acting_weight;
 		if (sq->acting_weight == sq->weight)
 			none_scaled_weight -= sq->acting_weight;
@@ -1357,6 +1450,8 @@ static bool detect_one_inactive_cg(struct throtl_grp *tg, unsigned long elapsed_
 		}
 		psq->children_weight -= sq->acting_weight - new_weight;
 		sq->acting_weight = new_weight;
+
+		child->io_cost.last_shrink_time = jiffies;
 	}
 
 	return true;
-- 
2.6.5

  parent reply	other threads:[~2016-02-22 22:02 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-22 22:01 [PATCH V2 00/13] block-throttle: proportional throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 01/13] block: estimate disk performance Shaohua Li
2016-02-22 22:01 ` [PATCH V2 02/13] blk-throttle: cleanup io cost related stuff Shaohua Li
2016-02-22 22:01 ` [PATCH V2 03/13] blk-throttle: add abstract to index data Shaohua Li
2016-02-22 22:01 ` [PATCH V2 04/13] blk-throttle: weight based throttling Shaohua Li
2016-02-22 22:01 ` [PATCH V2 05/13] blk-throttling: detect inactive cgroup Shaohua Li
2016-02-22 22:01 ` [PATCH V2 06/13] blk-throttle: add per-cgroup data Shaohua Li
2016-02-22 22:01 ` [PATCH V2 07/13] blk-throttle: add interface for proporation based throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 08/13] blk-throttle: add cgroup2 interface Shaohua Li
2016-02-22 22:01 ` [PATCH V2 09/13] blk-throttle: add trace for new proporation throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 10/13] blk-throttle: over estimate bandwidth Shaohua Li
2016-02-22 22:01 ` [PATCH V2 11/13] blk-throttle: shrink cgroup share if its target is overestimated Shaohua Li
2016-02-22 22:01 ` [PATCH V2 12/13] blk-throttle: restore shrinked cgroup share Shaohua Li
2016-02-22 22:01 ` Shaohua Li [this message]
2016-02-28 15:02 ` [PATCH V2 00/13] block-throttle: proportional throttle Pavel Machek
2016-03-01  5:19   ` Shaohua Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3bacf5bc1eb353cdb0e690098cb9c3441d21eb98.1456178093.git.shli@fb.com \
    --to=shli@fb.com \
    --cc=Kernel-team@fb.com \
    --cc=axboe@kernel.dk \
    --cc=jmoyer@redhat.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.