From: Shaohua Li <shli@fb.com>
To: <linux-kernel@vger.kernel.org>, <linux-block@vger.kernel.org>
Cc: <axboe@kernel.dk>, <tj@kernel.org>,
	Vivek Goyal <vgoyal@redhat.com>,
	"jmoyer @ redhat . com" <jmoyer@redhat.com>, <Kernel-team@fb.com>
Subject: [PATCH V2 04/13] blk-throttle: weight based throttling
Date: Mon, 22 Feb 2016 14:01:19 -0800	[thread overview]
Message-ID: <5284a154ff79b8abf99d5661a96a7af96842b7be.1456178093.git.shli@fb.com> (raw)
In-Reply-To: <cover.1456178093.git.shli@fb.com>

We know the total bandwidth of a disk and can derive a cgroup's share of
it from the cgroup's weight relative to its siblings. From that share we
can easily calculate the cgroup's bandwidth limit.
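
For illustration only, here is a minimal user-space sketch of that
arithmetic. The SHARE_SHIFT fixed-point scheme and the share/bandwidth
formulas are the ones added by this patch; the simplified struct, the
calc_share() helper name, the weights and the 200 MB/s disk figure are
made-up inputs:

	#include <stdint.h>
	#include <stdio.h>

	#define SHARE_SHIFT	14
	#define MAX_SHARE	(1 << SHARE_SHIFT)

	struct sq {
		unsigned int weight;		/* this group's weight */
		unsigned int children_weight;	/* sum of the children's weights */
		unsigned int share;		/* fixed-point share of disk bandwidth */
	};

	/* share = parent's share scaled by weight / sum of sibling weights */
	static unsigned int calc_share(const struct sq *parent,
				       const struct sq *child)
	{
		unsigned int s;

		s = parent->share * child->weight / parent->children_weight;
		return s ? s : 1;
	}

	int main(void)
	{
		/* the root service queue owns the whole disk: MAX_SHARE */
		struct sq root = { .children_weight = 100 + 300,
				   .share = MAX_SHARE };
		struct sq a = { .weight = 100 };	/* DFT_WEIGHT */
		struct sq b = { .weight = 300 };
		uint64_t disk_bw = 200ULL << 20;	/* assume 200 MB/s */

		a.share = calc_share(&root, &a);	/* 4096, i.e. 1/4 */
		b.share = calc_share(&root, &b);	/* 12288, i.e. 3/4 */

		/* cgroup bandwidth = disk bandwidth scaled by its share */
		printf("a: %llu MB/s\n", (unsigned long long)
		       (((disk_bw * a.share) >> SHARE_SHIFT) >> 20));
		printf("b: %llu MB/s\n", (unsigned long long)
		       (((disk_bw * b.share) >> SHARE_SHIFT) >> 20));
		return 0;
	}

With a weight of 100 against a sibling of weight 300, cgroup "a" ends up
with 50 MB/s and "b" with 150 MB/s, mirroring what tg_update_share() and
tg_update_perf() compute below.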

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 156 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 90f937f..fafe9c2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2,6 +2,7 @@
  * Interface for controlling IO bandwidth on a request queue
  *
  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
+ * Proportional throttle - Shaohua Li <shli@kernel.org>
  */
 
 #include <linux/module.h>
@@ -12,6 +13,12 @@
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
+#define MAX_WEIGHT (10000)
+#define MIN_WEIGHT (1)
+#define DFT_WEIGHT (100)
+#define SHARE_SHIFT (14)
+#define MAX_SHARE (1 << SHARE_SHIFT)
+
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
 
@@ -74,6 +81,10 @@ struct throtl_service_queue {
 	unsigned int		nr_pending;	/* # queued in the tree */
 	unsigned long		first_pending_disptime;	/* disptime of the first tg */
 	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
+
+	unsigned int		weight; /* this queue's weight against siblings */
+	unsigned int		children_weight; /* sum of children's weights */
+	unsigned int		share; /* disk bandwidth share of the queue */
 };
 
 enum tg_state_flags {
@@ -139,6 +150,22 @@ struct throtl_grp {
 	unsigned long slice_end[2];
 };
 
+enum run_mode {
+	MODE_NONE = 0,
+	MODE_THROTTLE = 1, /* bandwidth/iops based throttle */
+	/* below are weight based */
+	MODE_WEIGHT_BANDWIDTH = 2,
+	MODE_WEIGHT_IOPS = 3,
+	MAX_MODE = 4,
+};
+
+static char *run_mode_name[MAX_MODE] = {
+	[MODE_NONE] = "none",
+	[MODE_THROTTLE] = "throttle",
+	[MODE_WEIGHT_BANDWIDTH] = "weight_bw",
+	[MODE_WEIGHT_IOPS] = "weight_iops",
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -156,8 +183,14 @@ struct throtl_data
 
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
+	enum run_mode mode;
 };
 
+static bool td_weight_based(struct throtl_data *td)
+{
+	return td->mode > MODE_THROTTLE;
+}
+
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -209,9 +242,33 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 
 static inline int tg_data_index(struct throtl_grp *tg, bool rw)
 {
+	if (td_weight_based(tg->td))
+		return 0;
 	return rw;
 }
 
+static inline uint64_t queue_bandwidth(struct throtl_data *td)
+{
+	uint64_t bw;
+
+	bw = td->queue->disk_bw * 512;
+	/* can't estimate bandwidth, can't do proportional control */
+	if (bw == 0)
+		bw = -1;
+	return bw;
+}
+
+static inline uint64_t queue_iops(struct throtl_data *td)
+{
+	uint64_t iops;
+
+	iops = td->queue->disk_iops;
+	/* can't estimate iops, can't do proportional control */
+	if (iops == 0)
+		iops = -1;
+	return iops;
+}
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -386,6 +443,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	sq->parent_sq = &td->service_queue;
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+	sq->weight = DFT_WEIGHT;
+	sq->parent_sq->children_weight += sq->weight;
 	tg->td = td;
 }
 
@@ -406,7 +465,8 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 
 	for (i = READ; i <= WRITE; i++)
 		tg->has_rules[i] = (parent_tg && parent_tg->has_rules[i]) ||
-				    io_cost_has_limit(tg, i);
+				    io_cost_has_limit(tg, i) ||
+				    (td_weight_based(tg->td));
 }
 
 static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -421,6 +481,10 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
+	struct throtl_service_queue *sq = &tg->service_queue;
+
+	if (sq->parent_sq)
+		sq->parent_sq->children_weight -= sq->weight;
 
 	del_timer_sync(&tg->service_queue.pending_timer);
 	kfree(tg);
@@ -812,6 +876,10 @@ static bool io_cost_with_in_limit(struct throtl_grp *tg, struct bio *bio,
 {
 	unsigned long bps_wait = 0, iops_wait = 0;
 
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH)
+		return io_cost_with_in_bps_limit(tg, bio, wait);
+	if (tg->td->mode == MODE_WEIGHT_IOPS)
+		return io_cost_with_in_iops_limit(tg, bio, wait);
 	if (io_cost_with_in_bps_limit(tg, bio, &bps_wait) &&
 	    io_cost_with_in_iops_limit(tg, bio, &iops_wait)) {
 		*wait = 0;
@@ -967,6 +1035,77 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
 
 }
 
+static void tg_update_perf(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *sq;
+	u64 new_bps, abs_bps = 0;
+	unsigned int new_iops, abs_iops = 0;
+
+	sq = &tg->service_queue;
+
+	/* '/' cgroup in cgroup2 */
+	if (!td_weight_based(tg->td) || !sq->parent_sq ||
+	    (cgroup_subsys_on_dfl(io_cgrp_subsys) && !tg_to_blkg(tg)->parent))
+		return;
+
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH) {
+		new_bps = max_t(uint64_t,
+			(queue_bandwidth(tg->td) * sq->share) >> SHARE_SHIFT,
+			1024);
+		if (new_bps > tg->io_cost.bps[0])
+			abs_bps = new_bps - tg->io_cost.bps[0];
+		if (new_bps < tg->io_cost.bps[0])
+			abs_bps = tg->io_cost.bps[0] - new_bps;
+		if (abs_bps > (tg->io_cost.bps[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.bps[0] = new_bps;
+		tg->io_cost.iops[0] = -1;
+	} else {
+		new_iops = max_t(uint64_t,
+			(queue_iops(tg->td) * sq->share) >> SHARE_SHIFT,
+			1);
+		if (new_iops > tg->io_cost.iops[0])
+			abs_iops = new_iops - tg->io_cost.iops[0];
+		if (new_iops < tg->io_cost.iops[0])
+			abs_iops = tg->io_cost.iops[0] - new_iops;
+		if (abs_iops > (tg->io_cost.iops[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.iops[0] = new_iops;
+		tg->io_cost.bps[0] = -1;
+	}
+}
+
+/* update the shares of tg, its siblings and their descendants */
+static void tg_update_share(struct throtl_data *td, struct throtl_grp *tg)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg, *parent_blkg;
+	struct throtl_grp *child;
+
+	if (!td_weight_based(td))
+		return;
+	if (!tg || !tg->service_queue.parent_sq ||
+	    !tg->service_queue.parent_sq->parent_sq)
+		parent_blkg = td->queue->root_blkg;
+	else
+		parent_blkg = tg_to_blkg(sq_to_tg(tg->service_queue.parent_sq));
+
+	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
+		struct throtl_service_queue *sq;
+
+		child = blkg_to_tg(blkg);
+		sq = &child->service_queue;
+
+		if (!sq->parent_sq)
+			continue;
+
+		sq->share = max_t(unsigned int,
+			sq->parent_sq->share * sq->weight /
+				sq->parent_sq->children_weight,
+			1);
+	}
+}
+
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1014,11 +1153,18 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
-	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	unsigned int max_nr_reads;
+	unsigned int max_nr_writes;
 	struct bio *bio;
 
-	/* Try to dispatch 75% READS and 25% WRITES */
+	if (td_weight_based(tg->td)) {
+		max_nr_reads = throtl_grp_quantum;
+		max_nr_writes = 0;
+	} else {
+		/* Try to dispatch 75% READS and 25% WRITES */
+		max_nr_reads = throtl_grp_quantum * 3 / 4;
+		max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	}
 
 	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
@@ -1039,6 +1185,9 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 		if (nr_writes >= max_nr_writes)
 			break;
 	}
+	if (nr_reads + nr_writes) {
+		tg_update_perf(tg);
+	}
 
 	return nr_reads + nr_writes;
 }
@@ -1494,6 +1643,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[index])
 			break;
+		tg_update_perf(tg);
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL))
@@ -1639,6 +1789,8 @@ int blk_throtl_init(struct request_queue *q)
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
+	td->service_queue.share = MAX_SHARE;
+	td->mode = MODE_NONE;
 
 	q->td = td;
 	td->queue = q;
-- 
2.6.5

Thread overview: 16+ messages
2016-02-22 22:01 [PATCH V2 00/13] block-throttle: proportional throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 01/13] block: estimate disk performance Shaohua Li
2016-02-22 22:01 ` [PATCH V2 02/13] blk-throttle: cleanup io cost related stuff Shaohua Li
2016-02-22 22:01 ` [PATCH V2 03/13] blk-throttle: add abstract to index data Shaohua Li
2016-02-22 22:01 ` Shaohua Li [this message]
2016-02-22 22:01 ` [PATCH V2 05/13] blk-throttling: detect inactive cgroup Shaohua Li
2016-02-22 22:01 ` [PATCH V2 06/13] blk-throttle: add per-cgroup data Shaohua Li
2016-02-22 22:01 ` [PATCH V2 07/13] blk-throttle: add interface for proporation based throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 08/13] blk-throttle: add cgroup2 interface Shaohua Li
2016-02-22 22:01 ` [PATCH V2 09/13] blk-throttle: add trace for new proporation throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 10/13] blk-throttle: over estimate bandwidth Shaohua Li
2016-02-22 22:01 ` [PATCH V2 11/13] blk-throttle: shrink cgroup share if its target is overestimated Shaohua Li
2016-02-22 22:01 ` [PATCH V2 12/13] blk-throttle: restore shrinked cgroup share Shaohua Li
2016-02-22 22:01 ` [PATCH V2 13/13] blk-throttle: detect wrong shrink Shaohua Li
2016-02-28 15:02 ` [PATCH V2 00/13] block-throttle: proportional throttle Pavel Machek
2016-03-01  5:19   ` Shaohua Li
