From: Shaohua Li <shli@fb.com>
To: <linux-kernel@vger.kernel.org>, <linux-block@vger.kernel.org>
Cc: <axboe@kernel.dk>, <tj@kernel.org>,
	Vivek Goyal <vgoyal@redhat.com>,
	"jmoyer @ redhat . com" <jmoyer@redhat.com>, <Kernel-team@fb.com>
Subject: [PATCH V2 04/13] blk-throttle: weight based throttling
Date: Mon, 22 Feb 2016 14:01:19 -0800	[thread overview]
Message-ID: <5284a154ff79b8abf99d5661a96a7af96842b7be.1456178093.git.shli@fb.com> (raw)
In-Reply-To: <cover.1456178093.git.shli@fb.com>

We know the total bandwidth of a disk and can derive a cgroup's share of
it from the cgroup's weight relative to its siblings. From that share we
can easily calculate the cgroup's bandwidth limit.
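
For illustration only, here is a minimal user-space sketch of that
arithmetic. The SHARE_SHIFT fixed-point scheme and the share/bandwidth
formulas are the ones added by this patch; the simplified struct, the
calc_share() helper name, the weights and the 200 MB/s disk figure are
made-up inputs:

	#include <stdint.h>
	#include <stdio.h>

	#define SHARE_SHIFT	14
	#define MAX_SHARE	(1 << SHARE_SHIFT)

	struct sq {
		unsigned int weight;		/* this group's weight */
		unsigned int children_weight;	/* sum of the children's weights */
		unsigned int share;		/* fixed-point share of disk bandwidth */
	};

	/* share = parent's share scaled by weight / sum of sibling weights */
	static unsigned int calc_share(const struct sq *parent,
				       const struct sq *child)
	{
		unsigned int s;

		s = parent->share * child->weight / parent->children_weight;
		return s ? s : 1;
	}

	int main(void)
	{
		/* the root service queue owns the whole disk: MAX_SHARE */
		struct sq root = { .children_weight = 100 + 300,
				   .share = MAX_SHARE };
		struct sq a = { .weight = 100 };	/* DFT_WEIGHT */
		struct sq b = { .weight = 300 };
		uint64_t disk_bw = 200ULL << 20;	/* assume 200 MB/s */

		a.share = calc_share(&root, &a);	/* 4096, i.e. 1/4 */
		b.share = calc_share(&root, &b);	/* 12288, i.e. 3/4 */

		/* cgroup bandwidth = disk bandwidth scaled by its share */
		printf("a: %llu MB/s\n", (unsigned long long)
		       (((disk_bw * a.share) >> SHARE_SHIFT) >> 20));
		printf("b: %llu MB/s\n", (unsigned long long)
		       (((disk_bw * b.share) >> SHARE_SHIFT) >> 20));
		return 0;
	}

With a weight of 100 against a sibling of weight 300, cgroup "a" ends up
with 50 MB/s and "b" with 150 MB/s, mirroring what tg_update_share() and
tg_update_perf() compute below.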

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/blk-throttle.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 156 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 90f937f..fafe9c2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2,6 +2,7 @@
  * Interface for controlling IO bandwidth on a request queue
  *
  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
+ * Proportional throttle - Shaohua Li <shli@kernel.org>
  */
 
 #include <linux/module.h>
@@ -12,6 +13,12 @@
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
+#define MAX_WEIGHT (10000)
+#define MIN_WEIGHT (1)
+#define DFT_WEIGHT (100)
+#define SHARE_SHIFT (14)
+#define MAX_SHARE (1 << SHARE_SHIFT)
+
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
 
@@ -74,6 +81,10 @@ struct throtl_service_queue {
 	unsigned int		nr_pending;	/* # queued in the tree */
 	unsigned long		first_pending_disptime;	/* disptime of the first tg */
 	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
+
+	unsigned int		weight; /* this queue's weight against siblings */
+	unsigned int		children_weight; /* sum of children's weights */
+	unsigned int		share; /* disk bandwidth share of the queue */
 };
 
 enum tg_state_flags {
@@ -139,6 +150,22 @@ struct throtl_grp {
 	unsigned long slice_end[2];
 };
 
+enum run_mode {
+	MODE_NONE = 0,
+	MODE_THROTTLE = 1, /* bandwidth/iops based throttle */
+	/* below are weight based */
+	MODE_WEIGHT_BANDWIDTH = 2,
+	MODE_WEIGHT_IOPS = 3,
+	MAX_MODE = 4,
+};
+
+static char *run_mode_name[MAX_MODE] = {
+	[MODE_NONE] = "none",
+	[MODE_THROTTLE] = "throttle",
+	[MODE_WEIGHT_BANDWIDTH] = "weight_bw",
+	[MODE_WEIGHT_IOPS] = "weight_iops",
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -156,8 +183,14 @@ struct throtl_data
 
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
+	enum run_mode mode;
 };
 
+static bool td_weight_based(struct throtl_data *td)
+{
+	return td->mode > MODE_THROTTLE;
+}
+
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -209,9 +242,33 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 
 static inline int tg_data_index(struct throtl_grp *tg, bool rw)
 {
+	if (td_weight_based(tg->td))
+		return 0;
 	return rw;
 }
 
+static inline uint64_t queue_bandwidth(struct throtl_data *td)
+{
+	uint64_t bw;
+
+	bw = td->queue->disk_bw * 512;
+	/* can't estimate bandwidth, can't do proportional control */
+	if (bw == 0)
+		bw = -1;
+	return bw;
+}
+
+static inline uint64_t queue_iops(struct throtl_data *td)
+{
+	uint64_t iops;
+
+	iops = td->queue->disk_iops;
+	/* can't estimate iops, can't do proportional control */
+	if (iops == 0)
+		iops = -1;
+	return iops;
+}
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -386,6 +443,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	sq->parent_sq = &td->service_queue;
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+	sq->weight = DFT_WEIGHT;
+	sq->parent_sq->children_weight += sq->weight;
 	tg->td = td;
 }
 
@@ -406,7 +465,8 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 
 	for (i = READ; i <= WRITE; i++)
 		tg->has_rules[i] = (parent_tg && parent_tg->has_rules[i]) ||
-				    io_cost_has_limit(tg, i);
+				    io_cost_has_limit(tg, i) ||
+				    (td_weight_based(tg->td));
 }
 
 static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -421,6 +481,10 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
+	struct throtl_service_queue *sq = &tg->service_queue;
+
+	if (sq->parent_sq)
+		sq->parent_sq->children_weight -= sq->weight;
 
 	del_timer_sync(&tg->service_queue.pending_timer);
 	kfree(tg);
@@ -812,6 +876,10 @@ static bool io_cost_with_in_limit(struct throtl_grp *tg, struct bio *bio,
 {
 	unsigned long bps_wait = 0, iops_wait = 0;
 
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH)
+		return io_cost_with_in_bps_limit(tg, bio, wait);
+	if (tg->td->mode == MODE_WEIGHT_IOPS)
+		return io_cost_with_in_iops_limit(tg, bio, wait);
 	if (io_cost_with_in_bps_limit(tg, bio, &bps_wait) &&
 	    io_cost_with_in_iops_limit(tg, bio, &iops_wait)) {
 		*wait = 0;
@@ -967,6 +1035,77 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
 
 }
 
+static void tg_update_perf(struct throtl_grp *tg)
+{
+	struct throtl_service_queue *sq;
+	u64 new_bps, abs_bps = 0;
+	unsigned int new_iops, abs_iops = 0;
+
+	sq = &tg->service_queue;
+
+	/* '/' cgroup in cgroup2 */
+	if (!td_weight_based(tg->td) || !sq->parent_sq ||
+	    (cgroup_subsys_on_dfl(io_cgrp_subsys) && !tg_to_blkg(tg)->parent))
+		return;
+
+	if (tg->td->mode == MODE_WEIGHT_BANDWIDTH) {
+		new_bps = max_t(uint64_t,
+			(queue_bandwidth(tg->td) * sq->share) >> SHARE_SHIFT,
+			1024);
+		if (new_bps > tg->io_cost.bps[0])
+			abs_bps = new_bps - tg->io_cost.bps[0];
+		if (new_bps < tg->io_cost.bps[0])
+			abs_bps = tg->io_cost.bps[0] - new_bps;
+		if (abs_bps > (tg->io_cost.bps[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.bps[0] = new_bps;
+		tg->io_cost.iops[0] = -1;
+	} else {
+		new_iops = max_t(uint64_t,
+			(queue_iops(tg->td) * sq->share) >> SHARE_SHIFT,
+			1);
+		if (new_iops > tg->io_cost.iops[0])
+			abs_iops = new_iops - tg->io_cost.iops[0];
+		if (new_iops < tg->io_cost.iops[0])
+			abs_iops = tg->io_cost.iops[0] - new_iops;
+		if (abs_iops > (tg->io_cost.iops[0] >> 3))
+			throtl_start_new_slice(tg, 0);
+		tg->io_cost.iops[0] = new_iops;
+		tg->io_cost.bps[0] = -1;
+	}
+}
+
+/* update the shares of tg, its siblings and their descendants */
+static void tg_update_share(struct throtl_data *td, struct throtl_grp *tg)
+{
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg, *parent_blkg;
+	struct throtl_grp *child;
+
+	if (!td_weight_based(td))
+		return;
+	if (!tg || !tg->service_queue.parent_sq ||
+	    !tg->service_queue.parent_sq->parent_sq)
+		parent_blkg = td->queue->root_blkg;
+	else
+		parent_blkg = tg_to_blkg(sq_to_tg(tg->service_queue.parent_sq));
+
+	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
+		struct throtl_service_queue *sq;
+
+		child = blkg_to_tg(blkg);
+		sq = &child->service_queue;
+
+		if (!sq->parent_sq)
+			continue;
+
+		sq->share = max_t(unsigned int,
+			sq->parent_sq->share * sq->weight /
+				sq->parent_sq->children_weight,
+			1);
+	}
+}
+
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1014,11 +1153,18 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned int nr_reads = 0, nr_writes = 0;
-	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	unsigned int max_nr_reads;
+	unsigned int max_nr_writes;
 	struct bio *bio;
 
-	/* Try to dispatch 75% READS and 25% WRITES */
+	if (td_weight_based(tg->td)) {
+		max_nr_reads = throtl_grp_quantum;
+		max_nr_writes = 0;
+	} else {
+		/* Try to dispatch 75% READS and 25% WRITES */
+		max_nr_reads = throtl_grp_quantum * 3 / 4;
+		max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	}
 
 	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
@@ -1039,6 +1185,9 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 		if (nr_writes >= max_nr_writes)
 			break;
 	}
+	if (nr_reads + nr_writes) {
+		tg_update_perf(tg);
+	}
 
 	return nr_reads + nr_writes;
 }
@@ -1494,6 +1643,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		/* throtl is FIFO - if bios are already queued, should queue */
 		if (sq->nr_queued[index])
 			break;
+		tg_update_perf(tg);
 
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL))
@@ -1639,6 +1789,8 @@ int blk_throtl_init(struct request_queue *q)
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
+	td->service_queue.share = MAX_SHARE;
+	td->mode = MODE_NONE;
 
 	q->td = td;
 	td->queue = q;
-- 
2.6.5

Thread overview: 16+ messages
2016-02-22 22:01 [PATCH V2 00/13] block-throttle: proportional throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 01/13] block: estimate disk performance Shaohua Li
2016-02-22 22:01 ` [PATCH V2 02/13] blk-throttle: cleanup io cost related stuff Shaohua Li
2016-02-22 22:01 ` [PATCH V2 03/13] blk-throttle: add abstract to index data Shaohua Li
2016-02-22 22:01 ` Shaohua Li [this message]
2016-02-22 22:01 ` [PATCH V2 05/13] blk-throttling: detect inactive cgroup Shaohua Li
2016-02-22 22:01 ` [PATCH V2 06/13] blk-throttle: add per-cgroup data Shaohua Li
2016-02-22 22:01 ` [PATCH V2 07/13] blk-throttle: add interface for proporation based throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 08/13] blk-throttle: add cgroup2 interface Shaohua Li
2016-02-22 22:01 ` [PATCH V2 09/13] blk-throttle: add trace for new proporation throttle Shaohua Li
2016-02-22 22:01 ` [PATCH V2 10/13] blk-throttle: over estimate bandwidth Shaohua Li
2016-02-22 22:01 ` [PATCH V2 11/13] blk-throttle: shrink cgroup share if its target is overestimated Shaohua Li
2016-02-22 22:01 ` [PATCH V2 12/13] blk-throttle: restore shrinked cgroup share Shaohua Li
2016-02-22 22:01 ` [PATCH V2 13/13] blk-throttle: detect wrong shrink Shaohua Li
2016-02-28 15:02 ` [PATCH V2 00/13] block-throttle: proportional throttle Pavel Machek
2016-03-01  5:19   ` Shaohua Li
