From: xuejiufei <xuejiufei@gmail.com>
To: Tejun Heo <tj@kernel.org>, Jens Axboe <axboe@fb.com>,
	Shaohua Li <shli@kernel.org>
Cc: linux-block@vger.kernel.org, cgroups@vger.kernel.org,
	jinli.zjl@alibaba-inc.com, qijiang.qj@alibaba-inc.com
Subject: [RFC PATCH] blk-throttle: dispatch more sync writes in block throttle layer
Date: Fri, 5 Jan 2018 13:16:26 +0800	[thread overview]
Message-ID: <f6f59e3c-456a-8275-8af0-32e4f2e5cdd6@gmail.com> (raw)

From: Jiufei Xue <jiufei.xjf@alibaba-inc.com>

Cgroup writeback has been supported since v4.2, but a problem remains
in the following case.

A cgroup may issue both buffered and direct/sync IOs. The foreground
thread is stalled whenever periodic writeback IOs are flushed, because
the service queue in the block throttle layer is already full of
writeback IOs and the foreground IOs must be enqueued behind them due
to the FIFO policy. Since the current policy dispatches 6 reads and
2 writes per round, sync writes can be significantly delayed.

This patch adds another queue to the block throttle layer. A service
queue now holds 3 queues: read, sync write and async write, so more
sync writes than async writes can be dispatched in each round, as
illustrated by the sketch below.
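
To make the policy concrete, here is a minimal user-space sketch
(illustration only, not part of the patch; the main()/printf wrapper
is just for demonstration) of the per-round quota split performed by
throtl_dispatch_tg() with the new quantum of 32:

#include <stdio.h>

/*
 * Sketch of the per-group dispatch quota split introduced by this
 * patch: 3/4 of the quantum for reads, 7/8 of the remainder for sync
 * writes, and the rest for async writes. The old code used a quantum
 * of 8, giving 6 reads and 2 undifferentiated writes per round.
 */
int main(void)
{
	unsigned int quantum = 32;                   /* new throtl_grp_quantum */
	unsigned int max_nr_reads = quantum * 3 / 4;                     /* 24 */
	unsigned int max_sync_writes = (quantum - max_nr_reads) * 7 / 8; /*  7 */
	unsigned int max_async_writes =
		quantum - max_nr_reads - max_sync_writes;                /*  1 */

	printf("reads=%u sync_writes=%u async_writes=%u\n",
	       max_nr_reads, max_sync_writes, max_async_writes);
	return 0;
}

With these ratios a group can dispatch up to 24 reads, 7 sync writes
and 1 async write per round, instead of 6 reads and 2 writes before.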

Test:
1) set up a memcg and a blkcg with an upper bandwidth limit of 20MB/s;
2) start a script that writes a 5G buffer to the page cache and then
   starts sync;
3) start sync writes:
    dd if=/dev/zero of=test bs=4k count=12800 oflag=direct conv=notrunc

Unpatched:
52428800 bytes (52 MB) copied, 307.065 s, 171 kB/s

Patched:
52428800 bytes (52 MB) copied, 13.8192 s, 3.8 MB/s

Signed-off-by: Jiufei Xue <jiufei.xjf@alibaba-inc.com>
---
 block/blk-throttle.c | 195 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 129 insertions(+), 66 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d19f416..842257e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,10 +14,10 @@
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+static int throtl_grp_quantum = 32;
 
 /* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+static int throtl_quantum = 128;
 
 /* Throttling is performed over a slice and after that slice is renewed */
 #define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -43,6 +43,12 @@
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+enum wl_type {
+	READ_WORKLOAD = 0,
+	SYNC_WRITE_WORKLOAD = 1,
+	ASYNC_WRITE_WORKLOAD = 2
+};
+
 /*
  * To implement hierarchical throttling, throtl_grps form a tree and bios
  * are dispatched upwards level by level until they reach the top and get
@@ -79,8 +85,11 @@ struct throtl_service_queue {
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
 	 */
-	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
-	unsigned int		nr_queued[2];	/* number of queued bios */
+	/* throtl_qnode [READ/SYNC_WRITE/ASYNC_WRITE] */
+	struct list_head	queued[3];
+
+	unsigned int		nr_queued[3];	/* number of queued bios */
+
 
 	/*
 	 * RB tree of active children throtl_grp's, which are sorted by
@@ -127,8 +136,8 @@ struct throtl_grp {
 	 * with the sibling qnode_on_parents and the parent's
 	 * qnode_on_self.
 	 */
-	struct throtl_qnode qnode_on_self[2];
-	struct throtl_qnode qnode_on_parent[2];
+	struct throtl_qnode qnode_on_self[3];
+	struct throtl_qnode qnode_on_parent[3];
 
 	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
@@ -202,7 +211,7 @@ struct throtl_data
 	struct request_queue *queue;
 
 	/* Total Number of queued bios on READ and WRITE lists */
-	unsigned int nr_queued[2];
+	unsigned int nr_queued[3];
 
 	unsigned int throtl_slice;
 
@@ -274,6 +283,18 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 		return container_of(sq, struct throtl_data, service_queue);
 }
 
+static inline enum wl_type bio_workload_type(struct bio *bio)
+{
+	return bio_data_dir(bio) ?
+	       ((bio->bi_opf & REQ_SYNC) ? SYNC_WRITE_WORKLOAD :
+	       ASYNC_WRITE_WORKLOAD) : READ_WORKLOAD;
+}
+
+static inline bool wl_to_rw(enum wl_type type)
+{
+	return type >= SYNC_WRITE_WORKLOAD;
+}
+
 /*
  * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
  * make the IO dispatch more smooth.
@@ -475,8 +496,9 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
-	INIT_LIST_HEAD(&sq->queued[0]);
-	INIT_LIST_HEAD(&sq->queued[1]);
+	INIT_LIST_HEAD(&sq->queued[READ_WORKLOAD]);
+	INIT_LIST_HEAD(&sq->queued[SYNC_WRITE_WORKLOAD]);
+	INIT_LIST_HEAD(&sq->queued[ASYNC_WRITE_WORKLOAD]);
 	sq->pending_tree = RB_ROOT;
 	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }
@@ -484,7 +506,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
 	struct throtl_grp *tg;
-	int rw;
+	enum wl_type type;
 
 	tg = kzalloc_node(sizeof(*tg), gfp, node);
 	if (!tg)
@@ -492,9 +514,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 
 	throtl_service_queue_init(&tg->service_queue);
 
-	for (rw = READ; rw <= WRITE; rw++) {
-		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++) {
+		throtl_qnode_init(&tg->qnode_on_self[type], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[type], tg);
 	}
 
 	RB_CLEAR_NODE(&tg->rb_node);
@@ -985,6 +1007,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 			    unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
+	enum wl_type type = bio_workload_type(bio);
 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 
 	/*
@@ -993,8 +1016,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * this function with a different bio if there are other bios
 	 * queued.
 	 */
-	BUG_ON(tg->service_queue.nr_queued[rw] &&
-	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
+	BUG_ON(tg->service_queue.nr_queued[type] &&
+	       bio != throtl_peek_queued(&tg->service_queue.queued[type]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg_bps_limit(tg, rw) == U64_MAX &&
@@ -1011,7 +1034,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * If there is queued bio, that means there should be an active
 	 * slice and it should be extended instead.
 	 */
-	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
+	if (throtl_slice_used(tg, rw) &&
+	   ((rw == READ &&
+	    !tg->service_queue.nr_queued[READ_WORKLOAD]) ||
+	    (rw == WRITE &&
+	    !tg->service_queue.nr_queued[SYNC_WRITE_WORKLOAD] &&
+	    !tg->service_queue.nr_queued[ASYNC_WRITE_WORKLOAD])))
 		throtl_start_new_slice(tg, rw);
 	else {
 		if (time_before(tg->slice_end[rw],
@@ -1072,10 +1100,10 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
 			      struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
-	bool rw = bio_data_dir(bio);
+	int type = bio_workload_type(bio);
 
 	if (!qn)
-		qn = &tg->qnode_on_self[rw];
+		qn = &tg->qnode_on_self[type];
 
 	/*
 	 * If @tg doesn't currently have any bios queued in the same
@@ -1083,12 +1111,12 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
 	 * dispatched.  Mark that @tg was empty.  This is automatically
 	 * cleaered on the next tg_update_disptime().
 	 */
-	if (!sq->nr_queued[rw])
+	if (!sq->nr_queued[type])
 		tg->flags |= THROTL_TG_WAS_EMPTY;
 
-	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+	throtl_qnode_add_bio(bio, qn, &sq->queued[type]);
 
-	sq->nr_queued[rw]++;
+	sq->nr_queued[type]++;
 	throtl_enqueue_tg(tg);
 }
 
@@ -1096,16 +1124,22 @@ static void tg_update_disptime(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+	unsigned long sync_write_wait = -1, async_write_wait = -1;
 	struct bio *bio;
 
-	bio = throtl_peek_queued(&sq->queued[READ]);
+	bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD]);
 	if (bio)
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	bio = throtl_peek_queued(&sq->queued[WRITE]);
+	bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD]);
+	if (bio)
+		tg_may_dispatch(tg, bio, &sync_write_wait);
+
+	bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD]);
 	if (bio)
-		tg_may_dispatch(tg, bio, &write_wait);
+		tg_may_dispatch(tg, bio, &async_write_wait);
 
+	write_wait = min(sync_write_wait, async_write_wait);
 	min_wait = min(read_wait, write_wait);
 	disptime = jiffies + min_wait;
 
@@ -1125,16 +1159,16 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
 		throtl_start_new_slice_with_credit(parent_tg, rw,
 				child_tg->slice_start[rw]);
 	}
-
 }
 
-static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, enum wl_type type)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct throtl_service_queue *parent_sq = sq->parent_sq;
 	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
 	struct throtl_grp *tg_to_put = NULL;
 	struct bio *bio;
+	bool rw = wl_to_rw(type);
 
 	/*
 	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
@@ -1142,8 +1176,8 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 * getting released prematurely.  Remember the tg to put and put it
 	 * after @bio is transferred to @parent_sq.
 	 */
-	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
-	sq->nr_queued[rw]--;
+	bio = throtl_pop_queued(&sq->queued[type], &tg_to_put);
+	sq->nr_queued[type]--;
 
 	throtl_charge_bio(tg, bio);
 
@@ -1155,13 +1189,13 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 * responsible for issuing these bios.
 	 */
 	if (parent_tg) {
-		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+		throtl_add_bio_tg(bio, &tg->qnode_on_parent[type], parent_tg);
 		start_parent_slice_with_credit(tg, parent_tg, rw);
 	} else {
-		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
-				     &parent_sq->queued[rw]);
-		BUG_ON(tg->td->nr_queued[rw] <= 0);
-		tg->td->nr_queued[rw]--;
+		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[type],
+				     &parent_sq->queued[type]);
+		BUG_ON(tg->td->nr_queued[type] <= 0);
+		tg->td->nr_queued[type]--;
 	}
 
 	throtl_trim_slice(tg, rw);
@@ -1173,34 +1207,45 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
-	unsigned int nr_reads = 0, nr_writes = 0;
+	unsigned int nr_reads = 0, nr_async_writes = 0, nr_sync_writes = 0;
 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	unsigned int max_sync_writes = (throtl_grp_quantum - max_nr_reads)*7/8;
+	unsigned int max_async_writes = throtl_grp_quantum - max_nr_reads
+					- max_sync_writes;
 	struct bio *bio;
 
 	/* Try to dispatch 75% READS and 25% WRITES */
-
-	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio));
+		tg_dispatch_one_bio(tg, READ_WORKLOAD);
 		nr_reads++;
 
 		if (nr_reads >= max_nr_reads)
 			break;
 	}
 
-	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
-		tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		nr_writes++;
+		tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+		nr_sync_writes++;
 
-		if (nr_writes >= max_nr_writes)
+		if (nr_sync_writes >= max_sync_writes)
 			break;
 	}
 
-	return nr_reads + nr_writes;
+	while ((bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])) &&
+	       tg_may_dispatch(tg, bio, NULL)) {
+
+		tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
+		nr_async_writes++;
+
+		if (nr_async_writes >= max_async_writes)
+			break;
+	}
+
+	return nr_reads + nr_sync_writes + nr_async_writes;
 }
 
 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
@@ -1221,7 +1266,9 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 
 		nr_disp += throtl_dispatch_tg(tg);
 
-		if (sq->nr_queued[0] || sq->nr_queued[1])
+		if (sq->nr_queued[READ_WORKLOAD] ||
+		    sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+		    sq->nr_queued[ASYNC_WRITE_WORKLOAD])
 			tg_update_disptime(tg);
 
 		if (nr_disp >= throtl_quantum)
@@ -1267,9 +1314,14 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	dispatched = false;
 
 	while (true) {
-		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
-			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
-			   sq->nr_queued[READ], sq->nr_queued[WRITE]);
+		throtl_log(sq,
+			  "dispatch nr_queued=%u read=%u sync_write=%u async_write=%u",
+			   sq->nr_queued[READ_WORKLOAD] +
+			   sq->nr_queued[SYNC_WRITE_WORKLOAD] +
+			   sq->nr_queued[ASYNC_WRITE_WORKLOAD],
+			   sq->nr_queued[READ_WORKLOAD],
+			   sq->nr_queued[SYNC_WRITE_WORKLOAD],
+			   sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
 
 		ret = throtl_select_dispatch(sq);
 		if (ret) {
@@ -1325,13 +1377,13 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
-	int rw;
+	enum wl_type type;
 
 	bio_list_init(&bio_list_on_stack);
 
 	spin_lock_irq(q->queue_lock);
-	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+	for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+		while ((bio = throtl_pop_queued(&td_sq->queued[type], NULL)))
 			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
@@ -1820,11 +1872,13 @@ static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
 	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
 	if (!read_limit && !write_limit)
 		return true;
-	if (read_limit && sq->nr_queued[READ] &&
-	    (!write_limit || sq->nr_queued[WRITE]))
+	if (read_limit && sq->nr_queued[READ_WORKLOAD] &&
+	    (!write_limit || sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+			     sq->nr_queued[ASYNC_WRITE_WORKLOAD]))
 		return true;
-	if (write_limit && sq->nr_queued[WRITE] &&
-	    (!read_limit || sq->nr_queued[READ]))
+	if (write_limit && (sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+			    sq->nr_queued[ASYNC_WRITE_WORKLOAD]) &&
+	    (!read_limit || sq->nr_queued[READ_WORKLOAD]))
 		return true;
 
 	if (time_after_eq(jiffies,
@@ -2129,6 +2183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
+	enum wl_type type = bio_workload_type(bio);
 	bool throttled = false;
 	struct throtl_data *td = tg->td;
 
@@ -2157,7 +2212,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		throtl_downgrade_check(tg);
 		throtl_upgrade_check(tg);
 		/* throtl is FIFO - if bios are already queued, should queue */
-		if (sq->nr_queued[rw])
+		if (sq->nr_queued[type])
 			break;
 
 		/* if above limits, break to queue */
@@ -2191,7 +2246,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		 * Climb up the ladder.  If we''re already at the top, it
 		 * can be executed directly.
 		 */
-		qn = &tg->qnode_on_parent[rw];
+		qn = &tg->qnode_on_parent[type];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
@@ -2199,16 +2254,19 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	}
 
 	/* out-of-limit, queue to @tg */
-	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u "
+		   "iops=%u queued=%d/%d/%d",
 		   rw == READ ? 'R' : 'W',
 		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
 		   tg_bps_limit(tg, rw),
 		   tg->io_disp[rw], tg_iops_limit(tg, rw),
-		   sq->nr_queued[READ], sq->nr_queued[WRITE]);
+		   sq->nr_queued[READ_WORKLOAD],
+		   sq->nr_queued[SYNC_WRITE_WORKLOAD],
+		   sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
 
 	tg->last_low_overflow_time[rw] = jiffies;
 
-	td->nr_queued[rw]++;
+	td->nr_queued[type]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
@@ -2334,10 +2392,15 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
 
 		throtl_dequeue_tg(tg);
 
-		while ((bio = throtl_peek_queued(&sq->queued[READ])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
+		while ((bio =
+		      throtl_peek_queued(&sq->queued[READ_WORKLOAD])))
+			tg_dispatch_one_bio(tg, READ_WORKLOAD);
+		while ((bio =
+		      throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])))
+			tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+		while ((bio =
+		      throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])))
+			tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
 	}
 }
 
@@ -2354,7 +2417,7 @@ void blk_throtl_drain(struct request_queue *q)
 	struct blkcg_gq *blkg;
 	struct cgroup_subsys_state *pos_css;
 	struct bio *bio;
-	int rw;
+	enum wl_type type;
 
 	queue_lockdep_assert_held(q);
 	rcu_read_lock();
@@ -2375,8 +2438,8 @@ void blk_throtl_drain(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	/* all bios now should be in td->service_queue, issue them */
-	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+	for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+		while ((bio = throtl_pop_queued(&td->service_queue.queued[type],
 						NULL)))
 			generic_make_request(bio);
 
-- 
1.8.3.1
