From: Yu Kuai <yukuai3@huawei.com>
To: <axboe@kernel.dk>, <andriy.shevchenko@linux.intel.com>,
	<john.garry@huawei.com>, <ming.lei@redhat.com>
Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<yukuai3@huawei.com>, <yi.zhang@huawei.com>
Subject: [PATCH -next RFC 5/6] blk-mq: don't preempt tag except for split bios
Date: Tue, 29 Mar 2022 17:40:47 +0800
Message-ID: <20220329094048.2107094-6-yukuai3@huawei.com>
In-Reply-To: <20220329094048.2107094-1-yukuai3@huawei.com>

In order to improve the sequentiality of split io, this patch disables
tag preemption for the first split bio and for other non-split bios if
the device is under high io pressure.

Note that this solution relies on the sbitmap waitqueues being balanced;
otherwise it can happen that 'wake_batch' tags are freed but the woken
waiters do not obtain 'wake_batch' new tags, so io concurrency drops. For
example, if a wakeup batch targets a waitqueue with fewer pending waiters
than freed tags, some freed tags stay unused while waiters on other
waitqueues keep sleeping. The next patch avoids this problem (by forcing
tag preemption when free tags are sufficient), although fixing the
unfairness of the waitqueues themselves would be better.
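
Condensed into one place, the policy the hunks below implement looks
roughly like this (a paraphrasing sketch for review, not meant to build
on its own; preempt_tag() is the helper added to blk-mq-tag.c below):

	/*
	 * blk-merge.c marks every split bio except the first one with
	 * REQ_PREEMPTIVE, so the tail of an already-started large io can
	 * keep grabbing tags directly and stay sequential.  blk-mq-tag.c
	 * then only allows a direct (preempting) allocation when that flag
	 * was set (or the queue keeps the old always-preempt behaviour),
	 * or when io pressure is low, i.e. few sbitmap waitqueues have
	 * active waiters:
	 */
	static inline bool preempt_tag(struct blk_mq_alloc_data *data,
				       struct sbitmap_queue *bt)
	{
		return data->preemption ||
		       atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
	}

Bios that fail this check do not preempt tags; they sleep on the sbitmap
waitqueue and only retry the allocation after being woken.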

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-merge.c         |  7 ++++++-
 block/blk-mq-tag.c        | 37 ++++++++++++++++++++++++++-----------
 block/blk-mq.c            |  6 ++++++
 block/blk-mq.h            |  1 +
 include/linux/blk_types.h |  2 ++
 lib/sbitmap.c             | 14 ++++++++++----
 6 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 340860746cac..fd4bbf773b45 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -357,6 +357,11 @@ static unsigned short blk_queue_split_all(struct request_queue *q,
 		if (!first)
 			first = split;
 
+		/*
+		 * Except for the first split bio, the others always preempt
+		 * tags so that the split io can stay sequential.
+		 */
+		split->bi_opf |= REQ_PREEMPTIVE;
 		nr_split++;
 		submit_bio_noacct(split);
 	}
@@ -387,7 +392,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio)
 
 	if (split) {
 		split->bi_nr_split = blk_queue_split_all(q, *bio);
-		(*bio)->bi_opf |= REQ_SPLIT;
+		(*bio)->bi_opf |= (REQ_SPLIT | REQ_PREEMPTIVE);
 		submit_bio_noacct(*bio);
 		*bio = split;
 	}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83dfbe2f1cfc..4e485bcc5820 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -127,6 +127,13 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
 	return ret;
 }
 
+static inline bool preempt_tag(struct blk_mq_alloc_data *data,
+			       struct sbitmap_queue *bt)
+{
+	return data->preemption ||
+	       atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -148,12 +155,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		tag_offset = tags->nr_reserved_tags;
 	}
 
-	tag = __blk_mq_get_tag(data, bt);
-	if (tag != BLK_MQ_NO_TAG)
-		goto found_tag;
+	if (data->flags & BLK_MQ_REQ_NOWAIT || preempt_tag(data, bt)) {
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != BLK_MQ_NO_TAG)
+			goto found_tag;
 
-	if (data->flags & BLK_MQ_REQ_NOWAIT)
-		return BLK_MQ_NO_TAG;
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return BLK_MQ_NO_TAG;
+	}
 
 	wait.nr_tags += data->nr_split;
 	ws = bt_wait_ptr(bt, data->hctx);
@@ -171,20 +180,26 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		bt_prev = bt;
 		io_schedule();
 
 		sbitmap_finish_wait(bt, ws, &wait);
+		if (!blk_mq_is_tag_preemptive(data->hctx->flags))
+			data->preemption = true;
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9bace9e2c5ca..06ba6fa9ec1a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -470,6 +470,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 retry:
 	data->ctx = blk_mq_get_ctx(q);
 	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	if (blk_mq_is_tag_preemptive(data->hctx->flags))
+		data->preemption = true;
+
 	if (!(data->rq_flags & RQF_ELV))
 		blk_mq_tag_busy(data->hctx);
 
@@ -577,6 +580,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	data.hctx = xa_load(&q->hctx_table, hctx_idx);
 	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
+	if (blk_mq_is_tag_preemptive(data.hctx->flags))
+		data.preemption = true;
 	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
 	data.ctx = __blk_mq_get_ctx(q, cpu);
 
@@ -2738,6 +2743,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
 		.nr_tags	= 1,
 		.cmd_flags	= bio->bi_opf,
 		.nr_split	= bio->bi_nr_split,
+		.preemption	= (bio->bi_opf & REQ_PREEMPTIVE),
 	};
 	struct request *rq;
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3eabe394a5a9..915bb710dd6f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -157,6 +157,7 @@ struct blk_mq_alloc_data {
 	/* allocate multiple requests/tags in one go */
 	unsigned int nr_tags;
 	unsigned int nr_split;
+	bool preemption;
 	struct request **cached_rq;
 
 	/* input & output parameter */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 702f6b83dc88..8fd9756f0a06 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -419,6 +419,7 @@ enum req_flag_bits {
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
 	__REQ_SPLIT,		/* io is splitted */
+	__REQ_PREEMPTIVE,	/* io can preempt tag */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -444,6 +445,7 @@ enum req_flag_bits {
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
 #define REQ_SPLIT		(1ULL << __REQ_SPLIT)
+#define REQ_PREEMPTIVE		(1ULL << __REQ_PREEMPTIVE)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 9d04c0ecc8f7..1655c15ee11d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -597,7 +597,8 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 	return NULL;
 }
 
-static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
+static unsigned int get_wake_nr(struct sbq_wait_state *ws,
+				unsigned int *nr_tags)
 {
 	struct sbq_wait *wait;
 	struct wait_queue_entry *entry;
@@ -606,11 +607,13 @@ static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
 	spin_lock_irq(&ws->wait.lock);
 	list_for_each_entry(entry, &ws->wait.head, entry) {
 		wait = container_of(entry, struct sbq_wait, wait);
-		if (nr_tags <= wait->nr_tags)
+		if (*nr_tags <= wait->nr_tags) {
+			*nr_tags = 0;
 			break;
+		}
 
 		nr++;
-		nr_tags -= wait->nr_tags;
+		*nr_tags -= wait->nr_tags;
 	}
 	spin_unlock_irq(&ws->wait.lock);
 
@@ -648,7 +651,10 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
 		if (ret == wait_cnt) {
 			sbq_index_atomic_inc(&sbq->wake_index);
-			wake_up_nr(&ws->wait, get_wake_nr(ws, wake_batch));
+			wake_up_nr(&ws->wait, get_wake_nr(ws, &wake_batch));
+			if (wake_batch)
+				sbitmap_queue_wake_all(sbq);
+
 			return false;
 		}
 
-- 
2.31.1
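
For reference when reading the hunks above: blk_mq_is_tag_preemptive()
is introduced by patch 1/6 of this series ("blk-mq: add a new flag
'BLK_MQ_F_NO_TAG_PREEMPTION'") and is not shown here. A plausible
definition, assuming the flag simply opts a tag set out of the old
always-preempt behaviour, would be:

	/*
	 * Hypothetical sketch based on patch 1/6 (not part of this patch):
	 * tag preemption stays the default unless the driver sets
	 * BLK_MQ_F_NO_TAG_PREEMPTION on its tag set.
	 */
	static inline bool blk_mq_is_tag_preemptive(unsigned int flags)
	{
		return !(flags & BLK_MQ_F_NO_TAG_PREEMPTION);
	}

With that reading, __blk_mq_alloc_requests() keeps the old behaviour for
queues that leave the flag clear, and blk_mq_get_tag() re-enables
preemption for a waiter on a non-preemptive queue once it has slept and
been woken.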



Thread overview: 20+ messages
2022-03-29  9:40 [PATCH -next RFC 0/6] improve large random io for HDD Yu Kuai
2022-03-29  9:40 ` [PATCH -next RFC 1/6] blk-mq: add a new flag 'BLK_MQ_F_NO_TAG_PREEMPTION' Yu Kuai
2022-03-29 12:44   ` Jens Axboe
2022-03-30  1:18     ` yukuai (C)
2022-03-30  1:20       ` Jens Axboe
2022-03-29  9:40 ` [PATCH -next RFC 2/6] block: refactor to split bio thoroughly Yu Kuai
2022-03-29 12:46   ` Jens Axboe
2022-03-30  1:35     ` yukuai (C)
2022-03-29 13:32   ` Christoph Hellwig
2022-03-29 14:35     ` Jens Axboe
2022-03-29 14:40       ` Christoph Hellwig
2022-03-29 14:41         ` Jens Axboe
2022-03-29 14:42           ` Christoph Hellwig
2022-03-30  1:54           ` yukuai (C)
2022-03-29  9:40 ` [PATCH -next RFC 3/6] blk-mq: record how many tags are needed for splited bio Yu Kuai
2022-03-29  9:40 ` [PATCH -next RFC 4/6] sbitmap: wake up the number of threads based on required tags Yu Kuai
2022-03-29  9:40 ` Yu Kuai [this message]
2022-03-29  9:40 ` [PATCH -next RFC 6/6] sbitmap: force tag preemption if free tags are sufficient Yu Kuai
2022-03-29 12:53 ` [PATCH -next RFC 0/6] improve large random io for HDD Jens Axboe
2022-03-30  2:05   ` yukuai (C)
