All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yu Kuai <yukuai3@huawei.com>
To: <axboe@kernel.dk>, <bvanassche@acm.org>,
	<andriy.shevchenko@linux.intel.com>, <john.garry@huawei.com>,
	<ming.lei@redhat.com>, <qiulaibin@huawei.com>
Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<yukuai3@huawei.com>, <yi.zhang@huawei.com>
Subject: [PATCH -next RFC v3 4/8] blk-mq: don't preempt tag under heavy load
Date: Fri, 15 Apr 2022 18:10:49 +0800	[thread overview]
Message-ID: <20220415101053.554495-5-yukuai3@huawei.com> (raw)
In-Reply-To: <20220415101053.554495-1-yukuai3@huawei.com>

Tag preemption is the default behaviour, specifically blk_mq_get_tag()
will try to get tag unconditionally, which means a new io can preempt
tag even if there are lots of ios that are waiting for tags.

Such behaviour doesn't make sense when the disk is under heavy load,
because it will intensify competition without improving performance,
especially for huge io as split ios are unlikely to be issued
continuously.

The ideal way to disable tag preemption is to track how many tags are
available, and wait directly in blk_mq_get_tag() if free tags are
very little. However, this is out of reality because fast path is
affected.

As 'ws_active' is only updated in slow path, this patch disable tag
preemption if 'ws_active' is greater than 8, which means there are many
threads waiting for tags already.

Once tag preemption is disabled, there is a situation that can cause
performance degradation(or io hung in extreme scenarios): the waitqueue
doesn't have 'wake_batch' threads, thus wake up on this waitqueue might
cause the concurrency of ios to be decreased. The next patch will fix this
problem.

This patch also add a detection in blk_mq_timeout_work(), just in case
io hung is triggered due to waiters can't awakened in some corner cases.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-tag.c | 36 +++++++++++++++++++++++++-----------
 block/blk-mq.c     | 29 +++++++++++++++++++++++++++++
 block/blk-mq.h     |  2 ++
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5ad85063e91e..a6c5ec846a5e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -127,6 +127,13 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
 	return ret;
 }
 
+static inline bool preempt_tag(struct blk_mq_alloc_data *data,
+			       struct sbitmap_queue *bt)
+{
+	return data->preempt ||
+	       atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -148,12 +155,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		tag_offset = tags->nr_reserved_tags;
 	}
 
-	tag = __blk_mq_get_tag(data, bt);
-	if (tag != BLK_MQ_NO_TAG)
-		goto found_tag;
+	if (data->flags & BLK_MQ_REQ_NOWAIT || preempt_tag(data, bt)) {
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != BLK_MQ_NO_TAG)
+			goto found_tag;
 
-	if (data->flags & BLK_MQ_REQ_NOWAIT)
-		return BLK_MQ_NO_TAG;
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return BLK_MQ_NO_TAG;
+	}
 
 	do {
 		struct sbitmap_queue *bt_prev;
@@ -169,21 +178,26 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		if (!ws)
 			ws = bt_wait_ptr(bt, data->hctx);
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		bt_prev = bt;
 		io_schedule();
 
+		data->preempt = true;
 		sbitmap_finish_wait(bt, ws, &wait);
 
 		data->ctx = blk_mq_get_ctx(data->q);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ed3ed86f7dd2..32beacbad5e2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1446,6 +1446,34 @@ static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
 	return true;
 }
 
+static void blk_mq_check_tag_waiters(struct blk_mq_hw_ctx *hctx)
+{
+	bool warn = false;
+	struct blk_mq_tags *tags = hctx->tags;
+
+again:
+	if (atomic_read(&tags->bitmap_tags.ws_active)) {
+		warn = true;
+		sbitmap_queue_wake_all(&tags->bitmap_tags);
+	}
+
+	if (atomic_read(&tags->breserved_tags.ws_active)) {
+		warn = true;
+		sbitmap_queue_wake_all(&tags->breserved_tags);
+	}
+
+	if (hctx->sched_tags && tags != hctx->sched_tags) {
+		tags = hctx->sched_tags;
+		goto again;
+	}
+
+	/*
+	 * This is problematic because someone is still waiting for tag while
+	 * no tag is used.
+	 */
+	WARN_ON_ONCE(warn);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -1482,6 +1510,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 		 * each hctx as idle.
 		 */
 		queue_for_each_hw_ctx(q, hctx, i) {
+			blk_mq_check_tag_waiters(hctx);
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx))
 				blk_mq_tag_idle(hctx);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2615bd58bad3..1a85bd1045d8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -156,6 +156,8 @@ struct blk_mq_alloc_data {
 
 	/* allocate multiple requests/tags in one go */
 	unsigned int nr_tags;
+	/* true if blk_mq_get_tag() will try to preempt tag */
+	bool preempt;
 	struct request **cached_rq;
 
 	/* input & output parameter */
-- 
2.31.1


  parent reply	other threads:[~2022-04-15  9:56 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-15 10:10 [PATCH -next RFC v3 0/8] improve tag allocation under heavy load Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 1/8] sbitmap: record the number of waiters for each waitqueue Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 2/8] blk-mq: call 'bt_wait_ptr()' later in blk_mq_get_tag() Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 3/8] sbitmap: make sure waitqueues are balanced Yu Kuai
2022-04-15 10:10 ` Yu Kuai [this message]
2022-04-15 10:10 ` [PATCH -next RFC v3 5/8] sbitmap: force tag preemption if free tags are sufficient Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 6/8] blk-mq: force tag preemption for split bios Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 7/8] blk-mq: record how many tags are needed for splited bio Yu Kuai
2022-04-15 10:10 ` [PATCH -next RFC v3 8/8] sbitmap: wake up the number of threads based on required tags Yu Kuai
2022-04-24  2:43 ` [PATCH -next RFC v3 0/8] improve tag allocation under heavy load yukuai (C)
2022-04-25  3:24   ` Damien Le Moal
2022-04-25  6:14     ` yukuai (C)
2022-04-25  6:23       ` Damien Le Moal
2022-04-25  6:47         ` yukuai (C)
2022-04-25  6:50           ` Damien Le Moal
2022-04-25  7:05             ` yukuai (C)
2022-04-25  7:06               ` Damien Le Moal
2022-04-25  7:28                 ` yukuai (C)
2022-04-25 11:20                   ` Damien Le Moal
2022-04-25 13:42                     ` yukuai (C)
2022-04-25  3:09 ` Bart Van Assche
2022-04-25  3:27   ` yukuai (C)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220415101053.554495-5-yukuai3@huawei.com \
    --to=yukuai3@huawei.com \
    --cc=andriy.shevchenko@linux.intel.com \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=john.garry@huawei.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    --cc=qiulaibin@huawei.com \
    --cc=yi.zhang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.