All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yu Kuai <yukuai3@huawei.com>
To: <axboe@kernel.dk>
Cc: <linux-block@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<yukuai3@huawei.com>, <yi.zhang@huawei.com>,
	<zhangxiaoxu5@huawei.com>
Subject: [PATCH 3/3] blk-mq: decrease pending_queues when it expires
Date: Sat, 26 Dec 2020 18:28:08 +0800	[thread overview]
Message-ID: <20201226102808.2534966-4-yukuai3@huawei.com> (raw)
In-Reply-To: <20201226102808.2534966-1-yukuai3@huawei.com>

If pending_queues is increased once, it will only be decreased when
nr_active is zero, and that will lead to the under-utilization of
host tags because pending_queues is non-zero and the available
tags for the queue will be max(host tags / active_queues, 4)
instead of the needed tags of the queue.

Fix it by adding an expiration time for the increasement of pending_queues,
and decrease it when it expires, so pending_queues will be decreased
to zero if there is no tag allocation failure, and the available tags
for the queue will be the whole host tags.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-tag.c     | 29 ++++++++++++++++++++++++++---
 block/blk-mq-tag.h     |  6 +++---
 block/blk-mq.c         |  5 +++--
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  6 +++++-
 5 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6dcd6dd9123a..e3ffe0fde052 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -46,12 +46,16 @@ void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
 		struct blk_mq_tag_set *set = q->tag_set;
 
 		if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
-		    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
+		    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) {
+			hctx->dtag_wait_time = jiffies;
 			atomic_inc(&set->pending_queues_shared_sbitmap);
+		}
 	} else {
 		if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
-		    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+		    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) {
+			hctx->dtag_wait_time = jiffies;
 			atomic_inc(&hctx->tags->pending_queues);
+		}
 	}
 }
 
@@ -89,12 +93,28 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	blk_mq_tag_wakeup_all(tags, false);
 }
 
-void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ)
+
+void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
 	struct blk_mq_tags *tags = hctx->tags;
 	struct request_queue *q = hctx->queue;
 	struct blk_mq_tag_set *set = q->tag_set;
 
+	if (!force) {
+		if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+			if (!(test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
+			      time_after(jiffies, hctx->dtag_wait_time +
+					BLK_MQ_DTAG_WAIT_EXPIRE)))
+				return;
+		} else {
+			if (!(test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
+			      time_after(jiffies, hctx->dtag_wait_time +
+					BLK_MQ_DTAG_WAIT_EXPIRE)))
+				return;
+		}
+	}
+
 	if (blk_mq_is_sbitmap_shared(hctx->flags) &&
 	    test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
 		atomic_dec(&set->pending_queues_shared_sbitmap);
@@ -202,6 +222,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	if (!data->q->elevator)
+		blk_mq_dtag_idle(data->hctx, false);
+
 	/*
 	 * Give up this allocation if the hctx is inactive.  The caller will
 	 * retry on an active hctx.
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 52d08a92f683..888692498ef2 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -68,7 +68,7 @@ enum {
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *);
+extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *, bool);
 
 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -94,12 +94,12 @@ static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
 	__blk_mq_dtag_busy(hctx);
 }
 
-static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
 	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
 		return;
 
-	__blk_mq_dtag_idle(hctx);
+	__blk_mq_dtag_idle(hctx, force);
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2b8fa49bccb4..9ac976107154 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1015,7 +1015,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx)) {
 				blk_mq_tag_idle(hctx);
-				blk_mq_dtag_idle(hctx);
+				blk_mq_dtag_idle(hctx, true);
 			}
 		}
 	}
@@ -2568,7 +2568,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
 	if (blk_mq_hw_queue_mapped(hctx)) {
 		blk_mq_tag_idle(hctx);
-		blk_mq_dtag_idle(hctx);
+		blk_mq_dtag_idle(hctx, true);
 	}
 
 	if (set->ops->exit_request)
@@ -2667,6 +2667,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
+	hctx->dtag_wait_time = jiffies;
 
 	INIT_LIST_HEAD(&hctx->hctx_list);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 228c5c442be4..93ede498f5e6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -214,7 +214,7 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
 		atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
 	else if (!atomic_dec_return(&hctx->nr_active)) {
 		blk_mq_tag_idle(hctx);
-		blk_mq_dtag_idle(hctx);
+		blk_mq_dtag_idle(hctx, true);
 	}
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a473865ee7f..82591c2f76cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -158,7 +158,11 @@ struct blk_mq_hw_ctx {
 	unsigned long		poll_invoked;
 	/** @poll_success: Count how many polled requests were completed. */
 	unsigned long		poll_success;
-
+	/**
+	 * record when hardware queue is pending, specifically when
+	 * BLK_MQ_S_DTAG_WAIT is set in state.
+	 */
+	unsigned long dtag_wait_time;
 #ifdef CONFIG_BLK_DEBUG_FS
 	/**
 	 * @debugfs_dir: debugfs directory for this hardware queue. Named
-- 
2.25.4


      parent reply	other threads:[~2020-12-26 10:26 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-26 10:28 [PATCH 0/3] fix the performance fluctuation due to shared tagset Yu Kuai
2020-12-26 10:28 ` [PATCH 1/3] blk-mq: allow hardware queue to get more tag while sharing a tag set Yu Kuai
2020-12-27 11:58   ` Ming Lei
2020-12-28  1:56     ` yukuai (C)
2020-12-28  8:28       ` Ming Lei
2020-12-28  9:02         ` yukuai (C)
2020-12-29  1:15           ` Ming Lei
2020-12-29  2:37             ` yukuai (C)
2020-12-26 10:28 ` [PATCH 2/3] blk-mq: clear 'active_queues' immediately when 'nr_active' is decreased to 0 Yu Kuai
2020-12-26 10:28 ` Yu Kuai [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201226102808.2534966-4-yukuai3@huawei.com \
    --to=yukuai3@huawei.com \
    --cc=axboe@kernel.dk \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=yi.zhang@huawei.com \
    --cc=zhangxiaoxu5@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.