From: Ming Lei <ming.lei@redhat.com>
To: James Bottomley <James.Bottomley@HansenPartnership.com>,
linux-scsi@vger.kernel.org,
"Martin K . Petersen" <martin.petersen@oracle.com>,
linux-block@vger.kernel.org, Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <ming.lei@redhat.com>, Omar Sandoval <osandov@fb.com>,
Sathya Prakash <sathya.prakash@broadcom.com>,
Chaitra P B <chaitra.basappa@broadcom.com>,
Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>,
Kashyap Desai <kashyap.desai@broadcom.com>,
Sumit Saxena <sumit.saxena@broadcom.com>,
Shivasharan S <shivasharan.srikanteshwara@broadcom.com>,
"Ewan D . Milne" <emilne@redhat.com>,
Hannes Reinecke <hare@suse.de>,
Bart Van Assche <bart.vanassche@wdc.com>
Subject: [PATCH 07/10] blk-mq: return budget token from .get_budget callback
Date: Tue, 11 Feb 2020 20:11:32 +0800 [thread overview]
Message-ID: <20200211121135.30064-8-ming.lei@redhat.com> (raw)
In-Reply-To: <20200211121135.30064-1-ming.lei@redhat.com>
SCSI uses one global atomic variable to track queue depth for each
LUN/request queue.
This way doesn't scale well when there is lots of CPU cores and the
disk is very fast. It has been observed that IOPS is affected a lot
by tracking queue depth via sdev->device_busy in IO path.
Return budget token from .get_budget callback, and the budget token
can be passed to driver, so that we can replace the atomic variable
with sbitmap_queue, then the scale issue can be fixed.
Cc: Omar Sandoval <osandov@fb.com>
Cc: Sathya Prakash <sathya.prakash@broadcom.com>
Cc: Chaitra P B <chaitra.basappa@broadcom.com>
Cc: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Shivasharan S <shivasharan.srikanteshwara@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 20 ++++++++++++--------
block/blk-mq.c | 18 ++++++++++++------
block/blk-mq.h | 11 ++++++-----
drivers/scsi/scsi_lib.c | 10 +++++-----
include/linux/blk-mq.h | 4 ++--
5 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..38dee68ba04a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ int budget_token;
do {
struct request *rq;
@@ -97,12 +98,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
- if (!blk_mq_get_dispatch_budget(hctx))
+ budget_token = blk_mq_get_dispatch_budget(hctx);
+ if (budget_token < 0)
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(hctx, budget_token);
break;
}
@@ -112,7 +114,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* in blk_mq_dispatch_rq_list().
*/
list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token));
}
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -136,6 +138,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ int budget_token;
do {
struct request *rq;
@@ -143,12 +146,13 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
- if (!blk_mq_get_dispatch_budget(hctx))
+ budget_token = blk_mq_get_dispatch_budget(hctx);
+ if (budget_token < 0)
break;
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(hctx, budget_token);
break;
}
@@ -162,7 +166,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ } while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token));
WRITE_ONCE(hctx->dispatch_from, ctx);
}
@@ -206,7 +210,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+ if (blk_mq_dispatch_rq_list(q, &rq_list, -1)) {
if (has_sched_dispatch)
blk_mq_do_dispatch_sched(hctx);
else
@@ -219,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list, false);
+ blk_mq_dispatch_rq_list(q, &rq_list, -1);
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 76a5e919c336..43ae2b973d99 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1182,13 +1182,14 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- bool got_budget)
+ int budget_token)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq, *nxt;
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
+ bool got_budget = budget_token >= 0;
if (list_empty(list))
return false;
@@ -1205,8 +1206,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
- break;
+ if (!got_budget) {
+ budget_token = blk_mq_get_dispatch_budget(hctx);
+ if (budget_token < 0)
+ break;
+ }
if (!blk_mq_get_driver_tag(rq)) {
/*
@@ -1217,7 +1221,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* we'll re-run it below.
*/
if (!blk_mq_mark_tag_wait(hctx, rq)) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(hctx, budget_token);
/*
* For non-shared tags, the RESTART check
* will suffice.
@@ -1819,6 +1823,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = rq->q;
bool run_queue = true;
+ int budget_token;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
@@ -1836,11 +1841,12 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
- if (!blk_mq_get_dispatch_budget(hctx))
+ budget_token = blk_mq_get_dispatch_budget(hctx);
+ if (budget_token < 0)
goto insert;
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(hctx, budget_token);
goto insert;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eaaca8fc1c28..1c82d89791c0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -40,7 +40,7 @@ struct blk_mq_ctx {
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, int);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
@@ -179,21 +179,22 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
-static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,
+ int budget_token)
{
struct request_queue *q = hctx->queue;
if (q->mq_ops->put_budget)
- q->mq_ops->put_budget(hctx);
+ q->mq_ops->put_budget(hctx, budget_token);
}
-static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline int blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
if (q->mq_ops->get_budget)
return q->mq_ops->get_budget(hctx);
- return true;
+ return 0;
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 610ee41fa54c..c8eb555b3ce8 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1619,7 +1619,7 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
}
-static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx, int budget_token)
{
struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
@@ -1627,17 +1627,17 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
atomic_dec(&sdev->device_busy);
}
-static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
if (scsi_dev_queue_ready(q, sdev))
- return true;
+ return 0;
if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
- return false;
+ return -1;
}
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1701,7 +1701,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (scsi_target(sdev)->can_queue > 0)
atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
- scsi_mq_put_budget(hctx);
+ scsi_mq_put_budget(hctx, 0);
switch (ret) {
case BLK_STS_OK:
break;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a..6fd8d7cfe158 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -265,8 +265,8 @@ struct blk_mq_queue_data {
typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
const struct blk_mq_queue_data *);
typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
-typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
-typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
+typedef int (get_budget_fn)(struct blk_mq_hw_ctx *);
+typedef void (put_budget_fn)(struct blk_mq_hw_ctx *, int);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
--
2.20.1
next prev parent reply other threads:[~2020-02-11 12:12 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-02-11 12:11 [PATCH 00/10] scsi: tracking device queue depth via sbitmap Ming Lei
2020-02-11 12:11 ` [PATCH 01/10] sbitmap: maintain allocation round_robin in sbitmap Ming Lei
2020-04-14 6:04 ` Hannes Reinecke
2020-02-11 12:11 ` [PATCH 02/10] sbitmap: add helpers for updating allocation hint Ming Lei
2020-04-14 6:05 ` Hannes Reinecke
2020-02-11 12:11 ` [PATCH 03/10] sbitmap: remove sbitmap_clear_bit_unlock Ming Lei
2020-04-14 6:06 ` Hannes Reinecke
2020-02-11 12:11 ` [PATCH 04/10] sbitmap: move allocation hint into sbitmap Ming Lei
2020-02-11 12:11 ` [PATCH 05/10] sbitmap: export sbitmap_weight Ming Lei
2020-02-11 12:11 ` [PATCH 06/10] sbitmap: add helper of sbitmap_calculate_shift Ming Lei
2020-02-11 12:11 ` Ming Lei [this message]
2020-02-11 12:11 ` [PATCH 08/10] blk-mq: pass budget token to dirver via blk_mq_queue_data Ming Lei
2020-02-11 12:11 ` [PATCH 09/10] scsi: add scsi_device_busy() to read sdev->device_busy Ming Lei
2020-02-11 12:11 ` [PATCH 10/10] scsi: replace sdev->device_busy with sbitmap Ming Lei
[not found] ` <202002140428.063yIjwM%lkp@intel.com>
2020-02-14 9:16 ` Ming Lei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200211121135.30064-8-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=James.Bottomley@HansenPartnership.com \
--cc=axboe@kernel.dk \
--cc=bart.vanassche@wdc.com \
--cc=chaitra.basappa@broadcom.com \
--cc=emilne@redhat.com \
--cc=hare@suse.de \
--cc=kashyap.desai@broadcom.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=osandov@fb.com \
--cc=sathya.prakash@broadcom.com \
--cc=shivasharan.srikanteshwara@broadcom.com \
--cc=suganath-prabu.subramani@broadcom.com \
--cc=sumit.saxena@broadcom.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).