From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>,
linux-block@vger.kernel.org,
"Martin K . Petersen" <martin.petersen@oracle.com>,
linux-scsi@vger.kernel.org
Cc: Ming Lei <ming.lei@redhat.com>, Omar Sandoval <osandov@fb.com>,
Kashyap Desai <kashyap.desai@broadcom.com>,
Sumanesh Samanta <sumanesh.samanta@broadcom.com>,
"Ewan D . Milne" <emilne@redhat.com>,
Hannes Reinecke <hare@suse.de>
Subject: [PATCH V6 08/13] blk-mq: return budget token from .get_budget callback
Date: Mon, 18 Jan 2021 08:49:16 +0800 [thread overview]
Message-ID: <20210118004921.202545-9-ming.lei@redhat.com> (raw)
In-Reply-To: <20210118004921.202545-1-ming.lei@redhat.com>
SCSI uses one global atomic variable to track queue depth for each
LUN/request queue.
This approach doesn't scale well when there are lots of CPU cores and the
disk is very fast. It has been observed that IOPS is affected a lot
by tracking queue depth via sdev->device_busy in the IO path.
Return a budget token from the .get_budget callback so that the budget
token can be passed to the driver. This lets us replace the atomic variable
with sbitmap_queue, which fixes the scalability issue.
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 17 +++++++++++++----
block/blk-mq.c | 36 +++++++++++++++++++++++++-----------
block/blk-mq.h | 25 +++++++++++++++++++++----
drivers/scsi/scsi_lib.c | 16 +++++++++++-----
include/linux/blk-mq.h | 4 ++--
5 files changed, 72 insertions(+), 26 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index deff4e826e23..962b801b8fb2 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -131,6 +131,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
do {
struct request *rq;
+ int budget_token;
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
@@ -140,12 +141,13 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
break;
}
- if (!blk_mq_get_dispatch_budget(q))
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
- blk_mq_put_dispatch_budget(q);
+ blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -157,6 +159,8 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
break;
}
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
@@ -230,6 +234,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
struct request *rq;
do {
+ int budget_token;
+
if (!list_empty_careful(&hctx->dispatch)) {
ret = -EAGAIN;
break;
@@ -238,12 +244,13 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
- if (!blk_mq_get_dispatch_budget(q))
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
break;
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
- blk_mq_put_dispatch_budget(q);
+ blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -255,6 +262,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
break;
}
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef1a9f2003a0..4747066c54de 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1304,10 +1304,15 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
bool need_budget)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ int budget_token = -1;
- if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
- blk_mq_put_driver_tag(rq);
- return PREP_DISPATCH_NO_BUDGET;
+ if (need_budget) {
+ budget_token = blk_mq_get_dispatch_budget(rq->q);
+ if (budget_token < 0) {
+ blk_mq_put_driver_tag(rq);
+ return PREP_DISPATCH_NO_BUDGET;
+ }
+ blk_mq_set_rq_budget_token(rq, budget_token);
}
if (!blk_mq_get_driver_tag(rq)) {
@@ -1324,7 +1329,7 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
* together during handling partial dispatch
*/
if (need_budget)
- blk_mq_put_dispatch_budget(rq->q);
+ blk_mq_put_dispatch_budget(rq->q, budget_token);
return PREP_DISPATCH_NO_TAG;
}
}
@@ -1334,12 +1339,16 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
- unsigned int nr_budgets)
+ struct list_head *list)
{
- int i;
+ struct request *rq;
- for (i = 0; i < nr_budgets; i++)
- blk_mq_put_dispatch_budget(q);
+ list_for_each_entry(rq, list, queuelist) {
+ int budget_token = blk_mq_get_rq_budget_token(rq);
+
+ if (budget_token >= 0)
+ blk_mq_put_dispatch_budget(q, budget_token);
+ }
}
/*
@@ -1437,7 +1446,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
- blk_mq_release_budgets(q, nr_budgets);
+ if (nr_budgets)
+ blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
@@ -1982,6 +1992,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = rq->q;
bool run_queue = true;
+ int budget_token;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
@@ -1999,11 +2010,14 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
- if (!blk_mq_get_dispatch_budget(q))
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
goto insert;
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(q);
+ blk_mq_put_dispatch_budget(q, budget_token);
goto insert;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c1458d9502f1..61a5e884a6f5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,17 +187,34 @@ unsigned int blk_mq_in_flight(struct request_queue *q,
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2]);
-static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
+static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
+ int budget_token)
{
if (q->mq_ops->put_budget)
- q->mq_ops->put_budget(q);
+ q->mq_ops->put_budget(q, budget_token);
}
-static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
+static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
{
if (q->mq_ops->get_budget)
return q->mq_ops->get_budget(q);
- return true;
+ return 0;
+}
+
+static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
+{
+ if (token < 0)
+ return;
+
+ if (rq->q->mq_ops->set_rq_budget_token)
+ rq->q->mq_ops->set_rq_budget_token(rq, token);
+}
+
+static inline int blk_mq_get_rq_budget_token(struct request *rq)
+{
+ if (rq->q->mq_ops->get_rq_budget_token)
+ return rq->q->mq_ops->get_rq_budget_token(rq);
+ return -1;
}
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eeaae47fee8a..08a9534a9bb4 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -329,6 +329,7 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
atomic_dec(&starget->target_busy);
atomic_dec(&sdev->device_busy);
+ cmd->budget_token = -1;
}
static void scsi_kick_queue(struct request_queue *q)
@@ -1142,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
unsigned long jiffies_at_alloc;
int retries, to_clear;
bool in_flight;
+ int budget_token = cmd->budget_token;
if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) {
flags |= SCMD_INITIALIZED;
@@ -1170,6 +1172,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->retries = retries;
if (in_flight)
__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
+ cmd->budget_token = budget_token;
}
@@ -1604,19 +1607,19 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request);
}
-static void scsi_mq_put_budget(struct request_queue *q)
+static void scsi_mq_put_budget(struct request_queue *q, int budget_token)
{
struct scsi_device *sdev = q->queuedata;
atomic_dec(&sdev->device_busy);
}
-static bool scsi_mq_get_budget(struct request_queue *q)
+static int scsi_mq_get_budget(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
if (scsi_dev_queue_ready(q, sdev))
- return true;
+ return 0;
atomic_inc(&sdev->restarts);
@@ -1638,7 +1641,7 @@ static bool scsi_mq_get_budget(struct request_queue *q)
if (unlikely(atomic_read(&sdev->device_busy) == 0 &&
!scsi_device_blocked(sdev)))
blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
- return false;
+ return -1;
}
static void scsi_mq_set_rq_budget_token(struct request *req, int token)
@@ -1666,6 +1669,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_status_t ret;
int reason;
+ WARN_ON_ONCE(cmd->budget_token < 0);
+
/*
* If the device is not in running state we will reject some or all
* commands.
@@ -1717,7 +1722,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (scsi_target(sdev)->can_queue > 0)
atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
- scsi_mq_put_budget(q);
+ scsi_mq_put_budget(q, cmd->budget_token);
+ cmd->budget_token = -1;
switch (ret) {
case BLK_STS_OK:
break;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1f84d47b72f6..3646f3b8d9df 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -310,12 +310,12 @@ struct blk_mq_ops {
* reserved budget. Also we have to handle failure case
* of .get_budget for avoiding I/O deadlock.
*/
- bool (*get_budget)(struct request_queue *);
+ int (*get_budget)(struct request_queue *);
/**
* @put_budget: Release the reserved budget.
*/
- void (*put_budget)(struct request_queue *);
+ void (*put_budget)(struct request_queue *, int);
/*
* @set_rq_budget_toekn: store rq's budget token
--
2.28.0
next prev parent reply other threads:[~2021-01-18 0:52 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-01-18 0:49 [PATCH V6 00/13] blk-mq/scsi: tracking device queue depth via sbitmap Ming Lei
2021-01-18 0:49 ` [PATCH V6 01/13] sbitmap: remove sbitmap_clear_bit_unlock Ming Lei
2021-01-18 0:49 ` [PATCH V6 02/13] sbitmap: maintain allocation round_robin in sbitmap Ming Lei
2021-01-18 4:28 ` kernel test robot
2021-01-18 4:42 ` kernel test robot
2021-01-22 2:38 ` Ming Lei
2021-01-18 0:49 ` [PATCH V6 03/13] sbitmap: add helpers for updating allocation hint Ming Lei
2021-01-18 0:49 ` [PATCH V6 04/13] sbitmap: move allocation hint into sbitmap Ming Lei
2021-01-18 0:49 ` [PATCH V6 05/13] sbitmap: export sbitmap_weight Ming Lei
2021-01-18 0:49 ` [PATCH V6 06/13] sbitmap: add helper of sbitmap_calculate_shift Ming Lei
2021-01-18 0:49 ` [PATCH V6 07/13] blk-mq: add callbacks for storing & retrieving budget token Ming Lei
2021-01-18 0:49 ` Ming Lei [this message]
2021-01-18 0:49 ` [PATCH V6 09/13] scsi: put hot fields of scsi_host_template into one cacheline Ming Lei
2021-01-18 0:49 ` [PATCH V6 10/13] megaraid_sas: v2 replace sdev_busy with local counter Ming Lei
2021-01-18 0:49 ` [PATCH V6 11/13] scsi: add scsi_device_busy() to read sdev->device_busy Ming Lei
2021-01-18 0:49 ` [PATCH V6 12/13] scsi: make sure sdev->queue_depth is <= max(shost->can_queue, 1024) Ming Lei
2021-01-18 0:49 ` [PATCH V6 13/13] scsi: replace sdev->device_busy with sbitmap Ming Lei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210118004921.202545-9-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=emilne@redhat.com \
--cc=hare@suse.de \
--cc=kashyap.desai@broadcom.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=osandov@fb.com \
--cc=sumanesh.samanta@broadcom.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).