All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org,
	James Bottomley <James.Bottomley@HansenPartnership.com>,
	linux-scsi@vger.kernel.org,
	"Martin K . Petersen" <martin.petersen@oracle.com>
Cc: Ming Lei <ming.lei@redhat.com>, Omar Sandoval <osandov@fb.com>,
	Sathya Prakash <sathya.prakash@broadcom.com>,
	Chaitra P B <chaitra.basappa@broadcom.com>,
	Suganath Prabu Subramani  <suganath-prabu.subramani@broadcom.com>,
	Kashyap Desai <kashyap.desai@broadcom.com>,
	Sumit Saxena <sumit.saxena@broadcom.com>,
	Shivasharan S <shivasharan.srikanteshwara@broadcom.com>,
	"Ewan D . Milne" <emilne@redhat.com>,
	Hannes Reinecke <hare@suse.de>,
	Bart Van Assche <bart.vanassche@wdc.com>
Subject: [PATCH V2 07/10] blk-mq: return budget token from .get_budget callback
Date: Sat, 29 Feb 2020 07:29:17 +0800	[thread overview]
Message-ID: <20200228232920.20960-8-ming.lei@redhat.com> (raw)
In-Reply-To: <20200228232920.20960-1-ming.lei@redhat.com>

SCSI uses one global atomic variable to track queue depth for each
LUN/request queue.

This way doesn't scale well when there is lots of CPU cores and the
disk is very fast. It has been observed that IOPS is affected a lot
by tracking queue depth via sdev->device_busy in IO path.

Return budget token from .get_budget callback, and the budget token
can be passed to driver, so that we can replace the atomic variable
with sbitmap_queue, then the scale issue can be fixed.

Cc: Omar Sandoval <osandov@fb.com>
Cc: Sathya Prakash <sathya.prakash@broadcom.com>
Cc: Chaitra P B <chaitra.basappa@broadcom.com>
Cc: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Shivasharan S <shivasharan.srikanteshwara@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c    | 20 ++++++++++++--------
 block/blk-mq.c          | 18 ++++++++++++------
 block/blk-mq.h          | 11 ++++++-----
 drivers/scsi/scsi_lib.c | 10 +++++-----
 include/linux/blk-mq.h  |  4 ++--
 5 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..38dee68ba04a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int budget_token;
 
 	do {
 		struct request *rq;
@@ -97,12 +98,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
-		if (!blk_mq_get_dispatch_budget(hctx))
+		budget_token = blk_mq_get_dispatch_budget(hctx);
+		if (budget_token < 0)
 			break;
 
 		rq = e->type->ops.dispatch_request(hctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(hctx);
+			blk_mq_put_dispatch_budget(hctx, budget_token);
 			break;
 		}
 
@@ -112,7 +114,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 * in blk_mq_dispatch_rq_list().
 		 */
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token));
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -136,6 +138,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	int budget_token;
 
 	do {
 		struct request *rq;
@@ -143,12 +146,13 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
-		if (!blk_mq_get_dispatch_budget(hctx))
+		budget_token = blk_mq_get_dispatch_budget(hctx);
+		if (budget_token < 0)
 			break;
 
 		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(hctx);
+			blk_mq_put_dispatch_budget(hctx, budget_token);
 			break;
 		}
 
@@ -162,7 +166,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 		/* round robin for fair dispatch */
 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, budget_token));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
 }
@@ -206,7 +210,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+		if (blk_mq_dispatch_rq_list(q, &rq_list, -1)) {
 			if (has_sched_dispatch)
 				blk_mq_do_dispatch_sched(hctx);
 			else
@@ -219,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list, false);
+		blk_mq_dispatch_rq_list(q, &rq_list, -1);
 	}
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 76a5e919c336..43ae2b973d99 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1182,13 +1182,14 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
  * Returns true if we did some work AND can potentially do more.
  */
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
-			     bool got_budget)
+			     int budget_token)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq, *nxt;
 	bool no_tag = false;
 	int errors, queued;
 	blk_status_t ret = BLK_STS_OK;
+	bool got_budget = budget_token >= 0;
 
 	if (list_empty(list))
 		return false;
@@ -1205,8 +1206,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		rq = list_first_entry(list, struct request, queuelist);
 
 		hctx = rq->mq_hctx;
-		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
-			break;
+		if (!got_budget) {
+			budget_token = blk_mq_get_dispatch_budget(hctx);
+			if (budget_token < 0)
+				break;
+		}
 
 		if (!blk_mq_get_driver_tag(rq)) {
 			/*
@@ -1217,7 +1221,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * we'll re-run it below.
 			 */
 			if (!blk_mq_mark_tag_wait(hctx, rq)) {
-				blk_mq_put_dispatch_budget(hctx);
+				blk_mq_put_dispatch_budget(hctx, budget_token);
 				/*
 				 * For non-shared tags, the RESTART check
 				 * will suffice.
@@ -1819,6 +1823,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
+	int budget_token;
 
 	/*
 	 * RCU or SRCU read lock is needed before checking quiesced flag.
@@ -1836,11 +1841,12 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (q->elevator && !bypass_insert)
 		goto insert;
 
-	if (!blk_mq_get_dispatch_budget(hctx))
+	budget_token = blk_mq_get_dispatch_budget(hctx);
+	if (budget_token < 0)
 		goto insert;
 
 	if (!blk_mq_get_driver_tag(rq)) {
-		blk_mq_put_dispatch_budget(hctx);
+		blk_mq_put_dispatch_budget(hctx, budget_token);
 		goto insert;
 	}
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eaaca8fc1c28..1c82d89791c0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -40,7 +40,7 @@ struct blk_mq_ctx {
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, int);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 				bool kick_requeue_list);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
@@ -179,21 +179,22 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2]);
 
-static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx,
+					      int budget_token)
 {
 	struct request_queue *q = hctx->queue;
 
 	if (q->mq_ops->put_budget)
-		q->mq_ops->put_budget(hctx);
+		q->mq_ops->put_budget(hctx, budget_token);
 }
 
-static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline int blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 
 	if (q->mq_ops->get_budget)
 		return q->mq_ops->get_budget(hctx);
-	return true;
+	return 0;
 }
 
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 610ee41fa54c..c8eb555b3ce8 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1619,7 +1619,7 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
 		clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
 }
 
-static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx, int budget_token)
 {
 	struct request_queue *q = hctx->queue;
 	struct scsi_device *sdev = q->queuedata;
@@ -1627,17 +1627,17 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
 	atomic_dec(&sdev->device_busy);
 }
 
-static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct scsi_device *sdev = q->queuedata;
 
 	if (scsi_dev_queue_ready(q, sdev))
-		return true;
+		return 0;
 
 	if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
 		blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
-	return false;
+	return -1;
 }
 
 static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1701,7 +1701,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (scsi_target(sdev)->can_queue > 0)
 		atomic_dec(&scsi_target(sdev)->target_busy);
 out_put_budget:
-	scsi_mq_put_budget(hctx);
+	scsi_mq_put_budget(hctx, 0);
 	switch (ret) {
 	case BLK_STS_OK:
 		break;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a..6fd8d7cfe158 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -265,8 +265,8 @@ struct blk_mq_queue_data {
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
 typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
-typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
-typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
+typedef int (get_budget_fn)(struct blk_mq_hw_ctx *);
+typedef void (put_budget_fn)(struct blk_mq_hw_ctx *, int);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
-- 
2.20.1


  parent reply	other threads:[~2020-02-28 23:30 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-28 23:29 [PATCH V2 00/10] scsi: tracking device queue depth via sbitmap Ming Lei
2020-02-28 23:29 ` [PATCH V2 01/10] sbitmap: maintain allocation round_robin in sbitmap Ming Lei
2020-02-28 23:29 ` [PATCH V2 02/10] sbitmap: add helpers for updating allocation hint Ming Lei
2020-02-28 23:29 ` [PATCH V2 03/10] sbitmap: remove sbitmap_clear_bit_unlock Ming Lei
2020-02-28 23:29 ` [PATCH V2 04/10] sbitmap: move allocation hint into sbitmap Ming Lei
2020-02-28 23:29 ` [PATCH V2 05/10] sbitmap: export sbitmap_weight Ming Lei
2020-02-28 23:29 ` [PATCH V2 06/10] sbitmap: add helper of sbitmap_calculate_shift Ming Lei
2020-02-28 23:29 ` Ming Lei [this message]
2020-02-28 23:29 ` [PATCH V2 08/10] blk-mq: pass budget token to dirver via blk_mq_queue_data Ming Lei
2020-02-28 23:29 ` [PATCH V2 09/10] scsi: add scsi_device_busy() to read sdev->device_busy Ming Lei
2020-02-28 23:29 ` [PATCH V2 10/10] scsi: replace sdev->device_busy with sbitmap Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200228232920.20960-8-ming.lei@redhat.com \
    --to=ming.lei@redhat.com \
    --cc=James.Bottomley@HansenPartnership.com \
    --cc=axboe@kernel.dk \
    --cc=bart.vanassche@wdc.com \
    --cc=chaitra.basappa@broadcom.com \
    --cc=emilne@redhat.com \
    --cc=hare@suse.de \
    --cc=kashyap.desai@broadcom.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=osandov@fb.com \
    --cc=sathya.prakash@broadcom.com \
    --cc=shivasharan.srikanteshwara@broadcom.com \
    --cc=suganath-prabu.subramani@broadcom.com \
    --cc=sumit.saxena@broadcom.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.