* [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
@ 2020-04-23 21:05 Salman Qazi
  2020-04-23 21:30 ` Jens Axboe
  2020-04-24  6:15 ` Christoph Hellwig
  0 siblings, 2 replies; 6+ messages in thread
From: Salman Qazi @ 2020-04-23 21:05 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei, Bart Van Assche, linux-block, linux-kernel
  Cc: Jesse Barnes, Gwendal Grignou, Hannes Reinecke,
	Christoph Hellwig, Salman Qazi

Flushes bypass the I/O scheduler and get added to hctx->dispatch
in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and return from the run_work handler and let it
rerun.

Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c | 49 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74cedea56034..b69b780351c1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.  This is necessary to avoid
+ * starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	bool ret = false;
 
 	do {
 		struct request *rq;
@@ -97,6 +102,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -113,6 +123,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 */
 		list_add(&rq->queuelist, &rq_list);
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +142,26 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.  This is necessary to avoid
+ * starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	bool ret = false;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -165,6 +187,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -172,6 +195,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	bool run_again;
+	bool restarted = false;
 	LIST_HEAD(rq_list);
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -180,6 +205,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 
 	hctx->run++;
 
+again:
+	run_again = false;
+
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				run_again = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				run_again = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_again = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		run_again = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	if (run_again) {
+		if (!restarted) {
+			restarted = true;
+			goto again;
+		}
+
+		blk_mq_run_hw_queue(hctx, true);
+	}
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-- 
2.26.1.301.g55bc3eb7cb9-goog



* Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
  2020-04-23 21:05 [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go Salman Qazi
@ 2020-04-23 21:30 ` Jens Axboe
  2020-04-24  1:41   ` Salman Qazi
  2020-04-24  6:15 ` Christoph Hellwig
  1 sibling, 1 reply; 6+ messages in thread
From: Jens Axboe @ 2020-04-23 21:30 UTC (permalink / raw)
  To: Salman Qazi, Ming Lei, Bart Van Assche, linux-block, linux-kernel
  Cc: Jesse Barnes, Gwendal Grignou, Hannes Reinecke, Christoph Hellwig

On 4/23/20 3:05 PM, Salman Qazi wrote:
> Flushes bypass the I/O scheduler and get added to hctx->dispatch
> in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
> hctx->run_work work item and is past the point in
> blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
> 
> The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
> because the I/O scheduler can feed an arbitrary number of commands.
> 
> Since we have only one hctx->run_work, the commands waiting in
> hctx->dispatch will wait an arbitrary length of time for run_work to be
> rerun.
> 
> A similar phenomenon exists with dispatches from the software queue.
> 
> The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
> blk_mq_do_dispatch_ctx and return from the run_work handler and let it
> rerun.

Any changes since v1? It's customary to put that in here too, below
the --- lines.

-- 
Jens Axboe



* Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
  2020-04-23 21:30 ` Jens Axboe
@ 2020-04-24  1:41   ` Salman Qazi
  0 siblings, 0 replies; 6+ messages in thread
From: Salman Qazi @ 2020-04-24  1:41 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Ming Lei, Bart Van Assche, linux-block,
	Linux Kernel Mailing List, Jesse Barnes, Gwendal Grignou,
	Hannes Reinecke, Christoph Hellwig

I remailed it with the changes since v1 added.  But just to answer directly:

Changes since v1:

* Removed max_sched_batch.
* Extended the fix to the software queue.
* Use a return value from blk_mq_do_dispatch_sched to indicate if
  the dispatch should be rerun.
* Some comments added.

On Thu, Apr 23, 2020 at 2:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 4/23/20 3:05 PM, Salman Qazi wrote:
> > Flushes bypass the I/O scheduler and get added to hctx->dispatch
> > in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
> > hctx->run_work work item and is past the point in
> > blk_mq_sched_dispatch_requests where hctx->dispatch is checked.
> >
> > The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
> > because the I/O scheduler can feed an arbitrary number of commands.
> >
> > Since we have only one hctx->run_work, the commands waiting in
> > hctx->dispatch will wait an arbitrary length of time for run_work to be
> > rerun.
> >
> > A similar phenomenon exists with dispatches from the software queue.
> >
> > The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
> > blk_mq_do_dispatch_ctx and return from the run_work handler and let it
> > rerun.
>
> Any changes since v1? It's customary to put that in here too, below
> the --- lines.
>
> --
> Jens Axboe
>


* Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
  2020-04-23 21:05 [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go Salman Qazi
  2020-04-23 21:30 ` Jens Axboe
@ 2020-04-24  6:15 ` Christoph Hellwig
  2020-04-24  6:42   ` Christoph Hellwig
  1 sibling, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2020-04-24  6:15 UTC (permalink / raw)
  To: Salman Qazi
  Cc: Jens Axboe, Ming Lei, Bart Van Assche, linux-block, linux-kernel,
	Jesse Barnes, Gwendal Grignou, Hannes Reinecke,
	Christoph Hellwig

> index 74cedea56034..b69b780351c1 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
>   * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.  This is necessary to avoid
> + * starving flushes.
>   */

Please use up all 80 chars for your comments (just like the existing
part of the comment).

>   * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
>   * its queue by itself in its completion handler, so we don't need to
>   * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
> + *
> + * Returns true if hctx->dispatch was found non-empty and
> + * run_work has to be run again.  This is necessary to avoid
> + * starving flushes.

Same here.

> +again:
> +	run_again = false;
> +
>  	/*
>  	 * If we have previous entries on our dispatch list, grab them first for
>  	 * more fair dispatch.
> @@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  		blk_mq_sched_mark_restart_hctx(hctx);
>  		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
>  			if (has_sched_dispatch)
> -				blk_mq_do_dispatch_sched(hctx);
> +				run_again = blk_mq_do_dispatch_sched(hctx);
>  			else
> -				blk_mq_do_dispatch_ctx(hctx);
> +				run_again = blk_mq_do_dispatch_ctx(hctx);
>  		}
>  	} else if (has_sched_dispatch) {
> -		blk_mq_do_dispatch_sched(hctx);
> +		run_again = blk_mq_do_dispatch_sched(hctx);
>  	} else if (hctx->dispatch_busy) {
>  		/* dequeue request one by one from sw queue if queue is busy */
> -		blk_mq_do_dispatch_ctx(hctx);
> +		run_again = blk_mq_do_dispatch_ctx(hctx);
>  	} else {
>  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
>  		blk_mq_dispatch_rq_list(q, &rq_list, false);
>  	}
> +
> +	if (run_again) {
> +		if (!restarted) {
> +			restarted = true;
> +			goto again;
> +		}
> +
> +		blk_mq_run_hw_queue(hctx, true);
> +	}

This is a weird loop.  I'd split the code between the again label and
the run_again check here into a __blk_mq_sched_dispatch_requests
helper, and then you can do:

	if (__blk_mq_sched_dispatch_requests()) {
		if (__blk_mq_sched_dispatch_requests())
			blk_mq_run_hw_queue(hctx, true);
	}

here.  Preferably with a good comment explaining the logic.
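
A rough sketch of that split, purely for illustration (the helper name
follows the suggestion above and the body just reuses the existing
dispatch logic, so this is an assumption about the shape, not the final
code), might look like:

static bool __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	LIST_HEAD(rq_list);
	bool run_again = false;

	/* grab anything already punted to hctx->dispatch, for fairness */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				run_again = blk_mq_do_dispatch_sched(hctx);
			else
				run_again = blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		run_again = blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue request one by one from sw queue if queue is busy */
		run_again = blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}

	/* true means hctx->dispatch turned non-empty and we stopped early */
	return run_again;
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * Retry once synchronously; if hctx->dispatch has been refilled
	 * again by then, hand off to a fresh run_work instead of looping
	 * here and monopolizing the kworker.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx)) {
		if (__blk_mq_sched_dispatch_requests(hctx))
			blk_mq_run_hw_queue(hctx, true);
	}
}

That keeps the retry bounded: a single run_work invocation does at most
two passes, and a persistent stream of flushes just reschedules the
queue instead of starving everything behind this work item.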


* Re: [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
  2020-04-24  6:15 ` Christoph Hellwig
@ 2020-04-24  6:42   ` Christoph Hellwig
  0 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2020-04-24  6:42 UTC (permalink / raw)
  To: Salman Qazi
  Cc: Jens Axboe, Ming Lei, Bart Van Assche, linux-block, linux-kernel,
	Jesse Barnes, Gwendal Grignou, Hannes Reinecke,
	Christoph Hellwig

On Fri, Apr 24, 2020 at 08:15:29AM +0200, Christoph Hellwig wrote:
> This is a weird loop.  I'd split the code between the again label and
> the run_again check here into a __blk_mq_sched_dispatch_requests
> helper, and then you can do:
> 
> 	if (__blk_mq_sched_dispatch_requests()) {
> 		if (__blk_mq_sched_dispatch_requests())
> 			blk_mq_run_hw_queue(hctx, true);
> 	}
> 
> here.  Preferably with a good comment explaining the logic.

Also I wonder if inverting the return values in the lower level function
would make things a little more readable - a true return suggests
everything worked fine.  Alternatively, 0 for success and -EAGAIN for
"needs a retry" would also be pretty self-documenting.
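
As an illustrative sketch of that convention (only the return type and
the hctx->dispatch check change; the rest of the loop is assumed to
stay as in the current blk_mq_do_dispatch_sched()):

static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		/* flushes are waiting in hctx->dispatch; caller must rerun */
		if (!list_empty_careful(&hctx->dispatch))
			return -EAGAIN;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	return 0;
}

The caller side then reads naturally as well, e.g.

	run_again = blk_mq_do_dispatch_sched(hctx) == -EAGAIN;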


* [PATCH v2] block: Limit number of items taken from the I/O scheduler in one go
@ 2020-04-23 21:48 Salman Qazi
  0 siblings, 0 replies; 6+ messages in thread
From: Salman Qazi @ 2020-04-23 21:48 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei, Bart Van Assche, linux-block, linux-kernel
  Cc: Jesse Barnes, Gwendal Grignou, Hannes Reinecke,
	Christoph Hellwig, Salman Qazi

Flushes bypass the I/O scheduler and get added to hctx->dispatch
in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and return from the run_work handler and let it
rerun.

Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
---
Changes since v1:

* Removed max_sched_batch.
* Extended the fix to the software queue.
* Use a return value from blk_mq_do_dispatch_sched to indicate if
  the dispatch should be rerun.
* Some comments added.

 block/blk-mq-sched.c | 49 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74cedea56034..b69b780351c1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.  This is necessary to avoid
+ * starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	bool ret = false;
 
 	do {
 		struct request *rq;
@@ -97,6 +102,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -113,6 +123,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 */
 		list_add(&rq->queuelist, &rq_list);
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +142,26 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns true if hctx->dispatch was found non-empty and
+ * run_work has to be run again.  This is necessary to avoid
+ * starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	bool ret = false;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = true;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -165,6 +187,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -172,6 +195,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	bool run_again;
+	bool restarted = false;
 	LIST_HEAD(rq_list);
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -180,6 +205,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 
 	hctx->run++;
 
+again:
+	run_again = false;
+
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				run_again = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				run_again = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		run_again = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		run_again = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	if (run_again) {
+		if (!restarted) {
+			restarted = true;
+			goto again;
+		}
+
+		blk_mq_run_hw_queue(hctx, true);
+	}
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-- 
2.26.1.301.g55bc3eb7cb9-goog



