Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
@ 2020-07-28 13:49 Ming Lei
  2020-07-29 10:28 ` Ming Lei
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Ming Lei @ 2020-07-28 13:49 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-block, Ming Lei, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Sagi Grimberg, Bart Van Assche

In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
section during dispatching request, then request queue quiesce is based on
SRCU. What we want to get is low cost added in fast path.

However, from srcu_read_lock/srcu_read_unlock implementation, not see
it is quicker than percpu refcount, so use percpu_ref to implement
queue quiesce. This usage is cleaner and simpler & enough for implementing
queue quiesce. The main requirement is to make sure all read sections to observe
QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.

Also it becomes much easier to add interface of async queue quiesce.

Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sysfs.c   |  2 -
 block/blk-mq.c         | 94 +++++++++++++++++++-----------------------
 block/blk-sysfs.c      |  6 ++-
 include/linux/blk-mq.h |  7 ----
 include/linux/blkdev.h |  4 ++
 5 files changed, 51 insertions(+), 62 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 062229395a50..799db7937105 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -38,8 +38,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 
 	cancel_delayed_work_sync(&hctx->run_work);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3856377b961..ea75d8e005be 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,19 +220,13 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_quiesce_queue(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
 	blk_mq_quiesce_queue_nowait(q);
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-	if (rcu)
+	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
+		percpu_ref_kill(&q->dispatch_counter);
+		wait_event(q->mq_quiesce_wq,
+				percpu_ref_is_zero(&q->dispatch_counter));
+	} else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -248,6 +242,9 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
 {
 	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 
+	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
+		percpu_ref_resurrect(&q->dispatch_counter);
+
 	/* dispatch requests which are inserted during quiescing */
 	blk_mq_run_hw_queues(q, true);
 }
@@ -699,24 +696,20 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
-	__releases(hctx->srcu)
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
+		percpu_ref_put(&hctx->queue->dispatch_counter);
 }
 
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
-	__acquires(hctx->srcu)
+static void hctx_lock(struct blk_mq_hw_ctx *hctx)
 {
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-		/* shut up gcc false positive */
-		*srcu_idx = 0;
+	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_lock();
-	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
+	else
+		percpu_ref_get(&hctx->queue->dispatch_counter);
 }
 
 /**
@@ -1486,8 +1479,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
  */
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	int srcu_idx;
-
 	/*
 	 * We should be running this queue from one of the CPUs that
 	 * are mapped to it.
@@ -1521,9 +1512,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
+	hctx_lock(hctx);
 	blk_mq_sched_dispatch_requests(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 }
 
 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -1635,7 +1626,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
  */
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	int srcu_idx;
 	bool need_run;
 
 	/*
@@ -1646,10 +1636,10 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
 	 * quiesced.
 	 */
-	hctx_lock(hctx, &srcu_idx);
+	hctx_lock(hctx);
 	need_run = !blk_queue_quiesced(hctx->queue) &&
 		blk_mq_hctx_has_pending(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 
 	if (need_run)
 		__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -2035,11 +2025,10 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
 	blk_status_t ret;
-	int srcu_idx;
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
+	hctx_lock(hctx);
 
 	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
@@ -2047,19 +2036,18 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	else if (ret != BLK_STS_OK)
 		blk_mq_end_request(rq, ret);
 
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 }
 
 blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
 	blk_status_t ret;
-	int srcu_idx;
 	blk_qc_t unused_cookie;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
-	hctx_lock(hctx, &srcu_idx);
+	hctx_lock(hctx);
 	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 
 	return ret;
 }
@@ -2590,20 +2578,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -2641,7 +2615,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
 
@@ -2683,8 +2657,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3161,6 +3133,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	mutex_unlock(&q->sysfs_lock);
 }
 
+static void blk_mq_dispatch_counter_release(struct percpu_ref *ref)
+{
+	struct request_queue *q = container_of(ref, struct request_queue,
+				dispatch_counter);
+	wake_up_all(&q->mq_quiesce_wq);
+}
+
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q,
 						  bool elevator_init)
@@ -3177,6 +3156,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (blk_mq_alloc_ctxs(q))
 		goto err_poll;
 
+	if (set->flags & BLK_MQ_F_BLOCKING) {
+		init_waitqueue_head(&q->mq_quiesce_wq);
+		if (percpu_ref_init(&q->dispatch_counter,
+					blk_mq_dispatch_counter_release,
+					PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+			goto err_hctxs;
+	}
+
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
 
@@ -3185,7 +3172,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
-		goto err_hctxs;
+		goto err_dispatch_counter;
 
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
@@ -3219,6 +3206,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	return q;
 
+err_dispatch_counter:
+	if (set->flags & BLK_MQ_F_BLOCKING)
+		percpu_ref_exit(&q->dispatch_counter);
 err_hctxs:
 	kfree(q->queue_hw_ctx);
 	q->nr_hw_queues = 0;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index be67952e7be2..8f22de582874 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -914,9 +914,13 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_queue_free_zone_bitmaps(q);
 
-	if (queue_is_mq(q))
+	if (queue_is_mq(q)) {
 		blk_mq_release(q);
 
+		if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
+			percpu_ref_exit(&q->dispatch_counter);
+	}
+
 	blk_trace_shutdown(q);
 	mutex_lock(&q->debugfs_mutex);
 	debugfs_remove_recursive(q->debugfs_dir);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..70e64e87586c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,13 +169,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 06995b96e946..32d6e194e6eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -567,6 +567,10 @@ struct request_queue {
 	struct list_head	tag_set_list;
 	struct bio_set		bio_split;
 
+	/* only used for BLK_MQ_F_BLOCKING */
+	struct percpu_ref	dispatch_counter;
+	wait_queue_head_t	mq_quiesce_wq;
+
 	struct dentry		*debugfs_dir;
 
 #ifdef CONFIG_BLK_DEBUG_FS
-- 
2.25.2


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-28 13:49 [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
@ 2020-07-29 10:28 ` Ming Lei
  2020-07-29 15:42   ` Sagi Grimberg
  2020-07-29 11:20 ` Johannes Thumshirn
  2020-07-29 16:12 ` Keith Busch
  2 siblings, 1 reply; 20+ messages in thread
From: Ming Lei @ 2020-07-29 10:28 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Sagi Grimberg, Bart Van Assche

On Tue, Jul 28, 2020 at 09:49:38PM +0800, Ming Lei wrote:
> In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> section during dispatching request, then request queue quiesce is based on
> SRCU. What we want to get is low cost added in fast path.
> 
> However, from srcu_read_lock/srcu_read_unlock implementation, not see
> it is quicker than percpu refcount, so use percpu_ref to implement
> queue quiesce. This usage is cleaner and simpler & enough for implementing
> queue quiesce. The main requirement is to make sure all read sections to observe
> QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> 
> Also it becomes much easier to add interface of async queue quiesce.

BTW, no obvious IOPS difference is observed with this patch applied when running
io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-28 13:49 [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
  2020-07-29 10:28 ` Ming Lei
@ 2020-07-29 11:20 ` Johannes Thumshirn
  2020-07-29 16:12 ` Keith Busch
  2 siblings, 0 replies; 20+ messages in thread
From: Johannes Thumshirn @ 2020-07-29 11:20 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, Christoph Hellwig
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Sagi Grimberg, Bart Van Assche

On 28/07/2020 15:50, Ming Lei wrote:
[...]
> -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> -	__acquires(hctx->srcu)
> +static void hctx_lock(struct blk_mq_hw_ctx *hctx)
>  {
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
> +	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>  		rcu_read_lock();
> -	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +	else
> +		percpu_ref_get(&hctx->queue->dispatch_counter);
>  }

I quite like this because it hides the internals of hctx_lock() (rcu vs srcu) 
from the callers.

>  /**
> @@ -1486,8 +1479,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>   */
>  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  {
> -	int srcu_idx;
> -
>  	/*
>  	 * We should be running this queue from one of the CPUs that
>  	 * are mapped to it.
> @@ -1521,9 +1512,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  
>  	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
>  
> -	hctx_lock(hctx, &srcu_idx);
> +	hctx_lock(hctx);
>  	blk_mq_sched_dispatch_requests(hctx);
> -	hctx_unlock(hctx, srcu_idx);
> +	hctx_unlock(hctx);


blk_mq_sched_dispatch_requests also has this comment at the beginning:

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))

I think the SRCU part needs to be changed to percpu_ref or the whole comment
should probably just mention the hctx_lock().


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 10:28 ` Ming Lei
@ 2020-07-29 15:42   ` Sagi Grimberg
  2020-07-29 15:49     ` Ming Lei
  0 siblings, 1 reply; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-29 15:42 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, Christoph Hellwig
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Bart Van Assche


>> In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
>> section during dispatching request, then request queue quiesce is based on
>> SRCU. What we want to get is low cost added in fast path.
>>
>> However, from srcu_read_lock/srcu_read_unlock implementation, not see
>> it is quicker than percpu refcount, so use percpu_ref to implement
>> queue quiesce. This usage is cleaner and simpler & enough for implementing
>> queue quiesce. The main requirement is to make sure all read sections to observe
>> QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
>>
>> Also it becomes much easier to add interface of async queue quiesce.
> 
> BTW, no obvious IOPS difference is observed with this patch applied when running
> io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.

Thanks Ming, can you test for non-blocking on the same setup?

I can test some reset storms during traffic.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 15:42   ` Sagi Grimberg
@ 2020-07-29 15:49     ` Ming Lei
  2020-07-29 22:37       ` Sagi Grimberg
  0 siblings, 1 reply; 20+ messages in thread
From: Ming Lei @ 2020-07-29 15:49 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche

On Wed, Jul 29, 2020 at 08:42:31AM -0700, Sagi Grimberg wrote:
> 
> > > In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> > > section during dispatching request, then request queue quiesce is based on
> > > SRCU. What we want to get is low cost added in fast path.
> > > 
> > > However, from srcu_read_lock/srcu_read_unlock implementation, not see
> > > it is quicker than percpu refcount, so use percpu_ref to implement
> > > queue quiesce. This usage is cleaner and simpler & enough for implementing
> > > queue quiesce. The main requirement is to make sure all read sections to observe
> > > QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> > > 
> > > Also it becomes much easier to add interface of async queue quiesce.
> > 
> > BTW, no obvious IOPS difference is observed with this patch applied when running
> > io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.
> 
> Thanks Ming, can you test for non-blocking on the same setup?

OK, I can do that, but this patch supposes to not affect non-blocking,
care to share your motivation for testing non-blocking?


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-28 13:49 [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
  2020-07-29 10:28 ` Ming Lei
  2020-07-29 11:20 ` Johannes Thumshirn
@ 2020-07-29 16:12 ` Keith Busch
  2020-07-29 22:16   ` Ming Lei
  2 siblings, 1 reply; 20+ messages in thread
From: Keith Busch @ 2020-07-29 16:12 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Sagi Grimberg, Bart Van Assche

On Tue, Jul 28, 2020 at 09:49:38PM +0800, Ming Lei wrote:
>  void blk_mq_quiesce_queue(struct request_queue *q)
>  {
> -	struct blk_mq_hw_ctx *hctx;
> -	unsigned int i;
> -	bool rcu = false;
> -
>  	blk_mq_quiesce_queue_nowait(q);
>  
> -	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (hctx->flags & BLK_MQ_F_BLOCKING)
> -			synchronize_srcu(hctx->srcu);
> -		else
> -			rcu = true;
> -	}
> -	if (rcu)
> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
> +		percpu_ref_kill(&q->dispatch_counter);
> +		wait_event(q->mq_quiesce_wq,
> +				percpu_ref_is_zero(&q->dispatch_counter));
> +	} else
>  		synchronize_rcu();
>  }



> +static void hctx_lock(struct blk_mq_hw_ctx *hctx)
>  {
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
> +	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>  		rcu_read_lock();
> -	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +	else
> +		percpu_ref_get(&hctx->queue->dispatch_counter);
>  }

percpu_ref_get() will always succeed, even after quiesce kills it.
Isn't it possible that 'percpu_ref_is_zero(&q->dispatch_counter))' may
never reach 0? We only need to ensure that dispatchers will observe
blk_queue_quiesced(). That doesn't require that there are no current
dispatchers.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 16:12 ` Keith Busch
@ 2020-07-29 22:16   ` Ming Lei
  2020-07-29 22:42     ` Sagi Grimberg
  0 siblings, 1 reply; 20+ messages in thread
From: Ming Lei @ 2020-07-29 22:16 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Sagi Grimberg, Bart Van Assche

On Wed, Jul 29, 2020 at 09:12:29AM -0700, Keith Busch wrote:
> On Tue, Jul 28, 2020 at 09:49:38PM +0800, Ming Lei wrote:
> >  void blk_mq_quiesce_queue(struct request_queue *q)
> >  {
> > -	struct blk_mq_hw_ctx *hctx;
> > -	unsigned int i;
> > -	bool rcu = false;
> > -
> >  	blk_mq_quiesce_queue_nowait(q);
> >  
> > -	queue_for_each_hw_ctx(q, hctx, i) {
> > -		if (hctx->flags & BLK_MQ_F_BLOCKING)
> > -			synchronize_srcu(hctx->srcu);
> > -		else
> > -			rcu = true;
> > -	}
> > -	if (rcu)
> > +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
> > +		percpu_ref_kill(&q->dispatch_counter);
> > +		wait_event(q->mq_quiesce_wq,
> > +				percpu_ref_is_zero(&q->dispatch_counter));
> > +	} else
> >  		synchronize_rcu();
> >  }
> 
> 
> 
> > +static void hctx_lock(struct blk_mq_hw_ctx *hctx)
> >  {
> > -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > -		/* shut up gcc false positive */
> > -		*srcu_idx = 0;
> > +	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> >  		rcu_read_lock();
> > -	} else
> > -		*srcu_idx = srcu_read_lock(hctx->srcu);
> > +	else
> > +		percpu_ref_get(&hctx->queue->dispatch_counter);
> >  }
> 
> percpu_ref_get() will always succeed, even after quiesce kills it.
> Isn't it possible that 'percpu_ref_is_zero(&q->dispatch_counter))' may
> never reach 0? We only need to ensure that dispatchers will observe
> blk_queue_quiesced(). That doesn't require that there are no current
> dispatchers.

IMO it shouldn't be one issue in reality, because:

- when dispatch can't make progress, the submission side will finally
  stop because we either run queue from submission side or request
  completion
 
- submission side stops because we always have very limited requests

- completion side stops because requests queued to device is limited
too

We still can handle this case by not dispatch in case that percpu_ref_tryget()
returns false, which will change the usage into the following way:

        if (hctx_lock(hctx)) {
        	blk_mq_sched_dispatch_requests(hctx);
        	hctx_unlock(hctx);
		}

And __blk_mq_try_issue_directly() needs a bit special treatment because
the request has to be inserted to queue after queue becomes quiesced.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 15:49     ` Ming Lei
@ 2020-07-29 22:37       ` Sagi Grimberg
  2020-07-30 14:53         ` Ming Lei
  0 siblings, 1 reply; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-29 22:37 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche


>>>> In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
>>>> section during dispatching request, then request queue quiesce is based on
>>>> SRCU. What we want to get is low cost added in fast path.
>>>>
>>>> However, from srcu_read_lock/srcu_read_unlock implementation, not see
>>>> it is quicker than percpu refcount, so use percpu_ref to implement
>>>> queue quiesce. This usage is cleaner and simpler & enough for implementing
>>>> queue quiesce. The main requirement is to make sure all read sections to observe
>>>> QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
>>>>
>>>> Also it becomes much easier to add interface of async queue quiesce.
>>>
>>> BTW, no obvious IOPS difference is observed with this patch applied when running
>>> io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.
>>
>> Thanks Ming, can you test for non-blocking on the same setup?
> 
> OK, I can do that, but this patch supposes to not affect non-blocking,
> care to share your motivation for testing non-blocking?

I think it will be a significant improvement to have a single code path.
The code will be more robust and we won't need to face issues that are
specific for blocking.

If the cost is negligible, I think the upside is worth it.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 22:16   ` Ming Lei
@ 2020-07-29 22:42     ` Sagi Grimberg
  2020-07-30 15:05       ` Ming Lei
  0 siblings, 1 reply; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-29 22:42 UTC (permalink / raw)
  To: Ming Lei, Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche


>>>   void blk_mq_quiesce_queue(struct request_queue *q)
>>>   {
>>> -	struct blk_mq_hw_ctx *hctx;
>>> -	unsigned int i;
>>> -	bool rcu = false;
>>> -
>>>   	blk_mq_quiesce_queue_nowait(q);
>>>   
>>> -	queue_for_each_hw_ctx(q, hctx, i) {
>>> -		if (hctx->flags & BLK_MQ_F_BLOCKING)
>>> -			synchronize_srcu(hctx->srcu);
>>> -		else
>>> -			rcu = true;
>>> -	}
>>> -	if (rcu)
>>> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
>>> +		percpu_ref_kill(&q->dispatch_counter);
>>> +		wait_event(q->mq_quiesce_wq,
>>> +				percpu_ref_is_zero(&q->dispatch_counter));
>>> +	} else
>>>   		synchronize_rcu();
>>>   }
>>
>>
>>
>>> +static void hctx_lock(struct blk_mq_hw_ctx *hctx)
>>>   {
>>> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
>>> -		/* shut up gcc false positive */
>>> -		*srcu_idx = 0;
>>> +	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>>>   		rcu_read_lock();
>>> -	} else
>>> -		*srcu_idx = srcu_read_lock(hctx->srcu);
>>> +	else
>>> +		percpu_ref_get(&hctx->queue->dispatch_counter);
>>>   }
>>
>> percpu_ref_get() will always succeed, even after quiesce kills it.
>> Isn't it possible that 'percpu_ref_is_zero(&q->dispatch_counter))' may
>> never reach 0? We only need to ensure that dispatchers will observe
>> blk_queue_quiesced(). That doesn't require that there are no current
>> dispatchers.
> 
> IMO it shouldn't be one issue in reality, because:
> 
> - when dispatch can't make progress, the submission side will finally
>    stop because we either run queue from submission side or request
>    completion
>   
> - submission side stops because we always have very limited requests
> 
> - completion side stops because requests queued to device is limited
> too

I don't think that any requests should pass after the kill was called,
otherwise how can we safely quiesce if requests can come in after
it?

> 
> We still can handle this case by not dispatch in case that percpu_ref_tryget()

You meant tryget_live right?

> returns false, which will change the usage into the following way:
> 
>          if (hctx_lock(hctx)) {
>          	blk_mq_sched_dispatch_requests(hctx);
>          	hctx_unlock(hctx);
> 		}
> 
> And __blk_mq_try_issue_directly() needs a bit special treatment because
> the request has to be inserted to queue after queue becomes quiesced.

Agreed.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 22:37       ` Sagi Grimberg
@ 2020-07-30 14:53         ` Ming Lei
  2020-07-30 16:10           ` Sagi Grimberg
  0 siblings, 1 reply; 20+ messages in thread
From: Ming Lei @ 2020-07-30 14:53 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche

On Wed, Jul 29, 2020 at 03:37:27PM -0700, Sagi Grimberg wrote:
> 
> > > > > In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> > > > > section during dispatching request, then request queue quiesce is based on
> > > > > SRCU. What we want to get is low cost added in fast path.
> > > > > 
> > > > > However, from srcu_read_lock/srcu_read_unlock implementation, not see
> > > > > it is quicker than percpu refcount, so use percpu_ref to implement
> > > > > queue quiesce. This usage is cleaner and simpler & enough for implementing
> > > > > queue quiesce. The main requirement is to make sure all read sections to observe
> > > > > QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> > > > > 
> > > > > Also it becomes much easier to add interface of async queue quiesce.
> > > > 
> > > > BTW, no obvious IOPS difference is observed with this patch applied when running
> > > > io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.
> > > 
> > > Thanks Ming, can you test for non-blocking on the same setup?
> > 
> > OK, I can do that, but this patch supposes to not affect non-blocking,
> > care to share your motivation for testing non-blocking?
> 
> I think it will be a significant improvement to have a single code path.
> The code will be more robust and we won't need to face issues that are
> specific for blocking.
> 
> If the cost is negligible, I think the upside is worth it.
> 

rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
and I don't think percpu_refcount is better than it, so I'd suggest to
not switch non-blocking into this way.

BTW, in case of blocking, one hctx may dispatch at most one request because there
is only single .run_work, even though when .queue_rq() is slept, that said
blk_mq_submit_bio() queues bio in sync style. This way won't be very efficient.
So percpu_refcount should be good enough for blocking code path, but may not be
well enough for non-blocking case.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-29 22:42     ` Sagi Grimberg
@ 2020-07-30 15:05       ` Ming Lei
  0 siblings, 0 replies; 20+ messages in thread
From: Ming Lei @ 2020-07-30 15:05 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Keith Busch, Jens Axboe, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche

On Wed, Jul 29, 2020 at 03:42:29PM -0700, Sagi Grimberg wrote:
> 
> > > >   void blk_mq_quiesce_queue(struct request_queue *q)
> > > >   {
> > > > -	struct blk_mq_hw_ctx *hctx;
> > > > -	unsigned int i;
> > > > -	bool rcu = false;
> > > > -
> > > >   	blk_mq_quiesce_queue_nowait(q);
> > > > -	queue_for_each_hw_ctx(q, hctx, i) {
> > > > -		if (hctx->flags & BLK_MQ_F_BLOCKING)
> > > > -			synchronize_srcu(hctx->srcu);
> > > > -		else
> > > > -			rcu = true;
> > > > -	}
> > > > -	if (rcu)
> > > > +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
> > > > +		percpu_ref_kill(&q->dispatch_counter);
> > > > +		wait_event(q->mq_quiesce_wq,
> > > > +				percpu_ref_is_zero(&q->dispatch_counter));
> > > > +	} else
> > > >   		synchronize_rcu();
> > > >   }
> > > 
> > > 
> > > 
> > > > +static void hctx_lock(struct blk_mq_hw_ctx *hctx)
> > > >   {
> > > > -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > > > -		/* shut up gcc false positive */
> > > > -		*srcu_idx = 0;
> > > > +	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> > > >   		rcu_read_lock();
> > > > -	} else
> > > > -		*srcu_idx = srcu_read_lock(hctx->srcu);
> > > > +	else
> > > > +		percpu_ref_get(&hctx->queue->dispatch_counter);
> > > >   }
> > > 
> > > percpu_ref_get() will always succeed, even after quiesce kills it.
> > > Isn't it possible that 'percpu_ref_is_zero(&q->dispatch_counter))' may
> > > never reach 0? We only need to ensure that dispatchers will observe
> > > blk_queue_quiesced(). That doesn't require that there are no current
> > > dispatchers.
> > 
> > IMO it shouldn't be one issue in reality, because:
> > 
> > - when dispatch can't make progress, the submission side will finally
> >    stop because we either run queue from submission side or request
> >    completion
> > - submission side stops because we always have very limited requests
> > 
> > - completion side stops because requests queued to device is limited
> > too
> 
> I don't think that any requests should pass after the kill was called,
> otherwise how can we safely quiesce if requests can come in after
> it?

What we guarantee is that no request can be queued to LLD after
blk_mq_quiesce_queue returns.

With percpu_refcount, once percpu_ref_is_zero(&q->dispatch_counter)
returns true, all code path can observe the QUIESCED flag reliably just
like what SRCU does, so no any request can pass to LLD after blk_mq_quiesce_queue
returns.

> 
> > 
> > We still can handle this case by not dispatch in case that percpu_ref_tryget()
> 
> You meant tryget_live right?

Both works, but tryget_live could be better.


Thanks, 
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 14:53         ` Ming Lei
@ 2020-07-30 16:10           ` Sagi Grimberg
  2020-07-30 18:18             ` Keith Busch
  2020-07-31  0:28             ` Ming Lei
  0 siblings, 2 replies; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-30 16:10 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche


>>>>>> In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
>>>>>> section during dispatching request, then request queue quiesce is based on
>>>>>> SRCU. What we want to get is low cost added in fast path.
>>>>>>
>>>>>> However, from srcu_read_lock/srcu_read_unlock implementation, not see
>>>>>> it is quicker than percpu refcount, so use percpu_ref to implement
>>>>>> queue quiesce. This usage is cleaner and simpler & enough for implementing
>>>>>> queue quiesce. The main requirement is to make sure all read sections to observe
>>>>>> QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
>>>>>>
>>>>>> Also it becomes much easier to add interface of async queue quiesce.
>>>>>
>>>>> BTW, no obvious IOPS difference is observed with this patch applied when running
>>>>> io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.
>>>>
>>>> Thanks Ming, can you test for non-blocking on the same setup?
>>>
>>> OK, I can do that, but this patch supposes to not affect non-blocking,
>>> care to share your motivation for testing non-blocking?
>>
>> I think it will be a significant improvement to have a single code path.
>> The code will be more robust and we won't need to face issues that are
>> specific for blocking.
>>
>> If the cost is negligible, I think the upside is worth it.
>>
> 
> rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> and I don't think percpu_refcount is better than it, so I'd suggest to
> not switch non-blocking into this way.

It's not a matter of which is better, its a matter of making the code
more robust because it has a single code-path. If moving to percpu_ref
is negligible, I would suggest to move both, I don't want to have two
completely different mechanism for blocking vs. non-blocking.

> BTW, in case of blocking, one hctx may dispatch at most one request because there
> is only single .run_work, even though when .queue_rq() is slept, that said
> blk_mq_submit_bio() queues bio in sync style. This way won't be very efficient.
> So percpu_refcount should be good enough for blocking code path, but may not be
> well enough for non-blocking case.

Not sure what you mean, the percpu_ref is taken exactly where rcu is
taken, not sure what is the difference.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 16:10           ` Sagi Grimberg
@ 2020-07-30 18:18             ` Keith Busch
  2020-07-30 18:23               ` Sagi Grimberg
  2020-07-31  0:24               ` Ming Lei
  2020-07-31  0:28             ` Ming Lei
  1 sibling, 2 replies; 20+ messages in thread
From: Keith Busch @ 2020-07-30 18:18 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Jens Axboe, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche

On Thu, Jul 30, 2020 at 09:10:48AM -0700, Sagi Grimberg wrote:
> > > I think it will be a significant improvement to have a single code path.
> > > The code will be more robust and we won't need to face issues that are
> > > specific for blocking.
> > > 
> > > If the cost is negligible, I think the upside is worth it.
> > > 
> > 
> > rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> > and I don't think percpu_refcount is better than it, so I'd suggest to
> > not switch non-blocking into this way.
> 
> It's not a matter of which is better, its a matter of making the code
> more robust because it has a single code-path. If moving to percpu_ref
> is negligible, I would suggest to move both, I don't want to have two
> completely different mechanism for blocking vs. non-blocking.

FWIW, I proposed an hctx percpu_ref over a year ago (but for a
completely different reason), and it was measured as too costly.

  https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 18:18             ` Keith Busch
@ 2020-07-30 18:23               ` Sagi Grimberg
  2020-07-30 19:27                 ` Keith Busch
  2020-07-31  0:24               ` Ming Lei
  1 sibling, 1 reply; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-30 18:23 UTC (permalink / raw)
  To: Keith Busch
  Cc: Ming Lei, Jens Axboe, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche


>>>> I think it will be a significant improvement to have a single code path.
>>>> The code will be more robust and we won't need to face issues that are
>>>> specific for blocking.
>>>>
>>>> If the cost is negligible, I think the upside is worth it.
>>>>
>>>
>>> rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
>>> and I don't think percpu_refcount is better than it, so I'd suggest to
>>> not switch non-blocking into this way.
>>
>> It's not a matter of which is better, its a matter of making the code
>> more robust because it has a single code-path. If moving to percpu_ref
>> is negligible, I would suggest to move both, I don't want to have two
>> completely different mechanism for blocking vs. non-blocking.
> 
> FWIW, I proposed an hctx percpu_ref over a year ago (but for a
> completely different reason), and it was measured as too costly.
> 
>    https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/

If this is the case, we shouldn't consider this as an alternative at 
all, and move forward with either the original proposal or what
ming proposed to move a counter to the tagset.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 18:23               ` Sagi Grimberg
@ 2020-07-30 19:27                 ` Keith Busch
  2020-07-30 19:53                   ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Keith Busch @ 2020-07-30 19:27 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Jens Axboe, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche

On Thu, Jul 30, 2020 at 11:23:58AM -0700, Sagi Grimberg wrote:
> 
> > > > > I think it will be a significant improvement to have a single code path.
> > > > > The code will be more robust and we won't need to face issues that are
> > > > > specific for blocking.
> > > > > 
> > > > > If the cost is negligible, I think the upside is worth it.
> > > > > 
> > > > 
> > > > rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> > > > and I don't think percpu_refcount is better than it, so I'd suggest to
> > > > not switch non-blocking into this way.
> > > 
> > > It's not a matter of which is better, its a matter of making the code
> > > more robust because it has a single code-path. If moving to percpu_ref
> > > is negligible, I would suggest to move both, I don't want to have two
> > > completely different mechanism for blocking vs. non-blocking.
> > 
> > FWIW, I proposed an hctx percpu_ref over a year ago (but for a
> > completely different reason), and it was measured as too costly.
> > 
> >    https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/
> 
> If this is the case, we shouldn't consider this as an alternative at all,
> and move forward with either the original proposal or what
> ming proposed to move a counter to the tagset.

Well, the point I was trying to make is that we shouldn't bother making
blocking and non-blocking dispatchers use the same synchronization since
non-blocking has a very cheap solution that blocking can't use.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 19:27                 ` Keith Busch
@ 2020-07-30 19:53                   ` Jens Axboe
  2020-07-30 21:03                     ` Sagi Grimberg
  0 siblings, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2020-07-30 19:53 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche

On 7/30/20 1:27 PM, Keith Busch wrote:
> On Thu, Jul 30, 2020 at 11:23:58AM -0700, Sagi Grimberg wrote:
>>
>>>>>> I think it will be a significant improvement to have a single code path.
>>>>>> The code will be more robust and we won't need to face issues that are
>>>>>> specific for blocking.
>>>>>>
>>>>>> If the cost is negligible, I think the upside is worth it.
>>>>>>
>>>>>
>>>>> rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
>>>>> and I don't think percpu_refcount is better than it, so I'd suggest to
>>>>> not switch non-blocking into this way.
>>>>
>>>> It's not a matter of which is better, its a matter of making the code
>>>> more robust because it has a single code-path. If moving to percpu_ref
>>>> is negligible, I would suggest to move both, I don't want to have two
>>>> completely different mechanism for blocking vs. non-blocking.
>>>
>>> FWIW, I proposed an hctx percpu_ref over a year ago (but for a
>>> completely different reason), and it was measured as too costly.
>>>
>>>    https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/
>>
>> If this is the case, we shouldn't consider this as an alternative at all,
>> and move forward with either the original proposal or what
>> ming proposed to move a counter to the tagset.
> 
> Well, the point I was trying to make is that we shouldn't bother making
> blocking and non-blocking dispatchers use the same synchronization since
> non-blocking has a very cheap solution that blocking can't use.

I fully agree with that point.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 19:53                   ` Jens Axboe
@ 2020-07-30 21:03                     ` Sagi Grimberg
  2020-07-31  0:33                       ` Ming Lei
  0 siblings, 1 reply; 20+ messages in thread
From: Sagi Grimberg @ 2020-07-30 21:03 UTC (permalink / raw)
  To: Jens Axboe, Keith Busch
  Cc: Ming Lei, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche


>>>>>>> I think it will be a significant improvement to have a single code path.
>>>>>>> The code will be more robust and we won't need to face issues that are
>>>>>>> specific for blocking.
>>>>>>>
>>>>>>> If the cost is negligible, I think the upside is worth it.
>>>>>>>
>>>>>>
>>>>>> rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
>>>>>> and I don't think percpu_refcount is better than it, so I'd suggest to
>>>>>> not switch non-blocking into this way.
>>>>>
>>>>> It's not a matter of which is better, its a matter of making the code
>>>>> more robust because it has a single code-path. If moving to percpu_ref
>>>>> is negligible, I would suggest to move both, I don't want to have two
>>>>> completely different mechanism for blocking vs. non-blocking.
>>>>
>>>> FWIW, I proposed an hctx percpu_ref over a year ago (but for a
>>>> completely different reason), and it was measured as too costly.
>>>>
>>>>     https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/
>>>
>>> If this is the case, we shouldn't consider this as an alternative at all,
>>> and move forward with either the original proposal or what
>>> ming proposed to move a counter to the tagset.
>>
>> Well, the point I was trying to make is that we shouldn't bother making
>> blocking and non-blocking dispatchers use the same synchronization since
>> non-blocking has a very cheap solution that blocking can't use.
> 
> I fully agree with that point.

I also agree, just said we should use the same mechanisms, IFF its not
expensive. But I'm concerned that if we use completely different 
mechanisms we are likely to get wrong assumptions and break one at some
point.

Hence my suggestion to move back to srcu and place the rcu_head in the hctx.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 18:18             ` Keith Busch
  2020-07-30 18:23               ` Sagi Grimberg
@ 2020-07-31  0:24               ` Ming Lei
  1 sibling, 0 replies; 20+ messages in thread
From: Ming Lei @ 2020-07-31  0:24 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche

On Thu, Jul 30, 2020 at 11:18:57AM -0700, Keith Busch wrote:
> On Thu, Jul 30, 2020 at 09:10:48AM -0700, Sagi Grimberg wrote:
> > > > I think it will be a significant improvement to have a single code path.
> > > > The code will be more robust and we won't need to face issues that are
> > > > specific for blocking.
> > > > 
> > > > If the cost is negligible, I think the upside is worth it.
> > > > 
> > > 
> > > rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> > > and I don't think percpu_refcount is better than it, so I'd suggest to
> > > not switch non-blocking into this way.
> > 
> > It's not a matter of which is better, its a matter of making the code
> > more robust because it has a single code-path. If moving to percpu_ref
> > is negligible, I would suggest to move both, I don't want to have two
> > completely different mechanism for blocking vs. non-blocking.
> 
> FWIW, I proposed an hctx percpu_ref over a year ago (but for a
> completely different reason), and it was measured as too costly.
> 
>   https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/

That is why I don't want to switch non-blocking to percpu-refcount.

However, cost of srcu read lock/unlock is basically similar with percpu
ref.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 16:10           ` Sagi Grimberg
  2020-07-30 18:18             ` Keith Busch
@ 2020-07-31  0:28             ` Ming Lei
  1 sibling, 0 replies; 20+ messages in thread
From: Ming Lei @ 2020-07-31  0:28 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, linux-block, Lai Jiangshan,
	Paul E . McKenney, Josh Triplett, Bart Van Assche

On Thu, Jul 30, 2020 at 09:10:48AM -0700, Sagi Grimberg wrote:
> 
> > > > > > > In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> > > > > > > section during dispatching request, then request queue quiesce is based on
> > > > > > > SRCU. What we want to get is low cost added in fast path.
> > > > > > > 
> > > > > > > However, from srcu_read_lock/srcu_read_unlock implementation, not see
> > > > > > > it is quicker than percpu refcount, so use percpu_ref to implement
> > > > > > > queue quiesce. This usage is cleaner and simpler & enough for implementing
> > > > > > > queue quiesce. The main requirement is to make sure all read sections to observe
> > > > > > > QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> > > > > > > 
> > > > > > > Also it becomes much easier to add interface of async queue quiesce.
> > > > > > 
> > > > > > BTW, no obvious IOPS difference is observed with this patch applied when running
> > > > > > io on null_blk(blocking, submit_queues=32) in one dual-socket, 32cores system.
> > > > > 
> > > > > Thanks Ming, can you test for non-blocking on the same setup?
> > > > 
> > > > OK, I can do that, but this patch supposes to not affect non-blocking,
> > > > care to share your motivation for testing non-blocking?
> > > 
> > > I think it will be a significant improvement to have a single code path.
> > > The code will be more robust and we won't need to face issues that are
> > > specific for blocking.
> > > 
> > > If the cost is negligible, I think the upside is worth it.
> > > 
> > 
> > rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> > and I don't think percpu_refcount is better than it, so I'd suggest to
> > not switch non-blocking into this way.
> 
> It's not a matter of which is better, its a matter of making the code
> more robust because it has a single code-path. If moving to percpu_ref
> is negligible, I would suggest to move both, I don't want to have two
> completely different mechanism for blocking vs. non-blocking.

RCU and SRCU have been different mechanism already.

> 
> > BTW, in case of blocking, one hctx may dispatch at most one request because there
> > is only single .run_work, even though when .queue_rq() is slept, that said
> > blk_mq_submit_bio() queues bio in sync style. This way won't be very efficient.
> > So percpu_refcount should be good enough for blocking code path, but may not be
> > well enough for non-blocking case.
> 
> Not sure what you mean, the percpu_ref is taken exactly where rcu is
> taken, not sure what is the difference.

My point is that blocking can't be efficient enough, when .queue_rq() is
slept, no any request can be queued to this hctx any more.


thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-07-30 21:03                     ` Sagi Grimberg
@ 2020-07-31  0:33                       ` Ming Lei
  0 siblings, 0 replies; 20+ messages in thread
From: Ming Lei @ 2020-07-31  0:33 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Keith Busch, Christoph Hellwig, linux-block,
	Lai Jiangshan, Paul E . McKenney, Josh Triplett, Bart Van Assche

On Thu, Jul 30, 2020 at 02:03:02PM -0700, Sagi Grimberg wrote:
> 
> > > > > > > > I think it will be a significant improvement to have a single code path.
> > > > > > > > The code will be more robust and we won't need to face issues that are
> > > > > > > > specific for blocking.
> > > > > > > > 
> > > > > > > > If the cost is negligible, I think the upside is worth it.
> > > > > > > > 
> > > > > > > 
> > > > > > > rcu_read_lock and rcu_read_unlock has been proved as efficient enough,
> > > > > > > and I don't think percpu_refcount is better than it, so I'd suggest to
> > > > > > > not switch non-blocking into this way.
> > > > > > 
> > > > > > It's not a matter of which is better, its a matter of making the code
> > > > > > more robust because it has a single code-path. If moving to percpu_ref
> > > > > > is negligible, I would suggest to move both, I don't want to have two
> > > > > > completely different mechanism for blocking vs. non-blocking.
> > > > > 
> > > > > FWIW, I proposed an hctx percpu_ref over a year ago (but for a
> > > > > completely different reason), and it was measured as too costly.
> > > > > 
> > > > >     https://lore.kernel.org/linux-block/d4a4b6c0-3ea8-f748-85b0-6b39c5023a6f@kernel.dk/
> > > > 
> > > > If this is the case, we shouldn't consider this as an alternative at all,
> > > > and move forward with either the original proposal or what
> > > > ming proposed to move a counter to the tagset.
> > > 
> > > Well, the point I was trying to make is that we shouldn't bother making
> > > blocking and non-blocking dispatchers use the same synchronization since
> > > non-blocking has a very cheap solution that blocking can't use.
> > 
> > I fully agree with that point.
> 
> I also agree, just said we should use the same mechanisms, IFF its not
> expensive. But I'm concerned that if we use completely different mechanisms
> we are likely to get wrong assumptions and break one at some
> point.
> 
> Hence my suggestion to move back to srcu and place the rcu_head in the hctx.

SRCU has been different enough compared with RCU, either implementation or API interface.

Then I'd still suggest to replace SRCU with percpu refcount. Then we can
have a simpler quiesce implementation.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, back to index

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-28 13:49 [RFC PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
2020-07-29 10:28 ` Ming Lei
2020-07-29 15:42   ` Sagi Grimberg
2020-07-29 15:49     ` Ming Lei
2020-07-29 22:37       ` Sagi Grimberg
2020-07-30 14:53         ` Ming Lei
2020-07-30 16:10           ` Sagi Grimberg
2020-07-30 18:18             ` Keith Busch
2020-07-30 18:23               ` Sagi Grimberg
2020-07-30 19:27                 ` Keith Busch
2020-07-30 19:53                   ` Jens Axboe
2020-07-30 21:03                     ` Sagi Grimberg
2020-07-31  0:33                       ` Ming Lei
2020-07-31  0:24               ` Ming Lei
2020-07-31  0:28             ` Ming Lei
2020-07-29 11:20 ` Johannes Thumshirn
2020-07-29 16:12 ` Keith Busch
2020-07-29 22:16   ` Ming Lei
2020-07-29 22:42     ` Sagi Grimberg
2020-07-30 15:05       ` Ming Lei

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git