* [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
@ 2020-08-20  3:02 Ming Lei
  2020-08-21  6:34 ` Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Ming Lei @ 2020-08-20  3:02 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Sagi Grimberg, Bart Van Assche,
	Johannes Thumshirn, Chao Leng, Christoph Hellwig

In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark the read critical
section during request dispatch, so request queue quiesce is built on top
of SRCU. What we want is to add as little cost as possible to the fast
path.

With percpu-ref, the implementation is cleaner and simpler, and it is
sufficient for implementing queue quiesce. The main requirement is to make
sure all read sections observe QUEUE_FLAG_QUIESCED once
blk_mq_quiesce_queue() returns.

It also becomes much easier to add an async queue quiesce interface.

Meanwhile, the memory footprint is reduced by using one per-request-queue
percpu-ref instead of one SRCU struct per hctx.

From an implementation viewpoint, percpu_ref is not slower than SRCU in
the fast path, and tree SRCU (the default option in most distributions)
could even be slower, since a memory barrier is required in both lock and
unlock, while rcu_read_lock()/rcu_read_unlock() should be much cheaper
than smp_mb():

1) percpu_ref just holds the rcu_read_lock, then runs a check and an
   increase/decrease on the percpu variable:

   rcu_read_lock()
   if (__ref_is_percpu(ref, &percpu_count))
	this_cpu_inc(*percpu_count);
   rcu_read_unlock()

2) srcu tree:
        idx = READ_ONCE(ssp->srcu_idx) & 0x1;
        this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
        smp_mb(); /* B */  /* Avoid leaking the critical section. */

Also, from my test on null_blk (blocking mode), percpu-ref does not
perform worse than SRCU; see the following test:

1) test steps:

rmmod null_blk > /dev/null 2>&1
modprobe null_blk nr_devices=1 submit_queues=1 blocking=1
fio --bs=4k --size=512G  --rw=randread --norandommap --direct=1 --ioengine=libaio \
	--iodepth=64 --runtime=60 --group_reporting=1  --name=nullb0 \
	--filename=/dev/nullb0 --numjobs=32

test machine: HP DL380, 16 cpu cores, 2 threads per core, dual
sockets/numa, Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz

2) test result:
- srcu quiesce: 6063K IOPS
- percpu-ref quiesce: 6113K IOPS

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Johannes Thumshirn <Johannes.Thumshirn@wdc.com>
Cc: Chao Leng <lengchao@huawei.com>
Cc: Christoph Hellwig <hch@lst.de>

---
V1:
	- remove SRCU related comment
	- remove RFC
	- not dispatch when the dispatch percpu ref becomes not live
	- add test result on commit log

 block/blk-mq-sysfs.c   |   2 -
 block/blk-mq.c         | 104 ++++++++++++++++++++---------------------
 block/blk-sysfs.c      |   6 ++-
 include/linux/blk-mq.h |   7 ---
 include/linux/blkdev.h |   4 ++
 5 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 062229395a50..799db7937105 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -38,8 +38,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 
 	cancel_delayed_work_sync(&hctx->run_work);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6294fa5c7ed9..e198bd565109 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,19 +220,13 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_quiesce_queue(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
 	blk_mq_quiesce_queue_nowait(q);
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-	if (rcu)
+	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
+		percpu_ref_kill(&q->dispatch_counter);
+		wait_event(q->mq_quiesce_wq,
+				percpu_ref_is_zero(&q->dispatch_counter));
+	} else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -248,6 +242,9 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
 {
 	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 
+	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
+		percpu_ref_resurrect(&q->dispatch_counter);
+
 	/* dispatch requests which are inserted during quiescing */
 	blk_mq_run_hw_queues(q, true);
 }
@@ -699,24 +696,21 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
-	__releases(hctx->srcu)
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
+		percpu_ref_put(&hctx->queue->dispatch_counter);
 }
 
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
-	__acquires(hctx->srcu)
+static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-		/* shut up gcc false positive */
-		*srcu_idx = 0;
 		rcu_read_lock();
+		return true;
 	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
+		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
 }
 
 /**
@@ -1495,8 +1489,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
  */
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	int srcu_idx;
-
 	/*
 	 * We should be running this queue from one of the CPUs that
 	 * are mapped to it.
@@ -1530,9 +1522,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
-	blk_mq_sched_dispatch_requests(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	if (hctx_lock(hctx)) {
+		blk_mq_sched_dispatch_requests(hctx);
+		hctx_unlock(hctx);
+	}
 }
 
 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -1644,7 +1637,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
  */
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	int srcu_idx;
 	bool need_run;
 
 	/*
@@ -1655,10 +1647,12 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
 	 * quiesced.
 	 */
-	hctx_lock(hctx, &srcu_idx);
+	if (!hctx_lock(hctx))
+		return;
+
 	need_run = !blk_queue_quiesced(hctx->queue) &&
 		blk_mq_hctx_has_pending(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 
 	if (need_run)
 		__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -1997,7 +1991,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	bool run_queue = true;
 
 	/*
-	 * RCU or SRCU read lock is needed before checking quiesced flag.
+	 * hctx_lock() is needed before checking quiesced flag.
 	 *
 	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
 	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
@@ -2045,11 +2039,13 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
 	blk_status_t ret;
-	int srcu_idx;
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-	hctx_lock(hctx, &srcu_idx);
+	if (!hctx_lock(hctx)) {
+		blk_mq_sched_insert_request(rq, false, false, false);
+		return;
+	}
 
 	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
@@ -2057,19 +2053,21 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	else if (ret != BLK_STS_OK)
 		blk_mq_end_request(rq, ret);
 
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 }
 
 blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
 	blk_status_t ret;
-	int srcu_idx;
 	blk_qc_t unused_cookie;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
-	hctx_lock(hctx, &srcu_idx);
+	if (!hctx_lock(hctx)) {
+		blk_mq_sched_insert_request(rq, false, false, false);
+		return BLK_STS_OK;
+	}
 	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
-	hctx_unlock(hctx, srcu_idx);
+	hctx_unlock(hctx);
 
 	return ret;
 }
@@ -2600,20 +2598,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -2651,7 +2635,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
 
@@ -2693,8 +2677,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3171,6 +3153,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	mutex_unlock(&q->sysfs_lock);
 }
 
+static void blk_mq_dispatch_counter_release(struct percpu_ref *ref)
+{
+	struct request_queue *q = container_of(ref, struct request_queue,
+				dispatch_counter);
+	wake_up_all(&q->mq_quiesce_wq);
+}
+
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q,
 						  bool elevator_init)
@@ -3187,6 +3176,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (blk_mq_alloc_ctxs(q))
 		goto err_poll;
 
+	if (set->flags & BLK_MQ_F_BLOCKING) {
+		init_waitqueue_head(&q->mq_quiesce_wq);
+		if (percpu_ref_init(&q->dispatch_counter,
+					blk_mq_dispatch_counter_release,
+					PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+			goto err_hctxs;
+	}
+
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
 
@@ -3195,7 +3192,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
-		goto err_hctxs;
+		goto err_dispatch_counter;
 
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
@@ -3229,6 +3226,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	return q;
 
+err_dispatch_counter:
+	if (set->flags & BLK_MQ_F_BLOCKING)
+		percpu_ref_exit(&q->dispatch_counter);
 err_hctxs:
 	kfree(q->queue_hw_ctx);
 	q->nr_hw_queues = 0;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7dda709f3ccb..56b6c045e30c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -941,9 +941,13 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_queue_free_zone_bitmaps(q);
 
-	if (queue_is_mq(q))
+	if (queue_is_mq(q)) {
 		blk_mq_release(q);
 
+		if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
+			percpu_ref_exit(&q->dispatch_counter);
+	}
+
 	blk_trace_shutdown(q);
 	mutex_lock(&q->debugfs_mutex);
 	debugfs_remove_recursive(q->debugfs_dir);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9d2d5ad367a4..ea3461298de5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,13 +169,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bb5636cc17b9..5fa8bc1bb7a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -572,6 +572,10 @@ struct request_queue {
 	struct list_head	tag_set_list;
 	struct bio_set		bio_split;
 
+	/* only used for BLK_MQ_F_BLOCKING */
+	struct percpu_ref	dispatch_counter;
+	wait_queue_head_t	mq_quiesce_wq;
+
 	struct dentry		*debugfs_dir;
 
 #ifdef CONFIG_BLK_DEBUG_FS
-- 
2.25.2



* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-20  3:02 [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
@ 2020-08-21  6:34 ` Christoph Hellwig
  2020-08-21 10:16   ` Ming Lei
  2020-08-21 15:05 ` Jens Axboe
  2020-08-21 20:14 ` Sagi Grimberg
  2 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2020-08-21  6:34 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Sagi Grimberg, Bart Van Assche,
	Johannes Thumshirn, Chao Leng, Christoph Hellwig

> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>  		rcu_read_unlock();
>  	else
> -		srcu_read_unlock(hctx->srcu, srcu_idx);
> +		percpu_ref_put(&hctx->queue->dispatch_counter);

While you're at it:  can we avoid the pointless inversion in the if
statement and just do:

 	if (hctx->flags & BLK_MQ_F_BLOCKING)
		percpu_ref_put(&hctx->queue->dispatch_counter);
	else
		rcu_read_unlock();

> +static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
>  		rcu_read_lock();
> +		return true;
>  	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
>  }

Same here.
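
i.e. the lock side could look something like this (just a sketch of the
suggested style, not code taken from the posted patch):

	static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
	{
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);

		rcu_read_lock();
		return true;
	}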

Otherwise this looks good to me, but did you do a deep audit that all
the new hctx_lock() failure cases don't cause problems?


* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-21  6:34 ` Christoph Hellwig
@ 2020-08-21 10:16   ` Ming Lei
  2020-08-21 14:46     ` Jens Axboe
  0 siblings, 1 reply; 15+ messages in thread
From: Ming Lei @ 2020-08-21 10:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Sagi Grimberg, Bart Van Assche,
	Johannes Thumshirn, Chao Leng

On Fri, Aug 21, 2020 at 08:34:48AM +0200, Christoph Hellwig wrote:
> > -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> > -	__releases(hctx->srcu)
> > +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
> >  {
> >  	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> >  		rcu_read_unlock();
> >  	else
> > -		srcu_read_unlock(hctx->srcu, srcu_idx);
> > +		percpu_ref_put(&hctx->queue->dispatch_counter);
> 
> While you're at it:  can we avoid the pointless inversion in the if
> statement and just do:
> 
>  	if (hctx->flags & BLK_MQ_F_BLOCKING)
> 		percpu_ref_put(&hctx->queue->dispatch_counter);
> 	else
> 		rcu_read_unlock();

OK, will do that, but strictly speaking they don't belong to this patch.

> 
> > +static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
> >  {
> >  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > -		/* shut up gcc false positive */
> > -		*srcu_idx = 0;
> >  		rcu_read_lock();
> > +		return true;
> >  	} else
> > -		*srcu_idx = srcu_read_lock(hctx->srcu);
> > +		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
> >  }
> 
> Same here.
> 
> Otherwise this looks good to me, but did you do a deep audit that all
> the new hctx_lock() failure cases don't cause problems?

This patch simply treats hctx_lock() failure as the queue being quiesced,
since queue quiesce is the only possible reason for the failure:

1) called for run hw queue:

- __blk_mq_run_hw_queue()
- blk_mq_run_hw_queue()

In both functions, blk_queue_quiesced() follows hctx_lock(), and we
return immediately if the queue is quiesced. An hctx_lock() failure is
treated as the queue being quiesced, so the handling is correct.

2) called in direct issue path:
- blk_mq_try_issue_directly()
- blk_mq_request_issue_directly()

The current handling is to add the request to the scheduler queue if the
queue is quiesced, so this patch adds the request to the scheduler queue
in case of hctx_lock() failure as well.
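
i.e. both issue paths end up with the following pattern on hctx_lock()
failure (snippet based on the patch above, shown here for clarity):

	if (!hctx_lock(hctx)) {
		/* queue is being quiesced: insert instead of issuing */
		blk_mq_sched_insert_request(rq, false, false, false);
		return;		/* BLK_STS_OK in blk_mq_request_issue_directly() */
	}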


Thanks,
Ming



* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-21 10:16   ` Ming Lei
@ 2020-08-21 14:46     ` Jens Axboe
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-21 14:46 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Sagi Grimberg, Bart Van Assche, Johannes Thumshirn, Chao Leng

On 8/21/20 4:16 AM, Ming Lei wrote:
> On Fri, Aug 21, 2020 at 08:34:48AM +0200, Christoph Hellwig wrote:
>>> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
>>> -	__releases(hctx->srcu)
>>> +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
>>>  {
>>>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>>>  		rcu_read_unlock();
>>>  	else
>>> -		srcu_read_unlock(hctx->srcu, srcu_idx);
>>> +		percpu_ref_put(&hctx->queue->dispatch_counter);
>>
>> While you're at it:  can we avoid the pointless inversion in the if
>> statement and just do:
>>
>>  	if (hctx->flags & BLK_MQ_F_BLOCKING)
>> 		percpu_ref_put(&hctx->queue->dispatch_counter);
>> 	else
>> 		rcu_read_unlock();
> 
> OK, will do that, but strictly speaking they don't belong to this patch.

Yeah let's please not mix up the two.

-- 
Jens Axboe



* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-20  3:02 [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
  2020-08-21  6:34 ` Christoph Hellwig
@ 2020-08-21 15:05 ` Jens Axboe
  2020-08-21 20:14 ` Sagi Grimberg
  2 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2020-08-21 15:05 UTC (permalink / raw)
  To: Ming Lei
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Sagi Grimberg, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig

On 8/19/20 9:02 PM, Ming Lei wrote:
> @@ -699,24 +696,21 @@ void blk_mq_complete_request(struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_mq_complete_request);
>  
> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>  		rcu_read_unlock();
>  	else
> -		srcu_read_unlock(hctx->srcu, srcu_idx);
> +		percpu_ref_put(&hctx->queue->dispatch_counter);
>  }
>  
> -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> -	__acquires(hctx->srcu)
> +static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
>  		rcu_read_lock();
> +		return true;
>  	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
>  }

I don't mind the !flags checking, since this is (by far) the hot path.
I would make it look like:

static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		rcu_read_lock();
		return true;
	}

	return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
}

to make that perfectly clear. You can do the same for the unlock side so
they at least look identical in terms of locking.
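
For example, the unlock side in the matching style would be something
like this (sketch only, not taken from the patch):

	static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
	{
		if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
			rcu_read_unlock();
			return;
		}

		percpu_ref_put(&hctx->queue->dispatch_counter);
	}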

Not too many comments on the rest, I think this is a nice cleanup too,
and getting rid of the srcu usage is a nice win on the BLOCKING side.

-- 
Jens Axboe



* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-20  3:02 [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
  2020-08-21  6:34 ` Christoph Hellwig
  2020-08-21 15:05 ` Jens Axboe
@ 2020-08-21 20:14 ` Sagi Grimberg
  2020-08-22 13:39   ` Ming Lei
  2 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2020-08-21 20:14 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, Lai Jiangshan, Paul E . McKenney, Josh Triplett,
	Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig


> In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> section during dispatching request, then request queue quiesce is based on
> SRCU. What we want to get is low cost added in fast path.
> 
> With percpu-ref, it is cleaner and simpler & enough for implementing queue
> quiesce. The main requirement is to make sure all read sections to observe
> QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> 
> Also it becomes much easier to add interface of async queue quiesce.
> 
> Meantime memory footprint can be reduced with per-request-queue percpu-ref.
> 
>  From implementation viewpoint, in fast path, not see percpu_ref is
> slower than SRCU, and srcu tree(default option in most distributions)
> could be slower since memory barrier is required in both lock & unlock,
> and rcu_read_lock()/rcu_read_unlock() should be much cheap than
> smp_mb().
> 
> 1) percpu_ref just hold the rcu_read_lock, then run a check &
>     increase/decrease on the percpu variable:
> 
>     rcu_read_lock()
>     if (__ref_is_percpu(ref, &percpu_count))
> 	this_cpu_inc(*percpu_count);
>     rcu_read_unlock()
> 
> 2) srcu tree:
>          idx = READ_ONCE(ssp->srcu_idx) & 0x1;
>          this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
>          smp_mb(); /* B */  /* Avoid leaking the critical section. */
> 
> Also from my test on null_blk(blocking), not observe percpu-ref performs
> worse than srcu, see the following test:
> 
> 1) test steps:
> 
> rmmod null_blk > /dev/null 2>&1
> modprobe null_blk nr_devices=1 submit_queues=1 blocking=1
> fio --bs=4k --size=512G  --rw=randread --norandommap --direct=1 --ioengine=libaio \
> 	--iodepth=64 --runtime=60 --group_reporting=1  --name=nullb0 \
> 	--filename=/dev/nullb0 --numjobs=32
> 
> test machine: HP DL380, 16 cpu cores, 2 threads per core, dual
> sockets/numa, Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz
> 
> 2) test result:
> - srcu quiesce: 6063K IOPS
> - percpu-ref quiesce: 6113K IOPS
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Josh Triplett <josh@joshtriplett.org>
> Cc: Sagi Grimberg <sagi@grimberg.me>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Johannes Thumshirn <Johannes.Thumshirn@wdc.com>
> Cc: Chao Leng <lengchao@huawei.com>
> Cc: Christoph Hellwig <hch@lst.de>
> 
> ---
> V1:
> 	- remove SRCU related comment
> 	- remove RFC
> 	- not dispatch when the dispatch percpu ref becomes not live
> 	- add test result on commit log
> 
>   block/blk-mq-sysfs.c   |   2 -
>   block/blk-mq.c         | 104 ++++++++++++++++++++---------------------
>   block/blk-sysfs.c      |   6 ++-
>   include/linux/blk-mq.h |   7 ---
>   include/linux/blkdev.h |   4 ++
>   5 files changed, 61 insertions(+), 62 deletions(-)
> 
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 062229395a50..799db7937105 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -38,8 +38,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
>   
>   	cancel_delayed_work_sync(&hctx->run_work);
>   
> -	if (hctx->flags & BLK_MQ_F_BLOCKING)
> -		cleanup_srcu_struct(hctx->srcu);
>   	blk_free_flush_queue(hctx->fq);
>   	sbitmap_free(&hctx->ctx_map);
>   	free_cpumask_var(hctx->cpumask);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 6294fa5c7ed9..e198bd565109 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -220,19 +220,13 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>    */
>   void blk_mq_quiesce_queue(struct request_queue *q)
>   {
> -	struct blk_mq_hw_ctx *hctx;
> -	unsigned int i;
> -	bool rcu = false;
> -
>   	blk_mq_quiesce_queue_nowait(q);
>   
> -	queue_for_each_hw_ctx(q, hctx, i) {
> -		if (hctx->flags & BLK_MQ_F_BLOCKING)
> -			synchronize_srcu(hctx->srcu);
> -		else
> -			rcu = true;
> -	}
> -	if (rcu)
> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
> +		percpu_ref_kill(&q->dispatch_counter);
> +		wait_event(q->mq_quiesce_wq,
> +				percpu_ref_is_zero(&q->dispatch_counter));

Looking at the q_usage_counter percpu ref, it's protected with
mq_freeze_lock and mq_freeze_depth; the fact that there is no such
protection here means this cannot nest safely, which scares me... We had
issues before in this area...
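
For example, with nothing serializing the callers, something like the
following could go wrong (illustrative sketch, not real driver code):

	blk_mq_quiesce_queue(q);	/* kills dispatch_counter */
	blk_mq_quiesce_queue(q);	/* nested kill warns in percpu_ref */
	blk_mq_unquiesce_queue(q);	/* resurrect breaks the outer quiesce */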

> +	} else
>   		synchronize_rcu();
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
> @@ -248,6 +242,9 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>   {
>   	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
>   
> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> +		percpu_ref_resurrect(&q->dispatch_counter);

Same comment here...

> +
>   	/* dispatch requests which are inserted during quiescing */
>   	blk_mq_run_hw_queues(q, true);
>   }
> @@ -699,24 +696,21 @@ void blk_mq_complete_request(struct request *rq)
>   }
>   EXPORT_SYMBOL(blk_mq_complete_request);
>   
> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
>   {
>   	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>   		rcu_read_unlock();
>   	else
> -		srcu_read_unlock(hctx->srcu, srcu_idx);
> +		percpu_ref_put(&hctx->queue->dispatch_counter);
>   }
>   
> -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> -	__acquires(hctx->srcu)
> +static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
>   {
>   	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> -		/* shut up gcc false positive */
> -		*srcu_idx = 0;
>   		rcu_read_lock();
> +		return true;
>   	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
>   }
>   
>   /**
> @@ -1495,8 +1489,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>    */
>   static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>   {
> -	int srcu_idx;
> -
>   	/*
>   	 * We should be running this queue from one of the CPUs that
>   	 * are mapped to it.
> @@ -1530,9 +1522,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>   
>   	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
>   
> -	hctx_lock(hctx, &srcu_idx);
> -	blk_mq_sched_dispatch_requests(hctx);
> -	hctx_unlock(hctx, srcu_idx);
> +	if (hctx_lock(hctx)) {
> +		blk_mq_sched_dispatch_requests(hctx);
> +		hctx_unlock(hctx);
> +	}

Maybe invert?
	if (!hctx_lock(hctx))
		return;
	blk_mq_sched_dispatch_requests(hctx);
	hctx_unlock(hctx);

I think we need a comment on why this is OK, both in the changelog
and in the code.

>   }
>   
>   static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
> @@ -1644,7 +1637,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
>    */
>   void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
>   {
> -	int srcu_idx;
>   	bool need_run;
>   
>   	/*
> @@ -1655,10 +1647,12 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
>   	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
>   	 * quiesced.
>   	 */
> -	hctx_lock(hctx, &srcu_idx);
> +	if (!hctx_lock(hctx))
> +		return;
> +
>   	need_run = !blk_queue_quiesced(hctx->queue) &&
>   		blk_mq_hctx_has_pending(hctx);
> -	hctx_unlock(hctx, srcu_idx);
> +	hctx_unlock(hctx);
>   
>   	if (need_run)
>   		__blk_mq_delay_run_hw_queue(hctx, async, 0);
> @@ -1997,7 +1991,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
>   	bool run_queue = true;
>   
>   	/*
> -	 * RCU or SRCU read lock is needed before checking quiesced flag.
> +	 * hctx_lock() is needed before checking quiesced flag.
>   	 *
>   	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
>   	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
> @@ -2045,11 +2039,13 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
>   		struct request *rq, blk_qc_t *cookie)
>   {
>   	blk_status_t ret;
> -	int srcu_idx;
>   
>   	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
>   
> -	hctx_lock(hctx, &srcu_idx);
> +	if (!hctx_lock(hctx)) {
> +		blk_mq_sched_insert_request(rq, false, false, false);
> +		return;
> +	}

I think we want the same flow for both modes. Maybe a preparation patch
that lifts the checks in __blk_mq_try_issue_directly to do this, and
then have the same flow with the hctx_lock failure (with a comment
explaining this).

>   
>   	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
>   	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
> @@ -2057,19 +2053,21 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
>   	else if (ret != BLK_STS_OK)
>   		blk_mq_end_request(rq, ret);
>   
> -	hctx_unlock(hctx, srcu_idx);
> +	hctx_unlock(hctx);
>   }
>   
>   blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
>   {
>   	blk_status_t ret;
> -	int srcu_idx;
>   	blk_qc_t unused_cookie;
>   	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>   
> -	hctx_lock(hctx, &srcu_idx);
> +	if (!hctx_lock(hctx)) {
> +		blk_mq_sched_insert_request(rq, false, false, false);
> +		return BLK_STS_OK;
> +	}

Same comment here.

>   	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
> -	hctx_unlock(hctx, srcu_idx);
> +	hctx_unlock(hctx);
>   
>   	return ret;
>   }
> @@ -2600,20 +2598,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
>   	}
>   }
>   
> -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
> -{
> -	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
> -
> -	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
> -			   __alignof__(struct blk_mq_hw_ctx)) !=
> -		     sizeof(struct blk_mq_hw_ctx));
> -
> -	if (tag_set->flags & BLK_MQ_F_BLOCKING)
> -		hw_ctx_size += sizeof(struct srcu_struct);
> -
> -	return hw_ctx_size;
> -}
> -
>   static int blk_mq_init_hctx(struct request_queue *q,
>   		struct blk_mq_tag_set *set,
>   		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
> @@ -2651,7 +2635,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
>   	struct blk_mq_hw_ctx *hctx;
>   	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
>   
> -	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
> +	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
>   	if (!hctx)
>   		goto fail_alloc_hctx;
>   
> @@ -2693,8 +2677,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
>   	if (!hctx->fq)
>   		goto free_bitmap;
>   
> -	if (hctx->flags & BLK_MQ_F_BLOCKING)
> -		init_srcu_struct(hctx->srcu);
>   	blk_mq_hctx_kobj_init(hctx);
>   
>   	return hctx;
> @@ -3171,6 +3153,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>   	mutex_unlock(&q->sysfs_lock);
>   }
>   
> +static void blk_mq_dispatch_counter_release(struct percpu_ref *ref)
> +{
> +	struct request_queue *q = container_of(ref, struct request_queue,
> +				dispatch_counter);
> +	wake_up_all(&q->mq_quiesce_wq);
> +}
> +
>   struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   						  struct request_queue *q,
>   						  bool elevator_init)
> @@ -3187,6 +3176,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   	if (blk_mq_alloc_ctxs(q))
>   		goto err_poll;
>   
> +	if (set->flags & BLK_MQ_F_BLOCKING) {
> +		init_waitqueue_head(&q->mq_quiesce_wq);
> +		if (percpu_ref_init(&q->dispatch_counter,
> +					blk_mq_dispatch_counter_release,
> +					PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
> +			goto err_hctxs;
> +	}
> +
>   	/* init q->mq_kobj and sw queues' kobjects */
>   	blk_mq_sysfs_init(q);
>   
> @@ -3195,7 +3192,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   
>   	blk_mq_realloc_hw_ctxs(set, q);
>   	if (!q->nr_hw_queues)
> -		goto err_hctxs;
> +		goto err_dispatch_counter;
>   
>   	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
>   	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
> @@ -3229,6 +3226,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   
>   	return q;
>   
> +err_dispatch_counter:
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		percpu_ref_exit(&q->dispatch_counter);
>   err_hctxs:
>   	kfree(q->queue_hw_ctx);
>   	q->nr_hw_queues = 0;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 7dda709f3ccb..56b6c045e30c 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -941,9 +941,13 @@ static void blk_release_queue(struct kobject *kobj)
>   
>   	blk_queue_free_zone_bitmaps(q);
>   
> -	if (queue_is_mq(q))
> +	if (queue_is_mq(q)) {
>   		blk_mq_release(q);
>   
> +		if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> +			percpu_ref_exit(&q->dispatch_counter);
> +	}
> +
>   	blk_trace_shutdown(q);
>   	mutex_lock(&q->debugfs_mutex);
>   	debugfs_remove_recursive(q->debugfs_dir);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 9d2d5ad367a4..ea3461298de5 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -169,13 +169,6 @@ struct blk_mq_hw_ctx {
>   	 * q->unused_hctx_list.
>   	 */
>   	struct list_head	hctx_list;
> -
> -	/**
> -	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
> -	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
> -	 * blk_mq_hw_ctx_size().
> -	 */
> -	struct srcu_struct	srcu[];
>   };
>   
>   /**
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index bb5636cc17b9..5fa8bc1bb7a8 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -572,6 +572,10 @@ struct request_queue {
>   	struct list_head	tag_set_list;
>   	struct bio_set		bio_split;
>   
> +	/* only used for BLK_MQ_F_BLOCKING */
> +	struct percpu_ref	dispatch_counter;

Can this be moved to be next to the q_usage_counter? they
will be taken together for blocking drivers...

Also maybe a better name is needed here since it's just
for blocking hctxs.

> +	wait_queue_head_t	mq_quiesce_wq;
> +
>   	struct dentry		*debugfs_dir;
>   
>   #ifdef CONFIG_BLK_DEBUG_FS
> 

What I think is needed here is, at a minimum, a test that loops
quiesce/unquiesce during I/O. Code auditing is not enough; there may be
driver assumptions broken by this change (although I hope there
shouldn't be).


* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-21 20:14 ` Sagi Grimberg
@ 2020-08-22 13:39   ` Ming Lei
  2020-08-24  8:19     ` Sagi Grimberg
  0 siblings, 1 reply; 15+ messages in thread
From: Ming Lei @ 2020-08-22 13:39 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig

On Fri, Aug 21, 2020 at 01:14:41PM -0700, Sagi Grimberg wrote:
> 
> > In case of BLK_MQ_F_BLOCKING, blk-mq uses SRCU to mark read critical
> > section during dispatching request, then request queue quiesce is based on
> > SRCU. What we want to get is low cost added in fast path.
> > 
> > With percpu-ref, it is cleaner and simpler & enough for implementing queue
> > quiesce. The main requirement is to make sure all read sections to observe
> > QUEUE_FLAG_QUIESCED once blk_mq_quiesce_queue() returns.
> > 
> > Also it becomes much easier to add interface of async queue quiesce.
> > 
> > Meantime memory footprint can be reduced with per-request-queue percpu-ref.
> > 
> >  From implementation viewpoint, in fast path, not see percpu_ref is
> > slower than SRCU, and srcu tree(default option in most distributions)
> > could be slower since memory barrier is required in both lock & unlock,
> > and rcu_read_lock()/rcu_read_unlock() should be much cheap than
> > smp_mb().
> > 
> > 1) percpu_ref just hold the rcu_read_lock, then run a check &
> >     increase/decrease on the percpu variable:
> > 
> >     rcu_read_lock()
> >     if (__ref_is_percpu(ref, &percpu_count))
> > 	this_cpu_inc(*percpu_count);
> >     rcu_read_unlock()
> > 
> > 2) srcu tree:
> >          idx = READ_ONCE(ssp->srcu_idx) & 0x1;
> >          this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
> >          smp_mb(); /* B */  /* Avoid leaking the critical section. */
> > 
> > Also from my test on null_blk(blocking), not observe percpu-ref performs
> > worse than srcu, see the following test:
> > 
> > 1) test steps:
> > 
> > rmmod null_blk > /dev/null 2>&1
> > modprobe null_blk nr_devices=1 submit_queues=1 blocking=1
> > fio --bs=4k --size=512G  --rw=randread --norandommap --direct=1 --ioengine=libaio \
> > 	--iodepth=64 --runtime=60 --group_reporting=1  --name=nullb0 \
> > 	--filename=/dev/nullb0 --numjobs=32
> > 
> > test machine: HP DL380, 16 cpu cores, 2 threads per core, dual
> > sockets/numa, Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz
> > 
> > 2) test result:
> > - srcu quiesce: 6063K IOPS
> > - percpu-ref quiesce: 6113K IOPS
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > Cc: Lai Jiangshan <jiangshanlai@gmail.com>
> > Cc: Paul E. McKenney <paulmck@kernel.org>
> > Cc: Josh Triplett <josh@joshtriplett.org>
> > Cc: Sagi Grimberg <sagi@grimberg.me>
> > Cc: Bart Van Assche <bvanassche@acm.org>
> > Cc: Johannes Thumshirn <Johannes.Thumshirn@wdc.com>
> > Cc: Chao Leng <lengchao@huawei.com>
> > Cc: Christoph Hellwig <hch@lst.de>
> > 
> > ---
> > V1:
> > 	- remove SRCU related comment
> > 	- remove RFC
> > 	- not dispatch when the dispatch percpu ref becomes not live
> > 	- add test result on commit log
> > 
> >   block/blk-mq-sysfs.c   |   2 -
> >   block/blk-mq.c         | 104 ++++++++++++++++++++---------------------
> >   block/blk-sysfs.c      |   6 ++-
> >   include/linux/blk-mq.h |   7 ---
> >   include/linux/blkdev.h |   4 ++
> >   5 files changed, 61 insertions(+), 62 deletions(-)
> > 
> > diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> > index 062229395a50..799db7937105 100644
> > --- a/block/blk-mq-sysfs.c
> > +++ b/block/blk-mq-sysfs.c
> > @@ -38,8 +38,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
> >   	cancel_delayed_work_sync(&hctx->run_work);
> > -	if (hctx->flags & BLK_MQ_F_BLOCKING)
> > -		cleanup_srcu_struct(hctx->srcu);
> >   	blk_free_flush_queue(hctx->fq);
> >   	sbitmap_free(&hctx->ctx_map);
> >   	free_cpumask_var(hctx->cpumask);
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 6294fa5c7ed9..e198bd565109 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -220,19 +220,13 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
> >    */
> >   void blk_mq_quiesce_queue(struct request_queue *q)
> >   {
> > -	struct blk_mq_hw_ctx *hctx;
> > -	unsigned int i;
> > -	bool rcu = false;
> > -
> >   	blk_mq_quiesce_queue_nowait(q);
> > -	queue_for_each_hw_ctx(q, hctx, i) {
> > -		if (hctx->flags & BLK_MQ_F_BLOCKING)
> > -			synchronize_srcu(hctx->srcu);
> > -		else
> > -			rcu = true;
> > -	}
> > -	if (rcu)
> > +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
> > +		percpu_ref_kill(&q->dispatch_counter);
> > +		wait_event(q->mq_quiesce_wq,
> > +				percpu_ref_is_zero(&q->dispatch_counter));
> 
> Looking at the q_usage_counter percpu, it's protected with the
> mq_freeze_lock and mq_freeze_depth, the fact that it's not protected
> here makes this non-nesting, which scares me... We had issues before
> in this area...

In theory, percpu_ref_kill() could be called in a nested way, since
percpu_ref_switch_lock covers it. However, the warning in
percpu_ref_kill_and_confirm() may be triggered.

So it looks like we just need one mutex to cover quiesce and unquiesce.
If the queue is already being quiesced, return immediately from
blk_mq_quiesce_queue().
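
Roughly like this, for example (only a sketch of the idea; the
'quiesce_mutex' member is a made-up name here):

	void blk_mq_quiesce_queue(struct request_queue *q)
	{
		blk_mq_quiesce_queue_nowait(q);

		if (!(q->tag_set->flags & BLK_MQ_F_BLOCKING)) {
			synchronize_rcu();
			return;
		}

		mutex_lock(&q->quiesce_mutex);	/* hypothetical new member */
		if (!percpu_ref_is_dying(&q->dispatch_counter))
			percpu_ref_kill(&q->dispatch_counter);
		mutex_unlock(&q->quiesce_mutex);

		wait_event(q->mq_quiesce_wq,
				percpu_ref_is_zero(&q->dispatch_counter));
	}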

> 
> > +	} else
> >   		synchronize_rcu();
> >   }
> >   EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
> > @@ -248,6 +242,9 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
> >   {
> >   	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
> > +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> > +		percpu_ref_resurrect(&q->dispatch_counter);
> 
> Same comment here...
> 
> > +
> >   	/* dispatch requests which are inserted during quiescing */
> >   	blk_mq_run_hw_queues(q, true);
> >   }
> > @@ -699,24 +696,21 @@ void blk_mq_complete_request(struct request *rq)
> >   }
> >   EXPORT_SYMBOL(blk_mq_complete_request);
> > -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> > -	__releases(hctx->srcu)
> > +static void hctx_unlock(struct blk_mq_hw_ctx *hctx)
> >   {
> >   	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> >   		rcu_read_unlock();
> >   	else
> > -		srcu_read_unlock(hctx->srcu, srcu_idx);
> > +		percpu_ref_put(&hctx->queue->dispatch_counter);
> >   }
> > -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> > -	__acquires(hctx->srcu)
> > +static inline bool hctx_lock(struct blk_mq_hw_ctx *hctx)
> >   {
> >   	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> > -		/* shut up gcc false positive */
> > -		*srcu_idx = 0;
> >   		rcu_read_lock();
> > +		return true;
> >   	} else
> > -		*srcu_idx = srcu_read_lock(hctx->srcu);
> > +		return percpu_ref_tryget_live(&hctx->queue->dispatch_counter);
> >   }
> >   /**
> > @@ -1495,8 +1489,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
> >    */
> >   static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> >   {
> > -	int srcu_idx;
> > -
> >   	/*
> >   	 * We should be running this queue from one of the CPUs that
> >   	 * are mapped to it.
> > @@ -1530,9 +1522,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> >   	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
> > -	hctx_lock(hctx, &srcu_idx);
> > -	blk_mq_sched_dispatch_requests(hctx);
> > -	hctx_unlock(hctx, srcu_idx);
> > +	if (hctx_lock(hctx)) {
> > +		blk_mq_sched_dispatch_requests(hctx);
> > +		hctx_unlock(hctx);
> > +	}
> 
> Maybe invert?
> 	if (!hctx_lock(hctx))
> 		return;
> 	blk_mq_sched_dispatch_requests(hctx);
> 	hctx_unlock(hctx);
> 
> I think we need a comment to why this is OK, both in the change log
> and in the code.

Fine, it is because an hctx_lock() failure can be treated as the queue
being quiesced.
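
Something like the following, for example (sketch of the comment to add
in __blk_mq_run_hw_queue()):

	/*
	 * hctx_lock() can only fail when the dispatch percpu ref has been
	 * killed, i.e. the queue is being quiesced, so it is safe to skip
	 * dispatching here; the queue will be rerun on unquiesce.
	 */
	if (!hctx_lock(hctx))
		return;
	blk_mq_sched_dispatch_requests(hctx);
	hctx_unlock(hctx);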

> 
> >   }
> >   static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
> > @@ -1644,7 +1637,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
> >    */
> >   void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> >   {
> > -	int srcu_idx;
> >   	bool need_run;
> >   	/*
> > @@ -1655,10 +1647,12 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> >   	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
> >   	 * quiesced.
> >   	 */
> > -	hctx_lock(hctx, &srcu_idx);
> > +	if (!hctx_lock(hctx))
> > +		return;
> > +
> >   	need_run = !blk_queue_quiesced(hctx->queue) &&
> >   		blk_mq_hctx_has_pending(hctx);
> > -	hctx_unlock(hctx, srcu_idx);
> > +	hctx_unlock(hctx);
> >   	if (need_run)
> >   		__blk_mq_delay_run_hw_queue(hctx, async, 0);
> > @@ -1997,7 +1991,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
> >   	bool run_queue = true;
> >   	/*
> > -	 * RCU or SRCU read lock is needed before checking quiesced flag.
> > +	 * hctx_lock() is needed before checking quiesced flag.
> >   	 *
> >   	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
> >   	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
> > @@ -2045,11 +2039,13 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
> >   		struct request *rq, blk_qc_t *cookie)
> >   {
> >   	blk_status_t ret;
> > -	int srcu_idx;
> >   	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
> > -	hctx_lock(hctx, &srcu_idx);
> > +	if (!hctx_lock(hctx)) {
> > +		blk_mq_sched_insert_request(rq, false, false, false);
> > +		return;
> > +	}
> 
> I think we want the same flow for both modes. Maybe a preparation patch
> that lifts the checks in __blk_mq_try_issue_directly to do this, and
> then have the same flow with the hctx_lock failure (with a comment
> explaining this).

Looks like a good idea.

> 
> >   	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
> >   	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
> > @@ -2057,19 +2053,21 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
> >   	else if (ret != BLK_STS_OK)
> >   		blk_mq_end_request(rq, ret);
> > -	hctx_unlock(hctx, srcu_idx);
> > +	hctx_unlock(hctx);
> >   }
> >   blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
> >   {
> >   	blk_status_t ret;
> > -	int srcu_idx;
> >   	blk_qc_t unused_cookie;
> >   	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> > -	hctx_lock(hctx, &srcu_idx);
> > +	if (!hctx_lock(hctx)) {
> > +		blk_mq_sched_insert_request(rq, false, false, false);
> > +		return BLK_STS_OK;
> > +	}
> 
> Same comment here.
> 
> >   	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
> > -	hctx_unlock(hctx, srcu_idx);
> > +	hctx_unlock(hctx);
> >   	return ret;
> >   }
> > @@ -2600,20 +2598,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
> >   	}
> >   }
> > -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
> > -{
> > -	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
> > -
> > -	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
> > -			   __alignof__(struct blk_mq_hw_ctx)) !=
> > -		     sizeof(struct blk_mq_hw_ctx));
> > -
> > -	if (tag_set->flags & BLK_MQ_F_BLOCKING)
> > -		hw_ctx_size += sizeof(struct srcu_struct);
> > -
> > -	return hw_ctx_size;
> > -}
> > -
> >   static int blk_mq_init_hctx(struct request_queue *q,
> >   		struct blk_mq_tag_set *set,
> >   		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
> > @@ -2651,7 +2635,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
> >   	struct blk_mq_hw_ctx *hctx;
> >   	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
> > -	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
> > +	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
> >   	if (!hctx)
> >   		goto fail_alloc_hctx;
> > @@ -2693,8 +2677,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
> >   	if (!hctx->fq)
> >   		goto free_bitmap;
> > -	if (hctx->flags & BLK_MQ_F_BLOCKING)
> > -		init_srcu_struct(hctx->srcu);
> >   	blk_mq_hctx_kobj_init(hctx);
> >   	return hctx;
> > @@ -3171,6 +3153,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
> >   	mutex_unlock(&q->sysfs_lock);
> >   }
> > +static void blk_mq_dispatch_counter_release(struct percpu_ref *ref)
> > +{
> > +	struct request_queue *q = container_of(ref, struct request_queue,
> > +				dispatch_counter);
> > +	wake_up_all(&q->mq_quiesce_wq);
> > +}
> > +
> >   struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> >   						  struct request_queue *q,
> >   						  bool elevator_init)
> > @@ -3187,6 +3176,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> >   	if (blk_mq_alloc_ctxs(q))
> >   		goto err_poll;
> > +	if (set->flags & BLK_MQ_F_BLOCKING) {
> > +		init_waitqueue_head(&q->mq_quiesce_wq);
> > +		if (percpu_ref_init(&q->dispatch_counter,
> > +					blk_mq_dispatch_counter_release,
> > +					PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
> > +			goto err_hctxs;
> > +	}
> > +
> >   	/* init q->mq_kobj and sw queues' kobjects */
> >   	blk_mq_sysfs_init(q);
> > @@ -3195,7 +3192,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> >   	blk_mq_realloc_hw_ctxs(set, q);
> >   	if (!q->nr_hw_queues)
> > -		goto err_hctxs;
> > +		goto err_dispatch_counter;
> >   	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
> >   	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
> > @@ -3229,6 +3226,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> >   	return q;
> > +err_dispatch_counter:
> > +	if (set->flags & BLK_MQ_F_BLOCKING)
> > +		percpu_ref_exit(&q->dispatch_counter);
> >   err_hctxs:
> >   	kfree(q->queue_hw_ctx);
> >   	q->nr_hw_queues = 0;
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index 7dda709f3ccb..56b6c045e30c 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -941,9 +941,13 @@ static void blk_release_queue(struct kobject *kobj)
> >   	blk_queue_free_zone_bitmaps(q);
> > -	if (queue_is_mq(q))
> > +	if (queue_is_mq(q)) {
> >   		blk_mq_release(q);
> > +		if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> > +			percpu_ref_exit(&q->dispatch_counter);
> > +	}
> > +
> >   	blk_trace_shutdown(q);
> >   	mutex_lock(&q->debugfs_mutex);
> >   	debugfs_remove_recursive(q->debugfs_dir);
> > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> > index 9d2d5ad367a4..ea3461298de5 100644
> > --- a/include/linux/blk-mq.h
> > +++ b/include/linux/blk-mq.h
> > @@ -169,13 +169,6 @@ struct blk_mq_hw_ctx {
> >   	 * q->unused_hctx_list.
> >   	 */
> >   	struct list_head	hctx_list;
> > -
> > -	/**
> > -	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
> > -	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
> > -	 * blk_mq_hw_ctx_size().
> > -	 */
> > -	struct srcu_struct	srcu[];
> >   };
> >   /**
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index bb5636cc17b9..5fa8bc1bb7a8 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -572,6 +572,10 @@ struct request_queue {
> >   	struct list_head	tag_set_list;
> >   	struct bio_set		bio_split;
> > +	/* only used for BLK_MQ_F_BLOCKING */
> > +	struct percpu_ref	dispatch_counter;
> 
> Can this be moved to be next to the q_usage_counter? they
> will be taken together for blocking drivers...

I don't think it is a good idea; at least hctx->srcu is put at the end
of hctx. Do you want to move it to the beginning of hctx?

.q_usage_counter should have been put in the 1st cacheline of the
request queue. Even if it is moved to the 1st cacheline, we shouldn't
put 'dispatch_counter' there, because it may hurt other non-blocking
drivers.

> 
> Also maybe a better name is needed here since it's just
> for blocking hctxs.
> 
> > +	wait_queue_head_t	mq_quiesce_wq;
> > +
> >   	struct dentry		*debugfs_dir;
> >   #ifdef CONFIG_BLK_DEBUG_FS
> > 
> 
> What I think is needed here is at a minimum test quiesce/unquiesce loops
> during I/O. code auditing is not enough, there may be driver assumptions
> broken with this change (although I hope there shouldn't be).

We have elevator switch and nr_requests update stress tests, both of
which rely on quiesce/unquiesce, and I did run those tests with this
patch.


thanks, 
Ming



* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-22 13:39   ` Ming Lei
@ 2020-08-24  8:19     ` Sagi Grimberg
  2020-08-24 10:40       ` Ming Lei
  0 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2020-08-24  8:19 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig


>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index bb5636cc17b9..5fa8bc1bb7a8 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -572,6 +572,10 @@ struct request_queue {
>>>    	struct list_head	tag_set_list;
>>>    	struct bio_set		bio_split;
>>> +	/* only used for BLK_MQ_F_BLOCKING */
>>> +	struct percpu_ref	dispatch_counter;
>>
>> Can this be moved to be next to the q_usage_counter? they
>> will be taken together for blocking drivers...
> 
> I don't think it is a good idea, at least hctx->srcu is put at the end
> of hctx, do you want to move it at beginning of hctx?

I'd think it'd be an improvement, yes.

> .q_usage_counter should have been put in the 1st cacheline of
> request queue. If it is moved to the 1st cacheline of request queue,
> we shouldn't put 'dispatch_counter' there, because it may hurt other
> non-blocking drivers.

q_usage_counter is currently there, the two will always be taken
together, and there are several other members we can remove from that
cacheline without hurting performance at all.

And when q_usage_counter is moved to the first cacheline, I'd expect
the dispatch_counter to also move to the front (maybe not the first
cacheline if that comes at the expense of other hot members, but it
should definitely be treated as a hot member).

Anyways, I think that for now we should place them together.

>> Also maybe a better name is needed here since it's just
>> for blocking hctxs.
>>
>>> +	wait_queue_head_t	mq_quiesce_wq;
>>> +
>>>    	struct dentry		*debugfs_dir;
>>>    #ifdef CONFIG_BLK_DEBUG_FS
>>>
>>
>> What I think is needed here is at a minimum test quiesce/unquiesce loops
>> during I/O. code auditing is not enough, there may be driver assumptions
>> broken with this change (although I hope there shouldn't be).
> 
> We have elevator switch / updating nr_request stress test, and both relies
> on quiesce/unquiesce, and I did run such test with this patch.

Do you have a blktests test for this? If not, I strongly suggest one is
added to validate the change going forward as well.


* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-24  8:19     ` Sagi Grimberg
@ 2020-08-24 10:40       ` Ming Lei
  2020-08-24 21:34         ` Sagi Grimberg
  0 siblings, 1 reply; 15+ messages in thread
From: Ming Lei @ 2020-08-24 10:40 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig

On Mon, Aug 24, 2020 at 01:19:56AM -0700, Sagi Grimberg wrote:
> 
> > > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > > > index bb5636cc17b9..5fa8bc1bb7a8 100644
> > > > --- a/include/linux/blkdev.h
> > > > +++ b/include/linux/blkdev.h
> > > > @@ -572,6 +572,10 @@ struct request_queue {
> > > >    	struct list_head	tag_set_list;
> > > >    	struct bio_set		bio_split;
> > > > +	/* only used for BLK_MQ_F_BLOCKING */
> > > > +	struct percpu_ref	dispatch_counter;
> > > 
> > > Can this be moved to be next to the q_usage_counter? they
> > > will be taken together for blocking drivers...
> > 
> > I don't think it is a good idea, at least hctx->srcu is put at the end
> > of hctx, do you want to move it at beginning of hctx?
> 
> I'd think it'd be an improvement, yes.

Please see the reason why it is put at the back of hctx in commit
073196787727 ("blk-mq: Reduce blk_mq_hw_ctx size").

> 
> > .q_usage_counter should have been put in the 1st cacheline of
> > request queue. If it is moved to the 1st cacheline of request queue,
> > we shouldn't put 'dispatch_counter' there, because it may hurt other
> > non-blocking drivers.
> 
> q_usage_counter currently there, and the two will always be taken
> together, and there are several other stuff that we can remove from
> that cacheline without hurting performance for anything.
> 
> And when q_usage_counter is moved to the first cacheline, then I'd
> expect that the dispatch_counter also moves to the front (maybe not
> the first if it is on the expense of other hot members, but definitely
> it should be treated as a hot member).
> 
> Anyways, I think that for now we should place them together.

Then it may hurt the non-blocking case.

Each hctx has only one run-work; if the hctx is blocked, no other request
can be queued to the hctx any more. That is basically a synchronous run
queue, so I am not sure good enough performance can be expected for
blocking drivers anyway.

So it may not be worth putting the added .dispatch_counter together
with .q_usage_counter.

> 
> > > Also maybe a better name is needed here since it's just
> > > for blocking hctxs.
> > > 
> > > > +	wait_queue_head_t	mq_quiesce_wq;
> > > > +
> > > >    	struct dentry		*debugfs_dir;
> > > >    #ifdef CONFIG_BLK_DEBUG_FS
> > > > 
> > > 
> > > What I think is needed here is at a minimum test quiesce/unquiesce loops
> > > during I/O. code auditing is not enough, there may be driver assumptions
> > > broken with this change (although I hope there shouldn't be).
> > 
> > We have elevator switch / updating nr_request stress test, and both relies
> > on quiesce/unquiesce, and I did run such test with this patch.
> 
> You have a blktest for this? If not, I strongly suggest that one is
> added to validate the change also moving forward.

There are lots of blktest tests doing that, such as block/005,
block/016, block/021, ...


Thanks, 
Ming


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-24 10:40       ` Ming Lei
@ 2020-08-24 21:34         ` Sagi Grimberg
  2020-08-25  2:32           ` Ming Lei
  0 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2020-08-24 21:34 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig


>> I'd think it'd be an improvement, yes.
> 
> Please see the reason why it is put back of hctx in
> 073196787727("blk-mq: Reduce blk_mq_hw_ctx size").

I know why it is there; I was just saying that having an additional
pointer is fine. But the discussion is moot.

>>> .q_usage_counter should have been put in the 1st cacheline of
>>> request queue. If it is moved to the 1st cacheline of request queue,
>>> we shouldn't put 'dispatch_counter' there, because it may hurt other
>>> non-blocking drivers.
>>
>> q_usage_counter currently there, and the two will always be taken
>> together, and there are several other stuff that we can remove from
>> that cacheline without hurting performance for anything.
>>
>> And when q_usage_counter is moved to the first cacheline, then I'd
>> expect that the dispatch_counter also moves to the front (maybe not
>> the first if it is on the expense of other hot members, but definitely
>> it should be treated as a hot member).
>>
>> Anyways, I think that for now we should place them together.
> 
> Then it may hurt non-blocking.
> 
> Each hctx has only one run-work, if the hctx is blocked, no other request
> may be queued to hctx any more. That is basically sync run queue, so I
> am not sure good enough perf can be expected on blocking.

I don't think you should assume that a blocking driver will normally
block; it will block only rarely (very rarely).

> So it may not be worth of putting the added .dispatch_counter together
> with .q_usage_counter.

I happen to think it would. Not sure why you resist so much given how
request_queue is arranged currently.

>>>> Also maybe a better name is needed here since it's just
>>>> for blocking hctxs.
>>>>
>>>>> +	wait_queue_head_t	mq_quiesce_wq;
>>>>> +
>>>>>     	struct dentry		*debugfs_dir;
>>>>>     #ifdef CONFIG_BLK_DEBUG_FS
>>>>>
>>>>
>>>> What I think is needed here is at a minimum test quiesce/unquiesce loops
>>>> during I/O. code auditing is not enough, there may be driver assumptions
>>>> broken with this change (although I hope there shouldn't be).
>>>
>>> We have elevator switch / updating nr_request stress test, and both relies
>>> on quiesce/unquiesce, and I did run such test with this patch.
>>
>> You have a blktest for this? If not, I strongly suggest that one is
>> added to validate the change also moving forward.
> 
> There are lots of blktest tests doing that, such as block/005,
> block/016, block/021, ...

Good, but I also wouldn't want to take this without making sure the async
quiesce works well on a large number of namespaces (the reason why this
was proposed in the first place). Not sure who is planning to do that...

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-24 21:34         ` Sagi Grimberg
@ 2020-08-25  2:32           ` Ming Lei
  2020-08-25  5:24             ` Sagi Grimberg
  0 siblings, 1 reply; 15+ messages in thread
From: Ming Lei @ 2020-08-25  2:32 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig

On Mon, Aug 24, 2020 at 02:34:04PM -0700, Sagi Grimberg wrote:
> 
> > > I'd think it'd be an improvement, yes.
> > 
> > Please see the reason why it is put back of hctx in
> > 073196787727("blk-mq: Reduce blk_mq_hw_ctx size").
> 
> I know why it is there, just was saying that having an additional
> pointer is fine. But the discussion is moot.
> 
> > > > .q_usage_counter should have been put in the 1st cacheline of
> > > > request queue. If it is moved to the 1st cacheline of request queue,
> > > > we shouldn't put 'dispatch_counter' there, because it may hurt other
> > > > non-blocking drivers.
> > > 
> > > q_usage_counter currently there, and the two will always be taken
> > > together, and there are several other stuff that we can remove from
> > > that cacheline without hurting performance for anything.
> > > 
> > > And when q_usage_counter is moved to the first cacheline, then I'd
> > > expect that the dispatch_counter also moves to the front (maybe not
> > > the first if it is on the expense of other hot members, but definitely
> > > it should be treated as a hot member).
> > > 
> > > Anyways, I think that for now we should place them together.
> > 
> > Then it may hurt non-blocking.
> > 
> > Each hctx has only one run-work, if the hctx is blocked, no other request
> > may be queued to hctx any more. That is basically sync run queue, so I
> > am not sure good enough perf can be expected on blocking.
> 
> I don't think that you should assume that a blocking driver will block
> normally, it will only rarely block (very rarely).

If nvme-tcp only blocks rarely, I am just wondering why not switch to
non-blocking, which can be done simply with one driver-specific wq work?
Then nvme-tcp can be aligned with the other nvme drivers.
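
Something along these lines is what I mean; it is only a rough sketch
with made-up names (foo_queue, send_work, etc. are hypothetical, not the
actual nvme-tcp code):

#include <linux/blk-mq.h>
#include <linux/workqueue.h>

/* hypothetical per-queue context; nvme-tcp's real structures differ */
struct foo_queue {
	spinlock_t		send_lock;
	struct list_head	send_list;
	struct work_struct	send_work;	/* does the blocking socket send */
	struct workqueue_struct	*wq;
};

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct foo_queue *queue = hctx->driver_data;
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	/* no sleeping here: just hand the request to the driver */
	spin_lock(&queue->send_lock);
	list_add_tail(&rq->queuelist, &queue->send_list);
	spin_unlock(&queue->send_lock);

	/* the work item performs the (possibly sleeping) network send */
	queue_work(queue->wq, &queue->send_work);

	return BLK_STS_OK;
}

With such a scheme .queue_rq itself never sleeps, so BLK_MQ_F_BLOCKING
would not be needed.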

> 
> > So it may not be worth of putting the added .dispatch_counter together
> > with .q_usage_counter.
> 
> I happen to think it would. Not sure why you resist so much given how
> request_queue is arranged currently.

The reason is the same as in 073196787727 ("blk-mq: Reduce blk_mq_hw_ctx size").

Non-blocking is the preferred style for a blk-mq driver, so we can just
focus on non-blocking with respect to performance improvements; as I
mentioned, blocking has the big problem of a synchronous run queue.

Improving both may be contradictory: for example, if the added
.dispatch_counter is put together with .q_usage_counter, it will be
fetched into L1 unnecessarily, which is definitely not good for
non-blocking drivers.
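
To make the cacheline concern concrete, the placement I have in mind is
roughly the following (an illustrative struct only, not the actual
request_queue layout):

#include <linux/percpu-refcount.h>
#include <linux/wait.h>
#include <linux/cache.h>

struct foo_request_queue {
	/* hot: taken for every request by every driver */
	struct percpu_ref	q_usage_counter;
	unsigned long		queue_flags;

	/*
	 * Cold for non-blocking drivers: the alignment keeps these members
	 * off the cacheline holding q_usage_counter, so non-blocking
	 * drivers never pull them into L1.
	 */
	struct percpu_ref	dispatch_counter ____cacheline_aligned_in_smp;
	wait_queue_head_t	mq_quiesce_wq;
};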

> 
> > > > > Also maybe a better name is needed here since it's just
> > > > > for blocking hctxs.
> > > > > 
> > > > > > +	wait_queue_head_t	mq_quiesce_wq;
> > > > > > +
> > > > > >     	struct dentry		*debugfs_dir;
> > > > > >     #ifdef CONFIG_BLK_DEBUG_FS
> > > > > > 
> > > > > 
> > > > > What I think is needed here is at a minimum test quiesce/unquiesce loops
> > > > > during I/O. code auditing is not enough, there may be driver assumptions
> > > > > broken with this change (although I hope there shouldn't be).
> > > > 
> > > > We have elevator switch / updating nr_request stress test, and both relies
> > > > on quiesce/unquiesce, and I did run such test with this patch.
> > > 
> > > You have a blktest for this? If not, I strongly suggest that one is
> > > added to validate the change also moving forward.
> > 
> > There are lots of blktest tests doing that, such as block/005,
> > block/016, block/021, ...
> 
> Good, but I'd also won't want to get this without making sure the async
> quiesce works well on large number of namespaces (the reason why this
> is proposed in the first place). Not sure who is planning to do that...

That can be added when async quiesce is done.



thanks,
Ming


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-25  2:32           ` Ming Lei
@ 2020-08-25  5:24             ` Sagi Grimberg
  2020-08-25  9:41               ` Chao Leng
  0 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2020-08-25  5:24 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn, Chao Leng,
	Christoph Hellwig


>>>> Anyways, I think that for now we should place them together.
>>>
>>> Then it may hurt non-blocking.
>>>
>>> Each hctx has only one run-work, if the hctx is blocked, no other request
>>> may be queued to hctx any more. That is basically sync run queue, so I
>>> am not sure good enough perf can be expected on blocking.
>>
>> I don't think that you should assume that a blocking driver will block
>> normally, it will only rarely block (very rarely).
> 
> If nvme-tcp only blocks rarely, just wondering why not switch to non-blocking
> which can be done simply with one driver specific wq work? Then nvme-tcp
> can be aligned with other nvme drivers.

It used to be this way (and still is that way today in some cases), but
some recent latency optimizations revealed that sending the request to
the wire from queue_rq (when some conditions are met), instead of
incurring a context switch, is a win in most cases where latency matters.

Once we call sendpage from queue_rq, we might_sleep, hence we must be
blocking. But in practice, sendpage with MSG_DONTWAIT will rarely
actually sleep.
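
To illustrate the pattern, roughly (a simplified, hypothetical helper,
not the literal nvme-tcp send path):

#include <linux/net.h>
#include <linux/socket.h>

/*
 * Try to push a page onto the socket inline from .queue_rq.  The path
 * may sleep (e.g. in lock_sock()), which is why the driver must set
 * BLK_MQ_F_BLOCKING, but with MSG_DONTWAIT kernel_sendpage() returns
 * -EAGAIN instead of waiting for socket buffer space, so in practice
 * it very rarely sleeps.
 */
static int foo_try_send_inline(struct socket *sock, struct page *page,
			       int offset, size_t len)
{
	int ret;

	ret = kernel_sendpage(sock, page, offset, len,
			      MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST);
	if (ret == -EAGAIN)
		return 0;	/* socket buffer full: defer to io_work */
	return ret;
}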

>>> So it may not be worth of putting the added .dispatch_counter together
>>> with .q_usage_counter.
>>
>> I happen to think it would. Not sure why you resist so much given how
>> request_queue is arranged currently.
> 
> The reason is same with 073196787727("blk-mq: Reduce blk_mq_hw_ctx size").

percpu_ref is probably a quarter of the size of srcu; I am not sure
anyone would have bothered to do that for percpu_ref. You're really
exaggerating, I think...

> non-blocking is the preferred style for blk-mq driver, so we can just
> focus on non-blocking wrt. performance improvement as I mentioned blocking
> has big problem of sync run queue.
> 
> It may be contradictory for improving both, for example, if the
> added .dispatch_counter is put with .q_usage_cunter together, it will
> be fetched to L1 unnecessarily which is definitely not good for
> non-blocking.

I'll stop asking you for this, but your resistance is really unclear to
me. We can measure what the penalty/gain is later by realigning some
items.

Let's stop wasting our time here...

>>>>>> Also maybe a better name is needed here since it's just
>>>>>> for blocking hctxs.
>>>>>>
>>>>>>> +	wait_queue_head_t	mq_quiesce_wq;
>>>>>>> +
>>>>>>>      	struct dentry		*debugfs_dir;
>>>>>>>      #ifdef CONFIG_BLK_DEBUG_FS
>>>>>>>
>>>>>>
>>>>>> What I think is needed here is at a minimum test quiesce/unquiesce loops
>>>>>> during I/O. code auditing is not enough, there may be driver assumptions
>>>>>> broken with this change (although I hope there shouldn't be).
>>>>>
>>>>> We have elevator switch / updating nr_request stress test, and both relies
>>>>> on quiesce/unquiesce, and I did run such test with this patch.
>>>>
>>>> You have a blktest for this? If not, I strongly suggest that one is
>>>> added to validate the change also moving forward.
>>>
>>> There are lots of blktest tests doing that, such as block/005,
>>> block/016, block/021, ...
>>
>> Good, but I'd also won't want to get this without making sure the async
>> quiesce works well on large number of namespaces (the reason why this
>> is proposed in the first place). Not sure who is planning to do that...
> 
> That can be added when async quiesce is done.

Chao, are you looking into that? I'd really hate to find out we have an
issue there post conversion...

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-25  5:24             ` Sagi Grimberg
@ 2020-08-25  9:41               ` Chao Leng
  2020-08-25 17:38                 ` Sagi Grimberg
  0 siblings, 1 reply; 15+ messages in thread
From: Chao Leng @ 2020-08-25  9:41 UTC (permalink / raw)
  To: Sagi Grimberg, Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn,
	Christoph Hellwig



On 2020/8/25 13:24, Sagi Grimberg wrote:
> 
>>>>> Anyways, I think that for now we should place them together.
>>>>
>>>> Then it may hurt non-blocking.
>>>>
>>>> Each hctx has only one run-work, if the hctx is blocked, no other request
>>>> may be queued to hctx any more. That is basically sync run queue, so I
>>>> am not sure good enough perf can be expected on blocking.
>>>
>>> I don't think that you should assume that a blocking driver will block
>>> normally, it will only rarely block (very rarely).
>>
>> If nvme-tcp only blocks rarely, just wondering why not switch to non-blocking
>> which can be done simply with one driver specific wq work? Then nvme-tcp
>> can be aligned with other nvme drivers.
> 
> It used to be this way (and also is that way today in some cases), but
> some latency recent optimizations revealed that sending the request to
> the wire from queue_rq (when some conditions are met) instead of
> incurring a context switch is a win in most cases where latency matters.
> 
> Once we call sendpage from queue_rq, we might_sleep, hence we must be
> blocking. But in practice, sendpage with MSG_DONTWAIT will rarely
> actually sleep.
> 
>>>> So it may not be worth of putting the added .dispatch_counter together
>>>> with .q_usage_counter.
>>>
>>> I happen to think it would. Not sure why you resist so much given how
>>> request_queue is arranged currently.
>>
>> The reason is same with 073196787727("blk-mq: Reduce blk_mq_hw_ctx size").
> 
> percpu_ref probably is a quarter of the size of srcu, not sure anyone
> would have bothered to do that for percpu_ref. You're really
> exaggerating I think...
> 
>> non-blocking is the preferred style for blk-mq driver, so we can just
>> focus on non-blocking wrt. performance improvement as I mentioned blocking
>> has big problem of sync run queue.
>>
>> It may be contradictory for improving both, for example, if the
>> added .dispatch_counter is put with .q_usage_cunter together, it will
>> be fetched to L1 unnecessarily which is definitely not good for
>> non-blocking.
> 
> I'll cease asking you for this, but your resistance is really unclear to me. We can measure what is the penalty/gain later by realigning some
> items.
> 
> Let's stop wasting our time here...
> 
>>>>>>> Also maybe a better name is needed here since it's just
>>>>>>> for blocking hctxs.
>>>>>>>
>>>>>>>> +    wait_queue_head_t    mq_quiesce_wq;
>>>>>>>> +
>>>>>>>>          struct dentry        *debugfs_dir;
>>>>>>>>      #ifdef CONFIG_BLK_DEBUG_FS
>>>>>>>>
>>>>>>>
>>>>>>> What I think is needed here is at a minimum test quiesce/unquiesce loops
>>>>>>> during I/O. code auditing is not enough, there may be driver assumptions
>>>>>>> broken with this change (although I hope there shouldn't be).
>>>>>>
>>>>>> We have elevator switch / updating nr_request stress test, and both relies
>>>>>> on quiesce/unquiesce, and I did run such test with this patch.
>>>>>
>>>>> You have a blktest for this? If not, I strongly suggest that one is
>>>>> added to validate the change also moving forward.
>>>>
>>>> There are lots of blktest tests doing that, such as block/005,
>>>> block/016, block/021, ...
>>>
>>> Good, but I'd also won't want to get this without making sure the async
>>> quiesce works well on large number of namespaces (the reason why this
>>> is proposed in the first place). Not sure who is planning to do that...
>>
>> That can be added when async quiesce is done.
> 
> Chao, are you looking into that? I'd really hate to find out we have an
> issue there post conversion...

We currently configure CONFIG_TREE_SRCU, and the size of TREE_SRCU is
too big. I really appreciate Ming's work.

I reviewed the patch. I think the patch may work well now, but it does
extra work for the exception scenario. percpu_ref is not designed to be
a serialization mechanism with a low-cost read side. If we replace SRCU
with percpu_ref, the benefit is saving memory for blocking queues; the
price is limiting future changes or doing more extra work.
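
For example, the quiesce side changes from a single synchronize_srcu()
to something roughly like this (a hedged sketch of the mechanism, not
the literal patch; it assumes the ref's release callback wakes
mq_quiesce_wq):

static void foo_quiesce_queue(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	/* stop new dispatches and wait for in-flight ones to drain */
	percpu_ref_kill(&q->dispatch_counter);
	wait_event(q->mq_quiesce_wq,
		   percpu_ref_is_zero(&q->dispatch_counter));
}

static void foo_unquiesce_queue(struct request_queue *q)
{
	/* the ref must be resurrected before dispatch can resume */
	percpu_ref_resurrect(&q->dispatch_counter);
	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
}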

I do not think replacing SRCU with percpu_ref is a good idea, because
it's hard to predict how much we'll lose.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-25  9:41               ` Chao Leng
@ 2020-08-25 17:38                 ` Sagi Grimberg
  2020-08-26  7:25                   ` Chao Leng
  0 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2020-08-25 17:38 UTC (permalink / raw)
  To: Chao Leng, Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn,
	Christoph Hellwig


>>>> Good, but I'd also won't want to get this without making sure the async
>>>> quiesce works well on large number of namespaces (the reason why this
>>>> is proposed in the first place). Not sure who is planning to do that...
>>>
>>> That can be added when async quiesce is done.
>>
>> Chao, are you looking into that? I'd really hate to find out we have an
>> issue there post conversion...
> 
> Now we config CONFIG_TREE_SRCU, the size of TREE_SRCU is too big. I
> really appreciate the work of Ming.
> 
> I review the patch, I think the patch may work well now, but do extra
> works for exception scenario. Percpu_ref is not disigned for
> serialization which read low cost. If we replace SRCU with percpu_ref,
> the benefit is save memory for blocking queue, the price is limit future
> changes or do more extra works.
> 
> I do not think replace SRCU with percpu_ref is a good idea, because it's
> hard to predict how much we'll lose.

Not sure I understand your point; can you clarify what the poor design
of percpu_ref is, and for which use case?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING
  2020-08-25 17:38                 ` Sagi Grimberg
@ 2020-08-26  7:25                   ` Chao Leng
  0 siblings, 0 replies; 15+ messages in thread
From: Chao Leng @ 2020-08-26  7:25 UTC (permalink / raw)
  To: Sagi Grimberg, Ming Lei
  Cc: Jens Axboe, linux-block, Lai Jiangshan, Paul E . McKenney,
	Josh Triplett, Bart Van Assche, Johannes Thumshirn,
	Christoph Hellwig



On 2020/8/26 1:38, Sagi Grimberg wrote:
> 
>>>>> Good, but I'd also won't want to get this without making sure the async
>>>>> quiesce works well on large number of namespaces (the reason why this
>>>>> is proposed in the first place). Not sure who is planning to do that...
>>>>
>>>> That can be added when async quiesce is done.
>>>
>>> Chao, are you looking into that? I'd really hate to find out we have an
>>> issue there post conversion...
>>
>> Now we config CONFIG_TREE_SRCU, the size of TREE_SRCU is too big. I
>> really appreciate the work of Ming.
>>
>> I review the patch, I think the patch may work well now, but do extra
>> works for exception scenario. Percpu_ref is not disigned for
>> serialization which read low cost. If we replace SRCU with percpu_ref,
>> the benefit is save memory for blocking queue, the price is limit future
>> changes or do more extra works.
>>
>> I do not think replace SRCU with percpu_ref is a good idea, because it's
>> hard to predict how much we'll lose.
> 
> Not sure I understand your point, can you clarify what is the poor
> design of percpu_ref and for which use-case?
> .
1. percpu_ref needs to introduce a failure status for hctx_lock to avoid
possible long waits for synchronization, and extra work is needed when
hctx_lock fails. Just like this hunk in the patch:
@@ -2057,11 +2051,14 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
  		struct request *rq, blk_qc_t *cookie)
  {
  	blk_status_t ret;
-	int srcu_idx;

  	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

-	hctx_lock(hctx, &srcu_idx);
+	/* Insert request to queue in case of being quiesced */
+	if (!hctx_lock(hctx)) {
+		blk_mq_sched_insert_request(rq, false, false, false);
+		return;
+	}

Right now it is simple and the code can work well. If the logic gets
complicated, it will probably be hard to handle.
For example, if for some unknown reason we introduce some mechanism (such
as checking another flag or state) for dispatching or issuing requests,
we may need to do more extra work in the failed-hctx_lock branch;
perhaps more seriously, it may be hard to handle in that branch at all.
For instance, if we needed to do something like this in
__blk_mq_try_issue_directly:
	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
		run_queue = false;
		bypass_insert = false;
		goto insert;
+	} else if (blk_queue_xxx(q)) {
+		/* do some other things */
+	}
Of course, there's a good chance it won't happen.



^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2020-08-26  7:26 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-20  3:02 [PATCH] blk-mq: implement queue quiesce via percpu_ref for BLK_MQ_F_BLOCKING Ming Lei
2020-08-21  6:34 ` Christoph Hellwig
2020-08-21 10:16   ` Ming Lei
2020-08-21 14:46     ` Jens Axboe
2020-08-21 15:05 ` Jens Axboe
2020-08-21 20:14 ` Sagi Grimberg
2020-08-22 13:39   ` Ming Lei
2020-08-24  8:19     ` Sagi Grimberg
2020-08-24 10:40       ` Ming Lei
2020-08-24 21:34         ` Sagi Grimberg
2020-08-25  2:32           ` Ming Lei
2020-08-25  5:24             ` Sagi Grimberg
2020-08-25  9:41               ` Chao Leng
2020-08-25 17:38                 ` Sagi Grimberg
2020-08-26  7:25                   ` Chao Leng
