From: Chao Leng <lengchao@huawei.com>
To: Christoph Hellwig <hch@lst.de>, Jens Axboe <axboe@kernel.dk>,
	Keith Busch <kbusch@kernel.org>, Sagi Grimberg <sagi@grimberg.me>
Cc: Ming Lei <ming.lei@redhat.com>, <linux-nvme@lists.infradead.org>,
	<linux-block@vger.kernel.org>
Subject: Re: [PATCH 3/8] blk-mq: move the srcu_struct used for quiescing to the tagset
Date: Fri, 21 Oct 2022 15:16:50 +0800	[thread overview]
Message-ID: <46ce9b02-7474-22a7-f8c2-10f46e88853a@huawei.com> (raw)
In-Reply-To: <20221020105608.1581940-4-hch@lst.de>



On 2022/10/20 18:56, Christoph Hellwig wrote:
> All I/O submissions have fairly similar latencies, and a tagset-wide
> quiesce is a fairly common operation.  Because there are far fewer
> tagsets than request queues, there is also no need for the variable
> size allocation trick.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/blk-core.c       | 27 +++++----------------------
>   block/blk-mq.c         | 25 +++++++++++++++++--------
>   block/blk-mq.h         | 14 +++++++-------
>   block/blk-sysfs.c      |  9 ++-------
>   block/blk.h            |  9 +--------
>   block/genhd.c          |  2 +-
>   include/linux/blk-mq.h |  4 ++++
>   include/linux/blkdev.h |  9 ---------
>   8 files changed, 37 insertions(+), 62 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 17667159482e0..3a2ed8dadf738 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -65,7 +65,6 @@ DEFINE_IDA(blk_queue_ida);
>    * For queue allocation
>    */
>   struct kmem_cache *blk_requestq_cachep;
> -struct kmem_cache *blk_requestq_srcu_cachep;
>   
>   /*
>    * Controlling structure to kblockd
> @@ -373,26 +372,20 @@ static void blk_timeout_work(struct work_struct *work)
>   {
>   }
>   
> -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
> +struct request_queue *blk_alloc_queue(int node_id)
>   {
>   	struct request_queue *q;
>   
> -	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
> -			GFP_KERNEL | __GFP_ZERO, node_id);
> +	q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
> +				  node_id);
>   	if (!q)
>   		return NULL;
>   
> -	if (alloc_srcu) {
> -		blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
> -		if (init_srcu_struct(q->srcu) != 0)
> -			goto fail_q;
> -	}
> -
>   	q->last_merge = NULL;
>   
>   	q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
>   	if (q->id < 0)
> -		goto fail_srcu;
> +		goto fail_q;
>   
>   	q->stats = blk_alloc_queue_stats();
>   	if (!q->stats)
> @@ -435,11 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
>   	blk_free_queue_stats(q->stats);
>   fail_id:
>   	ida_free(&blk_queue_ida, q->id);
> -fail_srcu:
> -	if (alloc_srcu)
> -		cleanup_srcu_struct(q->srcu);
>   fail_q:
> -	kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
> +	kmem_cache_free(blk_requestq_cachep, q);
>   	return NULL;
>   }
>   
> @@ -1184,9 +1174,6 @@ int __init blk_dev_init(void)
>   			sizeof_field(struct request, cmd_flags));
>   	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
>   			sizeof_field(struct bio, bi_opf));
> -	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
> -			   __alignof__(struct request_queue)) !=
> -		     sizeof(struct request_queue));
>   
>   	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
>   	kblockd_workqueue = alloc_workqueue("kblockd",
> @@ -1197,10 +1184,6 @@ int __init blk_dev_init(void)
>   	blk_requestq_cachep = kmem_cache_create("request_queue",
>   			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
>   
> -	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
> -			sizeof(struct request_queue) +
> -			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
> -
>   	blk_debugfs_root = debugfs_create_dir("block", NULL);
>   
>   	return 0;
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index df967c8af9fee..4a81a2da43328 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -261,8 +261,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>    */
>   void blk_mq_wait_quiesce_done(struct request_queue *q)
>   {
> -	if (blk_queue_has_srcu(q))
> -		synchronize_srcu(q->srcu);
> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> +		synchronize_srcu(&q->tag_set->srcu);
>   	else
>   		synchronize_rcu();
>   }
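
This is also what lets a later patch in this series (5/8, "blk-mq: add
tagset quiesce interface") quiesce a whole tagset with a single grace
period instead of one per queue. Roughly, as a sketch of the idea rather
than the exact patch code:

	void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
	{
		struct request_queue *q;

		mutex_lock(&set->tag_list_lock);
		list_for_each_entry(q, &set->tag_list, tag_set_list)
			blk_mq_quiesce_queue_nowait(q);
		mutex_unlock(&set->tag_list_lock);

		/* one grace period now covers every queue in the set */
		if (set->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(&set->srcu);
		else
			synchronize_rcu();
	}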
> @@ -3971,7 +3971,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
>   	struct request_queue *q;
>   	int ret;
>   
> -	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
> +	q = blk_alloc_queue(set->numa_node);
>   	if (!q)
>   		return ERR_PTR(-ENOMEM);
>   	q->queuedata = queuedata;
> @@ -4138,9 +4138,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
>   int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>   		struct request_queue *q)
>   {
> -	WARN_ON_ONCE(blk_queue_has_srcu(q) !=
> -			!!(set->flags & BLK_MQ_F_BLOCKING));
> -
>   	/* mark the queue as mq asap */
>   	q->mq_ops = set->ops;
>   
> @@ -4398,9 +4395,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>   	 */
>   	if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
>   		set->nr_hw_queues = nr_cpu_ids;
> +
> +	if (set->flags & BLK_MQ_F_BLOCKING) {
> +		ret = init_srcu_struct(&set->srcu);
> +		if (ret)
> +			return ret;
> +	}
>   
> -	if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
> -		return -ENOMEM;
> +	ret = blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues);
> +	if (ret)
> +		goto out_free_srcu;
>   
>   	ret = -ENOMEM;
>   	for (i = 0; i < set->nr_maps; i++) {
> @@ -4430,6 +4434,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>   	}
>   	kfree(set->tags);
>   	set->tags = NULL;
> +out_free_srcu:
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		cleanup_srcu_struct(&set->srcu);
>   	return ret;
>   }
>   EXPORT_SYMBOL(blk_mq_alloc_tag_set);
> @@ -4469,6 +4476,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
>   
>   	kfree(set->tags);
>   	set->tags = NULL;
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		cleanup_srcu_struct(&set->srcu);
>   }
>   EXPORT_SYMBOL(blk_mq_free_tag_set);
>   
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 0b2870839cdd6..06eb46d1d7a76 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
>   /* run the code block in @dispatch_ops with rcu/srcu read lock held */
>   #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops)	\
>   do {								\
> -	if (!blk_queue_has_srcu(q)) {				\
> -		rcu_read_lock();				\
> -		(dispatch_ops);					\
> -		rcu_read_unlock();				\
> -	} else {						\
> +	if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) {		\
>   		int srcu_idx;					\
>   								\
>   		might_sleep_if(check_sleep);			\
> -		srcu_idx = srcu_read_lock((q)->srcu);		\
> +		srcu_idx = srcu_read_lock(&((q)->tag_set->srcu)); \
>   		(dispatch_ops);					\
> -		srcu_read_unlock((q)->srcu, srcu_idx);		\
> +		srcu_read_unlock(&((q)->tag_set->srcu), srcu_idx); \
> +	} else {						\
> +		rcu_read_lock();				\
> +		(dispatch_ops);					\
> +		rcu_read_unlock();				\
>   	}							\
>   } while (0)
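
Call sites keep using the same wrappers; only the home of the read-side
lock changes. An illustrative use (note the srcu read lock now comes
from q->tag_set, so this relies on the tagset outliving the queue):

	blk_mq_run_dispatch_ops(q,
		ret = blk_mq_request_issue_directly(rq, true));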
>   
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index e71b3b43927c0..e7871665825a3 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -739,10 +739,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
>   
>   static void blk_free_queue_rcu(struct rcu_head *rcu_head)
>   {
> -	struct request_queue *q = container_of(rcu_head, struct request_queue,
> -					       rcu_head);
> -
> -	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
> +	kmem_cache_free(blk_requestq_cachep,
> +			container_of(rcu_head, struct request_queue, rcu_head));
>   }
>   
>   /**
> @@ -779,9 +777,6 @@ static void blk_release_queue(struct kobject *kobj)
>   	if (queue_is_mq(q))
>   		blk_mq_release(q);
>   
> -	if (blk_queue_has_srcu(q))
> -		cleanup_srcu_struct(q->srcu);
> -
>   	ida_free(&blk_queue_ida, q->id);
>   	call_rcu(&q->rcu_head, blk_free_queue_rcu);
>   }
> diff --git a/block/blk.h b/block/blk.h
> index 5350bf363035e..b25e2d22f3725 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -27,7 +27,6 @@ struct blk_flush_queue {
>   };
>   
>   extern struct kmem_cache *blk_requestq_cachep;
> -extern struct kmem_cache *blk_requestq_srcu_cachep;
>   extern struct kobj_type blk_queue_ktype;
>   extern struct ida blk_queue_ida;
>   
> @@ -420,13 +419,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
>   		struct page *page, unsigned int len, unsigned int offset,
>   		unsigned int max_sectors, bool *same_page);
>   
> -static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
> -{
> -	if (srcu)
> -		return blk_requestq_srcu_cachep;
> -	return blk_requestq_cachep;
> -}
> -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
> +struct request_queue *blk_alloc_queue(int node_id);
>   
>   int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
>   
> diff --git a/block/genhd.c b/block/genhd.c
> index 2877b5f905579..fd0b13d6175a3 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1410,7 +1410,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
>   	struct request_queue *q;
>   	struct gendisk *disk;
>   
> -	q = blk_alloc_queue(node, false);
> +	q = blk_alloc_queue(node);
>   	if (!q)
>   		return NULL;
>   
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ba18e9bdb799b..f040a7cab5dbf 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -7,6 +7,7 @@
>   #include <linux/lockdep.h>
>   #include <linux/scatterlist.h>
>   #include <linux/prefetch.h>
> +#include <linux/srcu.h>
>   
>   struct blk_mq_tags;
>   struct blk_flush_queue;
> @@ -501,6 +502,8 @@ enum hctx_type {
>    * @tag_list_lock: Serializes tag_list accesses.
>    * @tag_list:	   List of the request queues that use this tag set. See also
>    *		   request_queue.tag_set_list.
> + * @srcu:	   Use as lock when the type of the request queue is blocking
> + *		   (BLK_MQ_F_BLOCKING).
>    */
>   struct blk_mq_tag_set {
>   	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
> @@ -521,6 +524,7 @@ struct blk_mq_tag_set {
>   
>   	struct mutex		tag_list_lock;
>   	struct list_head	tag_list;
> +	struct srcu_struct	srcu;
srcu_struct is large, 50+ KB, so embedding it in the tagset wastes memory
for tagsets which do not set BLK_MQ_F_BLOCKING, and most tagsets do not
set it. Maybe we can define "srcu" as a pointer instead, and allocate the
memory in blk_mq_alloc_tag_set only for tagsets that set
BLK_MQ_F_BLOCKING.
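Something like this untested sketch (the free path in blk_mq_free_tag_set
and the error unwinding would need the matching cleanup_srcu_struct()
plus kfree()):

	/* in struct blk_mq_tag_set: make the member a pointer */
	struct srcu_struct	*srcu;	/* allocated only for BLK_MQ_F_BLOCKING */

	/* in blk_mq_alloc_tag_set(): */
	if (set->flags & BLK_MQ_F_BLOCKING) {
		set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
		if (!set->srcu)
			return -ENOMEM;
		ret = init_srcu_struct(set->srcu);
		if (ret) {
			kfree(set->srcu);
			return ret;
		}
	}

The readers and synchronize_srcu() would then use set->srcu instead of
&set->srcu.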
>   };
>   
>   /**
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 50e358a19d986..b15b6a011c028 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -22,7 +22,6 @@
>   #include <linux/blkzoned.h>
>   #include <linux/sched.h>
>   #include <linux/sbitmap.h>
> -#include <linux/srcu.h>
>   #include <linux/uuid.h>
>   #include <linux/xarray.h>
>   
> @@ -543,18 +542,11 @@ struct request_queue {
>   	struct mutex		debugfs_mutex;
>   
>   	bool			mq_sysfs_init_done;
> -
> -	/**
> -	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
> -	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
> -	 */
> -	struct srcu_struct	srcu[];
>   };
>   
>   /* Keep blk_queue_flag_name[] in sync with the definitions below */
>   #define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
>   #define QUEUE_FLAG_DYING	1	/* queue being torn down */
> -#define QUEUE_FLAG_HAS_SRCU	2	/* SRCU is allocated */
>   #define QUEUE_FLAG_NOMERGES     3	/* disable merge attempts */
>   #define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
>   #define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
> @@ -590,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>   
>   #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
>   #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
> -#define blk_queue_has_srcu(q)	test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
>   #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
>   #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
>   #define blk_queue_noxmerges(q)	\
> 


Thread overview: 61+ messages
2022-10-20 10:56 per-tagset SRCU struct and quiesce Christoph Hellwig
2022-10-20 10:56 ` [PATCH 1/8] block: set the disk capacity to 0 in blk_mark_disk_dead Christoph Hellwig
2022-10-20 13:16   ` Sagi Grimberg
2022-10-21  1:09   ` Ming Lei
2022-10-21 13:11     ` Christoph Hellwig
2022-10-21  1:53   ` Chao Leng
2022-10-21  6:49   ` Hannes Reinecke
2022-10-21 13:13     ` Christoph Hellwig
2022-10-21 21:12   ` Bart Van Assche
2022-10-20 10:56 ` [PATCH 2/8] blk-mq: skip non-mq queues in blk_mq_quiesce_queue Christoph Hellwig
2022-10-20 13:16   ` Sagi Grimberg
2022-10-21  1:13   ` Ming Lei
2022-10-21 13:19     ` Christoph Hellwig
2022-10-21 15:08       ` Ming Lei
2022-10-21  2:47   ` Chao Leng
2022-10-21  3:16     ` Chao Leng
2022-10-21  6:49   ` Hannes Reinecke
2022-10-20 10:56 ` [PATCH 3/8] blk-mq: move the srcu_struct used for quiescing to the tagset Christoph Hellwig
2022-10-20 13:23   ` Sagi Grimberg
2022-10-20 17:26   ` Keith Busch
2022-10-21 13:20     ` Christoph Hellwig
2022-10-21  1:41   ` Ming Lei
2022-10-21  2:49   ` Chao Leng
2022-10-21  6:50   ` Hannes Reinecke
2022-10-21  7:16   ` Chao Leng [this message]
2022-10-21 13:22     ` Christoph Hellwig
2022-10-20 10:56 ` [PATCH 4/8] blk-mq: pass a tagset to blk_mq_wait_quiesce_done Christoph Hellwig
2022-10-20 13:23   ` Sagi Grimberg
2022-10-21  1:46   ` Ming Lei
2022-10-21 13:23     ` Christoph Hellwig
2022-10-21  2:49   ` Chao Leng
2022-10-21  6:50   ` Hannes Reinecke
2022-10-21 21:18   ` Bart Van Assche
2022-10-20 10:56 ` [PATCH 5/8] blk-mq: add tagset quiesce interface Christoph Hellwig
2022-10-20 13:24   ` Sagi Grimberg
2022-10-21  1:53   ` Ming Lei
2022-10-21  2:49   ` Chao Leng
2022-10-21  6:51   ` Hannes Reinecke
2022-10-21 21:22   ` Bart Van Assche
2022-10-24  1:57     ` Chao Leng
2022-10-24 13:35       ` Bart Van Assche
2022-10-25  1:38         ` Chao Leng
2022-10-20 10:56 ` [PATCH 6/8] nvme: move the NS_DEAD flag to the controller Christoph Hellwig
2022-10-20 13:30   ` Sagi Grimberg
2022-10-21 13:28     ` Christoph Hellwig
2022-10-24  8:43       ` Sagi Grimberg
2022-10-24  8:50         ` Sagi Grimberg
2022-10-21  2:49   ` Chao Leng
2022-10-21  6:51   ` Hannes Reinecke
2022-10-20 10:56 ` [PATCH 7/8] nvme: remove nvme_set_queue_dying Christoph Hellwig
2022-10-20 13:10   ` Sagi Grimberg
2022-10-21 13:29     ` Christoph Hellwig
2022-10-24  8:48       ` Sagi Grimberg
2022-10-21  2:50   ` Chao Leng
2022-10-21  6:52   ` Hannes Reinecke
2022-10-20 10:56 ` [PATCH 8/8] nvme: use blk_mq_[un]quiesce_tagset Christoph Hellwig
2022-10-20 13:35   ` Sagi Grimberg
2022-10-21  2:50   ` Chao Leng
2022-10-21  6:52   ` Hannes Reinecke
2022-10-20 13:16 ` per-tagset SRCU struct and quiesce Sagi Grimberg
2022-10-21 18:06 ` Keith Busch
