From: Ming Lei <ming.lei@redhat.com>
To: John Garry <john.garry@huawei.com>
Cc: axboe@kernel.dk, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
	kashyap.desai@broadcom.com, chenxiang66@hisilicon.com,
	yama@redhat.com
Subject: Re: [PATCH] blk-mq: Use request queue-wide tags for tagset-wide sbitmap
Date: Thu, 6 May 2021 16:32:07 +0800	[thread overview]
Message-ID: <YJOph1oI8CTJjzQx@T590> (raw)
In-Reply-To: <1620037333-2495-1-git-send-email-john.garry@huawei.com>

On Mon, May 03, 2021 at 06:22:13PM +0800, John Garry wrote:
> The tags used for an IO scheduler are currently per hctx.
> 
> As such, when q->nr_hw_queues grows, so does the request queue total IO
> scheduler tag depth.
> 
> This may cause problems for SCSI MQ HBAs whose total driver depth is
> fixed.
> 
> Ming and Yanhui report higher CPU usage and lower throughput in scenarios
> where the fixed total driver tag depth is appreciably lower than the total
> scheduler tag depth:
> https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b
> 
> In that scenario, since the scheduler tag is acquired first, much contention
> is introduced because a driver tag may not be available after we have
> acquired the sched tag.
> 
> Improve this scenario by introducing request queue-wide tags for when
> a tagset-wide sbitmap is used. The static sched requests are still
> allocated per hctx, since requests are initialised per hctx, as in
> blk_mq_init_request(..., hctx_idx, ...) ->
> set->ops->init_request(.., hctx_idx, ...).
> 
> For simplicity of resizing the request queue sbitmap when updating the
> request queue depth, just init it at the max possible size, so we don't
> need to deal with swapping in a new sbitmap for the old one if we need
> to grow.
> 
> Signed-off-by: John Garry <john.garry@huawei.com>
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index e1e997af89a0..121207abc026 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -497,11 +497,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
>  				   struct blk_mq_hw_ctx *hctx,
>  				   unsigned int hctx_idx)
>  {
> -	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
>  	if (hctx->sched_tags) {
>  		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
> -		blk_mq_free_rq_map(hctx->sched_tags, flags);
> +		blk_mq_free_rq_map(hctx->sched_tags, set->flags);
>  		hctx->sched_tags = NULL;
>  	}
>  }
> @@ -511,12 +509,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
>  				   unsigned int hctx_idx)
>  {
>  	struct blk_mq_tag_set *set = q->tag_set;
> -	/* Clear HCTX_SHARED so tags are init'ed */
> -	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
>  	int ret;
>  
>  	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
> -					       set->reserved_tags, flags);
> +					       set->reserved_tags, set->flags);
>  	if (!hctx->sched_tags)
>  		return -ENOMEM;
>  
> @@ -534,11 +530,8 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
>  	int i;
>  
>  	queue_for_each_hw_ctx(q, hctx, i) {
> -		/* Clear HCTX_SHARED so tags are freed */
> -		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
>  		if (hctx->sched_tags) {
> -			blk_mq_free_rq_map(hctx->sched_tags, flags);
> +			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
>  			hctx->sched_tags = NULL;
>  		}
>  	}
> @@ -568,12 +561,25 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  	queue_for_each_hw_ctx(q, hctx, i) {
>  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
>  		if (ret)
> -			goto err;
> +			goto err_free_tags;
> +	}
> +
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> +		ret = blk_mq_init_sched_shared_sbitmap(q);
> +		if (ret)
> +			goto err_free_tags;
> +
> +		queue_for_each_hw_ctx(q, hctx, i) {
> +			hctx->sched_tags->bitmap_tags =
> +					q->sched_bitmap_tags;
> +			hctx->sched_tags->breserved_tags =
> +					q->sched_breserved_tags;
> +		}
>  	}
>  
>  	ret = e->ops.init_sched(q, e);
>  	if (ret)
> -		goto err;
> +		goto err_free_sbitmap;
>  
>  	blk_mq_debugfs_register_sched(q);
>  
> @@ -584,6 +590,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  				eq = q->elevator;
>  				blk_mq_sched_free_requests(q);
>  				blk_mq_exit_sched(q, eq);
> +				blk_mq_exit_sched_shared_sbitmap(q);

blk_mq_exit_sched_shared_sbitmap() is already called from blk_mq_exit_sched().
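
As I read it, with this hunk the failure path would tear the shared sched
sbitmap down twice, roughly (call-chain sketch only, not a real diff):

	blk_mq_sched_free_requests(q);
	blk_mq_exit_sched(q, eq);
	    -> blk_mq_sched_tags_teardown(q);
	    -> blk_mq_exit_sched_shared_sbitmap(q);	/* 1st teardown */
	blk_mq_exit_sched_shared_sbitmap(q);		/* 2nd teardown */
	kobject_put(&eq->kobj);

so the extra call risks freeing q->sched_bitmap_tags twice.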

>  				kobject_put(&eq->kobj);
>  				return ret;
>  			}
> @@ -593,7 +600,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  
>  	return 0;
>  
> -err:
> +err_free_sbitmap:
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> +		blk_mq_exit_sched_shared_sbitmap(q);
> +err_free_tags:
>  	blk_mq_sched_free_requests(q);
>  	blk_mq_sched_tags_teardown(q);
>  	q->elevator = NULL;
> @@ -631,5 +641,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
>  	if (e->type->ops.exit_sched)
>  		e->type->ops.exit_sched(e);
>  	blk_mq_sched_tags_teardown(q);
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> +		blk_mq_exit_sched_shared_sbitmap(q);
>  	q->elevator = NULL;
>  }
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 2a37731e8244..734fedceca7d 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -466,19 +466,40 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
>  	return -ENOMEM;
>  }
>  
> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
> +static int __blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
> +				 struct sbitmap_queue *breserved_tags,
> +				 struct blk_mq_tag_set *set,
> +				 unsigned int queue_depth,
> +				 unsigned int reserved)
>  {
> -	unsigned int depth = set->queue_depth - set->reserved_tags;
> +	unsigned int depth = queue_depth - reserved;
>  	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
>  	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
> -	int i, node = set->numa_node;
>  
> -	if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
> +	if (bt_alloc(bitmap_tags, depth, round_robin, set->numa_node))
>  		return -ENOMEM;
> -	if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
> -		     round_robin, node))
> +	if (bt_alloc(breserved_tags, set->reserved_tags,
> +		     round_robin, set->numa_node))
>  		goto free_bitmap_tags;
>  
> +	return 0;
> +
> +free_bitmap_tags:
> +	sbitmap_queue_free(bitmap_tags);
> +	return -ENOMEM;
> +}
> +
> +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)

IMO, this function should be named blk_mq_init_shared_tags and moved to
blk-mq-sched.c.
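
Something along these lines (a naming sketch only, untested):

	/* in blk-mq-sched.c, same body as blk_mq_init_shared_sbitmap() above */
	int blk_mq_init_shared_tags(struct blk_mq_tag_set *set);

with the header declaration and callers updated to match.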

> +{
> +	int i, ret;
> +
> +	ret = __blk_mq_init_bitmaps(&set->__bitmap_tags,
> +				    &set->__breserved_tags,
> +				    set, set->queue_depth,
> +				    set->reserved_tags);
> +	if (ret)
> +		return ret;
> +
>  	for (i = 0; i < set->nr_hw_queues; i++) {
>  		struct blk_mq_tags *tags = set->tags[i];
>  
> @@ -487,9 +508,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
>  	}
>  
>  	return 0;
> -free_bitmap_tags:
> -	sbitmap_queue_free(&set->__bitmap_tags);
> -	return -ENOMEM;
>  }
>  
>  void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
> @@ -498,6 +516,52 @@ void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
>  	sbitmap_queue_free(&set->__breserved_tags);
>  }
>  
> +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
> +
> +int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> +{
> +	struct blk_mq_tag_set *set = queue->tag_set;
> +	int ret;
> +
> +	queue->sched_bitmap_tags =
> +		kmalloc(sizeof(*queue->sched_bitmap_tags), GFP_KERNEL);
> +	queue->sched_breserved_tags =
> +		kmalloc(sizeof(*queue->sched_breserved_tags), GFP_KERNEL);
> +	if (!queue->sched_bitmap_tags || !queue->sched_breserved_tags)
> +		goto err;

The two sbitmap queues can be embedded in the request queue, so that we
can avoid re-allocating them on every elevator switch.
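
Something like the below is what I mean (illustrative only, field names
and placement are up to you):

	struct request_queue {
		...
		struct sbitmap_queue	sched_bitmap_tags;
		struct sbitmap_queue	sched_breserved_tags;
		...
	};

Then blk_mq_init_sched_shared_sbitmap() can run __blk_mq_init_bitmaps()
on the embedded queues directly, and the kmalloc()/kfree() pair per
elevator switch goes away.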

I will ask Yanhui to test the patch and see if it can make a difference.


Thanks,
Ming


