All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hannes Reinecke <hare@suse.de>
To: Ming Lei <ming.lei@redhat.com>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, John Garry <john.garry@huawei.com>,
	Bart Van Assche <bvanassche@acm.org>,
	Hannes Reinecke <hare@suse.com>, Christoph Hellwig <hch@lst.de>,
	Thomas Gleixner <tglx@linutronix.de>
Subject: Re: [PATCH V8 07/11] blk-mq: stop to handle IO and drain IO before hctx becomes inactive
Date: Fri, 24 Apr 2020 15:27:45 +0200	[thread overview]
Message-ID: <69554493-db0b-228c-94a1-0f6f50580675@suse.de> (raw)
In-Reply-To: <20200424102351.475641-8-ming.lei@redhat.com>

On 4/24/20 12:23 PM, Ming Lei wrote:
> Before one CPU becomes offline, check if it is the last online CPU of hctx.
> If yes, mark this hctx as inactive, and meanwhile wait for completion of all
> in-flight IOs originated from this hctx. Also check in
> blk_mq_get_driver_tag() whether this hctx has become inactive; if yes,
> release the allocated tag.
> 
> This guarantees that there isn't any in-flight IO before shutting down
> the managed IRQ line when all CPUs of this IRQ line are offline.
> 
> Cc: John Garry <john.garry@huawei.com>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   block/blk-mq-debugfs.c |   1 +
>   block/blk-mq.c         | 124 +++++++++++++++++++++++++++++++++++++----
>   include/linux/blk-mq.h |   3 +
>   3 files changed, 117 insertions(+), 11 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 8e745826eb86..b62390918ca5 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
>   	HCTX_STATE_NAME(STOPPED),
>   	HCTX_STATE_NAME(TAG_ACTIVE),
>   	HCTX_STATE_NAME(SCHED_RESTART),
> +	HCTX_STATE_NAME(INACTIVE),
>   };
>   #undef HCTX_STATE_NAME
>   
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d432cc74ef78..4d0c271d9f6f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1050,11 +1050,31 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
>   	return true;
>   }
>   
> -static bool blk_mq_get_driver_tag(struct request *rq)
> +static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
>   {
>   	if (rq->tag != -1)
>   		return true;
> -	return __blk_mq_get_driver_tag(rq);
> +
> +	if (!__blk_mq_get_driver_tag(rq))
> +		return false;
> +	/*
> +	 * Add a full memory barrier in case the direct-issue IO process has
> +	 * migrated to another CPU which may not belong to this hctx, so that
> +	 * driver tag assignment is ordered against checking BLK_MQ_S_INACTIVE.
> +	 * Otherwise, barrier() is enough, given that setting BLK_MQ_S_INACTIVE
> +	 * and driver tag assignment run on the same CPU whenever
> +	 * BLK_MQ_S_INACTIVE is set.
> +	 */
> +	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
> +		smp_mb();
> +	else
> +		barrier();
> +
> +	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
> +		blk_mq_put_driver_tag(rq);
> +		return false;
> +	}
> +	return true;
>   }
>   
>   static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
> @@ -1103,7 +1123,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
>   		 * Don't clear RESTART here, someone else could have set it.
>   		 * At most this will cost an extra queue run.
>   		 */
> -		return blk_mq_get_driver_tag(rq);
> +		return blk_mq_get_driver_tag(rq, false);
>   	}
>   
>   	wait = &hctx->dispatch_wait;
> @@ -1129,7 +1149,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
>   	 * allocation failure and adding the hardware queue to the wait
>   	 * queue.
>   	 */
> -	ret = blk_mq_get_driver_tag(rq);
> +	ret = blk_mq_get_driver_tag(rq, false);
>   	if (!ret) {
>   		spin_unlock(&hctx->dispatch_wait_lock);
>   		spin_unlock_irq(&wq->lock);
> @@ -1228,7 +1248,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
>   			break;
>   		}
>   
> -		if (!blk_mq_get_driver_tag(rq)) {
> +		if (!blk_mq_get_driver_tag(rq, false)) {
>   			/*
>   			 * The initial allocation attempt failed, so we need to
>   			 * rerun the hardware queue when a tag is freed. The
> @@ -1260,7 +1280,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
>   			bd.last = true;
>   		else {
>   			nxt = list_first_entry(list, struct request, queuelist);
> -			bd.last = !blk_mq_get_driver_tag(nxt);
> +			bd.last = !blk_mq_get_driver_tag(nxt, false);
>   		}
>   
>   		ret = q->mq_ops->queue_rq(hctx, &bd);
> @@ -1853,7 +1873,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
>   	if (!blk_mq_get_dispatch_budget(hctx))
>   		goto insert;
>   
> -	if (!blk_mq_get_driver_tag(rq)) {
> +	if (!blk_mq_get_driver_tag(rq, true)) {
>   		blk_mq_put_dispatch_budget(hctx);
>   		goto insert;
>   	}
> @@ -2261,13 +2281,92 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>   	return -ENOMEM;
>   }
>   
> -static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
> +struct count_inflight_data {
> +	unsigned count;
> +	struct blk_mq_hw_ctx *hctx;
> +};
> +
> +static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
> +				     bool reserved)
>   {
> -	return 0;
> +	struct count_inflight_data *count_data = data;
> +
> +	/*
> +	 * Can't check rq's state because it is updated to MQ_RQ_IN_FLIGHT
> +	 * in blk_mq_start_request(), at that time we can't prevent this rq
> +	 * from being issued.
> +	 *
> +	 * So check if driver tag is assigned, if yes, count this rq as
> +	 * inflight.
> +	 */
> +	if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
> +		count_data->count++;
> +
> +	return true;
> +}
> +
> +static bool blk_mq_inflight_rq(struct request *rq, void *data,
> +			       bool reserved)
> +{
> +	return rq->tag >= 0;
> +}
> +
> +static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct count_inflight_data count_data = {
> +		.count	= 0,
> +		.hctx	= hctx,
> +	};
> +
> +	blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
> +			blk_mq_inflight_rq, &count_data);
> +
> +	return count_data.count;
> +}
> +

Remind me again: Why do we need the 'filter' function here?
Can't we just move the filter function into the main iterator and
stay with the original implementation?

> +static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
> +{
> +	while (1) {
> +		if (!blk_mq_tags_inflight_rqs(hctx))
> +			break;
> +		msleep(5);
> +	}
>   }
>   
>   static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
>   {
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_online);
> +
> +	if (!cpumask_test_cpu(cpu, hctx->cpumask))
> +		return 0;
> +
> +	if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) ||
> +	    (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids))
> +		return 0;
> +
> +	/*
> +	 * The current CPU is the last one in this hctx, so S_INACTIVE
> +	 * can be observed in the dispatch path without any barrier,
> +	 * because both run on the same CPU.
> +	 */
> +	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
> +	/*
> +	 * Order setting BLK_MQ_S_INACTIVE and checking rq->tag & rqs[tag];
> +	 * this pairs with the smp_mb() in blk_mq_get_driver_tag().
> +	 */
> +	smp_mb__after_atomic();
> +	blk_mq_hctx_drain_inflight_rqs(hctx);
> +	return 0;
> +}
> +
> +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_online);
> +
> +	if (cpumask_test_cpu(cpu, hctx->cpumask))
> +		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
>   	return 0;
>   }
>   
> @@ -2278,12 +2377,15 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
>    */
>   static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>   {
> -	struct blk_mq_hw_ctx *hctx;
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_dead);
>   	struct blk_mq_ctx *ctx;
>   	LIST_HEAD(tmp);
>   	enum hctx_type type;
>   
> -	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> +	if (!cpumask_test_cpu(cpu, hctx->cpumask))
> +		return 0;
> +
>   	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
>   	type = hctx->type;
>   
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index f550b5274b8b..b4812c455807 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -403,6 +403,9 @@ enum {
>   	BLK_MQ_S_TAG_ACTIVE	= 1,
>   	BLK_MQ_S_SCHED_RESTART	= 2,
>   
> +	/* hw queue is inactive after all its CPUs become offline */
> +	BLK_MQ_S_INACTIVE	= 3,
> +
>   	BLK_MQ_MAX_DEPTH	= 10240,
>   
>   	BLK_MQ_CPU_WORK_BATCH	= 8,
> 
Otherwise this looks good, and is exactly what we need.
Thanks for doing the work!

Cheers,

Hannes
-- 
Dr. Hannes Reinecke            Teamlead Storage & Networking
hare@suse.de                               +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

  parent reply	other threads:[~2020-04-24 13:27 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-04-24 10:23 [PATCH V8 00/11] blk-mq: improvement CPU hotplug Ming Lei
2020-04-24 10:23 ` [PATCH V8 01/11] block: clone nr_integrity_segments and write_hint in blk_rq_prep_clone Ming Lei
2020-04-24 10:32   ` Christoph Hellwig
2020-04-24 12:43   ` Hannes Reinecke
2020-04-24 16:11   ` Martin K. Petersen
2020-04-24 10:23 ` [PATCH V8 02/11] block: add helper for copying request Ming Lei
2020-04-24 10:23   ` Ming Lei
2020-04-24 10:35   ` Christoph Hellwig
2020-04-24 12:43   ` Hannes Reinecke
2020-04-24 16:12   ` Martin K. Petersen
2020-04-24 10:23 ` [PATCH V8 03/11] blk-mq: mark blk_mq_get_driver_tag as static Ming Lei
2020-04-24 12:44   ` Hannes Reinecke
2020-04-24 16:13   ` Martin K. Petersen
2020-04-24 10:23 ` [PATCH V8 04/11] blk-mq: assign rq->tag in blk_mq_get_driver_tag Ming Lei
2020-04-24 10:35   ` Christoph Hellwig
2020-04-24 13:02   ` Hannes Reinecke
2020-04-25  2:54     ` Ming Lei
2020-04-25 18:26       ` Hannes Reinecke
2020-04-24 10:23 ` [PATCH V8 05/11] blk-mq: support rq filter callback when iterating rqs Ming Lei
2020-04-24 13:17   ` Hannes Reinecke
2020-04-25  3:04     ` Ming Lei
2020-04-24 10:23 ` [PATCH V8 06/11] blk-mq: prepare for draining IO when hctx's all CPUs are offline Ming Lei
2020-04-24 13:23   ` Hannes Reinecke
2020-04-25  3:24     ` Ming Lei
2020-04-24 10:23 ` [PATCH V8 07/11] blk-mq: stop to handle IO and drain IO before hctx becomes inactive Ming Lei
2020-04-24 10:38   ` Christoph Hellwig
2020-04-25  3:17     ` Ming Lei
2020-04-25  8:32       ` Christoph Hellwig
2020-04-25  9:34         ` Ming Lei
2020-04-25  9:53           ` Ming Lei
2020-04-25 15:48             ` Christoph Hellwig
2020-04-26  2:06               ` Ming Lei
2020-04-26  8:19                 ` John Garry
2020-04-27 15:36                 ` Christoph Hellwig
2020-04-28  1:10                   ` Ming Lei
2020-04-27 19:03               ` Paul E. McKenney
2020-04-28  6:54                 ` Christoph Hellwig
2020-04-28 15:58               ` Peter Zijlstra
2020-04-29  2:16                 ` Ming Lei
2020-04-29  8:07                   ` Will Deacon
2020-04-29  9:46                     ` Ming Lei
2020-04-29 12:27                       ` Will Deacon
2020-04-29 13:43                         ` Ming Lei
2020-04-29 17:34                           ` Will Deacon
2020-04-30  0:39                             ` Ming Lei
2020-04-30 11:04                               ` Will Deacon
2020-04-30 14:02                                 ` Ming Lei
2020-05-05 15:46                                   ` Christoph Hellwig
2020-05-06  1:24                                     ` Ming Lei
2020-05-06  7:28                                       ` Will Deacon
2020-05-06  8:07                                         ` Ming Lei
2020-05-06  9:56                                           ` Will Deacon
2020-05-06 10:22                                             ` Ming Lei
2020-04-29 17:46                           ` Paul E. McKenney
2020-04-30  0:43                             ` Ming Lei
2020-04-24 13:27   ` Hannes Reinecke [this message]
2020-04-25  3:30     ` Ming Lei
2020-04-24 13:42   ` John Garry
2020-04-25  3:41     ` Ming Lei
2020-04-24 10:23 ` [PATCH V8 08/11] block: add blk_end_flush_machinery Ming Lei
2020-04-24 10:41   ` Christoph Hellwig
2020-04-25  3:44     ` Ming Lei
2020-04-25  8:11       ` Christoph Hellwig
2020-04-25  9:51         ` Ming Lei
2020-04-24 13:47   ` Hannes Reinecke
2020-04-25  3:47     ` Ming Lei
2020-04-24 10:23 ` [PATCH V8 09/11] blk-mq: add blk_mq_hctx_handle_dead_cpu for handling cpu dead Ming Lei
2020-04-24 10:42   ` Christoph Hellwig
2020-04-25  3:48     ` Ming Lei
2020-04-24 13:48   ` Hannes Reinecke
2020-04-24 10:23 ` [PATCH V8 10/11] blk-mq: re-submit IO in case that hctx is inactive Ming Lei
2020-04-24 10:44   ` Christoph Hellwig
2020-04-25  3:52     ` Ming Lei
2020-04-24 13:55   ` Hannes Reinecke
2020-04-25  3:59     ` Ming Lei
2020-04-24 10:23 ` [PATCH V8 11/11] block: deactivate hctx when the hctx is actually inactive Ming Lei
2020-04-24 10:43   ` Christoph Hellwig
2020-04-24 13:56   ` Hannes Reinecke
2020-04-24 15:23 ` [PATCH V8 00/11] blk-mq: improvement CPU hotplug Jens Axboe
2020-04-24 15:40   ` Christoph Hellwig
2020-04-24 15:41     ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=69554493-db0b-228c-94a1-0f6f50580675@suse.de \
    --to=hare@suse.de \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=hare@suse.com \
    --cc=hch@lst.de \
    --cc=john.garry@huawei.com \
    --cc=linux-block@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.