From: John Garry <john.garry@huawei.com>
To: Ming Lei <ming.lei@redhat.com>, Jens Axboe <axboe@kernel.dk>
Cc: <linux-block@vger.kernel.org>,
	"James E . J . Bottomley" <jejb@linux.ibm.com>,
	"Martin K . Petersen" <martin.petersen@oracle.com>,
	<linux-scsi@vger.kernel.org>,
	Bart Van Assche <bvanassche@acm.org>,
	"Hannes Reinecke" <hare@suse.com>, Christoph Hellwig <hch@lst.de>,
	Thomas Gleixner <tglx@linutronix.de>,
	Keith Busch <keith.busch@intel.com>
Subject: Re: [RFC PATCH 3/7] blk-mq: stop to handle IO before hctx's all CPUs become offline
Date: Tue, 16 Jul 2019 11:03:14 +0800
Message-ID: <91f2a5ce-614a-1e4d-6830-81750cf7f708@huawei.com>
In-Reply-To: <20190712024726.1227-4-ming.lei@redhat.com>

On 12/07/2019 10:47, Ming Lei wrote:
> Most of blk-mq drivers depend on managed IRQ's auto-affinity to setup
> up queue mapping. Thomas mentioned the following point[1]:
> 
> "
>   That was the constraint of managed interrupts from the very beginning:
> 
>    The driver/subsystem has to quiesce the interrupt line and the associated
>    queue _before_ it gets shutdown in CPU unplug and not fiddle with it
>    until it's restarted by the core when the CPU is plugged in again.
> "
> 
> However, current blk-mq implementation doesn't quiesce hw queue before
> the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is
> one cpuhp state handled after the CPU is down, so there isn't any chance
> to quiesce hctx for blk-mq wrt. CPU hotplug.
> 
> Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
> and wait for completion of in-flight requests.
> 
> [1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
> 
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Keith Busch <keith.busch@intel.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   block/blk-mq-tag.c         |  2 +-
>   block/blk-mq-tag.h         |  2 ++
>   block/blk-mq.c             | 67 ++++++++++++++++++++++++++++++++++++++
>   include/linux/blk-mq.h     |  1 +
>   include/linux/cpuhotplug.h |  1 +
>   5 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index da19f0bc8876..bcefb213ad69 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -324,7 +324,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
>    *		true to continue iterating tags, false to stop.
>    * @priv:	Will be passed as second argument to @fn.
>    */
> -static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
>   		busy_tag_iter_fn *fn, void *priv)
>   {
>   	if (tags->nr_reserved_tags)
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 61deab0b5a5a..321fd6f440e6 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -35,6 +35,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>   extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
>   void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
>   		void *priv);
> +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> +		busy_tag_iter_fn *fn, void *priv);
>   
>   static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
>   						 struct blk_mq_hw_ctx *hctx)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e5ef40c603ca..028c5d78e409 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2205,6 +2205,64 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>   	return -ENOMEM;
>   }
>   
> +static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
> +				     bool reserved)
> +{
> +	unsigned *count = data;
> +
> +	if ((blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
> +		(*count)++;
> +
> +	return true;
> +}
> +
> +unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
> +{
> +	unsigned count = 0;
> +
> +	blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &count);
> +
> +	return count;
> +}
> +
> +static void blk_mq_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
> +{
> +	while (1) {
> +		if (!blk_mq_tags_inflight_rqs(hctx->tags))
> +			break;
> +		msleep(5);
> +	}
> +}
> +
> +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_online);
> +	unsigned prev_cpu = -1;
> +
> +	if (hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ)
> +		return 0;

is it possible to make this check at registration, such that we don't 
even register the online notifier when (hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ) is set?
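
Something like the below in blk_mq_init_hctx(), perhaps (untested sketch; 
blk_mq_remove_cpuhp() would need the matching check so we only remove what 
we added):

	if (!(hctx->flags & BLK_MQ_F_NO_MANAGED_IRQ))
		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
				&hctx->cpuhp_online);

Then blk_mq_hctx_notify_online() would simply never be called for such a hctx.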

> +
> +	while (true) {
> +		unsigned other_cpu = cpumask_next_and(prev_cpu, hctx->cpumask,

nit: to me the name next_cpu seems better

> +				cpu_online_mask);
> +
> +		if (other_cpu >= nr_cpu_ids)
> +			break;
> +
> +		/* return if there is other online CPU on this hctx */
> +		if (other_cpu != cpu)
> +			return 0;
> +
> +		prev_cpu = other_cpu;
> +	}
> +
> +	set_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);
> +	blk_mq_drain_inflight_rqs(hctx);
> +
> +	return 0;
> +}
> +
>   /*
>    * 'cpu' is going away. splice any existing rq_list entries from this
>    * software queue to the hw queue dispatch list, and ensure that it
> @@ -2221,6 +2279,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>   	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
>   	type = hctx->type;
>   
> +	if (test_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state))
> +		clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);

can you just always clear the bit without the test?
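
i.e. just:

	clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state);

clear_bit() on an already-clear bit is harmless, so the test_bit() looks 
redundant.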

> +
>   	spin_lock(&ctx->lock);
>   	if (!list_empty(&ctx->rq_lists[type])) {
>   		list_splice_init(&ctx->rq_lists[type], &tmp);
> @@ -2243,6 +2304,8 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
>   {
>   	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
>   					    &hctx->cpuhp_dead);
> +	cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
> +					    &hctx->cpuhp_online);
>   }
>   
>   /* hctx->ctxs will be freed in queue's release handler */
> @@ -2301,6 +2364,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
>   	hctx->queue_num = hctx_idx;
>   
>   	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
> +	cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
> +			&hctx->cpuhp_online);
>   
>   	hctx->tags = set->tags[hctx_idx];
>   
> @@ -3536,6 +3601,8 @@ static int __init blk_mq_init(void)
>   {
>   	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
>   				blk_mq_hctx_notify_dead);
> +	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
> +				NULL, blk_mq_hctx_notify_online);
>   	return 0;
>   }
>   subsys_initcall(blk_mq_init);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 911cdc6479dc..dc86bdac08f4 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -59,6 +59,7 @@ struct blk_mq_hw_ctx {
>   	atomic_t		nr_active;
>   
>   	struct hlist_node	cpuhp_dead;
> +	struct hlist_node	cpuhp_online;

I'd reorder with cpuhp_dead
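
i.e. keep the two cpuhp nodes together with the online node first (assuming 
that is what you had in mind, matching the hotplug order):

	struct hlist_node	cpuhp_online;
	struct hlist_node	cpuhp_dead;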

>   	struct kobject		kobj;
>   
>   	unsigned long		poll_considered;
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 87c211adf49e..5177f7bbcb88 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -147,6 +147,7 @@ enum cpuhp_state {
>   	CPUHP_AP_SMPBOOT_THREADS,
>   	CPUHP_AP_X86_VDSO_VMA_ONLINE,
>   	CPUHP_AP_IRQ_AFFINITY_ONLINE,
> +	CPUHP_AP_BLK_MQ_ONLINE,
>   	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
>   	CPUHP_AP_X86_INTEL_EPB_ONLINE,
>   	CPUHP_AP_PERF_ONLINE,
> 




Thread overview: 16+ messages
2019-07-12  2:47 [RFC PATCH 0/7] blk-mq: improvement on handling IO during CPU hotplug Ming Lei
2019-07-12  2:47 ` [RFC PATCH 1/7] blk-mq: add new state of BLK_MQ_S_INTERNAL_STOPPED Ming Lei
2019-07-12  2:47 ` [RFC PATCH 2/7] blk-mq: add blk-mq flag of BLK_MQ_F_NO_MANAGED_IRQ Ming Lei
2019-07-16  2:53   ` John Garry
2019-07-16  6:08     ` Ming Lei
2019-07-12  2:47 ` [RFC PATCH 3/7] blk-mq: stop to handle IO before hctx's all CPUs become offline Ming Lei
2019-07-12  9:03   ` Minwoo Im
2019-07-16  3:03   ` John Garry [this message]
2019-07-16  6:12     ` Ming Lei
2019-07-12  2:47 ` [RFC PATCH 4/7] blk-mq: add callback of .free_request Ming Lei
2019-07-12  2:47 ` [RFC PATCH 5/7] SCSI: implement .free_request callback Ming Lei
2019-07-12  2:47 ` [RFC PATCH 6/7] blk-mq: re-submit IO in case that hctx is dead Ming Lei
2019-07-12  2:47 ` [RFC PATCH 7/7] blk-mq: handle requests dispatched from IO scheduler " Ming Lei
2019-07-16  6:54 ` [RFC PATCH 0/7] blk-mq: improvement on handling IO during CPU hotplug John Garry
2019-07-16  7:18   ` Ming Lei
2019-07-16  8:13     ` John Garry
