* [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
@ 2023-06-20  1:33 Ming Lei
  2023-06-20  1:33 ` [PATCH V2 1/4] blk-mq: add API of blk_mq_unfreeze_queue_force Ming Lei
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-20  1:33 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu, Ming Lei

Hello,

The first three patches fix an IO hang that happens when controller removal
interrupts error recovery and the queue is left frozen.

The 4th patch fixes an IO hang that happens when the controller is left
quiesced.
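
Roughly, the two hangs look like this (simplified sketches, for
illustration only):

1) queue left frozen:
	writeback submits a bio -> bio_queue_enter() blocks on the frozen
	queue -> del_gendisk() waits forever for that writeback IO

2) queues left quiesced:
	pending IO (e.g. submitted from scan work) sits in the blk-mq
	queues and is never dispatched, so flushing scan work hangs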

V2:
	- don't unfreeze a queue which isn't frozen, to avoid the warning from
	  percpu_ref_resurrect()

Ming Lei (4):
  blk-mq: add API of blk_mq_unfreeze_queue_force
  nvme: add nvme_unfreeze_force()
  nvme: unfreeze queues before removing namespaces
  nvme: unquiesce io queues when removing namespaces

 block/blk-mq.c           | 28 +++++++++++++++++++++++++---
 block/blk.h              |  3 ++-
 block/genhd.c            |  2 +-
 drivers/nvme/host/core.c | 34 ++++++++++++++++++++++++++++------
 drivers/nvme/host/nvme.h |  1 +
 include/linux/blk-mq.h   |  1 +
 6 files changed, 58 insertions(+), 11 deletions(-)

-- 
2.40.1


* [PATCH V2 1/4] blk-mq: add API of blk_mq_unfreeze_queue_force
  2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
@ 2023-06-20  1:33 ` Ming Lei
  2023-06-20  1:33 ` [PATCH V2 2/4] nvme: add nvme_unfreeze_force() Ming Lei
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-20  1:33 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu, Ming Lei

NVMe calls freeze/unfreeze from different contexts, and controller removal
may interrupt in-progress error recovery, leaving queues in a frozen state.
This causes an IO hang in del_gendisk() because pending writeback IOs are
still waiting in bio_queue_enter().

Prepare for fixing this issue by adding blk_mq_unfreeze_queue_force(), which
will be called when removing the device.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c         | 28 +++++++++++++++++++++++++---
 block/blk.h            |  3 ++-
 block/genhd.c          |  2 +-
 include/linux/blk-mq.h |  1 +
 4 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 16c524e37123..4c02c28b4835 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -185,26 +185,48 @@ void blk_mq_freeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic,
+		bool force)
 {
 	mutex_lock(&q->mq_freeze_lock);
 	if (force_atomic)
 		q->q_usage_counter.data->force_atomic = true;
-	q->mq_freeze_depth--;
+	if (force) {
+		if (!q->mq_freeze_depth)
+			goto unlock;
+		q->mq_freeze_depth = 0;
+	} else
+		q->mq_freeze_depth--;
 	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 	if (!q->mq_freeze_depth) {
 		percpu_ref_resurrect(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+unlock:
 	mutex_unlock(&q->mq_freeze_lock);
 }
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	__blk_mq_unfreeze_queue(q, false);
+	__blk_mq_unfreeze_queue(q, false, false);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/*
+ * Forcibly unfreeze a queue
+ *
+ * Be careful: this API should only be used to avoid an IO hang in
+ * bio_queue_enter() when removing a disk that needs to drain pending
+ * writeback IO.
+ *
+ * Please don't use it for any other purpose.
+ */
+void blk_mq_unfreeze_queue_force(struct request_queue *q)
+{
+	__blk_mq_unfreeze_queue(q, false, true);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_force);
+
 /*
  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
  * mpt3sas driver such that this function can be removed.
diff --git a/block/blk.h b/block/blk.h
index 608c5dcc516b..6ecabd844e13 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -33,7 +33,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic, bool
+		force);
 void blk_queue_start_drain(struct request_queue *q);
 int __bio_queue_enter(struct request_queue *q, struct bio *bio);
 void submit_bio_noacct_nocheck(struct bio *bio);
diff --git a/block/genhd.c b/block/genhd.c
index f71f82991434..184aa968b453 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -708,7 +708,7 @@ void del_gendisk(struct gendisk *disk)
 	 */
 	if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
 		blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
-		__blk_mq_unfreeze_queue(q, true);
+		__blk_mq_unfreeze_queue(q, true, false);
 	} else {
 		if (queue_is_mq(q))
 			blk_mq_exit_queue(q);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f401067ac03a..fa265e85d753 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -890,6 +890,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
+void blk_mq_unfreeze_queue_force(struct request_queue *q);
 void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
-- 
2.40.1


* [PATCH V2 2/4] nvme: add nvme_unfreeze_force()
  2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
  2023-06-20  1:33 ` [PATCH V2 1/4] blk-mq: add API of blk_mq_unfreeze_queue_force Ming Lei
@ 2023-06-20  1:33 ` Ming Lei
  2023-06-20  1:33 ` [PATCH V2 3/4] nvme: unfreeze queues before removing namespaces Ming Lei
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-20  1:33 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu, Ming Lei

Add nvme_unfreeze_force() for fixing the IO hang seen when removing
namespaces interrupts error recovery.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c | 21 ++++++++++++++++++---
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 76e8f8b4098e..6b3f12368196 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4579,17 +4579,32 @@ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
-void nvme_unfreeze(struct nvme_ctrl *ctrl)
+static void __nvme_unfreeze(struct nvme_ctrl *ctrl, bool force)
 {
 	struct nvme_ns *ns;
 
 	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unfreeze_queue(ns->queue);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (force)
+			blk_mq_unfreeze_queue_force(ns->queue);
+		else
+			blk_mq_unfreeze_queue(ns->queue);
+	}
 	up_read(&ctrl->namespaces_rwsem);
 }
+
+void nvme_unfreeze(struct nvme_ctrl *ctrl)
+{
+	__nvme_unfreeze(ctrl, false);
+}
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
+void nvme_unfreeze_force(struct nvme_ctrl *ctrl)
+{
+	__nvme_unfreeze(ctrl, true);
+}
+EXPORT_SYMBOL_GPL(nvme_unfreeze_force);
+
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 78308f15e090..b583bab985c3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -765,6 +765,7 @@ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
 void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
+void nvme_unfreeze_force(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
-- 
2.40.1


* [PATCH V2 3/4] nvme: unfreeze queues before removing namespaces
  2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
  2023-06-20  1:33 ` [PATCH V2 1/4] blk-mq: add API of blk_mq_unfreeze_queue_force Ming Lei
  2023-06-20  1:33 ` [PATCH V2 2/4] nvme: add nvme_unfreeze_force() Ming Lei
@ 2023-06-20  1:33 ` Ming Lei
  2023-06-20  1:33 ` [PATCH V2 4/4] nvme: unquiesce io queues when " Ming Lei
  2023-06-20 10:04 ` [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Sagi Grimberg
  4 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-20  1:33 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu, Ming Lei

If removal interrupts error recovery, queues may be left frozen, there may
be pending IOs stuck in bio_queue_enter(), and the following del_gendisk()
may wait for these IOs, especially those from writeback.

A similar IO hang exists in flushing scan work too, if there are pending
IOs in the scan work context.

Fix this kind of issue by unfreezing the queues before removing namespaces,
so that all pending IOs can be handled.

Reported-by: Chunguang Xu <brookxu.cn@gmail.com>
Closes: https://lore.kernel.org/linux-nvme/cover.1685350577.git.chunguang.xu@shopee.com/
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6b3f12368196..7d8ff58660ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4002,6 +4002,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	 */
 	nvme_mpath_clear_ctrl_paths(ctrl);
 
+	/* unfreeze queues which may be frozen from error recovery */
+	nvme_unfreeze_force(ctrl);
+
 	/* prevent racing with ns scanning */
 	flush_work(&ctrl->scan_work);
 
-- 
2.40.1


* [PATCH V2 4/4] nvme: unquiesce io queues when removing namespaces
  2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
                   ` (2 preceding siblings ...)
  2023-06-20  1:33 ` [PATCH V2 3/4] nvme: unfreeze queues before removing namespaces Ming Lei
@ 2023-06-20  1:33 ` Ming Lei
  2023-06-20 10:04 ` [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Sagi Grimberg
  4 siblings, 0 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-20  1:33 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu, Ming Lei

Error recovery can be interrupted by controller removal, leaving the
controller quiesced and causing an IO hang.

Fix the issue by unconditionally unquiescing the IO queues when removing
namespaces.

Reported-by: Chunguang Xu <brookxu.cn@gmail.com>
Closes: https://lore.kernel.org/linux-nvme/cover.1685350577.git.chunguang.xu@shopee.com/
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7d8ff58660ee..de910badcdfc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4005,6 +4005,12 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* unfreeze queues which may be frozen from error recovery */
 	nvme_unfreeze_force(ctrl);
 
+	/*
+	 * Unquiesce io queues so any pending IO won't hang, especially
+	 * those submitted from scan work
+	 */
+	nvme_unquiesce_io_queues(ctrl);
+
 	/* prevent racing with ns scanning */
 	flush_work(&ctrl->scan_work);
 
@@ -4014,10 +4020,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	 * removing the namespaces' disks; fail all the queues now to avoid
 	 * potentially having to clean up the failed sync later.
 	 */
-	if (ctrl->state == NVME_CTRL_DEAD) {
+	if (ctrl->state == NVME_CTRL_DEAD)
 		nvme_mark_namespaces_dead(ctrl);
-		nvme_unquiesce_io_queues(ctrl);
-	}
 
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
-- 
2.40.1


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
                   ` (3 preceding siblings ...)
  2023-06-20  1:33 ` [PATCH V2 4/4] nvme: unquiesce io queues when " Ming Lei
@ 2023-06-20 10:04 ` Sagi Grimberg
  2023-06-20 13:23   ` Ming Lei
  4 siblings, 1 reply; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-20 10:04 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme
  Cc: Yi Zhang, linux-block, Chunguang Xu


> Hello,
> 
> The 1st three patch fixes io hang when controller removal interrupts error
> recovery, then queue is left as frozen.
> 
> The 4th patch fixes io hang when controller is left as unquiesce.

Ming, what happened to the nvme-tcp/rdma move of freeze/unfreeze to the
connect patches?

I think that these are patches that should go in regardless, because
they also fix I/O failover being blocked on request queue enter.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-20 10:04 ` [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Sagi Grimberg
@ 2023-06-20 13:23   ` Ming Lei
  2023-06-20 13:40     ` Sagi Grimberg
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-20 13:23 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
	linux-block, Chunguang Xu

On Tue, Jun 20, 2023 at 01:04:33PM +0300, Sagi Grimberg wrote:
> 
> > Hello,
> > 
> > The 1st three patch fixes io hang when controller removal interrupts error
> > recovery, then queue is left as frozen.
> > 
> > The 4th patch fixes io hang when controller is left as unquiesce.
> 
> Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
> connect patches?

I'd suggest handling all drivers (including nvme-pci) with the same logic to
avoid extra maintenance burden wrt. error handling, but it looks like Keith
worries that the delayed freezing may cause too many requests to be queued
during error handling, and that might lead to user reports.


Thanks, 
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-20 13:23   ` Ming Lei
@ 2023-06-20 13:40     ` Sagi Grimberg
  2023-06-21  0:09       ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-20 13:40 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
	linux-block, Chunguang Xu


>>> Hello,
>>>
>>> The 1st three patch fixes io hang when controller removal interrupts error
>>> recovery, then queue is left as frozen.
>>>
>>> The 4th patch fixes io hang when controller is left as unquiesce.
>>
>> Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
>> connect patches?
> 
> I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
> extra maintain burden wrt. error handling, but looks Keith worries about the
> delay freezing may cause too many requests queued during error handling, and
> that might cause user report.

For nvme-tcp/rdma, your patch also addresses IOs not failing over because
they block on queue enter. So I definitely want this for fabrics.

AFAICT nvme-pci would also want to fail over asap for dual-ported
multipathed devices; not sure if this is something that we are
interested in optimizing though, as pci either succeeds the reset
or removes the gendisk. But the time frame is different for fabrics
for sure.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-20 13:40     ` Sagi Grimberg
@ 2023-06-21  0:09       ` Ming Lei
  2023-06-21 10:13         ` Sagi Grimberg
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-21  0:09 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
	linux-block, Chunguang Xu

On Tue, Jun 20, 2023 at 04:40:49PM +0300, Sagi Grimberg wrote:
> 
> > > > Hello,
> > > > 
> > > > The 1st three patch fixes io hang when controller removal interrupts error
> > > > recovery, then queue is left as frozen.
> > > > 
> > > > The 4th patch fixes io hang when controller is left as unquiesce.
> > > 
> > > Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
> > > connect patches?
> > 
> > I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
> > extra maintain burden wrt. error handling, but looks Keith worries about the
> > delay freezing may cause too many requests queued during error handling, and
> > that might cause user report.
> 
> For nvme-tcp/rdma your patch also addresses IO not failing over because
> they block on queue enter. So I definitely want this for fabrics.

The patch in the following link should fix these issues too:

https://lore.kernel.org/linux-block/ZJGmW7lEaipT6saa@ovpn-8-23.pek2.redhat.com/T/#u

I guess you still want the paired freeze patch because it makes freeze &
unfreeze more reliable in error handling. If so, I can make a fabrics-only
change for you.


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-21  0:09       ` Ming Lei
@ 2023-06-21 10:13         ` Sagi Grimberg
  2023-06-21 13:27           ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-21 10:13 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
	linux-block, Chunguang Xu


>>>>> Hello,
>>>>>
>>>>> The 1st three patch fixes io hang when controller removal interrupts error
>>>>> recovery, then queue is left as frozen.
>>>>>
>>>>> The 4th patch fixes io hang when controller is left as unquiesce.
>>>>
>>>> Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
>>>> connect patches?
>>>
>>> I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
>>> extra maintain burden wrt. error handling, but looks Keith worries about the
>>> delay freezing may cause too many requests queued during error handling, and
>>> that might cause user report.
>>
>> For nvme-tcp/rdma your patch also addresses IO not failing over because
>> they block on queue enter. So I definitely want this for fabrics.
> 
> The patch in the following link should fix these issues too:
> 
> https://lore.kernel.org/linux-block/ZJGmW7lEaipT6saa@ovpn-8-23.pek2.redhat.com/T/#u
> 
> I guess you still want the paired freeze patch because it makes freeze &
> unfreeze more reliable in error handling. If yes, I can make one fabric
> only change for you.

Not sure exactly what reliability is being referred to here. I agree that
there is an issue with controller delete during error recovery. The patch
was a way to side-step it, great. But it addressed I/O blocked on queue
enter, not failing over.

So yes, for fabrics we should have it. I would argue that it would be
the right thing to do for pci as well. But I won't argue if Keith feels
otherwise.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-21 10:13         ` Sagi Grimberg
@ 2023-06-21 13:27           ` Ming Lei
  2023-06-21 15:48             ` Keith Busch
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-21 13:27 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
	linux-block, Chunguang Xu

On Wed, Jun 21, 2023 at 01:13:05PM +0300, Sagi Grimberg wrote:
> 
> > > > > > Hello,
> > > > > > 
> > > > > > The 1st three patch fixes io hang when controller removal interrupts error
> > > > > > recovery, then queue is left as frozen.
> > > > > > 
> > > > > > The 4th patch fixes io hang when controller is left as unquiesce.
> > > > > 
> > > > > Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
> > > > > connect patches?
> > > > 
> > > > I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
> > > > extra maintain burden wrt. error handling, but looks Keith worries about the
> > > > delay freezing may cause too many requests queued during error handling, and
> > > > that might cause user report.
> > > 
> > > For nvme-tcp/rdma your patch also addresses IO not failing over because
> > > they block on queue enter. So I definitely want this for fabrics.
> > 
> > The patch in the following link should fix these issues too:
> > 
> > https://lore.kernel.org/linux-block/ZJGmW7lEaipT6saa@ovpn-8-23.pek2.redhat.com/T/#u
> > 
> > I guess you still want the paired freeze patch because it makes freeze &
> > unfreeze more reliable in error handling. If yes, I can make one fabric
> > only change for you.
> 
> Not sure exactly what reliability is referred here.

freeze and unfreeze have to be called strictly in pairs, but nvme calls
the two from different contexts, so the unfreeze can easily be missed,
leaving a mismatched freeze count. There have been many such reports so far.
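
Schematically (illustrative only, not the exact driver code; nr_io_queues
is just a placeholder here):

	/* error handling path */
	nvme_start_freeze(ctrl);	/* freeze count raised */
	/* ... teardown; the recovery path is expected to finish the job ... */

	/* recovery completion path */
	nvme_wait_freeze(ctrl);
	blk_mq_update_nr_hw_queues(ctrl->tagset, nr_io_queues);
	nvme_unfreeze(ctrl);		/* the only matching unfreeze */

If controller removal interrupts things between the two paths, the matching
unfreeze never runs and the freeze count stays elevated.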

> I agree that there
> is an issue with controller delete during error recovery. The patch
> was a way to side-step it, great. But it addressed I/O blocked on enter
> and not failing over.
> 
> So yes, for fabrics we should have it. I would argue that it would be
> the right thing to do for pci as well. But I won't argue if Keith feels
> otherwise.

Keith, can you let us know whether you are fine with moving
nvme_start_freeze() into nvme_reset_work() for the nvme pci driver?


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-21 13:27           ` Ming Lei
@ 2023-06-21 15:48             ` Keith Busch
  2023-06-22 13:51               ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Keith Busch @ 2023-06-21 15:48 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Wed, Jun 21, 2023 at 09:27:31PM +0800, Ming Lei wrote:
> On Wed, Jun 21, 2023 at 01:13:05PM +0300, Sagi Grimberg wrote:
> > 
> > > > > > > Hello,
> > > > > > > 
> > > > > > > The 1st three patch fixes io hang when controller removal interrupts error
> > > > > > > recovery, then queue is left as frozen.
> > > > > > > 
> > > > > > > The 4th patch fixes io hang when controller is left as unquiesce.
> > > > > > 
> > > > > > Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
> > > > > > connect patches?
> > > > > 
> > > > > I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
> > > > > extra maintain burden wrt. error handling, but looks Keith worries about the
> > > > > delay freezing may cause too many requests queued during error handling, and
> > > > > that might cause user report.
> > > > 
> > > > For nvme-tcp/rdma your patch also addresses IO not failing over because
> > > > they block on queue enter. So I definitely want this for fabrics.
> > > 
> > > The patch in the following link should fix these issues too:
> > > 
> > > https://lore.kernel.org/linux-block/ZJGmW7lEaipT6saa@ovpn-8-23.pek2.redhat.com/T/#u
> > > 
> > > I guess you still want the paired freeze patch because it makes freeze &
> > > unfreeze more reliable in error handling. If yes, I can make one fabric
> > > only change for you.
> > 
> > Not sure exactly what reliability is referred here.
> 
> freeze and unfreeze have to be called strictly in pair, but nvme calls
> the two from different contexts, so unfreeze may easily be missed, and
> causes mismatched freeze count. There has many such reports so far.
> 
> > I agree that there
> > is an issue with controller delete during error recovery. The patch
> > was a way to side-step it, great. But it addressed I/O blocked on enter
> > and not failing over.
> > 
> > So yes, for fabrics we should have it. I would argue that it would be
> > the right thing to do for pci as well. But I won't argue if Keith feels
> > otherwise.
> 
> Keith, can you update with us if you are fine with moving
> nvme_start_freeze() into nvme_reset_work() for nvme pci driver?

The point was to keep requests from entering while the hctx's are
being reconfigured. If you're going to pair up the freezes as you've
suggested, we might as well just not call freeze at all.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-21 15:48             ` Keith Busch
@ 2023-06-22 13:51               ` Ming Lei
  2023-06-22 14:35                 ` Keith Busch
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-22 13:51 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Wed, Jun 21, 2023 at 09:48:49AM -0600, Keith Busch wrote:
> On Wed, Jun 21, 2023 at 09:27:31PM +0800, Ming Lei wrote:
> > On Wed, Jun 21, 2023 at 01:13:05PM +0300, Sagi Grimberg wrote:
> > > 
> > > > > > > > Hello,
> > > > > > > > 
> > > > > > > > The 1st three patch fixes io hang when controller removal interrupts error
> > > > > > > > recovery, then queue is left as frozen.
> > > > > > > > 
> > > > > > > > The 4th patch fixes io hang when controller is left as unquiesce.
> > > > > > > 
> > > > > > > Ming, what happened to nvme-tcp/rdma move of freeze/unfreeze to the
> > > > > > > connect patches?
> > > > > > 
> > > > > > I'd suggest to handle all drivers(include nvme-pci) in same logic for avoiding
> > > > > > extra maintain burden wrt. error handling, but looks Keith worries about the
> > > > > > delay freezing may cause too many requests queued during error handling, and
> > > > > > that might cause user report.
> > > > > 
> > > > > For nvme-tcp/rdma your patch also addresses IO not failing over because
> > > > > they block on queue enter. So I definitely want this for fabrics.
> > > > 
> > > > The patch in the following link should fix these issues too:
> > > > 
> > > > https://lore.kernel.org/linux-block/ZJGmW7lEaipT6saa@ovpn-8-23.pek2.redhat.com/T/#u
> > > > 
> > > > I guess you still want the paired freeze patch because it makes freeze &
> > > > unfreeze more reliable in error handling. If yes, I can make one fabric
> > > > only change for you.
> > > 
> > > Not sure exactly what reliability is referred here.
> > 
> > freeze and unfreeze have to be called strictly in pair, but nvme calls
> > the two from different contexts, so unfreeze may easily be missed, and
> > causes mismatched freeze count. There has many such reports so far.
> > 
> > > I agree that there
> > > is an issue with controller delete during error recovery. The patch
> > > was a way to side-step it, great. But it addressed I/O blocked on enter
> > > and not failing over.
> > > 
> > > So yes, for fabrics we should have it. I would argue that it would be
> > > the right thing to do for pci as well. But I won't argue if Keith feels
> > > otherwise.
> > 
> > Keith, can you update with us if you are fine with moving
> > nvme_start_freeze() into nvme_reset_work() for nvme pci driver?
> 
> The point was to contain requests from entering while the hctx's are
> being reconfigured. If you're going to pair up the freezes as you've
> suggested, we might as well just not call freeze at all.

blk_mq_update_nr_hw_queues() requires the queues to be frozen.


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-22 13:51               ` Ming Lei
@ 2023-06-22 14:35                 ` Keith Busch
  2023-06-22 14:53                   ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Keith Busch @ 2023-06-22 14:35 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Thu, Jun 22, 2023 at 09:51:12PM +0800, Ming Lei wrote:
> On Wed, Jun 21, 2023 at 09:48:49AM -0600, Keith Busch wrote:
> > The point was to contain requests from entering while the hctx's are
> > being reconfigured. If you're going to pair up the freezes as you've
> > suggested, we might as well just not call freeze at all.
> 
> blk_mq_update_nr_hw_queues() requires queue to be frozen.

It's too late at that point. Let's work through a real example. You'll
need a system that has more CPUs than your nvme has IO queues.

Boot without any special nvme parameters. Every possible nvme IO queue
will be assigned "default" hctx type. Now start IO to every queue, then
run:

  # echo 8 > /sys/module/nvme/parameters/poll_queues && echo 1 > /sys/class/nvme/nvme0/reset_controller

Today, we freeze prior to tearing down the "default" IO queues, so
nothing enters them while the driver reconfigures the queues.

What you're suggesting will allow IO to queue up in a quiesced "default"
queue, which will become "polled" without an interrupt handler on the
other side of the reset. The application doesn't know that, so the IO
you're allowing to queue up will time out.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-22 14:35                 ` Keith Busch
@ 2023-06-22 14:53                   ` Ming Lei
  2023-06-22 15:19                     ` Keith Busch
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-22 14:53 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Thu, Jun 22, 2023 at 08:35:49AM -0600, Keith Busch wrote:
> On Thu, Jun 22, 2023 at 09:51:12PM +0800, Ming Lei wrote:
> > On Wed, Jun 21, 2023 at 09:48:49AM -0600, Keith Busch wrote:
> > > The point was to contain requests from entering while the hctx's are
> > > being reconfigured. If you're going to pair up the freezes as you've
> > > suggested, we might as well just not call freeze at all.
> > 
> > blk_mq_update_nr_hw_queues() requires queue to be frozen.
> 
> It's too late at that point. Let's work through a real example. You'll
> need a system that has more CPU's than your nvme has IO queues.
> 
> Boot without any special nvme parameters. Every possible nvme IO queue
> will be assigned "default" hctx type. Now start IO to every queue, then
> run:
> 
>   # echo 8 > /sys/modules/nvme/parameters/poll_queues && echo 1 > /sys/class/nvme/nvme0/reset_controller
> 
> Today, we freeze prior to tearing down the "default" IO queues, so
> there's nothing entered into them while the driver reconfigures the
> queues.

nvme_start_freeze() just prevents new IO from being queued, while old ones
may still be entering the block layer queue; what matters here is actually
quiesce, which prevents IO from being dispatched to the driver/hardware.

> 
> What you're suggesting will allow IO to queue up in a queisced "default"
> queue, which will become "polled" without an interrupt hanlder on the
> other side of the reset. The application doesn't know that, so the IO
> you're allowing to queue up will time out.

A timeout only happens after the request is queued to the driver/hardware,
that is, after blk_mq_start_request() is called in nvme_queue_rq(), but
quiesce prevents new IOs from being dispatched to the driver or queued via
.queue_rq(); meanwhile old requests have been canceled, so no request can be
timed out.
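
(Simplified view of the ordering in question, for illustration only:)

	nvme_queue_rq()
	    -> blk_mq_start_request(req)  /* request enters the timeout machinery here */
	    -> submit the command to the device / fabric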


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-22 14:53                   ` Ming Lei
@ 2023-06-22 15:19                     ` Keith Busch
  2023-06-25  0:26                       ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Keith Busch @ 2023-06-22 15:19 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Thu, Jun 22, 2023 at 10:53:05PM +0800, Ming Lei wrote:
> On Thu, Jun 22, 2023 at 08:35:49AM -0600, Keith Busch wrote:
> > On Thu, Jun 22, 2023 at 09:51:12PM +0800, Ming Lei wrote:
> > > On Wed, Jun 21, 2023 at 09:48:49AM -0600, Keith Busch wrote:
> > > > The point was to contain requests from entering while the hctx's are
> > > > being reconfigured. If you're going to pair up the freezes as you've
> > > > suggested, we might as well just not call freeze at all.
> > > 
> > > blk_mq_update_nr_hw_queues() requires queue to be frozen.
> > 
> > It's too late at that point. Let's work through a real example. You'll
> > need a system that has more CPU's than your nvme has IO queues.
> > 
> > Boot without any special nvme parameters. Every possible nvme IO queue
> > will be assigned "default" hctx type. Now start IO to every queue, then
> > run:
> > 
> >   # echo 8 > /sys/modules/nvme/parameters/poll_queues && echo 1 > /sys/class/nvme/nvme0/reset_controller
> > 
> > Today, we freeze prior to tearing down the "default" IO queues, so
> > there's nothing entered into them while the driver reconfigures the
> > queues.
> 
> nvme_start_freeze() just prevents new IO from being queued, and old ones
> may still be entering block layer queue, and what matters here is
> actually quiesce, which prevents new IO from being queued to
> driver/hardware.
> 
> > 
> > What you're suggesting will allow IO to queue up in a queisced "default"
> > queue, which will become "polled" without an interrupt hanlder on the
> > other side of the reset. The application doesn't know that, so the IO
> > you're allowing to queue up will time out.
> 
> time out only happens after the request is queued to driver/hardware, or after
> blk_mq_start_request() is called in nvme_queue_rq(), but quiesce actually
> prevents new IOs from being dispatched to driver or be queued via .queue_rq(),
> meantime old requests have been canceled, so no any request can be
> timed out.

Quiesce doesn't prevent requests from entering an hctx, and you can't
back one out to put it on another hctx later. It doesn't matter that you
haven't dispatched it to hardware yet. The request's queue was set the
moment it was allocated, so after you unquiesce and freeze for the new
queue mapping, the requests previously blocked on quiesce will time out
in the scenario I've described.

There are certainly gaps in the existing code where errored requests
can be requeued or stuck elsewhere and hit the exact same problem, but
the current way at least tries to contain it.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-22 15:19                     ` Keith Busch
@ 2023-06-25  0:26                       ` Ming Lei
  2023-06-25  8:09                         ` Sagi Grimberg
  2023-06-27 17:21                         ` Keith Busch
  0 siblings, 2 replies; 26+ messages in thread
From: Ming Lei @ 2023-06-25  0:26 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Thu, Jun 22, 2023 at 09:19:19AM -0600, Keith Busch wrote:
> On Thu, Jun 22, 2023 at 10:53:05PM +0800, Ming Lei wrote:
> > On Thu, Jun 22, 2023 at 08:35:49AM -0600, Keith Busch wrote:
> > > On Thu, Jun 22, 2023 at 09:51:12PM +0800, Ming Lei wrote:
> > > > On Wed, Jun 21, 2023 at 09:48:49AM -0600, Keith Busch wrote:
> > > > > The point was to contain requests from entering while the hctx's are
> > > > > being reconfigured. If you're going to pair up the freezes as you've
> > > > > suggested, we might as well just not call freeze at all.
> > > > 
> > > > blk_mq_update_nr_hw_queues() requires queue to be frozen.
> > > 
> > > It's too late at that point. Let's work through a real example. You'll
> > > need a system that has more CPU's than your nvme has IO queues.
> > > 
> > > Boot without any special nvme parameters. Every possible nvme IO queue
> > > will be assigned "default" hctx type. Now start IO to every queue, then
> > > run:
> > > 
> > >   # echo 8 > /sys/modules/nvme/parameters/poll_queues && echo 1 > /sys/class/nvme/nvme0/reset_controller
> > > 
> > > Today, we freeze prior to tearing down the "default" IO queues, so
> > > there's nothing entered into them while the driver reconfigures the
> > > queues.
> > 
> > nvme_start_freeze() just prevents new IO from being queued, and old ones
> > may still be entering block layer queue, and what matters here is
> > actually quiesce, which prevents new IO from being queued to
> > driver/hardware.
> > 
> > > 
> > > What you're suggesting will allow IO to queue up in a queisced "default"
> > > queue, which will become "polled" without an interrupt hanlder on the
> > > other side of the reset. The application doesn't know that, so the IO
> > > you're allowing to queue up will time out.
> > 
> > time out only happens after the request is queued to driver/hardware, or after
> > blk_mq_start_request() is called in nvme_queue_rq(), but quiesce actually
> > prevents new IOs from being dispatched to driver or be queued via .queue_rq(),
> > meantime old requests have been canceled, so no any request can be
> > timed out.
> 
> Quiesce doesn't prevent requests from entering an hctx, and you can't
> back it out to put on another hctx later. It doesn't matter that you
> haven't dispatched it to hardware yet. The request's queue was set the
> moment it was allocated, so after you unquiesce and freeze for the new
> queue mapping, the requests previously blocked on quiesce will time out
> in the scenario I've described.
> 
> There are certainly gaps in the existing code where error'ed requests
> can be requeued or stuck elsewhere and hit the exact same problem, but
> the current way at least tries to contain it.

Yeah, but you can't close the gap at all with start_freeze; that said,
the current code has to live with the situation of a new mapping change
while old requests still carry the old mapping.

Actually I considered handling this kind of situation before; one approach
is to reuse the bio-stealing logic taken in nvme mpath:

1) for FS IO, re-submit the bios and free the request

2) for PT requests, simply fail them

It could be a bit violent for 2), even though REQ_FAILFAST_DRIVER is
always set for PT requests, but I don't see any better approach for
handling PT requests.
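
A very rough sketch of what that could look like in ->queue_rq() (names
and placement are for illustration only, not actual code):

	if (unlikely(ctrl->state == NVME_CTRL_CONNECTING)) {
		if (blk_rq_is_passthrough(req))
			return BLK_STS_IOERR;	/* 2) simply fail PT requests */
		/*
		 * 1) for FS IO: end the request and re-submit its bios,
		 * similar to the nvme multipath requeue path
		 */
	}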


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-25  0:26                       ` Ming Lei
@ 2023-06-25  8:09                         ` Sagi Grimberg
  2023-06-27 17:21                         ` Keith Busch
  1 sibling, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-25  8:09 UTC (permalink / raw)
  To: Ming Lei, Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, linux-nvme, Yi Zhang, linux-block,
	Chunguang Xu


>>>>>> The point was to contain requests from entering while the hctx's are
>>>>>> being reconfigured. If you're going to pair up the freezes as you've
>>>>>> suggested, we might as well just not call freeze at all.
>>>>>
>>>>> blk_mq_update_nr_hw_queues() requires queue to be frozen.
>>>>
>>>> It's too late at that point. Let's work through a real example. You'll
>>>> need a system that has more CPU's than your nvme has IO queues.
>>>>
>>>> Boot without any special nvme parameters. Every possible nvme IO queue
>>>> will be assigned "default" hctx type. Now start IO to every queue, then
>>>> run:
>>>>
>>>>    # echo 8 > /sys/modules/nvme/parameters/poll_queues && echo 1 > /sys/class/nvme/nvme0/reset_controller
>>>>
>>>> Today, we freeze prior to tearing down the "default" IO queues, so
>>>> there's nothing entered into them while the driver reconfigures the
>>>> queues.
>>>
>>> nvme_start_freeze() just prevents new IO from being queued, and old ones
>>> may still be entering block layer queue, and what matters here is
>>> actually quiesce, which prevents new IO from being queued to
>>> driver/hardware.
>>>
>>>>
>>>> What you're suggesting will allow IO to queue up in a queisced "default"
>>>> queue, which will become "polled" without an interrupt hanlder on the
>>>> other side of the reset. The application doesn't know that, so the IO
>>>> you're allowing to queue up will time out.
>>>
>>> time out only happens after the request is queued to driver/hardware, or after
>>> blk_mq_start_request() is called in nvme_queue_rq(), but quiesce actually
>>> prevents new IOs from being dispatched to driver or be queued via .queue_rq(),
>>> meantime old requests have been canceled, so no any request can be
>>> timed out.
>>
>> Quiesce doesn't prevent requests from entering an hctx, and you can't
>> back it out to put on another hctx later. It doesn't matter that you
>> haven't dispatched it to hardware yet. The request's queue was set the
>> moment it was allocated, so after you unquiesce and freeze for the new
>> queue mapping, the requests previously blocked on quiesce will time out
>> in the scenario I've described.
>>
>> There are certainly gaps in the existing code where error'ed requests
>> can be requeued or stuck elsewhere and hit the exact same problem, but
>> the current way at least tries to contain it.
> 
> Yeah, but you can't remove the gap at all with start_freeze, that said
> the current code has to live with the situation of new mapping change
> and old request with old mapping.
> 
> Actually I considered to handle this kind of situation before, one approach
> is to reuse the bio steal logic taken in nvme mpath:
> 
> 1) for FS IO, re-submit bios, meantime free request
> 
> 2) for PT request, simply fail it
> 
> It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
> always set for PT request, but not see any better approach for handling
> PT request.

Ming,

I suggest submitting the patches for tcp/rdma and continuing the discussion
on the pci driver.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-25  0:26                       ` Ming Lei
  2023-06-25  8:09                         ` Sagi Grimberg
@ 2023-06-27 17:21                         ` Keith Busch
  2023-06-27 21:15                           ` Sagi Grimberg
                                             ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Keith Busch @ 2023-06-27 17:21 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Sun, Jun 25, 2023 at 08:26:48AM +0800, Ming Lei wrote:
> Yeah, but you can't remove the gap at all with start_freeze, that said
> the current code has to live with the situation of new mapping change
> and old request with old mapping.
> 
> Actually I considered to handle this kind of situation before, one approach
> is to reuse the bio steal logic taken in nvme mpath:
> 
> 1) for FS IO, re-submit bios, meantime free request
> 
> 2) for PT request, simply fail it
> 
> It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
> always set for PT request, but not see any better approach for handling
> PT request.

I think that's acceptable for PT requests, or any request that doesn't
have a bio. I tried something similar a while back that was almost
working, but I either never posted it, or it's in that window when
infradead lost all the emails. :(

Anyway, for the pci controller, I think I see the problem you're fixing.
When reset_work fails, we used to do the mark dead + unquiesce via
"nvme_kill_queues()", which doesn't exist anymore, but I think your
scenario worked back then. Currently a failed nvme_reset_work simply
marks them dead without the unquiesce. Would it be enough to just bring
that unquiesce behavior back?

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b027e5e3f4acb..8eaa954aa6ed4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work)
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_dev_disable(dev, true);
 	nvme_mark_namespaces_dead(&dev->ctrl);
+	nvme_unquiesce_io_queues(&dev->ctrl);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
 }
 
--

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-27 17:21                         ` Keith Busch
@ 2023-06-27 21:15                           ` Sagi Grimberg
  2023-06-28  1:30                           ` Ming Lei
  2023-06-28  7:06                           ` Ming Lei
  2 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-27 21:15 UTC (permalink / raw)
  To: Keith Busch, Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-nvme, Yi Zhang, linux-block,
	Chunguang Xu


>> Yeah, but you can't remove the gap at all with start_freeze, that said
>> the current code has to live with the situation of new mapping change
>> and old request with old mapping.
>>
>> Actually I considered to handle this kind of situation before, one approach
>> is to reuse the bio steal logic taken in nvme mpath:
>>
>> 1) for FS IO, re-submit bios, meantime free request
>>
>> 2) for PT request, simply fail it
>>
>> It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
>> always set for PT request, but not see any better approach for handling
>> PT request.
> 
> I think that's acceptable for PT requests, or any request that doesn't
> have a bio. I tried something similiar a while back that was almost
> working, but I neither never posted it, or it's in that window when
> infradead lost all the emails. :(
> 
> Anyway, for the pci controller, I think I see the problem you're fixing.
> When reset_work fails, we used to do the mark dead + unquieces via
> "nvme_kill_queues()", which doesn't exist anymore, but I think your
> scenario worked back then. Currently a failed nvme_reset_work simply
> marks them dead without the unquiesce. Would it be enough to just bring
> that unqueisce behavior back?
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index b027e5e3f4acb..8eaa954aa6ed4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work)
>   	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
>   	nvme_dev_disable(dev, true);
>   	nvme_mark_namespaces_dead(&dev->ctrl);
> +	nvme_unquiesce_io_queues(&dev->ctrl);
>   	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
>   }
>   
> --

I think this should work.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-27 17:21                         ` Keith Busch
  2023-06-27 21:15                           ` Sagi Grimberg
@ 2023-06-28  1:30                           ` Ming Lei
  2023-06-28 14:35                             ` Keith Busch
  2023-06-28  7:06                           ` Ming Lei
  2 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-28  1:30 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu, ming.lei

On Tue, Jun 27, 2023 at 11:21:36AM -0600, Keith Busch wrote:
> On Sun, Jun 25, 2023 at 08:26:48AM +0800, Ming Lei wrote:
> > Yeah, but you can't remove the gap at all with start_freeze, that said
> > the current code has to live with the situation of new mapping change
> > and old request with old mapping.
> > 
> > Actually I considered to handle this kind of situation before, one approach
> > is to reuse the bio steal logic taken in nvme mpath:
> > 
> > 1) for FS IO, re-submit bios, meantime free request
> > 
> > 2) for PT request, simply fail it
> > 
> > It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
> > always set for PT request, but not see any better approach for handling
> > PT request.
> 
> I think that's acceptable for PT requests, or any request that doesn't
> have a bio. I tried something similiar a while back that was almost
> working, but I neither never posted it, or it's in that window when
> infradead lost all the emails. :(
> 
> Anyway, for the pci controller, I think I see the problem you're fixing.
> When reset_work fails, we used to do the mark dead + unquieces via
> "nvme_kill_queues()", which doesn't exist anymore, but I think your
> scenario worked back then. Currently a failed nvme_reset_work simply
> marks them dead without the unquiesce. Would it be enough to just bring
> that unqueisce behavior back?
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index b027e5e3f4acb..8eaa954aa6ed4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work)
>  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
>  	nvme_dev_disable(dev, true);
>  	nvme_mark_namespaces_dead(&dev->ctrl);
> +	nvme_unquiesce_io_queues(&dev->ctrl);
>  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
>  }

That may not be enough:

- What if nvme_sysfs_delete() is called from sysfs before the 1st check in
nvme_reset_work()?

- What if a pending nvme_dev_disable() <- nvme_timeout() comes after
the added nvme_unquiesce_io_queues() returns?


Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-27 17:21                         ` Keith Busch
  2023-06-27 21:15                           ` Sagi Grimberg
  2023-06-28  1:30                           ` Ming Lei
@ 2023-06-28  7:06                           ` Ming Lei
  2023-06-28  7:26                             ` Sagi Grimberg
  2 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-28  7:06 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu, ming.lei

On Tue, Jun 27, 2023 at 11:21:36AM -0600, Keith Busch wrote:
> On Sun, Jun 25, 2023 at 08:26:48AM +0800, Ming Lei wrote:
> > Yeah, but you can't remove the gap at all with start_freeze, that said
> > the current code has to live with the situation of new mapping change
> > and old request with old mapping.
> > 
> > Actually I considered to handle this kind of situation before, one approach
> > is to reuse the bio steal logic taken in nvme mpath:
> > 
> > 1) for FS IO, re-submit bios, meantime free request
> > 
> > 2) for PT request, simply fail it
> > 
> > It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
> > always set for PT request, but not see any better approach for handling
> > PT request.
> 
> I think that's acceptable for PT requests, or any request that doesn't
> have a bio. I tried something similiar a while back that was almost
> working, but I neither never posted it, or it's in that window when
> infradead lost all the emails. :(

If you are fine with failing PT requests, I'd suggest handling the
problem in the following way:

1) move freeze into reset

2) during resetting:

- freeze NS queues
- unquiesce NS queues
- nvme_wait_freeze()
- update_nr_hw_queues
- unfreeze NS queues

3) meanwhile, change the driver's ->queue_rq() so that, when the ctrl state
is NVME_CTRL_CONNECTING:

- if the request is FS IO with data, re-submit all bios of this request
  and free the request

- otherwise, fail the request

This way, not only is freeze paired with unfreeze. More importantly, it
becomes impossible to trigger a new timeout while handling
NVME_CTRL_CONNECTING, so the fallback to ctrl removal can be avoided.
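
A rough sketch of steps 1)/2), reusing the existing helpers (illustrative
only; nr_io_queues is just a placeholder):

	/* inside reset/error recovery */
	nvme_start_freeze(ctrl);
	nvme_unquiesce_io_queues(ctrl);	/* let queued IO be handled per 3) */
	nvme_wait_freeze(ctrl);
	blk_mq_update_nr_hw_queues(ctrl->tagset, nr_io_queues);
	nvme_unfreeze(ctrl);		/* paired with the freeze above */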

Any comment on this approach?

Thanks,
Ming


* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-28  7:06                           ` Ming Lei
@ 2023-06-28  7:26                             ` Sagi Grimberg
  0 siblings, 0 replies; 26+ messages in thread
From: Sagi Grimberg @ 2023-06-28  7:26 UTC (permalink / raw)
  To: Ming Lei, Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, linux-nvme, Yi Zhang, linux-block,
	Chunguang Xu


>>> Yeah, but you can't remove the gap at all with start_freeze, that said
>>> the current code has to live with the situation of new mapping change
>>> and old request with old mapping.
>>>
>>> Actually I considered to handle this kind of situation before, one approach
>>> is to reuse the bio steal logic taken in nvme mpath:
>>>
>>> 1) for FS IO, re-submit bios, meantime free request
>>>
>>> 2) for PT request, simply fail it
>>>
>>> It could be a bit violent for 2) even though REQ_FAILFAST_DRIVER is
>>> always set for PT request, but not see any better approach for handling
>>> PT request.
>>
>> I think that's acceptable for PT requests, or any request that doesn't
>> have a bio. I tried something similiar a while back that was almost
>> working, but I neither never posted it, or it's in that window when
>> infradead lost all the emails. :(
> 
> If you are fine to fail PT request, I'd suggest to handle the
> problem in the following way:
> 
> 1) moving freeze into reset
> 
> 2) during resetting
> 
> - freeze NS queues
> - unquiesce NS queues
> - nvme_wait_freeze()
> - update_nr_hw_queues
> - unfreeze NS queues
> 
> 3) meantime changes driver's ->queue_rq() in case that ctrl state is NVME_CTRL_CONNECTING,
> 
> - if the request is FS IO with data, re-submit all bios of this request,
>    and free the request
> 
> - otherwise, fail the request
> 
> With this way, not only freeze is paired with unfreeze. More
> importantly, it becomes not possible to trigger new timeout during
> handling NVME_CTRL_CONNECTING, then fallback to ctrl removal can
> be avoided.
> 
> Any comment on this approach?

As I said, for tcp/rdma I agree with this approach. No need to worry
about the non-mpath case; I don't think it is really used anyway
nowadays.

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-28  1:30                           ` Ming Lei
@ 2023-06-28 14:35                             ` Keith Busch
  2023-06-29  0:08                               ` Ming Lei
  0 siblings, 1 reply; 26+ messages in thread
From: Keith Busch @ 2023-06-28 14:35 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Wed, Jun 28, 2023 at 09:30:16AM +0800, Ming Lei wrote:
> That may not be enough:
> 
> - What if nvme_sysfs_delete() is called from sysfs before the 1st check in
> nvme_reset_work()?
> 
> - What if one pending nvme_dev_disable()<-nvme_timeout() comes after
> the added nvme_unquiesce_io_queues() returns?

Okay, the following will handle both:

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b027e5e3f4acb..c9224d39195e5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
 		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
 			 dev->ctrl.state);
-		return;
+		result = -ENODEV;
+		goto out;
 	}
 
 	/*
@@ -2777,7 +2778,9 @@ static void nvme_reset_work(struct work_struct *work)
 		 result);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_dev_disable(dev, true);
+	nvme_sync_queues(&dev->ctrl);
 	nvme_mark_namespaces_dead(&dev->ctrl);
+	nvme_unquiesce_io_queues(&dev->ctrl);
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
 }
 
--

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-28 14:35                             ` Keith Busch
@ 2023-06-29  0:08                               ` Ming Lei
  2023-06-29 15:33                                 ` Keith Busch
  0 siblings, 1 reply; 26+ messages in thread
From: Ming Lei @ 2023-06-29  0:08 UTC (permalink / raw)
  To: Keith Busch
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Wed, Jun 28, 2023 at 08:35:04AM -0600, Keith Busch wrote:
> On Wed, Jun 28, 2023 at 09:30:16AM +0800, Ming Lei wrote:
> > That may not be enough:
> > 
> > - What if nvme_sysfs_delete() is called from sysfs before the 1st check in
> > nvme_reset_work()?
> > 
> > - What if one pending nvme_dev_disable()<-nvme_timeout() comes after
> > the added nvme_unquiesce_io_queues() returns?
> 
> Okay, the following will handle both:
> 
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index b027e5e3f4acb..c9224d39195e5 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work)
>  	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
>  		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
>  			 dev->ctrl.state);
> -		return;
> +		result = -ENODEV;
> +		goto out;
>  	}
>  
>  	/*
> @@ -2777,7 +2778,9 @@ static void nvme_reset_work(struct work_struct *work)
>  		 result);
>  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
>  	nvme_dev_disable(dev, true);
> +	nvme_sync_queues(&dev->ctrl);
>  	nvme_mark_namespaces_dead(&dev->ctrl);
> +	nvme_unquiesce_io_queues(&dev->ctrl);
>  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
>  }

This one looks better, but the reset may not be scheduled successfully
because of removal, for example when the removal comes exactly before
the state is changed to NVME_CTRL_RESETTING.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs
  2023-06-29  0:08                               ` Ming Lei
@ 2023-06-29 15:33                                 ` Keith Busch
  0 siblings, 0 replies; 26+ messages in thread
From: Keith Busch @ 2023-06-29 15:33 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Christoph Hellwig, linux-nvme,
	Yi Zhang, linux-block, Chunguang Xu

On Thu, Jun 29, 2023 at 08:08:31AM +0800, Ming Lei wrote:
> On Wed, Jun 28, 2023 at 08:35:04AM -0600, Keith Busch wrote:
> > On Wed, Jun 28, 2023 at 09:30:16AM +0800, Ming Lei wrote:
> > > That may not be enough:
> > > 
> > > - What if nvme_sysfs_delete() is called from sysfs before the 1st check in
> > > nvme_reset_work()?
> > > 
> > > - What if one pending nvme_dev_disable()<-nvme_timeout() comes after
> > > the added nvme_unquiesce_io_queues() returns?
> > 
> > Okay, the following will handle both:
> > 
> > ---
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index b027e5e3f4acb..c9224d39195e5 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work)
> >  	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
> >  		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
> >  			 dev->ctrl.state);
> > -		return;
> > +		result = -ENODEV;
> > +		goto out;
> >  	}
> >  
> >  	/*
> > @@ -2777,7 +2778,9 @@ static void nvme_reset_work(struct work_struct *work)
> >  		 result);
> >  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
> >  	nvme_dev_disable(dev, true);
> > +	nvme_sync_queues(&dev->ctrl);
> >  	nvme_mark_namespaces_dead(&dev->ctrl);
> > +	nvme_unquiesce_io_queues(&dev->ctrl);
> >  	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
> >  }
> 
> This one looks better, but reset may not be scheduled successfully because
> of removal, such as, the removal comes exactly before changing state
> to NVME_CTRL_RESETTING.

For example, an io timeout disables the controller, but can't schedule the
reset because someone requested driver unbinding between the disabling
and the reset_work queueing? I think this needs some error checks,
like below. Fortunately there are not many places in nvme-pci that need
this.

I do think the unconditional unquiesce in nvme_remove_namespaces() you
proposed earlier looks good though, too.

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c9224d39195e5..bf21bd08ee7ed 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -108,6 +108,7 @@ MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
 struct nvme_dev;
 struct nvme_queue;
 
+static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 static void nvme_delete_io_queues(struct nvme_dev *dev);
 static void nvme_update_attrs(struct nvme_dev *dev);
@@ -1298,9 +1299,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_DONE;
+		goto disable;
 	}
 
 	/*
@@ -1351,10 +1350,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-
-		return BLK_EH_DONE;
+		goto disable;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -1391,6 +1387,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	 * as the device then is in a faulty state.
 	 */
 	return BLK_EH_RESET_TIMER;
+
+disable:
+	if (!nvme_disable_prepare_reset(dev, false) &&
+	    nvme_try_sched_reset(&dev->ctrl))
+		nvme_unquiesce_io_queues(&dev->ctrl);
+	return BLK_EH_DONE;
 }
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -3278,7 +3280,8 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
 	case pci_channel_io_frozen:
 		dev_warn(dev->ctrl.device,
 			"frozen state error detected, reset controller\n");
-		nvme_dev_disable(dev, false);
+		if (nvme_disable_prepare_reset(dev, false))
+			return PCI_ERS_RESULT_DISCONNECT;
 		return PCI_ERS_RESULT_NEED_RESET;
 	case pci_channel_io_perm_failure:
 		dev_warn(dev->ctrl.device,
@@ -3294,7 +3297,8 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
 
 	dev_info(dev->ctrl.device, "restart after slot reset\n");
 	pci_restore_state(pdev);
-	nvme_reset_ctrl(&dev->ctrl);
+	if (!nvme_try_sched_reset(&dev->ctrl))
+		nvme_unquiesce_io_queues(&dev->ctrl);
 	return PCI_ERS_RESULT_RECOVERED;
 }
 
--

^ permalink raw reply related	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2023-06-29 15:33 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-20  1:33 [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Ming Lei
2023-06-20  1:33 ` [PATCH V2 1/4] blk-mq: add API of blk_mq_unfreeze_queue_force Ming Lei
2023-06-20  1:33 ` [PATCH V2 2/4] nvme: add nvme_unfreeze_force() Ming Lei
2023-06-20  1:33 ` [PATCH V2 3/4] nvme: unfreeze queues before removing namespaces Ming Lei
2023-06-20  1:33 ` [PATCH V2 4/4] nvme: unquiesce io queues when " Ming Lei
2023-06-20 10:04 ` [PATCH V2 0/4] nvme: fix two kinds of IO hang from removing NSs Sagi Grimberg
2023-06-20 13:23   ` Ming Lei
2023-06-20 13:40     ` Sagi Grimberg
2023-06-21  0:09       ` Ming Lei
2023-06-21 10:13         ` Sagi Grimberg
2023-06-21 13:27           ` Ming Lei
2023-06-21 15:48             ` Keith Busch
2023-06-22 13:51               ` Ming Lei
2023-06-22 14:35                 ` Keith Busch
2023-06-22 14:53                   ` Ming Lei
2023-06-22 15:19                     ` Keith Busch
2023-06-25  0:26                       ` Ming Lei
2023-06-25  8:09                         ` Sagi Grimberg
2023-06-27 17:21                         ` Keith Busch
2023-06-27 21:15                           ` Sagi Grimberg
2023-06-28  1:30                           ` Ming Lei
2023-06-28 14:35                             ` Keith Busch
2023-06-29  0:08                               ` Ming Lei
2023-06-29 15:33                                 ` Keith Busch
2023-06-28  7:06                           ` Ming Lei
2023-06-28  7:26                             ` Sagi Grimberg
