* [PATCH 0/2] nvme: add nvme_delete_dead_ctrl for avoiding io deadlock
@ 2023-05-30 9:43 Ming Lei
2023-05-30 9:43 ` [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl Ming Lei
2023-05-30 9:43 ` [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure Ming Lei
0 siblings, 2 replies; 7+ messages in thread
From: Ming Lei @ 2023-05-30 9:43 UTC (permalink / raw)
To: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
Cc: Yi Zhang, linux-block, Ming Lei
Hello,
The 1st patch adds nvme_delete_dead_ctrl().
The 2nd patch calls it after reconnection fails in tcp/rdma driver, and
fixes one io dead lock.
Ming Lei (2):
nvme: add API of nvme_delete_dead_ctrl
nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect
failure
drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/tcp.c | 2 +-
4 files changed, 26 insertions(+), 3 deletions(-)
--
2.40.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl
2023-05-30 9:43 [PATCH 0/2] nvme: add nvme_delete_dead_ctrl for avoiding io deadlock Ming Lei
@ 2023-05-30 9:43 ` Ming Lei
2023-06-05 21:48 ` Sagi Grimberg
2023-05-30 9:43 ` [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure Ming Lei
1 sibling, 1 reply; 7+ messages in thread
From: Ming Lei @ 2023-05-30 9:43 UTC (permalink / raw)
To: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
Cc: Yi Zhang, linux-block, Ming Lei
When driver confirms that the controller is dead, this controller should
be deleted with marking as DEAD. Otherwise, upper layer may wait forever
in __bio_queue_enter() since the disk won't be marked as DEAD.
Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
sync & invalidate returns. If any writeback IO waits in
__bio_queue_enter(), IO deadlock is caused.
Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 1 +
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ccb6eb1282f8..413213cfa417 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
nvme_do_delete_ctrl(ctrl);
}
-int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
+ enum nvme_ctrl_state state)
{
+ if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
+ return -EINVAL;
+
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
+ if (state == NVME_CTRL_DEAD) {
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
+ return -EBUSY;
+ }
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
}
+
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+ return __nvme_delete_ctrl(ctrl, NVME_CTRL_DELETING);
+}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
+/*
+ * Called when driver confirmed that the controller is really dead
+ */
+int nvme_delete_dead_ctrl(struct nvme_ctrl *ctrl)
+{
+ return __nvme_delete_ctrl(ctrl, NVME_CTRL_DEAD);
+}
+EXPORT_SYMBOL_GPL(nvme_delete_dead_ctrl);
+
static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
/*
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1..8f62246a85be 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -828,6 +828,7 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
+int nvme_delete_dead_ctrl(struct nvme_ctrl *ctrl);
void nvme_queue_scan(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset);
--
2.40.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure
2023-05-30 9:43 [PATCH 0/2] nvme: add nvme_delete_dead_ctrl for avoiding io deadlock Ming Lei
2023-05-30 9:43 ` [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl Ming Lei
@ 2023-05-30 9:43 ` Ming Lei
2023-06-05 14:31 ` Yi Zhang
1 sibling, 1 reply; 7+ messages in thread
From: Ming Lei @ 2023-05-30 9:43 UTC (permalink / raw)
To: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme
Cc: Yi Zhang, linux-block, Ming Lei
Reconnect failure has been reached after trying enough times, and controller
is actually incapable of handling IO, so it should be marked as dead, so call
nvme_delete_dead_ctrl() to handle the failure for avoiding the following IO
deadlock:
1) writeback IO waits in __bio_queue_enter() because queue is frozen
during error recovery
2) reconnect failure handler removes controller, and del_gendisk() waits
for above writeback IO in fsync/invalidate bdev
Fix the issue by calling nvme_delete_dead_ctrl() which call
nvme_mark_namespaces_dead() before deleting disk, so the above writeback
IO will be failed, and IO deadlock is avoided.
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/tcp.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..cdf5855c3009 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1028,7 +1028,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
ctrl->ctrl.opts->reconnect_delay * HZ);
} else {
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_delete_dead_ctrl(&ctrl->ctrl);
}
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..2c119bff7010 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2047,7 +2047,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
ctrl->opts->reconnect_delay * HZ);
} else {
dev_info(ctrl->device, "Removing controller...\n");
- nvme_delete_ctrl(ctrl);
+ nvme_delete_dead_ctrl(ctrl);
}
}
--
2.40.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure
2023-05-30 9:43 ` [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure Ming Lei
@ 2023-06-05 14:31 ` Yi Zhang
0 siblings, 0 replies; 7+ messages in thread
From: Yi Zhang @ 2023-06-05 14:31 UTC (permalink / raw)
To: Ming Lei
Cc: Christoph Hellwig, Sagi Grimberg, Keith Busch, linux-nvme, linux-block
Thanks Ming, feel free to add:
Tested-by: Yi Zhang <yi.zhang@redhat.com>
On Tue, May 30, 2023 at 5:44 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> Reconnect failure has been reached after trying enough times, and controller
> is actually incapable of handling IO, so it should be marked as dead, so call
> nvme_delete_dead_ctrl() to handle the failure for avoiding the following IO
> deadlock:
>
> 1) writeback IO waits in __bio_queue_enter() because queue is frozen
> during error recovery
>
> 2) reconnect failure handler removes controller, and del_gendisk() waits
> for above writeback IO in fsync/invalidate bdev
>
> Fix the issue by calling nvme_delete_dead_ctrl() which call
> nvme_mark_namespaces_dead() before deleting disk, so the above writeback
> IO will be failed, and IO deadlock is avoided.
>
> Reported-by: Yi Zhang <yi.zhang@redhat.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> drivers/nvme/host/rdma.c | 2 +-
> drivers/nvme/host/tcp.c | 2 +-
> 2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 0eb79696fb73..cdf5855c3009 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1028,7 +1028,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
> queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
> ctrl->ctrl.opts->reconnect_delay * HZ);
> } else {
> - nvme_delete_ctrl(&ctrl->ctrl);
> + nvme_delete_dead_ctrl(&ctrl->ctrl);
> }
> }
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index bf0230442d57..2c119bff7010 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2047,7 +2047,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
> ctrl->opts->reconnect_delay * HZ);
> } else {
> dev_info(ctrl->device, "Removing controller...\n");
> - nvme_delete_ctrl(ctrl);
> + nvme_delete_dead_ctrl(ctrl);
> }
> }
>
> --
> 2.40.1
>
--
Best Regards,
Yi Zhang
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl
2023-05-30 9:43 ` [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl Ming Lei
@ 2023-06-05 21:48 ` Sagi Grimberg
2023-06-06 0:51 ` Ming Lei
0 siblings, 1 reply; 7+ messages in thread
From: Sagi Grimberg @ 2023-06-05 21:48 UTC (permalink / raw)
To: Ming Lei, Christoph Hellwig, Keith Busch, linux-nvme
Cc: Yi Zhang, linux-block
> When driver confirms that the controller is dead, this controller should
> be deleted with marking as DEAD. Otherwise, upper layer may wait forever
> in __bio_queue_enter() since the disk won't be marked as DEAD.
> Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
> sync & invalidate returns. If any writeback IO waits in
> __bio_queue_enter(), IO deadlock is caused.
>
> Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
> drivers/nvme/host/nvme.h | 1 +
> 2 files changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index ccb6eb1282f8..413213cfa417 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
> nvme_do_delete_ctrl(ctrl);
> }
>
> -int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
> +static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
> + enum nvme_ctrl_state state)
> {
> + if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
> + return -EINVAL;
> +
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
> return -EBUSY;
> + if (state == NVME_CTRL_DEAD) {
> + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
> + return -EBUSY;
> + }
> if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
> return -EBUSY;
> return 0;
> }
the user can trigger a delete in exactly the same condition as
the transport (say a nanosecond before the transport exhaust
the max_reconnects).
I think that we'd want something like
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 841f155fe0b3..6c718ad46e06 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
+
+ if (ctrl->ops->transport_disconnected &&
+ ctrl->ops->transport_disconnected(ctrl))
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
+
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 054bf2f8b1a1..657d3f79953d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2828,6 +2828,13 @@ static bool nvme_pci_supports_pci_p2pdma(struct
nvme_ctrl *ctrl)
return dma_pci_p2pdma_supported(dev->dev);
}
+static bool nvme_pci_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+ return !pci_device_is_present(to_pci_dev(dev->dev));
+}
+
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
@@ -2841,6 +2848,7 @@ static const struct nvme_ctrl_ops
nvme_pci_ctrl_ops = {
.get_address = nvme_pci_get_address,
.print_device_info = nvme_pci_print_device_info,
.supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
+ .transport_disconnected = nvme_pci_disconnected,
};
static int nvme_dev_map(struct nvme_dev *dev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..2a03df693b0e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2235,6 +2235,18 @@ static void nvme_rdma_reset_ctrl_work(struct
work_struct *work)
nvme_rdma_reconnect_or_remove(ctrl);
}
+static bool nvme_rdma_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ int i;
+
+ for (i = 0; i < ctrl->queue_count; i++) {
+ if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags))
+ return false;
+ }
+ return true;
+}
+
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.name = "rdma",
.module = THIS_MODULE,
@@ -2247,6 +2259,7 @@ static const struct nvme_ctrl_ops
nvme_rdma_ctrl_ops = {
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
+ .transport_disconnected = nvme_rdma_disconnected,
};
/*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index fe01ba75570d..043ce9878560 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2536,6 +2536,18 @@ static int nvme_tcp_get_address(struct nvme_ctrl
*ctrl, char *buf, int size)
return len;
}
+static bool nvme_tcp_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ int i;
+
+ for (i = 0; i < ctrl->queue_count; i++) {
+ if (test_bit(NVME_TCP_Q_LIVE, &ctrl->queues[i].flags))
+ return false;
+ }
+ return true;
+}
+
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs,
@@ -2569,6 +2581,7 @@ static const struct nvme_ctrl_ops
nvme_tcp_ctrl_ops = {
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
+ .transport_disconnected = nvme_tcp_disconnected,
};
static bool
--
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl
2023-06-05 21:48 ` Sagi Grimberg
@ 2023-06-06 0:51 ` Ming Lei
2023-06-06 6:28 ` Sagi Grimberg
0 siblings, 1 reply; 7+ messages in thread
From: Ming Lei @ 2023-06-06 0:51 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang,
linux-block, ming.lei
On Tue, Jun 06, 2023 at 12:48:32AM +0300, Sagi Grimberg wrote:
>
> > When driver confirms that the controller is dead, this controller should
> > be deleted with marking as DEAD. Otherwise, upper layer may wait forever
> > in __bio_queue_enter() since the disk won't be marked as DEAD.
> > Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
> > sync & invalidate returns. If any writeback IO waits in
> > __bio_queue_enter(), IO deadlock is caused.
> >
> > Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
> > drivers/nvme/host/nvme.h | 1 +
> > 2 files changed, 24 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index ccb6eb1282f8..413213cfa417 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
> > nvme_do_delete_ctrl(ctrl);
> > }
> > -int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
> > +static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
> > + enum nvme_ctrl_state state)
> > {
> > + if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
> > + return -EINVAL;
> > +
> > if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
> > return -EBUSY;
> > + if (state == NVME_CTRL_DEAD) {
> > + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
> > + return -EBUSY;
> > + }
> > if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
> > return -EBUSY;
> > return 0;
> > }
>
> the user can trigger a delete in exactly the same condition as
> the transport (say a nanosecond before the transport exhaust
> the max_reconnects).
Yeah, the problem can be triggered when removing any NS which request
queue is frozen.
It it too fragile to call freeze_queue and unfreeze_queue in different
contexts since both two should have been done in strict pair, and
meantime I don't think freezing queues in nvme_rdma_teardown_io_queues()
is needed cause quiescing is supposed to be enough for driver to recover
controller.
And I guess the following approach(rdma only) should address the issue cleanly:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..59352671aeb7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -919,6 +919,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
if (!new) {
nvme_unquiesce_io_queues(&ctrl->ctrl);
+ nvme_start_freeze(&ctrl->ctrl);
if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
/*
* If we timed out waiting for freeze we are likely to
@@ -926,6 +927,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
* to be safe.
*/
ret = -ENODEV;
+ nvme_unfreeze(&ctrl->ctrl);
goto out_wait_freeze_timed_out;
}
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
@@ -975,7 +977,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
- nvme_start_freeze(&ctrl->ctrl);
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
>
> I think that we'd want something like
> --
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 841f155fe0b3..6c718ad46e06 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
> {
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
> return -EBUSY;
> +
> + if (ctrl->ops->transport_disconnected &&
> + ctrl->ops->transport_disconnected(ctrl))
> + nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
> +
> if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
> return -EBUSY;
> return 0;
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 054bf2f8b1a1..657d3f79953d 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2828,6 +2828,13 @@ static bool nvme_pci_supports_pci_p2pdma(struct
> nvme_ctrl *ctrl)
> return dma_pci_p2pdma_supported(dev->dev);
> }
>
> +static bool nvme_pci_disconnected(struct nvme_ctrl *nctrl)
> +{
> + struct nvme_dev *dev = to_nvme_dev(ctrl);
> +
> + return !pci_device_is_present(to_pci_dev(dev->dev));
> +}
> +
> static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
> .name = "pcie",
> .module = THIS_MODULE,
> @@ -2841,6 +2848,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops =
> {
> .get_address = nvme_pci_get_address,
> .print_device_info = nvme_pci_print_device_info,
> .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
> + .transport_disconnected = nvme_pci_disconnected,
> };
>
> static int nvme_dev_map(struct nvme_dev *dev)
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 0eb79696fb73..2a03df693b0e 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -2235,6 +2235,18 @@ static void nvme_rdma_reset_ctrl_work(struct
> work_struct *work)
> nvme_rdma_reconnect_or_remove(ctrl);
> }
>
> +static bool nvme_rdma_disconnected(struct nvme_ctrl *nctrl)
> +{
> + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
> + int i;
> +
> + for (i = 0; i < ctrl->queue_count; i++) {
> + if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags))
> + return false;
> + }
> + return true;
> +}
> +
> static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
> .name = "rdma",
> .module = THIS_MODULE,
> @@ -2247,6 +2259,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops =
> {
> .delete_ctrl = nvme_rdma_delete_ctrl,
> .get_address = nvmf_get_address,
> .stop_ctrl = nvme_rdma_stop_ctrl,
> + .transport_disconnected = nvme_rdma_disconnected,
> };
>
> /*
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index fe01ba75570d..043ce9878560 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2536,6 +2536,18 @@ static int nvme_tcp_get_address(struct nvme_ctrl
> *ctrl, char *buf, int size)
> return len;
> }
>
> +static bool nvme_tcp_disconnected(struct nvme_ctrl *nctrl)
> +{
> + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> + int i;
> +
> + for (i = 0; i < ctrl->queue_count; i++) {
> + if (test_bit(NVME_TCP_Q_LIVE, &ctrl->queues[i].flags))
> + return false;
> + }
> + return true;
> +}
> +
> static const struct blk_mq_ops nvme_tcp_mq_ops = {
> .queue_rq = nvme_tcp_queue_rq,
> .commit_rqs = nvme_tcp_commit_rqs,
> @@ -2569,6 +2581,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops =
> {
> .delete_ctrl = nvme_tcp_delete_ctrl,
> .get_address = nvme_tcp_get_address,
> .stop_ctrl = nvme_tcp_stop_ctrl,
> + .transport_disconnected = nvme_tcp_disconnected,
This way is too violent. The current queue/device status does not mean
the controller/queue is really dead cause recovering is in in-progress,
and we should call blk_mark_disk_dead() in case that controller is confirmed
as DEAD.
Thanks,
Ming
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl
2023-06-06 0:51 ` Ming Lei
@ 2023-06-06 6:28 ` Sagi Grimberg
0 siblings, 0 replies; 7+ messages in thread
From: Sagi Grimberg @ 2023-06-06 6:28 UTC (permalink / raw)
To: Ming Lei
Cc: Christoph Hellwig, Keith Busch, linux-nvme, Yi Zhang, linux-block
On 6/6/23 03:51, Ming Lei wrote:
> On Tue, Jun 06, 2023 at 12:48:32AM +0300, Sagi Grimberg wrote:
>>
>>> When driver confirms that the controller is dead, this controller should
>>> be deleted with marking as DEAD. Otherwise, upper layer may wait forever
>>> in __bio_queue_enter() since the disk won't be marked as DEAD.
>>> Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
>>> sync & invalidate returns. If any writeback IO waits in
>>> __bio_queue_enter(), IO deadlock is caused.
>>>
>>> Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
>>>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>> ---
>>> drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
>>> drivers/nvme/host/nvme.h | 1 +
>>> 2 files changed, 24 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index ccb6eb1282f8..413213cfa417 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
>>> nvme_do_delete_ctrl(ctrl);
>>> }
>>> -int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
>>> +static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
>>> + enum nvme_ctrl_state state)
>>> {
>>> + if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
>>> + return -EINVAL;
>>> +
>>> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
>>> return -EBUSY;
>>> + if (state == NVME_CTRL_DEAD) {
>>> + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
>>> + return -EBUSY;
>>> + }
>>> if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
>>> return -EBUSY;
>>> return 0;
>>> }
>>
>> the user can trigger a delete in exactly the same condition as
>> the transport (say a nanosecond before the transport exhaust
>> the max_reconnects).
>
> Yeah, the problem can be triggered when removing any NS which request
> queue is frozen.
>
> It it too fragile to call freeze_queue and unfreeze_queue in different
> contexts since both two should have been done in strict pair, and
> meantime I don't think freezing queues in nvme_rdma_teardown_io_queues()
> is needed cause quiescing is supposed to be enough for driver to recover
> controller.
>
> And I guess the following approach(rdma only) should address the issue cleanly:
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 0eb79696fb73..59352671aeb7 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -919,6 +919,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
>
> if (!new) {
> nvme_unquiesce_io_queues(&ctrl->ctrl);
> + nvme_start_freeze(&ctrl->ctrl);
> if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
> /*
> * If we timed out waiting for freeze we are likely to
> @@ -926,6 +927,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
> * to be safe.
> */
> ret = -ENODEV;
> + nvme_unfreeze(&ctrl->ctrl);
> goto out_wait_freeze_timed_out;
> }
> blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
> @@ -975,7 +977,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
> bool remove)
> {
> if (ctrl->ctrl.queue_count > 1) {
> - nvme_start_freeze(&ctrl->ctrl);
> nvme_quiesce_io_queues(&ctrl->ctrl);
> nvme_sync_io_queues(&ctrl->ctrl);
> nvme_rdma_stop_io_queues(ctrl);
I agree this should work.
>
>>
>> I think that we'd want something like
>> --
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 841f155fe0b3..6c718ad46e06 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
>> {
>> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
>> return -EBUSY;
>> +
>> + if (ctrl->ops->transport_disconnected &&
>> + ctrl->ops->transport_disconnected(ctrl))
>> + nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
>> +
>> if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
>> return -EBUSY;
>> return 0;
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 054bf2f8b1a1..657d3f79953d 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -2828,6 +2828,13 @@ static bool nvme_pci_supports_pci_p2pdma(struct
>> nvme_ctrl *ctrl)
>> return dma_pci_p2pdma_supported(dev->dev);
>> }
>>
>> +static bool nvme_pci_disconnected(struct nvme_ctrl *nctrl)
>> +{
>> + struct nvme_dev *dev = to_nvme_dev(ctrl);
>> +
>> + return !pci_device_is_present(to_pci_dev(dev->dev));
>> +}
>> +
>> static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
>> .name = "pcie",
>> .module = THIS_MODULE,
>> @@ -2841,6 +2848,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops =
>> {
>> .get_address = nvme_pci_get_address,
>> .print_device_info = nvme_pci_print_device_info,
>> .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
>> + .transport_disconnected = nvme_pci_disconnected,
>> };
>>
>> static int nvme_dev_map(struct nvme_dev *dev)
>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>> index 0eb79696fb73..2a03df693b0e 100644
>> --- a/drivers/nvme/host/rdma.c
>> +++ b/drivers/nvme/host/rdma.c
>> @@ -2235,6 +2235,18 @@ static void nvme_rdma_reset_ctrl_work(struct
>> work_struct *work)
>> nvme_rdma_reconnect_or_remove(ctrl);
>> }
>>
>> +static bool nvme_rdma_disconnected(struct nvme_ctrl *nctrl)
>> +{
>> + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
>> + int i;
>> +
>> + for (i = 0; i < ctrl->queue_count; i++) {
>> + if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags))
>> + return false;
>> + }
>> + return true;
>> +}
>> +
>> static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
>> .name = "rdma",
>> .module = THIS_MODULE,
>> @@ -2247,6 +2259,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops =
>> {
>> .delete_ctrl = nvme_rdma_delete_ctrl,
>> .get_address = nvmf_get_address,
>> .stop_ctrl = nvme_rdma_stop_ctrl,
>> + .transport_disconnected = nvme_rdma_disconnected,
>> };
>>
>> /*
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index fe01ba75570d..043ce9878560 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -2536,6 +2536,18 @@ static int nvme_tcp_get_address(struct nvme_ctrl
>> *ctrl, char *buf, int size)
>> return len;
>> }
>>
>> +static bool nvme_tcp_disconnected(struct nvme_ctrl *nctrl)
>> +{
>> + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
>> + int i;
>> +
>> + for (i = 0; i < ctrl->queue_count; i++) {
>> + if (test_bit(NVME_TCP_Q_LIVE, &ctrl->queues[i].flags))
>> + return false;
>> + }
>> + return true;
>> +}
>> +
>> static const struct blk_mq_ops nvme_tcp_mq_ops = {
>> .queue_rq = nvme_tcp_queue_rq,
>> .commit_rqs = nvme_tcp_commit_rqs,
>> @@ -2569,6 +2581,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops =
>> {
>> .delete_ctrl = nvme_tcp_delete_ctrl,
>> .get_address = nvme_tcp_get_address,
>> .stop_ctrl = nvme_tcp_stop_ctrl,
>> + .transport_disconnected = nvme_tcp_disconnected,
>
> This way is too violent. The current queue/device status does not mean
> the controller/queue is really dead cause recovering is in in-progress,
> and we should call blk_mark_disk_dead() in case that controller is confirmed
> as DEAD.
Well, the controller is going away, and the queues are not LIVE, so I
think the logic makes sense. However I do agree that your other
suggestion is cleaner.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2023-06-06 6:28 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-30 9:43 [PATCH 0/2] nvme: add nvme_delete_dead_ctrl for avoiding io deadlock Ming Lei
2023-05-30 9:43 ` [PATCH 1/2] nvme: add API of nvme_delete_dead_ctrl Ming Lei
2023-06-05 21:48 ` Sagi Grimberg
2023-06-06 0:51 ` Ming Lei
2023-06-06 6:28 ` Sagi Grimberg
2023-05-30 9:43 ` [PATCH 2/2] nvme: rdma/tcp: call nvme_delete_dead_ctrl for handling reconnect failure Ming Lei
2023-06-05 14:31 ` Yi Zhang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).