All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
@ 2021-09-18 21:57 Anton Eidelman
  2021-09-19 10:19 ` Sagi Grimberg
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Anton Eidelman @ 2021-09-18 21:57 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: Anton Eidelman

nvme_mpath_init_identify() invoked from nvme_init_identify()
fetches a fresh ANA log from the ctrl.
This is essential to have an up to date path states
for both existing namespaces and for those scan_work
may discover once the ctrl is up.

This happens in the following cases:
1) A new ctrl is being connected.
2) An existing ctrl is successfully reconnected.
3) An existing ctrl is being reset.

While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
and nvme_read_ana_log() may call nvme_update_ns_ana_state().

This results in a hang when the ANA state of an existing namespace
changes and makes the disk live: nvme_mpath_set_live() issues
IO to the namespace through the ctrl, which does NOT have IO queues yet.
See sample hang below.

Solution:
- move nvme_read_ana_log() call from nvme_mpath_init_identify()
into a separate function, nvme_mpath_update().
- call this function in nvme_start_ctrl() - here the ctrl is ready.

Downside: in case nvme_read_ana_log() fails, we do not find this out
  in nvme_mpath_init_identify().

Alternative solution:
  nvme_get_log() in nvme_mpath_init_identify(), but don't parse.
  Then nvme_start_ctrl() can just queue_work(nvme_wq, &ctrl->ana_work),
  along with scan_work.
  The ctrl->ana_log_buf contents are valid for scan_work,
  and ana_work will re-fetch and parse it.
  Downside: fetching an ANA log from the ctrl twice.

Sample failure:
    nvme nvme0: starting error recovery
    nvme nvme0: Reconnecting in 10 seconds...
    block nvme0n6: no usable path - requeuing I/O
    INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
          Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
    Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
    Call Trace:
     __schedule+0x2a2/0x7e0
     schedule+0x4e/0xb0
     io_schedule+0x16/0x40
     wait_on_page_bit_common+0x15c/0x3e0
     do_read_cache_page+0x1e0/0x410
     read_cache_page+0x12/0x20
     read_part_sector+0x46/0x100
     read_lba+0x121/0x240
     efi_partition+0x1d2/0x6a0
     bdev_disk_changed.part.0+0x1df/0x430
     bdev_disk_changed+0x18/0x20
     blkdev_get_whole+0x77/0xe0
     blkdev_get_by_dev+0xd2/0x3a0
     __device_add_disk+0x1ed/0x310
     device_add_disk+0x13/0x20
     nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
     nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
     nvme_update_ana_state+0xca/0xe0 [nvme_core]
     nvme_parse_ana_log+0xac/0x170 [nvme_core]
     nvme_read_ana_log+0x7d/0xe0 [nvme_core]
     nvme_mpath_init_identify+0x105/0x150 [nvme_core]
     nvme_init_identify+0x2df/0x4d0 [nvme_core]
     nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
     nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
     nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
     process_one_work+0x1bd/0x360
     worker_thread+0x50/0x3d0

Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
---
 drivers/nvme/host/core.c      | 1 +
 drivers/nvme/host/multipath.c | 9 ++++++---
 drivers/nvme/host/nvme.h      | 4 ++++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 97f8211cf92c..884ca1f6f80b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4325,6 +4325,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	nvme_enable_aen(ctrl);
 
 	if (ctrl->queue_count > 1) {
+		nvme_mpath_update(ctrl);
 		nvme_queue_scan(ctrl);
 		nvme_start_queues(ctrl);
 	}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e8ccdd398f78..8bd133f66636 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -850,9 +850,7 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 			return -ENOMEM;
 	}
 	ctrl->ana_log_size = ana_log_size;
-	error = nvme_read_ana_log(ctrl);
-	if (error)
-		goto out_uninit;
+
 	return 0;
 
 out_uninit:
@@ -860,6 +858,11 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	return error;
 }
 
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	nvme_read_ana_log(ctrl);
+}
+
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 {
 	kfree(ctrl->ana_log_buf);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9871c0c9374c..1560ea7adc6b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -746,6 +746,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_mpath_update(struct nvme_ctrl *ctrl);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
@@ -820,6 +821,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
 "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
 	return 0;
 }
+static void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+}
 static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-18 21:57 [PATCH] nvme/mpath: fix hang when disk goes live over reconnect Anton Eidelman
@ 2021-09-19 10:19 ` Sagi Grimberg
  2021-09-19 23:05   ` Anton Eidelman
  2021-09-20  6:35 ` Christoph Hellwig
  2022-03-23  3:55 ` [PATCH v2 0/1] " Anton Eidelman
  2 siblings, 1 reply; 22+ messages in thread
From: Sagi Grimberg @ 2021-09-19 10:19 UTC (permalink / raw)
  To: Anton Eidelman, linux-nvme, hch, kbusch, axboe; +Cc: Anton Eidelman



On 9/19/21 12:57 AM, Anton Eidelman wrote:
> nvme_mpath_init_identify() invoked from nvme_init_identify()
> fetches a fresh ANA log from the ctrl.
> This is essential to have an up to date path states
> for both existing namespaces and for those scan_work
> may discover once the ctrl is up.
> 
> This happens in the following cases:
> 1) A new ctrl is being connected.
> 2) An existing ctrl is successfully reconnected.
> 3) An existing ctrl is being reset.
> 
> While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
> and nvme_read_ana_log() may call nvme_update_ns_ana_state().
> 
> This result in a hang when the ANA state of an existing namespace
> changes and makes the disk live: nvme_mpath_set_live() issues
> IO to the namespace through the ctrl, which does NOT have IO queues yet.
> See sample hang below.
> 
> Solution:
> - move nvme_read_ana_log() call from nvme_mpath_init_identify()
> into a separate function, nvme_mpath_update().
> - call this function in nvme_start_ctrl() - here the ctrl is ready.
> 
> Downside: in case nvme_read_ana_log() fails, we do not find this out
>    in nvme_mpath_init_identify().
> 
> Alernative solution:
>    nvme_get_log() in nvme_mpath_init_identify(), but don't parse.
>    Then nvme_start_ctrl() can just queue_work(nvme_wq, &ctrl->ana_work),
>    along with scan_work.
>    The ctrl->ana_log_buf contents is valid for scan_work,
>    and ana_work will re-fetch and parse it.
>    Downside: fetching an ANA log from the ctrl twice.
> 
> Sample failure:
>      nvme nvme0: starting error recovery
>      nvme nvme0: Reconnecting in 10 seconds...
>      block nvme0n6: no usable path - requeuing I/O
>      INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
>            Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
>      Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
>      Call Trace:
>       __schedule+0x2a2/0x7e0
>       schedule+0x4e/0xb0
>       io_schedule+0x16/0x40
>       wait_on_page_bit_common+0x15c/0x3e0
>       do_read_cache_page+0x1e0/0x410
>       read_cache_page+0x12/0x20
>       read_part_sector+0x46/0x100
>       read_lba+0x121/0x240
>       efi_partition+0x1d2/0x6a0
>       bdev_disk_changed.part.0+0x1df/0x430
>       bdev_disk_changed+0x18/0x20
>       blkdev_get_whole+0x77/0xe0
>       blkdev_get_by_dev+0xd2/0x3a0
>       __device_add_disk+0x1ed/0x310
>       device_add_disk+0x13/0x20
>       nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
>       nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
>       nvme_update_ana_state+0xca/0xe0 [nvme_core]
>       nvme_parse_ana_log+0xac/0x170 [nvme_core]
>       nvme_read_ana_log+0x7d/0xe0 [nvme_core]
>       nvme_mpath_init_identify+0x105/0x150 [nvme_core]
>       nvme_init_identify+0x2df/0x4d0 [nvme_core]
>       nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
>       nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
>       nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
>       process_one_work+0x1bd/0x360
>       worker_thread+0x50/0x3d0
> 
> Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
> ---
>   drivers/nvme/host/core.c      | 1 +
>   drivers/nvme/host/multipath.c | 9 ++++++---
>   drivers/nvme/host/nvme.h      | 4 ++++
>   3 files changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 97f8211cf92c..884ca1f6f80b 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4325,6 +4325,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
>   	nvme_enable_aen(ctrl);
>   
>   	if (ctrl->queue_count > 1) {
> +		nvme_mpath_update(ctrl);

Does this ordering now becomes subtle as it needs to run before we queue 
the ns scan?

>   		nvme_queue_scan(ctrl);
>   		nvme_start_queues(ctrl);
>   	}

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-19 10:19 ` Sagi Grimberg
@ 2021-09-19 23:05   ` Anton Eidelman
  2021-09-20  7:55     ` Sagi Grimberg
  0 siblings, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2021-09-19 23:05 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, hch, kbusch, axboe

Well, this works for me (passes a test that gets into
the described scenario deterministically),
probably because we nvme_start_queues() earlier
in nvme_tcp_configure_io_queues().
Same is correct for rdma transport.

On Sun, Sep 19, 2021 at 01:19:39PM +0300, Sagi Grimberg wrote:
> 
> 
> On 9/19/21 12:57 AM, Anton Eidelman wrote:
> > nvme_mpath_init_identify() invoked from nvme_init_identify()
> > fetches a fresh ANA log from the ctrl.
> > This is essential to have an up to date path states
> > for both existing namespaces and for those scan_work
> > may discover once the ctrl is up.
> > 
> > This happens in the following cases:
> > 1) A new ctrl is being connected.
> > 2) An existing ctrl is successfully reconnected.
> > 3) An existing ctrl is being reset.
> > 
> > While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
> > and nvme_read_ana_log() may call nvme_update_ns_ana_state().
> > 
> > This result in a hang when the ANA state of an existing namespace
> > changes and makes the disk live: nvme_mpath_set_live() issues
> > IO to the namespace through the ctrl, which does NOT have IO queues yet.
> > See sample hang below.
> > 
> > Solution:
> > - move nvme_read_ana_log() call from nvme_mpath_init_identify()
> > into a separate function, nvme_mpath_update().
> > - call this function in nvme_start_ctrl() - here the ctrl is ready.
> > 
> > Downside: in case nvme_read_ana_log() fails, we do not find this out
> >    in nvme_mpath_init_identify().
> > 
> > Alernative solution:
> >    nvme_get_log() in nvme_mpath_init_identify(), but don't parse.
> >    Then nvme_start_ctrl() can just queue_work(nvme_wq, &ctrl->ana_work),
> >    along with scan_work.
> >    The ctrl->ana_log_buf contents is valid for scan_work,
> >    and ana_work will re-fetch and parse it.
> >    Downside: fetching an ANA log from the ctrl twice.
> > 
> > Sample failure:
> >      nvme nvme0: starting error recovery
> >      nvme nvme0: Reconnecting in 10 seconds...
> >      block nvme0n6: no usable path - requeuing I/O
> >      INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
> >            Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
> >      Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
> >      Call Trace:
> >       __schedule+0x2a2/0x7e0
> >       schedule+0x4e/0xb0
> >       io_schedule+0x16/0x40
> >       wait_on_page_bit_common+0x15c/0x3e0
> >       do_read_cache_page+0x1e0/0x410
> >       read_cache_page+0x12/0x20
> >       read_part_sector+0x46/0x100
> >       read_lba+0x121/0x240
> >       efi_partition+0x1d2/0x6a0
> >       bdev_disk_changed.part.0+0x1df/0x430
> >       bdev_disk_changed+0x18/0x20
> >       blkdev_get_whole+0x77/0xe0
> >       blkdev_get_by_dev+0xd2/0x3a0
> >       __device_add_disk+0x1ed/0x310
> >       device_add_disk+0x13/0x20
> >       nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
> >       nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
> >       nvme_update_ana_state+0xca/0xe0 [nvme_core]
> >       nvme_parse_ana_log+0xac/0x170 [nvme_core]
> >       nvme_read_ana_log+0x7d/0xe0 [nvme_core]
> >       nvme_mpath_init_identify+0x105/0x150 [nvme_core]
> >       nvme_init_identify+0x2df/0x4d0 [nvme_core]
> >       nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
> >       nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
> >       nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
> >       process_one_work+0x1bd/0x360
> >       worker_thread+0x50/0x3d0
> > 
> > Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
> > ---
> >   drivers/nvme/host/core.c      | 1 +
> >   drivers/nvme/host/multipath.c | 9 ++++++---
> >   drivers/nvme/host/nvme.h      | 4 ++++
> >   3 files changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 97f8211cf92c..884ca1f6f80b 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -4325,6 +4325,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
> >   	nvme_enable_aen(ctrl);
> >   	if (ctrl->queue_count > 1) {
> > +		nvme_mpath_update(ctrl);
> 
> Does this ordering now becomes subtle as it needs to run before we queue the
> ns scan?
> 
> >   		nvme_queue_scan(ctrl);
> >   		nvme_start_queues(ctrl);
> >   	}

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-18 21:57 [PATCH] nvme/mpath: fix hang when disk goes live over reconnect Anton Eidelman
  2021-09-19 10:19 ` Sagi Grimberg
@ 2021-09-20  6:35 ` Christoph Hellwig
  2021-09-20 14:53   ` Anton Eidelman
  2022-03-23  3:55 ` [PATCH v2 0/1] " Anton Eidelman
  2 siblings, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2021-09-20  6:35 UTC (permalink / raw)
  To: Anton Eidelman; +Cc: linux-nvme, hch, kbusch, sagi, axboe, Anton Eidelman

On Sat, Sep 18, 2021 at 03:57:29PM -0600, Anton Eidelman wrote:
> +void nvme_mpath_update(struct nvme_ctrl *ctrl)
> +{
> +	nvme_read_ana_log(ctrl);
> +}

What is the point of this wrapper vs just calling nvme_read_ana_log
directly?  Also shouldn't we propagate the error here and through
nvme_start_ctrl?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-19 23:05   ` Anton Eidelman
@ 2021-09-20  7:55     ` Sagi Grimberg
  0 siblings, 0 replies; 22+ messages in thread
From: Sagi Grimberg @ 2021-09-20  7:55 UTC (permalink / raw)
  To: Anton Eidelman, linux-nvme, hch, kbusch, axboe


> Well, this works for me (passes a test that gets into
> the described scenario deteministically),
> probably because we nvme_start_queues() earlier
> in nvme_tcp_configure_io_queues().
> Same is correct for rdma transport.

OK, looks good to me:
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-20  6:35 ` Christoph Hellwig
@ 2021-09-20 14:53   ` Anton Eidelman
  2021-09-21  7:15     ` Christoph Hellwig
  0 siblings, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2021-09-20 14:53 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme, kbusch, sagi, axboe

The wrapper is to:
- follow the naming convention for mpath functions called from core
- allow the implementation to have more logic in future

As for error propagation, nvme_start_ctrl is void.
Any error in nvme_read_ana_log() will have the same effect as
when it is invoked from nvme_ana_work(), i.e. will be ignored.

On Mon, Sep 20, 2021 at 08:35:34AM +0200, Christoph Hellwig wrote:
> On Sat, Sep 18, 2021 at 03:57:29PM -0600, Anton Eidelman wrote:
> > +void nvme_mpath_update(struct nvme_ctrl *ctrl)
> > +{
> > +	nvme_read_ana_log(ctrl);
> > +}
> 
> What is the point of this wrapper vs just calling nvme_read_ana_log
> directly?  Also shouldn;t we propagate the error here and through
> nvme_start_ctrl?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-20 14:53   ` Anton Eidelman
@ 2021-09-21  7:15     ` Christoph Hellwig
  2021-10-04 16:46       ` Anton Eidelman
  0 siblings, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2021-09-21  7:15 UTC (permalink / raw)
  To: Anton Eidelman; +Cc: Christoph Hellwig, linux-nvme, kbusch, sagi, axboe

On Mon, Sep 20, 2021 at 08:53:36AM -0600, Anton Eidelman wrote:
> The wrapper is to:
> - follow the naming convention for mpath functions called from core
> - allow the implementation to have more logic in future
> 
> As for error propagation, nvme_start_ctrl is void.

Which of course can be changed, like all software.

> Any error in nvme_read_ana_log() will have the same effect as
> when it is invoked from nvme_ana_work(), i.e. will be ignored.

We need to return negative errors during initialization otherwise
we're going to run into teardown problems.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-21  7:15     ` Christoph Hellwig
@ 2021-10-04 16:46       ` Anton Eidelman
  2021-10-04 16:57         ` Christoph Hellwig
  0 siblings, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2021-10-04 16:46 UTC (permalink / raw)
  To: Christoph Hellwig, linux-nvme, kbusch, sagi, axboe

On Tue, Sep 21, 2021 at 09:15:27AM +0200, Christoph Hellwig wrote:
> On Mon, Sep 20, 2021 at 08:53:36AM -0600, Anton Eidelman wrote:
> > The wrapper is to:
> > - follow the naming convention for mpath functions called from core
> > - allow the implementation to have more logic in future
> > 
> > As for error propagation, nvme_start_ctrl is void.
> 
> Which of course can be changed, like all software.
> 
> > Any error in nvme_read_ana_log() will have the same effect as
> > when it is invoked from nvme_ana_work(), i.e. will be ignored.
> 
> We need to return negative errors during initialization otherwise
> we're going to run into teardown problems.

How do we proceed with this fix?
I believe error propagation here is not wanted because:
1) A failure to fetch or parse the ANA log should not be considered
   an error in ctrl initialization.
2) Such error will not cause problems in teardown.
3) The same failure is possible in ANA work and we do not take
   any action in such case.
4) Adding support for failure to nvme_start_ctrl() adds complexity
   and does not look useful due to the above 1-3.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-10-04 16:46       ` Anton Eidelman
@ 2021-10-04 16:57         ` Christoph Hellwig
  2021-10-05 12:38           ` Sagi Grimberg
  0 siblings, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2021-10-04 16:57 UTC (permalink / raw)
  To: Anton Eidelman; +Cc: Christoph Hellwig, linux-nvme, kbusch, sagi, axboe

On Mon, Oct 04, 2021 at 10:46:12AM -0600, Anton Eidelman wrote:
> How do we proceed with this fix?

Please resend with the suggested updates.

> I believe error propagation here is not wanted because:
> 1) A failure to fetch or parse the ANA log should not be considered
>    an error in ctrl initialization.

We must handle error that are due to a controller failing to initialize.

> 2) Such error will not cause problems in teardown.
> 3) The same failure is possible in ANA work and we do not take
>    any action in such case.

Failing in a workqueue is different from failing in the initialization
path.

> 4) Adding support for failure to nvme_start_ctrl() adds complexity
>    and does not look useful due to the above 1-3.
---end quoted text---

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
  2021-10-04 16:57         ` Christoph Hellwig
@ 2021-10-05 12:38           ` Sagi Grimberg
       [not found]             ` <368499da-c117-e8b7-7b1b-46894e1e0b48@grimberg.me>
  0 siblings, 1 reply; 22+ messages in thread
From: Sagi Grimberg @ 2021-10-05 12:38 UTC (permalink / raw)
  To: Christoph Hellwig, Anton Eidelman; +Cc: linux-nvme, kbusch, axboe


>> How do we proceed with this fix?
> 
> Please resend with the suggested updates.
> 
>> I believe error propagation here is not wanted because:
>> 1) A failure to fetch or parse the ANA log should not be considered
>>     an error in ctrl initialization.
> 
> We must handle error that are due to a controller failing to initialize.
> 
>> 2) Such error will not cause problems in teardown.
>> 3) The same failure is possible in ANA work and we do not take
>>     any action in such case.
> 
> Failing in a workqueue is different from failing in the initialization
> path.
> 
>> 4) Adding support for failure to nvme_start_ctrl() adds complexity
>>     and does not look useful due to the above 1-3.

The feedback here is that this patch is changing the functionality
as before we failed initialization and now we don't.

What is the issue in propagating the error and then modify
the call-sites? Shouldn't it be simple enough to do?
--
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aa14ad963d91..d86623b90eea 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3158,8 +3158,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)

         ctrl->ctrl.nr_reconnects = 0;

-       if (changed)
-               nvme_start_ctrl(&ctrl->ctrl);
+       if (changed) {
+               ret = nvme_start_ctrl(&ctrl->ctrl);
+               if (ret)
+                       goto out_term_aen_ops;
+       }

         return 0;       /* Success */

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b82492cd7503..59bfdd72a51a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2827,7 +2827,10 @@ static void nvme_reset_work(struct work_struct *work)
                         &nvme_pci_attr_group))
                 dev->attrs_added = true;

-       nvme_start_ctrl(&dev->ctrl);
+       ret = nvme_start_ctrl(&dev->ctrl);
+       if (ret)
+               goto out_unlock;
+
         return;

   out_unlock:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 042c594bc57e..9e9e34a012e7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1141,7 +1141,10 @@ static int nvme_rdma_setup_ctrl(struct 
nvme_rdma_ctrl *ctrl, bool new)
                 goto destroy_io;
         }

-       nvme_start_ctrl(&ctrl->ctrl);
+       ret = nvme_start_ctrl(&ctrl->ctrl);
+       if (ret)
+               goto destroy_io;
+
         return 0;

  destroy_io:
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 10b164f02b5d..22199281ad17 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -923,6 +923,11 @@ static void nvme_tcp_fail_request(struct 
nvme_tcp_request *req)
         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), 
NVME_SC_HOST_PATH_ERROR);
  }
@@ -2045,7 +2052,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl 
*ctrl, bool new)
                 goto destroy_io;
         }

-       nvme_start_ctrl(ctrl);
+       ret = nvme_start_ctrl(ctrl);
+       if (ret)
+               goto destroy_io;
+
         return 0;

  destroy_io:
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0285ccc7541f..3fa89650c0d1 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -490,7 +490,9 @@ static void nvme_loop_reset_ctrl_work(struct 
work_struct *work)
         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
                 WARN_ON_ONCE(1);

-       nvme_start_ctrl(&ctrl->ctrl);
+       ret = nvme_start_ctrl(&ctrl->ctrl);
+       if (ret)
+               goto out_destroy_io;

         return;
--

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] nvme/mpath: fix hang when disk goes live over reconnect
       [not found]             ` <368499da-c117-e8b7-7b1b-46894e1e0b48@grimberg.me>
@ 2021-10-19 15:13               ` Anton Eidelman
  0 siblings, 0 replies; 22+ messages in thread
From: Anton Eidelman @ 2021-10-19 15:13 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, hch, kbusch, axboe

Looking into making nvme_start_ctrl capable of returning an error.
This is a bit more complicated IMHO, because the in most transports
the ctrl is already in LIVE state when nvme_start_ctrl is invoked,
so we need to bail out carefully.

On Tue, Oct 19, 2021 at 05:37:30PM +0300, Sagi Grimberg wrote:
> 
> 
> On 10/5/21 3:38 PM, Sagi Grimberg wrote:
> > 
> > > > How do we proceed with this fix?
> > > 
> > > Please resend with the suggested updates.
> > > 
> > > > I believe error propagation here is not wanted because:
> > > > 1) A failure to fetch or parse the ANA log should not be considered
> > > >     an error in ctrl initialization.
> > > 
> > > We must handle error that are due to a controller failing to initialize.
> > > 
> > > > 2) Such error will not cause problems in teardown.
> > > > 3) The same failure is possible in ANA work and we do not take
> > > >     any action in such case.
> > > 
> > > Failing in a workqueue is different from failing in the initialization
> > > path.
> > > 
> > > > 4) Adding support for failure to nvme_start_ctrl() adds complexity
> > > >     and does not look useful due to the above 1-3.
> > 
> > The feedback here is that this patch is changing the functionality
> > as before we failed initialization and now we don't.
> > 
> > What is the issue in propagating the error and then modify
> > the call-sites? Shouldn't it be simple enough to do?
> > -- 
> > diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> > index aa14ad963d91..d86623b90eea 100644
> > --- a/drivers/nvme/host/fc.c
> > +++ b/drivers/nvme/host/fc.c
> > @@ -3158,8 +3158,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl
> > *ctrl)
> > 
> >          ctrl->ctrl.nr_reconnects = 0;
> > 
> > -       if (changed)
> > -               nvme_start_ctrl(&ctrl->ctrl);
> > +       if (changed) {
> > +               ret = nvme_start_ctrl(&ctrl->ctrl);
> > +               if (ret)
> > +                       goto out_term_aen_ops;
> > +       }
> > 
> >          return 0;       /* Success */
> > 
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index b82492cd7503..59bfdd72a51a 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2827,7 +2827,10 @@ static void nvme_reset_work(struct work_struct
> > *work)
> >                          &nvme_pci_attr_group))
> >                  dev->attrs_added = true;
> > 
> > -       nvme_start_ctrl(&dev->ctrl);
> > +       ret = nvme_start_ctrl(&dev->ctrl);
> > +       if (ret)
> > +               goto out_unlock;
> > +
> >          return;
> > 
> >    out_unlock:
> > diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> > index 042c594bc57e..9e9e34a012e7 100644
> > --- a/drivers/nvme/host/rdma.c
> > +++ b/drivers/nvme/host/rdma.c
> > @@ -1141,7 +1141,10 @@ static int nvme_rdma_setup_ctrl(struct
> > nvme_rdma_ctrl *ctrl, bool new)
> >                  goto destroy_io;
> >          }
> > 
> > -       nvme_start_ctrl(&ctrl->ctrl);
> > +       ret = nvme_start_ctrl(&ctrl->ctrl);
> > +       if (ret)
> > +               goto destroy_io;
> > +
> >          return 0;
> > 
> >   destroy_io:
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 10b164f02b5d..22199281ad17 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -923,6 +923,11 @@ static void nvme_tcp_fail_request(struct
> > nvme_tcp_request *req)
> >          nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
> > NVME_SC_HOST_PATH_ERROR);
> >   }
> > @@ -2045,7 +2052,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl
> > *ctrl, bool new)
> >                  goto destroy_io;
> >          }
> > 
> > -       nvme_start_ctrl(ctrl);
> > +       ret = nvme_start_ctrl(ctrl);
> > +       if (ret)
> > +               goto destroy_io;
> > +
> >          return 0;
> > 
> >   destroy_io:
> > diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
> > index 0285ccc7541f..3fa89650c0d1 100644
> > --- a/drivers/nvme/target/loop.c
> > +++ b/drivers/nvme/target/loop.c
> > @@ -490,7 +490,9 @@ static void nvme_loop_reset_ctrl_work(struct
> > work_struct *work)
> >          if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
> >                  WARN_ON_ONCE(1);
> > 
> > -       nvme_start_ctrl(&ctrl->ctrl);
> > +       ret = nvme_start_ctrl(&ctrl->ctrl);
> > +       if (ret)
> > +               goto out_destroy_io;
> > 
> >          return;
> > -- 
> 
> So Anton, can this suggestion work? I think Hannes reported the same
> issue that this patch addresses.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 0/1] nvme/mpath: fix hang when disk goes live over reconnect
  2021-09-18 21:57 [PATCH] nvme/mpath: fix hang when disk goes live over reconnect Anton Eidelman
  2021-09-19 10:19 ` Sagi Grimberg
  2021-09-20  6:35 ` Christoph Hellwig
@ 2022-03-23  3:55 ` Anton Eidelman
  2022-03-23  3:55   ` [PATCH v2 1/1] " Anton Eidelman
  2 siblings, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2022-03-23  3:55 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: ushankar, Anton Eidelman

Revising the original v1 patch submitted on Sep 18, 2021,
which deferred nvme_read_ana_log() until nvme_start_ctrl()
and raised concerns since the former might fail,
while the latter does not currently return an error.
I did not think it appropriate to introduce error handling
in the nvme_start_ctrl() call-sites.

Instead, this patch keeps the nvme_read_ana_log() in its original
place inside nvme_mpath_init_identify(),
but defers updating of the existing ns path state
until the ctrl IO queues are enabled in nvme_start_ctrl().

This way the handling of failures to retrieve the ANA log remains intact,
while the mpath update is deferred until it's safe to do.

Anton Eidelman (1):
  nvme/mpath: fix hang when disk goes live over reconnect

 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 12 +++++++++++-
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 16 insertions(+), 1 deletion(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v2 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23  3:55 ` [PATCH v2 0/1] " Anton Eidelman
@ 2022-03-23  3:55   ` Anton Eidelman
  2022-03-23  9:23     ` Sagi Grimberg
  2022-03-23 14:45     ` [PATCH v3 0/1] " Anton Eidelman
  0 siblings, 2 replies; 22+ messages in thread
From: Anton Eidelman @ 2022-03-23  3:55 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: ushankar, Anton Eidelman

nvme_mpath_init_identify() invoked from nvme_init_identify()
fetches a fresh ANA log from the ctrl.
This is essential to have up-to-date path states
for both existing namespaces and for those scan_work
may discover once the ctrl is up.

This happens in the following cases:
1) A new ctrl is being connected.
2) An existing ctrl is successfully reconnected.
3) An existing ctrl is being reset.

While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
and nvme_read_ana_log() may call nvme_update_ns_ana_state().

This results in a hang when the ANA state of an existing namespace
changes and makes the disk live: nvme_mpath_set_live() issues
IO to the namespace through the ctrl,
which does NOT have IO queues yet.

See sample hang below.

Solution:
- nvme_update_ns_ana_state() to call set_live only if ctrl is live
- nvme_read_ana_log() call from nvme_mpath_init_identify()
  therefore only fetches and parses the ANA log;
  any errors in this process will fail the ctrl setup as appropriate;
- a separate function nvme_mpath_update()
  is called in nvme_start_ctrl();
  this parses the ANA log without fetching it.
  At this point the ctrl is live,
  therefore, disks can be set live normally.

Sample failure:
    nvme nvme0: starting error recovery
    nvme nvme0: Reconnecting in 10 seconds...
    block nvme0n6: no usable path - requeuing I/O
    INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
          Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
    Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
    Call Trace:
     __schedule+0x2a2/0x7e0
     schedule+0x4e/0xb0
     io_schedule+0x16/0x40
     wait_on_page_bit_common+0x15c/0x3e0
     do_read_cache_page+0x1e0/0x410
     read_cache_page+0x12/0x20
     read_part_sector+0x46/0x100
     read_lba+0x121/0x240
     efi_partition+0x1d2/0x6a0
     bdev_disk_changed.part.0+0x1df/0x430
     bdev_disk_changed+0x18/0x20
     blkdev_get_whole+0x77/0xe0
     blkdev_get_by_dev+0xd2/0x3a0
     __device_add_disk+0x1ed/0x310
     device_add_disk+0x13/0x20
     nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
     nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
     nvme_update_ana_state+0xca/0xe0 [nvme_core]
     nvme_parse_ana_log+0xac/0x170 [nvme_core]
     nvme_read_ana_log+0x7d/0xe0 [nvme_core]
     nvme_mpath_init_identify+0x105/0x150 [nvme_core]
     nvme_init_identify+0x2df/0x4d0 [nvme_core]
     nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
     nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
     nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
     process_one_work+0x1bd/0x360
     worker_thread+0x50/0x3d0

Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
---
 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 12 +++++++++++-
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3c0461129bca..d4495fa97657 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4509,6 +4509,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	if (ctrl->queue_count > 1) {
 		nvme_queue_scan(ctrl);
 		nvme_start_queues(ctrl);
+		nvme_mpath_update(ctrl);
 	}
 
 	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c97d7f843977..a22d455f48a9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -613,7 +613,8 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
 
-	if (nvme_state_is_live(ns->ana_state))
+	if (nvme_state_is_live(ns->ana_state) &&
+		(ns->ctrl->state == NVME_CTRL_LIVE))
 		nvme_mpath_set_live(ns);
 }
 
@@ -700,6 +701,15 @@ static void nvme_ana_work(struct work_struct *work)
 	nvme_read_ana_log(ctrl);
 }
 
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	u32 nr_change_groups = 0;
+
+	mutex_lock(&ctrl->ana_lock);
+	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
+	mutex_unlock(&ctrl->ana_lock);
+}
+
 static void nvme_anatt_timeout(struct timer_list *t)
 {
 	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ea908d43e17..76f7a5f37379 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -781,6 +781,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_mpath_update(struct nvme_ctrl *ctrl);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
@@ -852,6 +853,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
 "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
 	return 0;
 }
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+}
 static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23  3:55   ` [PATCH v2 1/1] " Anton Eidelman
@ 2022-03-23  9:23     ` Sagi Grimberg
  2022-03-23 14:45     ` [PATCH v3 0/1] " Anton Eidelman
  1 sibling, 0 replies; 22+ messages in thread
From: Sagi Grimberg @ 2022-03-23  9:23 UTC (permalink / raw)
  To: Anton Eidelman, linux-nvme, hch, kbusch, axboe; +Cc: ushankar, Anton Eidelman



On 3/23/22 05:55, Anton Eidelman wrote:
> nvme_mpath_init_identify() invoked from nvme_init_identify()
> fetches a fresh ANA log from the ctrl.
> This is essential to have an up to date path states
> for both existing namespaces and for those scan_work
> may discover once the ctrl is up.
> 
> This happens in the following cases:
> 1) A new ctrl is being connected.
> 2) An existing ctrl is successfully reconnected.
> 3) An existing ctrl is being reset.
> 
> While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
> and nvme_read_ana_log() may call nvme_update_ns_ana_state().
> 
> This result in a hang when the ANA state of an existing namespace
> changes and makes the disk live: nvme_mpath_set_live() issues
> IO to the namespace through the ctrl,
> which does NOT have IO queues yet.

Given that setting the path as live triggers I/O, it makes perfect
sense to do it only after we have I/O queues that can accept this
I/O.

> 
> See sample hang below.
> 
> Solution:
> - nvme_update_ns_ana_state() to call set_live only if ctrl is live
> - nvme_read_ana_log() call from nvme_mpath_init_identify()
>    therefore only fetches and parses the ANA log;
>    any erros in this process will fail the ctrl setup as appropriate;
> - a separate function nvme_mpath_update()
>    is called in nvme_start_ctrl();
>    this parses the ANA log without fetching it.
>    At this point the ctrl is live,
>    therefore, disks can be set live normally.
> 
> Sample failure:
>      nvme nvme0: starting error recovery
>      nvme nvme0: Reconnecting in 10 seconds...
>      block nvme0n6: no usable path - requeuing I/O
>      INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
>            Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
>      Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
>      Call Trace:
>       __schedule+0x2a2/0x7e0
>       schedule+0x4e/0xb0
>       io_schedule+0x16/0x40
>       wait_on_page_bit_common+0x15c/0x3e0
>       do_read_cache_page+0x1e0/0x410
>       read_cache_page+0x12/0x20
>       read_part_sector+0x46/0x100
>       read_lba+0x121/0x240
>       efi_partition+0x1d2/0x6a0
>       bdev_disk_changed.part.0+0x1df/0x430
>       bdev_disk_changed+0x18/0x20
>       blkdev_get_whole+0x77/0xe0
>       blkdev_get_by_dev+0xd2/0x3a0
>       __device_add_disk+0x1ed/0x310
>       device_add_disk+0x13/0x20
>       nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
>       nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
>       nvme_update_ana_state+0xca/0xe0 [nvme_core]
>       nvme_parse_ana_log+0xac/0x170 [nvme_core]
>       nvme_read_ana_log+0x7d/0xe0 [nvme_core]
>       nvme_mpath_init_identify+0x105/0x150 [nvme_core]
>       nvme_init_identify+0x2df/0x4d0 [nvme_core]
>       nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
>       nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
>       nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
>       process_one_work+0x1bd/0x360
>       worker_thread+0x50/0x3d0
> 
> Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
> ---
>   drivers/nvme/host/core.c      |  1 +
>   drivers/nvme/host/multipath.c | 12 +++++++++++-
>   drivers/nvme/host/nvme.h      |  4 ++++
>   3 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 3c0461129bca..d4495fa97657 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4509,6 +4509,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
>   	if (ctrl->queue_count > 1) {
>   		nvme_queue_scan(ctrl);
>   		nvme_start_queues(ctrl);
> +		nvme_mpath_update(ctrl);
>   	}
>   
>   	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index c97d7f843977..a22d455f48a9 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -613,7 +613,8 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
>   	ns->ana_state = desc->state;
>   	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
>   

I think we need a comment here explaining this. something like:

	/*
	 * nvme_mpath_set_live() will trigger I/O to the mpath
	 * device node and in turn to this path device, however we
	 * cannot accept this I/O if the ctrl is not live, this may
	 * deadlock if called from the nvme_mpath_init_identify and
	 * the controller will never complete initialization, preventing
	 * I/O from completing. For this we will reprocess the ANA log
	 * page again once the controller will be fully initialized.
	 */

> -	if (nvme_state_is_live(ns->ana_state))
> +	if (nvme_state_is_live(ns->ana_state) &&
> +		(ns->ctrl->state == NVME_CTRL_LIVE))

No need for the brackets I think, and

Alignment should be:
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)

>   		nvme_mpath_set_live(ns);
>   }
>   
> @@ -700,6 +701,15 @@ static void nvme_ana_work(struct work_struct *work)
>   	nvme_read_ana_log(ctrl);
>   }
>   
> +void nvme_mpath_update(struct nvme_ctrl *ctrl)
> +{
> +	u32 nr_change_groups = 0;
> +
> +	mutex_lock(&ctrl->ana_lock);
> +	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
> +	mutex_unlock(&ctrl->ana_lock);
> +}
> +
>   static void nvme_anatt_timeout(struct timer_list *t)
>   {
>   	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 1ea908d43e17..76f7a5f37379 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -781,6 +781,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
>   void nvme_mpath_remove_disk(struct nvme_ns_head *head);
>   int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
>   void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
> +void nvme_mpath_update(struct nvme_ctrl *ctrl);
>   void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
>   void nvme_mpath_stop(struct nvme_ctrl *ctrl);
>   bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
> @@ -852,6 +853,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
>   "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
>   	return 0;
>   }
> +void nvme_mpath_update(struct nvme_ctrl *ctrl)
> +{
> +}
>   static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
>   {
>   }


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v3 0/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23  3:55   ` [PATCH v2 1/1] " Anton Eidelman
  2022-03-23  9:23     ` Sagi Grimberg
@ 2022-03-23 14:45     ` Anton Eidelman
  2022-03-23 14:45       ` [PATCH v3 1/1] " Anton Eidelman
  1 sibling, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2022-03-23 14:45 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: ushankar, Anton Eidelman

Fixed cosmetics.
Added a comment in nvme_update_ns_ana_state()
explaining the condition for nvme_mpath_set_live() call.

Anton Eidelman (1):
  nvme/mpath: fix hang when disk goes live over reconnect

 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 23 +++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 26 insertions(+), 2 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v3 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23 14:45     ` [PATCH v3 0/1] " Anton Eidelman
@ 2022-03-23 14:45       ` Anton Eidelman
  2022-03-23 14:55         ` Sagi Grimberg
  2022-03-23 15:22         ` Christoph Hellwig
  0 siblings, 2 replies; 22+ messages in thread
From: Anton Eidelman @ 2022-03-23 14:45 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: ushankar, Anton Eidelman

nvme_mpath_init_identify() invoked from nvme_init_identify()
fetches a fresh ANA log from the ctrl.
This is essential to have up-to-date path states
for both existing namespaces and for those scan_work
may discover once the ctrl is up.

This happens in the following cases:
1) A new ctrl is being connected.
2) An existing ctrl is successfully reconnected.
3) An existing ctrl is being reset.

While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
and nvme_read_ana_log() may call nvme_update_ns_ana_state().

This results in a hang when the ANA state of an existing namespace
changes and makes the disk live: nvme_mpath_set_live() issues
IO to the namespace through the ctrl,
which does NOT have IO queues yet.

See sample hang below.

Solution:
- nvme_update_ns_ana_state() to call set_live only if ctrl is live
- nvme_read_ana_log() call from nvme_mpath_init_identify()
  therefore only fetches and parses the ANA log;
  any errors in this process will fail the ctrl setup as appropriate;
- a separate function nvme_mpath_update()
  is called in nvme_start_ctrl();
  this parses the ANA log without fetching it.
  At this point the ctrl is live,
  therefore, disks can be set live normally.

Sample failure:
    nvme nvme0: starting error recovery
    nvme nvme0: Reconnecting in 10 seconds...
    block nvme0n6: no usable path - requeuing I/O
    INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
          Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
    Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
    Call Trace:
     __schedule+0x2a2/0x7e0
     schedule+0x4e/0xb0
     io_schedule+0x16/0x40
     wait_on_page_bit_common+0x15c/0x3e0
     do_read_cache_page+0x1e0/0x410
     read_cache_page+0x12/0x20
     read_part_sector+0x46/0x100
     read_lba+0x121/0x240
     efi_partition+0x1d2/0x6a0
     bdev_disk_changed.part.0+0x1df/0x430
     bdev_disk_changed+0x18/0x20
     blkdev_get_whole+0x77/0xe0
     blkdev_get_by_dev+0xd2/0x3a0
     __device_add_disk+0x1ed/0x310
     device_add_disk+0x13/0x20
     nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
     nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
     nvme_update_ana_state+0xca/0xe0 [nvme_core]
     nvme_parse_ana_log+0xac/0x170 [nvme_core]
     nvme_read_ana_log+0x7d/0xe0 [nvme_core]
     nvme_mpath_init_identify+0x105/0x150 [nvme_core]
     nvme_init_identify+0x2df/0x4d0 [nvme_core]
     nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
     nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
     nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
     process_one_work+0x1bd/0x360
     worker_thread+0x50/0x3d0

Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
---
 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 23 +++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3c0461129bca..d4495fa97657 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4509,6 +4509,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	if (ctrl->queue_count > 1) {
 		nvme_queue_scan(ctrl);
 		nvme_start_queues(ctrl);
+		nvme_mpath_update(ctrl);
 	}
 
 	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c97d7f843977..12d4afde3662 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -612,8 +612,18 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
-
-	if (nvme_state_is_live(ns->ana_state))
+	/*
+	 * nvme_mpath_set_live() will trigger I/O to the mpath
+	 * device node and in turn to this path device, however we
+	 * cannot accept this I/O if the ctrl is not live.
+	 * This may deadlock if called from the nvme_mpath_init_identify()
+	 * and the ctrl will never complete initialization,
+	 * preventing I/O from completing.
+	 * For this case we will reprocess the ANA log page
+	 * in nvme_mpath_update() once the ctrl ready.
+	 */
+	if (nvme_state_is_live(ns->ana_state) &&
+	    ns->ctrl->state == NVME_CTRL_LIVE)
 		nvme_mpath_set_live(ns);
 }
 
@@ -700,6 +710,15 @@ static void nvme_ana_work(struct work_struct *work)
 	nvme_read_ana_log(ctrl);
 }
 
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	u32 nr_change_groups = 0;
+
+	mutex_lock(&ctrl->ana_lock);
+	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
+	mutex_unlock(&ctrl->ana_lock);
+}
+
 static void nvme_anatt_timeout(struct timer_list *t)
 {
 	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ea908d43e17..76f7a5f37379 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -781,6 +781,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_mpath_update(struct nvme_ctrl *ctrl);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
@@ -852,6 +853,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
 "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
 	return 0;
 }
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+}
 static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23 14:45       ` [PATCH v3 1/1] " Anton Eidelman
@ 2022-03-23 14:55         ` Sagi Grimberg
  2022-03-23 15:22         ` Christoph Hellwig
  1 sibling, 0 replies; 22+ messages in thread
From: Sagi Grimberg @ 2022-03-23 14:55 UTC (permalink / raw)
  To: Anton Eidelman, linux-nvme, hch, kbusch, axboe; +Cc: ushankar, Anton Eidelman

This looks good to me Anton,

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v3 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23 14:45       ` [PATCH v3 1/1] " Anton Eidelman
  2022-03-23 14:55         ` Sagi Grimberg
@ 2022-03-23 15:22         ` Christoph Hellwig
  2022-03-24 19:05           ` [PATCH v4 0/1] " Anton Eidelman
  1 sibling, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2022-03-23 15:22 UTC (permalink / raw)
  To: Anton Eidelman
  Cc: linux-nvme, hch, kbusch, sagi, axboe, ushankar, Anton Eidelman

Thanks,

applied to nvme-5.18.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v4 0/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-23 15:22         ` Christoph Hellwig
@ 2022-03-24 19:05           ` Anton Eidelman
  2022-03-24 19:05             ` [PATCH v4 1/1] " Anton Eidelman
  0 siblings, 1 reply; 22+ messages in thread
From: Anton Eidelman @ 2022-03-24 19:05 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: Anton Eidelman

Fixed:
1) Build error with CONFIG_NVME_MULTIPATH=n
   - missing "static inline" for the nvme_mpath_update() stub.
2) Crash when CONFIG_NVME_MULTIPATH=y but
   either multipath is off or the ctrl is not ANA-enabled.

Anton Eidelman (1):
  nvme/mpath: fix hang when disk goes live over reconnect

 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 26 ++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 29 insertions(+), 2 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v4 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-24 19:05           ` [PATCH v4 0/1] " Anton Eidelman
@ 2022-03-24 19:05             ` Anton Eidelman
  2022-03-24 21:06               ` Sagi Grimberg
  2022-03-25  6:36               ` Christoph Hellwig
  0 siblings, 2 replies; 22+ messages in thread
From: Anton Eidelman @ 2022-03-24 19:05 UTC (permalink / raw)
  To: linux-nvme, hch, kbusch, sagi, axboe; +Cc: Anton Eidelman

nvme_mpath_init_identify() invoked from nvme_init_identify()
fetches a fresh ANA log from the ctrl.
This is essential to have up-to-date path states
for both existing namespaces and for those scan_work
may discover once the ctrl is up.

This happens in the following cases:
1) A new ctrl is being connected.
2) An existing ctrl is successfully reconnected.
3) An existing ctrl is being reset.

While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces,
and nvme_read_ana_log() may call nvme_update_ns_ana_state().

This results in a hang when the ANA state of an existing namespace
changes and makes the disk live: nvme_mpath_set_live() issues
IO to the namespace through the ctrl,
which does NOT have IO queues yet.

See sample hang below.

Solution:
- nvme_update_ns_ana_state() to call set_live only if ctrl is live
- nvme_read_ana_log() call from nvme_mpath_init_identify()
  therefore only fetches and parses the ANA log;
  any errors in this process will fail the ctrl setup as appropriate;
- a separate function nvme_mpath_update()
  is called in nvme_start_ctrl();
  this parses the ANA log without fetching it.
  At this point the ctrl is live,
  therefore, disks can be set live normally.

Sample failure:
    nvme nvme0: starting error recovery
    nvme nvme0: Reconnecting in 10 seconds...
    block nvme0n6: no usable path - requeuing I/O
    INFO: task kworker/u8:3:312 blocked for more than 122 seconds.
          Tainted: G            E     5.14.5-1.el7.elrepo.x86_64 #1
    Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp]
    Call Trace:
     __schedule+0x2a2/0x7e0
     schedule+0x4e/0xb0
     io_schedule+0x16/0x40
     wait_on_page_bit_common+0x15c/0x3e0
     do_read_cache_page+0x1e0/0x410
     read_cache_page+0x12/0x20
     read_part_sector+0x46/0x100
     read_lba+0x121/0x240
     efi_partition+0x1d2/0x6a0
     bdev_disk_changed.part.0+0x1df/0x430
     bdev_disk_changed+0x18/0x20
     blkdev_get_whole+0x77/0xe0
     blkdev_get_by_dev+0xd2/0x3a0
     __device_add_disk+0x1ed/0x310
     device_add_disk+0x13/0x20
     nvme_mpath_set_live+0x138/0x1b0 [nvme_core]
     nvme_update_ns_ana_state+0x2b/0x30 [nvme_core]
     nvme_update_ana_state+0xca/0xe0 [nvme_core]
     nvme_parse_ana_log+0xac/0x170 [nvme_core]
     nvme_read_ana_log+0x7d/0xe0 [nvme_core]
     nvme_mpath_init_identify+0x105/0x150 [nvme_core]
     nvme_init_identify+0x2df/0x4d0 [nvme_core]
     nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core]
     nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp]
     nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
     process_one_work+0x1bd/0x360
     worker_thread+0x50/0x3d0

Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
---
 drivers/nvme/host/core.c      |  1 +
 drivers/nvme/host/multipath.c | 26 ++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ccc5877d514b..8cb1197aac42 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4511,6 +4511,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	if (ctrl->queue_count > 1) {
 		nvme_queue_scan(ctrl);
 		nvme_start_queues(ctrl);
+		nvme_mpath_update(ctrl);
 	}
 
 	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c97d7f843977..6f4d3108f35b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -612,8 +612,18 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
-
-	if (nvme_state_is_live(ns->ana_state))
+	/*
+	 * nvme_mpath_set_live() will trigger I/O to the mpath
+	 * device node and in turn to this path device, however we
+	 * cannot accept this I/O if the ctrl is not live.
+	 * This may deadlock if called from the nvme_mpath_init_identify()
+	 * and the ctrl will never complete initialization,
+	 * preventing I/O from completing.
+	 * For this case we will reprocess the ANA log page
+	 * in nvme_mpath_update() once the ctrl ready.
+	 */
+	if (nvme_state_is_live(ns->ana_state) &&
+	    ns->ctrl->state == NVME_CTRL_LIVE)
 		nvme_mpath_set_live(ns);
 }
 
@@ -700,6 +710,18 @@ static void nvme_ana_work(struct work_struct *work)
 	nvme_read_ana_log(ctrl);
 }
 
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	u32 nr_change_groups = 0;
+
+	if (!ctrl->ana_log_buf)
+		return;
+
+	mutex_lock(&ctrl->ana_lock);
+	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
+	mutex_unlock(&ctrl->ana_lock);
+}
+
 static void nvme_anatt_timeout(struct timer_list *t)
 {
 	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ea908d43e17..8cf90c9ed667 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -781,6 +781,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_mpath_update(struct nvme_ctrl *ctrl);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
@@ -852,6 +853,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
 "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
 	return 0;
 }
+static inline void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+}
 static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.25.1



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v4 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-24 19:05             ` [PATCH v4 1/1] " Anton Eidelman
@ 2022-03-24 21:06               ` Sagi Grimberg
  2022-03-25  6:36               ` Christoph Hellwig
  1 sibling, 0 replies; 22+ messages in thread
From: Sagi Grimberg @ 2022-03-24 21:06 UTC (permalink / raw)
  To: Anton Eidelman, linux-nvme, hch, kbusch, axboe; +Cc: Anton Eidelman

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v4 1/1] nvme/mpath: fix hang when disk goes live over reconnect
  2022-03-24 19:05             ` [PATCH v4 1/1] " Anton Eidelman
  2022-03-24 21:06               ` Sagi Grimberg
@ 2022-03-25  6:36               ` Christoph Hellwig
  1 sibling, 0 replies; 22+ messages in thread
From: Christoph Hellwig @ 2022-03-25  6:36 UTC (permalink / raw)
  To: Anton Eidelman; +Cc: linux-nvme, hch, kbusch, sagi, axboe, Anton Eidelman

Thanks,

applied to nvme-5.18.


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-03-25  6:36 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-18 21:57 [PATCH] nvme/mpath: fix hang when disk goes live over reconnect Anton Eidelman
2021-09-19 10:19 ` Sagi Grimberg
2021-09-19 23:05   ` Anton Eidelman
2021-09-20  7:55     ` Sagi Grimberg
2021-09-20  6:35 ` Christoph Hellwig
2021-09-20 14:53   ` Anton Eidelman
2021-09-21  7:15     ` Christoph Hellwig
2021-10-04 16:46       ` Anton Eidelman
2021-10-04 16:57         ` Christoph Hellwig
2021-10-05 12:38           ` Sagi Grimberg
     [not found]             ` <368499da-c117-e8b7-7b1b-46894e1e0b48@grimberg.me>
2021-10-19 15:13               ` Anton Eidelman
2022-03-23  3:55 ` [PATCH v2 0/1] " Anton Eidelman
2022-03-23  3:55   ` [PATCH v2 1/1] " Anton Eidelman
2022-03-23  9:23     ` Sagi Grimberg
2022-03-23 14:45     ` [PATCH v3 0/1] " Anton Eidelman
2022-03-23 14:45       ` [PATCH v3 1/1] " Anton Eidelman
2022-03-23 14:55         ` Sagi Grimberg
2022-03-23 15:22         ` Christoph Hellwig
2022-03-24 19:05           ` [PATCH v4 0/1] " Anton Eidelman
2022-03-24 19:05             ` [PATCH v4 1/1] " Anton Eidelman
2022-03-24 21:06               ` Sagi Grimberg
2022-03-25  6:36               ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.