Hi, This update version makes sure the unfreeze call is done when recreating the queues. I was able to reproduce hanging I/Os when we go into error recovery mode for FC and TCP [1]. Unfortunatly, I don't have access to a RDMA setup to verify but as the code is identically to the TCP, RDMA looks to like to suffer from the same problem. Thanks, Daniel [1] https://lore.kernel.org/linux-nvme/20210730094907.5vg7qebggttibogz@beryllium.lan/ https://lore.kernel.org/linux-nvme/20210730113415.wezsrvxv5cu4yz4x@beryllium.lan/ v1: - https://lore.kernel.org/linux-nvme/20210625101649.49296-1-dwagner@suse.de/ v2: - https://lore.kernel.org/linux-nvme/20210708092755.15660-1-dwagner@suse.de/ - reviewed tags collected - added 'update hardware queues' for all transport - added fix for fc hanger in nvme_wait_freeze_timeout v3: - https://lore.kernel.org/linux-nvme/20210720124353.127959-1-dwagner@suse.de/ - dropped 'nvme-fc: Freeze queues before destroying them' - added James' two patches v4: - added 'nvme-*: Unfreeze queues on reconnect' - added Hannes' reviewed tags Daniel Wagner (5): nvme-fc: Update hardware queues before using them nvme-rdma: Update number of hardware queues before using them nvme-fc: Wait with a timeout for queue to freeze nvme-tcp: Unfreeze queues on reconnect nvme-rdma: Unfreeze queues on reconnect Hannes Reinecke (1): nvme-tcp: Update number of hardware queues before using them James Smart (2): nvme-fc: avoid race between time out and tear down nvme-fc: fix controller reset hang during traffic drivers/nvme/host/fc.c | 28 +++++++++++++++++++--------- drivers/nvme/host/rdma.c | 15 ++++++++------- drivers/nvme/host/tcp.c | 18 +++++++++--------- 3 files changed, 36 insertions(+), 25 deletions(-) -- 2.29.2
In case the number of hardware queues changes, do the update the tagset and ctx to hctx first before using the mapping to recreate and connnect the IO queues. Reviewed-by: James Smart <jsmart2021@gmail.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/fc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 7f462af1b02a..8a903769364f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2951,14 +2951,6 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.queue_count == 1) return 0; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); - if (ret) - goto out_free_io_queues; - - ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); - if (ret) - goto out_delete_hw_queues; - if (prior_ioq_cnt != nr_io_queues) { dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", @@ -2968,6 +2960,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) nvme_unfreeze(&ctrl->ctrl); } + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_io_queues; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + return 0; out_delete_hw_queues: -- 2.29.2
From: Hannes Reinecke <hare@suse.de> When the number of hardware queues changes during resetting we should update the tagset first before using it. Signed-off-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/tcp.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0a97ba02f61e..32268f24f62a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1789,6 +1789,7 @@ static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { int ret; + u32 prior_q_cnt = ctrl->queue_count; ret = nvme_tcp_alloc_io_queues(ctrl); if (ret) @@ -1806,14 +1807,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - } - - ret = nvme_tcp_start_io_queues(ctrl); - if (ret) - goto out_cleanup_connect_q; - - if (!new) { - nvme_start_queues(ctrl); + } else if (prior_q_cnt != ctrl->queue_count) { if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -1828,6 +1822,10 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) nvme_unfreeze(ctrl); } + ret = nvme_tcp_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + return 0; out_wait_freeze_timed_out: -- 2.29.2
When the number of hardware queues changes during resetting we should update the tagset first before using it. Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/rdma.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 69ae67652f38..de2a8950d282 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -965,6 +965,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; + u32 prior_q_cnt = ctrl->ctrl.queue_count; ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) @@ -982,13 +983,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } - - ret = nvme_rdma_start_io_queues(ctrl); - if (ret) - goto out_cleanup_connect_q; - - if (!new) { + } else if (prior_q_cnt != ctrl->ctrl.queue_count) { nvme_start_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* @@ -1004,6 +999,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) nvme_unfreeze(&ctrl->ctrl); } + ret = nvme_rdma_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + return 0; out_wait_freeze_timed_out: -- 2.29.2
Do not wait indifinitly for all queues to freeze. Instead use a timeout and abort the operation if we get stuck. Reviewed-by: James Smart <jsmart2021@gmail.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/fc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 8a903769364f..dbb8ad816df8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2955,7 +2955,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", prior_ioq_cnt, nr_io_queues); - nvme_wait_freeze(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + return -ENODEV; + } blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); nvme_unfreeze(&ctrl->ctrl); } -- 2.29.2
From: James Smart <jsmart2021@gmail.com> To avoid race between time out and tear down, in tear down process, first we quiesce the queue, and then delete the timer and cancel the time out work for the queue. This patch merges the admin and io sync ops into the queue teardown logic as shown in the RDMA patch 3017013dcc "nvme-rdma: avoid race between time out and tear down". There is no teardown_lock in nvme-fc. Signed-off-by: James Smart <jsmart2021@gmail.com> CC: Chao Leng <lengchao@huawei.com> Tested-by: Daniel Wagner <dwagner@suse.de> [dwagner: updated commit id referenced in commit message] Reviewed-by: Daniel Wagner <dwagner@suse.de> Reviewed-by: Hannes Reinecke <hare@suse.de> --- drivers/nvme/host/fc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index dbb8ad816df8..133b87db4f1d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2487,6 +2487,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) */ if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); @@ -2510,6 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * clean up the admin queue. Same thing as above. */ blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); -- 2.29.2
From: James Smart <jsmart2021@gmail.com> commit fe35ec58f0d3 ("block: update hctx map when use multiple maps") exposed an issue where we may hang trying to wait for queue freeze during I/O. We call blk_mq_update_nr_hw_queues which may attempt to freeze the queue. However we never started queue freeze when starting the reset, which means that we have inflight pending requests that entered the queue that we will not complete once the queue is quiesced. So start a freeze before we quiesce the queue, and unfreeze the queue after we successfully connected the I/O queues (the unfreeze is already present in the code). blk_mq_update_nr_hw_queues will be called only after we are sure that the queue was already frozen. This follows to how the pci driver handles resets. This patch added logic introduced in commit 9f98772ba307 "nvme-rdma: fix controller reset hang during traffic". Signed-off-by: James Smart <jsmart2021@gmail.com> CC: Sagi Grimberg <sagi@grimberg.me> [dwagner: call nvme_unfreeze() unconditionally in nvme_fc_recreate_io_queues() to match the nvme_start_freeze()] Tested-by: Daniel Wagner <dwagner@suse.de> Reviewed-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/fc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 133b87db4f1d..b292af0fd655 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2486,6 +2486,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * (but with error status). */ if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, @@ -2966,8 +2967,8 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) return -ENODEV; } blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); - nvme_unfreeze(&ctrl->ctrl); } + nvme_unfreeze(&ctrl->ctrl); ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) -- 2.29.2
During the queue teardown in nvme_tcp_teardown_io_queues() freeze is called unconditionally. When we reconnect we need to pair the freeze with an unfreeze to avoid hanging I/Os. For newly created connection this is not needed. Fixes: 2875b0aecabe ("nvme-tcp: fix controller reset hang during traffic") Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 32268f24f62a..097f7dd10ed3 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1819,9 +1819,11 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) } blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1); - nvme_unfreeze(ctrl); } + if (!new) + nvme_unfreeze(ctrl); + ret = nvme_tcp_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; -- 2.29.2
During the queue teardown in nvme_rdma_teardown_io_queues() freeze is called unconditionally. When we reconnect we need to pair the freeze with an unfreeze to avoid hanging I/Os. For newly created connection this is not needed. Fixes: 9f98772ba307 ("nvme-rdma: fix controller reset hang during traffic") Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index de2a8950d282..21a8a5353af0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -901,6 +901,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, error = PTR_ERR(ctrl->ctrl.admin_q); goto out_cleanup_fabrics_q; } + } else { + nvme_unfreeze(&ctrl->ctrl); } error = nvme_rdma_start_queue(ctrl, 0); -- 2.29.2
Do not wait indifinitly for all queues to freeze. Instead use a timeout and abort the operation if we get stuck. Reviewed-by: James Smart <jsmart2021@gmail.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Daniel Wagner <dwagner@suse.de> --- drivers/nvme/host/fc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 8a903769364f..dbb8ad816df8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2955,7 +2955,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", prior_ioq_cnt, nr_io_queues); - nvme_wait_freeze(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + return -ENODEV; + } blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); nvme_unfreeze(&ctrl->ctrl); } -- 2.29.2
On 8/2/21 6:26 AM, Daniel Wagner wrote:
> Do not wait indifinitly for all queues to freeze. Instead use a
> timeout and abort the operation if we get stuck.
>
> Reviewed-by: James Smart <jsmart2021@gmail.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
> drivers/nvme/host/fc.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 8a903769364f..dbb8ad816df8 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2955,7 +2955,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
> dev_info(ctrl->ctrl.device,
> "reconnect: revising io queue count from %d to %d\n",
> prior_ioq_cnt, nr_io_queues);
> - nvme_wait_freeze(&ctrl->ctrl);
> + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
> + /*
> + * If we timed out waiting for freeze we are likely to
> + * be stuck. Fail the controller initialization just
> + * to be safe.
> + */
> + return -ENODEV;
> + }
> blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
> nvme_unfreeze(&ctrl->ctrl);
> }
>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
--
Himanshu Madhani Oracle Linux Engineering