linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
@ 2024-03-05  8:00 Daniel Wagner
  2024-03-05  8:00 ` [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries Daniel Wagner
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Daniel Wagner @ 2024-03-05  8:00 UTC (permalink / raw)
  To: James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel, Daniel Wagner

I've picked up Hannes' DNR patches. In short, they make the transports behave the
same way when the DNR bit is set on a re-connect attempt. We discussed this
topic in the past and, if I got this right, we all agreed that the host should
honor the DNR bit on a connect attempt [1].

The nvme/045 test case (authentication tests) in blktests is a good test case
for this after extending it slightly. TCP and RDMA try to reconnect with an
invalid key over and over again, while loop and FC stop after the first fail.

[1] https://lore.kernel.org/linux-nvme/20220927143157.3659-1-dwagner@suse.de/

changes:
v3:
  - added my SOB tag
  - fixed indentation

v2:
  - refresh/rebase on current head
  - extended blktests (nvme/045) to cover this case
    (see separate post)
  - https://lore.kernel.org/linux-nvme/20240304161006.19328-1-dwagner@suse.de/
  
v1:
  - initial version
  - https://lore.kernel.org/linux-nvme/20210623143250.82445-1-hare@suse.de/

Hannes Reinecke (2):
  nvme-tcp: short-circuit reconnect retries
  nvme-rdma: short-circuit reconnect retries

 drivers/nvme/host/rdma.c | 22 +++++++++++++++-------
 drivers/nvme/host/tcp.c  | 23 +++++++++++++++--------
 2 files changed, 30 insertions(+), 15 deletions(-)

-- 
2.44.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries
  2024-03-05  8:00 [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Daniel Wagner
@ 2024-03-05  8:00 ` Daniel Wagner
  2024-03-06  8:10   ` Chaitanya Kulkarni
  2024-03-05  8:00 ` [PATCH v3 2/2] nvme-rdma: " Daniel Wagner
  2024-03-07  8:00 ` [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Sagi Grimberg
  2 siblings, 1 reply; 13+ messages in thread
From: Daniel Wagner @ 2024-03-05  8:00 UTC (permalink / raw)
  To: James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel, Daniel Wagner

From: Hannes Reinecke <hare@suse.de>

Returning an nvme status from nvme_tcp_setup_ctrl() indicates
that the association was established and we have received a status
from the controller; consequently we should honour the DNR bit.
If not any future reconnect attempts will just return the same error, so
we can short-circuit the reconnect attempts and fail the connection
directly.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3b20c5ed033f..f9ad5904ed62 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2149,9 +2149,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 	nvme_tcp_destroy_io_queues(ctrl, remove);
 }
 
-static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
+		int status)
 {
 	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+	bool recon = status > 0 && (status & NVME_SC_DNR) ? false : true;
 
 	/* If we are resetting/deleting then do nothing */
 	if (state != NVME_CTRL_CONNECTING) {
@@ -2159,13 +2161,14 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
 		return;
 	}
 
-	if (nvmf_should_reconnect(ctrl)) {
+	if (recon && nvmf_should_reconnect(ctrl)) {
 		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
 			ctrl->opts->reconnect_delay);
 		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
 				ctrl->opts->reconnect_delay * HZ);
 	} else {
-		dev_info(ctrl->device, "Removing controller...\n");
+		dev_info(ctrl->device, "Removing controller (%d)...\n",
+			 status);
 		nvme_delete_ctrl(ctrl);
 	}
 }
@@ -2246,10 +2249,12 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
 	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
 			struct nvme_tcp_ctrl, connect_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+	int ret;
 
 	++ctrl->nr_reconnects;
 
-	if (nvme_tcp_setup_ctrl(ctrl, false))
+	ret = nvme_tcp_setup_ctrl(ctrl, false);
+	if (ret)
 		goto requeue;
 
 	dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
@@ -2262,7 +2267,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
 requeue:
 	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
 			ctrl->nr_reconnects);
-	nvme_tcp_reconnect_or_remove(ctrl);
+	nvme_tcp_reconnect_or_remove(ctrl, ret);
 }
 
 static void nvme_tcp_error_recovery_work(struct work_struct *work)
@@ -2289,7 +2294,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 		return;
 	}
 
-	nvme_tcp_reconnect_or_remove(ctrl);
+	nvme_tcp_reconnect_or_remove(ctrl, -ENOTCONN);
 }
 
 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
@@ -2309,6 +2314,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 {
 	struct nvme_ctrl *ctrl =
 		container_of(work, struct nvme_ctrl, reset_work);
+	int ret;
 
 	nvme_stop_ctrl(ctrl);
 	nvme_tcp_teardown_ctrl(ctrl, false);
@@ -2322,14 +2328,15 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 		return;
 	}
 
-	if (nvme_tcp_setup_ctrl(ctrl, false))
+	ret = nvme_tcp_setup_ctrl(ctrl, false);
+	if (ret)
 		goto out_fail;
 
 	return;
 
 out_fail:
 	++ctrl->nr_reconnects;
-	nvme_tcp_reconnect_or_remove(ctrl);
+	nvme_tcp_reconnect_or_remove(ctrl, ret);
 }
 
 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v3 2/2] nvme-rdma: short-circuit reconnect retries
  2024-03-05  8:00 [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Daniel Wagner
  2024-03-05  8:00 ` [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries Daniel Wagner
@ 2024-03-05  8:00 ` Daniel Wagner
  2024-03-06  8:11   ` Chaitanya Kulkarni
  2024-03-07  8:00 ` [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Sagi Grimberg
  2 siblings, 1 reply; 13+ messages in thread
From: Daniel Wagner @ 2024-03-05  8:00 UTC (permalink / raw)
  To: James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel, Daniel Wagner

From: Hannes Reinecke <hare@suse.de>

Returning an nvme status from nvme_rdma_setup_ctrl() indicates
that the association was established and we have received a status
from the controller; consequently we should honour the DNR bit.
If not any future reconnect attempts will just return the same error, so
we can short-circuit the reconnect attempts and fail the connection
directly.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 drivers/nvme/host/rdma.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d3747795ad80..7e556e10caba 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -982,9 +982,11 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 	kfree(ctrl);
 }
 
-static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl,
+		int status)
 {
 	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
+	bool recon = status > 0 && (status & NVME_SC_DNR) ? false : true;
 
 	/* If we are resetting/deleting then do nothing */
 	if (state != NVME_CTRL_CONNECTING) {
@@ -992,12 +994,14 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
 		return;
 	}
 
-	if (nvmf_should_reconnect(&ctrl->ctrl)) {
+	if (recon && nvmf_should_reconnect(&ctrl->ctrl)) {
 		dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
 			ctrl->ctrl.opts->reconnect_delay);
 		queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
 				ctrl->ctrl.opts->reconnect_delay * HZ);
 	} else {
+		dev_info(ctrl->ctrl.device, "Removing controller (%d)...\n",
+			 status);
 		nvme_delete_ctrl(&ctrl->ctrl);
 	}
 }
@@ -1098,10 +1102,12 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 {
 	struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvme_rdma_ctrl, reconnect_work);
+	int ret;
 
 	++ctrl->ctrl.nr_reconnects;
 
-	if (nvme_rdma_setup_ctrl(ctrl, false))
+	ret = nvme_rdma_setup_ctrl(ctrl, false);
+	if (ret)
 		goto requeue;
 
 	dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
@@ -1114,7 +1120,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
 			ctrl->ctrl.nr_reconnects);
-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, ret);
 }
 
 static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -1139,7 +1145,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 		return;
 	}
 
-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, -ENOTCONN);
 }
 
 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -2163,6 +2169,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 {
 	struct nvme_rdma_ctrl *ctrl =
 		container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
+	int ret;
 
 	nvme_stop_ctrl(&ctrl->ctrl);
 	nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -2173,14 +2180,15 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 		return;
 	}
 
-	if (nvme_rdma_setup_ctrl(ctrl, false))
+	ret = nvme_rdma_setup_ctrl(ctrl, false);
+	if (ret)
 		goto out_fail;
 
 	return;
 
 out_fail:
 	++ctrl->ctrl.nr_reconnects;
-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, ret);
 }
 
 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries
  2024-03-05  8:00 ` [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries Daniel Wagner
@ 2024-03-06  8:10   ` Chaitanya Kulkarni
  0 siblings, 0 replies; 13+ messages in thread
From: Chaitanya Kulkarni @ 2024-03-06  8:10 UTC (permalink / raw)
  To: Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel

On 3/5/24 00:00, Daniel Wagner wrote:
> From: Hannes Reinecke <hare@suse.de>
>
> Returning an nvme status from nvme_tcp_setup_ctrl() indicates
> that the association was established and we have received a status
> from the controller; consequently we should honour the DNR bit.
> If not any future reconnect attempts will just return the same error, so
> we can short-circuit the reconnect attempts and fail the connection
> directly.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>   drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
>   1 file changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 3b20c5ed033f..f9ad5904ed62 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2149,9 +2149,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
>   	nvme_tcp_destroy_io_queues(ctrl, remove);
>   }
>   
> -static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
> +		int status)
>   {
>   	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> +	bool recon = status > 0 && (status & NVME_SC_DNR) ? false : true;
>   

hmmm bunch of nvme command calls may have DNR set ...

Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/2] nvme-rdma: short-circuit reconnect retries
  2024-03-05  8:00 ` [PATCH v3 2/2] nvme-rdma: " Daniel Wagner
@ 2024-03-06  8:11   ` Chaitanya Kulkarni
  2024-03-06  8:17     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 13+ messages in thread
From: Chaitanya Kulkarni @ 2024-03-06  8:11 UTC (permalink / raw)
  To: Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel

On 3/5/24 00:00, Daniel Wagner wrote:
> From: Hannes Reinecke <hare@suse.de>
>
> Returning an nvme status from nvme_rdma_setup_ctrl() indicates
> that the association was established and we have received a status
> from the controller; consequently we should honour the DNR bit.
> If not any future reconnect attempts will just return the same error, so
> we can short-circuit the reconnect attempts and fail the connection
> directly.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>

same here, do we need a same fix for FC as well ? just curious ...

Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 2/2] nvme-rdma: short-circuit reconnect retries
  2024-03-06  8:11   ` Chaitanya Kulkarni
@ 2024-03-06  8:17     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 13+ messages in thread
From: Chaitanya Kulkarni @ 2024-03-06  8:17 UTC (permalink / raw)
  To: Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel

On 3/6/24 00:11, Chaitanya Kulkarni wrote:
> On 3/5/24 00:00, Daniel Wagner wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> Returning an nvme status from nvme_rdma_setup_ctrl() indicates
>> that the association was established and we have received a status
>> from the controller; consequently we should honour the DNR bit.
>> If not any future reconnect attempts will just return the same error, so
>> we can short-circuit the reconnect attempts and fail the connection
>> directly.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> Signed-off-by: Daniel Wagner <dwagner@suse.de>
>> ---
>>
> same here, do we need a same fix for FC as well ? just curious ...
>
> Looks good.
>
> Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
>
> -ck
>

fc already handles DNR in nvme_fc_reconnect_or_delete().

-ck



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-05  8:00 [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Daniel Wagner
  2024-03-05  8:00 ` [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries Daniel Wagner
  2024-03-05  8:00 ` [PATCH v3 2/2] nvme-rdma: " Daniel Wagner
@ 2024-03-07  8:00 ` Sagi Grimberg
  2024-03-07 10:37   ` Hannes Reinecke
  2 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2024-03-07  8:00 UTC (permalink / raw)
  To: Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, Hannes Reinecke, linux-nvme,
	linux-kernel


On 05/03/2024 10:00, Daniel Wagner wrote:
> I've picked up Hannes' DNR patches. In short the make the transports behave the
> same way when the DNR bit set on a re-connect attempt. We had a discussion this
> topic in the past and if I got this right we all agreed is that the host should
> honor the DNR bit on a connect attempt [1]
Umm, I don't recall this being conclusive though. The spec ought to be 
clearer here I think.
>
> The nvme/045 test case (authentication tests) in blktests is a good test case
> for this after extending it slightly. TCP and RDMA try to reconnect with an
> invalid key over and over again, while loop and FC stop after the first fail.

Who says that an invalid key is a permanent failure though?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07  8:00 ` [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Sagi Grimberg
@ 2024-03-07 10:37   ` Hannes Reinecke
  2024-03-07 11:30     ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2024-03-07 10:37 UTC (permalink / raw)
  To: Sagi Grimberg, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel

On 3/7/24 09:00, Sagi Grimberg wrote:
> 
> On 05/03/2024 10:00, Daniel Wagner wrote:
>> I've picked up Hannes' DNR patches. In short the make the transports 
>> behave the same way when the DNR bit set on a re-connect attempt. We
>> had a discussion this
>> topic in the past and if I got this right we all agreed is that the 
>> host should honor the DNR bit on a connect attempt [1]
> Umm, I don't recall this being conclusive though. The spec ought to be 
> clearer here I think.

I've asked the NVMexpress fmds group, and the response was pretty 
unanimous that the DNR bit on connect should be evaluated.

>>
>> The nvme/045 test case (authentication tests) in blktests is a good 
>> test case for this after extending it slightly. TCP and RDMA try to
>> reconnect with an
>> invalid key over and over again, while loop and FC stop after the 
>> first fail.
> 
> Who says that invalid key is a permanent failure though?
> 
See the response to the other patchset.
'Invalid key' in this context means that the _client_ evaluated the key 
as invalid, i.e. the key is unusable for the client.
As the key is passed in via the commandline there is no way the client
can ever change the value here, and no amount of retry will change 
things here. That's what we try to fix.

The controller surely can return an invalid key, but that's part of
the authentication protocol and will be evaluated differently.

Cheers,

Hannes


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07 10:37   ` Hannes Reinecke
@ 2024-03-07 11:30     ` Sagi Grimberg
  2024-03-07 11:45       ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2024-03-07 11:30 UTC (permalink / raw)
  To: Hannes Reinecke, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel



On 07/03/2024 12:37, Hannes Reinecke wrote:
> On 3/7/24 09:00, Sagi Grimberg wrote:
>>
>> On 05/03/2024 10:00, Daniel Wagner wrote:
>>> I've picked up Hannes' DNR patches. In short the make the transports 
>>> behave the same way when the DNR bit set on a re-connect attempt. We
>>> had a discussion this
>>> topic in the past and if I got this right we all agreed is that the 
>>> host should honor the DNR bit on a connect attempt [1]
>> Umm, I don't recall this being conclusive though. The spec ought to 
>> be clearer here I think.
>
> I've asked the NVMexpress fmds group, and the response was pretty 
> unanimous that the DNR bit on connect should be evaluated.

OK.

>
>>>
>>> The nvme/045 test case (authentication tests) in blktests is a good 
>>> test case for this after extending it slightly. TCP and RDMA try to
>>> reconnect with an
>>> invalid key over and over again, while loop and FC stop after the 
>>> first fail.
>>
>> Who says that invalid key is a permanent failure though?
>>
> See the response to the other patchset.
> 'Invalid key' in this context means that the _client_ evaluated the 
> key as invalid, ie the key is unusable for the client.
> As the key is passed in via the commandline there is no way the client
> can ever change the value here, and no amount of retry will change 
> things here. That's what we try to fix.

Where is this retried today, I don't see where connect failure is 
retried, outside of a periodic reconnect.
Maybe I'm missing what the actual failure here is.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07 11:30     ` Sagi Grimberg
@ 2024-03-07 11:45       ` Hannes Reinecke
  2024-03-07 12:14         ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2024-03-07 11:45 UTC (permalink / raw)
  To: Sagi Grimberg, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel

On 3/7/24 12:30, Sagi Grimberg wrote:
> 
> 
> On 07/03/2024 12:37, Hannes Reinecke wrote:
>> On 3/7/24 09:00, Sagi Grimberg wrote:
>>>
>>> On 05/03/2024 10:00, Daniel Wagner wrote:
>>>> I've picked up Hannes' DNR patches. In short the make the transports 
>>>> behave the same way when the DNR bit set on a re-connect attempt. We
>>>> had a discussion this
>>>> topic in the past and if I got this right we all agreed is that the 
>>>> host should honor the DNR bit on a connect attempt [1]
>>> Umm, I don't recall this being conclusive though. The spec ought to 
>>> be clearer here I think.
>>
>> I've asked the NVMexpress fmds group, and the response was pretty 
>> unanimous that the DNR bit on connect should be evaluated.
> 
> OK.
> 
>>
>>>>
>>>> The nvme/045 test case (authentication tests) in blktests is a good 
>>>> test case for this after extending it slightly. TCP and RDMA try to
>>>> reconnect with an
>>>> invalid key over and over again, while loop and FC stop after the 
>>>> first fail.
>>>
>>> Who says that invalid key is a permanent failure though?
>>>
>> See the response to the other patchset.
>> 'Invalid key' in this context means that the _client_ evaluated the 
>> key as invalid, ie the key is unusable for the client.
>> As the key is passed in via the commandline there is no way the client
>> can ever change the value here, and no amount of retry will change 
>> things here. That's what we try to fix.
> 
> Where is this retried today, I don't see where connect failure is 
> retried, outside of a periodic reconnect.
> Maybe I'm missing where what is the actual failure here.

static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
{
         struct nvme_tcp_ctrl *tcp_ctrl =
                         container_of(to_delayed_work(work),
                         struct nvme_tcp_ctrl, connect_work);
         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

         ++ctrl->nr_reconnects;

         if (nvme_tcp_setup_ctrl(ctrl, false))
                 goto requeue;

         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
                         ctrl->nr_reconnects);

         ctrl->nr_reconnects = 0;

         return;

requeue:
         dev_info(ctrl->device, "Failed reconnect attempt %d\n",

and nvme_tcp_setup_ctrl() returns either a negative errno or an NVMe 
status code (which might include the DNR bit).

Cheers,

Hannes


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07 11:45       ` Hannes Reinecke
@ 2024-03-07 12:14         ` Sagi Grimberg
  2024-03-07 12:52           ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2024-03-07 12:14 UTC (permalink / raw)
  To: Hannes Reinecke, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel



On 07/03/2024 13:45, Hannes Reinecke wrote:
> On 3/7/24 12:30, Sagi Grimberg wrote:
>>
>>
>> On 07/03/2024 12:37, Hannes Reinecke wrote:
>>> On 3/7/24 09:00, Sagi Grimberg wrote:
>>>>
>>>> On 05/03/2024 10:00, Daniel Wagner wrote:
>>>>> I've picked up Hannes' DNR patches. In short the make the 
>>>>> transports behave the same way when the DNR bit set on a 
>>>>> re-connect attempt. We
>>>>> had a discussion this
>>>>> topic in the past and if I got this right we all agreed is that 
>>>>> the host should honor the DNR bit on a connect attempt [1]
>>>> Umm, I don't recall this being conclusive though. The spec ought to 
>>>> be clearer here I think.
>>>
>>> I've asked the NVMexpress fmds group, and the response was pretty 
>>> unanimous that the DNR bit on connect should be evaluated.
>>
>> OK.
>>
>>>
>>>>>
>>>>> The nvme/045 test case (authentication tests) in blktests is a 
>>>>> good test case for this after extending it slightly. TCP and RDMA 
>>>>> try to
>>>>> reconnect with an
>>>>> invalid key over and over again, while loop and FC stop after the 
>>>>> first fail.
>>>>
>>>> Who says that invalid key is a permanent failure though?
>>>>
>>> See the response to the other patchset.
>>> 'Invalid key' in this context means that the _client_ evaluated the 
>>> key as invalid, ie the key is unusable for the client.
>>> As the key is passed in via the commandline there is no way the client
>>> can ever change the value here, and no amount of retry will change 
>>> things here. That's what we try to fix.
>>
>> Where is this retried today, I don't see where connect failure is 
>> retried, outside of a periodic reconnect.
>> Maybe I'm missing where what is the actual failure here.
>
> static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> {
>         struct nvme_tcp_ctrl *tcp_ctrl =
>                         container_of(to_delayed_work(work),
>                         struct nvme_tcp_ctrl, connect_work);
>         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
>         ++ctrl->nr_reconnects;
>
>         if (nvme_tcp_setup_ctrl(ctrl, false))
>                 goto requeue;
>
>         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
>                         ctrl->nr_reconnects);
>
>         ctrl->nr_reconnects = 0;
>
>         return;
>
> requeue:
>         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
>
> and nvme_tcp_setup_ctrl() returns either a negative errno or an NVMe 
> status code (which might include the DNR bit).

I thought this was about the initialization. Yes, today we ignore the
status in re-connection, assuming that whatever
happened may (or may not) resolve itself. The basis for this assumption
is that if we managed to connect the first
time there is no reason to assume that connecting again should fail 
persistently.

If there is a consensus that we should not assume it, its a valid 
argument. I didn't see where this happens with respect
to authentication though.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07 12:14         ` Sagi Grimberg
@ 2024-03-07 12:52           ` Hannes Reinecke
  2024-03-08 10:21             ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2024-03-07 12:52 UTC (permalink / raw)
  To: Sagi Grimberg, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel

On 3/7/24 13:14, Sagi Grimberg wrote:
> 
> 
> On 07/03/2024 13:45, Hannes Reinecke wrote:
>> On 3/7/24 12:30, Sagi Grimberg wrote:
>>>
[ .. ]
>>>
>>> Where is this retried today, I don't see where connect failure is 
>>> retried, outside of a periodic reconnect.
>>> Maybe I'm missing where what is the actual failure here.
>>
>> static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
>> {
>>         struct nvme_tcp_ctrl *tcp_ctrl =
>>                         container_of(to_delayed_work(work),
>>                         struct nvme_tcp_ctrl, connect_work);
>>         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>>
>>         ++ctrl->nr_reconnects;
>>
>>         if (nvme_tcp_setup_ctrl(ctrl, false))
>>                 goto requeue;
>>
>>         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
>>                         ctrl->nr_reconnects);
>>
>>         ctrl->nr_reconnects = 0;
>>
>>         return;
>>
>> requeue:
>>         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
>>
>> and nvme_tcp_setup_ctrl() returns either a negative errno or an NVMe 
>> status code (which might include the DNR bit).
> 
> I thought this is about the initialization. yes today we ignore the 
> status in re-connection assuming that whatever
> happened, may (or may not) resolve itself. The basis for this assumption 
> is that if we managed to connect the first
> time there is no reason to assume that connecting again should fail 
> persistently.
> 
And that is another issue where I'm not really comfortable with.
While it would make sense to have the connect functionality be
one-shot, and let userspace retry if needed, the problem is that we
don't have a means of transporting that information to userspace.
The only thing which we can transport is an error number, which
could be anything and mean anything.
If we had a defined way of stating: 'This is a retryable error, retry
with the same options.' vs 'This is a retryable error, retry with
modified options.' vs 'This is a non-retryable error, don't bother.'
I'd be fine with delegating retries to userspace.
But currently we don't.

> If there is a consensus that we should not assume it, its a valid 
> argument. I didn't see where this happens with respect
> to authentication though.

nvmf_connect_admin_queue():

             /* Authentication required */
             ret = nvme_auth_negotiate(ctrl, 0);
             if (ret) {
                     dev_warn(ctrl->device,
                              "qid 0: authentication setup failed\n");
                     ret = NVME_SC_AUTH_REQUIRED;
                     goto out_free_data;
             }
             ret = nvme_auth_wait(ctrl, 0);
             if (ret)
                     dev_warn(ctrl->device,
                              "qid 0: authentication failed\n");
             else
                     dev_info(ctrl->device,
                              "qid 0: authenticated\n");

The first call to 'nvme_auth_negotiate()' is just for setting up
the negotiation context and start the protocol. So if we get
an error here it's pretty much non-retryable as it's completely
controlled by the fabrics options.
nvme_auth_wait(), OTOH, contains the actual result from the negotiation,
so there we might or might not retry, depending on the value of 'ret'.

Cheers,

Hannes


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries
  2024-03-07 12:52           ` Hannes Reinecke
@ 2024-03-08 10:21             ` Sagi Grimberg
  0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2024-03-08 10:21 UTC (permalink / raw)
  To: Hannes Reinecke, Daniel Wagner, James Smart
  Cc: Keith Busch, Christoph Hellwig, linux-nvme, linux-kernel



On 07/03/2024 14:52, Hannes Reinecke wrote:
> On 3/7/24 13:14, Sagi Grimberg wrote:
>>
>>
>> On 07/03/2024 13:45, Hannes Reinecke wrote:
>>> On 3/7/24 12:30, Sagi Grimberg wrote:
>>>>
> [ .. ]
>>>>
>>>> Where is this retried today, I don't see where connect failure is 
>>>> retried, outside of a periodic reconnect.
>>>> Maybe I'm missing where what is the actual failure here.
>>>
>>> static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
>>> {
>>>         struct nvme_tcp_ctrl *tcp_ctrl =
>>>                         container_of(to_delayed_work(work),
>>>                         struct nvme_tcp_ctrl, connect_work);
>>>         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>>>
>>>         ++ctrl->nr_reconnects;
>>>
>>>         if (nvme_tcp_setup_ctrl(ctrl, false))
>>>                 goto requeue;
>>>
>>>         dev_info(ctrl->device, "Successfully reconnected (%d 
>>> attempt)\n",
>>>                         ctrl->nr_reconnects);
>>>
>>>         ctrl->nr_reconnects = 0;
>>>
>>>         return;
>>>
>>> requeue:
>>>         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
>>>
>>> and nvme_tcp_setup_ctrl() returns either a negative errno or an NVMe 
>>> status code (which might include the DNR bit).
>>
>> I thought this is about the initialization. yes today we ignore the 
>> status in re-connection assuming that whatever
>> happened, may (or may not) resolve itself. The basis for this 
>> assumption is that if we managed to connect the first
>> time there is no reason to assume that connecting again should fail 
>> persistently.
>>
> And that is another issue where I'm not really comfortable with.
> While it would make sense to have the connect functionality to be
> one-shot, and let userspace retry if needed, the problem is that we
> don't have a means of transporting that information to userspace.
> The only thing which we can transport is an error number, which
> could be anything and mean anything.

Not necessarily. Error code semantics exist for a reason.
I just really don't think that doing reconnects on a user-driven
initialization is a good idea at all, unlike the case where the
controller was connected and got disrupted, which is not user driven
and hence makes sense.

> If we had a defined way stating: 'This is a retryable, retry with the 
> same options.' vs 'This is retryable error, retry with modified 
> options.' vs 'This a non-retryable error, don't bother.' I'd be
> fine with delegating retries to userspace.
> But currently we don't.

Well, TBH I don't know if userspace even needs it. Most likely what a 
user would want is to define
a number of retries and give up if they expire. Adding the intelligence 
for what connect is retry-able or
not does not seem all that useful to me.

>
>> If there is a consensus that we should not assume it, its a valid 
>> argument. I didn't see where this happens with respect
>> to authentication though.
>
> nvmf_connect_admin_queue():
>
>             /* Authentication required */
>             ret = nvme_auth_negotiate(ctrl, 0);
>             if (ret) {
>                     dev_warn(ctrl->device,
>                              "qid 0: authentication setup failed\n");
>                     ret = NVME_SC_AUTH_REQUIRED;
>                     goto out_free_data;
>             }
>             ret = nvme_auth_wait(ctrl, 0);
>             if (ret)
>                     dev_warn(ctrl->device,
>                              "qid 0: authentication failed\n");
>             else
>                     dev_info(ctrl->device,
>                              "qid 0: authenticated\n");
>
> The first call to 'nvme_auth_negotiate()' is just for setting up
> the negotiation context and start the protocol. So if we get
> an error here it's pretty much non-retryable as it's completely
> controlled by the fabrics options.
> nvme_auth_wait(), OTOH, contains the actual result from the negotiation,
> so there we might or might not retry, depending on the value of 'ret'.
>
> Cheers,
>
> Hannes
>


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-03-08 10:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-05  8:00 [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Daniel Wagner
2024-03-05  8:00 ` [PATCH v3 1/2] nvme-tcp: short-circuit reconnect retries Daniel Wagner
2024-03-06  8:10   ` Chaitanya Kulkarni
2024-03-05  8:00 ` [PATCH v3 2/2] nvme-rdma: " Daniel Wagner
2024-03-06  8:11   ` Chaitanya Kulkarni
2024-03-06  8:17     ` Chaitanya Kulkarni
2024-03-07  8:00 ` [PATCH v3 0/2] nvme-fabrics: short-circuit connect retries Sagi Grimberg
2024-03-07 10:37   ` Hannes Reinecke
2024-03-07 11:30     ` Sagi Grimberg
2024-03-07 11:45       ` Hannes Reinecke
2024-03-07 12:14         ` Sagi Grimberg
2024-03-07 12:52           ` Hannes Reinecke
2024-03-08 10:21             ` Sagi Grimberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).