* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
@ 2019-02-05 17:39 James Smart
  2019-02-06 13:44 ` Ewan D. Milne
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: James Smart @ 2019-02-05 17:39 UTC (permalink / raw)


The FC-NVME spec, when finally approved, modified the disconnect LS
such that the only scope available is the association.

Rework the Disconnect LS processing to be in accordance with the
change.

Signed-off-by: Nigel Kirkland <nigel.kirkland@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
---
 drivers/nvme/target/fc.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 9d065850cda4..2f67b315674b 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1514,10 +1514,8 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
 			(struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf;
 	struct fcnvme_ls_disconnect_acc *acc =
 			(struct fcnvme_ls_disconnect_acc *)iod->rspbuf;
-	struct nvmet_fc_tgt_queue *queue = NULL;
 	struct nvmet_fc_tgt_assoc *assoc;
 	int ret = 0;
-	bool del_assoc = false;
 
 	memset(acc, 0, sizeof(*acc));
 
@@ -1548,18 +1546,7 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
 		assoc = nvmet_fc_find_target_assoc(tgtport,
 				be64_to_cpu(rqst->associd.association_id));
 		iod->assoc = assoc;
-		if (assoc) {
-			if (rqst->discon_cmd.scope ==
-					FCNVME_DISCONN_CONNECTION) {
-				queue = nvmet_fc_find_target_queue(tgtport,
-						be64_to_cpu(
-							rqst->discon_cmd.id));
-				if (!queue) {
-					nvmet_fc_tgt_a_put(assoc);
-					ret = VERR_NO_CONN;
-				}
-			}
-		} else
+		if (!assoc)
 			ret = VERR_NO_ASSOC;
 	}
 
@@ -1587,26 +1574,10 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
 				sizeof(struct fcnvme_ls_disconnect_acc)),
 			FCNVME_LS_DISCONNECT);
 
-
-	/* are we to delete a Connection ID (queue) */
-	if (queue) {
-		int qid = queue->qid;
-
-		nvmet_fc_delete_target_queue(queue);
-
-		/* release the get taken by find_target_queue */
-		nvmet_fc_tgt_q_put(queue);
-
-		/* tear association down if io queue terminated */
-		if (!qid)
-			del_assoc = true;
-	}
-
 	/* release get taken in nvmet_fc_find_target_assoc */
 	nvmet_fc_tgt_a_put(iod->assoc);
 
-	if (del_assoc)
-		nvmet_fc_delete_target_assoc(iod->assoc);
+	nvmet_fc_delete_target_assoc(iod->assoc);
 }
 
 
-- 
2.13.7


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-05 17:39 [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec James Smart
@ 2019-02-06 13:44 ` Ewan D. Milne
       [not found] ` <20190220221454.GA31450@osmithde-lnx.cisco.com>
  2019-03-12 19:31 ` Christoph Hellwig
  2 siblings, 0 replies; 10+ messages in thread
From: Ewan D. Milne @ 2019-02-06 13:44 UTC (permalink / raw)


On Tue, 2019-02-05 at 09:39 -0800, James Smart wrote:
> The FC-NVME spec, when finally approved, modified the disconnect LS
> such that the only scope available is the association.
> 
> Rework the Disconnect LS processing to be in accordance with the
> change.
> 
> [snip]

Reviewed-by: Ewan D. Milne <emilne@redhat.com>


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
       [not found] ` <20190220221454.GA31450@osmithde-lnx.cisco.com>
@ 2019-02-21 17:35   ` Oliver Smith-Denny
  2019-02-21 18:29   ` James Smart
  1 sibling, 0 replies; 10+ messages in thread
From: Oliver Smith-Denny @ 2019-02-21 17:35 UTC (permalink / raw)


On Tue, Feb 05, 2019 at 09:39:02AM -0800, James Smart wrote:
 > The FC-NVME spec, when finally approved, modified the disconnect LS
 > such that the only scope available is the association.

(Sorry for the resend; my CCs got messed up on the first attempt and
somehow I lost the mailing list. I'm having issues with my mail
client and am trying another.)

Should something along the lines below also be added with this patch?
(Caveat, untested, I just checked that no one else is using these).
From my version of the FC-NVMe spec (maybe there is a newer one,
I'm not sure), it looks like not only is the scope field no longer
present in the Disconnect descriptor, but the association id
is also no longer stored there; the descriptor now carries
5 words of reserved fields.

How does this look? Please let me know if I am missing something!
Thanks,
Oliver

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 89accc7..8f5285f 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1458,8 +1458,6 @@ enum {
          discon_rqst->discon_cmd.desc_len =
                          fcnvme_lsdesc_len(
                                  sizeof(struct fcnvme_lsdesc_disconn_cmd));
-       discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION;
-       discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id);

          lsreq->rqstaddr = discon_rqst;
          lsreq->rqstlen = sizeof(*discon_rqst);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index f98f5c5..7a7b3bc 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1292,13 +1292,12 @@ enum {
          VERR_DISCONN_RQST_LEN   = 19,
          VERR_DISCONN_CMD        = 20,
          VERR_DISCONN_CMD_LEN    = 21,
-       VERR_DISCONN_SCOPE      = 22,
-       VERR_RS_LEN             = 23,
-       VERR_RS_RQST_LEN        = 24,
-       VERR_RS_CMD             = 25,
-       VERR_RS_CMD_LEN         = 26,
-       VERR_RS_RCTL            = 27,
-       VERR_RS_RO              = 28,
+       VERR_RS_LEN             = 22,
+       VERR_RS_RQST_LEN        = 23,
+       VERR_RS_CMD             = 24,
+       VERR_RS_CMD_LEN         = 25,
+       VERR_RS_RCTL            = 26,
+       VERR_RS_RO              = 27,
   };

   static char *validation_errors[] = {
@@ -1541,9 +1540,6 @@ enum {
                          fcnvme_lsdesc_len(
                                  sizeof(struct fcnvme_lsdesc_disconn_cmd)))
                  ret = VERR_DISCONN_CMD_LEN;
-       else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) &&
-       else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) &&
-                       (rqst->discon_cmd.scope != FCNVME_DISCONN_CONNECTION))
-               ret = VERR_DISCONN_SCOPE;
          else {
                  /* match an active association */
                  assoc = nvmet_fc_find_target_assoc(tgtport,
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 36cca93..580e7b57 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -221,21 +221,11 @@ struct fcnvme_lsdesc_cr_conn_cmd {
          __be32  rsvd52;
   };

-/* Disconnect Scope Values */
-enum {
-       FCNVME_DISCONN_ASSOCIATION      = 0,
-       FCNVME_DISCONN_CONNECTION       = 1,
-};
-
   /* FCNVME_LSDESC_DISCONN_CMD */
   struct fcnvme_lsdesc_disconn_cmd {
          __be32  desc_tag;               /* FCNVME_LSDESC_xxx */
          __be32  desc_len;
-       u8      rsvd8[3];
-       /* note: scope is really a 1 bit field */
-       u8      scope;                  /* FCNVME_DISCONN_xxx */
-       __be32  rsvd12;
-       __be64  id;
+       u8      rsvd8[16];
   };
   /* FCNVME_LSDESC_CONN_ID */


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
       [not found] ` <20190220221454.GA31450@osmithde-lnx.cisco.com>
  2019-02-21 17:35   ` Oliver Smith-Denny
@ 2019-02-21 18:29   ` James Smart
  2019-02-21 18:45     ` Oliver Smith-Denny
  1 sibling, 1 reply; 10+ messages in thread
From: James Smart @ 2019-02-21 18:29 UTC (permalink / raw)


On 2/20/2019 2:14 PM, Oliver Smith-Denny wrote:
> On Tue, Feb 05, 2019 at 09:39:02AM -0800, James Smart wrote:
>> The FC-NVME spec, when finally approved, modified the disconnect LS
>> such that the only scope available is the association.
>   
> (Sorry, resent, my CC's got messed up on first attempt and
> somehow I lost the mailing list...)
> 
> Should something along the lines below also be added with this patch?
> (Caveat, untested, I just checked that no one else is using these).
>  From my version of the FC-NVMe spec (maybe there is a newer one,
> I'm not sure), it looks like not only is the scope field not
> present in the Disconnect descriptor, but the association id
> is also no longer stored in the Disconnect descriptor and there
> are 5 words of reserved fields.

The Association ID is definitely still present in the Association 
Identifier descriptor. The Association ID was only in the Disconnect 
descriptor if the scope was "0". However, we got into specmanship 
circles about why the id is there twice, how to say it must be the same 
value, and what the behaviors are when the NVMe-oF spec didn't progress 
to allow single-connection disconnect - so we simply nulled the content 
of the disconnect descriptor and ensured the only intent was an 
association termination.  If single-connection disconnect comes back, we 
will likely add a new LS for it rather than dual-roling the existing one.
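
For reference, the layout in include/linux/nvme-fc.h (paraphrased from
the header; check the current header for the authoritative definitions)
keeps the association ID in its own descriptor, so nulling the
Disconnect descriptor content loses nothing:

	struct fcnvme_lsdesc_assoc_id {
		__be32	desc_tag;	/* FCNVME_LSDESC_ASSOC_ID */
		__be32	desc_len;
		__be64	association_id;	/* ID still carried here */
	};

	struct fcnvme_ls_disconnect_rqst {
		struct fcnvme_ls_rqst_w0	w0;
		__be32				desc_list_len;
		struct fcnvme_lsdesc_assoc_id	associd;
		struct fcnvme_lsdesc_disconn_cmd discon_cmd; /* content nulled */
	};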


> 
> How does this look? Please let me know if I am missing something!
> 
> Thanks,
> Oliver
>   
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 89accc7..8f5285f 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -1458,8 +1458,6 @@ enum {
>          discon_rqst->discon_cmd.desc_len =
>                          fcnvme_lsdesc_len(
>                                  sizeof(struct fcnvme_lsdesc_disconn_cmd));
> -       discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION;
> -       discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id);
>   
>          lsreq->rqstaddr = discon_rqst;
>          lsreq->rqstlen = sizeof(*discon_rqst);
> diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
> index f98f5c5..7a7b3bc 100644
> --- a/drivers/nvme/target/fc.c
> +++ b/drivers/nvme/target/fc.c
> @@ -1292,13 +1292,12 @@ enum {
>          VERR_DISCONN_RQST_LEN   = 19,
>          VERR_DISCONN_CMD        = 20,
>          VERR_DISCONN_CMD_LEN    = 21,
> -       VERR_DISCONN_SCOPE      = 22,
> -       VERR_RS_LEN             = 23,
> -       VERR_RS_RQST_LEN        = 24,
> -       VERR_RS_CMD             = 25,
> -       VERR_RS_CMD_LEN         = 26,
> -       VERR_RS_RCTL            = 27,
> -       VERR_RS_RO              = 28,
> +       VERR_RS_LEN             = 22,
> +       VERR_RS_RQST_LEN        = 23,
> +       VERR_RS_CMD             = 24,
> +       VERR_RS_CMD_LEN         = 25,
> +       VERR_RS_RCTL            = 26,
> +       VERR_RS_RO              = 27,
>   };
>   
>   static char *validation_errors[] = {


Assuming we want to get rid of VERR_DISCONN_SCOPE - this would be almost 
ok. What you missed was updating the validation_errors[] array to also 
remove the string for VERR_DISCONN_SCOPE. The enums are indexes into 
the array.
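
To illustrate (a made-up fragment, not the literal contents of fc.c,
which uses a positional string array): the string table is indexed by
the VERR_* values, so dropping VERR_DISCONN_SCOPE and renumbering the
later values without also dropping its string would leave the renumbered
VERR_RS_LEN pointing at the scope message.

	enum {
		VERR_DISCONN_CMD_LEN	= 21,
		VERR_DISCONN_SCOPE	= 22,
		VERR_RS_LEN		= 23,
	};

	static char *validation_errors[24] = {
		[VERR_DISCONN_CMD_LEN]	= "Bad Disconnect Cmd Length",
		[VERR_DISCONN_SCOPE]	= "Bad Disconnect Scope",
		[VERR_RS_LEN]		= "Bad RS Length",
	};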

But, after reviewing below - I don't want to move this error case.


> @@ -1541,9 +1540,6 @@ enum {
>                          fcnvme_lsdesc_len(
>                                  sizeof(struct fcnvme_lsdesc_disconn_cmd)))
>                  ret = VERR_DISCONN_CMD_LEN;
> -       else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) &&
> -                       (rqst->discon_cmd.scope != FCNVME_DISCONN_CONNECTION))
> -               ret = VERR_DISCONN_SCOPE;

Actually, I don't want to get rid of this check. It's basically a 
validation that the content has been nulled as well.  It ensures we're 
not talking to something old that was expecting a connection-scope 
disconnect.

>          else {
>                  /* match an active association */
>                  assoc = nvmet_fc_find_target_assoc(tgtport,
> diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
> index 36cca93..580e7b57 100644
> --- a/include/linux/nvme-fc.h
> +++ b/include/linux/nvme-fc.h
> @@ -221,21 +221,11 @@ struct fcnvme_lsdesc_cr_conn_cmd {
>          __be32  rsvd52;
>   };
>   
> -/* Disconnect Scope Values */
> -enum {
> -       FCNVME_DISCONN_ASSOCIATION      = 0,
> -       FCNVME_DISCONN_CONNECTION       = 1,
> -};
> -
>   /* FCNVME_LSDESC_DISCONN_CMD */
>   struct fcnvme_lsdesc_disconn_cmd {
>          __be32  desc_tag;               /* FCNVME_LSDESC_xxx */
>          __be32  desc_len;
> -       u8      rsvd8[3];
> -       /* note: scope is really a 1 bit field */
> -       u8      scope;                  /* FCNVME_DISCONN_xxx */
> -       __be32  rsvd12;
> -       __be64  id;
> +       u8      rsvd8[16];
>   };
>   
>   /* FCNVME_LSDESC_CONN_ID */
> 

The header changes are ok. But if you make these, you need to update the 
check above so that it looks at the area where scope was and ensures 
it's 0.
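
A minimal sketch of what that check could become with the rsvd8[16]
layout above (untested, just to show the idea; memchr_inv() returns
NULL only when the whole region is the given byte):

	/* scope is gone from the descriptor; require the now-reserved
	 * bytes (which include the old scope byte) to be zero so an
	 * old-style connection-scope disconnect is still rejected
	 */
	else if (memchr_inv(rqst->discon_cmd.rsvd8, 0,
			    sizeof(rqst->discon_cmd.rsvd8)))
		ret = VERR_DISCONN_SCOPE;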

I plan to make another pass through the transport for spec compliance in 
a couple of weeks. Part of that is ensuring all the headers and 
disconnect behaviors are in sync. We should also get some of the SLER 
bits in. I can roll your changes in with that, or you're free to make the 
suggested changes called out above.

-- james


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-21 18:29   ` James Smart
@ 2019-02-21 18:45     ` Oliver Smith-Denny
  2019-02-21 23:16       ` Oliver Smith-Denny
  0 siblings, 1 reply; 10+ messages in thread
From: Oliver Smith-Denny @ 2019-02-21 18:45 UTC (permalink / raw)


On 02/21/2019 10:29 AM, James Smart wrote: [snip]
> I plan to make another pass through the transport for spec compliance in 
> a couple of weeks. Part of that was ensuring all the headers and 
> disconnect behaviors were in sync. We should also get some of the SLER 
> bits in. I can roll your changes in with that or you're free to make the 
> suggested changes called out above.

Sounds good to roll these header changes in with the rest of the spec
compliance work you are going to do. I agree with your other comments;
they make sense.

I have been testing with these changes and have been getting one
warning (kernel/workqueue.c:3028) when the discovery controller gets
NVMe_Disconnect. I also have been trying some error injection (not
sending the occasional response from the target LLDD for write data)
and getting blocked tasks for > 120 seconds, with the following call
trace (this is after getting NVMe_Disconnect for the data controller):

INFO: task kworker/27:2:35310 blocked for more than 120 seconds.
Tainted: G        W  O      5.0.0-rc7-next-20190220+ #1
  kworker/27:2    D    0 35310      2 0x80000080
Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
Call Trace:
__schedule+0x2ab/0x880
? complete+0x4d/0x60
schedule+0x36/0x70
schedule_timeout+0x1dc/0x300
complete+0x4d/0x60
nvmet_destroy_namespace+0x20/0x20 [nvmet]
wait_for_completion+0x121/0x180
wake_up_q+0x80/0x80
nvmet_sq_destroy+0x4f/0xf0 [nvmet]
nvmet_fc_delete_target_assoc+0x2fd/0x3f0 [nvmet_fc]
nvmet_fc_handle_ls_rqst_work+0x6ad/0xa40 [nvmet_fc]
process_one_work+0x179/0x3a0
worker_thread+0x4f/0x3e0
kthread+0x105/0x140
? max_active_store+0x80/0x80
? kthread_bind+0x20/0x20
ret_from_fork+0x35/0x40

So I will investigate this and first make sure that it is not
caused by anything I am doing incorrectly in the LLDD. If that is
not the case, I will update you with fuller results. I only
see this after incorporating your changes and mine (though
I will start by taking out mine, since not all of them
should be there).

Thanks,
Oliver


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-21 18:45     ` Oliver Smith-Denny
@ 2019-02-21 23:16       ` Oliver Smith-Denny
  2019-02-26 21:53         ` James Smart
  0 siblings, 1 reply; 10+ messages in thread
From: Oliver Smith-Denny @ 2019-02-21 23:16 UTC (permalink / raw)


On 02/21/2019 10:45 AM, Oliver Smith-Denny wrote:
> I have been testing with these changes and have been getting one
> warning (kernel/workqueue.c:3028) when the discovery controller gets
> NVMe_Disconnect. I also have been trying some error injection (not
> sending the occasional response from the target LLDD for write data)
> and getting blocked tasks for > 120 seconds, with the following call
> trace (this is after getting NVMe_Disconnect for the data controller):
> 
> INFO: task kworker/27:2:35310 blocked for more than 120 seconds.
> Tainted: G        W  O      5.0.0-rc7-next-20190220+ #1
>  kworker/27:2    D    0 35310      2 0x80000080
> Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
> Call Trace:
> __schedule+0x2ab/0x880
> ? complete+0x4d/0x60
> schedule+0x36/0x70
> schedule_timeout+0x1dc/0x300
> complete+0x4d/0x60
> nvmet_destroy_namespace+0x20/0x20 [nvmet]
> wait_for_completion+0x121/0x180
> wake_up_q+0x80/0x80
> nvmet_sq_destroy+0x4f/0xf0 [nvmet]
> nvmet_fc_delete_target_assoc+0x2fd/0x3f0 [nvmet_fc]
> nvmet_fc_handle_ls_rqst_work+0x6ad/0xa40 [nvmet_fc]
> process_one_work+0x179/0x3a0
> worker_thread+0x4f/0x3e0
> kthread+0x105/0x140
> ? max_active_store+0x80/0x80
> ? kthread_bind+0x20/0x20
> ret_from_fork+0x35/0x40

I tried running the vanilla 5.0-rc7 kernel and did not see
either the warning or the blocked task (which makes sense, because 
the vanilla kernel doesn't delete controllers with the current logic on 
NVMe_Disconnect).

I then re-added your patch to the kernel and I see both the warning and 
the blocked task. I also added some printks. From what I can see for the
above blocked task, after NVMe_Disconnect comes into the target
(expected: the initiator has sent an ABTS for a write that never
completed, then it sends NVMe_Disconnect), nvmet gets stuck
deleting I/O queues. In my setup, there are 80 I/O queues.
On queue 14 (after deleting queues 80 through 15), nvmet hangs in
nvmet_sq_destroy() doing wait_for_completion(&sq->free_done);

20 seconds later the nvmet keep-alive timer expires (I believe 
because the initiator has sent the NVMe_Disconnect and so stops
sending keep-alives to that controller). At this point, the process
of deleting the association starts happening a second time. It picks up
at queue 14, retrying it. This time, nvmet_sq_destroy() finishes, 
but the queue is not freed (as two references remain). The remaining
I/O queues and the admin queue are then destroyed. After this the above
call trace is dumped.
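
(For context: the wait described above is in nvmet_sq_destroy() in
drivers/nvme/target/core.c, abbreviated and paraphrased below rather
than quoted verbatim. free_done only completes once the queue's percpu
reference drops to zero, i.e. once every outstanding request on the
queue has completed, so a write whose response is never sent keeps the
completion from ever firing.)

	void nvmet_sq_destroy(struct nvmet_sq *sq)
	{
		/* ... admin-queue AER cleanup elided ... */
		percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
		wait_for_completion(&sq->confirm_done);
		wait_for_completion(&sq->free_done);	/* <-- stuck here */
		percpu_ref_exit(&sq->ref);
		/* ... controller put elided ... */
	}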

I have less information about the warning. Here is the call trace that was printed:
WARNING: CPU: 8 PID: 53 at kernel/workqueue.c:3028 
__flush_work.isra.31+0x1a2/0x1b0
RIP: 0010:__flush_work.isra.31+0x1a2/0x1b0
Code: fb 66 0f 1f 44 00 00 31 c0 eb aa 4c 89 e7 c6 07 00 0f 1f 40 00 fb 
66 0f 1f 44 00 00 31 c0 eb 95 e8 63 01 fe ff 0f 0b 90 eb 8b <0f> 0b 31 
c0 eb 85 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89
RSP: 0018:ffffc90006747be8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff888c03830148 RCX: 0000000000000000
RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff888c03830148
RBP: ffffc90006747c58 R08: 0000000000000006 R09: 0000000000000006
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: ffffc90006747c88 R15: ffff888c0c105ac0
FS:  0000000000000000(0000) GS:ffff888c10a00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fb4fe290000 CR3: 000000000220e003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
? del_timer+0x59/0x80
__cancel_work_timer+0x10e/0x190
? pcpu_free_area+0x1da/0x2b0
cancel_work_sync+0x10/0x20
nvmet_ctrl_free+0x112/0x1b0 [nvmet]
nvmet_sq_destroy+0x8a/0xf0 [nvmet]
nvmet_fc_delete_target_assoc+0x2fd/0x3f0 [nvmet_fc]
nvmet_fc_handle_ls_rqst_work+0x6ba/0xa40 [nvmet_fc]
process_one_work+0x179/0x3a0
worker_thread+0x4f/0x3e0
kthread+0x105/0x140
? max_active_store+0x80/0x80
? kthread_bind+0x20/0x20
ret_from_fork+0x35/0x40
---[ end trace b209b06283b18e21 ]---

I am still investigating the possibility that the LLDD is
doing something poorly (it is in bringup, so I wouldn't
be surprised in the slightest :), but from what I can tell
something is unhappy in the delete_assoc path. Thoughts from
your end? I am also happy to wait until you have completed
your spec review and test with the full changeset. Let me
know what I can do to help debug. I will continue looking
on my end.

Thanks,
Oliver


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-21 23:16       ` Oliver Smith-Denny
@ 2019-02-26 21:53         ` James Smart
  2019-02-27 20:25           ` Oliver Smith-Denny
  0 siblings, 1 reply; 10+ messages in thread
From: James Smart @ 2019-02-26 21:53 UTC (permalink / raw)


On 2/21/2019 3:16 PM, Oliver Smith-Denny wrote:
> On 02/21/2019 10:45 AM, Oliver Smith-Denny wrote:
>> I have been testing with these changes and have been getting one
>> warning (kernel/workqueue.c:3028) when the discovery controller gets
>> NVMe_Disconnect. I also have been trying some error injection (not
>> sending the occasional response from the target LLDD for write data)
>> and getting blocked tasks for > 120 seconds, with the following call
>> trace (this is after getting NVMe_Disconnect for the data controller):
>>
>> INFO: task kworker/27:2:35310 blocked for more than 120 seconds.
>> Tainted: G        W  O      5.0.0-rc7-next-20190220+ #1
>> kworker/27:2    D    0 35310      2 0x80000080
>> Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
>> Call Trace:
>> __schedule+0x2ab/0x880
>> ? complete+0x4d/0x60
>> schedule+0x36/0x70
>> schedule_timeout+0x1dc/0x300
>> complete+0x4d/0x60
>> nvmet_destroy_namespace+0x20/0x20 [nvmet]
>> wait_for_completion+0x121/0x180
>> wake_up_q+0x80/0x80
>> nvmet_sq_destroy+0x4f/0xf0 [nvmet]
>> nvmet_fc_delete_target_assoc+0x2fd/0x3f0 [nvmet_fc]
>> nvmet_fc_handle_ls_rqst_work+0x6ad/0xa40 [nvmet_fc]
>> process_one_work+0x179/0x3a0
>> worker_thread+0x4f/0x3e0
>> kthread+0x105/0x140
>> ? max_active_store+0x80/0x80
>> ? kthread_bind+0x20/0x20
>> ret_from_fork+0x35/0x40
>
> I tried running the vanilla 5.0-rc7 kernel and did not see
> either the warning or the blocked task (which makes sense, because 
> the vanilla kernel doesn't delete controllers with the current logic 
> on NVMe_Disconnect).
>
> I then readded your patch to the kernel and I see both the warning and 
> blocked task.

Oliver,

I took a look at the two patches, and one of them had missed a ! check on 
scheduling the work. That resulted in an extra put being done, so it 
would be released too soon.
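
(The v2 patch itself is not attached in this thread; the pattern being
described is roughly the one below, with names assumed rather than taken
from the patch - the deferred-delete work owns a reference taken by the
caller, so the put may only be issued when the work was not actually
queued.)

	if (!schedule_work(&assoc->del_work))
		/* work was already pending and holds its own reference,
		 * so drop the one taken for this attempt
		 */
		nvmet_fc_tgt_a_put(assoc);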

Try with this v2 patch and let me know.

-- james


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-26 21:53         ` James Smart
@ 2019-02-27 20:25           ` Oliver Smith-Denny
  2019-02-28 22:47             ` Oliver Smith-Denny
  0 siblings, 1 reply; 10+ messages in thread
From: Oliver Smith-Denny @ 2019-02-27 20:25 UTC (permalink / raw)


On 02/26/2019 01:53 PM, James Smart wrote:
> On 2/21/2019 3:16 PM, Oliver Smith-Denny wrote:
>> On 02/21/2019 10:45 AM, Oliver Smith-Denny wrote:
>>>
>>> INFO: task kworker/27:2:35310 blocked for more than 120 seconds.
>>> Tainted: G        W  O      5.0.0-rc7-next-20190220+ #1
>>> kworker/27:2    D    0 35310      2 0x80000080
>>> Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
>>> Call Trace:
>>> __schedule+0x2ab/0x880
>>> ? complete+0x4d/0x60
>>> schedule+0x36/0x70
>>> schedule_timeout+0x1dc/0x300
>>> complete+0x4d/0x60
>>> nvmet_destroy_namespace+0x20/0x20 [nvmet]
>>> wait_for_completion+0x121/0x180
>>> wake_up_q+0x80/0x80
>>> nvmet_sq_destroy+0x4f/0xf0 [nvmet]
>>> nvmet_fc_delete_target_assoc+0x2fd/0x3f0 [nvmet_fc]
>>> nvmet_fc_handle_ls_rqst_work+0x6ad/0xa40 [nvmet_fc]
>>> process_one_work+0x179/0x3a0
>>> worker_thread+0x4f/0x3e0
>>> kthread+0x105/0x140
>>> ? max_active_store+0x80/0x80
>>> ? kthread_bind+0x20/0x20
>>> ret_from_fork+0x35/0x40
> 
> I took a look at the two patches, and one of them had missed a ! check on 
> scheduling the work. That resulted in an extra put being done, so it 
> would be released too soon.
> 
> Try with this v2 patch and let me know.

I ran the same tests with the 5.0.0-rc7 kernel, with the disconnect
patch and the v2 of the targetport assoc_list patch applied.

When I ran normal traffic (no dropping of write responses), I still saw
the warning (see below) happen when the discovery controller got
deleted. I took the host offline to trigger a keep-alive failure
in the controller, which successfully deleted the data controller.

WARNING: CPU: 30 PID: 403 at kernel/workqueue.c:3028 
__flush_work.isra.31+0x1a2/0x1b0
Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
RIP: 0010:__flush_work.isra.31+0x1a2/0x1b0
Code: fb 66 0f 1f 44 00 00 31 c0 eb aa 4c 89 e7 c6 07 00 0f 1f 40 00 fb 
66 0f 1f 44 00 00 31 c0 eb 95 e8 63 01 fe ff 0f 0b 90 eb 8b <0f> 0b 31 
c0 eb 85 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89
RSP: 0018:ffffc90008edbbe8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff888bf150c148 RCX: 0000000000000000
RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff888bf150c148
RBP: ffffc90008edbc58 R08: 0000000000002a15 R09: 0000000000002a15
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: ffffc90008edbc88 R15: ffff888c07b90000
FS:  0000000000000000(0000) GS:ffff888c10c00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fd491f52140 CR3: 000000000220e004 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 000000000000400
PKRU: 55555554
Call Trace:
? del_timer+0x59/0x80
__cancel_work_timer+0x10e/0x190
cancel_work_sync+0x10/0x20
nvmet_ctrl_free+0x112/0x1b0 [nvmet]
nvmet_sq_destroy+0xdb/0x140 [nvmet]
nvmet_fc_delete_target_assoc+0x2f2/0x370 [nvmet_fc]
nvmet_fc_handle_ls_rqst_work+0x6b8/0xa20 [nvmet_fc]
process_one_work+0x179/0x3a0
worker_thread+0x4f/0x3e0
kthread+0x105/0x140
? max_active_store+0x80/0x80
? kthread_bind+0x20/0x20
ret_from_fork+0x35/0x40
---[ end trace 5d3c8b3548a4fb95 ]---

When I ran traffic with dropping the occasional write response, I again
saw the above warning when the discovery controller got NVMe_Disconnect.
After the host sent ABTS and NVMe_Disconnect to the data controller,
I saw the same hung task as before (slightly different call trace,
shown below; the original is quoted above).

It occurred in the same spot: the controller got hung up in
nvmet_sq_destroy(), doing wait_for_completion(&sq->free_done);
I see the call trace below ~10 times in dmesg.

INFO: task kworker/30:1:403 blocked for more than 120 seconds.
kworker/30:1    D    0   403      2 0x80000000
Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
Call Trace:
__schedule+0x2ab/0x880
schedule+0x36/0x70
schedule_timeout+0x1dc/0x300
wait_for_completion+0x121/0x180
? wake_up_q+0x80/0x80
nvmet_sq_destroy+0x84/0x140 [nvmet]
nvmet_fc_delete_target_assoc+0x2f2/0x370 [nvmet_fc]
nvmet_fc_handle_ls_rqst_work+0x6b8/0xa20 [nvmet_fc]
process_one_work+0x179/0x3a0
worker_thread+0x4f/0x3e0
kthread+0x105/0x140
? max_active_store+0x80/0x80
? kthread_bind+0x20/0x20
ret_from_fork+0x35/0x40

Thanks again for your help in looking into this. Let me
know if there are other patches I should apply or other
things to test.

Thanks,
Oliver


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-27 20:25           ` Oliver Smith-Denny
@ 2019-02-28 22:47             ` Oliver Smith-Denny
  0 siblings, 0 replies; 10+ messages in thread
From: Oliver Smith-Denny @ 2019-02-28 22:47 UTC (permalink / raw)


On 02/27/2019 12:25 PM, Oliver Smith-Denny wrote:
> When I ran normal traffic (no dropping of write responses), I still saw
> the warning (see below) happen when the discovery controller got
> deleted. I took the host offline to trigger a keep alive failure
> in the controller, which successfully deleted the data controller.
> 
> WARNING: CPU: 30 PID: 403 at kernel/workqueue.c:3028 
> __flush_work.isra.31+0x1a2/0x1b0
> Workqueue: events nvmet_fc_handle_ls_rqst_work [nvmet_fc]
> RIP: 0010:__flush_work.isra.31+0x1a2/0x1b0
> Code: fb 66 0f 1f 44 00 00 31 c0 eb aa 4c 89 e7 c6 07 00 0f 1f 40 00 fb 
> 66 0f 1f 44 00 00 31 c0 eb 95 e8 63 01 fe ff 0f 0b 90 eb 8b <0f> 0b 31 
> c0 eb 85 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89
> RSP: 0018:ffffc90008edbbe8 EFLAGS: 00010246
> RAX: 0000000000000000 RBX: ffff888bf150c148 RCX: 0000000000000000
> RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff888bf150c148
> RBP: ffffc90008edbc58 R08: 0000000000002a15 R09: 0000000000002a15
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> R13: 0000000000000000 R14: ffffc90008edbc88 R15: ffff888c07b90000
> FS:  0000000000000000(0000) GS:ffff888c10c00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fd491f52140 CR3: 000000000220e004 CR4: 00000000007606e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 000000000000400
> PKRU: 55555554
> Call Trace:
> ? del_timer+0x59/0x80
> __cancel_work_timer+0x10e/0x190
> cancel_work_sync+0x10/0x20
> nvmet_ctrl_free+0x112/0x1b0 [nvmet]
> nvmet_sq_destroy+0xdb/0x140 [nvmet]
> nvmet_fc_delete_target_assoc+0x2f2/0x370 [nvmet_fc]
> nvmet_fc_handle_ls_rqst_work+0x6b8/0xa20 [nvmet_fc]
> process_one_work+0x179/0x3a0
> worker_thread+0x4f/0x3e0
> kthread+0x105/0x140
> ? max_active_store+0x80/0x80
> ? kthread_bind+0x20/0x20
> ret_from_fork+0x35/0x40
> ---[ end trace 5d3c8b3548a4fb95 ]---

The hung task looks like it was an issue in the LLDD: I fixed
that issue (the LLDD was giving the wrong struct nvmefc_tgt_fcp_req to
nvmet_fc_rcv_fcp_abort), and the call trace for the hung
task no longer shows up. However, I do still see the warning
above in workqueue.c when the discovery controller gets
deleted via NVMe_Disconnect.

Thanks,
Oliver


* [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec
  2019-02-05 17:39 [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec James Smart
  2019-02-06 13:44 ` Ewan D. Milne
       [not found] ` <20190220221454.GA31450@osmithde-lnx.cisco.com>
@ 2019-03-12 19:31 ` Christoph Hellwig
  2 siblings, 0 replies; 10+ messages in thread
From: Christoph Hellwig @ 2019-03-12 19:31 UTC (permalink / raw)


Thanks,

applied to nvme-5.1.


end of thread, other threads:[~2019-03-12 19:31 UTC | newest]

Thread overview: 10+ messages
2019-02-05 17:39 [PATCH] nvmet-fc: Bring Disconnect into compliance with FC-NVME spec James Smart
2019-02-06 13:44 ` Ewan D. Milne
     [not found] ` <20190220221454.GA31450@osmithde-lnx.cisco.com>
2019-02-21 17:35   ` Oliver Smith-Denny
2019-02-21 18:29   ` James Smart
2019-02-21 18:45     ` Oliver Smith-Denny
2019-02-21 23:16       ` Oliver Smith-Denny
2019-02-26 21:53         ` James Smart
2019-02-27 20:25           ` Oliver Smith-Denny
2019-02-28 22:47             ` Oliver Smith-Denny
2019-03-12 19:31 ` Christoph Hellwig
