nvmet_sq_destroy stuck forever when data digest is turned on

All of lore.kernel.org
 help / color / mirror / Atom feed

* nvmet_sq_destroy stuck forever when data digest is turned on
@ 2023-09-19 11:52 Grupi, Elad
  2023-09-19 12:22 ` Sagi Grimberg
  2023-09-19 16:05 ` Chaitanya Kulkarni
  0 siblings, 2 replies; 5+ messages in thread
From: Grupi, Elad @ 2023-09-19 11:52 UTC (permalink / raw)
  To: linux-nvme; +Cc: Engel, Amit, Zinger, Eldad

Hi

I have an issue with nvmet_tcp_release_queue_work hitting hung task after 2 minutes of waiting for nvmet_sq_destroy.
This issue reproduces only when data digest is on.

I am inspecting the code of nvmet_tcp_release_queue_work and I see that the code handles 'data in' commands
This means that it calls nvmet_req_uninit for any command that its data is still in transit.

There might be commands that the data transfer is already done, but data digest was not received from socket yet (aka rcv_state is NVMET_TCP_RECV_DDGST)
The data digest will never be read from the socket because the socket is blocked by NVMET_TCP_RECV_ERR
Hence nvmet_sq_destroy will be stuck forever waiting for nvmet_tcp_try_recv_ddgst to execute.

Can you suggest a fix for such an issue?

Thanks,
Elad

Internal Use - Confidential

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: nvmet_sq_destroy stuck forever when data digest is turned on
  2023-09-19 11:52 nvmet_sq_destroy stuck forever when data digest is turned on Grupi, Elad
@ 2023-09-19 12:22 ` Sagi Grimberg
  2023-09-21 14:35   ` Grupi, Elad
  2023-10-12 14:03   ` Grupi, Elad
  2023-09-19 16:05 ` Chaitanya Kulkarni
  1 sibling, 2 replies; 5+ messages in thread
From: Sagi Grimberg @ 2023-09-19 12:22 UTC (permalink / raw)
  To: Grupi, Elad, linux-nvme; +Cc: Engel, Amit, Zinger, Eldad


> Hi
> 
> I have an issue with nvmet_tcp_release_queue_work hitting hung task after 2 minutes of waiting for nvmet_sq_destroy.
> This issue reproduces only when data digest is on.
> 
> I am inspecting the code of nvmet_tcp_release_queue_work and I see that the code handles 'data in' commands
> This means that it calls nvmet_req_uninit for any command that its data is still in transit.
> 
> There might be commands that the data transfer is already done, but data digest was not received from socket yet (aka rcv_state is NVMET_TCP_RECV_DDGST)
> The data digest will never be read from the socket because the socket is blocked by NVMET_TCP_RECV_ERR
> Hence nvmet_sq_destroy will be stuck forever waiting for nvmet_tcp_try_recv_ddgst to execute.
> 
> Can you suggest a fix for such an issue?

Does this (untested) patch make the issue go away?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 97d07488072d..f5eaf3457ada 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -204,10 +204,16 @@ static inline u16 nvmet_tcp_cmd_tag(struct 
nvmet_tcp_queue *queue,
         return cmd - queue->cmds;
  }

+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
+{
+       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
  static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
  {
-       return nvme_is_write(cmd->req.cmd) &&
-               cmd->rbytes_done < cmd->req.transfer_len;
+       u32 total_len = cmd->req.transfer_len + 
nvmet_tcp_ddgst_len(cmd->queue);
+
+       return nvme_is_write(cmd->req.cmd) && cmd->rbytes_done < total_len;
  }

  static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
@@ -265,11 +271,6 @@ static inline u8 nvmet_tcp_hdgst_len(struct 
nvmet_tcp_queue *queue)
         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
  }

-static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
-{
-       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
-}
-
  static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
                 void *pdu, size_t len)
  {
@@ -1221,8 +1222,8 @@ static int nvmet_tcp_try_recv_ddgst(struct 
nvmet_tcp_queue *queue)
                 goto out;
         }

-       if (cmd->rbytes_done == cmd->req.transfer_len)
-               nvmet_tcp_execute_request(cmd);
+       cmd->rbytes_done += NVME_TCP_DIGEST_LENGTH;
+       nvmet_tcp_execute_request(cmd);

         ret = 0;
  out:
--


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: nvmet_sq_destroy stuck forever when data digest is turned on
  2023-09-19 11:52 nvmet_sq_destroy stuck forever when data digest is turned on Grupi, Elad
  2023-09-19 12:22 ` Sagi Grimberg
@ 2023-09-19 16:05 ` Chaitanya Kulkarni
  1 sibling, 0 replies; 5+ messages in thread
From: Chaitanya Kulkarni @ 2023-09-19 16:05 UTC (permalink / raw)
  To: Grupi, Elad, linux-nvme; +Cc: Engel, Amit, Zinger, Eldad


> Thanks,
> Elad
> 
> 
> Internal Use - Confidential

Please remove above banner when posting on the list.

-ck

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: nvmet_sq_destroy stuck forever when data digest is turned on
  2023-09-19 12:22 ` Sagi Grimberg
@ 2023-09-21 14:35   ` Grupi, Elad
  2023-10-12 14:03   ` Grupi, Elad
  1 sibling, 0 replies; 5+ messages in thread
From: Grupi, Elad @ 2023-09-21 14:35 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme; +Cc: Engel, Amit, Zinger, Eldad

Thanks for the quick reply Sagi

I will be able to test the patch next week and I will update on my findings.

Meanwhile, it seems to me that we need to update the cmd->rbytes_done += NVME_TCP_DIGEST_LENGTH also in case of failure of data digest handling.

Elad

-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Tuesday, 19 September 2023 15:23
To: Grupi, Elad <Elad.Grupi@dell.com>; linux-nvme@lists.infradead.org
Cc: Engel, Amit <Amit.Engel@Dell.com>; Zinger, Eldad <Eldad.Zinger@dell.com>
Subject: Re: nvmet_sq_destroy stuck forever when data digest is turned on


[EXTERNAL EMAIL] 


> Hi
> 
> I have an issue with nvmet_tcp_release_queue_work hitting hung task after 2 minutes of waiting for nvmet_sq_destroy.
> This issue reproduces only when data digest is on.
> 
> I am inspecting the code of nvmet_tcp_release_queue_work and I see 
> that the code handles 'data in' commands This means that it calls nvmet_req_uninit for any command that its data is still in transit.
> 
> There might be commands that the data transfer is already done, but 
> data digest was not received from socket yet (aka rcv_state is 
> NVMET_TCP_RECV_DDGST) The data digest will never be read from the socket because the socket is blocked by NVMET_TCP_RECV_ERR Hence nvmet_sq_destroy will be stuck forever waiting for nvmet_tcp_try_recv_ddgst to execute.
> 
> Can you suggest a fix for such an issue?

Does this (untested) patch make the issue go away?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 97d07488072d..f5eaf3457ada 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -204,10 +204,16 @@ static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
         return cmd - queue->cmds;
  }

+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) {
+       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; }
+
  static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
  {
-       return nvme_is_write(cmd->req.cmd) &&
-               cmd->rbytes_done < cmd->req.transfer_len;
+       u32 total_len = cmd->req.transfer_len +
nvmet_tcp_ddgst_len(cmd->queue);
+
+       return nvme_is_write(cmd->req.cmd) && cmd->rbytes_done < 
+ total_len;
  }

  static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) @@ -265,11 +271,6 @@ static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
  }

-static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) -{
-       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
-}
-
  static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
                 void *pdu, size_t len)
  {
@@ -1221,8 +1222,8 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
                 goto out;
         }

-       if (cmd->rbytes_done == cmd->req.transfer_len)
-               nvmet_tcp_execute_request(cmd);
+       cmd->rbytes_done += NVME_TCP_DIGEST_LENGTH;
+       nvmet_tcp_execute_request(cmd);

         ret = 0;
  out:
--

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: nvmet_sq_destroy stuck forever when data digest is turned on
  2023-09-19 12:22 ` Sagi Grimberg
  2023-09-21 14:35   ` Grupi, Elad
@ 2023-10-12 14:03   ` Grupi, Elad
  1 sibling, 0 replies; 5+ messages in thread
From: Grupi, Elad @ 2023-10-12 14:03 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme; +Cc: Engel, Amit, Zinger, Eldad

Hi Sagi

The patch causes double completion when 'cmd->exp_ddgst != cmd->recv_ddgst'
Other than that it works fine and solves the issue

With this patch, the meaning of cmd->rbytes_done becomes unclear to me,
It is now more complex than counting the bytes we read from the socket.
If the IO is split between multiple PDUs, the rbytes_done represents the bytes read in each PDU not including the data_digest except the last PDU.


Internal Use - Confidential
-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me>
Sent: Tuesday, 19 September 2023 15:23
To: Grupi, Elad <Elad.Grupi@dell.com>; linux-nvme@lists.infradead.org
Cc: Engel, Amit <Amit.Engel@Dell.com>; Zinger, Eldad <Eldad.Zinger@dell.com>
Subject: Re: nvmet_sq_destroy stuck forever when data digest is turned on


[EXTERNAL EMAIL]


> Hi
>
> I have an issue with nvmet_tcp_release_queue_work hitting hung task after 2 minutes of waiting for nvmet_sq_destroy.
> This issue reproduces only when data digest is on.
>
> I am inspecting the code of nvmet_tcp_release_queue_work and I see
> that the code handles 'data in' commands This means that it calls nvmet_req_uninit for any command that its data is still in transit.
>
> There might be commands that the data transfer is already done, but
> data digest was not received from socket yet (aka rcv_state is
> NVMET_TCP_RECV_DDGST) The data digest will never be read from the socket because the socket is blocked by NVMET_TCP_RECV_ERR Hence nvmet_sq_destroy will be stuck forever waiting for nvmet_tcp_try_recv_ddgst to execute.
>
> Can you suggest a fix for such an issue?

Does this (untested) patch make the issue go away?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 97d07488072d..f5eaf3457ada 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -204,10 +204,16 @@ static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
         return cmd - queue->cmds;
  }

+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) {
+       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; }
+
  static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
  {
-       return nvme_is_write(cmd->req.cmd) &&
-               cmd->rbytes_done < cmd->req.transfer_len;
+       u32 total_len = cmd->req.transfer_len +
nvmet_tcp_ddgst_len(cmd->queue);
+
+       return nvme_is_write(cmd->req.cmd) && cmd->rbytes_done <
+ total_len;
  }

  static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) @@ -265,11 +271,6 @@ static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
  }

-static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) -{
-       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
-}
-
  static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
                 void *pdu, size_t len)
  {
@@ -1221,8 +1222,8 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
                 goto out;
         }

-       if (cmd->rbytes_done == cmd->req.transfer_len)
-               nvmet_tcp_execute_request(cmd);
+       cmd->rbytes_done += NVME_TCP_DIGEST_LENGTH;
+       nvmet_tcp_execute_request(cmd);

         ret = 0;
  out:
--


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-10-12 14:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-19 11:52 nvmet_sq_destroy stuck forever when data digest is turned on Grupi, Elad
2023-09-19 12:22 ` Sagi Grimberg
2023-09-21 14:35   ` Grupi, Elad
2023-10-12 14:03   ` Grupi, Elad
2023-09-19 16:05 ` Chaitanya Kulkarni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.