* [PATCH 0/2] Fix a race condition when performing a controller reset
@ 2021-10-21  8:41 Maurizio Lombardi
  2021-10-21  8:41 ` [PATCH 1/2] nvmet: add an helper to free the iovec Maurizio Lombardi
  2021-10-21  8:41 ` [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work Maurizio Lombardi
  0 siblings, 2 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-10-21  8:41 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, sagi, hare, chaitanya.kulkarni, jmeneghi

Memory leaks and kernel panics involving the nvmet driver
have been observed when an initiator executes a reset_controller
operation while doing I/O.

The problem is a race condition between io_work and release_queue:
the latter may end up destroying the commands while io_work is still
running, causing use-after-free bugs and memory leaks.
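
As a rough illustration only (a self-contained userspace analogy, not
the nvmet code; every name in it is made up), the shape of the race
is: a worker keeps requeueing itself and touching per-command buffers
while the teardown path frees them after waiting for just one pass:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NR_CMDS 64

struct cmd { char *iov; };

static struct cmd cmds[NR_CMDS];
static atomic_bool stop;

static void *io_work(void *unused)
{
	/* stands in for io_work requeueing itself while data keeps arriving */
	while (!atomic_load(&stop)) {
		for (int i = 0; i < NR_CMDS; i++)
			if (cmds[i].iov)
				memset(cmds[i].iov, 0, 16);
		usleep(100);
	}
	return unused;
}

int main(void)
{
	pthread_t worker;

	for (int i = 0; i < NR_CMDS; i++)
		cmds[i].iov = malloc(16);
	pthread_create(&worker, NULL, io_work, NULL);
	usleep(1000);

	/*
	 * "release_queue": frees the commands without making sure the
	 * worker cannot run anymore, so the loop above may still write
	 * to memory that has just been freed (use-after-free).
	 */
	for (int i = 0; i < NR_CMDS; i++)
		free(cmds[i].iov);
	usleep(2000);

	/* the synchronization that is missing above: stop, then wait */
	atomic_store(&stop, true);
	pthread_join(worker, NULL);
	return 0;
}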

Maurizio Lombardi (2):
  nvmet: add an helper to free the iovec
  nvmet: fix a race condition between release_queue and io_work

 drivers/nvme/target/tcp.c | 41 ++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

-- 
2.27.0




* [PATCH 1/2] nvmet: add an helper to free the iovec
  2021-10-21  8:41 [PATCH 0/2] Fix a race condition when performing a controller reset Maurizio Lombardi
@ 2021-10-21  8:41 ` Maurizio Lombardi
  2021-10-21 14:56   ` John Meneghini
  2021-10-21  8:41 ` [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work Maurizio Lombardi
  1 sibling, 1 reply; 21+ messages in thread
From: Maurizio Lombardi @ 2021-10-21  8:41 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, sagi, hare, chaitanya.kulkarni, jmeneghi

Makes the code easier to read and to debug.

Sets the freed pointers to NULL; this will be useful when destroying
the queues to tell whether the iovecs have already been released.
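
As an aside, a tiny userspace sketch (illustrative only, made-up
names, not nvmet code) of why clearing the pointers helps: it makes
the cleanup helper idempotent, so a second call during queue teardown
is a harmless no-op:

#include <stdio.h>
#include <stdlib.h>

struct cmd { void *iov; };

static void cmd_free_iov(struct cmd *c)
{
	free(c->iov);		/* free(NULL) is defined to do nothing */
	c->iov = NULL;		/* records that the iovec was already released */
}

int main(void)
{
	struct cmd c = { .iov = malloc(64) };

	cmd_free_iov(&c);	/* normal completion path */
	cmd_free_iov(&c);	/* later queue teardown: harmless no-op */
	printf("iov after cleanup: %p\n", c.iov);
	return 0;
}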

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
---
 drivers/nvme/target/tcp.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index c33a0464346f..2f03a94725ae 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -166,6 +166,8 @@ static struct workqueue_struct *nvmet_tcp_wq;
 static const struct nvmet_fabrics_ops nvmet_tcp_ops;
 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
+static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd);
+static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd);
 
 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
 		struct nvmet_tcp_cmd *cmd)
@@ -297,6 +299,16 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 	return 0;
 }
 
+static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd)
+{
+	WARN_ON(unlikely(cmd->nr_mapped > 0));
+
+	kfree(cmd->iov);
+	sgl_free(cmd->req.sg);
+	cmd->iov = NULL;
+	cmd->req.sg = NULL;
+}
+
 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 {
 	struct scatterlist *sg;
@@ -306,6 +318,8 @@ static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 
 	for (i = 0; i < cmd->nr_mapped; i++)
 		kunmap(sg_page(&sg[i]));
+
+	cmd->nr_mapped = 0;
 }
 
 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
@@ -387,7 +401,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
 
 	return 0;
 err:
-	sgl_free(cmd->req.sg);
+	nvmet_tcp_free_iovec(cmd);
 	return NVME_SC_INTERNAL;
 }
 
@@ -632,10 +646,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 		}
 	}
 
-	if (queue->nvme_sq.sqhd_disabled) {
-		kfree(cmd->iov);
-		sgl_free(cmd->req.sg);
-	}
+	if (queue->nvme_sq.sqhd_disabled)
+		nvmet_tcp_free_iovec(cmd);
 
 	return 1;
 
@@ -664,8 +676,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 	if (left)
 		return -EAGAIN;
 
-	kfree(cmd->iov);
-	sgl_free(cmd->req.sg);
+	nvmet_tcp_free_iovec(cmd);
 	cmd->queue->snd_cmd = NULL;
 	nvmet_tcp_put_cmd(cmd);
 	return 1;
@@ -1406,8 +1417,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
 {
 	nvmet_req_uninit(&cmd->req);
 	nvmet_tcp_unmap_pdu_iovec(cmd);
-	kfree(cmd->iov);
-	sgl_free(cmd->req.sg);
+	nvmet_tcp_free_iovec(cmd);
 }
 
 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
-- 
2.27.0




* [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-10-21  8:41 [PATCH 0/2] Fix a race condition when performing a controller reset Maurizio Lombardi
  2021-10-21  8:41 ` [PATCH 1/2] nvmet: add an helper to free the iovec Maurizio Lombardi
@ 2021-10-21  8:41 ` Maurizio Lombardi
  2021-10-21 14:57   ` John Meneghini
  2021-10-26 15:42   ` Sagi Grimberg
  1 sibling, 2 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-10-21  8:41 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, sagi, hare, chaitanya.kulkarni, jmeneghi

If the initiator executes a reset controller operation while
performing I/O, the target kernel will crash because of a race
condition between release_queue and io_work:
nvmet_tcp_uninit_data_in_cmds() may be executed while io_work is still
running. Calling flush_work(io_work) is not sufficient to prevent this
because io_work can requeue itself.

* Fix this bug by preventing io_work from being enqueued when
sk_user_data is NULL (which means that the queue is about to be
deleted).

* Ensure that all the memory allocated for the commands' iovecs is
freed.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
---
 drivers/nvme/target/tcp.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 2f03a94725ae..1eedbd83c95f 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -551,6 +551,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
 	struct nvmet_tcp_cmd *cmd =
 		container_of(req, struct nvmet_tcp_cmd, req);
 	struct nvmet_tcp_queue	*queue = cmd->queue;
+	struct socket *sock = queue->sock;
 	struct nvme_sgl_desc *sgl;
 	u32 len;
 
@@ -570,7 +571,10 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
 	}
 
 	llist_add(&cmd->lentry, &queue->resp_list);
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	read_lock_bh(&sock->sk->sk_callback_lock);
+	if (likely(sock->sk->sk_user_data))
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	read_unlock_bh(&sock->sk->sk_callback_lock);
 }
 
 static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
@@ -1427,7 +1431,9 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
 
 	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
 		if (nvmet_tcp_need_data_in(cmd))
-			nvmet_tcp_finish_cmd(cmd);
+			nvmet_req_uninit(&cmd->req);
+		nvmet_tcp_unmap_pdu_iovec(cmd);
+		nvmet_tcp_free_iovec(cmd);
 	}
 
 	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
@@ -1447,11 +1453,10 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 
 	nvmet_tcp_restore_socket_callbacks(queue);
-	flush_work(&queue->io_work);
+	cancel_work_sync(&queue->io_work);
 
 	nvmet_tcp_uninit_data_in_cmds(queue);
 	nvmet_sq_destroy(&queue->nvme_sq);
-	cancel_work_sync(&queue->io_work);
 	sock_release(queue->sock);
 	nvmet_tcp_free_cmds(queue);
 	if (queue->hdr_digest || queue->data_digest)
-- 
2.27.0




* Re: [PATCH 1/2] nvmet: add an helper to free the iovec
  2021-10-21  8:41 ` [PATCH 1/2] nvmet: add an helper to free the iovec Maurizio Lombardi
@ 2021-10-21 14:56   ` John Meneghini
  2021-10-21 14:58     ` John Meneghini
  2021-10-27  0:15     ` Chaitanya Kulkarni
  0 siblings, 2 replies; 21+ messages in thread
From: John Meneghini @ 2021-10-21 14:56 UTC (permalink / raw)
  To: Maurizio Lombardi, linux-nvme
  Cc: hch, sagi, hare, chaitanya.kulkarni, John Meneghini

Looks good Maurizio.

You should also mention that you have a simple test you can run which reproduces this bug and results in a kernel panic.

We might want to add that test to the blktests tool.

Reviewed-by: John Meneghini <jmeneghi.redhat.com>


On 10/21/21 4:41 AM, Maurizio Lombardi wrote:
> Makes the code easier to read and to debug.
> 
> Sets the freed pointers to NULL, it will be useful
> when destroying the queues to understand if the
> iovecs have been released already or not.
> 
> Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
> ---
>   drivers/nvme/target/tcp.c | 28 +++++++++++++++++++---------
>   1 file changed, 19 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index c33a0464346f..2f03a94725ae 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -166,6 +166,8 @@ static struct workqueue_struct *nvmet_tcp_wq;
>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>   static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
> +static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd);
> +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd);
>   
>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>   		struct nvmet_tcp_cmd *cmd)
> @@ -297,6 +299,16 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
>   	return 0;
>   }
>   
> +static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd)
> +{
> +	WARN_ON(unlikely(cmd->nr_mapped > 0));
> +
> +	kfree(cmd->iov);
> +	sgl_free(cmd->req.sg);
> +	cmd->iov = NULL;
> +	cmd->req.sg = NULL;
> +}
> +
>   static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
>   {
>   	struct scatterlist *sg;
> @@ -306,6 +318,8 @@ static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
>   
>   	for (i = 0; i < cmd->nr_mapped; i++)
>   		kunmap(sg_page(&sg[i]));
> +
> +	cmd->nr_mapped = 0;
>   }
>   
>   static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
> @@ -387,7 +401,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
>   
>   	return 0;
>   err:
> -	sgl_free(cmd->req.sg);
> +	nvmet_tcp_free_iovec(cmd);
>   	return NVME_SC_INTERNAL;
>   }
>   
> @@ -632,10 +646,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
>   		}
>   	}
>   
> -	if (queue->nvme_sq.sqhd_disabled) {
> -		kfree(cmd->iov);
> -		sgl_free(cmd->req.sg);
> -	}
> +	if (queue->nvme_sq.sqhd_disabled)
> +		nvmet_tcp_free_iovec(cmd);
>   
>   	return 1;
>   
> @@ -664,8 +676,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
>   	if (left)
>   		return -EAGAIN;
>   
> -	kfree(cmd->iov);
> -	sgl_free(cmd->req.sg);
> +	nvmet_tcp_free_iovec(cmd);
>   	cmd->queue->snd_cmd = NULL;
>   	nvmet_tcp_put_cmd(cmd);
>   	return 1;
> @@ -1406,8 +1417,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
>   {
>   	nvmet_req_uninit(&cmd->req);
>   	nvmet_tcp_unmap_pdu_iovec(cmd);
> -	kfree(cmd->iov);
> -	sgl_free(cmd->req.sg);
> +	nvmet_tcp_free_iovec(cmd);
>   }
>   
>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
> 




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-10-21  8:41 ` [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work Maurizio Lombardi
@ 2021-10-21 14:57   ` John Meneghini
  2021-10-26 15:42   ` Sagi Grimberg
  1 sibling, 0 replies; 21+ messages in thread
From: John Meneghini @ 2021-10-21 14:57 UTC (permalink / raw)
  To: Maurizio Lombardi, linux-nvme
  Cc: hch, sagi, hare, chaitanya.kulkarni, John Meneghini

Reviewed-by: John Meneghini <jmeneghi@redhat.com>

On 10/21/21 4:41 AM, Maurizio Lombardi wrote:
> If the initiator executes a reset controller operation while
> performing I/O, the target kernel will crash because of a race condition
> between release_queue and io_work;
> nvmet_tcp_uninit_data_in_cmds() may be executed while io_work
> is running, calling flush_work(io_work) was not sufficient to
> prevent this because io_work could requeue itself.
> 
> * Fix this bug by preventing io_work from being enqueued when
> sk_user_data is NULL (it means that the queue is going to be deleted)
> 
> * Ensure that all the memory allocated for the commands' iovec is freed
> 
> Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
> ---
>   drivers/nvme/target/tcp.c | 13 +++++++++----
>   1 file changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 2f03a94725ae..1eedbd83c95f 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -551,6 +551,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
>   	struct nvmet_tcp_cmd *cmd =
>   		container_of(req, struct nvmet_tcp_cmd, req);
>   	struct nvmet_tcp_queue	*queue = cmd->queue;
> +	struct socket *sock = queue->sock;
>   	struct nvme_sgl_desc *sgl;
>   	u32 len;
>   
> @@ -570,7 +571,10 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
>   	}
>   
>   	llist_add(&cmd->lentry, &queue->resp_list);
> -	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
> +	read_lock_bh(&sock->sk->sk_callback_lock);
> +	if (likely(sock->sk->sk_user_data))
> +		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
> +	read_unlock_bh(&sock->sk->sk_callback_lock);
>   }
>   
>   static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
> @@ -1427,7 +1431,9 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
>   
>   	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
>   		if (nvmet_tcp_need_data_in(cmd))
> -			nvmet_tcp_finish_cmd(cmd);
> +			nvmet_req_uninit(&cmd->req);
> +		nvmet_tcp_unmap_pdu_iovec(cmd);
> +		nvmet_tcp_free_iovec(cmd);
>   	}
>   
>   	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
> @@ -1447,11 +1453,10 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>   	mutex_unlock(&nvmet_tcp_queue_mutex);
>   
>   	nvmet_tcp_restore_socket_callbacks(queue);
> -	flush_work(&queue->io_work);
> +	cancel_work_sync(&queue->io_work);
>   
>   	nvmet_tcp_uninit_data_in_cmds(queue);
>   	nvmet_sq_destroy(&queue->nvme_sq);
> -	cancel_work_sync(&queue->io_work);
>   	sock_release(queue->sock);
>   	nvmet_tcp_free_cmds(queue);
>   	if (queue->hdr_digest || queue->data_digest)
> 




* Re: [PATCH 1/2] nvmet: add an helper to free the iovec
  2021-10-21 14:56   ` John Meneghini
@ 2021-10-21 14:58     ` John Meneghini
  2021-10-27  0:15     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 21+ messages in thread
From: John Meneghini @ 2021-10-21 14:58 UTC (permalink / raw)
  To: Maurizio Lombardi, linux-nvme; +Cc: hch, sagi, hare, chaitanya.kulkarni

Reviewed-by: John Meneghini <jmeneghi@redhat.com>

On 10/21/21 10:56 AM, John Meneghini wrote:
> Looks good Maurizio.
> 
> You should also mention that you have a simple test you can run which reproduces this bug and results in a kernel panic.
> 
> We might want to add that test to the blktests tool.
> 
> Reviewed-by: John Meneghini <jmeneghi.redhat.com>
> 
> 
> On 10/21/21 4:41 AM, Maurizio Lombardi wrote:
>> Makes the code easier to read and to debug.
>>
>> Sets the freed pointers to NULL, it will be useful
>> when destroying the queues to understand if the
>> iovecs have been released already or not.
>>
>> Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
>> ---
>>   drivers/nvme/target/tcp.c | 28 +++++++++++++++++++---------
>>   1 file changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>> index c33a0464346f..2f03a94725ae 100644
>> --- a/drivers/nvme/target/tcp.c
>> +++ b/drivers/nvme/target/tcp.c
>> @@ -166,6 +166,8 @@ static struct workqueue_struct *nvmet_tcp_wq;
>>   static const struct nvmet_fabrics_ops nvmet_tcp_ops;
>>   static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
>>   static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
>> +static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd);
>> +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd);
>>   static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
>>           struct nvmet_tcp_cmd *cmd)
>> @@ -297,6 +299,16 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
>>       return 0;
>>   }
>> +static void nvmet_tcp_free_iovec(struct nvmet_tcp_cmd *cmd)
>> +{
>> +    WARN_ON(unlikely(cmd->nr_mapped > 0));
>> +
>> +    kfree(cmd->iov);
>> +    sgl_free(cmd->req.sg);
>> +    cmd->iov = NULL;
>> +    cmd->req.sg = NULL;
>> +}
>> +
>>   static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
>>   {
>>       struct scatterlist *sg;
>> @@ -306,6 +318,8 @@ static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
>>       for (i = 0; i < cmd->nr_mapped; i++)
>>           kunmap(sg_page(&sg[i]));
>> +
>> +    cmd->nr_mapped = 0;
>>   }
>>   static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
>> @@ -387,7 +401,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
>>       return 0;
>>   err:
>> -    sgl_free(cmd->req.sg);
>> +    nvmet_tcp_free_iovec(cmd);
>>       return NVME_SC_INTERNAL;
>>   }
>> @@ -632,10 +646,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
>>           }
>>       }
>> -    if (queue->nvme_sq.sqhd_disabled) {
>> -        kfree(cmd->iov);
>> -        sgl_free(cmd->req.sg);
>> -    }
>> +    if (queue->nvme_sq.sqhd_disabled)
>> +        nvmet_tcp_free_iovec(cmd);
>>       return 1;
>> @@ -664,8 +676,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
>>       if (left)
>>           return -EAGAIN;
>> -    kfree(cmd->iov);
>> -    sgl_free(cmd->req.sg);
>> +    nvmet_tcp_free_iovec(cmd);
>>       cmd->queue->snd_cmd = NULL;
>>       nvmet_tcp_put_cmd(cmd);
>>       return 1;
>> @@ -1406,8 +1417,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
>>   {
>>       nvmet_req_uninit(&cmd->req);
>>       nvmet_tcp_unmap_pdu_iovec(cmd);
>> -    kfree(cmd->iov);
>> -    sgl_free(cmd->req.sg);
>> +    nvmet_tcp_free_iovec(cmd);
>>   }
>>   static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
>>
> 




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-10-21  8:41 ` [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work Maurizio Lombardi
  2021-10-21 14:57   ` John Meneghini
@ 2021-10-26 15:42   ` Sagi Grimberg
  2021-10-28  7:55     ` Maurizio Lombardi
  1 sibling, 1 reply; 21+ messages in thread
From: Sagi Grimberg @ 2021-10-26 15:42 UTC (permalink / raw)
  To: Maurizio Lombardi, linux-nvme; +Cc: hch, hare, chaitanya.kulkarni, jmeneghi

Hi Maurizio,

On 10/21/21 11:41 AM, Maurizio Lombardi wrote:
> If the initiator executes a reset controller operation while
> performing I/O, the target kernel will crash because of a race condition
> between release_queue and io_work;

Can you add the stack trace?

> nvmet_tcp_uninit_data_in_cmds() may be executed while io_work
> is running, calling flush_work(io_work) was not sufficient to
> prevent this because io_work could requeue itself.


OK, then this should be sufficient to fix it, right?

> * Fix this bug by preventing io_work from being enqueued when
> sk_user_data is NULL (it means that the queue is going to be deleted)

This is triggered from the completion path, where the commands
are not in a state where they are still fetching data from the
host. How does this prevent the crash?

> * Ensure that all the memory allocated for the commands' iovec is freed

How is this needed to prevent a crash?



* Re: [PATCH 1/2] nvmet: add an helper to free the iovec
  2021-10-21 14:56   ` John Meneghini
  2021-10-21 14:58     ` John Meneghini
@ 2021-10-27  0:15     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 21+ messages in thread
From: Chaitanya Kulkarni @ 2021-10-27  0:15 UTC (permalink / raw)
  To: John Meneghini, Maurizio Lombardi, linux-nvme
  Cc: hch, sagi, hare, chaitanya.kulkarni

On 10/21/21 7:56 AM, John Meneghini wrote:
> Looks good Maurizio.
> 
> You should also mention that you have a simple test you can run which 
> reproduces this bug and results in a kernel panic.
> 
> We might want to add that test to the blktests tool.
> 
> Reviewed-by: John Meneghini <jmeneghi.redhat.com>
> 


Please add this test to blktests so that it gets tested on a regular
basis.





* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-10-26 15:42   ` Sagi Grimberg
@ 2021-10-28  7:55     ` Maurizio Lombardi
  2021-11-03  9:28       ` Sagi Grimberg
  0 siblings, 1 reply; 21+ messages in thread
From: Maurizio Lombardi @ 2021-10-28  7:55 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

Hi Sagi,

On Tue, Oct 26, 2021 at 06:42:02PM +0300, Sagi Grimberg wrote:
> On 10/21/21 11:41 AM, Maurizio Lombardi wrote:
> > If the initiator executes a reset controller operation while
> > performing I/O, the target kernel will crash because of a race condition
> > between release_queue and io_work;
> 
> Can you add the stack trace?

Yes, this is an example:

[  526.368376] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
[  526.598367] nvmet: creating controller 2 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
[  526.827356] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
[  526.837836] general protection fault, probably for non-canonical address 0x893b50fb71133fa5: 0000 [#1] SMP PTI
[  526.840045] CPU: 1 PID: 225 Comm: kworker/1:1H Kdump: loaded Not tainted 5.14.0_nvme1+ #1
[  526.841814] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  526.843064] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work [nvmet_tcp]
[  526.844398] RIP: 0010:__kmalloc+0xed/0x3f0
[  526.845300] Code: 08 65 4c 03 05 94 ad 2e 45 49 83 78 10 00 4d 8b 20 0f 84 95 02 00 00 4d 85 e4 0f 84 8c 02 00 00 8b 45 28 48 8b 7d 00 4c 01 e0 <48> 8b 18 48 89 c1 48 33 9d b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
[  526.849296] RSP: 0018:ffffaf66c014fc38 EFLAGS: 00010286
[  526.850427] RAX: 893b50fb71133fa5 RBX: ffff8e1c83f02450 RCX: 0000000000000000
[  526.851959] RDX: 000000000009de38 RSI: 0000000000000cc0 RDI: 00000000000311c0
[  526.853496] RBP: ffff8e1d80042e00 R08: ffff8e1eb7d311c0 R09: 0000000000000000
[  526.855027] R10: 00000000000035ee R11: 0000000000000000 R12: 893b50fb71132fa5
[  526.856561] R13: 0000000000000cc0 R14: 0000000000000000 R15: ffffffffc072a240
[  526.858092] FS:  0000000000000000(0000) GS:ffff8e1eb7d00000(0000) knlGS:0000000000000000
[  526.859804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  526.860913] CR2: 000055aa6316b0e8 CR3: 0000000052010001 CR4: 00000000007706e0
[  526.862292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  526.863658] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  526.865023] PKRU: 55555554
[  526.865564] Call Trace:
[  526.866059]  nvmet_tcp_map_data+0xc0/0x110 [nvmet_tcp]
[  526.867052]  nvmet_tcp_done_recv_pdu+0x13b/0x2b0 [nvmet_tcp]
[  526.868144]  nvmet_tcp_try_recv_pdu+0x17d/0x210 [nvmet_tcp]
[  526.869222]  nvmet_tcp_try_recv.constprop.0+0x67/0x300 [nvmet_tcp]
[  526.870414]  ? inet_sendpage+0x4f/0x80
[  526.871156]  nvmet_tcp_io_work+0x44/0xa69 [nvmet_tcp]
[  526.872134]  ? dequeue_task_fair+0xb1/0x370
[  526.872952]  ? finish_task_switch.isra.0+0xb1/0x290
[  526.873904]  process_one_work+0x1e6/0x380
[  526.874694]  worker_thread+0x53/0x3d0
[  526.875420]  ? process_one_work+0x380/0x380
[  526.876233]  kthread+0x10f/0x130
[  526.876872]  ? set_kthread_struct+0x40/0x40
[  526.877686]  ret_from_fork+0x22/0x30

If you want to reproduce it, just run something like
$ watch -n 0.1 "echo 1 > reset_controller"
while doing some I/O with dd.


> 
> > nvmet_tcp_uninit_data_in_cmds() may be executed while io_work
> > is running, calling flush_work(io_work) was not sufficient to
> > prevent this because io_work could requeue itself.
> 
> OK, then this should be sufficient to fix it right?

with "this" you mean calling cancel_work_sync() before nvmet_tcp_uninit_data_in_cmds()
and after nvmet_sq_destroy(), without checking sk_user_data in nvmet_tcp_queue_response() ?

If yes, I tried that; it's not sufficient, it still crashes:

[  374.050932] nvmet_tcp: failed cmd 00000000ba978688 id 50 opcode 1, data_len: 1310720
[  374.052699] nvmet_tcp: failed cmd 000000004ba3f36e id 60 opcode 1, data_len: 1310720
[  374.054465] nvmet_tcp: failed cmd 0000000071ea477d id 61 opcode 1, data_len: 1310720
[  374.056253] BUG: kernel NULL pointer dereference, address: 000000000000000c
[  374.057836] #PF: supervisor read access in kernel mode
[  374.059009] #PF: error_code(0x0000) - not-present page
[  374.060178] PGD 0 P4D 0
[  374.060777] Oops: 0000 [#1] SMP PTI
[  374.061578] CPU: 1 PID: 226 Comm: kworker/1:1H Kdump: loaded Not tainted 5.14.0_nvme1+ #1
[  374.063427] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  374.064749] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work [nvmet_tcp]
[  374.066137] RIP: 0010:nvmet_tcp_map_pdu_iovec+0x66/0x110 [nvmet_tcp]
[  374.067573] Code: 00 00 48 c1 e8 0c 41 c1 ef 0c 44 89 bf e8 01 00 00 48 63 c8 49 c1 e7 05 4c 03 7f 30 89 87 ec 01 00 00 85 ed 0f 84 9d 00 00 00 <45> 8b 67 0c 49 8b 1f 45 29 ec 41 39 ec 44 0f 47 e5 48 83 e3 fc 49
[  374.071780] RSP: 0018:ffffa98c8016fc48 EFLAGS: 00010206
[  374.072965] RAX: 0000000000000140 RBX: ffff91c1da819208 RCX: 0000000000000140
[  374.074579] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff91c1da819208
[  374.076188] RBP: 0000000000140000 R08: 0000000000010230 R09: 0000000000017e3a
[  374.077788] R10: 0000000000000125 R11: 0000000000000010 R12: 0000000000000000
[  374.079396] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[  374.081004] FS:  0000000000000000(0000) GS:ffff91c2f7d00000(0000) knlGS:0000000000000000
[  374.082835] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  374.084117] CR2: 000000000000000c CR3: 000000010c79a003 CR4: 00000000007706e0
[  374.085669] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  374.087147] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  374.088586] PKRU: 55555554
[  374.089214] Call Trace:
[  374.089807]  nvmet_tcp_done_recv_pdu+0x24c/0x2b0 [nvmet_tcp]
[  374.090971]  nvmet_tcp_try_recv_pdu+0x17d/0x210 [nvmet_tcp]
[  374.092101]  ? update_load_avg+0x7a/0x5a0
[  374.092969]  nvmet_tcp_try_recv.constprop.0+0x67/0x310 [nvmet_tcp]
[  374.094323]  ? set_next_entity+0xac/0x1e0
[  374.095149]  ? pick_next_task_fair+0x18e/0x3b0
[  374.096057]  nvmet_tcp_io_work+0x44/0xabd [nvmet_tcp]
[  374.097141]  ? dequeue_task_fair+0xb1/0x370
[  374.098087]  ? finish_task_switch.isra.0+0xb1/0x290
[  374.099088]  process_one_work+0x1e6/0x380
[  374.099922]  worker_thread+0x53/0x3d0
[  374.100670]  ? process_one_work+0x380/0x380
[  374.101631]  kthread+0x10f/0x130
[  374.102336]  ? set_kthread_struct+0x40/0x40
[  374.103194]  ret_from_fork+0x22/0x30
[  374.103974] Modules linked in: nvmet_tcp nvmet t10_pi rfkill intel_rapl_msr intel_rapl_common isst_if_common nfit libnvdimm i2c_piix4 pcspkr joydev virtio_balloon drm fuse xfs libcrc32c ata_generic crct10dif_pclmul crc32_pclmul crc32c_intel virtio_net ghash_clmulni_intel net_failover ata_piix virtio_blk libata serio_raw failover dm_mirror dm_region_hash dm_log dm_mod
[  374.110690] CR2: 000000000000000c

> 
> > * Fix this bug by preventing io_work from being enqueued when
> > sk_user_data is NULL (it means that the queue is going to be deleted)
> 
> This is triggered from the completion path, where the commands
> are not in a state where they are still fetching data from the
> host. How does this prevent the crash?

io_work is also triggered every time nvmet_req_init() fails and when
nvmet_sq_destroy() is called; I am not really sure about the state
of the commands in those cases.


> 
> > * Ensure that all the memory allocated for the commands' iovec is freed
> 
> How is this needed to prevent a crash?

It's not really needed to prevent the crash, but without it
you will see massive memory leaks.
Sometimes the leaks exhausted the entire system memory
before the crash was even hit.


Maurizio




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-10-28  7:55     ` Maurizio Lombardi
@ 2021-11-03  9:28       ` Sagi Grimberg
  2021-11-03 11:31         ` Maurizio Lombardi
  0 siblings, 1 reply; 21+ messages in thread
From: Sagi Grimberg @ 2021-11-03  9:28 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi



On 10/28/21 10:55 AM, Maurizio Lombardi wrote:
> Hi Sagi,
> 
> On Tue, Oct 26, 2021 at 06:42:02PM +0300, Sagi Grimberg wrote:
>> On 10/21/21 11:41 AM, Maurizio Lombardi wrote:
>>> If the initiator executes a reset controller operation while
>>> performing I/O, the target kernel will crash because of a race condition
>>> between release_queue and io_work;
>>
>> Can you add the stack trace?
> 
> Yes, this is an example:
> 
> [  526.368376] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
> [  526.598367] nvmet: creating controller 2 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
> [  526.827356] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:119cb961-04c6-5845-9ed0-66a692ae0ddd.
> [  526.837836] general protection fault, probably for non-canonical address 0x893b50fb71133fa5: 0000 [#1] SMP PTI
> [  526.840045] CPU: 1 PID: 225 Comm: kworker/1:1H Kdump: loaded Not tainted 5.14.0_nvme1+ #1
> [  526.841814] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  526.843064] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work [nvmet_tcp]
> [  526.844398] RIP: 0010:__kmalloc+0xed/0x3f0
> [  526.845300] Code: 08 65 4c 03 05 94 ad 2e 45 49 83 78 10 00 4d 8b 20 0f 84 95 02 00 00 4d 85 e4 0f 84 8c 02 00 00 8b 45 28 48 8b 7d 00 4c 01 e0 <48> 8b 18 48 89 c1 48 33 9d b8 00 00 00 4c 89 e0 48 0f c9 48 31 cb
> [  526.849296] RSP: 0018:ffffaf66c014fc38 EFLAGS: 00010286
> [  526.850427] RAX: 893b50fb71133fa5 RBX: ffff8e1c83f02450 RCX: 0000000000000000
> [  526.851959] RDX: 000000000009de38 RSI: 0000000000000cc0 RDI: 00000000000311c0
> [  526.853496] RBP: ffff8e1d80042e00 R08: ffff8e1eb7d311c0 R09: 0000000000000000
> [  526.855027] R10: 00000000000035ee R11: 0000000000000000 R12: 893b50fb71132fa5
> [  526.856561] R13: 0000000000000cc0 R14: 0000000000000000 R15: ffffffffc072a240
> [  526.858092] FS:  0000000000000000(0000) GS:ffff8e1eb7d00000(0000) knlGS:0000000000000000
> [  526.859804] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  526.860913] CR2: 000055aa6316b0e8 CR3: 0000000052010001 CR4: 00000000007706e0
> [  526.862292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  526.863658] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  526.865023] PKRU: 55555554
> [  526.865564] Call Trace:
> [  526.866059]  nvmet_tcp_map_data+0xc0/0x110 [nvmet_tcp]
> [  526.867052]  nvmet_tcp_done_recv_pdu+0x13b/0x2b0 [nvmet_tcp]
> [  526.868144]  nvmet_tcp_try_recv_pdu+0x17d/0x210 [nvmet_tcp]
> [  526.869222]  nvmet_tcp_try_recv.constprop.0+0x67/0x300 [nvmet_tcp]
> [  526.870414]  ? inet_sendpage+0x4f/0x80
> [  526.871156]  nvmet_tcp_io_work+0x44/0xa69 [nvmet_tcp]
> [  526.872134]  ? dequeue_task_fair+0xb1/0x370
> [  526.872952]  ? finish_task_switch.isra.0+0xb1/0x290
> [  526.873904]  process_one_work+0x1e6/0x380
> [  526.874694]  worker_thread+0x53/0x3d0
> [  526.875420]  ? process_one_work+0x380/0x380
> [  526.876233]  kthread+0x10f/0x130
> [  526.876872]  ? set_kthread_struct+0x40/0x40
> [  526.877686]  ret_from_fork+0x22/0x30
> 
> if you want to reproduce it just run something like $watch -n 0.1 "echo 1 > reset_controller"
> while you do some I/O with dd
> 
> 
>>
>>> nvmet_tcp_uninit_data_in_cmds() may be executed while io_work
>>> is running, calling flush_work(io_work) was not sufficient to
>>> prevent this because io_work could requeue itself.
>>
>> OK, then this should be sufficient to fix it right?
> 
> with "this" you mean calling cancel_work_sync() before nvmet_tcp_uninit_data_in_cmds()
> and after nvmet_sq_destroy(), without checking sk_user_data in nvmet_tcp_queue_response() ?
> 
> If yes, I tried that; it's not sufficient, it still crashes:
> 
> [  374.050932] nvmet_tcp: failed cmd 00000000ba978688 id 50 opcode 1, data_len: 1310720
> [  374.052699] nvmet_tcp: failed cmd 000000004ba3f36e id 60 opcode 1, data_len: 1310720
> [  374.054465] nvmet_tcp: failed cmd 0000000071ea477d id 61 opcode 1, data_len: 1310720
> [  374.056253] BUG: kernel NULL pointer dereference, address: 000000000000000c
> [  374.057836] #PF: supervisor read access in kernel mode
> [  374.059009] #PF: error_code(0x0000) - not-present page
> [  374.060178] PGD 0 P4D 0
> [  374.060777] Oops: 0000 [#1] SMP PTI
> [  374.061578] CPU: 1 PID: 226 Comm: kworker/1:1H Kdump: loaded Not tainted 5.14.0_nvme1+ #1
> [  374.063427] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  374.064749] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work [nvmet_tcp]
> [  374.066137] RIP: 0010:nvmet_tcp_map_pdu_iovec+0x66/0x110 [nvmet_tcp]
> [  374.067573] Code: 00 00 48 c1 e8 0c 41 c1 ef 0c 44 89 bf e8 01 00 00 48 63 c8 49 c1 e7 05 4c 03 7f 30 89 87 ec 01 00 00 85 ed 0f 84 9d 00 00 00 <45> 8b 67 0c 49 8b 1f 45 29 ec 41 39 ec 44 0f 47 e5 48 83 e3 fc 49
> [  374.071780] RSP: 0018:ffffa98c8016fc48 EFLAGS: 00010206
> [  374.072965] RAX: 0000000000000140 RBX: ffff91c1da819208 RCX: 0000000000000140
> [  374.074579] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff91c1da819208
> [  374.076188] RBP: 0000000000140000 R08: 0000000000010230 R09: 0000000000017e3a
> [  374.077788] R10: 0000000000000125 R11: 0000000000000010 R12: 0000000000000000
> [  374.079396] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [  374.081004] FS:  0000000000000000(0000) GS:ffff91c2f7d00000(0000) knlGS:0000000000000000
> [  374.082835] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  374.084117] CR2: 000000000000000c CR3: 000000010c79a003 CR4: 00000000007706e0
> [  374.085669] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  374.087147] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  374.088586] PKRU: 55555554
> [  374.089214] Call Trace:
> [  374.089807]  nvmet_tcp_done_recv_pdu+0x24c/0x2b0 [nvmet_tcp]
> [  374.090971]  nvmet_tcp_try_recv_pdu+0x17d/0x210 [nvmet_tcp]
> [  374.092101]  ? update_load_avg+0x7a/0x5a0
> [  374.092969]  nvmet_tcp_try_recv.constprop.0+0x67/0x310 [nvmet_tcp]
> [  374.094323]  ? set_next_entity+0xac/0x1e0
> [  374.095149]  ? pick_next_task_fair+0x18e/0x3b0
> [  374.096057]  nvmet_tcp_io_work+0x44/0xabd [nvmet_tcp]
> [  374.097141]  ? dequeue_task_fair+0xb1/0x370
> [  374.098087]  ? finish_task_switch.isra.0+0xb1/0x290
> [  374.099088]  process_one_work+0x1e6/0x380
> [  374.099922]  worker_thread+0x53/0x3d0
> [  374.100670]  ? process_one_work+0x380/0x380
> [  374.101631]  kthread+0x10f/0x130
> [  374.102336]  ? set_kthread_struct+0x40/0x40
> [  374.103194]  ret_from_fork+0x22/0x30
> [  374.103974] Modules linked in: nvmet_tcp nvmet t10_pi rfkill intel_rapl_msr intel_rapl_common isst_if_common nfit libnvdimm i2c_piix4 pcspkr joydev virtio_balloon drm fuse xfs libcrc32c ata_generic crct10dif_pclmul crc32_pclmul crc32c_intel virtio_net ghash_clmulni_intel net_failover ata_piix virtio_blk libata serio_raw failover dm_mirror dm_region_hash dm_log dm_mod
> [  374.110690] CR2: 000000000000000c

So this means we still get data from the network when
we shouldn't. Maybe we are simply missing a kernel_sock_shutdown
for SHUT_RD?

> 
>>
>>> * Fix this bug by preventing io_work from being enqueued when
>>> sk_user_data is NULL (it means that the queue is going to be deleted)
>>
>> This is triggered from the completion path, where the commands
>> are not in a state where they are still fetching data from the
>> host. How does this prevent the crash?
> 
> io_work is also triggered every time a nvmet_req_init() fails and when
> nvmet_sq_destroy() is called, I am not really sure about the state
> of the commands in those cases.


But that is from the workqueue context, which means that
cancel_work_sync() should prevent it, right?

> 
> 
>>
>>> * Ensure that all the memory allocated for the commands' iovec is freed
>>
>> How is this needed to prevent a crash?
> 
> It's not really needed to prevent the crash, but without this
> you will see massive memory leaks.
> It happened sometimes that it exhausted the entire system memory
> before hitting the crash.

But that needs to be a separate fix and not combined with other
fixes.



* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-03  9:28       ` Sagi Grimberg
@ 2021-11-03 11:31         ` Maurizio Lombardi
  2021-11-04 12:59           ` Sagi Grimberg
  0 siblings, 1 reply; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-03 11:31 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

On Wed, Nov 03, 2021 at 11:28:35AM +0200, Sagi Grimberg wrote:
> 
> So this means we still get data from the network when
> we shouldn't. Maybe we are simply missing a kernel_sock_shutdown
> for SHUT_RD?

Hmm, right, kernel_sock_shutdown(queue->sock) is executed in
nvmet_tcp_delete_ctrl() and sock_release(queue->sock) is called in
nvmet_tcp_release_queue_work(), so there could be a race here.

I will try to move kernel_sock_shutdown(queue->sock) into
nvmet_tcp_release_queue_work() and test it.

> 
> > 
> > > 
> > > > * Fix this bug by preventing io_work from being enqueued when
> > > > sk_user_data is NULL (it means that the queue is going to be deleted)
> > > 
> > > This is triggered from the completion path, where the commands
> > > are not in a state where they are still fetching data from the
> > > host. How does this prevent the crash?
> > 
> > io_work is also triggered every time a nvmet_req_init() fails and when
> > nvmet_sq_destroy() is called, I am not really sure about the state
> > of the commands in those cases.
> 
> But that is from the workqueue context - which means that
> cancel_work_sync should prevent it right?


But nvmet_sq_destroy() is called from the release_work context;
we call cancel_work_sync() immediately after, but we can't be sure
that the work will be canceled: io_work might have started already,
and cancel_work_sync() will block until io_work ends its job, right?

> 
> But that needs to be a separate fix and not combined with other
> fixes.

OK, I will submit it as a separate patch.

Maurizio




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-03 11:31         ` Maurizio Lombardi
@ 2021-11-04 12:59           ` Sagi Grimberg
  2021-11-12 10:54             ` Maurizio Lombardi
  0 siblings, 1 reply; 21+ messages in thread
From: Sagi Grimberg @ 2021-11-04 12:59 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi


>> So this means we still get data from the network when
>> we shouldn't. Maybe we are simply missing a kernel_sock_shutdown
>> for SHUT_RD?
> 
> Hmm, right, kernel_sock_shutdown(queue->sock) is executed in
> nvmet_tcp_delete_ctrl() and sock_release(queue->sock) is called
> in nvmet_tcp_release_queue_work(), so there could be a race here.
> 
> I will try to move kernel_sock_shutdown(queue->sock) in nvmet_tcp_release_queue_work()
> and test it.

Not move, but perhaps add another SHUT_RD call.

> 
>>
>>>
>>>>
>>>>> * Fix this bug by preventing io_work from being enqueued when
>>>>> sk_user_data is NULL (it means that the queue is going to be deleted)
>>>>
>>>> This is triggered from the completion path, where the commands
>>>> are not in a state where they are still fetching data from the
>>>> host. How does this prevent the crash?
>>>
>>> io_work is also triggered every time a nvmet_req_init() fails and when
>>> nvmet_sq_destroy() is called, I am not really sure about the state
>>> of the commands in those cases.
>>
>> But that is from the workqueue context - which means that
>> cancel_work_sync should prevent it right?
> 
> 
> But nvmet_sq_destroy() is called from the release_work context,

Yes

> we call cancel_work_sync() immediately after but we can't be sure
> that the work will be canceled, io_work might have started already and
> cancel_work_sync() will block until io_work ends its job, right?


Right, after the call to cancel_work_sync we will know that io_work
is not running. Note that it can still run as a result of a backend
completion, but that is OK; we do want to let it run and return the
completion to the host. However, the socket should already be shut
down for recv, so we cannot get any other bytes from the network.



* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-04 12:59           ` Sagi Grimberg
@ 2021-11-12 10:54             ` Maurizio Lombardi
  2021-11-12 15:54               ` John Meneghini
  2021-11-14 10:28               ` Sagi Grimberg
  0 siblings, 2 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-12 10:54 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

Hi Sagi,

On Thu, Nov 04, 2021 at 02:59:53PM +0200, Sagi Grimberg wrote:
> 
> Right, after the call to cancel_work_sync we will know that io_work
> is not running. Note that it can run as a result of a backend completion
> but that is ok and we do want to let it run and return completion to the
> host, but the socket should already be shut down for recv, so we cannot
> get any other byte from the network.


I did some tests and I found out that kernel_recvmsg() sometimes
returns data even if the socket has already been shut down (maybe it's
data that was received before the call to kernel_sock_shutdown() and
is still waiting in some internal buffer?).

So when nvmet_sq_destroy() triggered io_work, recvmsg() still returned
data and the kernel crashed again even though the socket was closed.

Therefore, I think that after we shut down the socket we should let
io_work run and requeue itself until it finishes its job and no more
data is returned by recvmsg(); one way to achieve this is to
repeatedly call flush_work() until it returns false.

Right now I am testing the patch below and it works perfectly.

Note that when the socket is closed recvmsg() might return 0;
nvmet_tcp_try_recv_data() should return -EAGAIN in that case,
otherwise we end up in an infinite loop (io_work will continuously
requeue itself).

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 2f03a94725ae..7b441071c6b9 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1139,8 +1139,10 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
 	while (msg_data_left(&cmd->recv_msg)) {
 		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
 			cmd->recv_msg.msg_flags);
-		if (ret <= 0)
+		if (ret < 0)
 			return ret;
+		else if (ret == 0)
+			return -EAGAIN;
 
 		cmd->pdu_recv += ret;
 		cmd->rbytes_done += ret;
@@ -1446,8 +1450,10 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	list_del_init(&queue->queue_list);
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 
+	kernel_sock_shutdown(queue->sock, SHUT_RD);
+
 	nvmet_tcp_restore_socket_callbacks(queue);
-	flush_work(&queue->io_work);
+	while (flush_work(&queue->io_work));
 
 	nvmet_tcp_uninit_data_in_cmds(queue);
 	nvmet_sq_destroy(&queue->nvme_sq);




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-12 10:54             ` Maurizio Lombardi
@ 2021-11-12 15:54               ` John Meneghini
  2021-11-15  7:52                 ` Maurizio Lombardi
  2021-11-14 10:28               ` Sagi Grimberg
  1 sibling, 1 reply; 21+ messages in thread
From: John Meneghini @ 2021-11-12 15:54 UTC (permalink / raw)
  To: Maurizio Lombardi, Sagi Grimberg
  Cc: linux-nvme, hch, hare, chaitanya.kulkarni

Nice work Maurizio. This should solve some of the problems we are seeing with nvme/tcp shutdown.

Do you think we have a similar problem on the host side, in nvme_tcp_init_connection?

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8cb15ee5b249..adca40c932b7 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1271,8 +1271,12 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
         memset(&msg, 0, sizeof(msg));
         iov.iov_base = icresp;
         iov.iov_len = sizeof(*icresp);
-       ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+
+       do {
+               ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
                         iov.iov_len, msg.msg_flags);
+        } while (ret == 0);
+
         if (ret < 0)
                 goto free_icresp;



On 11/12/21 05:54, Maurizio Lombardi wrote:
> Hi Sagi,
> 
> On Thu, Nov 04, 2021 at 02:59:53PM +0200, Sagi Grimberg wrote:
>>
>> Right, after the call to cancel_work_sync we will know that io_work
>> is not running. Note that it can run as a result of a backend completion
>> but that is ok and we do want to let it run and return completion to the
>> host, but the socket should already be shut down for recv, so we cannot
>> get any other byte from the network.
> 
> 
> I did some tests and I found out that kernel_recvmsg() sometimes returns
> data even if the socket has been already shut down (maybe it's data it received
> before the call to kernel_sock_shutdown() and waiting in some internal buffer?).
> 
> So when nvmet_sq_destroy() triggered io_work, recvmsg() still returned data
> and the kernel crashed again despite the socket was closed.
> 
> Therefore, I think that after we shut down the socket we
> should let io_work run and requeue itself until it finishes its job
> and no more data is returned by recvmsg(),
> one way to achieve this is to repeatedly call flush_work() until it returns
> false.
> 
> Right now I am testing the patch below and it works perfectly.
> 
> Note that when the socket is closed recvmsg() might return 0,
> nvmet_tcp_try_recv_data() should return -EAGAIN
> in that case otherwise we end up in an infinite loop (io_work
> will countinously requeue itself).
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 2f03a94725ae..7b441071c6b9 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1139,8 +1139,10 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
>   	while (msg_data_left(&cmd->recv_msg)) {
>   		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
>   			cmd->recv_msg.msg_flags);
> -		if (ret <= 0)
> +		if (ret < 0)
>   			return ret;
> +		else if (ret == 0)
> +			return -EAGAIN;
>   
>   		cmd->pdu_recv += ret;
>   		cmd->rbytes_done += ret;
> @@ -1446,8 +1450,10 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>   	list_del_init(&queue->queue_list);
>   	mutex_unlock(&nvmet_tcp_queue_mutex);
>   
> +	kernel_sock_shutdown(queue->sock, SHUT_RD);
> +
>   	nvmet_tcp_restore_socket_callbacks(queue);
> -	flush_work(&queue->io_work);
> +	while (flush_work(&queue->io_work));
>   
>   	nvmet_tcp_uninit_data_in_cmds(queue);
>   	nvmet_sq_destroy(&queue->nvme_sq);
> 




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-12 10:54             ` Maurizio Lombardi
  2021-11-12 15:54               ` John Meneghini
@ 2021-11-14 10:28               ` Sagi Grimberg
  2021-11-15  7:47                 ` Maurizio Lombardi
  1 sibling, 1 reply; 21+ messages in thread
From: Sagi Grimberg @ 2021-11-14 10:28 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi



On 11/12/21 12:54 PM, Maurizio Lombardi wrote:
> Hi Sagi,
> 
> On Thu, Nov 04, 2021 at 02:59:53PM +0200, Sagi Grimberg wrote:
>>
>> Right, after the call to cancel_work_sync we will know that io_work
>> is not running. Note that it can run as a result of a backend completion
>> but that is ok and we do want to let it run and return completion to the
>> host, but the socket should already be shut down for recv, so we cannot
>> get any other byte from the network.
> 
> 
> I did some tests and I found out that kernel_recvmsg() sometimes returns
> data even if the socket has been already shut down (maybe it's data it received
> before the call to kernel_sock_shutdown() and waiting in some internal buffer?).
> 
> So when nvmet_sq_destroy() triggered io_work, recvmsg() still returned data
> and the kernel crashed again despite the socket was closed.
> 
> Therefore, I think that after we shut down the socket we
> should let io_work run and requeue itself until it finishes its job
> and no more data is returned by recvmsg(),
> one way to achieve this is to repeatedly call flush_work() until it returns
> false.
> 
> Right now I am testing the patch below and it works perfectly.
> 
> Note that when the socket is closed recvmsg() might return 0,
> nvmet_tcp_try_recv_data() should return -EAGAIN
> in that case otherwise we end up in an infinite loop (io_work
> will countinously requeue itself).
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 2f03a94725ae..7b441071c6b9 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1139,8 +1139,10 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
>   	while (msg_data_left(&cmd->recv_msg)) {
>   		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
>   			cmd->recv_msg.msg_flags);
> -		if (ret <= 0)
> +		if (ret < 0)
>   			return ret;
> +		else if (ret == 0)
> +			return -EAGAIN;
>   
>   		cmd->pdu_recv += ret;
>   		cmd->rbytes_done += ret;
> @@ -1446,8 +1450,10 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>   	list_del_init(&queue->queue_list);
>   	mutex_unlock(&nvmet_tcp_queue_mutex);
>   
> +	kernel_sock_shutdown(queue->sock, SHUT_RD);
> +
>   	nvmet_tcp_restore_socket_callbacks(queue);


I think you should shut down the socket after you restore the socket
callbacks, and after that call cancel_work_sync() to deny a
self-requeue.

> -	flush_work(&queue->io_work);
> +	while (flush_work(&queue->io_work));
>   
>   	nvmet_tcp_uninit_data_in_cmds(queue);
>   	nvmet_sq_destroy(&queue->nvme_sq);
> 

How about something like this:
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 6eb0b3153477..6425375faf5b 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1436,7 +1436,9 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
         mutex_unlock(&nvmet_tcp_queue_mutex);

         nvmet_tcp_restore_socket_callbacks(queue);
-       flush_work(&queue->io_work);
+       /* stop accepting incoming data */
+       kernel_sock_shutdown(queue->sock, SHUT_RD);
+       cancel_work_sync(&queue->io_work);

         nvmet_tcp_uninit_data_in_cmds(queue);
         nvmet_sq_destroy(&queue->nvme_sq);
--




* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-14 10:28               ` Sagi Grimberg
@ 2021-11-15  7:47                 ` Maurizio Lombardi
  2021-11-15  9:48                   ` Sagi Grimberg
  0 siblings, 1 reply; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-15  7:47 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

Hello Sagi,

On Sun, Nov 14, 2021 at 12:28:45PM +0200, Sagi Grimberg wrote:
> 
> 
> I think you should shutdown the socket after you restore the socket
> callbacks, and after that cancel_work_sync to deny a self-requeue.
> 
> > -	flush_work(&queue->io_work);
> > +	while (flush_work(&queue->io_work));
> >   	nvmet_tcp_uninit_data_in_cmds(queue);
> >   	nvmet_sq_destroy(&queue->nvme_sq);
> > 
> 
> How about something like this:
> --
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 6eb0b3153477..6425375faf5b 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1436,7 +1436,9 @@ static void nvmet_tcp_release_queue_work(struct
> work_struct *w)
>         mutex_unlock(&nvmet_tcp_queue_mutex);
> 
>         nvmet_tcp_restore_socket_callbacks(queue);
> -       flush_work(&queue->io_work);
> +       /* stop accepting incoming data */
> +       kernel_sock_shutdown(queue->sock, SHUT_RD);
> +       cancel_work_sync(&queue->io_work);
> 
>         nvmet_tcp_uninit_data_in_cmds(queue);
>         nvmet_sq_destroy(&queue->nvme_sq);
> --
> 

This one doesn't work; remember that nvmet_sq_destroy() will start
io_work again and, as I wrote in my previous email, recvmsg() might
still return some data after the socket has been shut down.

[ 1034.234574] nvmet: creating controller 2 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:a5f7dc87-f3f0-ea44-bfd1-0019ebbb4112. 
[ 1034.764498] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:a5f7dc87-f3f0-ea44-bfd1-0019ebbb4112. 
[ 1035.291398] nvmet: creating controller 2 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:a5f7dc87-f3f0-ea44-bfd1-0019ebbb4112. 
[ 1038.471686] nvmet: creating controller 1 for subsystem test-nqn for NQN nqn.2014-08.org.nvmexpress:uuid:a5f7dc87-f3f0-ea44-bfd1-0019ebbb4112. 
[ 1039.002853] general protection fault, probably for non-canonical address 0x51ede3866dd88e22: 0000 [#1] SMP PTI 
[ 1039.006622] CPU: 1 PID: 4869 Comm: kworker/1:2H Kdump: loaded Tainted: G        W        --------- ---  5.14.0_nvmet1+ #1 
[ 1039.008778] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 
[ 1039.009840] Workqueue: nvmet_tcp_wq nvmet_tcp_io_work [nvmet_tcp] 
[ 1039.011040] RIP: 0010:memcpy_erms+0x6/0x10 
[ 1039.011841] Code: cc cc cc cc eb 1e 0f 1f 00 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 f3 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 <f3> a4 c3 0f 1f 80 00 00 00 00 48 89 f8 48 83 fa 20 72 7e 40 38 fe 
[ 1039.015216] RSP: 0018:ffffc02c4059fb30 EFLAGS: 00010206 
[ 1039.016176] RAX: 51ede3866dd88e22 RBX: 000000000000248e RCX: 0000000000001000 
[ 1039.017472] RDX: 0000000000001000 RSI: ffff99d7c397948e RDI: 51ede3866dd88e22 
[ 1039.018767] RBP: ffff99d7cc02b000 R08: 0000000000000001 R09: 000000000000fe88 
[ 1039.020074] R10: 000000000000012a R11: 0000000000003ed6 R12: 0000000000000000 
[ 1039.021371] R13: 0000000000001000 R14: ffff99d7cc02b010 R15: 0000000000001a48 
[ 1039.022668] FS:  0000000000000000(0000) GS:ffff99d8f7d00000(0000) knlGS:0000000000000000 
[ 1039.024134] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 
[ 1039.025181] CR2: 00007ffbf45c1658 CR3: 00000001051e4003 CR4: 00000000003706e0 
[ 1039.026497] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 
[ 1039.027836] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 
[ 1039.029185] Call Trace: 
[ 1039.029696]  _copy_to_iter+0x44d/0x710 
[ 1039.030483]  ? __virt_addr_valid+0x45/0x70 
[ 1039.031284]  ? __check_object_size.part.0+0x45/0x140 
[ 1039.032200]  __skb_datagram_iter+0x19b/0x2d0 
[ 1039.032990]  ? zerocopy_sg_from_iter+0x50/0x50 
[ 1039.033788]  skb_copy_datagram_iter+0x33/0x80 
[ 1039.034568]  tcp_recvmsg_locked+0x299/0x960 
[ 1039.035338]  tcp_recvmsg+0x72/0x1d0 
[ 1039.036023]  ? bpf_lsm_socket_sendmsg+0x10/0x10 
[ 1039.036929]  inet_recvmsg+0x57/0xf0 
[ 1039.037624]  nvmet_tcp_try_recv.constprop.0+0xcb/0x310 [nvmet_tcp] 
[ 1039.038913]  ? pick_next_task_fair+0x39/0x3b0 
[ 1039.039771]  nvmet_tcp_io_work+0x44/0xabd [nvmet_tcp] 
[ 1039.040684]  ? dequeue_task_fair+0xb1/0x370 
[ 1039.041430]  ? finish_task_switch.isra.0+0xb1/0x290 
[ 1039.042305]  process_one_work+0x1e6/0x380 
[ 1039.043061]  worker_thread+0x53/0x3d0 
[ 1039.043720]  ? process_one_work+0x380/0x380 
[ 1039.044467]  kthread+0x10f/0x130 
[ 1039.045059]  ? set_kthread_struct+0x40/0x40 
[ 1039.045806]  ret_from_fork+0x22/0x30 
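
To make the recvmsg() point above concrete, here is a minimal userspace
sketch; this is not nvmet code, just plain Linux TCP over loopback as I
understand it: bytes that already sit in the socket receive queue are still
returned by recv() after shutdown(SHUT_RD); they are not discarded.

--
/* shut_rd_demo.c - userspace illustration only.
 * Shows that on Linux TCP, data already queued in the receive buffer
 * is still returned by recv() after shutdown(SHUT_RD). */
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in a = { .sin_family = AF_INET,
				 .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };
	socklen_t alen = sizeof(a);
	char buf[64];

	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	bind(lfd, (struct sockaddr *)&a, sizeof(a));	/* port 0: kernel picks one */
	listen(lfd, 1);
	getsockname(lfd, (struct sockaddr *)&a, &alen);

	int cfd = socket(AF_INET, SOCK_STREAM, 0);
	connect(cfd, (struct sockaddr *)&a, sizeof(a));
	int sfd = accept(lfd, NULL, NULL);

	send(cfd, "pending data", 12, 0);
	usleep(100000);			/* let the bytes reach sfd's receive queue */

	shutdown(sfd, SHUT_RD);		/* userspace analog of kernel_sock_shutdown(SHUT_RD) */
	printf("recv after SHUT_RD: %zd\n", recv(sfd, buf, sizeof(buf), 0)); /* 12, not 0 */

	close(sfd);
	close(cfd);
	close(lfd);
	return 0;
}
--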


Maurizio



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-12 15:54               ` John Meneghini
@ 2021-11-15  7:52                 ` Maurizio Lombardi
  0 siblings, 0 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-15  7:52 UTC (permalink / raw)
  To: John Meneghini; +Cc: Sagi Grimberg, linux-nvme, hch, hare, chaitanya.kulkarni

Hi,

On Fri, Nov 12, 2021 at 10:54:42AM -0500, John Meneghini wrote:
> Nice work Maurizio. This should solve some of the problems we are seeing with nvme/tcp shutdown.
> 
> Do you think we have a similar problem on the host side, in nvme_tcp_init_connection?
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 8cb15ee5b249..adca40c932b7 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1271,8 +1271,12 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
>         memset(&msg, 0, sizeof(msg));
>         iov.iov_base = icresp;
>         iov.iov_len = sizeof(*icresp);
> -       ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> +
> +       do {
> +               ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
>                         iov.iov_len, msg.msg_flags);
> +        } while (ret == 0);
> +
>         if (ret < 0)
>                 goto free_icresp;
>


At the moment I don't know whether there is a similar problem on the host
side; I'll have to look into it.
However, I don't think this patch will work: once the socket has been shut
down, recvmsg() will keep returning 0 and the loop will never terminate.
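
If a loop is wanted there at all, ret == 0 would have to be treated as a
terminal condition rather than retried. Just as a sketch of the idea (not
tested, and the -ECONNRESET choice is my assumption, not something the host
driver does today):

--
	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (ret == 0) {
		/* socket shut down or peer closed: no more data will arrive */
		ret = -ECONNRESET;	/* assumed errno, pick whatever fits */
		goto free_icresp;
	}
	if (ret < 0)
		goto free_icresp;
--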

Maurizio



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-15  7:47                 ` Maurizio Lombardi
@ 2021-11-15  9:48                   ` Sagi Grimberg
  2021-11-15 10:00                     ` Maurizio Lombardi
  0 siblings, 1 reply; 21+ messages in thread
From: Sagi Grimberg @ 2021-11-15  9:48 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi


>> How about something like this:
>> --
>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>> index 6eb0b3153477..6425375faf5b 100644
>> --- a/drivers/nvme/target/tcp.c
>> +++ b/drivers/nvme/target/tcp.c
>> @@ -1436,7 +1436,9 @@ static void nvmet_tcp_release_queue_work(struct
>> work_struct *w)
>>          mutex_unlock(&nvmet_tcp_queue_mutex);
>>
>>          nvmet_tcp_restore_socket_callbacks(queue);
>> -       flush_work(&queue->io_work);
>> +       /* stop accepting incoming data */
>> +       kernel_sock_shutdown(queue->sock, SHUT_RD);
>> +       cancel_work_sync(&queue->io_work);
>>
>>          nvmet_tcp_uninit_data_in_cmds(queue);
>>          nvmet_sq_destroy(&queue->nvme_sq);
>> --
>>
> 
> This one doesn't work: remember that nvmet_sq_destroy() will start io_work
> again and, as I wrote in my previous email, recvmsg() might still return
> some data after the socket has been shut down.

I see, the reason we hit this is that we uninit_data_in_cmds as we need
to clear the sq references so nvmet_sq_destroy() can complete, and then
when nvmet_sq_destroy() schedules io_work we hit the crash.

I think what we need is to make sure we don't recv from the socket.
How about this patch:
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 6eb0b3153477..65210dec3f1a 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1436,6 +1436,8 @@ static void nvmet_tcp_release_queue_work(struct 
work_struct *w)
         mutex_unlock(&nvmet_tcp_queue_mutex);

         nvmet_tcp_restore_socket_callbacks(queue);
+       /* stop accepting incoming data */
+       queue->rcv_state = NVMET_TCP_RECV_ERR;
         flush_work(&queue->io_work);

         nvmet_tcp_uninit_data_in_cmds(queue);
--
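
The assumption behind this is that the receive path already bails out once
rcv_state is NVMET_TCP_RECV_ERR, roughly like this (sketch from memory, the
exact check in nvmet_tcp_try_recv_one() may look slightly different):

--
static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	/* queue is in error or being torn down: don't touch the socket */
	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	/* ... normal PDU / DATA / DDGST handling follows ... */
}
--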


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-15  9:48                   ` Sagi Grimberg
@ 2021-11-15 10:00                     ` Maurizio Lombardi
  2021-11-15 10:13                       ` Sagi Grimberg
  2021-11-15 10:57                       ` Maurizio Lombardi
  0 siblings, 2 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-15 10:00 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

On Mon, Nov 15, 2021 at 11:48:38AM +0200, Sagi Grimberg wrote:
> I see, the reason we hit this is that we uninit_data_in_cmds as we need
> to clear the sq references so nvmet_sq_destroy() can complete, and then
> when nvmet_sq_destroy() schedules io_work we hit the crash.
> 
> I think what we need is to make sure we don't recv from the socket.
> How about this patch:
> --
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 6eb0b3153477..65210dec3f1a 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1436,6 +1436,8 @@ static void nvmet_tcp_release_queue_work(struct
> work_struct *w)
>         mutex_unlock(&nvmet_tcp_queue_mutex);
> 
>         nvmet_tcp_restore_socket_callbacks(queue);
> +       /* stop accepting incoming data */
> +       queue->rcv_state = NVMET_TCP_RECV_ERR;
>         flush_work(&queue->io_work);
> 
>         nvmet_tcp_uninit_data_in_cmds(queue);
> --
> 

OK, I can repeat the test, but you probably want to do this instead:

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index fb72e2d67fd5..d21b525fd4cb 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1450,7 +1450,9 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
        mutex_unlock(&nvmet_tcp_queue_mutex);
 
        nvmet_tcp_restore_socket_callbacks(queue);
-       flush_work(&queue->io_work);
+       cancel_work_sync(&queue->io_work);
+       /* stop accepting incoming data */
+       queue->rcv_state = NVMET_TCP_RECV_ERR;
 
        nvmet_tcp_uninit_data_in_cmds(queue);
        nvmet_sq_destroy(&queue->nvme_sq);

If you don't perform a cancel_work_sync() you may race against a running
io_work thread that could overwrite rcv_state with some other value.
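
To spell the race out (comment-style sketch; the rcv_state reset inside
io_work is my reading of the normal PDU state machine):

--
/*
 *   release_queue_work                      io_work (still running)
 *   ------------------                      -----------------------
 *   queue->rcv_state = NVMET_TCP_RECV_ERR
 *                                           prepares the next PDU:
 *                                           rcv_state = NVMET_TCP_RECV_PDU
 *   flush_work(&queue->io_work) returns
 *   nvmet_tcp_uninit_data_in_cmds()
 *   nvmet_sq_destroy() queues io_work       rcv_state is RECV_PDU again,
 *                                           so it recv()s into commands
 *                                           that were just freed
 *
 * Doing cancel_work_sync() first and only then setting NVMET_TCP_RECV_ERR
 * closes that window.
 */
--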

Maurizio



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-15 10:00                     ` Maurizio Lombardi
@ 2021-11-15 10:13                       ` Sagi Grimberg
  2021-11-15 10:57                       ` Maurizio Lombardi
  1 sibling, 0 replies; 21+ messages in thread
From: Sagi Grimberg @ 2021-11-15 10:13 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi


>> --
>> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
>> index 6eb0b3153477..65210dec3f1a 100644
>> --- a/drivers/nvme/target/tcp.c
>> +++ b/drivers/nvme/target/tcp.c
>> @@ -1436,6 +1436,8 @@ static void nvmet_tcp_release_queue_work(struct
>> work_struct *w)
>>          mutex_unlock(&nvmet_tcp_queue_mutex);
>>
>>          nvmet_tcp_restore_socket_callbacks(queue);
>> +       /* stop accepting incoming data */
>> +       queue->rcv_state = NVMET_TCP_RECV_ERR;
>>          flush_work(&queue->io_work);
>>
>>          nvmet_tcp_uninit_data_in_cmds(queue);
>> --
>>
> 
> OK, I can repeat the test, but you probably want to do this instead:
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index fb72e2d67fd5..d21b525fd4cb 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1450,7 +1450,9 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>          mutex_unlock(&nvmet_tcp_queue_mutex);
>   
>          nvmet_tcp_restore_socket_callbacks(queue);
> -       flush_work(&queue->io_work);
> +       cancel_work_sync(&queue->io_work);
> +       /* stop accepting incoming data */
> +       queue->rcv_state = NVMET_TCP_RECV_ERR;
>   
>          nvmet_tcp_uninit_data_in_cmds(queue);
>          nvmet_sq_destroy(&queue->nvme_sq);
> 
> If you don't perform a cancel_work_sync() you may race against a running
> io_work thread that could overwrite rcv_state with some other value.

Yes, that makes sense.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work
  2021-11-15 10:00                     ` Maurizio Lombardi
  2021-11-15 10:13                       ` Sagi Grimberg
@ 2021-11-15 10:57                       ` Maurizio Lombardi
  1 sibling, 0 replies; 21+ messages in thread
From: Maurizio Lombardi @ 2021-11-15 10:57 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, hch, hare, chaitanya.kulkarni, jmeneghi

On Mon, Nov 15, 2021 at 11:00:04AM +0100, Maurizio Lombardi wrote:
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index fb72e2d67fd5..d21b525fd4cb 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1450,7 +1450,9 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
>         mutex_unlock(&nvmet_tcp_queue_mutex);
>  
>         nvmet_tcp_restore_socket_callbacks(queue);
> -       flush_work(&queue->io_work);
> +       cancel_work_sync(&queue->io_work);
> +       /* stop accepting incoming data */
> +       queue->rcv_state = NVMET_TCP_RECV_ERR;
>  
>         nvmet_tcp_uninit_data_in_cmds(queue);
>         nvmet_sq_destroy(&queue->nvme_sq);
> 
> If you don't perform a cancel_work_sync() you may race against a running
> io_work thread that may overwrite rcv_state with some other value.

This one works correctly.
Unless an unforeseen problem arises, I am going to resubmit
the patchset soon.

Maurizio



^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-11-15 10:57 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-21  8:41 [PATCH 0/2] Fix a race condition when performing a controller reset Maurizio Lombardi
2021-10-21  8:41 ` [PATCH 1/2] nvmet: add an helper to free the iovec Maurizio Lombardi
2021-10-21 14:56   ` John Meneghini
2021-10-21 14:58     ` John Meneghini
2021-10-27  0:15     ` Chaitanya Kulkarni
2021-10-21  8:41 ` [PATCH 2/2] nvmet: fix a race condition between release_queue and io_work Maurizio Lombardi
2021-10-21 14:57   ` John Meneghini
2021-10-26 15:42   ` Sagi Grimberg
2021-10-28  7:55     ` Maurizio Lombardi
2021-11-03  9:28       ` Sagi Grimberg
2021-11-03 11:31         ` Maurizio Lombardi
2021-11-04 12:59           ` Sagi Grimberg
2021-11-12 10:54             ` Maurizio Lombardi
2021-11-12 15:54               ` John Meneghini
2021-11-15  7:52                 ` Maurizio Lombardi
2021-11-14 10:28               ` Sagi Grimberg
2021-11-15  7:47                 ` Maurizio Lombardi
2021-11-15  9:48                   ` Sagi Grimberg
2021-11-15 10:00                     ` Maurizio Lombardi
2021-11-15 10:13                       ` Sagi Grimberg
2021-11-15 10:57                       ` Maurizio Lombardi
