All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nvme-tcp: proper handling of tcp socket closing flows
@ 2021-01-28 15:27 elad.grupi
  2021-01-28 22:03 ` Sagi Grimberg
  2021-01-29 22:10 ` Sagi Grimberg
  0 siblings, 2 replies; 14+ messages in thread
From: elad.grupi @ 2021-01-28 15:27 UTC (permalink / raw)
  To: sagi, linux-nvme; +Cc: Elad Grupi

From: Elad Grupi <elad.grupi@dell.com>

avoid calling nvmet_tcp_release_queue_work if tcp socket was closed
before setting the sk callbacks.

Signed-off-by: Elad Grupi <elad.grupi@dell.com>
---
 drivers/nvme/target/tcp.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index d535080b781f..dac737bac874 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
 	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
 	int ret = 0;
 
-	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
+	if (!cmd) {
 		cmd = nvmet_tcp_fetch_cmd(queue);
 		if (unlikely(!cmd))
 			return 0;
@@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	sock->sk->sk_state_change = nvmet_tcp_state_change;
 	queue->write_space = sock->sk->sk_write_space;
 	sock->sk->sk_write_space = nvmet_tcp_write_space;
+
+	switch (sk->sk_state) {
+	case TCP_FIN_WAIT1:
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSE:
+		/* FALLTHRU */
+		sock->sk->sk_data_ready =  queue->data_ready;
+		sock->sk->sk_state_change = queue->state_change;
+		sock->sk->sk_write_space = queue->write_space;
+		sk->sk_user_data = NULL;
+		queue->state = NVMET_TCP_Q_DISCONNECTING;
+		ret = -ENOTCONN;
+		break;
+	default:
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+		ret = 0;
+	}
+
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 
-	return 0;
+	return ret;
 }
 
 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
@@ -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_destroy_sq;
 
-	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
-
 	return 0;
 out_destroy_sq:
 	mutex_lock(&nvmet_tcp_queue_mutex);
-- 
2.16.5


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 15:27 [PATCH] nvme-tcp: proper handling of tcp socket closing flows elad.grupi
@ 2021-01-28 22:03 ` Sagi Grimberg
  2021-01-28 23:07   ` Grupi, Elad
  2021-01-29 22:10 ` Sagi Grimberg
  1 sibling, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-28 22:03 UTC (permalink / raw)
  To: elad.grupi, linux-nvme


> ---
>   drivers/nvme/target/tcp.c | 24 ++++++++++++++++++++----
>   1 file changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index d535080b781f..dac737bac874 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
>   	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
>   	int ret = 0;
>   
> -	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
> +	if (!cmd) {
>   		cmd = nvmet_tcp_fetch_cmd(queue);
>   		if (unlikely(!cmd))
>   			return 0;
> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:
> +		/* FALLTHRU */
> +		sock->sk->sk_data_ready =  queue->data_ready;
> +		sock->sk->sk_state_change = queue->state_change;
> +		sock->sk->sk_write_space = queue->write_space;
> +		sk->sk_user_data = NULL;
> +		queue->state = NVMET_TCP_Q_DISCONNECTING;
> +		ret = -ENOTCONN;
> +		break;
> +	default:
> +		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> +		ret = 0;
> +	}
> +
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
> @@ -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   	if (ret)
>   		goto out_destroy_sq;
>   
> -	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> -
>   	return 0;
>   out_destroy_sq:
>   	mutex_lock(&nvmet_tcp_queue_mutex);
> 


What about this instead?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index c41902f7ce39..6388d18ca7c2 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1494,16 +1494,28 @@ static int nvmet_tcp_set_queue_sock(struct 
nvmet_tcp_queue *queue)
                 ip_sock_set_tos(sock->sk, inet->rcv_tos);

         write_lock_bh(&sock->sk->sk_callback_lock);
-       sock->sk->sk_user_data = queue;
-       queue->data_ready = sock->sk->sk_data_ready;
-       sock->sk->sk_data_ready = nvmet_tcp_data_ready;
-       queue->state_change = sock->sk->sk_state_change;
-       sock->sk->sk_state_change = nvmet_tcp_state_change;
-       queue->write_space = sock->sk->sk_write_space;
-       sock->sk->sk_write_space = nvmet_tcp_write_space;
+       switch (sk->sk_state) {
+       case TCP_FIN_WAIT1:
+       case TCP_CLOSE_WAIT:
+       case TCP_CLOSE:
+               /*
+                * If the socket is already closing, don't even start
+                * consuming it
+                */
+               ret = -ENOTCONN;
+               break;
+       default:
+               sock->sk->sk_user_data = queue;
+               queue->data_ready = sock->sk->sk_data_ready;
+               sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+               queue->state_change = sock->sk->sk_state_change;
+               sock->sk->sk_state_change = nvmet_tcp_state_change;
+               queue->write_space = sock->sk->sk_write_space;
+               sock->sk->sk_write_space = nvmet_tcp_write_space;
+       }
         write_unlock_bh(&sock->sk->sk_callback_lock);

-       return 0;
+       return ret;
  }
--

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 22:03 ` Sagi Grimberg
@ 2021-01-28 23:07   ` Grupi, Elad
  2021-01-28 23:33     ` Sagi Grimberg
  0 siblings, 1 reply; 14+ messages in thread
From: Grupi, Elad @ 2021-01-28 23:07 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

That will work if sk_state_change is called under sk_callback_lock.

In addition, need to flush_work(&queue->port->accept_work) in nvmet_tcp_release_queue_work because accept_work is still using queue struct after unlocking sk_callback_lock.


-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Friday, 29 January 2021 0:03
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 


> ---
>   drivers/nvme/target/tcp.c | 24 ++++++++++++++++++++----
>   1 file changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c 
> index d535080b781f..dac737bac874 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
>   	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
>   	int ret = 0;
>   
> -	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
> +	if (!cmd) {
>   		cmd = nvmet_tcp_fetch_cmd(queue);
>   		if (unlikely(!cmd))
>   			return 0;
> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:
> +		/* FALLTHRU */
> +		sock->sk->sk_data_ready =  queue->data_ready;
> +		sock->sk->sk_state_change = queue->state_change;
> +		sock->sk->sk_write_space = queue->write_space;
> +		sk->sk_user_data = NULL;
> +		queue->state = NVMET_TCP_Q_DISCONNECTING;
> +		ret = -ENOTCONN;
> +		break;
> +	default:
> +		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> +		ret = 0;
> +	}
> +
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, @@ 
> -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   	if (ret)
>   		goto out_destroy_sq;
>   
> -	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> -
>   	return 0;
>   out_destroy_sq:
>   	mutex_lock(&nvmet_tcp_queue_mutex);
> 


What about this instead?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index c41902f7ce39..6388d18ca7c2 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1494,16 +1494,28 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
                 ip_sock_set_tos(sock->sk, inet->rcv_tos);

         write_lock_bh(&sock->sk->sk_callback_lock);
-       sock->sk->sk_user_data = queue;
-       queue->data_ready = sock->sk->sk_data_ready;
-       sock->sk->sk_data_ready = nvmet_tcp_data_ready;
-       queue->state_change = sock->sk->sk_state_change;
-       sock->sk->sk_state_change = nvmet_tcp_state_change;
-       queue->write_space = sock->sk->sk_write_space;
-       sock->sk->sk_write_space = nvmet_tcp_write_space;
+       switch (sk->sk_state) {
+       case TCP_FIN_WAIT1:
+       case TCP_CLOSE_WAIT:
+       case TCP_CLOSE:
+               /*
+                * If the socket is already closing, don't even start
+                * consuming it
+                */
+               ret = -ENOTCONN;
+               break;
+       default:
+               sock->sk->sk_user_data = queue;
+               queue->data_ready = sock->sk->sk_data_ready;
+               sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+               queue->state_change = sock->sk->sk_state_change;
+               sock->sk->sk_state_change = nvmet_tcp_state_change;
+               queue->write_space = sock->sk->sk_write_space;
+               sock->sk->sk_write_space = nvmet_tcp_write_space;
+       }
         write_unlock_bh(&sock->sk->sk_callback_lock);

-       return 0;
+       return ret;
  }
--
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 23:07   ` Grupi, Elad
@ 2021-01-28 23:33     ` Sagi Grimberg
  2021-01-28 23:43       ` Grupi, Elad
  2021-01-31 15:47       ` Grupi, Elad
  0 siblings, 2 replies; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-28 23:33 UTC (permalink / raw)
  To: Grupi, Elad, linux-nvme


> That will work if sk_state_change is called under sk_callback_lock.

It is.

> In addition, need to flush_work(&queue->port->accept_work) in nvmet_tcp_release_queue_work because accept_work is still using queue struct after unlocking sk_callback_lock.


But sk->sk_user_data was not set, so I don't see how the release work
can be invoked.

> What about this instead?
> --
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index c41902f7ce39..6388d18ca7c2 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1494,16 +1494,28 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>                   ip_sock_set_tos(sock->sk, inet->rcv_tos);
> 
>           write_lock_bh(&sock->sk->sk_callback_lock);
> -       sock->sk->sk_user_data = queue;
> -       queue->data_ready = sock->sk->sk_data_ready;
> -       sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> -       queue->state_change = sock->sk->sk_state_change;
> -       sock->sk->sk_state_change = nvmet_tcp_state_change;
> -       queue->write_space = sock->sk->sk_write_space;
> -       sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       switch (sk->sk_state) {
> +       case TCP_FIN_WAIT1:
> +       case TCP_CLOSE_WAIT:
> +       case TCP_CLOSE:
> +               /*
> +                * If the socket is already closing, don't even start
> +                * consuming it
> +                */
> +               ret = -ENOTCONN;
> +               break;
> +       default:
> +               sock->sk->sk_user_data = queue;
> +               queue->data_ready = sock->sk->sk_data_ready;
> +               sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> +               queue->state_change = sock->sk->sk_state_change;
> +               sock->sk->sk_state_change = nvmet_tcp_state_change;
> +               queue->write_space = sock->sk->sk_write_space;
> +               sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       }
>           write_unlock_bh(&sock->sk->sk_callback_lock);
> 
> -       return 0;
> +       return ret;
>    }
> --
> 

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 23:33     ` Sagi Grimberg
@ 2021-01-28 23:43       ` Grupi, Elad
  2021-01-28 23:54         ` Sagi Grimberg
  2021-01-31 15:47       ` Grupi, Elad
  1 sibling, 1 reply; 14+ messages in thread
From: Grupi, Elad @ 2021-01-28 23:43 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

Release work might get invoked if nvmet_tcp_set_queue_sock is completed successfully and set sk_user_data, but sk_state_change is triggered by the network stack before queue_work_on is invoked. In that case, there is a race between release_work and accept_work.

-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Friday, 29 January 2021 1:34
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 


> That will work if sk_state_change is called under sk_callback_lock.

It is.

> In addition, need to flush_work(&queue->port->accept_work) in nvmet_tcp_release_queue_work because accept_work is still using queue struct after unlocking sk_callback_lock.


But sk->sk_user_data was not set, so I don't see how the release work can be invoked.

> What about this instead?
> --
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c 
> index c41902f7ce39..6388d18ca7c2 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1494,16 +1494,28 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>                   ip_sock_set_tos(sock->sk, inet->rcv_tos);
> 
>           write_lock_bh(&sock->sk->sk_callback_lock);
> -       sock->sk->sk_user_data = queue;
> -       queue->data_ready = sock->sk->sk_data_ready;
> -       sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> -       queue->state_change = sock->sk->sk_state_change;
> -       sock->sk->sk_state_change = nvmet_tcp_state_change;
> -       queue->write_space = sock->sk->sk_write_space;
> -       sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       switch (sk->sk_state) {
> +       case TCP_FIN_WAIT1:
> +       case TCP_CLOSE_WAIT:
> +       case TCP_CLOSE:
> +               /*
> +                * If the socket is already closing, don't even start
> +                * consuming it
> +                */
> +               ret = -ENOTCONN;
> +               break;
> +       default:
> +               sock->sk->sk_user_data = queue;
> +               queue->data_ready = sock->sk->sk_data_ready;
> +               sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> +               queue->state_change = sock->sk->sk_state_change;
> +               sock->sk->sk_state_change = nvmet_tcp_state_change;
> +               queue->write_space = sock->sk->sk_write_space;
> +               sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       }
>           write_unlock_bh(&sock->sk->sk_callback_lock);
> 
> -       return 0;
> +       return ret;
>    }
> --
> 
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 23:43       ` Grupi, Elad
@ 2021-01-28 23:54         ` Sagi Grimberg
  2021-01-29  0:01           ` Grupi, Elad
  0 siblings, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-28 23:54 UTC (permalink / raw)
  To: Grupi, Elad, linux-nvme


> Release work might get invoked if nvmet_tcp_set_queue_sock is completed successfully and set sk_user_data, but sk_state_change is triggered by the network stack before queue_work_on is invoked. In that case, there is a race between release_work and accept_work.

This is just like any other case where release_work races with io_work.
That is not an exception from other cases which release_work will need
to fence from.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 23:54         ` Sagi Grimberg
@ 2021-01-29  0:01           ` Grupi, Elad
  2021-01-29  0:07             ` Sagi Grimberg
  0 siblings, 1 reply; 14+ messages in thread
From: Grupi, Elad @ 2021-01-29  0:01 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

That's not what I meant.

My concern is release_work races with accept_work.

nvmet_tcp_alloc_queue is called from accept_work context and still accessing the queue struct after setting sk callbacks.

-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Friday, 29 January 2021 1:54
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 


> Release work might get invoked if nvmet_tcp_set_queue_sock is completed successfully and set sk_user_data, but sk_state_change is triggered by the network stack before queue_work_on is invoked. In that case, there is a race between release_work and accept_work.

This is just like any other case where release_work races with io_work.
That is not an exception from other cases which release_work will need to fence from.
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-29  0:01           ` Grupi, Elad
@ 2021-01-29  0:07             ` Sagi Grimberg
  0 siblings, 0 replies; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-29  0:07 UTC (permalink / raw)
  To: Grupi, Elad, linux-nvme


> That's not what I meant.
> 
> My concern is release_work races with accept_work.
> 
> nvmet_tcp_alloc_queue is called from accept_work context and still accessing the queue struct after setting sk callbacks.

I see, we can move the queue_work to set_queue_sock then.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 15:27 [PATCH] nvme-tcp: proper handling of tcp socket closing flows elad.grupi
  2021-01-28 22:03 ` Sagi Grimberg
@ 2021-01-29 22:10 ` Sagi Grimberg
  2021-01-31 15:47   ` Grupi, Elad
  1 sibling, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-29 22:10 UTC (permalink / raw)
  To: elad.grupi, linux-nvme


> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:

BTW, shouldn't this be if (sk->sk_state != TCP_ESTABLISHED) ?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28 23:33     ` Sagi Grimberg
  2021-01-28 23:43       ` Grupi, Elad
@ 2021-01-31 15:47       ` Grupi, Elad
  1 sibling, 0 replies; 14+ messages in thread
From: Grupi, Elad @ 2021-01-31 15:47 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

Is it possible that we will have the following flow:

Thread 1 taking sk_callback_lock and goes into the switch case of sk_state, going to line:
	sock->sk->sk_user_data = queue;

at that point, thread 2 invokes sk_state_change, but this is still the original state_change callback because we didn't set yet to nvmet_tcp_state_change.

Thread 1 continues and sets the callback to nvmet_tcp_state_change, but the state was already changed and nvmet_tcp_state_change will never be called.

I think this race still exist even when locked by sk_callback_lock.


-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Friday, 29 January 2021 1:34
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 


> That will work if sk_state_change is called under sk_callback_lock.

It is.

> In addition, need to flush_work(&queue->port->accept_work) in nvmet_tcp_release_queue_work because accept_work is still using queue struct after unlocking sk_callback_lock.


But sk->sk_user_data was not set, so I don't see how the release work can be invoked.

> What about this instead?
> --
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c 
> index c41902f7ce39..6388d18ca7c2 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -1494,16 +1494,28 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>                   ip_sock_set_tos(sock->sk, inet->rcv_tos);
> 
>           write_lock_bh(&sock->sk->sk_callback_lock);
> -       sock->sk->sk_user_data = queue;
> -       queue->data_ready = sock->sk->sk_data_ready;
> -       sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> -       queue->state_change = sock->sk->sk_state_change;
> -       sock->sk->sk_state_change = nvmet_tcp_state_change;
> -       queue->write_space = sock->sk->sk_write_space;
> -       sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       switch (sk->sk_state) {
> +       case TCP_FIN_WAIT1:
> +       case TCP_CLOSE_WAIT:
> +       case TCP_CLOSE:
> +               /*
> +                * If the socket is already closing, don't even start
> +                * consuming it
> +                */
> +               ret = -ENOTCONN;
> +               break;
> +       default:
> +               sock->sk->sk_user_data = queue;
> +               queue->data_ready = sock->sk->sk_data_ready;
> +               sock->sk->sk_data_ready = nvmet_tcp_data_ready;
> +               queue->state_change = sock->sk->sk_state_change;
> +               sock->sk->sk_state_change = nvmet_tcp_state_change;
> +               queue->write_space = sock->sk->sk_write_space;
> +               sock->sk->sk_write_space = nvmet_tcp_write_space;
> +       }
>           write_unlock_bh(&sock->sk->sk_callback_lock);
> 
> -       return 0;
> +       return ret;
>    }
> --
> 
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-29 22:10 ` Sagi Grimberg
@ 2021-01-31 15:47   ` Grupi, Elad
  0 siblings, 0 replies; 14+ messages in thread
From: Grupi, Elad @ 2021-01-31 15:47 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

Yes. That looks good

-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Saturday, 30 January 2021 0:10
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 


> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:

BTW, shouldn't this be if (sk->sk_state != TCP_ESTABLISHED) ?
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-28  7:54 ` Sagi Grimberg
@ 2021-01-28 15:27   ` Grupi, Elad
  0 siblings, 0 replies; 14+ messages in thread
From: Grupi, Elad @ 2021-01-28 15:27 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme

> But we need to call the release_queue work, not sure I understand how this works.

If the tcp socket was closed before setting the socket callback,
nvmet_tcp_set_queue_sock will return -ENOTCONN and everything will be cleared by goto out_destroy_sq.
No need for a worker to clean up resources in that case, and everything is protected by sk_callback_lock.

> io_work is going to run with release_queue_work, because it is also coming from the backend completion. The right way to solve this is to correctly fence them.

> Can you describe the exact race you are referring to?

Correct, the fence already exists in the code; cancel_work_sync is called after nvmet_sq_destroy in release_queue_work. I will remove that part from the patch.

-----Original Message-----
From: Sagi Grimberg <sagi@grimberg.me> 
Sent: Thursday, 28 January 2021 9:55
To: Grupi, Elad; linux-nvme@lists.infradead.org
Subject: Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows


[EXTERNAL EMAIL] 

> From: Elad Grupi <elad.grupi@dell.com>
> 
> avoid calling nvmet_tcp_release_queue_work if tcp socket was closed 
> before setting the sk callbacks.

But we need to call the release_queue work, not sure I understand how this works.

> 
> prevent io_work from enqueuing while closing the tcp queue to avoid 
> race with nvmet_tcp_release_queue_work.

io_work is going to run with release_queue_work, because it is also coming from the backend completion. The right way to solve this is to correctly fence them.

Can you describe the exact race you are referring to?

> 
> Signed-off-by: Elad Grupi <elad.grupi@dell.com>
> ---
>   drivers/nvme/target/tcp.c | 26 +++++++++++++++++++++-----
>   1 file changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c 
> index d535080b781f..937f2a746d8b 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
>   	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
>   	int ret = 0;
>   
> -	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
> +	if (!cmd) {
>   		cmd = nvmet_tcp_fetch_cmd(queue);
>   		if (unlikely(!cmd))
>   			return 0;
> @@ -1196,7 +1196,7 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>   	/*
>   	 * We exahusted our budget, requeue our selves
>   	 */
> -	if (pending)
> +	if (pending && queue->state != NVMET_TCP_Q_DISCONNECTING)
>   		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
>   }
>   
> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:
> +		/* FALLTHRU */
> +		sock->sk->sk_data_ready =  queue->data_ready;
> +		sock->sk->sk_state_change = queue->state_change;
> +		sock->sk->sk_write_space = queue->write_space;
> +		sk->sk_user_data = NULL;
> +		queue->state = NVMET_TCP_Q_DISCONNECTING;
> +		ret = -ENOTCONN;
> +		break;
> +	default:
> +		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> +		ret = 0;
> +	}
> +
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, @@ 
> -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   	if (ret)
>   		goto out_destroy_sq;
>   
> -	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> -
>   	return 0;
>   out_destroy_sq:
>   	mutex_lock(&nvmet_tcp_queue_mutex);
> 
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] nvme-tcp: proper handling of tcp socket closing flows
  2021-01-26 15:37 elad.grupi
@ 2021-01-28  7:54 ` Sagi Grimberg
  2021-01-28 15:27   ` Grupi, Elad
  0 siblings, 1 reply; 14+ messages in thread
From: Sagi Grimberg @ 2021-01-28  7:54 UTC (permalink / raw)
  To: elad.grupi, linux-nvme

> From: Elad Grupi <elad.grupi@dell.com>
> 
> avoid calling nvmet_tcp_release_queue_work if tcp socket was closed
> before setting the sk callbacks.

But we need to call the release_queue work, not sure I understand
how this works.

> 
> prevent io_work from enqueuing while closing the tcp queue to
> avoid race with nvmet_tcp_release_queue_work.

io_work is going to run with release_queue_work, because
it is also coming from the backend completion. The right
way to solve this is to correctly fence them.

Can you describe the exact race you are referring to?

> 
> Signed-off-by: Elad Grupi <elad.grupi@dell.com>
> ---
>   drivers/nvme/target/tcp.c | 26 +++++++++++++++++++++-----
>   1 file changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index d535080b781f..937f2a746d8b 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
>   	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
>   	int ret = 0;
>   
> -	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
> +	if (!cmd) {
>   		cmd = nvmet_tcp_fetch_cmd(queue);
>   		if (unlikely(!cmd))
>   			return 0;
> @@ -1196,7 +1196,7 @@ static void nvmet_tcp_io_work(struct work_struct *w)
>   	/*
>   	 * We exahusted our budget, requeue our selves
>   	 */
> -	if (pending)
> +	if (pending && queue->state != NVMET_TCP_Q_DISCONNECTING)
>   		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
>   }
>   
> @@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
>   	sock->sk->sk_state_change = nvmet_tcp_state_change;
>   	queue->write_space = sock->sk->sk_write_space;
>   	sock->sk->sk_write_space = nvmet_tcp_write_space;
> +
> +	switch (sk->sk_state) {
> +	case TCP_FIN_WAIT1:
> +	case TCP_CLOSE_WAIT:
> +	case TCP_CLOSE:
> +		/* FALLTHRU */
> +		sock->sk->sk_data_ready =  queue->data_ready;
> +		sock->sk->sk_state_change = queue->state_change;
> +		sock->sk->sk_write_space = queue->write_space;
> +		sk->sk_user_data = NULL;
> +		queue->state = NVMET_TCP_Q_DISCONNECTING;
> +		ret = -ENOTCONN;
> +		break;
> +	default:
> +		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> +		ret = 0;
> +	}
> +
>   	write_unlock_bh(&sock->sk->sk_callback_lock);
>   
> -	return 0;
> +	return ret;
>   }
>   
>   static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
> @@ -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
>   	if (ret)
>   		goto out_destroy_sq;
>   
> -	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
> -
>   	return 0;
>   out_destroy_sq:
>   	mutex_lock(&nvmet_tcp_queue_mutex);
> 

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] nvme-tcp: proper handling of tcp socket closing flows
@ 2021-01-26 15:37 elad.grupi
  2021-01-28  7:54 ` Sagi Grimberg
  0 siblings, 1 reply; 14+ messages in thread
From: elad.grupi @ 2021-01-26 15:37 UTC (permalink / raw)
  To: sagi, linux-nvme; +Cc: Elad Grupi

From: Elad Grupi <elad.grupi@dell.com>

Avoid calling nvmet_tcp_release_queue_work() if the TCP socket was
closed before the sk callbacks were set.

Prevent io_work from being requeued while the TCP queue is closing, to
avoid a race with nvmet_tcp_release_queue_work().

Signed-off-by: Elad Grupi <elad.grupi@dell.com>
---
 drivers/nvme/target/tcp.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index d535080b781f..937f2a746d8b 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -647,7 +647,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
 	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
 	int ret = 0;
 
-	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
+	if (!cmd) {
 		cmd = nvmet_tcp_fetch_cmd(queue);
 		if (unlikely(!cmd))
 			return 0;
@@ -1196,7 +1196,7 @@ static void nvmet_tcp_io_work(struct work_struct *w)
 	/*
 	 * We exahusted our budget, requeue our selves
 	 */
-	if (pending)
+	if (pending && queue->state != NVMET_TCP_Q_DISCONNECTING)
 		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
 }
 
@@ -1453,9 +1453,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	sock->sk->sk_state_change = nvmet_tcp_state_change;
 	queue->write_space = sock->sk->sk_write_space;
 	sock->sk->sk_write_space = nvmet_tcp_write_space;
+
+	switch (sk->sk_state) {
+	case TCP_FIN_WAIT1:
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSE:
+		/* FALLTHRU */
+		sock->sk->sk_data_ready =  queue->data_ready;
+		sock->sk->sk_state_change = queue->state_change;
+		sock->sk->sk_write_space = queue->write_space;
+		sk->sk_user_data = NULL;
+		queue->state = NVMET_TCP_Q_DISCONNECTING;
+		ret = -ENOTCONN;
+		break;
+	default:
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+		ret = 0;
+	}
+
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 
-	return 0;
+	return ret;
 }
 
 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
@@ -1506,8 +1524,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_destroy_sq;
 
-	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
-
 	return 0;
 out_destroy_sq:
 	mutex_lock(&nvmet_tcp_queue_mutex);
-- 
2.16.5


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-01-31 15:47 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-28 15:27 [PATCH] nvme-tcp: proper handling of tcp socket closing flows elad.grupi
2021-01-28 22:03 ` Sagi Grimberg
2021-01-28 23:07   ` Grupi, Elad
2021-01-28 23:33     ` Sagi Grimberg
2021-01-28 23:43       ` Grupi, Elad
2021-01-28 23:54         ` Sagi Grimberg
2021-01-29  0:01           ` Grupi, Elad
2021-01-29  0:07             ` Sagi Grimberg
2021-01-31 15:47       ` Grupi, Elad
2021-01-29 22:10 ` Sagi Grimberg
2021-01-31 15:47   ` Grupi, Elad
  -- strict thread matches above, loose matches on Subject: below --
2021-01-26 15:37 elad.grupi
2021-01-28  7:54 ` Sagi Grimberg
2021-01-28 15:27   ` Grupi, Elad

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.