* [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
@ 2023-01-22 22:21 Kirill Tkhai
  2023-01-24 17:57 ` Kuniyuki Iwashima
  2023-01-25  1:35 ` Jakub Kicinski
  0 siblings, 2 replies; 11+ messages in thread
From: Kirill Tkhai @ 2023-01-22 22:21 UTC (permalink / raw)
  To: Linux Kernel Network Developers
  Cc: davem, edumazet, kuba, pabeni, kuniyu, gorcunov, tkhai

Some functions use an unlocked check of sock::sk_state. This does not guarantee
that a new value assigned to sk_state on another CPU is already visible on this CPU.

Example:

[CPU0:Task0]                    [CPU1:Task1]
unix_listen()
  unix_state_lock(sk);
  sk->sk_state = TCP_LISTEN;
  unix_state_unlock(sk);
                                unix_accept()
                                  if (sk->sk_state != TCP_LISTEN) /* not visible */
                                     goto out;                    /* return error */

Task1 may miss the new sk->sk_state value, and then unix_accept() returns an error.
Since unix_accept() is called chronologically later in this situation, such
behavior is not obvious and is wrong.
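
For illustration, the pattern in userspace looks roughly like this
(hypothetical sketch, build with -pthread; the plain flag is deliberately
not proper synchronization, and on many CPUs the failure may never
reproduce):

#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int sk;
static volatile int listening;	/* plain flag, intentionally no barrier */

static void *task0(void *arg)
{
	listen(sk, 16);
	listening = 1;		/* plain store, no release semantics */
	return NULL;
}

static void *task1(void *arg)
{
	while (!listening)	/* plain load, no acquire semantics */
		;
	if (accept(sk, NULL, NULL) < 0 && errno == EINVAL)
		fprintf(stderr, "missed TCP_LISTEN\n"); /* EAGAIN would mean the state was seen */
	return NULL;
}

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX,
				    .sun_path = "/tmp/sk_state_repro" };
	pthread_t t0, t1;

	sk = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
	unlink(addr.sun_path);
	bind(sk, (struct sockaddr *)&addr, sizeof(addr));

	pthread_create(&t0, NULL, task0, NULL);
	pthread_create(&t1, NULL, task1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	return 0;
}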

This patch aims to fix the problem. A new function, unix_sock_state(), is
introduced; it makes sure a caller never misses a new state assigned just
before the function is called. It is used in the places where unlocked
sk_state dereferencing was used before.

Note that a few more unlocked sk_state reads remain unfixed. Also, the same
problem exists with unix_peer(). These will be the subject of future patches.
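
For unix_peer() the fix will likely follow the same pattern; a rough
sketch only (not part of this patch, and unix_peer_stable() is a made-up
name):

static struct sock *unix_peer_stable(struct sock *sk)
{
	struct sock *peer = READ_ONCE(unix_peer(sk));

	/* As in unix_sock_state(): a SOCK_STREAM/SOCK_SEQPACKET peer,
	 * once set, does not change, so it may be returned unlocked.
	 */
	if (peer && sk->sk_type != SOCK_DGRAM)
		return peer;

	unix_state_lock(sk);
	peer = unix_peer(sk);
	unix_state_unlock(sk);
	return peer;
}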

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 009616fa0256..f53e09a0753b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s)
 }
 EXPORT_SYMBOL_GPL(unix_peer_get);
 
+/* This function returns the current sk->sk_state, guaranteeing
+ * its relevance even when the assignment was made on another CPU.
+ */
+static unsigned char unix_sock_state(struct sock *sk)
+{
+	unsigned char s_state = READ_ONCE(sk->sk_state);
+
+	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
+	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
+	 * We may avoid taking the lock when those states are
+	 * already visible.
+	 */
+	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
+	    && sk->sk_type != SOCK_DGRAM)
+		return s_state;
+
+	unix_state_lock(sk);
+	s_state = sk->sk_state;
+	unix_state_unlock(sk);
+	return s_state;
+}
+
 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 					     int addr_len)
 {
@@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 	int nr_fds = 0;
 
 	if (sk) {
-		s_state = READ_ONCE(sk->sk_state);
+		s_state = unix_sock_state(sk);
 		u = unix_sk(sk);
 
-		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
-		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
-		 * SOCK_DGRAM is ordinary. So, no lock is needed.
-		 */
 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
 		else if (s_state == TCP_LISTEN)
@@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 		goto out;
 
 	err = -EINVAL;
-	if (sk->sk_state != TCP_LISTEN)
+	if (unix_sock_state(sk) != TCP_LISTEN)
 		goto out;
 
 	/* If socket state is TCP_LISTEN it cannot change (for now...),
@@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	}
 
 	if (msg->msg_namelen) {
-		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
+		unsigned char s_state = unix_sock_state(sk);
+		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
 		goto out_err;
 	} else {
 		err = -ENOTCONN;
@@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 		return -EOPNOTSUPP;
 
 	other = unix_peer(sk);
-	if (!other || sk->sk_state != TCP_ESTABLISHED)
+	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	if (false) {
@@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
 	if (err)
 		return err;
 
-	if (sk->sk_state != TCP_ESTABLISHED)
+	if (unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	if (msg->msg_namelen)
@@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk = sock->sk;
 
-	if (sk->sk_state != TCP_ESTABLISHED)
+	if (unix_sock_state(sk) != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
 	return unix_dgram_recvmsg(sock, msg, size, flags);
@@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 
 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
-	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
+	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
 		return -ENOTCONN;
 
 	return unix_read_skb(sk, recv_actor);
@@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	size_t size = state->size;
 	unsigned int last_len;
 
-	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
 		err = -EINVAL;
 		goto out;
 	}




* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-22 22:21 [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU Kirill Tkhai
@ 2023-01-24 17:57 ` Kuniyuki Iwashima
  2023-01-24 21:05   ` Kirill Tkhai
  2023-01-24 22:32   ` Cyrill Gorcunov
  2023-01-25  1:35 ` Jakub Kicinski
  1 sibling, 2 replies; 11+ messages in thread
From: Kuniyuki Iwashima @ 2023-01-24 17:57 UTC (permalink / raw)
  To: tkhai; +Cc: davem, edumazet, gorcunov, kuba, kuniyu, netdev, pabeni

From:   Kirill Tkhai <tkhai@ya.ru>
Date:   Mon, 23 Jan 2023 01:21:20 +0300
> Some functions use an unlocked check of sock::sk_state. This does not guarantee
> that a new value assigned to sk_state on another CPU is already visible on this CPU.
> 
> Example:
> 
> [CPU0:Task0]                    [CPU1:Task1]
> unix_listen()
>   unix_state_lock(sk);
>   sk->sk_state = TCP_LISTEN;
>   unix_state_unlock(sk);
>                                 unix_accept()
>                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
>                                      goto out;                    /* return error */
> 
> Task1 may miss the new sk->sk_state value, and then unix_accept() returns an error.
> Since unix_accept() is called chronologically later in this situation, such
> behavior is not obvious and is wrong.

Have you seen this on a real workload?

It sounds like a userspace bug that accept() is called on a different
CPU before listen() returns.  At least, if accept() is fetching sk at the
same time, I think there is no guarantee that sk_state is
TCP_LISTEN.

Same for the other TCP_ESTABLISHED tests: it seems a program is calling
sendmsg()/recvmsg() while sk is still TCP_CLOSE and betting that a concurrent
connect() will finish earlier.


> 
> This patch aims to fix the problem. A new function, unix_sock_state(), is
> introduced; it makes sure a caller never misses a new state assigned just
> before the function is called. It is used in the places where unlocked
> sk_state dereferencing was used before.
> 
> Note that a few more unlocked sk_state reads remain unfixed. Also, the same
> problem exists with unix_peer(). These will be the subject of future patches.
> 
> Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
> ---
>  net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
>  1 file changed, 31 insertions(+), 12 deletions(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 009616fa0256..f53e09a0753b 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s)
>  }
>  EXPORT_SYMBOL_GPL(unix_peer_get);
>  
> +/* This function returns the current sk->sk_state, guaranteeing
> + * its relevance even when the assignment was made on another CPU.
> + */
> +static unsigned char unix_sock_state(struct sock *sk)
> +{
> +	unsigned char s_state = READ_ONCE(sk->sk_state);
> +
> +	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
> +	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
> +	 * We may avoid taking the lock when those states are
> +	 * already visible.
> +	 */
> +	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
> +	    && sk->sk_type != SOCK_DGRAM)
> +		return s_state;
> +
> +	unix_state_lock(sk);
> +	s_state = sk->sk_state;
> +	unix_state_unlock(sk);
> +	return s_state;
> +}
> +
>  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
>  					     int addr_len)
>  {
> @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
>  	int nr_fds = 0;
>  
>  	if (sk) {
> -		s_state = READ_ONCE(sk->sk_state);
> +		s_state = unix_sock_state(sk);
>  		u = unix_sk(sk);
>  
> -		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
> -		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
> -		 * SOCK_DGRAM is ordinary. So, no lock is needed.
> -		 */
>  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
>  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
>  		else if (s_state == TCP_LISTEN)
> @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
>  		goto out;
>  
>  	err = -EINVAL;
> -	if (sk->sk_state != TCP_LISTEN)
> +	if (unix_sock_state(sk) != TCP_LISTEN)
>  		goto out;
>  
>  	/* If socket state is TCP_LISTEN it cannot change (for now...),
> @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>  	}
>  
>  	if (msg->msg_namelen) {
> -		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
> +		unsigned char s_state = unix_sock_state(sk);
> +		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;

No need to define s_state here; or, if you keep it, a blank line is needed
after the definition.
https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/
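
I.e., keeping the variable, something like:

	if (msg->msg_namelen) {
		unsigned char s_state = unix_sock_state(sk);

		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;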

>  		goto out_err;
>  	} else {
>  		err = -ENOTCONN;
> @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
>  		return -EOPNOTSUPP;
>  
>  	other = unix_peer(sk);
> -	if (!other || sk->sk_state != TCP_ESTABLISHED)
> +	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	if (false) {
> @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
>  	if (err)
>  		return err;
>  
> -	if (sk->sk_state != TCP_ESTABLISHED)
> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	if (msg->msg_namelen)
> @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
>  {
>  	struct sock *sk = sock->sk;
>  
> -	if (sk->sk_state != TCP_ESTABLISHED)
> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>  		return -ENOTCONN;
>  
>  	return unix_dgram_recvmsg(sock, msg, size, flags);
> @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
>  
>  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>  {
> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
>  		return -ENOTCONN;
>  
>  	return unix_read_skb(sk, recv_actor);
> @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>  	size_t size = state->size;
>  	unsigned int last_len;
>  
> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
>  		err = -EINVAL;
>  		goto out;
>  	}


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-24 17:57 ` Kuniyuki Iwashima
@ 2023-01-24 21:05   ` Kirill Tkhai
  2023-01-24 22:32   ` Cyrill Gorcunov
  1 sibling, 0 replies; 11+ messages in thread
From: Kirill Tkhai @ 2023-01-24 21:05 UTC (permalink / raw)
  To: Kuniyuki Iwashima; +Cc: davem, edumazet, gorcunov, kuba, netdev, pabeni

Hi Kuniyuki,

On 24.01.2023 20:57, Kuniyuki Iwashima wrote:
> From:   Kirill Tkhai <tkhai@ya.ru>
> Date:   Mon, 23 Jan 2023 01:21:20 +0300
>> Some functions use an unlocked check of sock::sk_state. This does not guarantee
>> that a new value assigned to sk_state on another CPU is already visible on this CPU.
>>
>> Example:
>>
>> [CPU0:Task0]                    [CPU1:Task1]
>> unix_listen()
>>   unix_state_lock(sk);
>>   sk->sk_state = TCP_LISTEN;
>>   unix_state_unlock(sk);
>>                                 unix_accept()
>>                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
>>                                      goto out;                    /* return error */
>>
>> Task1 may miss the new sk->sk_state value, and then unix_accept() returns an error.
>> Since unix_accept() is called chronologically later in this situation, such
>> behavior is not obvious and is wrong.
> 
> Have you seen this on a real workload?

No, I haven't. This is an observation about the current code, which I made while working
on the recent scm_fds patch.

> It sounds like a userspace bug that accept() is called on a different
> CPU before listen() returns.  At least, if accept() is fetching sk at the
> same time, I think there is no guarantee that sk_state is
> TCP_LISTEN.
>
> Same for the other TCP_ESTABLISHED tests: it seems a program is calling
> sendmsg()/recvmsg() while sk is still TCP_CLOSE and betting that a concurrent
> connect() will finish earlier.

Not exactly. This is not about the case of "accept() is called before listen() returns".
This is about "accept() is called after listen() returns".

 unix_listen()
   unix_state_lock(sk);
   sk->sk_state = TCP_LISTEN;
   unix_state_unlock(sk);

 <<returned from syscall>>
                                 unix_accept()
                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
                                      goto out;                    /* return error */

Syscall entry and exit do not imply any memory barriers. The new sk_state
may not be visible in unix_accept().
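
In LKMM litmus form the point is as simple as this (my sketch; the
"exists" clause being satisfiable means the stale read is allowed):

C unix-listen-accept

(*
 * Result: Sometimes
 *
 * Nothing orders the plain sk_state store against the plain load,
 * so the load may still return the old value.
 *)

{
}

P0(int *sk_state)
{
	WRITE_ONCE(*sk_state, 1); /* TCP_LISTEN */
}

P1(int *sk_state)
{
	int r1;

	r1 = READ_ONCE(*sk_state);
}

exists (1:r1=0)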

Speaking of the other cases: even changes made by three sequential syscalls connect(),
accept() and sendmsg() may not be visible on another CPU:

CPU0                                   CPU1                 CPU2
connect(sk)
  sk2->sk_state = TCP_ESTABLISHED                     
sendmsg(sk)
                                       sk2 = accept()
                                                            read("fdinfo of sk2")
                                                              sk2->sk_state is not visible

There are many more possibilities for SMP reordering, e.g. around recvmsg():


CPU0                                  CPU1                 CPU2
connect(sk)
  sk2->sk_state = TCP_ESTABLISHED                     
sendmsg(sk)
                                      sk2 = accept()
                                   
                                                           sk2 = pidfd_getfd("fd of sk2")
                                                           ioctl(sk2, SIOCATMARK, &answ) -> answ=1
                                                           if (answ)
                                                             recvmsg(sk2) --> TCP_ESTABLISHED is not visible

^^^ in this example there is also SMP reordering between answ and sk_state

There are a lot of such cases. We should not wait for one to show up in the real world, because this
may appear on !x86 CPUs and result in long and difficult debugging for users. We should provide
stable interfaces for them.

The big advantage is that this should not affect performance, since the generic case still goes
through the unlocked fast path.

Kirill
 
>>
>> This patch aims to fix the problem. A new function, unix_sock_state(), is
>> introduced; it makes sure a caller never misses a new state assigned just
>> before the function is called. It is used in the places where unlocked
>> sk_state dereferencing was used before.
>>
>> Note that a few more unlocked sk_state reads remain unfixed. Also, the same
>> problem exists with unix_peer(). These will be the subject of future patches.
>>
>> Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
>> ---
>>  net/unix/af_unix.c |   43 +++++++++++++++++++++++++++++++------------
>>  1 file changed, 31 insertions(+), 12 deletions(-)
>>
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index 009616fa0256..f53e09a0753b 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
>> @@ -247,6 +247,28 @@ struct sock *unix_peer_get(struct sock *s)
>>  }
>>  EXPORT_SYMBOL_GPL(unix_peer_get);
>>  
>> +/* This function returns the current sk->sk_state, guaranteeing
>> + * its relevance even when the assignment was made on another CPU.
>> + */
>> +static unsigned char unix_sock_state(struct sock *sk)
>> +{
>> +	unsigned char s_state = READ_ONCE(sk->sk_state);
>> +
>> +	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
>> +	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
>> +	 * We may avoid taking the lock when those states are
>> +	 * already visible.
>> +	 */
>> +	if ((s_state == TCP_ESTABLISHED || s_state == TCP_LISTEN)
>> +	    && sk->sk_type != SOCK_DGRAM)
>> +		return s_state;
>> +
>> +	unix_state_lock(sk);
>> +	s_state = sk->sk_state;
>> +	unix_state_unlock(sk);
>> +	return s_state;
>> +}
>> +
>>  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
>>  					     int addr_len)
>>  {
>> @@ -812,13 +834,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
>>  	int nr_fds = 0;
>>  
>>  	if (sk) {
>> -		s_state = READ_ONCE(sk->sk_state);
>> +		s_state = unix_sock_state(sk);
>>  		u = unix_sk(sk);
>>  
>> -		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
>> -		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
>> -		 * SOCK_DGRAM is ordinary. So, no lock is needed.
>> -		 */
>>  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
>>  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
>>  		else if (s_state == TCP_LISTEN)
>> @@ -1686,7 +1704,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
>>  		goto out;
>>  
>>  	err = -EINVAL;
>> -	if (sk->sk_state != TCP_LISTEN)
>> +	if (unix_sock_state(sk) != TCP_LISTEN)
>>  		goto out;
>>  
>>  	/* If socket state is TCP_LISTEN it cannot change (for now...),
>> @@ -2178,7 +2196,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
>>  	}
>>  
>>  	if (msg->msg_namelen) {
>> -		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
>> +		unsigned char s_state = unix_sock_state(sk);
>> +		err = s_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
> 
> No need to define s_state here, or a blank line is needed after
> the definition.
> https://patchwork.kernel.org/project/netdevbpf/patch/72ae40ef-2d68-2e89-46d3-fc8f820db42a@ya.ru/
> 
>>  		goto out_err;
>>  	} else {
>>  		err = -ENOTCONN;
>> @@ -2279,7 +2298,7 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
>>  		return -EOPNOTSUPP;
>>  
>>  	other = unix_peer(sk);
>> -	if (!other || sk->sk_state != TCP_ESTABLISHED)
>> +	if (!other || unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	if (false) {
>> @@ -2391,7 +2410,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
>>  	if (err)
>>  		return err;
>>  
>> -	if (sk->sk_state != TCP_ESTABLISHED)
>> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	if (msg->msg_namelen)
>> @@ -2405,7 +2424,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
>>  {
>>  	struct sock *sk = sock->sk;
>>  
>> -	if (sk->sk_state != TCP_ESTABLISHED)
>> +	if (unix_sock_state(sk) != TCP_ESTABLISHED)
>>  		return -ENOTCONN;
>>  
>>  	return unix_dgram_recvmsg(sock, msg, size, flags);
>> @@ -2689,7 +2708,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
>>  
>>  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
>>  {
>> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
>> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED))
>>  		return -ENOTCONN;
>>  
>>  	return unix_read_skb(sk, recv_actor);
>> @@ -2713,7 +2732,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>>  	size_t size = state->size;
>>  	unsigned int last_len;
>>  
>> -	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
>> +	if (unlikely(unix_sock_state(sk) != TCP_ESTABLISHED)) {
>>  		err = -EINVAL;
>>  		goto out;
>>  	}



* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-24 17:57 ` Kuniyuki Iwashima
  2023-01-24 21:05   ` Kirill Tkhai
@ 2023-01-24 22:32   ` Cyrill Gorcunov
  1 sibling, 0 replies; 11+ messages in thread
From: Cyrill Gorcunov @ 2023-01-24 22:32 UTC (permalink / raw)
  To: Kuniyuki Iwashima; +Cc: tkhai, davem, edumazet, kuba, netdev, pabeni

On Tue, Jan 24, 2023 at 09:57:12AM -0800, Kuniyuki Iwashima wrote:
> From:   Kirill Tkhai <tkhai@ya.ru>
> Date:   Mon, 23 Jan 2023 01:21:20 +0300
> > Some functions use an unlocked check of sock::sk_state. This does not guarantee
> > that a new value assigned to sk_state on another CPU is already visible on this CPU.
> > 
> > Example:
> > 
> > [CPU0:Task0]                    [CPU1:Task1]
> > unix_listen()
> >   unix_state_lock(sk);
> >   sk->sk_state = TCP_LISTEN;
> >   unix_state_unlock(sk);
> >                                 unix_accept()
> >                                   if (sk->sk_state != TCP_LISTEN) /* not visible */
> >                                      goto out;                    /* return error */
> > 
> > Task1 may miss the new sk->sk_state value, and then unix_accept() returns an error.
> > Since unix_accept() is called chronologically later in this situation, such
> > behavior is not obvious and is wrong.
> 
> Have you seen this on a real workload?
> 
> It sounds like a userspace bug that accept() is called on a different
> CPU before listen() returns.  At least, if accept() is fetching sk at the

I must confess I don't get why accept() can't be called on a different CPU
while listen() is in progress. As far as I can see, there is a small race window,
which of course is not critical at all since in the worst case we simply report
an error back to userspace; still, it is a nit worth fixing.

> same time, I think there is no guarantee that sk_state is
> TCP_LISTEN.
> 
> Same for the other TCP_ESTABLISHED tests: it seems a program is calling
> sendmsg()/recvmsg() while sk is still TCP_CLOSE and betting that a concurrent
> connect() will finish earlier.


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-22 22:21 [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU Kirill Tkhai
  2023-01-24 17:57 ` Kuniyuki Iwashima
@ 2023-01-25  1:35 ` Jakub Kicinski
  2023-01-25 21:09   ` Kirill Tkhai
  1 sibling, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2023-01-25  1:35 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: Linux Kernel Network Developers, davem, edumazet, pabeni, kuniyu,
	gorcunov

On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote:
> Since unix_accept() is called chronologically later in this situation, such
> behavior is not obvious and is wrong.

Noob question, perhaps - how do we establish the ordering?
CPU1 knows that listen() has succeeded without a barrier?


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-25  1:35 ` Jakub Kicinski
@ 2023-01-25 21:09   ` Kirill Tkhai
  2023-01-26  6:10     ` Jakub Kicinski
  0 siblings, 1 reply; 11+ messages in thread
From: Kirill Tkhai @ 2023-01-25 21:09 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Linux Kernel Network Developers, davem, edumazet, pabeni, kuniyu,
	gorcunov

On 25.01.2023 04:35, Jakub Kicinski wrote:
> On Mon, 23 Jan 2023 01:21:20 +0300 Kirill Tkhai wrote:
>> Since unix_accept() is called chronologically later in this situation, such
>> behavior is not obvious and is wrong.
> 
> Noob question, perhaps - how do we establish the ordering?
> CPU1 knows that listen() has succeeded without a barrier?

1) There are many combinations with a third task involved:

[CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
listen(sk)
              kernel:
                sk_diag_fill(sk)
                  rep->udiag_state = TCP_LISTEN
                return_from_syscall
              userspace:
                mutex_lock()
                shared_mem_var = rep->udiag_state 
                mutex_unlock()

                                                     userspace: 
                                                       mutex_lock()
                                                       if (shared_mem_var == TCP_LISTEN)
                                                         accept(sk); /* -> fail, since sk_state is not visible */
                                                       mutex_unlock()

In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way
to guarantee its accept() won't fail. Although there are appropriate barriers in mutex_lock() and mutex_unlock(),
there is no way to add a barrier on CPU1 that makes Task0's store visible on CPU2.

2) My understanding is that a chronologically later accept() mustn't miss sk_state.
Otherwise, the kernel says that ordering between internal syscall data
is userspace's duty, which is wrong. Userspace knows nothing about internal
kernel data.

It's not prohibited to call accept() on a socket obtained via pidfd_getfd()
by another application. Why doesn't the code guarantee that accept() sees
the actual sk_state?

[CPU0:Task0]          [CPU1:Task1]
listen(sk)
                      sk = pidfd_getfd()
                      accept(sk) /* -> fail */
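
For example (a hypothetical sketch; assumes Linux >= 5.6 for pidfd_getfd()
and CAP_SYS_PTRACE over the target, error handling elided):

#define _GNU_SOURCE
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Grab another task's listening unix socket and accept() on it.
 * Nothing here orders this task against the target's listen(), so
 * accept() may observe a stale, pre-TCP_LISTEN sk_state and fail.
 */
static int accept_foreign_socket(pid_t target_pid, int target_fd)
{
	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
	int sk = syscall(SYS_pidfd_getfd, pidfd, target_fd, 0);

	return accept(sk, NULL, NULL);	/* may return -1 / EINVAL */
}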

3) Such possible situations in a log file also look strange:

[CPU0:Task0]                                           [CPU1:Task1]
listen()
get_time(&time1)
write_log("listening at ...", time1)

                                                       get_time(&time2)
                                                       sk = accept()
                                                       if (sk < 0)
                                                          write_log("accept failed at ...", time2)

If there is no kernel ordering, we may see something like the below
in their logs:

Task0.log
"listening at time1"

Task1.log
"accept failed at time2"

and time2 > time1.

Kirill


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-25 21:09   ` Kirill Tkhai
@ 2023-01-26  6:10     ` Jakub Kicinski
  2023-01-26 20:25       ` Paul E. McKenney
  2023-02-02 19:42       ` Kirill Tkhai
  0 siblings, 2 replies; 11+ messages in thread
From: Jakub Kicinski @ 2023-01-26  6:10 UTC (permalink / raw)
  To: Kirill Tkhai, Paul E. McKenney
  Cc: Linux Kernel Network Developers, davem, edumazet, pabeni, kuniyu,
	gorcunov

On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
> 1) There are many combinations with a third task involved:
> 
> [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
> listen(sk)
>               kernel:
>                 sk_diag_fill(sk)
>                   rep->udiag_state = TCP_LISTEN
>                 return_from_syscall
>               userspace:
>                 mutex_lock()
>                 shared_mem_var = rep->udiag_state 
>                 mutex_unlock()
> 
>                                                      userspace: 
>                                                        mutex_lock()
>                                                        if (shared_mem_var == TCP_LISTEN)
>                                                          accept(sk); /* -> fail, since sk_state is not visible */
>                                                        mutex_unlock()
> 
> In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way
> to guarantee its accept() won't fail. Although there are appropriate barriers in mutex_lock() and mutex_unlock(),
> there is no way to add a barrier on CPU1 that makes Task0's store visible on CPU2.

Me trying to prove that memory ordering is transitive would be 100%
speculation. Let's ask Paul instead - is the above valid? Or does the fact
that CPU1 observes state from CPU0 and is strongly ordered with CPU2
imply that CPU2 will also observe CPU0's state?

> 2) My understanding is that a chronologically later accept() mustn't miss sk_state.
> Otherwise, the kernel says that ordering between internal syscall data
> is userspace's duty, which is wrong. Userspace knows nothing about internal
> kernel data.

> 3) Such possible situations in a log file also look strange:

Dunno, those points are a bit subjective. Squeezing perf out of
distributed systems requires sacrifices 🤷️


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-26  6:10     ` Jakub Kicinski
@ 2023-01-26 20:25       ` Paul E. McKenney
  2023-01-26 21:33         ` Jakub Kicinski
  2023-02-02 19:42       ` Kirill Tkhai
  1 sibling, 1 reply; 11+ messages in thread
From: Paul E. McKenney @ 2023-01-26 20:25 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Kirill Tkhai, Linux Kernel Network Developers, davem, edumazet,
	pabeni, kuniyu, gorcunov

On Wed, Jan 25, 2023 at 10:10:53PM -0800, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
> > 1) There are many combinations with a third task involved:
> > 
> > [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
> > listen(sk)
> >               kernel:
> >                 sk_diag_fill(sk)
> >                   rep->udiag_state = TCP_LISTEN
> >                 return_from_syscall
> >               userspace:
> >                 mutex_lock()
> >                 shared_mem_var = rep->udiag_state 
> >                 mutex_unlock()
> > 
> >                                                      userspace: 
> >                                                        mutex_lock()
> >                                                        if (shared_mem_var == TCP_LISTEN)
> >                                                          accept(sk); /* -> fail, since sk_state is not visible */
> >                                                        mutex_unlock()
> > 
> > In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way
> > to guarantee its accept() won't fail. Although there are appropriate barriers in mutex_lock() and mutex_unlock(),
> > there is no way to add a barrier on CPU1 that makes Task0's store visible on CPU2.
> 
> Me trying to prove that memory ordering is transitive would be 100%
> speculation. Let's ask Paul instead - is the above valid? Or does the fact
> that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> imply that CPU2 will also observe CPU0's state?

Hmmm...  What is listen() doing?  There seem to be a lot of them
in the kernel.

But proceeding on first principles...

Sometimes.  Memory ordering is transitive only when the ordering is
sufficiently strong.

In this case, I do not see any ordering between CPU 0 and anything else.
If the listen() function were to acquire the same mutex as CPU1 and CPU2
did, and if it acquired it first, then CPU2 would be guaranteed to see
anything CPU0 did while holding that mutex.

Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
before releasing the mutex (including possibly before acquiring that
mutex), then CPU2 would be guaranteed to see that value (or the value
written by some later write to that same memory) after acquiring that
mutex.

So here are some things you can count on transitively:

1.	After acquiring a given lock (or mutex or whatever), you will
	see any values written or read prior to any earlier conflicting
	release of that same lock.

2.	After an access with acquire semantics (for example,
	smp_load_acquire()) you will see any values written or read
	prior to any earlier access with release semantics (for example,
	smp_store_release()).

Or in all cases, you might see later values, in case someone else also
did a write to the location in question.
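
For example, #2 covers the classic three-CPU release-acquire chain.
In litmus-test form (a quick sketch, along the lines of the kernel's
ISA2 litmus tests):

C release-acquire-chain

(*
 * Result: Never
 *
 * P2's acquire load sees P1's release store, which itself saw P0's
 * release store, so P2 is guaranteed to also see P0's write to x.
 *)

{
}

P0(int *x, int *f1, int *f2)
{
	WRITE_ONCE(*x, 1);
	smp_store_release(f1, 1);
}

P1(int *x, int *f1, int *f2)
{
	int r1;

	r1 = smp_load_acquire(f1);
	smp_store_release(f2, 1);
}

P2(int *x, int *f1, int *f2)
{
	int r1;
	int r2;

	r1 = smp_load_acquire(f2);
	r2 = READ_ONCE(*x);
}

exists (1:r1=1 /\ 2:r1=1 /\ 2:r2=0)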

Does that help, or am I missing a turn in there somewhere?

							Thanx, Paul


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-26 20:25       ` Paul E. McKenney
@ 2023-01-26 21:33         ` Jakub Kicinski
  2023-01-26 21:47           ` Paul E. McKenney
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Kicinski @ 2023-01-26 21:33 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Kirill Tkhai, Linux Kernel Network Developers, davem, edumazet,
	pabeni, kuniyu, gorcunov

On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote:
> > Me trying to prove that memory ordering is transitive would be 100%
> > speculation. Let's ask Paul instead - is the above valid? Or does the fact
> > that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> > imply that CPU2 will also observe CPU0's state?
> 
> Hmmm...  What is listen() doing?  There seem to be a lot of them
> in the kernel.
> 
> But proceeding on first principles...
> 
> Sometimes.  Memory ordering is transitive only when the ordering is
> sufficiently strong.
> 
> In this case, I do not see any ordering between CPU 0 and anything else.
> If the listen() function were to acquire the same mutex as CPU1 and CPU2
> did, and if it acquired it first, then CPU2 would be guaranteed to see
> anything CPU0 did while holding that mutex.

The fuller picture would be:

[CPU0]                     [CPU1]                [CPU2]
WRITE_ONCE(sk->sk_state,
           TCP_LISTEN);
                           val = READ_ONCE(sk->sk_state) 
                           mutex_lock()
                           shared_mem_var = val
                           mutex_unlock()
                                                  mutex_lock()
                                                  if (shared_mem_var == TCP_LISTEN)
                                                     BUG_ON(READ_ONCE(sk->sk_state)
                                                            != TCP_LISTEN)
                                                  mutex_unlock()

> Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
> before releasing the mutex (including possibly before acquiring that
> mutex), then CPU2 would be guaranteed to see that value (or the value
> written by some later write to that same memory) after acquiring that
> mutex.

Which I believe is exactly what happens in the example.

> So here are some things you can count on transitively:
> 
> 1.	After acquiring a given lock (or mutex or whatever), you will
> 	see any values written or read prior to any earlier conflicting
> 	release of that same lock.
> 
> 2.	After an access with acquire semantics (for example,
> 	smp_load_acquire()) you will see any values written or read
> 	prior to any earlier access with release semantics (for example,
> 	smp_store_release()).
> 
> Or in all cases, you might see later values, in case someone else also
> did a write to the location in question.
> 
> Does that help, or am I missing a turn in there somewhere?

Very much so, thank you!


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-26 21:33         ` Jakub Kicinski
@ 2023-01-26 21:47           ` Paul E. McKenney
  0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2023-01-26 21:47 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Kirill Tkhai, Linux Kernel Network Developers, davem, edumazet,
	pabeni, kuniyu, gorcunov

On Thu, Jan 26, 2023 at 01:33:22PM -0800, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 12:25:11 -0800 Paul E. McKenney wrote:
> > > Me trying to prove that memory ordering is transitive would be 100%
> > > speculation. Let's ask Paul instead - is the above valid? Or does the fact
> > > that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> > > imply that CPU2 will also observe CPU0's state?
> > 
> > Hmmm...  What is listen() doing?  There seem to be a lot of them
> > in the kernel.
> > 
> > But proceeding on first principles...
> > 
> > Sometimes.  Memory ordering is transitive only when the ordering is
> > sufficiently strong.
> > 
> > In this case, I do not see any ordering between CPU 0 and anything else.
> > If the listen() function were to acquire the same mutex as CPU1 and CPU2
> > did, and if it acquired it first, then CPU2 would be guaranteed to see
> > anything CPU0 did while holding that mutex.
> 
> The fuller picture would be:
> 
> [CPU0]                     [CPU1]                [CPU2]
> WRITE_ONCE(sk->sk_state,
>            TCP_LISTEN);
>                            val = READ_ONCE(sk->sk_state) 
>                            mutex_lock()
>                            shared_mem_var = val
>                            mutex_unlock()
>                                                   mutex_lock()
>                                                   if (shared_mem_var == TCP_LISTEN)
>                                                      BUG_ON(READ_ONCE(sk->sk_state)
>                                                             != TCP_LISTEN)
>                                                   mutex_unlock()

And just to make sure that I represented your example correctly, please
see the litmus test below.

LKMM says that the bad outcome cannot happen, that is, if CPU1 sees
CPU0's write to sk->sk_state ("ss" in the litmus test) and if CPU2
sees CPU1's write to shared_mem_var ("smv" in the litmus test), then
it cannot be the case that CPU2 sees the old value of sk->sk_state that
was overwritten by CPU0.

And as you note below, it is no surprise that LKMM has this opinion.  ;-)

But I need to make sure that I didn't misrepresent your diagram above.

> > Alternatively, if CPU0 wrote to some memory, and CPU1 read that value
> > before releasing the mutex (including possibly before acquiring that
> > mutex), then CPU2 would be guaranteed to see that value (or the value
> > written by some later write to that same memory) after acquiring that
> > mutex.
> 
> Which I believe is exactly what happens in the example.
> 
> > So here are some things you can count on transitively:
> > 
> > 1.	After acquiring a given lock (or mutex or whatever), you will
> > 	see any values written or read prior to any earlier conflicting
> > 	release of that same lock.
> > 
> > 2.	After an access with acquire semantics (for example,
> > 	smp_load_acquire()) you will see any values written or read
> > 	prior to any earlier access with release semantics (for example,
> > 	smp_store_release()).
> > 
> > Or in all cases, you might see later values, in case someone else also
> > did a write to the location in question.
> > 
> > Does that help, or am I missing a turn in there somewhere?
> 
> Very much so, thank you!

------------------------------------------------------------------------

C C-Jakub-listen

(*
 * Result: Never
 *
 * Message-ID: <20230126133322.3bfab5e0@kernel.org>
 *)

{
}

P0(int *ss, int *smv, spinlock_t *gbl)
{
	WRITE_ONCE(*ss, 1);
}

P1(int *ss, int *smv, spinlock_t *gbl)
{
	int r1;

	r1 = READ_ONCE(*ss);
	spin_lock(gbl);
	WRITE_ONCE(*smv, 1);
	spin_unlock(gbl);
}

P2(int *ss, int *smv, spinlock_t *gbl)
{
	int r1;
	int r2;

	spin_lock(gbl);
	r1 = READ_ONCE(*smv);
	r2 = READ_ONCE(*ss);
	spin_unlock(gbl);
}

exists
(1:r1=1 /\ 2:r1=1 /\ 2:r2=0)
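
FWIW, this can be checked mechanically with herd7 and the kernel's memory
model, e.g. "herd7 -conf linux-kernel.cfg C-Jakub-listen.litmus" from
tools/memory-model/ (assuming the herdtools7 suite is installed).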


* Re: [PATCH net-next] unix: Guarantee sk_state relevance when it was assigned by a task on another CPU
  2023-01-26  6:10     ` Jakub Kicinski
  2023-01-26 20:25       ` Paul E. McKenney
@ 2023-02-02 19:42       ` Kirill Tkhai
  1 sibling, 0 replies; 11+ messages in thread
From: Kirill Tkhai @ 2023-02-02 19:42 UTC (permalink / raw)
  To: Jakub Kicinski, Paul E. McKenney
  Cc: Linux Kernel Network Developers, davem, edumazet, pabeni, kuniyu,
	gorcunov

On 26.01.2023 09:10, Jakub Kicinski wrote:
> On Thu, 26 Jan 2023 00:09:08 +0300 Kirill Tkhai wrote:
>> 1) There are many combinations with a third task involved:
>>
>> [CPU0:Task0]  [CPU1:Task1]                           [CPU2:Task2]
>> listen(sk)
>>               kernel:
>>                 sk_diag_fill(sk)
>>                   rep->udiag_state = TCP_LISTEN
>>                 return_from_syscall
>>               userspace:
>>                 mutex_lock()
>>                 shared_mem_var = rep->udiag_state 
>>                 mutex_unlock()
>>
>>                                                      userspace: 
>>                                                        mutex_lock()
>>                                                        if (shared_mem_var == TCP_LISTEN)
>>                                                          accept(sk); /* -> fail, since sk_state is not visible */
>>                                                        mutex_unlock()
>>
>> In this situation Task2 definitely knows Task0's listen() has succeeded, but there is no way
>> to guarantee its accept() won't fail. Although there are appropriate barriers in mutex_lock() and mutex_unlock(),
>> there is no way to add a barrier on CPU1 that makes Task0's store visible on CPU2.
> 
> Me trying to prove that memory ordering is transitive would be 100%
> speculation. Let's ask Paul instead - is the above valid? Or does the fact
> that CPU1 observes state from CPU0 and is strongly ordered with CPU2
> imply that CPU2 will also observe CPU0's state?

Thanks, Paul, for the answer about the shared mutex. While that situation is safe, the issue with pidfd_getfd() still exists.
 
>> 2) My understanding is that a chronologically later accept() mustn't miss sk_state.
>> Otherwise, the kernel says that ordering between internal syscall data
>> is userspace's duty, which is wrong. Userspace knows nothing about internal
>> kernel data.
> 
>> 3) Such possible situations in a log file also look strange:
> 
> Dunno, those points are a bit subjective. Squeezing perf out of
> distributed systems requires sacrifices 🤷️


