linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/3] af_unix: fix use-after-free
@ 2015-10-02 20:43 Jason Baron
  2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Jason Baron @ 2015-10-02 20:43 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds,
	peterz

Hi,

These patches are against mainline, I can re-base to net-next, just
let me know.

They have been tested against: https://lkml.org/lkml/2015/9/13/195,
which causes the use-after-free quite quickly and here:
https://lkml.org/lkml/2015/10/2/693.

Thanks,

-Jason


Jason Baron (3):
  unix: fix use-after-free in unix_dgram_poll()
  af_unix: Convert gc_flags to flags
  af_unix: optimize the unix_dgram_recvmsg()

 include/net/af_unix.h |   4 +-
 net/unix/af_unix.c    | 104 ++++++++++++++++++++++++++++++++++++++------------
 net/unix/garbage.c    |  12 +++---
 3 files changed, 88 insertions(+), 32 deletions(-)

-- 
1.8.2.rc2


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-02 20:43 [PATCH v2 0/3] af_unix: fix use-after-free Jason Baron
@ 2015-10-02 20:43 ` Jason Baron
  2015-10-03  5:46   ` Mathias Krause
  2015-10-05 16:31   ` Rainer Weikusat
  2015-10-02 20:43 ` [PATCH v2 2/3] af_unix: Convert gc_flags to flags Jason Baron
  2015-10-02 20:44 ` [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg() Jason Baron
  2 siblings, 2 replies; 26+ messages in thread
From: Jason Baron @ 2015-10-02 20:43 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds,
	peterz

The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
queue associated with the socket s that we are poll'ing against, but also calls
sock_poll_wait() for a remote peer socket p, if it is connected. Thus,
if we call poll()/select()/epoll() for the socket s, there are then
a couple of code paths in which the remote peer socket p and its associated
peer_wait queue can be freed before poll()/select()/epoll() have a chance
to remove themselves from the remote peer socket.

The ways that the remote peer socket can be freed are:

1. If s calls connect() to connect to a new socket other than p, it will
drop its reference on p, and thus a close() on p will free it.

2. If we call close() on p, then a subsequent sendmsg() from s will drop
the final reference to p, allowing it to be freed.

Address this issue, by reverting unix_dgram_poll() to only register with
the wait queue associated with s and register a callback with the remote peer
socket on connect() that will wake up the wait queue associated with s. If
scenarios 1 or 2 occur above we then simply remove the callback from the
remote peer. This then presents the expected semantics to poll()/select()/
epoll().

I've implemented this for sock-type, SOCK_RAW, SOCK_DGRAM, and SOCK_SEQPACKET
but not for SOCK_STREAM, since SOCK_STREAM does not use unix_dgram_poll().

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 4a167b3..9698aff 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -62,6 +62,7 @@ struct unix_sock {
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
+	wait_queue_t		wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 03ee4d3..f789423 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -420,6 +420,9 @@ static void unix_release_sock(struct sock *sk, int embrion)
 	skpair = unix_peer(sk);
 
 	if (skpair != NULL) {
+		if (sk->sk_type != SOCK_STREAM)
+			remove_wait_queue(&unix_sk(skpair)->peer_wait,
+					  &u->wait);
 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 			unix_state_lock(skpair);
 			/* No more writes */
@@ -636,6 +639,16 @@ static struct proto unix_proto = {
  */
 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 
+static int peer_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct unix_sock *u;
+
+	u = container_of(wait, struct unix_sock, wait);
+	wake_up_interruptible_sync_poll(sk_sleep(&u->sk), key);
+
+	return 0;
+}
+
 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk = NULL;
@@ -664,6 +677,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->wait, peer_wake);
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
 out:
 	if (sk == NULL)
@@ -1030,7 +1044,11 @@ restart:
 	 */
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
+
+		remove_wait_queue(&unix_sk(old_peer)->peer_wait,
+				  &unix_sk(sk)->wait);
 		unix_peer(sk) = other;
+		add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1038,8 +1056,12 @@ restart:
 		sock_put(old_peer);
 	} else {
 		unix_peer(sk) = other;
+		add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
 		unix_state_double_unlock(sk, other);
 	}
+	/* New remote may have created write space for us */
+	wake_up_interruptible_sync_poll(sk_sleep(sk),
+					POLLOUT | POLLWRNORM | POLLWRBAND);
 	return 0;
 
 out_unlock:
@@ -1194,6 +1216,8 @@ restart:
 
 	sock_hold(sk);
 	unix_peer(newsk)	= sk;
+	if (sk->sk_type == SOCK_SEQPACKET)
+		add_wait_queue(&unix_sk(sk)->peer_wait, &unix_sk(newsk)->wait);
 	newsk->sk_state		= TCP_ESTABLISHED;
 	newsk->sk_type		= sk->sk_type;
 	init_peercred(newsk);
@@ -1220,6 +1244,8 @@ restart:
 
 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
 	unix_peer(sk)	= newsk;
+	if (sk->sk_type == SOCK_SEQPACKET)
+		add_wait_queue(&unix_sk(newsk)->peer_wait, &unix_sk(sk)->wait);
 
 	unix_state_unlock(sk);
 
@@ -1254,6 +1280,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
 	sock_hold(skb);
 	unix_peer(ska) = skb;
 	unix_peer(skb) = ska;
+	if (ska->sk_type != SOCK_STREAM) {
+		add_wait_queue(&unix_sk(ska)->peer_wait, &unix_sk(skb)->wait);
+		add_wait_queue(&unix_sk(skb)->peer_wait, &unix_sk(ska)->wait);
+	}
 	init_peercred(ska);
 	init_peercred(skb);
 
@@ -1565,6 +1595,7 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+			remove_wait_queue(&unix_sk(other)->peer_wait, &u->wait);
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -2441,7 +2472,6 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	other = unix_peer_get(sk);
 	if (other) {
 		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
 			if (unix_recvq_full(other))
 				writable = 0;
 		}
-- 
1.8.2.rc2


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 2/3] af_unix: Convert gc_flags to flags
  2015-10-02 20:43 [PATCH v2 0/3] af_unix: fix use-after-free Jason Baron
  2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
@ 2015-10-02 20:43 ` Jason Baron
  2015-10-02 20:44 ` [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg() Jason Baron
  2 siblings, 0 replies; 26+ messages in thread
From: Jason Baron @ 2015-10-02 20:43 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds,
	peterz

Convert gc_flags to flags in preparation for the subsequent patch, which will
make use of a flag bit for a non-gc purpose.

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 include/net/af_unix.h |  2 +-
 net/unix/garbage.c    | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9698aff..6a4a345 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -58,7 +58,7 @@ struct unix_sock {
 	atomic_long_t		inflight;
 	spinlock_t		lock;
 	unsigned char		recursion_level;
-	unsigned long		gc_flags;
+	unsigned long		flags;
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226..39794d9 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -179,7 +179,7 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 					 * have been added to the queues after
 					 * starting the garbage collection
 					 */
-					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+					if (test_bit(UNIX_GC_CANDIDATE, &u->flags)) {
 						hit = true;
 
 						func(u);
@@ -246,7 +246,7 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 	 * of the list, so that it's checked even if it was already
 	 * passed over
 	 */
-	if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
+	if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->flags))
 		list_move_tail(&u->link, &gc_candidates);
 }
 
@@ -305,8 +305,8 @@ void unix_gc(void)
 		BUG_ON(total_refs < inflight_refs);
 		if (total_refs == inflight_refs) {
 			list_move_tail(&u->link, &gc_candidates);
-			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+			__set_bit(UNIX_GC_CANDIDATE, &u->flags);
+			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->flags);
 		}
 	}
 
@@ -332,7 +332,7 @@ void unix_gc(void)
 
 		if (atomic_long_read(&u->inflight) > 0) {
 			list_move_tail(&u->link, &not_cycle_list);
-			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->flags);
 			scan_children(&u->sk, inc_inflight_move_tail, NULL);
 		}
 	}
@@ -343,7 +343,7 @@ void unix_gc(void)
 	 */
 	while (!list_empty(&not_cycle_list)) {
 		u = list_entry(not_cycle_list.next, struct unix_sock, link);
-		__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+		__clear_bit(UNIX_GC_CANDIDATE, &u->flags);
 		list_move_tail(&u->link, &gc_inflight_list);
 	}
 
-- 
1.8.2.rc2


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg()
  2015-10-02 20:43 [PATCH v2 0/3] af_unix: fix use-after-free Jason Baron
  2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
  2015-10-02 20:43 ` [PATCH v2 2/3] af_unix: Convert gc_flags to flags Jason Baron
@ 2015-10-02 20:44 ` Jason Baron
  2015-10-05  7:41   ` Peter Zijlstra
  2 siblings, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-02 20:44 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds,
	peterz

Now that connect() permanently registers a callback routine, we can induce
extra overhead in unix_dgram_recvmsg(), which unconditionally wakes up
its peer_wait queue on every receive. This patch makes the wakeup there
conditional on there being waiters interested in wait events.

Signed-off-by: Jason Baron <jbaron@akamai.com>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    | 72 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 6a4a345..cf21ffd 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -61,6 +61,7 @@ struct unix_sock {
 	unsigned long		flags;
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
+#define UNIX_NOSPACE		2
 	struct socket_wq	peer_wq;
 	wait_queue_t		wait;
 };
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f789423..b8ed1bc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -326,7 +326,7 @@ found:
 	return s;
 }
 
-static inline int unix_writable(struct sock *sk)
+static inline bool unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 }
@@ -1079,6 +1079,9 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
 
 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
 
+	set_bit(UNIX_NOSPACE, &u->flags);
+	/* pairs with mb in unix_dgram_recv */
+	smp_mb__after_atomic();
 	sched = !sock_flag(other, SOCK_DEAD) &&
 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
 		unix_recvq_full(other);
@@ -1623,17 +1626,22 @@ restart:
 
 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
 		if (!timeo) {
-			err = -EAGAIN;
-			goto out_unlock;
-		}
-
-		timeo = unix_wait_for_peer(other, timeo);
+			set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
+			/* pairs with mb in unix_dgram_recv */
+			smp_mb__after_atomic();
+			if (unix_recvq_full(other)) {
+				err = -EAGAIN;
+				goto out_unlock;
+			}
+		} else {
+			timeo = unix_wait_for_peer(other, timeo);
 
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			goto out_free;
+			err = sock_intr_errno(timeo);
+			if (signal_pending(current))
+				goto out_free;
 
-		goto restart;
+			goto restart;
+		}
 	}
 
 	if (sock_flag(other, SOCK_RCVTSTAMP))
@@ -1939,8 +1947,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync_poll(&u->peer_wait,
-					POLLOUT | POLLWRNORM | POLLWRBAND);
+	/* pairs with unix_dgram_poll() and wait_for_peer() */
+	smp_mb();
+	if (test_bit(UNIX_NOSPACE, &u->flags)) {
+		clear_bit(UNIX_NOSPACE, &u->flags);
+		wake_up_interruptible_sync_poll(&u->peer_wait,
+						POLLOUT | POLLWRNORM |
+						POLLWRBAND);
+	}
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -2432,11 +2446,22 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	return mask;
 }
 
+static bool unix_dgram_writable(struct sock *sk, struct sock *other)
+{
+	bool writable;
+
+	writable = unix_writable(sk);
+	if (other && unix_peer(other) != sk && unix_recvq_full(other))
+		writable = false;
+
+	return writable;
+}
+
 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 				    poll_table *wait)
 {
 	struct sock *sk = sock->sk, *other;
-	unsigned int mask, writable;
+	unsigned int mask;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
@@ -2468,20 +2493,19 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
 		return mask;
 
-	writable = unix_writable(sk);
 	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			if (unix_recvq_full(other))
-				writable = 0;
-		}
-		sock_put(other);
-	}
-
-	if (writable)
+	if (unix_dgram_writable(sk, other)) {
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
-	else
+	} else {
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
+		/* pairs with mb in unix_dgram_recv */
+		smp_mb__after_atomic();
+		if (unix_dgram_writable(sk, other))
+			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	}
+	if (other)
+		sock_put(other);
 
 	return mask;
 }
-- 
1.8.2.rc2


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
@ 2015-10-03  5:46   ` Mathias Krause
  2015-10-03 17:02     ` Rainer Weikusat
  2015-10-05 16:31   ` Rainer Weikusat
  1 sibling, 1 reply; 26+ messages in thread
From: Mathias Krause @ 2015-10-03  5:46 UTC (permalink / raw)
  To: Jason Baron
  Cc: David S. Miller, netdev, linux-kernel, Eric Wong, Eric Dumazet,
	Rainer Weikusat, Al Viro, Davide Libenzi, Davidlohr Bueso,
	Olivier Mauras, PaX Team, Linus Torvalds, Peter Zijlstra

On 2 October 2015 at 22:43, Jason Baron <jbaron@akamai.com> wrote:
> The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
> queue associated with the socket s that we are poll'ing against, but also calls
> sock_poll_wait() for a remote peer socket p, if it is connected. Thus,
> if we call poll()/select()/epoll() for the socket s, there are then
> a couple of code paths in which the remote peer socket p and its associated
> peer_wait queue can be freed before poll()/select()/epoll() have a chance
> to remove themselves from the remote peer socket.
>
> The way that remote peer socket can be freed are:
>
> 1. If s calls connect() to a connect to a new socket other than p, it will
> drop its reference on p, and thus a close() on p will free it.
>
> 2. If we call close on p(), then a subsequent sendmsg() from s, will drop
> the final reference to p, allowing it to be freed.
>
> Address this issue, by reverting unix_dgram_poll() to only register with
> the wait queue associated with s and register a callback with the remote peer
> socket on connect() that will wake up the wait queue associated with s. If
> scenarios 1 or 2 occur above we then simply remove the callback from the
> remote peer. This then presents the expected semantics to poll()/select()/
> epoll().
>
> I've implemented this for sock-type, SOCK_RAW, SOCK_DGRAM, and SOCK_SEQPACKET
> but not for SOCK_STREAM, since SOCK_STREAM does not use unix_dgram_poll().
>
> Signed-off-by: Jason Baron <jbaron@akamai.com>
> ---
>  include/net/af_unix.h |  1 +
>  net/unix/af_unix.c    | 32 +++++++++++++++++++++++++++++++-
>  2 files changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/af_unix.h b/include/net/af_unix.h
> index 4a167b3..9698aff 100644
> --- a/include/net/af_unix.h
> +++ b/include/net/af_unix.h
> @@ -62,6 +62,7 @@ struct unix_sock {
>  #define UNIX_GC_CANDIDATE      0
>  #define UNIX_GC_MAYBE_CYCLE    1
>         struct socket_wq        peer_wq;
> +       wait_queue_t            wait;
>  };
>  #define unix_sk(__sk) ((struct unix_sock *)__sk)
>
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 03ee4d3..f789423 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -420,6 +420,9 @@ static void unix_release_sock(struct sock *sk, int embrion)
>         skpair = unix_peer(sk);
>
>         if (skpair != NULL) {
> +               if (sk->sk_type != SOCK_STREAM)
> +                       remove_wait_queue(&unix_sk(skpair)->peer_wait,
> +                                         &u->wait);
>                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
>                         unix_state_lock(skpair);
>                         /* No more writes */
> @@ -636,6 +639,16 @@ static struct proto unix_proto = {
>   */
>  static struct lock_class_key af_unix_sk_receive_queue_lock_key;
>
> +static int peer_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
> +{
> +       struct unix_sock *u;
> +
> +       u = container_of(wait, struct unix_sock, wait);
> +       wake_up_interruptible_sync_poll(sk_sleep(&u->sk), key);
> +
> +       return 0;
> +}
> +
>  static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
>  {
>         struct sock *sk = NULL;
> @@ -664,6 +677,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
>         INIT_LIST_HEAD(&u->link);
>         mutex_init(&u->readlock); /* single task reading lock */
>         init_waitqueue_head(&u->peer_wait);
> +       init_waitqueue_func_entry(&u->wait, peer_wake);
>         unix_insert_socket(unix_sockets_unbound(sk), sk);
>  out:
>         if (sk == NULL)
> @@ -1030,7 +1044,11 @@ restart:
>          */
>         if (unix_peer(sk)) {
>                 struct sock *old_peer = unix_peer(sk);
> +
> +               remove_wait_queue(&unix_sk(old_peer)->peer_wait,
> +                                 &unix_sk(sk)->wait);
>                 unix_peer(sk) = other;
> +               add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
>                 unix_state_double_unlock(sk, other);
>
>                 if (other != old_peer)
> @@ -1038,8 +1056,12 @@ restart:
>                 sock_put(old_peer);
>         } else {
>                 unix_peer(sk) = other;
> +               add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
>                 unix_state_double_unlock(sk, other);
>         }
> +       /* New remote may have created write space for us */
> +       wake_up_interruptible_sync_poll(sk_sleep(sk),
> +                                       POLLOUT | POLLWRNORM | POLLWRBAND);
>         return 0;
>
>  out_unlock:
> @@ -1194,6 +1216,8 @@ restart:
>
>         sock_hold(sk);
>         unix_peer(newsk)        = sk;
> +       if (sk->sk_type == SOCK_SEQPACKET)
> +               add_wait_queue(&unix_sk(sk)->peer_wait, &unix_sk(newsk)->wait);
>         newsk->sk_state         = TCP_ESTABLISHED;
>         newsk->sk_type          = sk->sk_type;
>         init_peercred(newsk);
> @@ -1220,6 +1244,8 @@ restart:
>
>         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
>         unix_peer(sk)   = newsk;
> +       if (sk->sk_type == SOCK_SEQPACKET)
> +               add_wait_queue(&unix_sk(newsk)->peer_wait, &unix_sk(sk)->wait);
>
>         unix_state_unlock(sk);
>
> @@ -1254,6 +1280,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
>         sock_hold(skb);
>         unix_peer(ska) = skb;
>         unix_peer(skb) = ska;
> +       if (ska->sk_type != SOCK_STREAM) {
> +               add_wait_queue(&unix_sk(ska)->peer_wait, &unix_sk(skb)->wait);
> +               add_wait_queue(&unix_sk(skb)->peer_wait, &unix_sk(ska)->wait);
> +       }
>         init_peercred(ska);
>         init_peercred(skb);
>
> @@ -1565,6 +1595,7 @@ restart:
>                 unix_state_lock(sk);
>                 if (unix_peer(sk) == other) {
>                         unix_peer(sk) = NULL;
> +                       remove_wait_queue(&unix_sk(other)->peer_wait, &u->wait);
>                         unix_state_unlock(sk);
>
>                         unix_dgram_disconnected(sk, other);
> @@ -2441,7 +2472,6 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>         other = unix_peer_get(sk);
>         if (other) {
>                 if (unix_peer(other) != sk) {
> -                       sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
>                         if (unix_recvq_full(other))
>                                 writable = 0;
>                 }
> --
> 1.8.2.rc2
>

My reproducer runs on this patch for more than 3 days now without
triggering anything anymore.

Tested-by: Mathias Krause <minipli@googlemail.com>

Thanks Jason!

Regards,
Mathias

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-03  5:46   ` Mathias Krause
@ 2015-10-03 17:02     ` Rainer Weikusat
  2015-10-04 17:41       ` Rainer Weikusat
  0 siblings, 1 reply; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-03 17:02 UTC (permalink / raw)
  To: Mathias Krause
  Cc: Jason Baron, David S. Miller, netdev, linux-kernel, Eric Wong,
	Eric Dumazet, Rainer Weikusat, Al Viro, Davide Libenzi,
	Davidlohr Bueso, Olivier Mauras, PaX Team, Linus Torvalds,
	Peter Zijlstra

Mathias Krause <minipli@googlemail.com> writes:
> On 2 October 2015 at 22:43, Jason Baron <jbaron@akamai.com> wrote:
>> The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
>> queue associated with the socket s that we are poll'ing against, but also calls

[useless full-quote removed]

> My reproducer runs on this patch for more than 3 days now without
> triggering anything anymore.

Since the behaviour of your program is random, using it to "test"
anything doesn't really provide any insight: It could have been
executing the same codepath which doesn't happen to trigger any problems
for all of these three days. Nobody can tell.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-03 17:02     ` Rainer Weikusat
@ 2015-10-04 17:41       ` Rainer Weikusat
  0 siblings, 0 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-04 17:41 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: Mathias Krause, Jason Baron, David S. Miller, netdev,
	linux-kernel, Eric Wong, Eric Dumazet, Al Viro, Davide Libenzi,
	Davidlohr Bueso, Olivier Mauras, PaX Team, Peter Zijlstra

Rainer Weikusat <rweikusat@mobileactivedefense.com> writes:

> Mathias Krause <minipli@googlemail.com> writes:
>> On 2 October 2015 at 22:43, Jason Baron <jbaron@akamai.com> wrote:
>>> The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
>>> queue associated with the socket s that we are poll'ing against, but also calls
>
> [useless full-quote removed]
>
>> My reproducer runs on this patch for more than 3 days now without
>> triggering anything anymore.
>
> Since the behaviour of your program is random, using it to "test"
> anything doesn't really provide any insight: It could have been
> executing the same codepath which doesn't happen to trigger any problems
> for all of these three days. Nobody can tell.

Since this "strangely" seems to have been lost in the thread: Here's the
test program showing that the reconnect while in epoll actually causes a
problem (at least I think so):

--------
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/epoll.h>
#include <signal.h>
#include <unistd.h>

static int sk, tg0, tg1;

/*
 * Thread body: register the global socket sk for EPOLLOUT, wait for
 * writability (up to 5 seconds), then close sk and re-exec the program
 * so the whole scenario repeats until the kernel bug triggers.
 *
 * NOTE(review): this is a race reproducer quoted from the mailing list;
 * return values of epoll_ctl()/epoll_wait()/close()/execl() are
 * intentionally unchecked to keep the race window minimal.
 */
static void *epoller(void *unused)
{
    struct epoll_event epev;
    int epfd;
    
    epfd = epoll_create(1);
    if (epfd == -1) exit(0); /* cannot reproduce without epoll; bail out */

    /* Interest in write space on sk; with a dgram peer this makes the
     * kernel (pre-fix) register on the peer's peer_wait queue too. */
    epev.events = EPOLLOUT;
    epoll_ctl(epfd, EPOLL_CTL_ADD, sk, &epev);
    epoll_wait(epfd, &epev, 1, 5000);

    close(sk);

    /* Restart ourselves to iterate the reproducer; assumes the binary
     * is ./a.out in the current directory — TODO confirm when running. */
    execl("./a.out", "./a.out", (void *)0);

    return NULL; /* only reached if execl() fails */
}

/*
 * Reproducer for the unix_dgram_poll() use-after-free:
 *  - bind two datagram targets, /tmp/tg0 (tg0) and /tmp/tg1 (tg1);
 *  - connect sk to tg1 and fill tg1's receive queue so sk is not
 *    writable;
 *  - start a thread that epoll-waits for EPOLLOUT on sk (which, before
 *    the fix, also registers on tg1's peer_wait queue);
 *  - while the poller is (presumably) blocked, reconnect sk to tg0 and
 *    close tg1, dropping the last reference to the peer the poller is
 *    registered against.
 * Error returns are deliberately ignored; the program is a minimal
 * trigger, not production code.
 */
int main(void)
{
    struct sockaddr_un sun;
    pthread_t tid;
    int rc;

    sun.sun_family = AF_UNIX;
    
    tg0 = socket(AF_UNIX, SOCK_DGRAM, 0);
    strncpy(sun.sun_path, "/tmp/tg0", sizeof(sun.sun_path));
    unlink(sun.sun_path); /* remove stale path from a previous run */
    bind(tg0, (struct sockaddr *)&sun, sizeof(sun));
    
    tg1 = socket(AF_UNIX, SOCK_DGRAM, 0);
    strncpy(sun.sun_path, "/tmp/tg1", sizeof(sun.sun_path));
    unlink(sun.sun_path);
    bind(tg1, (struct sockaddr *)&sun, sizeof(sun));

    /* sun still holds "/tmp/tg1" here, so sk connects to tg1. */
    sk = socket(AF_UNIX, SOCK_DGRAM, 0);
    connect(sk, (struct sockaddr *)&sun, sizeof(sun));

    fcntl(sk, F_SETFL, fcntl(sk, F_GETFL) | O_NONBLOCK);
    
    /* Fill the peer's receive queue until write() fails (EAGAIN),
     * making sk non-writable so the epoller thread will block. */
    while ((rc = write(sk, "bla", 3)) != -1);

    pthread_create(&tid, NULL, epoller, NULL);

    /* Tiny delay intended to let the thread enter epoll_wait() first;
     * the race is probabilistic by design. */
    usleep(5);

    /* Switch sk's peer to tg0, dropping its reference on tg1, then
     * close tg1 — the socket the poller registered against. */
    strncpy(sun.sun_path, "/tmp/tg0", sizeof(sun.sun_path));
    connect(sk, (struct sockaddr *)&sun, sizeof(sun));
    close(tg1);
    
    pause(); /* park until the epoller thread re-execs or we crash */

    return 0;
}
----------

And here the other demonstrating the poller not being woken up despite
it could write something:

----------
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/poll.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * Demonstrates a missed wakeup: two senders (sk0, sk1) connect to the
 * same datagram target tg; sk0 fills tg's receive queue, a child then
 * polls sk1 for POLLOUT, and the parent reads one datagram from tg —
 * which frees queue space, yet (with the patch under discussion) the
 * wakeup is keyed to the reader's peer_wait state, so the poller on
 * sk1 is presumably never woken and wait() blocks forever — TODO
 * confirm against the patched kernel.
 * Error returns are intentionally unchecked; minimal reproducer.
 */
int main(void)
{
    struct sockaddr_un sun;
    struct pollfd pfd;
    int tg, sk0, sk1, rc;
    char buf[16];

    sun.sun_family = AF_UNIX;
    
    tg = socket(AF_UNIX, SOCK_DGRAM, 0);
    strncpy(sun.sun_path, "/tmp/tg", sizeof(sun.sun_path));
    unlink(sun.sun_path); /* remove stale path from a previous run */
    bind(tg, (struct sockaddr *)&sun, sizeof(sun));
    
    /* Two independent senders connected to the same target. */
    sk0 = socket(AF_UNIX, SOCK_DGRAM, 0);
    connect(sk0, (struct sockaddr *)&sun, sizeof(sun));
    
    sk1 = socket(AF_UNIX, SOCK_DGRAM, 0);
    connect(sk1, (struct sockaddr *)&sun, sizeof(sun));

    fcntl(sk0, F_SETFL, fcntl(sk0, F_GETFL) | O_NONBLOCK);
    fcntl(sk1, F_SETFL, fcntl(sk1, F_GETFL) | O_NONBLOCK);
    
    /* Fill tg's receive queue via sk0 until write() fails, so that
     * neither sender is currently writable. */
    while (write(sk0, "bla", 3) != -1);

    if (fork() == 0) {
	/* Child: block in poll() waiting for sk1 to become writable. */
	pfd.fd = sk1;
	pfd.events = POLLOUT;
	rc = poll(&pfd, 1, -1);

	_exit(0);
    }
    
    /* Parent: give the child time to enter poll(), then drain one
     * datagram, creating write space on tg. */
    sleep(3);
    read(tg, buf, sizeof(buf));
    wait(&rc); /* hangs if the child's poll() never gets the wakeup */

    return 0;
}
-----------

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg()
  2015-10-02 20:44 ` [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg() Jason Baron
@ 2015-10-05  7:41   ` Peter Zijlstra
  2015-10-05 17:13     ` Jason Baron
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-10-05  7:41 UTC (permalink / raw)
  To: Jason Baron
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds

On Fri, Oct 02, 2015 at 08:44:02PM +0000, Jason Baron wrote:
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index f789423..b8ed1bc 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c

> @@ -1079,6 +1079,9 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
>  
>  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
>  
> +	set_bit(UNIX_NOSPACE, &u->flags);
> +	/* pairs with mb in unix_dgram_recv */
> +	smp_mb__after_atomic();
>  	sched = !sock_flag(other, SOCK_DEAD) &&
>  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
>  		unix_recvq_full(other);
> @@ -1623,17 +1626,22 @@ restart:
>  
>  	if (unix_peer(other) != sk && unix_recvq_full(other)) {
>  		if (!timeo) {
> +			set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
> +			/* pairs with mb in unix_dgram_recv */
> +			smp_mb__after_atomic();
> +			if (unix_recvq_full(other)) {
> +				err = -EAGAIN;
> +				goto out_unlock;
> +			}
> +		} else {

> @@ -1939,8 +1947,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>  		goto out_unlock;
>  	}
>  
> +	/* pairs with unix_dgram_poll() and wait_for_peer() */
> +	smp_mb();
> +	if (test_bit(UNIX_NOSPACE, &u->flags)) {
> +		clear_bit(UNIX_NOSPACE, &u->flags);
> +		wake_up_interruptible_sync_poll(&u->peer_wait,
> +						POLLOUT | POLLWRNORM |
> +						POLLWRBAND);
> +	}
>  
>  	if (msg->msg_name)
>  		unix_copy_addr(msg, skb->sk);
> @@ -2468,20 +2493,19 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>  	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
>  		return mask;
>  
>  	other = unix_peer_get(sk);
> +	if (unix_dgram_writable(sk, other)) {
>  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
> +	} else {
>  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
> +		set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
> +		/* pairs with mb in unix_dgram_recv */
> +		smp_mb__after_atomic();
> +		if (unix_dgram_writable(sk, other))
> +			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
> +	}
> +	if (other)
> +		sock_put(other);
>  
>  	return mask;
>  }


So I must object to these barrier comments; stating which other barrier
they pair with is indeed good and required, but its not sufficient.

A barrier comment should also explain the data ordering -- the most
important part.

As it stands its not clear to me these barriers are required at all, but
this is not code I understand so I might well miss something obvious.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
  2015-10-03  5:46   ` Mathias Krause
@ 2015-10-05 16:31   ` Rainer Weikusat
  2015-10-05 16:54     ` Eric Dumazet
  2015-10-05 17:55     ` Jason Baron
  1 sibling, 2 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-05 16:31 UTC (permalink / raw)
  To: Jason Baron
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds,
	peterz

Jason Baron <jbaron@akamai.com> writes:
> The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
> queue associated with the socket s that we are poll'ing against, but also calls
> sock_poll_wait() for a remote peer socket p, if it is connected. Thus,
> if we call poll()/select()/epoll() for the socket s, there are then
> a couple of code paths in which the remote peer socket p and its associated
> peer_wait queue can be freed before poll()/select()/epoll() have a chance
> to remove themselves from the remote peer socket.
>
> The way that remote peer socket can be freed are:
>
> 1. If s calls connect() to connect to a new socket other than p, it will
> drop its reference on p, and thus a close() on p will free it.
>
> 2. If we call close() on p, then a subsequent sendmsg() from s will drop
> the final reference to p, allowing it to be freed.

Here's a more simple idea which _might_ work. The underlying problem
seems to be that the second sock_poll_wait introduces a covert reference
to the peer socket which isn't accounted for. The basic idea behind this
is to execute an additional sock_hold for the peer whenever the
sock_poll_wait is called for it and store it (the struct sock *) in a
new struct unix_sock member. Upon entering unix_dgram_poll, if the
member is not NULL, it's cleared and a sock_put for its former value is
done. The 'poll peer not NULL -> sock_put it' code is also added to the
destructor, although I'm unsure if this is really necessary. The patch
below also includes the additional SOCK_DEAD test suggested by Martin as
that seems generally sensible to me.

NB: This has survived both Martin's and my test programs for a number
of executions/ longer periods of time than was common before without
generating list corruption warnings. The patch below is against 'my'
3.2.54 and is here provided as a suggestion in the hope that it will be
useful for something, not as patch submission, as I spent less time
thinking through this than I should ideally have but despite this,
it's another 2.5 hours of my life spent on something completely
different than what I should be working on at the moment.

--------------
diff -pru linux-2-6/include/net/af_unix.h linux-2-6.p/include/net/af_unix.h
--- linux-2-6/include/net/af_unix.h	2014-01-20 21:52:53.000000000 +0000
+++ linux-2-6.p/include/net/af_unix.h	2015-10-05 15:11:20.270958297 +0100
@@ -50,6 +50,7 @@ struct unix_sock {
 	struct vfsmount		*mnt;
 	struct mutex		readlock;
 	struct sock		*peer;
+	struct sock		*poll_peer;
 	struct sock		*other;
 	struct list_head	link;
 	atomic_long_t		inflight;
diff -pru linux-2-6/net/unix/af_unix.c linux-2-6.p/net/unix/af_unix.c
--- linux-2-6/net/unix/af_unix.c	2014-01-22 16:51:52.000000000 +0000
+++ linux-2-6.p/net/unix/af_unix.c	2015-10-05 17:05:28.358273567 +0100
@@ -361,6 +361,9 @@ static void unix_sock_destructor(struct
 	if (u->addr)
 		unix_release_addr(u->addr);
 
+	if (u->poll_peer)
+		sock_put(u->poll_peer);
+
 	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
@@ -625,6 +628,7 @@ static struct sock *unix_create1(struct
 	u	  = unix_sk(sk);
 	u->dentry = NULL;
 	u->mnt	  = NULL;
+	u->poll_peer = NULL;
 	spin_lock_init(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
@@ -2135,8 +2139,16 @@ static unsigned int unix_poll(struct fil
 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 				    poll_table *wait)
 {
-	struct sock *sk = sock->sk, *other;
-	unsigned int mask, writable;
+	struct sock *sk = sock->sk, *other, *pp;
+	struct unix_sock *u;
+	unsigned int mask, writable, dead;
+
+	u = unix_sk(sk);
+	pp = u->poll_peer;
+	if (pp) {
+		u->poll_peer = NULL;
+		sock_put(pp);
+	}
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
@@ -2170,7 +2182,20 @@ static unsigned int unix_dgram_poll(stru
 	other = unix_peer_get(sk);
 	if (other) {
 		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
+			unix_state_lock(other);
+
+			dead = sock_flag(other, SOCK_DEAD);
+			if (!dead)
+				sock_poll_wait(file, &unix_sk(other)->peer_wait,
+					       wait);
+
+			unix_state_unlock(other);
+
+			if (!dead) {
+				u->poll_peer = other;
+				sock_hold(other);
+			}
+
 			if (unix_recvq_full(other))
 				writable = 0;
 		}

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-05 16:31   ` Rainer Weikusat
@ 2015-10-05 16:54     ` Eric Dumazet
  2015-10-05 17:20       ` Rainer Weikusat
  2015-10-05 17:55     ` Jason Baron
  1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2015-10-05 16:54 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: Jason Baron, davem, netdev, linux-kernel, minipli, normalperson,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

On Mon, 2015-10-05 at 17:31 +0100, Rainer Weikusat wrote:

>  	atomic_long_set(&u->inflight, 0);
>  	INIT_LIST_HEAD(&u->link);
> @@ -2135,8 +2139,16 @@ static unsigned int unix_poll(struct fil
>  static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>  				    poll_table *wait)
>  {
> -	struct sock *sk = sock->sk, *other;
> -	unsigned int mask, writable;
> +	struct sock *sk = sock->sk, *other, *pp;
> +	struct unix_sock *u;
> +	unsigned int mask, writable, dead;
> +
> +	u = unix_sk(sk);
> +	pp = u->poll_peer;
> +	if (pp) {
> +		u->poll_peer = NULL;
> +		sock_put(pp);
> +	}


This looks racy.
Multiple threads could use poll() at the same time,
and you would have too many sock_put()




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg()
  2015-10-05  7:41   ` Peter Zijlstra
@ 2015-10-05 17:13     ` Jason Baron
  0 siblings, 0 replies; 26+ messages in thread
From: Jason Baron @ 2015-10-05 17:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	rweikusat, viro, davidel, dave, olivier, pageexec, torvalds

On 10/05/2015 03:41 AM, Peter Zijlstra wrote:
> On Fri, Oct 02, 2015 at 08:44:02PM +0000, Jason Baron wrote:
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index f789423..b8ed1bc 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
> 
>> @@ -1079,6 +1079,9 @@ static long unix_wait_for_peer(struct sock *other, long timeo)
>>  
>>  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
>>  
>> +	set_bit(UNIX_NOSPACE, &u->flags);
>> +	/* pairs with mb in unix_dgram_recv */
>> +	smp_mb__after_atomic();
>>  	sched = !sock_flag(other, SOCK_DEAD) &&
>>  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
>>  		unix_recvq_full(other);
>> @@ -1623,17 +1626,22 @@ restart:
>>  
>>  	if (unix_peer(other) != sk && unix_recvq_full(other)) {
>>  		if (!timeo) {
>> +			set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
>> +			/* pairs with mb in unix_dgram_recv */
>> +			smp_mb__after_atomic();
>> +			if (unix_recvq_full(other)) {
>> +				err = -EAGAIN;
>> +				goto out_unlock;
>> +			}
>> +		} else {
> 
>> @@ -1939,8 +1947,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
>>  		goto out_unlock;
>>  	}
>>  
>> +	/* pairs with unix_dgram_poll() and wait_for_peer() */
>> +	smp_mb();
>> +	if (test_bit(UNIX_NOSPACE, &u->flags)) {
>> +		clear_bit(UNIX_NOSPACE, &u->flags);
>> +		wake_up_interruptible_sync_poll(&u->peer_wait,
>> +						POLLOUT | POLLWRNORM |
>> +						POLLWRBAND);
>> +	}
>>  
>>  	if (msg->msg_name)
>>  		unix_copy_addr(msg, skb->sk);
>> @@ -2468,20 +2493,19 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>>  	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
>>  		return mask;
>>  
>>  	other = unix_peer_get(sk);
>> +	if (unix_dgram_writable(sk, other)) {
>>  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
>> +	} else {
>>  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
>> +		set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
>> +		/* pairs with mb in unix_dgram_recv */
>> +		smp_mb__after_atomic();
>> +		if (unix_dgram_writable(sk, other))
>> +			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
>> +	}
>> +	if (other)
>> +		sock_put(other);
>>  
>>  	return mask;
>>  }
> 
> 
> So I must object to these barrier comments; stating which other barrier
> they pair with is indeed good and required, but its not sufficient.
> 
> A barrier comment should also explain the data ordering -- the most
> important part.
> 
> As it stands its not clear to me these barriers are required at all, but
> this is not code I understand so I might well miss something obvious.
> 

So in patch 1/3 I've added sockets that call connect() onto the 'peer_wait'
queue of the server. The reason being that when the server reads from its
socket (unix_dgram_recvmsg()) it can wake up n clients that may be waiting
for the receive queue of the server to empty. This was previously
accomplished in unix_dgram_poll() via the:

sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);

which I've removed. The issue with that approach is that the 'other' socket
or server socket can close() and go away out from under the poll() call.
Thus, I've converted back unix_dgram_poll(), to simply wait on its
own socket's wait queue, which is the semantics that poll()/select()/
epoll() expect. However, I still needed a way to signal the client socket,
and thus I've added the callback on connect() in patch 1/3.

However, now that the client sockets are added during connect(), and
not poll() as was previously done, this means that any calls to
unix_dgram_recvmsg() have to walk the entire wakeup list, even if nobody
is sitting in poll().

Thus, I've introduced the UNIX_NOSPACE flag to signal that we are in poll()
to the server side, such that it doesn't have to potentially walk a long
list of wakeups for no reason. This makes a difference on this test
case:

http://www.spinics.net/lists/netdev/msg145533.html

which I found while working on this issue. And this patch restores the
performance for that test case.

So what we need to guarantee is that we either see that the socket
as writable in unix_dgram_poll(), *or* that we perform the proper
wakeup in unix_dgram_recvmsg().

So unix_dgram_poll() does:

...

set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
smp_mb__after_atomic();
if (unix_dgram_writable(sk, other))
	mask |= POLLOUT | POLLWRNORM | POLLWRBAND;


and the wakeup side in unix_dgram_recvmsg():

skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
if (!skb) {
 ....
}
smp_mb();
if (test_bit(UNIX_NOSPACE, &u->flags)) {
       clear_bit(UNIX_NOSPACE, &u->flags);
       wake_up_interruptible_sync_poll(&u->peer_wait,
                                       POLLOUT | POLLWRNORM |
                                       POLLWRBAND);
}

the '__skb_recv_datagram()' there potentially sets the wakeup
condition that we are interested in.

The barrier in unix_wait_for_peer() is for the same thing - either we
see the condition there and don't go to sleep or we get a proper wakeup.

Finally, I needed a barrier in unix_dgram_sendmsg() in the -EAGAIN
case b/c epoll edge trigger needs to guarantee a wakeup happens there
as well (it can't rely on UNIX_NOSPACE being set from
unix_dgram_recvmsg()). 

Thanks,

-Jason


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-05 16:54     ` Eric Dumazet
@ 2015-10-05 17:20       ` Rainer Weikusat
  0 siblings, 0 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-05 17:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Rainer Weikusat, Jason Baron, davem, netdev, linux-kernel,
	minipli, normalperson, viro, davidel, dave, olivier, pageexec,
	torvalds, peterz

Eric Dumazet <eric.dumazet@gmail.com> writes:
> On Mon, 2015-10-05 at 17:31 +0100, Rainer Weikusat wrote:
>
>>  	atomic_long_set(&u->inflight, 0);
>>  	INIT_LIST_HEAD(&u->link);
>> @@ -2135,8 +2139,16 @@ static unsigned int unix_poll(struct fil
>>  static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>>  				    poll_table *wait)
>>  {
>> -	struct sock *sk = sock->sk, *other;
>> -	unsigned int mask, writable;
>> +	struct sock *sk = sock->sk, *other, *pp;
>> +	struct unix_sock *u;
>> +	unsigned int mask, writable, dead;
>> +
>> +	u = unix_sk(sk);
>> +	pp = u->poll_peer;
>> +	if (pp) {
>> +		u->poll_peer = NULL;
>> +		sock_put(pp);
>> +	}
>
>
> This looks racy.
> Multiple threads could use poll() at the same time,
> and you would have too many sock_put()

That's one of the reasons why I wrote "might work": The use of a single
structure member without any locking for the sock_poll_wait suggests
that this is taken care of in some other way, as does the absence of any
comment about that in the 'public' LDDs ("Linux Device Drivers"),
however, I don't really know if this is true. If not, this simple idea
can't work.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-05 16:31   ` Rainer Weikusat
  2015-10-05 16:54     ` Eric Dumazet
@ 2015-10-05 17:55     ` Jason Baron
  2015-10-12 20:41       ` Rainer Weikusat
  1 sibling, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-05 17:55 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

On 10/05/2015 12:31 PM, Rainer Weikusat wrote:
> Jason Baron <jbaron@akamai.com> writes:
>> The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
>> queue associated with the socket s that we are poll'ing against, but also calls
>> sock_poll_wait() for a remote peer socket p, if it is connected. Thus,
>> if we call poll()/select()/epoll() for the socket s, there are then
>> a couple of code paths in which the remote peer socket p and its associated
>> peer_wait queue can be freed before poll()/select()/epoll() have a chance
>> to remove themselves from the remote peer socket.
>>
>> The way that remote peer socket can be freed are:
>>
>> 1. If s calls connect() to a connect to a new socket other than p, it will
>> drop its reference on p, and thus a close() on p will free it.
>>
>> 2. If we call close on p(), then a subsequent sendmsg() from s, will drop
>> the final reference to p, allowing it to be freed.
> 
> Here's a more simple idea which _might_ work. The underlying problem
> seems to be that the second sock_poll_wait introduces a covert reference
> to the peer socket which isn't accounted for. The basic idea behind this
> is to execute an additional sock_hold for the peer whenever the
> sock_poll_wait is called for it and store it (the struct sock *) in a
> new struct unix_sock member. Upon entering unix_dgram_poll, if the
> member is not NULL, it's cleared and a sock_put for its former value is
> done. The 'poll peer not NULL -> sock_put it' code is also added to the
> destructor, although I'm unsure if this is really necessary. The patch
> below also includes the additional SOCK_DEAD test suggested by Martin as
> that seems generally sensible to me.
> 
> NB: This has survived both Martin's and my test programs for a number
> of executions/ longer periods of time than was common before without
> generating list corruption warnings. The patch below is against 'my'
> 3.2.54 and is here provided as a suggestion in the hope that it will be
> useful for someting, not as patch submission, as I spent less time
> thinking through this than I should ideally have but despite of this,
> it's another 2.5 hours of my life spent on something completely
> different than what I should be working on at the moment.
> 
> --------------
> diff -pru linux-2-6/include/net/af_unix.h linux-2-6.p/include/net/af_unix.h
> --- linux-2-6/include/net/af_unix.h	2014-01-20 21:52:53.000000000 +0000
> +++ linux-2-6.p/include/net/af_unix.h	2015-10-05 15:11:20.270958297 +0100
> @@ -50,6 +50,7 @@ struct unix_sock {
>  	struct vfsmount		*mnt;
>  	struct mutex		readlock;
>  	struct sock		*peer;
> +	struct sock		*poll_peer;
>  	struct sock		*other;
>  	struct list_head	link;
>  	atomic_long_t		inflight;
> diff -pru linux-2-6/net/unix/af_unix.c linux-2-6.p/net/unix/af_unix.c
> --- linux-2-6/net/unix/af_unix.c	2014-01-22 16:51:52.000000000 +0000
> +++ linux-2-6.p/net/unix/af_unix.c	2015-10-05 17:05:28.358273567 +0100
> @@ -361,6 +361,9 @@ static void unix_sock_destructor(struct
>  	if (u->addr)
>  		unix_release_addr(u->addr);
>  
> +	if (u->poll_peer)
> +		sock_put(u->poll_peer);
> +
>  	atomic_long_dec(&unix_nr_socks);
>  	local_bh_disable();
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> @@ -625,6 +628,7 @@ static struct sock *unix_create1(struct
>  	u	  = unix_sk(sk);
>  	u->dentry = NULL;
>  	u->mnt	  = NULL;
> +	u->poll_peer = NULL;
>  	spin_lock_init(&u->lock);
>  	atomic_long_set(&u->inflight, 0);
>  	INIT_LIST_HEAD(&u->link);
> @@ -2135,8 +2139,16 @@ static unsigned int unix_poll(struct fil
>  static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>  				    poll_table *wait)
>  {
> -	struct sock *sk = sock->sk, *other;
> -	unsigned int mask, writable;
> +	struct sock *sk = sock->sk, *other, *pp;
> +	struct unix_sock *u;
> +	unsigned int mask, writable, dead;
> +
> +	u = unix_sk(sk);
> +	pp = u->poll_peer;
> +	if (pp) {
> +		u->poll_peer = NULL;
> +		sock_put(pp);
> +	}
>  
>  	sock_poll_wait(file, sk_sleep(sk), wait);
>  	mask = 0;
> @@ -2170,7 +2182,20 @@ static unsigned int unix_dgram_poll(stru
>  	other = unix_peer_get(sk);
>  	if (other) {
>  		if (unix_peer(other) != sk) {
> -			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
> +			unix_state_lock(other);
> +
> +			dead = sock_flag(other, SOCK_DEAD);
> +			if (!dead)
> +				sock_poll_wait(file, &unix_sk(other)->peer_wait,
> +					       wait);
> +
> +			unix_state_unlock(other);
> +
> +			if (!dead) {
> +				u->poll_peer = other;
> +				sock_hold(other);
> +			}
> +
>  			if (unix_recvq_full(other))
>  				writable = 0;
>  		}
> 


Interesting - will this work for the test case you supplied where we are in
epoll_wait() and another thread does a connect() to a new target? In that
case, even if we issue a wakeup to the epoll thread, its not going to have
a non-NULL poll_table, so it wouldn't be added to the new target. IE
the first test case here:

https://lkml.org/lkml/2015/10/4/154

Only a re-add or modify using epoll_ctl() will re-register with a non-NULL
poll_table.

Thanks,

-Jason

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-05 17:55     ` Jason Baron
@ 2015-10-12 20:41       ` Rainer Weikusat
  2015-10-14  3:44         ` Jason Baron
  0 siblings, 1 reply; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-12 20:41 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Jason Baron <jbaron@akamai.com> writes:
> On 10/05/2015 12:31 PM, Rainer Weikusat wrote:

[...]

>> Here's a more simple idea which _might_ work. The underlying problem
>> seems to be that the second sock_poll_wait introduces a covert reference
>> to the peer socket which isn't accounted for. The basic idea behind this
>> is to execute an additional sock_hold for the peer whenever the
>> sock_poll_wait is called for it and store it (the struct sock *) in a
>> new struct unix_sock member.

[...]

> Interesting - will this work for the test case you supplied where we are in
> epoll_wait() and another thread does a connect() to a new target? In that
> case, even if we issue a wakeup to the epoll thread, its not going to have
> a non-NULL poll_table, so it wouldn't be added to the new target. IE
> the first test case here:
>
> https://lkml.org/lkml/2015/10/4/154

"Indeed it would not." I've also meanwhile found the time to check what
is and isn't locked here and found that Eric's "this looks racy" was
also justified. In theory, a clean solution could be based on modifying
the various polling implementations to keep a piece of data for a polled
something and provided that again on each subsequent poll call. This
could then be used to keep the peer socket alive for as long as
necessary were it possible to change the set of wait queues with every
poll call. Since this also isn't the case, the idea to increment the
reference count of the peer socket won't fly.

OTOH, something I seriously dislike about your relaying implementation
is the unconditional add_wait_queue upon connect as this builds up a
possibly large wait queue of entities entirely uninterested in the
event which will need to be traversed even if peer_wait wakeups will
only happen if at least someone actually wants to be notified. This
could be changed such that the struct unix_sock member is only put onto
the peer's wait queue in poll and only if it hasn't already been put
onto it. The connection could then be severed if some kind of
'disconnect' occurs.

The code below (again based on 3.2.54) is what I'm currently running and
it has survived testing during the day (without trying the exercise in
hexadecimal as that doesn't cause failure for me, anyway). The wakeup
relaying function checks that a socket wait queue actually still exists
because I used to get null pointer oopses every now and then without it
(I've also tested this with an additional printk printing 'no q' in case
the pointer was actually null to verify that this really occurs here).

---------------------
--- linux-2-6.b/net/unix/af_unix.c	2015-10-12 21:07:27.237102277 +0100
+++ linux-2-6/net/unix/af_unix.c	2015-10-12 21:10:26.319384298 +0100
@@ -303,6 +303,51 @@ found:
 	return s;
 }
 
+static int unix_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+				void *key)
+{
+	struct unix_sock *u;
+	wait_queue_head_t *u_sleep;
+
+	u = container_of(q, struct unix_sock, peer_wake);
+
+	u_sleep = sk_sleep(&u->sk);
+	if (u_sleep)
+		wake_up_interruptible_poll(u_sleep, key);
+
+	return 0;
+}
+
+static void unix_peer_wake_connect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (!u->peer_wake.private)
+		return;
+
+	u->peer_wake.private = peer;
+	add_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+}
+
+static int unix_peer_wake_disconnect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (u->peer_wake.private != peer)
+		return 0;
+
+	remove_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+	u->peer_wake.private = NULL;
+
+	return 1;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -409,6 +454,8 @@ static void unix_release_sock(struct soc
 			skpair->sk_state_change(skpair);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 		}
+
+		unix_peer_wake_disconnect(sk, skpair);
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
 	}
@@ -630,6 +677,7 @@ static struct sock *unix_create1(struct
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->peer_wake, unix_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
@@ -953,7 +1001,7 @@ static int unix_dgram_connect(struct soc
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
 	unsigned hash;
-	int err;
+	int err, disconned;
 
 	if (addr->sa_family != AF_UNSPEC) {
 		err = unix_mkname(sunaddr, alen, &hash);
@@ -1001,6 +1049,11 @@ restart:
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
 		unix_peer(sk) = other;
+
+		disconned = unix_peer_wake_disconnect(sk, other);
+		if (disconned)
+			wake_up_interruptible_poll(sk_sleep(sk), POLLOUT);
+
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1439,7 +1492,7 @@ static int unix_dgram_sendmsg(struct kio
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
-	int max_level;
+	int max_level, disconned;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1525,6 +1578,12 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+
+			disconned = unix_peer_wake_disconnect(sk, other);
+			if (disconned)
+				wake_up_interruptible_poll(sk_sleep(sk),
+							POLLOUT);
+
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -2170,10 +2229,20 @@ static unsigned int unix_dgram_poll(stru
 	other = unix_peer_get(sk);
 	if (other) {
 		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
-				writable = 0;
+			unix_state_lock(sk);
+
+			if (unix_peer(sk) == other) {
+				unix_peer_wake_connect(sk, other);
+
+				if (unix_recvq_full(other))
+					writable = 0;
+				else
+					unix_peer_wake_disconnect(sk, other);
+			}
+
+			unix_state_unlock(sk);
 		}
+
 		sock_put(other);
 	}
 
--- linux-2-6.b/include/net/af_unix.h	2015-10-12 21:07:27.237102277 +0100
+++ linux-2-6/include/net/af_unix.h	2015-10-11 20:12:47.598690519 +0100
@@ -58,6 +58,7 @@ struct unix_sock {
 	unsigned int		gc_maybe_cycle : 1;
 	unsigned char		recursion_level;
 	struct socket_wq	peer_wq;
+	wait_queue_t		peer_wake;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-12 20:41       ` Rainer Weikusat
@ 2015-10-14  3:44         ` Jason Baron
  2015-10-14 17:47           ` Rainer Weikusat
  0 siblings, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-14  3:44 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

On 10/12/2015 04:41 PM, Rainer Weikusat wrote:
> Jason Baron <jbaron@akamai.com> writes:
>> On 10/05/2015 12:31 PM, Rainer Weikusat wrote:
> 
> [...]
> 
>>> Here's a more simple idea which _might_ work. The underlying problem
>>> seems to be that the second sock_poll_wait introduces a covert reference
>>> to the peer socket which isn't accounted for. The basic idea behind this
>>> is to execute an additional sock_hold for the peer whenever the
>>> sock_poll_wait is called for it and store it (the struct sock *) in a
>>> new struct unix_sock member.
> 
> [...]
> 
>> Interesting - will this work for the test case you supplied where we are in
>> epoll_wait() and another thread does a connect() to a new target? In that
>> case, even if we issue a wakeup to the epoll thread, its not going to have
>> a non-NULL poll_table, so it wouldn't be added to the new target. IE
>> the first test case here:
>>
>> https://lkml.org/lkml/2015/10/4/154
> 
> "Indeed it would not." I've also meanwhile found the time to check what
> is and isn't locked here and found that Eric's "this looks racy" was
> also justified. In theory, a clean solution could be based on modifying
> the various polling implementations to keep a piece of data for a polled
> something and provided that again on each subsequent poll call. This
> could then be used to keep the peer socket alive for as long as
> necessary were it possible to change the set of wait queues with every
> poll call. Since this also isn't the case, the idea to increment the
> reference count of the peer socket won't fly.
> 
> OTOH, something I seriously dislike about your relaying implementation
> is the unconditional add_wait_queue upon connect as this builds up a
> possibly large wait queue of entities entirely uninterested in the
> event which will need to be traversed even if peer_wait wakeups will
> only happen if at least someone actually wants to be notified. This
> could be changed such that the struct unix_sock member is only put onto
> the peer's wait queue in poll and only if it hasn't already been put
> onto it. The connection could then be severed if some kind of
> 'disconnect' occurs.
> 
> The code below (again based on 3.2.54) is what I'm currently running and
> it has survived testing during the day (without trying the exercise in
> hexadecimal as that doesn't cause failure for me, anyway). The wakeup
> relaying function checks that a socket wait queue actually still exists
> because I used to get null pointers oopses without every now and then
> (I've also tested this with an additional printk printing 'no q' in case
> the pointer was actually null to verify that this really occurs here).
> 

Hi,

What about the following race?

1) thread A does poll() on f, finds the wakeup condition low, and adds
itself to the remote peer_wait queue.

2) thread B sets the wake up condition in dgram_recvmsg(), but does not
execute the wakeup of threads yet.

3) thread C also does a poll() on f, finds now that the wakeup condition
is set, and hence removes f from the remote peer_wait queue.

Then, thread A misses the POLLOUT, and hangs.

I understand your concern about POLLIN-only waiters -- I think we
could add the 'relay callback' in dgram_poll() only for those who are
looking for POLLOUT, and simply avoid the de-registration, as in practice
I think its unlikely we are going to have a socket switching from
POLLOUT to *only* POLLIN. I suspect that will cover most of the cases
that concern you?

Thanks,

-Jason


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-14  3:44         ` Jason Baron
@ 2015-10-14 17:47           ` Rainer Weikusat
  2015-10-15  2:54             ` Jason Baron
  0 siblings, 1 reply; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-14 17:47 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Jason Baron <jbaron@akamai.com> writes:
> On 10/12/2015 04:41 PM, Rainer Weikusat wrote:
>> Jason Baron <jbaron@akamai.com> writes:
>>> On 10/05/2015 12:31 PM, Rainer Weikusat wrote:

[...]

>> OTOH, something I seriously dislike about your relaying implementation
>> is the unconditional add_wait_queue upon connect as this builds up a
>> possibly large wait queue of entities entirely uninterested in the
>> event which will need to be traversed even if peer_wait wakeups will
>> only happen if at least someone actually wants to be notified. This
>> could be changed such that the struct unix_sock member is only put onto
>> the peer's wait queue in poll and only if it hasn't already been put
>> onto it. The connection could then be severed if some kind of
>> 'disconnect' occurs.

[...]


> What about the following race?
>
> 1) thread A does poll() on f, finds the wakeup condition low, and adds
> itself to the remote peer_wait queue.
>
> 2) thread B sets the wake up condition in dgram_recvmsg(), but does not
> execute the wakeup of threads yet.
>
> 3) thread C also does a poll() on f, finds now that the wakeup condition
> is set, and hence removes f from the remote peer_wait queue.
>
> Then, thread A misses the POLLOUT, and hangs.

Indeed. This starts to turn into an interesting, little problem. Based
on the code I posted, there are (AFAICT) four possible situations a
thread looking for 'peer will accept data' can find itself in:

A -	supposed to mean 'it connected to the peer_wait queue'
B -	the peer socket is ready to accept messages

A && B
------

Since A, no other thread can be waiting for this event and since B, A
was undone. Fine to return.


!A && B
-------

!A means 'was enqueued to peer_wait during an earlier call'. This means
any number of threads may be waiting. Because of this, do an additional
wake up after disconnecting since the current thread may never have
slept, ie, possibly, no wake up occurred yet.


 A && !B
 -------

 Now connected to receive peer wake ups. Go to sleep.


 !A && !B
 --------

 Someone else connected. Go to sleep.

> I understand your concern about POLLIN only waiters-I think we
> could add the 'relay callback' in dgram_poll() only for those who are
> looking for POLLOUT, and simply avoid the de-registration,

Whether or not the currently running thread wanted to write or to read
can only be determined (AIUI) if a poll_table was passed. If this was
the case and the thread didn't look for 'write space', it will never
execute the code in question, cf

	/* No write status requested, avoid expensive OUT tests. */
	if (wait && !(wait->key & POLL_OUT_ALL))
		return mask;

As to 'concerns', I'll try to word that somewhat differently: The usual
way to handle a 'may sleep' situation is

1. Enqueue on the associated wait_queue.

2. Test the condition

2a) No sleep, dequeue and return

2b) Sleep until woken up

This means the wait queue is used to 'register' threads which want to be
woken up if a certain event happens. It's not used such that a thread
first enqueues itself on all wait queues it could ever need to be enqueued
on and that an associated flag is used to inform the code on the way to
doing a wake up whether or not any of the entries on the list actually
corresponds to someone waiting for one. Even if such a flag exists, the
time needed to do the wake up is still proportional to size of the list.

For the given case, take a usual socketpair: It's absolutely pointless
to put these on the peer_wait queues of each other because all datagrams
send by one go to the other, ie, a situation where a thread goes to
sleep because it has to wait until data written via unrelated sockets
has been consumed can never happen.

I'll include the patch I'm currently using below, including an
X-Signed-Off by line to signify that this is the result of kernel
development sponsored by my employer and made available under the usual
terms. Changes since the last version:

	- fixed the inverted (rather accidentally uninverted) test in
          _connect

	- simplified the peer-checking logic: Instead of getting the
          peer with the state locked and then doing all kinds of checks
          which will hopefully enable avoiding to lock the state again,
          then, possibly, lock it again, lock the socket, do the peer
          checks followed by whatever else is necessary, then unlock it

	- addressed the race described above by means of doing a wake_up
          on sk_sleep(sk) if a thread breaks the connection which
          didn't also establish it

	- a comment (yohoho) explaining all of this -- I don't usually
          use comments in my own code because I consider them rather an
          annoyance than helpful

Unrelated change:

	- #defined POLL_OUT_ALL (POLLOUT | POLLWRNORM | POLLWRBAND)
          and use that instead of the explicit triple

X-Signed-Off-By: Rainer Weikusat <rweikusat@mobileactivedefense.com>

---
--- linux-2-6.b/net/unix/af_unix.c	2015-10-14 17:36:28.233632890 +0100
+++ linux-2-6/net/unix/af_unix.c	2015-10-14 17:40:13.678601252 +0100
@@ -115,6 +115,8 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 
+#define POLL_OUT_ALL	(POLLOUT | POLLWRNORM | POLLWRBAND)
+
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
 static atomic_long_t unix_nr_socks;
@@ -303,6 +305,53 @@ found:
 	return s;
 }
 
+static int unix_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+				void *key)
+{
+	struct unix_sock *u;
+	wait_queue_head_t *u_sleep;
+
+	u = container_of(q, struct unix_sock, peer_wake);
+
+	u_sleep = sk_sleep(&u->sk);
+	if (u_sleep)
+		wake_up_interruptible_poll(u_sleep, key);
+
+	return 0;
+}
+
+static inline int unix_peer_wake_connect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (u->peer_wake.private)
+		return 0;
+
+	u->peer_wake.private = peer;
+	add_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+
+	return 1;
+}
+
+static inline int unix_peer_wake_disconnect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (u->peer_wake.private != peer)
+		return 0;
+
+	remove_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+	u->peer_wake.private = NULL;
+
+	return 1;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -317,7 +366,7 @@ static void unix_write_space(struct sock
 		wq = rcu_dereference(sk->sk_wq);
 		if (wq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
-				POLLOUT | POLLWRNORM | POLLWRBAND);
+							POLL_OUT_ALL);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
@@ -409,6 +458,8 @@ static void unix_release_sock(struct soc
 			skpair->sk_state_change(skpair);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 		}
+
+		unix_peer_wake_disconnect(sk, skpair);
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
 	}
@@ -630,6 +681,7 @@ static struct sock *unix_create1(struct
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->peer_wake, unix_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
@@ -953,7 +1005,7 @@ static int unix_dgram_connect(struct soc
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
 	unsigned hash;
-	int err;
+	int err, disconned;
 
 	if (addr->sa_family != AF_UNSPEC) {
 		err = unix_mkname(sunaddr, alen, &hash);
@@ -1001,6 +1053,11 @@ restart:
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
 		unix_peer(sk) = other;
+
+		disconned = unix_peer_wake_disconnect(sk, other);
+		if (disconned)
+			wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);
+
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1439,7 +1496,7 @@ static int unix_dgram_sendmsg(struct kio
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
-	int max_level;
+	int max_level, disconned;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1525,6 +1582,12 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+
+			disconned = unix_peer_wake_disconnect(sk, other);
+			if (disconned)
+				wake_up_interruptible_poll(sk_sleep(sk),
+							POLL_OUT_ALL);
+
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -1783,8 +1846,7 @@ static int unix_dgram_recvmsg(struct kio
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync_poll(&u->peer_wait,
-					POLLOUT | POLLWRNORM | POLLWRBAND);
+	wake_up_interruptible_sync_poll(&u->peer_wait, POLL_OUT_ALL);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -2127,7 +2189,7 @@ static unsigned int unix_poll(struct fil
 	 * connection. This prevents stuck sockets.
 	 */
 	if (unix_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 
 	return mask;
 }
@@ -2137,6 +2199,7 @@ static unsigned int unix_dgram_poll(stru
 {
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
+	int need_wakeup;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
@@ -2163,22 +2226,50 @@ static unsigned int unix_dgram_poll(stru
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (wait && !(wait->key & POLL_OUT_ALL))
 		return mask;
 
 	writable = unix_writable(sk);
-	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
+	if (writable) {
+		/*
+		 * If sk is asymmetrically connected to a peer,
+		 * whether or not data can actually be written depends
+		 * on the number of messages already enqueued for
+		 * reception on the peer socket, so check for this
+		 * condition, too.
+		 *
+		 * In case the socket is deemed not writeable because
+		 * of this, link it onto the peer_wait queue of the
+		 * peer to ensure that a wake up happens even if no
+		 * datagram already enqueued for processing was sent
+		 * using sk.
+		 *
+		 * If a thread finds the socket writable but wasn't
+		 * the one which connected, other threads might be
+		 * sleeping so do a wake up after disconnecting.
+		 */
+
+		need_wakeup = 0;
+		unix_state_lock(sk);
+
+		other = unix_peer(sk);
+		if (other && unix_peer(other) != sk) {
+			need_wakeup = !unix_peer_wake_connect(sk, other);
+
+			if (unix_recvq_full(other)) {
 				writable = 0;
+				need_wakeup = 0;
+			} else
+				unix_peer_wake_disconnect(sk, other);
 		}
-		sock_put(other);
+
+		unix_state_unlock(sk);
+		if (need_wakeup)
+			wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);
 	}
 
 	if (writable)
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
--- linux-2-6.b/include/net/af_unix.h	2015-10-14 17:36:28.237632764 +0100
+++ linux-2-6/include/net/af_unix.h	2015-10-11 20:12:47.598690519 +0100
@@ -58,6 +58,7 @@ struct unix_sock {
 	unsigned int		gc_maybe_cycle : 1;
 	unsigned char		recursion_level;
 	struct socket_wq	peer_wq;
+	wait_queue_t		peer_wake;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-14 17:47           ` Rainer Weikusat
@ 2015-10-15  2:54             ` Jason Baron
  2015-10-18 20:58               ` Rainer Weikusat
  0 siblings, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-15  2:54 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

....
> 
> X-Signed-Off-By: Rainer Weikusat <rweikusat@mobileactivedefense.com>
> 

Hi,

So the patches I've posted and yours both use the idea of a relaying
the remote peer wakeup via callbacks that are internal to the net/unix,
such that we avoid exposing the remote peer wakeup to the external
poll()/select()/epoll(). They differ in when and how those callbacks
are registered/unregistered.

So I think your approach here will generally keep the peer wait wakeup
queue to its absolute minimum, by removing from that queue when
we set POLLOUT, however it requires taking the peer waitqueue lock on
every poll() call. So I think there are tradeoffs here vs. what I've
posted. So for example, if there are a lot of writers against one 'server'
socket, there is going to be a lot of lock contention with your approach
here. So I think the performance is going to depend on the workload that
is tested.

I don't have a specific workload that I am trying to solve here, but
since you introduced the waiting on the remote peer queue, perhaps you
can post numbers comparing the patches that have been posted for the
workload that this was developed for? I will send you the latest version
of what I have privately - so as not to overly spam the list.

Thanks,

-Jason

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-15  2:54             ` Jason Baron
@ 2015-10-18 20:58               ` Rainer Weikusat
  2015-10-19 15:07                 ` Jason Baron
  0 siblings, 1 reply; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-18 20:58 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Jason Baron <jbaron@akamai.com> writes:
> ....
>> 
>> X-Signed-Off-By: Rainer Weikusat <rweikusat@mobileactivedefense.com>
>> 

Sorry for the delayed reply but I had to do some catching up on what the
people who pay me consider 'useful work' :-).

> So the patches I've posted and yours both use the idea of a relaying
> the remote peer wakeup via callbacks that are internal to the net/unix,
> such that we avoid exposing the remote peer wakeup to the external
> poll()/select()/epoll(). They differ in when and how those callbacks
> are registered/unregistered.

Yes. In my opinion, that's the next best idea considering that the need
to handle this particular situation is presumably specific to the
af_unix.c code and thus, doesn't warrant changes to all I/O multiplexing
implementations.

> So I think your approach here will generally keep the peer wait wakeup
> queue to its absolute minimum, by removing from that queue when
> we set POLLOUT, however it requires taking the peer waitqueue lock on
> every poll() call.
>
> So I think there are tradeoffs here vs. what I've
> posted. So for example, if there are a lot of writers against one 'server'
> socket, there is going to be a lot of lock contention with your approach
> here. So I think the performance is going to depend on the workload that
> is tested.

This approach is really completely run-of-the-mill "possibly sleep until
some event occurs" code, eg, you'll find a description of this exact
procedure on p. 157 of chapter 6 of

https://lwn.net/Kernel/LDD3/

The idea behind 'the wait queue' (insofar I'm aware of it) is that it
will be used as list of threads who need to be notified when the
associated event occurs. Since you seem to argue that the run-of-the-mill
algorithm is too slow for this particular case, is there anything to
back this up?

This is also sort-of a zero-sum game as the idea to enqueue on a wait
queue because the event could possibly become interesting in future
really just shifts (for n:1 connected sockets) work from the (possibly
many) clients to the single server. And considering that 'the wait'
became necessary (if it became necessary) because flow-control kicked in
to stop clients from sending more data to the server until it had time
to catch up with the already sent data, this doesn't seem like a good
idea to me.

Using a flag to signal that at least some of the members of this queue
actually want to be woken up will also only save work if it is assumed
that most threads won't ever really be waiting for this, ie, won't
execute the corresponding unix_dgram_poll code. But if that code isn't
going to be executed most of the time, anyway, why optimize it?

It's also not really necessary to take the waitqueue lock on every poll,
eg, the code in unix_dgram_poll could be changed like this:

                need_wakeup = 0;
                unix_state_lock(sk);

                other = unix_peer(sk);
                if (other && unix_peer(other) != sk) {
                        if (unix_recvq_full(other)) {
                                need_wakeup = !unix_peer_wake_connect(sk, other);

                                if (unix_recvq_full(other)) {
                                        writable = 0;
                                        need_wakeup = 0;
                                } else
                                        unix_peer_wake_disconnect(sk, other);
                        } else
                                need_wakeup = unix_peer_wake_disconnect(sk, other);
                }

                unix_state_unlock(sk);
                if (need_wakeup)
                        wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);

I'm no big fan of adding complications like this based on the conjecture
that they might be useful, but this one seems justified to me. IMHO,
there are two interesting cases here:

1. The current thread won't have to wait. It should detect this quickly
   so that it can get back to work, hence, check for this before
   'preparing to wait'.

2. Nothing to be done here right now, hence, performing an additional
   check for 'still not writeable' shouldn't matter much.

Yet another 3.2.54 patch with the change above

-----------
--- linux-2-6.b/net/unix/af_unix.c	2015-10-18 20:41:10.829404899 +0100
+++ linux-2-6/net/unix/af_unix.c	2015-10-18 20:17:34.819134482 +0100
@@ -115,6 +115,8 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 
+#define POLL_OUT_ALL	(POLLOUT | POLLWRNORM | POLLWRBAND)
+
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
 static atomic_long_t unix_nr_socks;
@@ -303,6 +305,53 @@ found:
 	return s;
 }
 
+static int unix_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+				void *key)
+{
+	struct unix_sock *u;
+	wait_queue_head_t *u_sleep;
+
+	u = container_of(q, struct unix_sock, peer_wake);
+
+	u_sleep = sk_sleep(&u->sk);
+	if (u_sleep)
+		wake_up_interruptible_poll(u_sleep, key);
+
+	return 0;
+}
+
+static inline int unix_peer_wake_connect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (u->peer_wake.private)
+		return 0;
+
+	u->peer_wake.private = peer;
+	add_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+
+	return 1;
+}
+
+static inline int unix_peer_wake_disconnect(struct sock *me, struct sock *peer)
+{
+	struct unix_sock *u, *u_peer;
+
+	u = unix_sk(me);
+	u_peer = unix_sk(peer);
+
+	if (u->peer_wake.private != peer)
+		return 0;
+
+	remove_wait_queue(&u_peer->peer_wait, &u->peer_wake);
+	u->peer_wake.private = NULL;
+
+	return 1;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -317,7 +366,7 @@ static void unix_write_space(struct sock
 		wq = rcu_dereference(sk->sk_wq);
 		if (wq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
-				POLLOUT | POLLWRNORM | POLLWRBAND);
+							POLL_OUT_ALL);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
@@ -409,6 +458,8 @@ static void unix_release_sock(struct soc
 			skpair->sk_state_change(skpair);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 		}
+
+		unix_peer_wake_disconnect(sk, skpair);
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
 	}
@@ -630,6 +681,7 @@ static struct sock *unix_create1(struct
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->peer_wake, unix_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
@@ -953,7 +1005,7 @@ static int unix_dgram_connect(struct soc
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
 	unsigned hash;
-	int err;
+	int err, disconned;
 
 	if (addr->sa_family != AF_UNSPEC) {
 		err = unix_mkname(sunaddr, alen, &hash);
@@ -1001,6 +1053,11 @@ restart:
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
 		unix_peer(sk) = other;
+
+		disconned = unix_peer_wake_disconnect(sk, other);
+		if (disconned)
+			wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);
+
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1439,7 +1496,7 @@ static int unix_dgram_sendmsg(struct kio
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
-	int max_level;
+	int max_level, disconned;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1525,6 +1582,12 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+
+			disconned = unix_peer_wake_disconnect(sk, other);
+			if (disconned)
+				wake_up_interruptible_poll(sk_sleep(sk),
+							POLL_OUT_ALL);
+
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -1783,8 +1846,7 @@ static int unix_dgram_recvmsg(struct kio
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync_poll(&u->peer_wait,
-					POLLOUT | POLLWRNORM | POLLWRBAND);
+	wake_up_interruptible_sync_poll(&u->peer_wait, POLL_OUT_ALL);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -2127,7 +2189,7 @@ static unsigned int unix_poll(struct fil
 	 * connection. This prevents stuck sockets.
 	 */
 	if (unix_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 
 	return mask;
 }
@@ -2137,6 +2199,7 @@ static unsigned int unix_dgram_poll(stru
 {
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
+	int need_wakeup;
 
 	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
@@ -2163,22 +2226,55 @@ static unsigned int unix_dgram_poll(stru
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (wait && !(wait->key & POLL_OUT_ALL))
 		return mask;
 
 	writable = unix_writable(sk);
-	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
-				writable = 0;
+	if (writable) {
+		/*
+		 * If sk is asymmetrically connected to a peer,
+		 * whether or not data can actually be written depends
+		 * on the number of messages already enqueued for
+		 * reception on the peer socket, so check for this
+		 * condition, too.
+		 *
+		 * In case the socket is deemed not writeable because
+		 * of this, link it onto the peer_wait queue of the
+		 * peer to ensure that a wake up happens even if no
+		 * datagram already enqueued for processing was sent
+		 * using sk.
+		 *
+		 * If a thread finds the socket writable but wasn't
+		 * the one which connected, do a wake up as the
+		 * current thread might never have slept and hence,
+		 * might have broken the link before regular wake up
+		 * could happen.
+		 */
+
+		need_wakeup = 0;
+		unix_state_lock(sk);
+
+		other = unix_peer(sk);
+		if (other && unix_peer(other) != sk) {
+			if (unix_recvq_full(other)) {
+				need_wakeup = !unix_peer_wake_connect(sk, other);
+
+				if (unix_recvq_full(other)) {
+					writable = 0;
+					need_wakeup = 0;
+				} else
+					unix_peer_wake_disconnect(sk, other);
+			} else
+				need_wakeup = unix_peer_wake_disconnect(sk, other);
 		}
-		sock_put(other);
+
+		unix_state_unlock(sk);
+		if (need_wakeup)
+			wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);
 	}
 
 	if (writable)
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
--- linux-2-6.b/include/net/af_unix.h	2015-10-18 20:41:10.833404775 +0100
+++ linux-2-6/include/net/af_unix.h	2015-10-11 20:12:47.598690519 +0100
@@ -58,6 +58,7 @@ struct unix_sock {
 	unsigned int		gc_maybe_cycle : 1;
 	unsigned char		recursion_level;
 	struct socket_wq	peer_wq;
+	wait_queue_t		peer_wake;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-18 20:58               ` Rainer Weikusat
@ 2015-10-19 15:07                 ` Jason Baron
  2015-10-20 22:29                   ` Rainer Weikusat
  0 siblings, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-19 15:07 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

On 10/18/2015 04:58 PM, Rainer Weikusat wrote:

[...]

> 
> The idea behind 'the wait queue' (insofar I'm aware of it) is that it
> will be used as list of threads who need to be notified when the
> associated event occurs. Since you seem to argue that the run-of-the-mill
> algorithm is too slow for this particular case, is there anything to
> back this up?
>

Generally the poll() routines only add to a wait queue once at the
beginning, and all subsequent calls to poll() simply check the wakeup
conditions. So here you are proposing to add/remove to the wait queue on
subsequent invocations of poll(). So the initial patch I did, continued
in the usual pattern and only added once on registration or connect().

However, I do think that this is a special case since the registration
is on a shared wait queue, and thus having a long list of registered
waiters is going to affect all waiters. So I am fine with doing the
add/removes in the poll() routine and I agree that the patch below is
more compact that what I initially posted.

A couple of notes on your patch:

1)

In unix_peer_wake_relay() function, 'sk_wq' is an __rcu pointer and thus
it requires proper dereferencing. Something like:

struct unix_sock *u;
struct socket_wq *wq;

u = container_of(wait, struct unix_sock, wait);
rcu_read_lock();
wq = rcu_dereference(u->sk.sk_wq);
if (wq_has_sleeper(wq))
	wake_up_interruptible_sync_poll(&wq->wait, key);
rcu_read_unlock();


2)

For the case of epoll() in edge triggered mode we need to ensure that
when we return -EAGAIN from unix_dgram_sendmsg() when unix_recvq_full()
is true, we need to add a unix_peer_wake_connect() call to guarantee a
wakeup. Otherwise, we are going to potentially hang there.

With these changes (or tell me why they are not needed), I'm happy to
ack this patch.

Thanks,

-Jason

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-19 15:07                 ` Jason Baron
@ 2015-10-20 22:29                   ` Rainer Weikusat
  2015-10-21 17:34                     ` Rainer Weikusat
  2015-10-28 16:46                     ` [RFC] " Rainer Weikusat
  0 siblings, 2 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-20 22:29 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Jason Baron <jbaron@akamai.com> writes:
> On 10/18/2015 04:58 PM, Rainer Weikusat wrote:
>
> [...]
>
>> 
>> The idea behind 'the wait queue' (insofar I'm aware of it) is that it
>> will be used as list of threads who need to be notified when the
>> associated event occurs. Since you seem to argue that the run-of-the-mill
>> algorithm is too slow for this particular case, is there anything to
>> back this up?
>>
>
> Generally the poll() routines only add to a wait queue once at the
> beginning, and all subsequent calls to poll() simply check the wakeup
> conditions. So here you are proposing to add/remove to the wait queue on
> subsequent invocations of poll(). So the initial patch I did, continued
> in the usual pattern and only added once on registration or connect().

The code uses the private member of a wait_queue_t to record whether the
add_wait_queue was already executed so the add/remove will only happen
if the wakeup condition changed in the meantime (which usually ought to
be the case, though). As far as I understand this, this really only
makes a difference for epoll as only epoll will keep everything on the
wait queues managed by it between 'polling calls'. In order to support
epoll-style wait queue management outside of epoll, the poll management
code would need to execute a cleanup callback instead of just the setup
callback it already executes.

> 1)
>
> In unix_peer_wake_relay() function, 'sk_wq' is an __rcu pointer and thus
> it requires proper dereferencing. Something like:
>
> struct unix_sock *u;
> struct socket_wq *wq;
>
> u = container_of(wait, struct unix_sock, wait);
> rcu_read_lock();
> wq = rcu_dereference(u->sk.sk_wq);
> if (wq_has_sleeper(wq))
> 	wake_up_interruptible_sync_poll(&wq->wait, key);
> rcu_read_unlock();

I think this may be unnecessary but I need more time to check this than
the odd "half an hour after work after 11pm [UK time]" I could put into
this today.

> 2)
>
> For the case of epoll() in edge triggered mode we need to ensure that
> when we return -EAGAIN from unix_dgram_sendmsg() when unix_recvq_full()
> is true, we need to add a unix_peer_wake_connect() call to guarantee a
> wakeup. Otherwise, we are going to potentially hang there.

I consider this necessary.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll()
  2015-10-20 22:29                   ` Rainer Weikusat
@ 2015-10-21 17:34                     ` Rainer Weikusat
  2015-10-28 16:46                     ` [RFC] " Rainer Weikusat
  1 sibling, 0 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-21 17:34 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Rainer Weikusat <rw@doppelsaurus.mobileactivedefense.com> writes:
> Jason Baron <jbaron@akamai.com> writes:
>> On 10/18/2015 04:58 PM, Rainer Weikusat wrote:

[...]

>> 1)
>>
>> In unix_peer_wake_relay() function, 'sk_wq' is an __rcu pointer and thus
>> it requires proper dereferencing. Something like:
>>
>> struct unix_sock *u;
>> struct socket_wq *wq;
>>
>> u = container_of(wait, struct unix_sock, wait);
>> rcu_read_lock();
>> wq = rcu_dereference(u->sk.sk_wq);
>> if (wq_has_sleeper(wq))
>> 	wake_up_interruptible_sync_poll(&wq->wait, key);
>> rcu_read_unlock();
>
> I think this may be unnecessary

I consider this unnecessary now.

Rationale: The wait queue is allocated and freed in tandem with the
socket inode which means it will remain allocated until after the
protocol release function (unix_release with the bulk of the
implementation being in unix_release_sock) returned. As the connection
with the other socket is broken in unix_release_sock, any relayed wake
up must have completed before this time (since both operations take
place while holding the same wait queue lock).

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [RFC] unix: fix use-after-free in unix_dgram_poll()
  2015-10-20 22:29                   ` Rainer Weikusat
  2015-10-21 17:34                     ` Rainer Weikusat
@ 2015-10-28 16:46                     ` Rainer Weikusat
  2015-10-28 17:57                       ` Jason Baron
  2015-10-30 20:52                       ` [RFC] unix: fix use-after-free in unix_dgram_poll()/ 4.2.5 Rainer Weikusat
  1 sibling, 2 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-28 16:46 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Rainer Weikusat <rw@doppelsaurus.mobileactivedefense.com> writes:
> Jason Baron <jbaron@akamai.com> writes:

[...]

>> 2)
>>
>> For the case of epoll() in edge triggered mode we need to ensure that
>> when we return -EAGAIN from unix_dgram_sendmsg() when unix_recvq_full()
>> is true, we need to add a unix_peer_wake_connect() call to guarantee a
>> wakeup. Otherwise, we are going to potentially hang there.
>
> I consider this necessary.

(As already discussed privately) just doing this would open up another
way for sockets to be enqueued on the peer_wait queue of the peer
forever even though no one wants to be notified of write space
availability. Here's another RFC patch addressing the issues so far plus
this one by breaking the connection to the peer socket from the wake up
relaying function. This has the nice additional property that the
dgram_poll code becomes somewhat simpler as the "dequeued where we
didn't enqueue" situation can no longer occur and the not-so-nice
additional property that the connect and disconnect functions need to
take the peer_wait.lock spinlock explicitly so that this lock is used to
ensure that no two threads modify the private pointer of the client
wait_queue_t.

I've also moved the check, possibly enqueue then recheck and possibly
dequeue dance into a pair of functions as this code would be identical
for both unix_dgram_poll and unix_dgram_sendmsg (I'm not really happy
with the names, though).

---
--- linux-2-6.b/net/unix/af_unix.c	2015-10-28 16:06:29.581960497 +0000
+++ linux-2-6/net/unix/af_unix.c	2015-10-28 16:14:55.326065483 +0000
@@ -115,6 +115,8 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 
+#define POLL_OUT_ALL	(POLLOUT | POLLWRNORM | POLLWRBAND)
+
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
 static atomic_long_t unix_nr_socks;
@@ -303,6 +305,117 @@ found:
 	return s;
 }
 
+/*
+ * Support code for asymmetrically connected dgram sockets
+ *
+ * If a datagram socket is connected to a socket not itself connected
+ * to the first socket (eg, /dev/log), clients may only enqueue more
+ * messages if the present receive queue of the server socket is not
+ * "too large". This means there's a second writability condition poll
+ * and sendmsg need to test. The dgram recv code will do a wake up on
+ * the peer_wait wait queue of a socket upon reception of a datagram
+ * which needs to be propagated to sleeping writers since these might
+ * not yet have sent anything. This can't be accomplished via
+ * poll_wait because the lifetime of the server socket might be less
+ * than that of its clients if these break their association with it
+ * or if the server socket is closed while clients are still connected
+ * to it and there's no way to inform "a polling implementation" that
+ * it should let go of a certain wait queue
+ *
+ * In order to achieve wake up propagation, a wait_queue_t of the
+ * client socket is thus enqueued on the peer_wait queue of the server
+ * socket whose wake function does a wake_up on the ordinary client
+ * socket wait queue. This connection is established whenever a write
+ * (or poll for write) hit the flow control condition and broken when
+ * the connection to the server socket is dissolved or after a wake up
+ * was relayed.
+ */
+
+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+				void *key)
+{
+	struct unix_sock *u;
+	wait_queue_head_t *u_sleep;
+
+	u = container_of(q, struct unix_sock, peer_wake);
+
+	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
+			    &u->peer_wake);
+	u->peer_wake.private = NULL;
+
+	/* relaying can only happen while the wq still exists */
+	u_sleep = sk_sleep(&u->sk);
+	if (u_sleep)
+		wake_up_interruptible_poll(u_sleep, key);
+
+	return 0;
+}
+
+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
+{
+	struct unix_sock *u, *u_other;
+	int rc;
+
+	u = unix_sk(sk);
+	u_other = unix_sk(other);
+	rc = 0;
+
+	spin_lock(&u_other->peer_wait.lock);
+
+	if (!u->peer_wake.private) {
+		u->peer_wake.private = other;
+		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
+
+		rc = 1;
+	}
+
+	spin_unlock(&u_other->peer_wait.lock);
+	return rc;
+}
+
+static int unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other)
+{
+	struct unix_sock *u, *u_other;
+	int rc;
+
+	u = unix_sk(sk);
+	u_other = unix_sk(other);
+	rc = 0;
+
+	spin_lock(&u_other->peer_wait.lock);
+
+	if (u->peer_wake.private == other) {
+		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
+		u->peer_wake.private = NULL;
+
+		rc = 1;
+	}
+
+	spin_unlock(&u_other->peer_wait.lock);
+	return rc;
+}
+
+static inline int unix_dgram_peer_recvq_space(struct sock *sk,
+					      struct sock *other)
+{
+	return !(unix_peer(other) != sk && unix_recvq_full(other));
+}
+
+static int unix_dgram_peer_recv_wake_prep(struct sock *sk, struct sock *other)
+{
+	int queued;
+
+	queued = unix_dgram_peer_wake_connect(sk, other);
+
+	if (unix_recvq_full(other))
+		return 1;
+
+	if (queued)
+		unix_dgram_peer_wake_disconnect(sk, other);
+
+	return 0;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -317,7 +430,7 @@ static void unix_write_space(struct sock
 		wq = rcu_dereference(sk->sk_wq);
 		if (wq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
-				POLLOUT | POLLWRNORM | POLLWRBAND);
+							POLL_OUT_ALL);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
@@ -409,6 +522,8 @@ static void unix_release_sock(struct soc
 			skpair->sk_state_change(skpair);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 		}
+
+		unix_dgram_peer_wake_disconnect(sk, skpair);
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
 	}
@@ -630,6 +745,7 @@ static struct sock *unix_create1(struct
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
@@ -953,7 +1069,7 @@ static int unix_dgram_connect(struct soc
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
 	unsigned hash;
-	int err;
+	int err, disconned;
 
 	if (addr->sa_family != AF_UNSPEC) {
 		err = unix_mkname(sunaddr, alen, &hash);
@@ -1001,6 +1117,11 @@ restart:
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
 		unix_peer(sk) = other;
+
+		disconned = unix_dgram_peer_wake_disconnect(sk, other);
+		if (disconned)
+			wake_up_interruptible_poll(sk_sleep(sk), POLL_OUT_ALL);
+
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1439,7 +1560,7 @@ static int unix_dgram_sendmsg(struct kio
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
-	int max_level;
+	int max_level, disconned;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1525,6 +1646,12 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+
+			disconned = unix_dgram_peer_wake_disconnect(sk, other);
+			if (disconned)
+				wake_up_interruptible_poll(sk_sleep(sk),
+							POLL_OUT_ALL);
+
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -1550,8 +1677,8 @@ restart:
 			goto out_unlock;
 	}
 
-	if (unix_peer(other) != sk && unix_recvq_full(other)) {
-		if (!timeo) {
+	if (!unix_dgram_peer_recvq_space(sk, other)) {
+		if (!timeo && unix_dgram_peer_recv_wake_prep(sk, other)) {
 			err = -EAGAIN;
 			goto out_unlock;
 		}
@@ -1783,8 +1910,7 @@ static int unix_dgram_recvmsg(struct kio
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync_poll(&u->peer_wait,
-					POLLOUT | POLLWRNORM | POLLWRBAND);
+	wake_up_interruptible_sync_poll(&u->peer_wait, POLL_OUT_ALL);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -2127,7 +2253,7 @@ static unsigned int unix_poll(struct fil
 	 * connection. This prevents stuck sockets.
 	 */
 	if (unix_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 
 	return mask;
 }
@@ -2163,22 +2289,24 @@ static unsigned int unix_dgram_poll(stru
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (wait && !(wait->key & POLL_OUT_ALL))
 		return mask;
 
 	writable = unix_writable(sk);
-	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
-				writable = 0;
-		}
-		sock_put(other);
+	if (writable) {
+		unix_state_lock(sk);
+
+		other = unix_peer(sk);
+		if (other
+		    && !unix_dgram_peer_recvq_space(sk, other)
+		    && unix_dgram_peer_recv_wake_prep(sk, other))
+			writable = 0;
+
+		unix_state_unlock(sk);
 	}
 
 	if (writable)
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= POLL_OUT_ALL;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
--- linux-2-6.b/include/net/af_unix.h	2015-10-28 16:06:29.613959493 +0000
+++ linux-2-6/include/net/af_unix.h	2015-10-11 20:12:47.598690519 +0100
@@ -58,6 +58,7 @@ struct unix_sock {
 	unsigned int		gc_maybe_cycle : 1;
 	unsigned char		recursion_level;
 	struct socket_wq	peer_wq;
+	wait_queue_t		peer_wake;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] unix: fix use-after-free in unix_dgram_poll()
  2015-10-28 16:46                     ` [RFC] " Rainer Weikusat
@ 2015-10-28 17:57                       ` Jason Baron
  2015-10-29 14:23                         ` Rainer Weikusat
  2015-10-30 20:52                       ` [RFC] unix: fix use-after-free in unix_dgram_poll()/ 4.2.5 Rainer Weikusat
  1 sibling, 1 reply; 26+ messages in thread
From: Jason Baron @ 2015-10-28 17:57 UTC (permalink / raw)
  To: Rainer Weikusat
  Cc: davem, netdev, linux-kernel, minipli, normalperson, eric.dumazet,
	viro, davidel, dave, olivier, pageexec, torvalds, peterz

On 10/28/2015 12:46 PM, Rainer Weikusat wrote:
> Rainer Weikusat <rw@doppelsaurus.mobileactivedefense.com> writes:
>> Jason Baron <jbaron@akamai.com> writes:
> 
> [...]
> 
>>> 2)
>>>
>>> For the case of epoll() in edge triggered mode we need to ensure that
>>> when we return -EAGAIN from unix_dgram_sendmsg() when unix_recvq_full()
>>> is true, we need to add a unix_peer_wake_connect() call to guarantee a
>>> wakeup. Otherwise, we are going to potentially hang there.
>>
>> I consider this necessary.
> 
> (As already discussed privately) just doing this would open up another
> way for sockets to be enqueued on the peer_wait queue of the peer
> forever even though no one wants to be notified of write space
> availability. Here's another RFC patch addressing the issues so far plus
> this one by breaking the connection to the peer socket from the wake up
> relaying function. This has the nice additional property that the
> dgram_poll code becomes somewhat simpler as the "dequeued where we
> didn't enqueue" situation can no longer occur and the not-so-nice
> additional property that the connect and disconnect functions need to
> take the peer_wait.lock spinlock explicitly so that this lock is used to
> ensure that no two threads modify the private pointer of the client
> wait_queue_t.

Hmmm...I thought these were already all guarded by unix_state_lock(sk).
In any case, rest of the patch overall looks good to me.

Thanks,

-Jason


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] unix: fix use-after-free in unix_dgram_poll()
  2015-10-28 17:57                       ` Jason Baron
@ 2015-10-29 14:23                         ` Rainer Weikusat
  0 siblings, 0 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-29 14:23 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Jason Baron <jbaron@akamai.com> writes:
> On 10/28/2015 12:46 PM, Rainer Weikusat wrote:
>> Rainer Weikusat <rw@doppelsaurus.mobileactivedefense.com> writes:
>>> Jason Baron <jbaron@akamai.com> writes:

[...]

>> and the not-so-nice additional property that the connect and
>> disconnect functions need to take the peer_wait.lock spinlock
>> explicitly so that this lock is used to ensure that no two threads
>> modify the private pointer of the client wait_queue_t.
>
> Hmmm...I thought these were already all guarded by unix_state_lock(sk).
> In any case, rest of the patch overall looks good to me.

All but the one in unix_dgram_peer_wake_relay: That's
called with the wait queue lock of the peer_wait queue held and even if
it was desirable to acquire the state lock of the involved socket, it
wouldn't easily be possible because 'other code' (in poll and sendmsg)
acquires the wait queue lock while holding the state lock.

The idea behind this is that the state lock ensures that the 'connection
state' doesn't change while some code is examining it and that the wait
queue lock of the peer_wait queue is (slightly ab-)used to ensure
exclusive access to the private pointer which guards against concurrent
inserts or removes of the same wait_queue_t.

Poking around in the implementation of abstract interfaces like this
isn't something I'm overly fond of but 'other code' does this too, eg,
in eventfd.c


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] unix: fix use-after-free in unix_dgram_poll()/ 4.2.5
  2015-10-28 16:46                     ` [RFC] " Rainer Weikusat
  2015-10-28 17:57                       ` Jason Baron
@ 2015-10-30 20:52                       ` Rainer Weikusat
       [not found]                         ` <57d2f5b6aae251957bff7a1a52b8bf2c@core-hosting.net>
  1 sibling, 1 reply; 26+ messages in thread
From: Rainer Weikusat @ 2015-10-30 20:52 UTC (permalink / raw)
  To: Jason Baron
  Cc: Rainer Weikusat, davem, netdev, linux-kernel, minipli,
	normalperson, eric.dumazet, viro, davidel, dave, olivier,
	pageexec, torvalds, peterz

Same changes ported to 4.2.5 with some minor improvements (I hope),
namely,

	- applied a round of DeMorgan to the 'quick' check function in
          order to simplify the condition

	- fixed a (minor) error in the dgram_sendmsg change: In case the
          2nd check resulted in 'can send now', the code would continue
          with 'wait until timeout expired' (since timeo was 0 in the
          case, this shouldn't make much of a practical difference)

	- (hopefully) more intelligible function names and better
          explanation

	- dropped the POLL_OUT_ALL macro again as that's really
          unrelated

---
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -62,6 +62,7 @@ struct unix_sock {
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
+	wait_queue_t		peer_wake;
 };
 
 static inline struct unix_sock *unix_sk(struct sock *sk)
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -326,6 +326,122 @@ found:
 	return s;
 }
 
+/* Support code for asymmetrically connected dgram sockets
+ *
+ * If a datagram socket is connected to a socket not itself connected
+ * to the first socket (eg, /dev/log), clients may only enqueue more
+ * messages if the present receive queue of the server socket is not
+ * "too large". This means there's a second writeability condition
+ * poll and sendmsg need to test. The dgram recv code will do a wake
+ * up on the peer_wait wait queue of a socket upon reception of a
+ * datagram which needs to be propagated to sleeping would-be writers
+ * since these might not have sent anything so far. This can't be
+ * accomplished via poll_wait because the lifetime of the server
+ * socket might be less than that of its clients if these break their
+ * association with it or if the server socket is closed while clients
+ * are still connected to it and there's no way to inform "a polling
+ * implementation" that it should let go of a certain wait queue
+ *
+ * In order to propagate a wake up, a wait_queue_t of the client
+ * socket is enqueued on the peer_wait queue of the server socket
+ * whose wake function does a wake_up on the ordinary client socket
+ * wait queue. This connection is established whenever a write (or
+ * poll for write) hit the flow control condition and broken when the
+ * association to the server socket is dissolved or after a wake up
+ * was relayed.
+ */
+
+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+				      void *key)
+{
+	struct unix_sock *u;
+	wait_queue_head_t *u_sleep;
+
+	u = container_of(q, struct unix_sock, peer_wake);
+
+	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
+			    &u->peer_wake);
+	u->peer_wake.private = NULL;
+
+	/* relaying can only happen while the wq still exists */
+	u_sleep = sk_sleep(&u->sk);
+	if (u_sleep)
+		wake_up_interruptible_poll(u_sleep, key);
+
+	return 0;
+}
+
+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
+{
+	struct unix_sock *u, *u_other;
+	int rc;
+
+	u = unix_sk(sk);
+	u_other = unix_sk(other);
+	rc = 0;
+
+	spin_lock(&u_other->peer_wait.lock);
+
+	if (!u->peer_wake.private) {
+		u->peer_wake.private = other;
+		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
+
+		rc = 1;
+	}
+
+	spin_unlock(&u_other->peer_wait.lock);
+	return rc;
+}
+
+static int unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other)
+{
+	struct unix_sock *u, *u_other;
+	int rc;
+
+	u = unix_sk(sk);
+	u_other = unix_sk(other);
+	rc = 0;
+
+	spin_lock(&u_other->peer_wait.lock);
+
+	if (u->peer_wake.private == other) {
+		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
+		u->peer_wake.private = NULL;
+
+		rc = 1;
+	}
+
+	spin_unlock(&u_other->peer_wait.lock);
+	return rc;
+}
+
+/* Needs 'this' unix state lock. Lockless check if data can (likely)
+ * be sent.
+ */
+static inline int unix_dgram_peer_recv_ready(struct sock *sk,
+					     struct sock *other)
+{
+	return unix_peer(other) == sk || !unix_recvq_full(other);
+}
+
+/* Needs 'this' unix state lock. After recv_ready indicated not ready,
+ * establish peer_wait connection if still needed.
+ */
+static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
+{
+	int conned;
+
+	conned = unix_dgram_peer_wake_connect(sk, other);
+
+	if (unix_recvq_full(other))
+		return 1;
+
+	if (conned)
+		unix_dgram_peer_wake_disconnect(sk, other);
+
+	return 0;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
@@ -430,6 +546,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
 			skpair->sk_state_change(skpair);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 		}
+
+		unix_dgram_peer_wake_disconnect(sk, skpair);
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
 	}
@@ -664,6 +782,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
+	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
 out:
 	if (sk == NULL)
@@ -983,7 +1102,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 	struct sock *other;
 	unsigned int hash;
-	int err;
+	int err, disconned;
 
 	if (addr->sa_family != AF_UNSPEC) {
 		err = unix_mkname(sunaddr, alen, &hash);
@@ -1031,6 +1150,14 @@ restart:
 	if (unix_peer(sk)) {
 		struct sock *old_peer = unix_peer(sk);
 		unix_peer(sk) = other;
+
+		disconned = unix_dgram_peer_wake_disconnect(sk, other);
+		if (disconned)
+			wake_up_interruptible_poll(sk_sleep(sk),
+						   POLLOUT |
+						   POLLWRNORM |
+						   POLLWRBAND);
+
 		unix_state_double_unlock(sk, other);
 
 		if (other != old_peer)
@@ -1463,7 +1590,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
 	struct sock *other = NULL;
 	int namelen = 0; /* fake GCC */
-	int err;
+	int err, disconned;
 	unsigned int hash;
 	struct sk_buff *skb;
 	long timeo;
@@ -1565,6 +1692,14 @@ restart:
 		unix_state_lock(sk);
 		if (unix_peer(sk) == other) {
 			unix_peer(sk) = NULL;
+
+			disconned = unix_dgram_peer_wake_disconnect(sk, other);
+			if (disconned)
+				wake_up_interruptible_poll(sk_sleep(sk),
+							   POLLOUT |
+							   POLLWRNORM |
+							   POLLWRBAND);
+
 			unix_state_unlock(sk);
 
 			unix_dgram_disconnected(sk, other);
@@ -1590,10 +1725,14 @@ restart:
 			goto out_unlock;
 	}
 
-	if (unix_peer(other) != sk && unix_recvq_full(other)) {
+	if (!unix_dgram_peer_recv_ready(sk, other)) {
 		if (!timeo) {
-			err = -EAGAIN;
-			goto out_unlock;
+			if (unix_dgram_peer_wake_me(sk, other)) {
+				err = -EAGAIN;
+				goto out_unlock;
+			}
+
+			goto restart;
 		}
 
 		timeo = unix_wait_for_peer(other, timeo);
@@ -2453,14 +2592,16 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 		return mask;
 
 	writable = unix_writable(sk);
-	other = unix_peer_get(sk);
-	if (other) {
-		if (unix_peer(other) != sk) {
-			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-			if (unix_recvq_full(other))
-				writable = 0;
-		}
-		sock_put(other);
+	if (writable) {
+		unix_state_lock(sk);
+
+		other = unix_peer(sk);
+		if (other &&
+		    !unix_dgram_peer_recv_ready(sk, other) &&
+		    unix_dgram_peer_wake_me(sk, other))
+			writable = 0;
+
+		unix_state_unlock(sk);
 	}
 
 	if (writable)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] unix: fix use-after-free in unix_dgram_poll()/ 4.2.5
       [not found]                         ` <57d2f5b6aae251957bff7a1a52b8bf2c@core-hosting.net>
@ 2015-11-02 21:55                           ` Rainer Weikusat
  0 siblings, 0 replies; 26+ messages in thread
From: Rainer Weikusat @ 2015-11-02 21:55 UTC (permalink / raw)
  To: olivier
  Cc: Rainer Weikusat, Jason Baron, davem, netdev, linux-kernel,
	minipli, normalperson, eric.dumazet, viro, davidel, dave,
	pageexec, torvalds, peterz

Olivier Mauras <olivier@mauras.ch> writes:

[...]

> I've encountered  issues with Jason's patch ported to 3.14.x which would break
> openldap, rendering it unable to answer any query - Here's a strace of the
> slapd process in this state http://pastebin.ca/3226383
> Just ported Rainer's patch to 3.14 and so far I can't reproduce the issue -

I may be missing something here but the final state according to the
trace is that thread 775 of the process blocks in epoll_wait with a
descriptor set containing only a listening TCP socket (8) and waiting
for new connections. I don't think this can execute any code
changed by my patch and I'm fairly certain of this for Jason's, too:
Both are about AF_UNIX datagram sockets and the specific case where
either a write couldn't complete because the backlog of the receive
queue of the 1 side of a n:1 datagram socket arrangement was considered
too large or where a 'poll for write' check returned 'not writeable' for
the same reason.

Judging from the 2.4.42 sources, OpenLDAP doesn't use AF_UNIX datagram
sockets at all so it shouldn't ever be affected by any changes to the
code handling them.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-11-02 21:56 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-02 20:43 [PATCH v2 0/3] af_unix: fix use-after-free Jason Baron
2015-10-02 20:43 ` [PATCH v2 1/3] unix: fix use-after-free in unix_dgram_poll() Jason Baron
2015-10-03  5:46   ` Mathias Krause
2015-10-03 17:02     ` Rainer Weikusat
2015-10-04 17:41       ` Rainer Weikusat
2015-10-05 16:31   ` Rainer Weikusat
2015-10-05 16:54     ` Eric Dumazet
2015-10-05 17:20       ` Rainer Weikusat
2015-10-05 17:55     ` Jason Baron
2015-10-12 20:41       ` Rainer Weikusat
2015-10-14  3:44         ` Jason Baron
2015-10-14 17:47           ` Rainer Weikusat
2015-10-15  2:54             ` Jason Baron
2015-10-18 20:58               ` Rainer Weikusat
2015-10-19 15:07                 ` Jason Baron
2015-10-20 22:29                   ` Rainer Weikusat
2015-10-21 17:34                     ` Rainer Weikusat
2015-10-28 16:46                     ` [RFC] " Rainer Weikusat
2015-10-28 17:57                       ` Jason Baron
2015-10-29 14:23                         ` Rainer Weikusat
2015-10-30 20:52                       ` [RFC] unix: fix use-after-free in unix_dgram_poll()/ 4.2.5 Rainer Weikusat
     [not found]                         ` <57d2f5b6aae251957bff7a1a52b8bf2c@core-hosting.net>
2015-11-02 21:55                           ` Rainer Weikusat
2015-10-02 20:43 ` [PATCH v2 2/3] af_unix: Convert gc_flags to flags Jason Baron
2015-10-02 20:44 ` [PATCH v2 3/3] af_unix: optimize the unix_dgram_recvmsg() Jason Baron
2015-10-05  7:41   ` Peter Zijlstra
2015-10-05 17:13     ` Jason Baron

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).