All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
@ 2016-09-02 18:13 Linus Torvalds
  2016-09-02 18:17 ` Linus Torvalds
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Linus Torvalds @ 2016-09-02 18:13 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hannes Frederic Sowa, Rainer Weikusat, Eric Dumazet,
	willy tarreau, netdev


From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 1 Sep 2016 14:43:53 -0700
Subject: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'

Right now we use the 'readlock' both for protecting some of the af_unix
IO path and for making the bind be single-threaded.

The two are independent, but using the same lock makes for a nasty
deadlock due to ordering with regards to filesystem locking.  The bind
locking would want to nest outside the VSF pathname locking, but the IO
locking wants to nest inside some of those same locks.

We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix
splice-bind deadlock") which moved the readlock inside the vfs locks,
but that caused problems with overlayfs that will then call back into
filesystem routines that take the lock in the wrong order anyway.

Splitting the locks means that we can go back to having the bind lock be
the outermost lock, and we don't have any deadlocks with lock ordering.

Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---

This patch is really trivial, and I've tried to be careful and look at the 
locking, but somebody who really knows the AF_UNIX code should definitely 
take a second look.

Note that I did the revert (that re-introduces the original splice 
deadlock) first, because that made the whole series much easier to 
explain. Doing it in the other order made the revert nastier because this 
patch obviously touches the same code that the revert in 1/2 does.

So this way the series ends up being "go back to the original code with 
the original deadlock, and then fix that original deadlock by splitting 
the bind lock".

 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    | 45 +++++++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9b4c418bebd8..fd60eccb59a6 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -52,7 +52,7 @@ struct unix_sock {
 	struct sock		sk;
 	struct unix_address     *addr;
 	struct path		path;
-	struct mutex		readlock;
+	struct mutex		iolock, bindlock;
 	struct sock		*peer;
 	struct list_head	link;
 	atomic_long_t		inflight;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433ae1bbef97..8309687a56b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val)
 {
 	struct unix_sock *u = unix_sk(sk);
 
-	if (mutex_lock_interruptible(&u->readlock))
+	if (mutex_lock_interruptible(&u->iolock))
 		return -EINTR;
 
 	sk->sk_peek_off = val;
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 
 	return 0;
 }
@@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 	spin_lock_init(&u->lock);
 	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
-	mutex_init(&u->readlock); /* single task reading lock */
+	mutex_init(&u->iolock); /* single task reading lock */
+	mutex_init(&u->bindlock); /* single task binding lock */
 	init_waitqueue_head(&u->peer_wait);
 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 	unix_insert_socket(unix_sockets_unbound(sk), sk);
@@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock)
 	int err;
 	unsigned int retries = 0;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		return err;
 
@@ -895,7 +896,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
-out:	mutex_unlock(&u->readlock);
+out:	mutex_unlock(&u->bindlock);
 	return err;
 }
 
@@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	addr_len = err;
 
-	err = mutex_lock_interruptible(&u->readlock);
+	err = mutex_lock_interruptible(&u->bindlock);
 	if (err)
 		goto out;
 
@@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->bindlock);
 out:
 	return err;
 }
@@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
 	if (false) {
 alloc_skb:
 		unix_state_unlock(other);
-		mutex_unlock(&unix_sk(other)->readlock);
+		mutex_unlock(&unix_sk(other)->iolock);
 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
 					      &err, 0);
 		if (!newskb)
 			goto err;
 	}
 
-	/* we must acquire readlock as we modify already present
+	/* we must acquire iolock as we modify already present
 	 * skbs in the sk_receive_queue and mess with skb->len
 	 */
-	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
+	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
 	if (err) {
 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
 		goto err;
@@ -2032,7 +2033,7 @@ alloc_skb:
 	}
 
 	unix_state_unlock(other);
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 
 	other->sk_data_ready(other);
 	scm_destroy(&scm);
@@ -2041,7 +2042,7 @@ alloc_skb:
 err_state_unlock:
 	unix_state_unlock(other);
 err_unlock:
-	mutex_unlock(&unix_sk(other)->readlock);
+	mutex_unlock(&unix_sk(other)->iolock);
 err:
 	kfree_skb(newskb);
 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
@@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	do {
-		mutex_lock(&u->readlock);
+		mutex_lock(&u->iolock);
 
 		skip = sk_peek_offset(sk, flags);
 		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
@@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		if (skb)
 			break;
 
-		mutex_unlock(&u->readlock);
+		mutex_unlock(&u->iolock);
 
 		if (err != -EAGAIN)
 			break;
 	} while (timeo &&
 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
 
-	if (!skb) { /* implies readlock unlocked */
+	if (!skb) { /* implies iolock unlocked */
 		unix_state_lock(sk);
 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
@@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 
 out_free:
 	skb_free_datagram(sk, skb);
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 out:
 	return err;
 }
@@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
 	/* Lock the socket to prevent queue disordering
 	 * while sleeps in memcpy_tomsg
 	 */
-	mutex_lock(&u->readlock);
+	mutex_lock(&u->iolock);
 
 	if (flags & MSG_PEEK)
 		skip = sk_peek_offset(sk, flags);
@@ -2326,7 +2327,7 @@ again:
 				break;
 			}
 
-			mutex_unlock(&u->readlock);
+			mutex_unlock(&u->iolock);
 
 			timeo = unix_stream_data_wait(sk, timeo, last,
 						      last_len);
@@ -2337,7 +2338,7 @@ again:
 				goto out;
 			}
 
-			mutex_lock(&u->readlock);
+			mutex_lock(&u->iolock);
 			goto redo;
 unlock:
 			unix_state_unlock(sk);
@@ -2440,7 +2441,7 @@ unlock:
 		}
 	} while (size);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	if (state->msg)
 		scm_recv(sock, state->msg, &scm, flags);
 	else
@@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk,
 	int ret;
 	struct unix_sock *u = unix_sk(sk);
 
-	mutex_unlock(&u->readlock);
+	mutex_unlock(&u->iolock);
 	ret = splice_to_pipe(pipe, spd);
-	mutex_lock(&u->readlock);
+	mutex_lock(&u->iolock);
 
 	return ret;
 }
-- 
2.10.0.rc0.2.g0a9fa47

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 18:13 [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Linus Torvalds
@ 2016-09-02 18:17 ` Linus Torvalds
  2016-09-02 19:15   ` David Miller
  2016-09-02 20:25 ` Hannes Frederic Sowa
  2016-09-04 20:29 ` David Miller
  2 siblings, 1 reply; 7+ messages in thread
From: Linus Torvalds @ 2016-09-02 18:17 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hannes Frederic Sowa, Rainer Weikusat, Eric Dumazet,
	willy tarreau, Network Development

On Fri, Sep 2, 2016 at 11:13 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Thu, 1 Sep 2016 14:43:53 -0700
> Subject: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
>
> Right now we use the 'readlock' both for protecting some of the af_unix
> IO path and for making the bind be single-threaded.
>
> The two are independent, but using the same lock makes for a nasty
> deadlock due to ordering with regards to filesystem locking.  The bind
> locking would want to nest outside the VSF pathname locking, but the IO
> locking wants to nest inside some of those same locks.
>
> We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix
> splice-bind deadlock") which moved the readlock inside the vfs locks,
> but that caused problems with overlayfs that will then call back into
> filesystem routines that take the lock in the wrong order anyway.
>
> Splitting the locks means that we can go back to having the bind lock be
> the outermost lock, and we don't have any deadlocks with lock ordering.
>
> Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com>
> Acked-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Oh, this was missing a

  Reported-and-tested-by: CAI Qian <caiqian@redhat.com>

who found the new deadlock.

There's now *another* lockdep deadlock report by him, but that one has
nothing to do with networking.

(And neither of these deadlocks will actually deadlock the machine in
practice, but you can trigger the lockdep reports with some odd splice
patterns and overlayfs use)

                 Linus

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 18:17 ` Linus Torvalds
@ 2016-09-02 19:15   ` David Miller
  2016-09-02 20:00     ` Linus Torvalds
  2016-09-02 20:23     ` Hannes Frederic Sowa
  0 siblings, 2 replies; 7+ messages in thread
From: David Miller @ 2016-09-02 19:15 UTC (permalink / raw)
  To: torvalds; +Cc: hannes, rweikusat, edumazet, w, netdev

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 2 Sep 2016 11:17:18 -0700

> Oh, this was missing a
> 
>   Reported-and-tested-by: CAI Qian <caiqian@redhat.com>
> 
> who found the new deadlock.
> 
> There's now *another* lockdep deadlock report by him, but that one has
> nothing to do with networking.
> 
> (And neither of these deadlocks will actually deadlock the machine in
> practice, but you can trigger the lockdep reports with some odd splice
> patterns and overlayfs use)

I read over this and can't find any problems.

The main thing I was concerned about was an I/O path that really
expects the socket's hash not to change for whatever reason, but even
all of the unix_find_other() calls are done outside of the mutex
already.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 19:15   ` David Miller
@ 2016-09-02 20:00     ` Linus Torvalds
  2016-09-02 20:23     ` Hannes Frederic Sowa
  1 sibling, 0 replies; 7+ messages in thread
From: Linus Torvalds @ 2016-09-02 20:00 UTC (permalink / raw)
  To: David Miller
  Cc: Hannes Frederic Sowa, Rainer Weikusat, Eric Dumazet,
	Willy Tarreau, Network Development

On Fri, Sep 2, 2016 at 12:15 PM, David Miller <davem@davemloft.net> wrote:
>
> The main thing I was concerned about was an I/O path that really
> expects the socket's hash not to change for whatever reason, but even
> all of the unix_find_other() calls are done outside of the mutex
> already.

That's my read of it too. I did actually go through the effort on
trying to follow all the locking in there yesterday, so while I would
want an AF_UNIX person to double-check me, I think it's fine.

The real locking seem to be the u->lock spinlock (taken by
unix_state_lock() and friends).

The bindlock is only about serializing the binders, which do things
that can sleep (notably the mknod of the unix socket node in the
filesystem).

The one thing I looked at and wasn't sure about was
unix_stream_connect(), which creates a *new* unix domain socket and
binds it to the old one without holding the bindlock. Note that it
never did hold the lock, so that's not a new thing, but it worries me
a bit.

I *think* it's ok, because I think this all happens before that new
socket is actually reachable, so it cannot race against somebody else
doing a bind. And my patch doesn't actually change anything in this
area, so I'm not making it worse. But it was the one thing I reacted
to when going through the locking there.

(Well, not the "one thing". The other thing I reacted to was the
overall reaction that none of this is documented, and the locking was
clearly not very clear).

                 Linus

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 19:15   ` David Miller
  2016-09-02 20:00     ` Linus Torvalds
@ 2016-09-02 20:23     ` Hannes Frederic Sowa
  1 sibling, 0 replies; 7+ messages in thread
From: Hannes Frederic Sowa @ 2016-09-02 20:23 UTC (permalink / raw)
  To: David Miller, torvalds; +Cc: rweikusat, edumazet, w, netdev

On 02.09.2016 21:15, David Miller wrote:
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Fri, 2 Sep 2016 11:17:18 -0700
> 
>> Oh, this was missing a
>>
>>   Reported-and-tested-by: CAI Qian <caiqian@redhat.com>
>>
>> who found the new deadlock.
>>
>> There's now *another* lockdep deadlock report by him, but that one has
>> nothing to do with networking.
>>
>> (And neither of these deadlocks will actually deadlock the machine in
>> practice, but you can trigger the lockdep reports with some odd splice
>> patterns and overlayfs use)
> 
> I read over this and can't find any problems.
> 
> The main thing I was concerned about was an I/O path that really
> expects the socket's hash not to change for whatever reason, but even
> all of the unix_find_other() calls are done outside of the mutex
> already.

Furthermore we don't allow rehashing, if the socket is "published" once,
it can only be destroyed but can't change identity during its lifetime.
IIRC this was another bug we had a while ago.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 18:13 [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Linus Torvalds
  2016-09-02 18:17 ` Linus Torvalds
@ 2016-09-02 20:25 ` Hannes Frederic Sowa
  2016-09-04 20:29 ` David Miller
  2 siblings, 0 replies; 7+ messages in thread
From: Hannes Frederic Sowa @ 2016-09-02 20:25 UTC (permalink / raw)
  To: Linus Torvalds, David S. Miller
  Cc: Rainer Weikusat, Eric Dumazet, willy tarreau, netdev

On 02.09.2016 20:13, Linus Torvalds wrote:
> 
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Thu, 1 Sep 2016 14:43:53 -0700
> Subject: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
> 
> Right now we use the 'readlock' both for protecting some of the af_unix
> IO path and for making the bind be single-threaded.
> 
> The two are independent, but using the same lock makes for a nasty
> deadlock due to ordering with regards to filesystem locking.  The bind
> locking would want to nest outside the VSF pathname locking, but the IO
> locking wants to nest inside some of those same locks.
> 
> We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix
> splice-bind deadlock") which moved the readlock inside the vfs locks,
> but that caused problems with overlayfs that will then call back into
> filesystem routines that take the lock in the wrong order anyway.
> 
> Splitting the locks means that we can go back to having the bind lock be
> the outermost lock, and we don't have any deadlocks with lock ordering.
> 
> Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com>
> Acked-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
> ---
> 
> This patch is really trivial, and I've tried to be careful and look at the 
> locking, but somebody who really knows the AF_UNIX code should definitely 
> take a second look.
> 
> Note that I did the revert (that re-introduces the original splice 
> deadlock) first, because that made the whole series much easier to 
> explain. Doing it in the other order made the revert nastier because this 
> patch obviously touches the same code that the revert in 1/2 does.
> 
> So this way the series ends up being "go back to the original code with 
> the original deadlock, and then fix that original deadlock by splitting 
> the bind lock".

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
  2016-09-02 18:13 [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Linus Torvalds
  2016-09-02 18:17 ` Linus Torvalds
  2016-09-02 20:25 ` Hannes Frederic Sowa
@ 2016-09-04 20:29 ` David Miller
  2 siblings, 0 replies; 7+ messages in thread
From: David Miller @ 2016-09-04 20:29 UTC (permalink / raw)
  To: torvalds; +Cc: hannes, rweikusat, edumazet, w, netdev

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 2 Sep 2016 11:13:09 -0700 (PDT)

> 
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Thu, 1 Sep 2016 14:43:53 -0700
> Subject: [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock'
> 
> Right now we use the 'readlock' both for protecting some of the af_unix
> IO path and for making the bind be single-threaded.
> 
> The two are independent, but using the same lock makes for a nasty
> deadlock due to ordering with regards to filesystem locking.  The bind
> locking would want to nest outside the VSF pathname locking, but the IO
> locking wants to nest inside some of those same locks.
> 
> We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix
> splice-bind deadlock") which moved the readlock inside the vfs locks,
> but that caused problems with overlayfs that will then call back into
> filesystem routines that take the lock in the wrong order anyway.
> 
> Splitting the locks means that we can go back to having the bind lock be
> the outermost lock, and we don't have any deadlocks with lock ordering.
> 
> Acked-by: Rainer Weikusat <rweikusat@cyberadapt.com>
> Acked-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Applied.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-09-04 20:37 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-02 18:13 [PATCH 2/2] af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Linus Torvalds
2016-09-02 18:17 ` Linus Torvalds
2016-09-02 19:15   ` David Miller
2016-09-02 20:00     ` Linus Torvalds
2016-09-02 20:23     ` Hannes Frederic Sowa
2016-09-02 20:25 ` Hannes Frederic Sowa
2016-09-04 20:29 ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.