linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Linus Torvalds <torvalds@linux-foundation.org>
To: Josef Bacik <josef@toxicpanda.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-fsdevel <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH] pipe: fix hang when racing with a wakeup
Date: Thu, 1 Oct 2020 11:38:29 -0700	[thread overview]
Message-ID: <CAHk-=whwZxj0WdGk2ryax574ut1xPq-=12DcFxZgq9rmCBdDbg@mail.gmail.com> (raw)
In-Reply-To: <eb829164-8035-92ee-e7ba-8e6b062ab1d8@toxicpanda.com>

[-- Attachment #1: Type: text/plain, Size: 724 bytes --]

On Thu, Oct 1, 2020 at 10:41 AM Josef Bacik <josef@toxicpanda.com> wrote:
>
> Obviously not ideal, but I figured the simpler fix was better for stable, and
> then we could work out something better.

I think the attached is the proper fix, and it's not really any more
complicated.

The patch is bigger, but it's pretty obvious: get rid of the
non-specific "pipe_wait()", and replace them with specific versions
that wait for a particular thing.

NOTE! Entirely untested. It seems to build fine for me, and it _looks_
obvious, but I haven't actually rebooted to see if it works at all. I
don't think I have any real splice-heavy test cases.

Mind trying this out on the load that showed problems?

                    Linus

[-- Attachment #2: patch --]
[-- Type: application/octet-stream, Size: 4712 bytes --]

 fs/pipe.c                 | 62 +++++++++++++++++++++++++++++++----------------
 fs/splice.c               |  8 +++---
 include/linux/pipe_fs_i.h |  5 ++--
 3 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 60dbee457143..7639e1117521 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
 	}
 }
 
-/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct pipe_inode_info *pipe)
-{
-	DEFINE_WAIT(rdwait);
-	DEFINE_WAIT(wrwait);
-
-	/*
-	 * Pipes are system-local resources, so sleeping on them
-	 * is considered a noninteractive wait:
-	 */
-	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
-	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
-	pipe_unlock(pipe);
-	schedule();
-	finish_wait(&pipe->rd_wait, &rdwait);
-	finish_wait(&pipe->wr_wait, &wrwait);
-	pipe_lock(pipe);
-}
-
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
@@ -1035,12 +1016,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 	return do_pipe2(fildes, 0);
 }
 
+/*
+ * This is the stupid "wait for pipe to be readable or writable"
+ * model.
+ *
+ * See pipe_read/write() for the proper kind of exclusive wait,
+ * but that requires that we wake up any other readers/writers
+ * if we then do not end up reading everything (ie the whole
+ * "wake_next_reader/writer" logic in pipe_read/write()).
+ */
+void pipe_wait_readable(struct pipe_inode_info *pipe)
+{
+	pipe_unlock(pipe);
+	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
+	pipe_lock(pipe);
+}
+
+void pipe_wait_writable(struct pipe_inode_info *pipe)
+{
+	pipe_unlock(pipe);
+	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
+	pipe_lock(pipe);
+}
+
+/*
+ * This depends on both the (here) wait and the wakeup (wake_up_partner)
+ * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
+ * race with the count check and waitqueue prep.
+ *
+ * Normally in order to avoid races, you'd do the prepare_to_wait() first,
+ * then check the condition you're waiting for, and only then sleep. But
+ * because of the pipe lock, we can check the condition before being on
+ * the wait queue.
+ *
+ * We use the 'rd_wait' waitqueue for pipe partner waiting.
+ */
 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 {
+	DEFINE_WAIT(rdwait);
 	int cur = *cnt;
 
 	while (cur == *cnt) {
-		pipe_wait(pipe);
+		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+		pipe_unlock(pipe);
+		schedule();
+		finish_wait(&pipe->rd_wait, &rdwait);
+		pipe_lock(pipe);
 		if (signal_pending(current))
 			break;
 	}
@@ -1050,7 +1071,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 static void wake_up_partner(struct pipe_inode_info *pipe)
 {
 	wake_up_interruptible_all(&pipe->rd_wait);
-	wake_up_interruptible_all(&pipe->wr_wait);
 }
 
 static int fifo_open(struct inode *inode, struct file *filp)
diff --git a/fs/splice.c b/fs/splice.c
index d7c8a7c4db07..c3d00dfc7344 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
 			sd->need_wakeup = false;
 		}
 
-		pipe_wait(pipe);
+		pipe_wait_readable(pipe);
 	}
 
 	return 1;
@@ -1077,7 +1077,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 			return -EAGAIN;
 		if (signal_pending(current))
 			return -ERESTARTSYS;
-		pipe_wait(pipe);
+		pipe_wait_writable(pipe);
 	}
 }
 
@@ -1454,7 +1454,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 			ret = -EAGAIN;
 			break;
 		}
-		pipe_wait(pipe);
+		pipe_wait_readable(pipe);
 	}
 
 	pipe_unlock(pipe);
@@ -1493,7 +1493,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 			ret = -ERESTARTSYS;
 			break;
 		}
-		pipe_wait(pipe);
+		pipe_wait_writable(pipe);
 	}
 
 	pipe_unlock(pipe);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 50afd0d0084c..5d2705f1d01c 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -240,8 +240,9 @@ extern unsigned int pipe_max_size;
 extern unsigned long pipe_user_pages_hard;
 extern unsigned long pipe_user_pages_soft;
 
-/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct pipe_inode_info *pipe);
+/* Wait for a pipe to be readable/writable while dropping the pipe lock */
+void pipe_wait_readable(struct pipe_inode_info *);
+void pipe_wait_writable(struct pipe_inode_info *);
 
 struct pipe_inode_info *alloc_pipe_info(void);
 void free_pipe_info(struct pipe_inode_info *);

  reply	other threads:[~2020-10-01 18:39 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-01 15:58 [PATCH] pipe: fix hang when racing with a wakeup Josef Bacik
2020-10-01 17:28 ` Linus Torvalds
2020-10-01 17:41   ` Josef Bacik
2020-10-01 18:38     ` Linus Torvalds [this message]
2020-10-01 19:52       ` Josef Bacik
2020-10-01 21:08       ` Josef Bacik

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAHk-=whwZxj0WdGk2ryax574ut1xPq-=12DcFxZgq9rmCBdDbg@mail.gmail.com' \
    --to=torvalds@linux-foundation.org \
    --cc=akpm@linux-foundation.org \
    --cc=josef@toxicpanda.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).