linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] pipe bugfix /cleanup
@ 2002-10-03 13:27 Manfred Spraul
  2002-10-04  7:40 ` Alex Riesen
  0 siblings, 1 reply; 3+ messages in thread
From: Manfred Spraul @ 2002-10-03 13:27 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel

Hi Linus,

pipe_write contains a wakeup storm, 2 writers that write into the same 
fifo can wake each other up, and spend 100% cpu time with wakeup/schedule, 
without making any progress.

The patch below was part of -dj, and IIRC even of -ac, for a few months, 
but it's more a rewrite of pipe_read and _write.

Could you merge it, or do you want a minimal fix?

The only regression I'm aware of is that

  $ dd if=/dev/zero | grep not_there

will fail due to OOM, because grep does something like

	for(;;) {
		rlen = read(fd, buf, len);
		if (rlen == len) {
			len *= 2;
			buf = realloc(buf, len);
		}
	}

if it operates on pipes, and due to the improved syscall merging, read 
will always return the maximum possible amount of data. But that's a grep 
bug, not a kernel problem.

The diff is older, but applies to 2.5.40 without offsets.
--
	Manfred
<<<<<<<<<<<<<<

diff -urpN --exclude-from=/home/davej/.exclude bk-linus/fs/pipe.c linux-2.5/fs/pipe.c
--- bk-linus/fs/pipe.c	2002-08-20 13:15:16.000000000 +0100
+++ linux-2.5/fs/pipe.c	2002-08-20 13:25:24.000000000 +0100
@@ -25,6 +25,9 @@
  *
  * FIFOs and Pipes now generate SIGIO for both readers and writers.
  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
+ *
+ * pipe_read & write cleanup
+ * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  */
 
 /* Drop the inode semaphore and wait for a pipe event, atomically */
@@ -44,97 +47,81 @@ static ssize_t
 pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	ssize_t size, read, ret;
+	int do_wakeup;
+	ssize_t ret;
 
-	/* Seeks are not allowed on pipes.  */
-	ret = -ESPIPE;
-	read = 0;
-	if (ppos != &filp->f_pos)
-		goto out_nolock;
+	/* pread is not allowed on pipes. */
+	if (unlikely(ppos != &filp->f_pos))
+		return -ESPIPE;
+	
+	/* Null read succeeds. */
+	if (unlikely(count == 0))
+		return 0;
 
-	/* Always return 0 on null read.  */
+	do_wakeup = 0;
 	ret = 0;
-	if (count == 0)
-		goto out_nolock;
+	down(PIPE_SEM(*inode));
+	for (;;) {
+		int size = PIPE_LEN(*inode);
+		if (size) {
+			char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
+			ssize_t chars = PIPE_MAX_RCHUNK(*inode);
 
-	/* Get the pipe semaphore */
-	ret = -ERESTARTSYS;
-	if (down_interruptible(PIPE_SEM(*inode)))
-		goto out_nolock;
-
-	if (PIPE_EMPTY(*inode)) {
-do_more_read:
-		ret = 0;
-		if (!PIPE_WRITERS(*inode))
-			goto out;
+			if (chars > count)
+				chars = count;
+			if (chars > size)
+				chars = size;
 
-		ret = -EAGAIN;
-		if (filp->f_flags & O_NONBLOCK)
-			goto out;
-
-		for (;;) {
-			PIPE_WAITING_READERS(*inode)++;
-			pipe_wait(inode);
-			PIPE_WAITING_READERS(*inode)--;
-			ret = -ERESTARTSYS;
-			if (signal_pending(current))
-				goto out;
-			ret = 0;
-			if (!PIPE_EMPTY(*inode))
+			if (copy_to_user(buf, pipebuf, chars)) {
+				if (!ret) ret = -EFAULT;
 				break;
-			if (!PIPE_WRITERS(*inode))
-				goto out;
-		}
-	}
+			}
+			ret += chars;
 
-	/* Read what data is available.  */
-	ret = -EFAULT;
-	while (count > 0 && (size = PIPE_LEN(*inode))) {
-		char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
-		ssize_t chars = PIPE_MAX_RCHUNK(*inode);
-
-		if (chars > count)
-			chars = count;
-		if (chars > size)
-			chars = size;
-
-		if (copy_to_user(buf, pipebuf, chars))
-			goto out;
-
-		read += chars;
-		PIPE_START(*inode) += chars;
-		PIPE_START(*inode) &= (PIPE_SIZE - 1);
-		PIPE_LEN(*inode) -= chars;
-		count -= chars;
-		buf += chars;
+			PIPE_START(*inode) += chars;
+			PIPE_START(*inode) &= (PIPE_SIZE - 1);
+			PIPE_LEN(*inode) -= chars;
+			count -= chars;
+			buf += chars;
+			do_wakeup = 1;
+		}
+		if (!count)
+			break;	/* common path: read succeeded */
+		if (PIPE_LEN(*inode)) /* test for cyclic buffers */
+			continue;
+		if (!PIPE_WRITERS(*inode))
+			break;
+		if (!PIPE_WAITING_WRITERS(*inode)) {
+			/* syscall merging: Usually we must not sleep
+			 * if O_NONBLOCK is set, or if we got some data.
+			 * But if a writer sleeps in kernel space, then
+			 * we can wait for that data without violating POSIX.
+			 */
+			if (ret)
+				break;
+			if (filp->f_flags & O_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+		}
+		if (signal_pending(current)) {
+			if (!ret) ret = -ERESTARTSYS;
+			break;
+		}
+		if (do_wakeup) {
+			wake_up_interruptible(PIPE_WAIT(*inode));
+ 			kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+		}
+		pipe_wait(inode);
 	}
-
-	/* Cache behaviour optimization */
-	if (!PIPE_LEN(*inode))
-		PIPE_START(*inode) = 0;
-
-	if (count && PIPE_WAITING_WRITERS(*inode) && !(filp->f_flags & O_NONBLOCK)) {
-		/*
-		 * We know that we are going to sleep: signal
-		 * writers synchronously that there is more
-		 * room.
-		 */
+	up(PIPE_SEM(*inode));
+	/* Signal writers asynchronously that there is more room.  */
+	if (do_wakeup) {
 		wake_up_interruptible_sync(PIPE_WAIT(*inode));
- 		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
-		if (!PIPE_EMPTY(*inode))
-			BUG();
-		goto do_more_read;
+		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
 	}
-	/* Signal writers asynchronously that there is more room.  */
-	wake_up_interruptible(PIPE_WAIT(*inode));
- 	kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
-
-	ret = read;
-out:
-	up(PIPE_SEM(*inode));
-out_nolock:
-	if (read)
-		ret = read;
+	if (ret > 0)
+		UPDATE_ATIME(inode);
 	return ret;
 }
 
@@ -142,116 +129,90 @@ static ssize_t
 pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	ssize_t free, written, ret;
+	ssize_t ret;
+	size_t min;
+	int do_wakeup;
+
+	/* pwrite is not allowed on pipes. */
+	if (unlikely(ppos != &filp->f_pos))
+		return -ESPIPE;
+	
+	/* Null write succeeds. */
+	if (unlikely(count == 0))
+		return 0;
 
-	/* Seeks are not allowed on pipes.  */
-	ret = -ESPIPE;
-	written = 0;
-	if (ppos != &filp->f_pos)
-		goto out_nolock;
-
-	/* Null write succeeds.  */
+	do_wakeup = 0;
 	ret = 0;
-	if (count == 0)
-		goto out_nolock;
-
-	ret = -ERESTARTSYS;
-	if (down_interruptible(PIPE_SEM(*inode)))
-		goto out_nolock;
-
-	/* No readers yields SIGPIPE.  */
-	if (!PIPE_READERS(*inode))
-		goto sigpipe;
-
-	/* If count <= PIPE_BUF, we have to make it atomic.  */
-	free = (count <= PIPE_BUF ? count : 1);
-
-	/* Wait, or check for, available space.  */
-	if (filp->f_flags & O_NONBLOCK) {
-		ret = -EAGAIN;
-		if (PIPE_FREE(*inode) < free)
-			goto out;
-	} else {
-		while (PIPE_FREE(*inode) < free) {
-			PIPE_WAITING_WRITERS(*inode)++;
-			pipe_wait(inode);
-			PIPE_WAITING_WRITERS(*inode)--;
-			ret = -ERESTARTSYS;
-			if (signal_pending(current))
-				goto out;
-
-			if (!PIPE_READERS(*inode))
-				goto sigpipe;
+	min = count;
+	if (min > PIPE_BUF)
+		min = 1;
+	down(PIPE_SEM(*inode));
+	for (;;) {
+		int free;
+		if (!PIPE_READERS(*inode)) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret) ret = -EPIPE;
+			break;
 		}
-	}
-
-	/* Copy into available space.  */
-	ret = -EFAULT;
-	while (count > 0) {
-		int space;
-		char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
-		ssize_t chars = PIPE_MAX_WCHUNK(*inode);
-
-		if ((space = PIPE_FREE(*inode)) != 0) {
+		free = PIPE_FREE(*inode);
+		if (free >= min) {
+			/* transfer data */
+			ssize_t chars = PIPE_MAX_WCHUNK(*inode);
+			char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
+			/* Always wakeup, even if the copy fails. Otherwise
+			 * we lock up (O_NONBLOCK-)readers that sleep due to
+			 * syscall merging.
+			 */
+			do_wakeup = 1;
 			if (chars > count)
 				chars = count;
-			if (chars > space)
-				chars = space;
+			if (chars > free)
+				chars = free;
 
-			if (copy_from_user(pipebuf, buf, chars))
-				goto out;
+			if (copy_from_user(pipebuf, buf, chars)) {
+				if (!ret) ret = -EFAULT;
+				break;
+			}
 
-			written += chars;
+			ret += chars;
 			PIPE_LEN(*inode) += chars;
 			count -= chars;
 			buf += chars;
-			space = PIPE_FREE(*inode);
+		}
+		if (!count)
+			break;
+		if (PIPE_FREE(*inode) && ret) {
+			/* handle cyclic data buffers */
+			min = 1;
 			continue;
 		}
-
-		ret = written;
-		if (filp->f_flags & O_NONBLOCK)
+		if (filp->f_flags & O_NONBLOCK) {
+			if (!ret) ret = -EAGAIN;
 			break;
-
-		do {
-			/*
-			 * Synchronous wake-up: it knows that this process
-			 * is going to give up this CPU, so it doesn't have
-			 * to do idle reschedules.
-			 */
+		}
+		if (signal_pending(current)) {
+			if (!ret) ret = -ERESTARTSYS;
+			break;
+		}
+		if (do_wakeup) {
 			wake_up_interruptible_sync(PIPE_WAIT(*inode));
- 			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
-			PIPE_WAITING_WRITERS(*inode)++;
-			pipe_wait(inode);
-			PIPE_WAITING_WRITERS(*inode)--;
-			if (signal_pending(current))
-				goto out;
-			if (!PIPE_READERS(*inode))
-				goto sigpipe;
-		} while (!PIPE_FREE(*inode));
-		ret = -EFAULT;
+			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+			do_wakeup = 0;
+		}
+		PIPE_WAITING_WRITERS(*inode)++;
+		pipe_wait(inode);
+		PIPE_WAITING_WRITERS(*inode)--;
 	}
-
-	/* Signal readers asynchronously that there is more data.  */
-	wake_up_interruptible(PIPE_WAIT(*inode));
-	kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
-
-	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	mark_inode_dirty(inode);
-
-out:
 	up(PIPE_SEM(*inode));
-out_nolock:
-	if (written)
-		ret = written;
+	if (do_wakeup) {
+		wake_up_interruptible(PIPE_WAIT(*inode));
+		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+	}
+	if (ret > 0) {
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+	}
 	return ret;
-
-sigpipe:
-	if (written)
-		goto out;
-	up(PIPE_SEM(*inode));
-	send_sig(SIGPIPE, current, 0);
-	return -EPIPE;
 }
 
 static ssize_t
@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* ino
 	PIPE_BASE(*inode) = (char*) page;
 	PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
 	PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
-	PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0;
+	PIPE_WAITING_WRITERS(*inode) = 0;
 	PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
 	*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;
 
diff -urpN --exclude-from=/home/davej/.exclude bk-linus/include/linux/pipe_fs_i.h linux-2.5/include/linux/pipe_fs_i.h
--- bk-linus/include/linux/pipe_fs_i.h	2002-08-20 13:15:58.000000000 +0100
+++ linux-2.5/include/linux/pipe_fs_i.h	2002-08-20 13:26:25.000000000 +0100
@@ -9,7 +9,6 @@ struct pipe_inode_info {
 	unsigned int start;
 	unsigned int readers;
 	unsigned int writers;
-	unsigned int waiting_readers;
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
@@ -28,7 +27,6 @@ struct pipe_inode_info {
 #define PIPE_LEN(inode)		((inode).i_pipe->len)
 #define PIPE_READERS(inode)	((inode).i_pipe->readers)
 #define PIPE_WRITERS(inode)	((inode).i_pipe->writers)
-#define PIPE_WAITING_READERS(inode)	((inode).i_pipe->waiting_readers)
 #define PIPE_WAITING_WRITERS(inode)	((inode).i_pipe->waiting_writers)
 #define PIPE_RCOUNTER(inode)	((inode).i_pipe->r_counter)
 #define PIPE_WCOUNTER(inode)	((inode).i_pipe->w_counter)
-- 
<<<<<<<<<<<<<<


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] pipe bugfix /cleanup
  2002-10-03 13:27 [PATCH] pipe bugfix /cleanup Manfred Spraul
@ 2002-10-04  7:40 ` Alex Riesen
  2002-10-04 13:46   ` Manfred Spraul
  0 siblings, 1 reply; 3+ messages in thread
From: Alex Riesen @ 2002-10-04  7:40 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel

On Thu, Oct 03, 2002 at 03:27:49PM +0200, Manfred Spraul wrote:
> Hi Linus,
> 
> pipe_write contains a wakeup storm, 2 writers that write into the same 
> fifo can wake each other up, and spend 100% cpu time with wakeup/schedule, 
> without making any progress.
> 
> The patch below was part of -dj, and IIRC even of -ac, for a few months, 
> but it's more a rewrite of pipe_read and _write.
> 
> Could you merge it, or do you want a minimal fix?
> 
> The only regression I'm aware of is that
> 
>   $ dd if=/dev/zero | grep not_there

This makes 2.4.19 + Con Kolivas patch behave very bad.
The system goes slow, freeze, than wakes up, freeze again, etc.

I did "cat /dev/zero | grep nothing".

   procs                      memory    swap          io     system         cpu
 r  b  w   swpd   free   buff  cache  si  so    bi    bo   in    cs  us  sy  id
 0  0  0 124592 292056  60624  70488   0   1    27    19    7     2   5   1   2
 0  0  0 124592 292048  60632  70488   0   0     0    12  132   141   0   0 100
 0  0  0 124588 292048  60632  70488   0   0     0     0  126   109   0   0 100
 0  0  0 124588 292048  60632  70488   0   0     0     0  126   108   0   0 100
 0  0  0 124588 292048  60632  70488   0   0     0     0  127   102   0   0 100
 0  0  0 124588 292048  60632  70488   0   0     0     0  130   122   1   0  99
 0  0  0 124568 292004  60632  70488  28   0    28     0  151  1836   5   1  94
 1  0  0 124552 292004  60632  70488   0   0     0     0  150   530   6   1  93
 0  0  0 124552 292004  60632  70488   0   0     0     0  130   496   4   1  95
 0  0  0 124552 292004  60632  70488   0   0     0     0  138  1935   4   3  93
 0  0  0 124528 290956  60632  70488   0   0     0     0  250  3075  17   2  81
 0  0  0 124528 290948  60640  70488   0   0     0    44  143   197   0   0 100
 0  0  0 124516 290948  60640  70488   0   0     0     0  134   170   0   0 100
 1  0  0 124516 290952  60644  70488   0   0     4     0  143   186   0   1  99
 1  0  0 124516 290944  60644  70488   0   0     0     0  126   104   0   0 100
 2  0  0 124516 208760  60644  70488   4   0     4     0  136 37577  39  45  17
 2  0  0 124516 126544  60652  70488   0   0     0    12  123 16596  72  28   0
 2  0  1 253164   4832  54036  70120  16 660    16   660  148 69994   0 100   0
 1  1  2 439472   3264  53880  70120   0 20704     0 20696  398   978   0  28  72
 0  1  1 438920   3656  53088  70120   0 19496     0 19496  450   722   0   5  95
 0  1  1 438596   3716  53024  70120   0 19944     0 19944  442   134   0   7  93
 0  1  2 438364   3712  53004  70132   0 19944    20 19944  507   252   0   5  95
 1  1  3 438104   3784  52932  70132   0 19804     0 19804  442   171   0   1  99
 0  2  1 437884   3804  52892  70132   8 17956     8 17956  452   219   0   5  95
 0  2  1 437660   3736  52860  70132 100 18124   100 18124  435   205   0   1  99
 0  2  1 437440   3168  52796  70132   4 20392     4 20392  493   542   0   5  95
 0  2  2 437328   3144  52760  70168   0 19976     0 20180  602   341   1   1  98
10  0  6 143432 294816  52384  70168   8 128296     8 128436 2961   502   0   5  95
 1  0  0 133448 307256  52320  70168 352   0   356    44  225   330   0   1  99
 1  0  0 133372 307256  52320  70168   0   0     0     0  130   109   0   0 100
 1  0  0 133296 307240  52336  70168   0   0     0    56  123   101   0   0 100
 1  0  0 133244 307240  52336  70168   0   0     0     0  119    96   0   0 100
 2  0  0 132952 307240  52336  70168   0   0     0     0  124   309   1   1  98


> will fail due to OOM, because grep does something like
> 
> 	for(;;) {
> 		rlen = read(fd, buf, len);
> 		if (rlen == len) {
> 			len *= 2;
> 			buf = realloc(buf, len);
> 		}
> 	}
> 
> if it operates on pipes, and due to the improved syscall merging, read 
> will always return the maximum possible amount of data. But that's a grep 
> bug, not a kernel problem.
> 
> The diff is older, but applies to 2.5.40 without offsets.

doesn't apply to 2.4.19

-alex

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] pipe bugfix /cleanup
  2002-10-04  7:40 ` Alex Riesen
@ 2002-10-04 13:46   ` Manfred Spraul
  0 siblings, 0 replies; 3+ messages in thread
From: Manfred Spraul @ 2002-10-04 13:46 UTC (permalink / raw)
  To: Alexander.Riesen; +Cc: linux-kernel

Alex Riesen wrote:
> 
> This makes 2.4.19 + Con Kolivas patch behave very bad.
> The system goes slow, freeze, than wakes up, freeze again, etc.
> 
> I did "cat /dev/zero | grep nothing".
> 
>    procs                      memory    swap          io     system         cpu
>  r  b  w   swpd   free   buff  cache  si  so    bi    bo   in    cs  us  sy  id
>  0  0  0 124592 292056  60624  70488   0   1    27    19    7     2   5   1   2
[snip]
>  0  2  1 437440   3168  52796  70132   4 20392     4 20392  493   542   0   5  95
>  0  2  2 437328   3144  52760  70168   0 19976     0 20180  602   341   1   1  98
> 10  0  6 143432 294816  52384  70168   8 128296     8 128436 2961   502   0   5  95
>  1  0  0 133448 307256  52320  70168 352   0   356    44  225   330   0   1  99

As I wrote, this is a grep problem. I assume that regexp are more 
efficient over long chunks, but there must be a limit - regexp over 
swapped out memory are slow ;-)

I'll send a bugreport to the grep maintainers.

> 
> doesn't apply to 2.4.19
> 

Yes. likely/unlikely changes, and one additional change log entry.


--
	Manfred


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2002-10-04 13:41 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-10-03 13:27 [PATCH] pipe bugfix /cleanup Manfred Spraul
2002-10-04  7:40 ` Alex Riesen
2002-10-04 13:46   ` Manfred Spraul

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).