linux-security-module.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: David Howells <dhowells@redhat.com>
To: torvalds@linux-foundation.org
Cc: dhowells@redhat.com, Rasmus Villemoes <linux@rasmusvillemoes.dk>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	nicolas.dichtel@6wind.com, raven@themaw.net,
	Christian Brauner <christian@brauner.io>,
	dhowells@redhat.com, keyrings@vger.kernel.org,
	linux-usb@vger.kernel.org, linux-block@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH 05/11] pipe: Allow pipes to have kernel-reserved slots [ver #3]
Date: Fri, 01 Nov 2019 17:34:46 +0000	[thread overview]
Message-ID: <157262968684.13142.910827322486465736.stgit@warthog.procyon.org.uk> (raw)
In-Reply-To: <157262963995.13142.5568934007158044624.stgit@warthog.procyon.org.uk>

Split pipe->ring_size into two numbers:

 (1) pipe->ring_size - indicates the hard size of the pipe ring.

 (2) pipe->max_usage - indicates the maximum number of pipe ring slots that
     userspace orchestrated events can fill.

This allows for a pipe that is both writable by the general kernel
notification facility and by userspace, allowing plenty of ring space for
notifications to be added whilst preventing userspace from being able to
pin too much unswappable kernel space.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/fuse/dev.c             |    8 ++++----
 fs/pipe.c                 |   10 ++++++----
 fs/splice.c               |   26 +++++++++++++-------------
 include/linux/pipe_fs_i.h |    6 +++++-
 lib/iov_iter.c            |    4 ++--
 5 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c56011f95a87..423b6c657bf0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			cs->pipebufs++;
 			cs->nr_segs--;
 		} else {
-			if (cs->nr_segs >= cs->pipe->ring_size)
+			if (cs->nr_segs >= cs->pipe->max_usage)
 				return -EIO;
 
 			page = alloc_page(GFP_HIGHUSER);
@@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 	struct pipe_buffer *buf;
 	int err;
 
-	if (cs->nr_segs >= cs->pipe->ring_size)
+	if (cs->nr_segs >= cs->pipe->max_usage)
 		return -EIO;
 
 	err = unlock_request(cs->req);
@@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	if (!fud)
 		return -EPERM;
 
-	bufs = kvmalloc_array(pipe->ring_size, sizeof(struct pipe_buffer),
+	bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
 			      GFP_KERNEL);
 	if (!bufs)
 		return -ENOMEM;
@@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	if (ret < 0)
 		goto out;
 
-	if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->ring_size) {
+	if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
 		ret = -EIO;
 		goto out;
 	}
diff --git a/fs/pipe.c b/fs/pipe.c
index e9b361cb093e..69afeab8a73a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -404,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 
 	tail = pipe->tail;
 	head = pipe->head;
-	max_usage = pipe->ring_size;
+	max_usage = pipe->max_usage;
 	mask = pipe->ring_size - 1;
 
 	/* We try to merge small writes */
@@ -571,7 +571,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		if (!pipe_full(head, tail, pipe->ring_size))
+		if (!pipe_full(head, tail, pipe->max_usage))
 			mask |= EPOLLOUT | EPOLLWRNORM;
 		/*
 		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
@@ -696,6 +696,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 	if (pipe->bufs) {
 		init_waitqueue_head(&pipe->wait);
 		pipe->r_counter = pipe->w_counter = 1;
+		pipe->max_usage = pipe_bufs;
 		pipe->ring_size = pipe_bufs;
 		pipe->user = user;
 		mutex_init(&pipe->mutex);
@@ -1150,9 +1151,10 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
 	pipe->ring_size = nr_slots;
+	pipe->max_usage = nr_slots;
 	pipe->tail = tail;
 	pipe->head = head;
-	return pipe->ring_size * PAGE_SIZE;
+	return pipe->max_usage * PAGE_SIZE;
 
 out_revert_acct:
 	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
@@ -1185,7 +1187,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		ret = pipe_set_size(pipe, arg);
 		break;
 	case F_GETPIPE_SZ:
-		ret = pipe->ring_size * PAGE_SIZE;
+		ret = pipe->max_usage * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/fs/splice.c b/fs/splice.c
index 22b0a47a35c0..c521090a0469 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -199,7 +199,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 		goto out;
 	}
 
-	while (!pipe_full(head, tail, pipe->ring_size)) {
+	while (!pipe_full(head, tail, pipe->max_usage)) {
 		struct pipe_buffer *buf = &pipe->bufs[head & mask];
 
 		buf->page = spd->pages[page_nr];
@@ -239,7 +239,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 	if (unlikely(!pipe->readers)) {
 		send_sig(SIGPIPE, current, 0);
 		ret = -EPIPE;
-	} else if (pipe_full(head, tail, pipe->ring_size)) {
+	} else if (pipe_full(head, tail, pipe->max_usage)) {
 		ret = -EAGAIN;
 	} else {
 		pipe->bufs[head & mask] = *buf;
@@ -257,7 +257,7 @@ EXPORT_SYMBOL(add_to_pipe);
  */
 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 {
-	unsigned int max_usage = READ_ONCE(pipe->ring_size);
+	unsigned int max_usage = READ_ONCE(pipe->max_usage);
 
 	spd->nr_pages_max = max_usage;
 	if (max_usage <= PIPE_DEF_BUFFERS)
@@ -381,7 +381,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	ssize_t res;
 	int i;
 
-	if (pipe_full(pipe->head, pipe->tail, pipe->ring_size))
+	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 		return -EAGAIN;
 
 	/*
@@ -698,7 +698,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		.pos = *ppos,
 		.u.file = out,
 	};
-	int nbufs = pipe->ring_size;
+	int nbufs = pipe->max_usage;
 	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 					GFP_KERNEL);
 	ssize_t ret;
@@ -721,9 +721,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		if (ret <= 0)
 			break;
 
-		if (unlikely(nbufs < pipe->ring_size)) {
+		if (unlikely(nbufs < pipe->max_usage)) {
 			kfree(array);
-			nbufs = pipe->ring_size;
+			nbufs = pipe->max_usage;
 			array = kcalloc(nbufs, sizeof(struct bio_vec),
 					GFP_KERNEL);
 			if (!array) {
@@ -963,7 +963,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		loff_t pos = sd->pos, prev_pos = pos;
 
 		/* Don't try to read more the pipe has space for. */
-		p_space = pipe->ring_size -
+		p_space = pipe->max_usage -
 			pipe_occupancy(pipe->head, pipe->tail);
 		read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
 		ret = do_splice_to(in, &pos, pipe, read_len, flags);
@@ -1090,7 +1090,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 			send_sig(SIGPIPE, current, 0);
 			return -EPIPE;
 		}
-		if (!pipe_full(pipe->head, pipe->tail, pipe->ring_size))
+		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 			return 0;
 		if (flags & SPLICE_F_NONBLOCK)
 			return -EAGAIN;
@@ -1498,13 +1498,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check pipe occupancy without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe_full(pipe->head, pipe->tail, pipe->ring_size))
+	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe_full(pipe->head, pipe->tail, pipe->ring_size)) {
+	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1584,7 +1584,7 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 		 * pipe is empty or the output pipe is full.
 		 */
 		if (pipe_empty(i_head, i_tail) ||
-		    pipe_full(o_head, o_tail, opipe->ring_size)) {
+		    pipe_full(o_head, o_tail, opipe->max_usage)) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1706,7 +1706,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * output room, break.
 		 */
 		if (pipe_empty(i_head, i_tail) ||
-		    pipe_full(o_head, o_tail, opipe->ring_size))
+		    pipe_full(o_head, o_tail, opipe->max_usage))
 			break;
 
 		ibuf = &ipipe->bufs[i_tail & i_mask];
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 96158ca80456..44f2245debda 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -32,6 +32,7 @@ struct pipe_buffer {
  *	@wait: reader/writer wait point in case of empty/full pipe
  *	@head: The point of buffer production
  *	@tail: The point of buffer consumption
+ *	@max_usage: The maximum number of slots that may be used in the ring
  *	@ring_size: total number of buffers (should be a power of 2)
  *	@tmp_page: cached released page
  *	@readers: number of current readers of this pipe
@@ -50,6 +51,7 @@ struct pipe_inode_info {
 	wait_queue_head_t wait;
 	unsigned int head;
 	unsigned int tail;
+	unsigned int max_usage;
 	unsigned int ring_size;
 	unsigned int readers;
 	unsigned int writers;
@@ -150,9 +152,11 @@ static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int t
 	unsigned int p_occupancy, p_space;
 
 	p_occupancy = pipe_occupancy(head, tail);
-	if (p_occupancy >= pipe->ring_size)
+	if (p_occupancy >= pipe->max_usage)
 		return 0;
 	p_space = pipe->ring_size - p_occupancy;
+	if (p_space > pipe->max_usage)
+		p_space = pipe->max_usage;
 	return p_space;
 }
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 6607e9c875ce..8225279b96e3 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -395,7 +395,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 		i_head++;
 		buf = &pipe->bufs[i_head & p_mask];
 	}
-	if (pipe_full(i_head, p_tail, pipe->ring_size))
+	if (pipe_full(i_head, p_tail, pipe->max_usage))
 		return 0;
 
 	buf->ops = &page_cache_pipe_buf_ops;
@@ -528,7 +528,7 @@ static size_t push_pipe(struct iov_iter *i, size_t size,
 		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
 		iter_head++;
 	}
-	while (!pipe_full(iter_head, p_tail, pipe->ring_size)) {
+	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
 		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
 		struct page *page = alloc_page(GFP_USER);
 		if (!page)


  parent reply	other threads:[~2019-11-01 17:35 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-11-01 17:34 [RFC PATCH 00/11] pipe: Notification queue preparation [ver #3] David Howells
2019-11-01 17:34 ` [RFC PATCH 01/11] pipe: Reduce #inclusion of pipe_fs_i.h " David Howells
2019-11-01 17:34 ` [RFC PATCH 02/11] Remove the nr_exclusive argument from __wake_up_sync_key() " David Howells
2019-11-01 17:34 ` [RFC PATCH 03/11] Add wake_up_interruptible_sync_poll_locked() " David Howells
2019-11-01 17:34 ` [RFC PATCH 04/11] pipe: Use head and tail pointers for the ring, not cursor and length " David Howells
     [not found]   ` <20191107090306.GV29418@shao2-debian>
2019-11-07 16:51     ` [pipe] d60337eff1: phoronix-test-suite.noise-level.0.activity_level 144.0% improvement Linus Torvalds
2019-11-07 17:42     ` David Howells
2019-11-01 17:34 ` David Howells [this message]
2019-11-01 17:34 ` [RFC PATCH 06/11] pipe: Advance tail pointer inside of wait spinlock in pipe_read() [ver #3] David Howells
2019-11-01 17:35 ` [RFC PATCH 07/11] pipe: Conditionalise wakeup " David Howells
2019-11-01 17:35 ` [RFC PATCH 08/11] pipe: Rearrange sequence in pipe_write() to preallocate slot " David Howells
2019-11-01 17:35 ` [RFC PATCH 09/11] pipe: Remove redundant wakeup from pipe_write() " David Howells
2019-11-01 17:35 ` [RFC PATCH 10/11] pipe: Check for ring full inside of the spinlock in " David Howells
2019-11-01 17:35 ` [RFC PATCH 11/11] pipe: Increase the writer-wakeup threshold to reduce context-switch count " David Howells
2019-11-01 19:24 ` [RFC PATCH 00/11] pipe: Notification queue preparation " Linus Torvalds
2019-11-01 22:05 ` David Howells
2019-11-01 22:12   ` Linus Torvalds
2019-11-05 16:02 ` Details on the UAPI of implementing notifications on pipes David Howells

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=157262968684.13142.910827322486465736.stgit@warthog.procyon.org.uk \
    --to=dhowells@redhat.com \
    --cc=christian@brauner.io \
    --cc=gregkh@linuxfoundation.org \
    --cc=keyrings@vger.kernel.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=linux-usb@vger.kernel.org \
    --cc=linux@rasmusvillemoes.dk \
    --cc=nicolas.dichtel@6wind.com \
    --cc=peterz@infradead.org \
    --cc=raven@themaw.net \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).