Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHSET v2] io_uring: support fileset add/remove/modify
@ 2019-10-04 16:22 Jens Axboe
  2019-10-04 16:22 ` [PATCH 1/2] io_uring: allow sparse fixed file sets Jens Axboe
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Jens Axboe @ 2019-10-04 16:22 UTC (permalink / raw)
  To: linux-block; +Cc: jmoyer

Currently we only support registrering a fixed file set. If changes need
to be made to that set, the application must unregister the existing set
first, then register a new one.

This patchset adds support for sparse file sets (patch 1), which means
the application can register a fileset with room for expansion. This is
done through having unregistered slots use fd == -1.

On top of that, we can add IORING_REGISTER_FILES_UPDATE. This allows
modifying the existing file set through:

- Replacing empty slots with valid file descriptors
- Replacing valid descriptors with an empty slot
- Modifying an existing slot, replacing a file descriptor with a new one

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] io_uring: allow sparse fixed file sets
  2019-10-04 16:22 [PATCHSET v2] io_uring: support fileset add/remove/modify Jens Axboe
@ 2019-10-04 16:22 ` Jens Axboe
  2019-10-04 16:22 ` [PATCH 2/2] io_uring: add support for IORING_REGISTER_FILES_UPDATE Jens Axboe
  2019-10-04 18:17 ` [PATCHSET v2] io_uring: support fileset add/remove/modify Jeff Moyer
  2 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2019-10-04 16:22 UTC (permalink / raw)
  To: linux-block; +Cc: jmoyer, Jens Axboe

This is in preparation for allowing updates to fixed file sets without
requiring a full unregister+register.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 83a07a47683d..6d4f1394cfca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2319,6 +2319,8 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 		if (unlikely(!ctx->user_files ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
+		if (!ctx->user_files[fd])
+			return -EBADF;
 		req->file = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
@@ -2937,7 +2939,8 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_user_files; i++)
-		fput(ctx->user_files[i]);
+		if (ctx->user_files[i])
+			fput(ctx->user_files[i]);
 #endif
 }
 
@@ -3001,7 +3004,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	struct sock *sk = ctx->ring_sock->sk;
 	struct scm_fp_list *fpl;
 	struct sk_buff *skb;
-	int i;
+	int i, nr_files;
 
 	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
 		unsigned long inflight = ctx->user->unix_inflight + nr;
@@ -3023,19 +3026,26 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	skb->sk = sk;
 	skb->destructor = io_destruct_skb;
 
+	nr_files = 0;
 	fpl->user = get_uid(ctx->user);
 	for (i = 0; i < nr; i++) {
-		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
-		unix_inflight(fpl->user, fpl->fp[i]);
+		if (!ctx->user_files[i + offset])
+			continue;
+		fpl->fp[nr_files] = get_file(ctx->user_files[i + offset]);
+		unix_inflight(fpl->user, fpl->fp[nr_files]);
+		nr_files++;
 	}
 
-	fpl->max = fpl->count = nr;
-	UNIXCB(skb).fp = fpl;
-	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
-	skb_queue_head(&sk->sk_receive_queue, skb);
+	if (nr_files) {
+		fpl->max = fpl->count = nr_files;
+		UNIXCB(skb).fp = fpl;
+		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+		skb_queue_head(&sk->sk_receive_queue, skb);
 
-	for (i = 0; i < nr; i++)
-		fput(fpl->fp[i]);
+		for (i = 0; i < nr_files; i++)
+			fput(fpl->fp[i]);
+	} else
+		kfree_skb(skb);
 
 	return 0;
 }
@@ -3066,7 +3076,8 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 		return 0;
 
 	while (total < ctx->nr_user_files) {
-		fput(ctx->user_files[total]);
+		if (ctx->user_files[total])
+			fput(ctx->user_files[total]);
 		total++;
 	}
 
@@ -3097,10 +3108,15 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!ctx->user_files)
 		return -ENOMEM;
 
-	for (i = 0; i < nr_args; i++) {
+	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
 		ret = -EFAULT;
 		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
 			break;
+		/* allow sparse sets */
+		if (fd == -1) {
+			ret = 0;
+			continue;
+		}
 
 		ctx->user_files[i] = fget(fd);
 
@@ -3118,13 +3134,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(ctx->user_files[i]);
 			break;
 		}
-		ctx->nr_user_files++;
 		ret = 0;
 	}
 
 	if (ret) {
 		for (i = 0; i < ctx->nr_user_files; i++)
-			fput(ctx->user_files[i]);
+			if (ctx->user_files[i])
+				fput(ctx->user_files[i]);
 
 		kfree(ctx->user_files);
 		ctx->user_files = NULL;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 2/2] io_uring: add support for IORING_REGISTER_FILES_UPDATE
  2019-10-04 16:22 [PATCHSET v2] io_uring: support fileset add/remove/modify Jens Axboe
  2019-10-04 16:22 ` [PATCH 1/2] io_uring: allow sparse fixed file sets Jens Axboe
@ 2019-10-04 16:22 ` Jens Axboe
  2019-10-04 18:17 ` [PATCHSET v2] io_uring: support fileset add/remove/modify Jeff Moyer
  2 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2019-10-04 16:22 UTC (permalink / raw)
  To: linux-block; +Cc: jmoyer, Jens Axboe

Allows the application to remove/replace/add files to/from a file set.
Passes in a struct:

struct io_uring_files_update {
	__u32 offset;
	__s32 *fds;
};

that holds an array of fds, size of array passed in through the usual
nr_args part of the io_uring_register() system call. The logic is as
follows:

1) If ->fds[i] is -1, the existing file at i + ->offset is removed from
   the set.
2) If ->fds[i] is a valid fd, the existing file at i + ->offset is
   replaced with ->fds[i].

For case #2, is the existing file is currently empty (fd == -1), the
new fd is simply added to the array.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 168 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |   6 ++
 2 files changed, 174 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6d4f1394cfca..8afb0b689523 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3155,6 +3155,171 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
+{
+#if defined(CONFIG_UNIX)
+	struct file *file = ctx->user_files[index];
+	struct sock *sock = ctx->ring_sock->sk;
+	struct sk_buff_head list, *head = &sock->sk_receive_queue;
+	struct sk_buff *skb;
+	int i;
+
+	__skb_queue_head_init(&list);
+
+	/*
+	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
+	 * remove this entry and rearrange the file array.
+	 */
+	skb = skb_dequeue(head);
+	while (skb) {
+		struct scm_fp_list *fp;
+
+		fp = UNIXCB(skb).fp;
+		for (i = 0; i < fp->count; i++) {
+			int left;
+
+			if (fp->fp[i] != file)
+				continue;
+
+			unix_notinflight(fp->user, fp->fp[i]);
+			left = fp->count - 1 - i;
+			if (left) {
+				memmove(&fp->fp[i], &fp->fp[i + 1],
+						left * sizeof(struct file *));
+			}
+			fp->count--;
+			if (!fp->count) {
+				kfree_skb(skb);
+				skb = NULL;
+			} else if (skb_peek(&list)) {
+				spin_lock_irq(&head->lock);
+				__skb_queue_tail(&list, skb);
+				while ((skb = __skb_dequeue(&list)) != NULL)
+					__skb_queue_tail(head, skb);
+				spin_unlock_irq(&head->lock);
+			}
+			file = NULL;
+			break;
+		}
+
+		if (!file)
+			break;
+
+		__skb_queue_tail(&list, skb);
+
+		skb = skb_dequeue(head);
+	}
+#else
+	fput(ctx->user_files[index]);
+#endif
+}
+
+static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
+				int index)
+{
+#if defined(CONFIG_UNIX)
+	struct sock *sock = ctx->ring_sock->sk;
+	struct sk_buff_head *head = &sock->sk_receive_queue;
+	struct sk_buff *skb;
+
+	/*
+	 * See if we can merge this file into an existing skb SCM_RIGHTS
+	 * file set. If there's no room, fall back to allocating a new skb
+	 * and filling it in.
+	 */
+	spin_lock_irq(&head->lock);
+	skb = skb_peek(head);
+	if (skb) {
+		struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+		if (fpl->count < SCM_MAX_FD) {
+			__skb_unlink(skb, head);
+			spin_unlock_irq(&head->lock);
+			fpl->fp[fpl->count] = get_file(file);
+			unix_inflight(fpl->user, fpl->fp[fpl->count]);
+			fpl->count++;
+			spin_lock_irq(&head->lock);
+			__skb_queue_head(head, skb);
+		} else {
+			skb = NULL;
+		}
+	}
+	spin_unlock_irq(&head->lock);
+
+	if (skb) {
+		fput(file);
+		return 0;
+	}
+
+	return __io_sqe_files_scm(ctx, 1, index);
+#else
+	return 0;
+#endif
+}
+
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+			       unsigned nr_args)
+{
+	struct io_uring_files_update up;
+	int fd, i, err, done;
+	__s32 __user *fds;
+
+	if (!ctx->user_files)
+		return -ENXIO;
+	if (!nr_args)
+		return -EINVAL;
+	if (copy_from_user(&up, arg, sizeof(up)))
+		return -EFAULT;
+	if (up.offset + nr_args > ctx->nr_user_files)
+		return -EINVAL;
+
+	done = 0;
+	i = up.offset;
+	fds = (__s32 __user *) up.fds;
+	while (nr_args) {
+		err = 0;
+		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+			err = -EFAULT;
+			break;
+		}
+		if (ctx->user_files[i]) {
+			io_sqe_file_unregister(ctx, i);
+			ctx->user_files[i] = NULL;
+		}
+		if (fd != -1) {
+			struct file *file;
+
+			file = fget(fd);
+			if (!file) {
+				err = -EBADF;
+				break;
+			}
+			/*
+			 * Don't allow io_uring instances to be registered. If
+			 * UNIX isn't enabled, then this causes a reference
+			 * cycle and this instance can never get freed. If UNIX
+			 * is enabled we'll handle it just fine, but there's
+			 * still no point in allowing a ring fd as it doesn't
+			 * support regular read/write anyway.
+			 */
+			if (file->f_op == &io_uring_fops) {
+				fput(file);
+				err = -EBADF;
+				break;
+			}
+			ctx->user_files[i] = file;
+			err = io_sqe_file_register(ctx, file, i);
+			if (err)
+				break;
+		}
+		nr_args--;
+		done++;
+		i++;
+	}
+
+	return done ? done : err;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
@@ -3969,6 +4134,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sqe_files_unregister(ctx);
 		break;
+	case IORING_REGISTER_FILES_UPDATE:
+		ret = io_sqe_files_update(ctx, arg, nr_args);
+		break;
 	case IORING_REGISTER_EVENTFD:
 		ret = -EINVAL;
 		if (nr_args != 1)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ea57526a5b89..4f532d9c0554 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -150,5 +150,11 @@ struct io_uring_params {
 #define IORING_UNREGISTER_FILES		3
 #define IORING_REGISTER_EVENTFD		4
 #define IORING_UNREGISTER_EVENTFD	5
+#define IORING_REGISTER_FILES_UPDATE	6
+
+struct io_uring_files_update {
+	__u32 offset;
+	__s32 *fds;
+};
 
 #endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCHSET v2] io_uring: support fileset add/remove/modify
  2019-10-04 16:22 [PATCHSET v2] io_uring: support fileset add/remove/modify Jens Axboe
  2019-10-04 16:22 ` [PATCH 1/2] io_uring: allow sparse fixed file sets Jens Axboe
  2019-10-04 16:22 ` [PATCH 2/2] io_uring: add support for IORING_REGISTER_FILES_UPDATE Jens Axboe
@ 2019-10-04 18:17 ` Jeff Moyer
  2019-10-04 18:52   ` Jens Axboe
  2 siblings, 1 reply; 5+ messages in thread
From: Jeff Moyer @ 2019-10-04 18:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

Jens Axboe <axboe@kernel.dk> writes:

> Currently we only support registrering a fixed file set. If changes need
> to be made to that set, the application must unregister the existing set
> first, then register a new one.
>
> This patchset adds support for sparse file sets (patch 1), which means
> the application can register a fileset with room for expansion. This is
> done through having unregistered slots use fd == -1.
>
> On top of that, we can add IORING_REGISTER_FILES_UPDATE. This allows
> modifying the existing file set through:
>
> - Replacing empty slots with valid file descriptors
> - Replacing valid descriptors with an empty slot
> - Modifying an existing slot, replacing a file descriptor with a new one

I don't pretend to understand the socket code you wrote.  The io_uring
bits look good to me.  I also added a testcase to your file-register.c
program--diff below.  The test passes, of course.  :)

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>

Cheers,
Jeff

diff --git a/test/file-register.c b/test/file-register.c
index b25f0f5..322545f 100644
--- a/test/file-register.c
+++ b/test/file-register.c
@@ -358,6 +358,40 @@ err:
 	return 1;
 }
 
+/*
+ * Register 0 files, but reserve space for 10.  Then add one file.
+ */
+static int test_zero(struct io_uring *ring)
+{
+	struct io_uring_files_update up;
+	int *files;
+	int ret;
+
+	files = open_files(0, 10, 0);
+	ret = io_uring_register(ring->ring_fd, IORING_REGISTER_FILES, files, 10);
+	if (ret)
+		goto err;
+
+	up.fds = open_files(1, 0, 1);
+	up.offset = 0;
+	ret = io_uring_register(ring->ring_fd,
+				IORING_REGISTER_FILES_UPDATE, &up, 1);
+	if (ret != 1) {
+		printf("ret=%d, errno=%d\n", ret, errno);
+		goto err;
+	}
+
+	ret = io_uring_register(ring->ring_fd, IORING_UNREGISTER_FILES, NULL, 0);
+	if (ret)
+		goto err;
+
+	close_files(up.fds, 1, 1);
+	return 0;
+err:
+	close_files(up.fds, 1, 1);
+	return 1;
+}
+
 int main(int argc, char *argv[])
 {
 	struct io_uring ring;
@@ -426,5 +460,11 @@ int main(int argc, char *argv[])
 		return ret;
 	}
 
+	ret = test_zero(&ring);
+	if (ret) {
+		printf("test_zero failed\n");
+		return ret;
+	}
+
 	return 0;
 }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCHSET v2] io_uring: support fileset add/remove/modify
  2019-10-04 18:17 ` [PATCHSET v2] io_uring: support fileset add/remove/modify Jeff Moyer
@ 2019-10-04 18:52   ` Jens Axboe
  0 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2019-10-04 18:52 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-block

On 10/4/19 12:17 PM, Jeff Moyer wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> Currently we only support registrering a fixed file set. If changes need
>> to be made to that set, the application must unregister the existing set
>> first, then register a new one.
>>
>> This patchset adds support for sparse file sets (patch 1), which means
>> the application can register a fileset with room for expansion. This is
>> done through having unregistered slots use fd == -1.
>>
>> On top of that, we can add IORING_REGISTER_FILES_UPDATE. This allows
>> modifying the existing file set through:
>>
>> - Replacing empty slots with valid file descriptors
>> - Replacing valid descriptors with an empty slot
>> - Modifying an existing slot, replacing a file descriptor with a new one
> 
> I don't pretend to understand the socket code you wrote.  The io_uring
> bits look good to me.  I also added a testcase to your file-register.c
> program--diff below.  The test passes, of course.  :)

Great, thanks Jeff, I'll add this to the test case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-04 16:22 [PATCHSET v2] io_uring: support fileset add/remove/modify Jens Axboe
2019-10-04 16:22 ` [PATCH 1/2] io_uring: allow sparse fixed file sets Jens Axboe
2019-10-04 16:22 ` [PATCH 2/2] io_uring: add support for IORING_REGISTER_FILES_UPDATE Jens Axboe
2019-10-04 18:17 ` [PATCHSET v2] io_uring: support fileset add/remove/modify Jeff Moyer
2019-10-04 18:52   ` Jens Axboe

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org linux-block@archiver.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/ public-inbox