From: Jens Axboe <axboe@kernel.dk> To: linux-aio@kvack.org, linux-block@vger.kernel.org, linux-api@vger.kernel.org Cc: hch@lst.de, jmoyer@redhat.com, avi@scylladb.com, jannh@google.com, viro@ZenIV.linux.org.uk, Jens Axboe <axboe@kernel.dk> Subject: [PATCH 14/19] io_uring: add file set registration Date: Fri, 8 Feb 2019 10:34:18 -0700 [thread overview] Message-ID: <20190208173423.27014-15-axboe@kernel.dk> (raw) In-Reply-To: <20190208173423.27014-1-axboe@kernel.dk> We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. 
Signed-off-by: Jens Axboe <axboe@kernel.dk> --- fs/io_uring.c | 256 ++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 9 +- 2 files changed, 235 insertions(+), 30 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 50c48e43d56e..244fb71e3424 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -29,6 +29,7 @@ #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> +#include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -41,6 +42,7 @@ #include "internal.h" #define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_FIXED_FILES 1024 struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -102,6 +104,14 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* + * If used, fixed file set. Writers must ensure that ->refs is dead, + * readers must ensure that ->refs is alive as long as the file* is + * used. Only updated through io_uring_register(2). + */ + struct file **user_files; + unsigned nr_user_files; + /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; struct io_mapped_ubuf *user_bufs; @@ -149,6 +159,7 @@ struct io_kiocb { unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ +#define REQ_F_FIXED_FILE 4 /* ctx owns file */ u64 user_data; u64 error; @@ -376,15 +387,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * Batched puts of the same file, to avoid dirtying the * file usage count multiple times, if avoidable. 
*/ - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; - } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + if (!(req->flags & REQ_F_FIXED_FILE)) { + if (!file) { + file = req->rw.ki_filp; + file_count = 1; + } else if (file == req->rw.ki_filp) { + file_count++; + } else { + fput_many(file, file_count); + file = req->rw.ki_filp; + file_count = 1; + } } if (to_free == ARRAY_SIZE(reqs)) @@ -516,13 +529,19 @@ static void kiocb_end_write(struct kiocb *kiocb) } } +static void io_fput(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_FIXED_FILE)) + fput(req->rw.ki_filp); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - fput(kiocb->ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); io_free_req(req); } @@ -638,19 +657,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio; + unsigned ioprio, flags; int fd, ret; /* For -EAGAIN retry, everything is already prepped */ if (kiocb->ki_filp) return 0; + flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + kiocb->ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + kiocb->ki_filp = io_file_get(state, fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -690,10 +719,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } return 0; out_fput: - /* in case of error, we didn't use this file reference. drop it. */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); + if (!(flags & IOSQE_FIXED_FILE)) { + /* + * in case of error, we didn't use this file reference. drop it. + */ + if (state) + state->used_refs--; + io_file_put(state, kiocb->ki_filp); + } return ret; } @@ -825,7 +858,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, out_fput: /* Hold on to the file for -EAGAIN */ if (unlikely(ret && ret != -EAGAIN)) - fput(file); + io_fput(req); return ret; } @@ -879,7 +912,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, kfree(iovec); out_fput: if (unlikely(ret)) - fput(file); + io_fput(req); return ret; } @@ -905,7 +938,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, loff_t sqe_off = READ_ONCE(sqe->off); loff_t sqe_len = READ_ONCE(sqe->len); loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + unsigned fsync_flags, flags; struct file *file; int ret, fd; @@ -923,14 +956,23 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; fd = READ_ONCE(sqe->fd); - file = fget(fd); + flags = READ_ONCE(sqe->flags); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + file = ctx->user_files[fd]; + } else { + file = fget(fd); + } if (unlikely(!file)) return -EBADF; ret = vfs_fsync_range(file, sqe_off, end > 0 ? 
end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - fput(file); + if (!(flags & IOSQE_FIXED_FILE)) + fput(file); io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_free_req(req); return 0; @@ -1067,7 +1109,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, const struct sqe_submit *s, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags)) + if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) return -EINVAL; req = io_get_req(ctx, state); @@ -1255,6 +1297,151 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ring->r.head == ring->r.tail ? ret : 0; } +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); +#endif +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + if (!ctx->user_files) + return -ENXIO; + + __io_sqe_files_unregister(ctx); + kfree(ctx->user_files); + ctx->user_files = NULL; + return 0; +} + +#if defined(CONFIG_UNIX) +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +{ + struct scm_fp_list *fpl; + struct sk_buff *skb; + int i; + + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + skb->sk = ctx->ring_sock->sk; + skb->destructor = unix_destruct_scm; + + fpl->user = get_uid(ctx->user); + for (i = 0; i < nr; i++) { + fpl->fp[i] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[i]); + fput(fpl->fp[i]); + } + + fpl->max = fpl->count = nr; + UNIXCB(skb).fp = fpl; + skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb); + return 0; +} + +/* + * If UNIX sockets are enabled, fd passing can cause a reference cycle 
which + * causes regular reference counting to break down. We rely on the UNIX + * garbage collection to take care of this problem for us. + */ +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + unsigned left, total; + int ret = 0; + + total = 0; + left = ctx->nr_user_files; + while (left) { + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); + int ret; + + ret = __io_sqe_files_scm(ctx, this_files, total); + if (ret) + break; + left -= this_files; + total += this_files; + } + + return ret; +} +#else +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + int fd, ret = 0; + unsigned i; + + if (ctx->user_files) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); + if (!ctx->user_files) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + + ctx->user_files[i] = fget(fd); + + ret = -EBADF; + if (!ctx->user_files[i]) + break; + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't suppor regular read/write anyway. 
+ */ + if (ctx->user_files[i]->f_op == &io_uring_fops) { + fput(ctx->user_files[i]); + break; + } + ctx->nr_user_files++; + ret = 0; + } + + if (!ret) + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx) { int ret; @@ -1521,14 +1708,16 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) destroy_workqueue(ctx->sqo_wq); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); + + io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); + io_sqe_files_unregister(ctx); + #if defined(CONFIG_UNIX) if (ctx->ring_sock) sock_release(ctx->ring_sock); #endif - io_iopoll_reap_events(ctx); - io_sqe_buffer_unregister(ctx); - io_mem_free(ctx->sq_ring); io_mem_free(ctx->sq_sqes); io_mem_free(ctx->cq_ring); @@ -1886,6 +2075,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_buffer_unregister(ctx); break; + case IORING_REGISTER_FILES: + ret = io_sqe_files_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define 
IORING_UNREGISTER_FILES 3 #endif -- 2.17.1
WARNING: multiple messages have this Message-ID (diff)
From: Jens Axboe <axboe@kernel.dk> To: linux-aio@kvack.org, linux-block@vger.kernel.org, linux-api@vger.kernel.org Cc: hch@lst.de, jmoyer@redhat.com, avi@scylladb.com, jannh@google.com, viro@ZenIV.linux.org.uk, Jens Axboe <axboe@kernel.dk> Subject: [PATCH 14/19] io_uring: add file set registration Date: Fri, 8 Feb 2019 10:34:18 -0700 [thread overview] Message-ID: <20190208173423.27014-15-axboe@kernel.dk> (raw) In-Reply-To: <20190208173423.27014-1-axboe@kernel.dk> We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. 
Signed-off-by: Jens Axboe <axboe@kernel.dk> --- fs/io_uring.c | 256 ++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 9 +- 2 files changed, 235 insertions(+), 30 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 50c48e43d56e..244fb71e3424 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -29,6 +29,7 @@ #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> +#include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -41,6 +42,7 @@ #include "internal.h" #define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_FIXED_FILES 1024 struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -102,6 +104,14 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* + * If used, fixed file set. Writers must ensure that ->refs is dead, + * readers must ensure that ->refs is alive as long as the file* is + * used. Only updated through io_uring_register(2). + */ + struct file **user_files; + unsigned nr_user_files; + /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; struct io_mapped_ubuf *user_bufs; @@ -149,6 +159,7 @@ struct io_kiocb { unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ +#define REQ_F_FIXED_FILE 4 /* ctx owns file */ u64 user_data; u64 error; @@ -376,15 +387,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * Batched puts of the same file, to avoid dirtying the * file usage count multiple times, if avoidable. 
*/ - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; - } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + if (!(req->flags & REQ_F_FIXED_FILE)) { + if (!file) { + file = req->rw.ki_filp; + file_count = 1; + } else if (file == req->rw.ki_filp) { + file_count++; + } else { + fput_many(file, file_count); + file = req->rw.ki_filp; + file_count = 1; + } } if (to_free == ARRAY_SIZE(reqs)) @@ -516,13 +529,19 @@ static void kiocb_end_write(struct kiocb *kiocb) } } +static void io_fput(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_FIXED_FILE)) + fput(req->rw.ki_filp); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - fput(kiocb->ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); io_free_req(req); } @@ -638,19 +657,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio; + unsigned ioprio, flags; int fd, ret; /* For -EAGAIN retry, everything is already prepped */ if (kiocb->ki_filp) return 0; + flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + kiocb->ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + kiocb->ki_filp = io_file_get(state, fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -690,10 +719,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } return 0; out_fput: - /* in case of error, we didn't use this file reference. drop it. */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); + if (!(flags & IOSQE_FIXED_FILE)) { + /* + * in case of error, we didn't use this file reference. drop it. + */ + if (state) + state->used_refs--; + io_file_put(state, kiocb->ki_filp); + } return ret; } @@ -825,7 +858,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, out_fput: /* Hold on to the file for -EAGAIN */ if (unlikely(ret && ret != -EAGAIN)) - fput(file); + io_fput(req); return ret; } @@ -879,7 +912,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, kfree(iovec); out_fput: if (unlikely(ret)) - fput(file); + io_fput(req); return ret; } @@ -905,7 +938,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, loff_t sqe_off = READ_ONCE(sqe->off); loff_t sqe_len = READ_ONCE(sqe->len); loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + unsigned fsync_flags, flags; struct file *file; int ret, fd; @@ -923,14 +956,23 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; fd = READ_ONCE(sqe->fd); - file = fget(fd); + flags = READ_ONCE(sqe->flags); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + file = ctx->user_files[fd]; + } else { + file = fget(fd); + } if (unlikely(!file)) return -EBADF; ret = vfs_fsync_range(file, sqe_off, end > 0 ? 
end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - fput(file); + if (!(flags & IOSQE_FIXED_FILE)) + fput(file); io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_free_req(req); return 0; @@ -1067,7 +1109,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, const struct sqe_submit *s, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags)) + if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) return -EINVAL; req = io_get_req(ctx, state); @@ -1255,6 +1297,151 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ring->r.head == ring->r.tail ? ret : 0; } +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); +#endif +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + if (!ctx->user_files) + return -ENXIO; + + __io_sqe_files_unregister(ctx); + kfree(ctx->user_files); + ctx->user_files = NULL; + return 0; +} + +#if defined(CONFIG_UNIX) +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +{ + struct scm_fp_list *fpl; + struct sk_buff *skb; + int i; + + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + skb->sk = ctx->ring_sock->sk; + skb->destructor = unix_destruct_scm; + + fpl->user = get_uid(ctx->user); + for (i = 0; i < nr; i++) { + fpl->fp[i] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[i]); + fput(fpl->fp[i]); + } + + fpl->max = fpl->count = nr; + UNIXCB(skb).fp = fpl; + skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb); + return 0; +} + +/* + * If UNIX sockets are enabled, fd passing can cause a reference cycle 
which + * causes regular reference counting to break down. We rely on the UNIX + * garbage collection to take care of this problem for us. + */ +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + unsigned left, total; + int ret = 0; + + total = 0; + left = ctx->nr_user_files; + while (left) { + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); + int ret; + + ret = __io_sqe_files_scm(ctx, this_files, total); + if (ret) + break; + left -= this_files; + total += this_files; + } + + return ret; +} +#else +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + int fd, ret = 0; + unsigned i; + + if (ctx->user_files) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); + if (!ctx->user_files) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + + ctx->user_files[i] = fget(fd); + + ret = -EBADF; + if (!ctx->user_files[i]) + break; + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't suppor regular read/write anyway. 
+ */ + if (ctx->user_files[i]->f_op == &io_uring_fops) { + fput(ctx->user_files[i]); + break; + } + ctx->nr_user_files++; + ret = 0; + } + + if (!ret) + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx) { int ret; @@ -1521,14 +1708,16 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) destroy_workqueue(ctx->sqo_wq); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); + + io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); + io_sqe_files_unregister(ctx); + #if defined(CONFIG_UNIX) if (ctx->ring_sock) sock_release(ctx->ring_sock); #endif - io_iopoll_reap_events(ctx); - io_sqe_buffer_unregister(ctx); - io_mem_free(ctx->sq_ring); io_mem_free(ctx->sq_sqes); io_mem_free(ctx->cq_ring); @@ -1886,6 +2075,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_buffer_unregister(ctx); break; + case IORING_REGISTER_FILES: + ret = io_sqe_files_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define 
IORING_UNREGISTER_FILES 3 #endif -- 2.17.1 -- To unsubscribe, send a message with 'unsubscribe linux-aio' in the body to majordomo@kvack.org. For more info on Linux AIO, see: http://www.kvack.org/aio/ Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
next prev parent reply other threads:[~2019-02-08 17:35 UTC|newest] Thread overview: 140+ messages / expand[flat|nested] mbox.gz Atom feed top 2019-02-08 17:34 [PATCHSET v13] io_uring IO interface Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 17:34 ` [PATCH 01/19] fs: add an iopoll method to struct file_operations Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:20 ` Hannes Reinecke 2019-02-09 9:20 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 02/19] block: wire up block device iopoll method Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:22 ` Hannes Reinecke 2019-02-09 9:22 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 03/19] block: add bio_set_polled() helper Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:24 ` Hannes Reinecke 2019-02-09 9:24 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 04/19] iomap: wire up the iopoll method Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:25 ` Hannes Reinecke 2019-02-09 9:25 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 05/19] Add io_uring IO interface Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 22:12 ` Jann Horn 2019-02-08 22:12 ` Jann Horn 2019-02-09 4:15 ` Jens Axboe 2019-02-09 4:15 ` Jens Axboe 2019-02-12 21:42 ` Jann Horn 2019-02-12 21:42 ` Jann Horn 2019-02-12 22:03 ` Jens Axboe 2019-02-12 22:03 ` Jens Axboe 2019-02-12 22:06 ` Jens Axboe 2019-02-12 22:06 ` Jens Axboe 2019-02-12 22:40 ` Jann Horn 2019-02-12 22:40 ` Jann Horn 2019-02-12 22:45 ` Jens Axboe 2019-02-12 22:45 ` Jens Axboe 2019-02-12 22:52 ` Jens Axboe 2019-02-12 22:52 ` Jens Axboe 2019-02-12 22:57 ` Jann Horn 2019-02-12 22:57 ` Jann Horn 2019-02-12 23:00 ` Jens Axboe 2019-02-12 23:00 ` Jens Axboe 2019-02-12 23:11 ` Jann Horn 2019-02-12 23:11 ` Jann Horn 2019-02-12 23:19 ` Jens Axboe 2019-02-12 23:19 ` Jens Axboe 2019-02-12 23:28 ` Jann Horn 2019-02-12 23:28 ` Jann Horn 2019-02-12 23:46 ` Jens Axboe 2019-02-12 23:46 ` Jens Axboe 2019-02-12 23:53 ` Jens Axboe 2019-02-12 23:53 ` Jens Axboe 2019-02-13 0:07 ` Andy Lutomirski 2019-02-13 
0:07 ` Andy Lutomirski 2019-02-13 0:14 ` Jann Horn 2019-02-13 0:14 ` Jann Horn 2019-02-13 0:24 ` Jens Axboe 2019-02-13 0:24 ` Jens Axboe 2019-02-09 9:35 ` Hannes Reinecke 2019-02-09 9:35 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 06/19] io_uring: add fsync support Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 22:36 ` Jann Horn 2019-02-08 22:36 ` Jann Horn 2019-02-08 23:31 ` Jens Axboe 2019-02-08 23:31 ` Jens Axboe 2019-02-09 9:37 ` Hannes Reinecke 2019-02-09 9:37 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 07/19] io_uring: support for IO polling Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:39 ` Hannes Reinecke 2019-02-09 9:39 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 08/19] fs: add fget_many() and fput_many() Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:41 ` Hannes Reinecke 2019-02-09 9:41 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 09/19] io_uring: use fget/fput_many() for file references Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:42 ` Hannes Reinecke 2019-02-09 9:42 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 10/19] io_uring: batch io_kiocb allocation Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:43 ` Hannes Reinecke 2019-02-09 9:43 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 11/19] block: implement bio helper to add iter bvec pages to bio Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:45 ` Hannes Reinecke 2019-02-09 9:45 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 12/19] io_uring: add support for pre-mapped user IO buffers Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 22:54 ` Jann Horn 2019-02-08 22:54 ` Jann Horn 2019-02-08 23:38 ` Jens Axboe 2019-02-08 23:38 ` Jens Axboe 2019-02-09 16:50 ` Jens Axboe 2019-02-09 16:50 ` Jens Axboe 2019-02-09 9:48 ` Hannes Reinecke 2019-02-09 9:48 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 13/19] net: split out functions related to registering inflight socket files Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 19:49 ` David Miller 2019-02-08 19:49 ` David Miller 
2019-02-08 19:51 ` Jens Axboe 2019-02-09 9:49 ` Hannes Reinecke 2019-02-09 9:49 ` Hannes Reinecke 2019-02-08 17:34 ` Jens Axboe [this message] 2019-02-08 17:34 ` [PATCH 14/19] io_uring: add file set registration Jens Axboe 2019-02-08 20:26 ` Jann Horn 2019-02-08 20:26 ` Jann Horn 2019-02-09 0:16 ` Jens Axboe 2019-02-09 0:16 ` Jens Axboe 2019-02-09 9:50 ` Hannes Reinecke 2019-02-09 9:50 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 15/19] io_uring: add submission polling Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 9:53 ` Hannes Reinecke 2019-02-09 9:53 ` Hannes Reinecke 2019-02-08 17:34 ` [PATCH 16/19] io_uring: add io_kiocb ref count Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 17:34 ` [PATCH 17/19] io_uring: add support for IORING_OP_POLL Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 17:34 ` [PATCH 18/19] io_uring: allow workqueue item to handle multiple buffered requests Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-08 17:34 ` [PATCH 19/19] io_uring: add io_uring_event cache hit information Jens Axboe 2019-02-08 17:34 ` Jens Axboe 2019-02-09 21:13 [PATCHSET v14] io_uring IO interface Jens Axboe 2019-02-09 21:13 ` [PATCH 14/19] io_uring: add file set registration Jens Axboe 2019-02-09 21:13 ` Jens Axboe 2019-02-09 23:52 ` Matt Mullins 2019-02-10 0:47 ` Jens Axboe 2019-02-10 0:47 ` Jens Axboe 2019-02-10 1:11 ` Matt Mullins 2019-02-10 2:34 ` Jens Axboe 2019-02-10 2:34 ` Jens Axboe 2019-02-10 2:57 ` Jens Axboe 2019-02-10 2:57 ` Jens Axboe 2019-02-10 19:55 ` Matt Mullins 2019-02-11 19:00 [PATCHSET v15] io_uring IO interface Jens Axboe 2019-02-11 19:00 ` [PATCH 14/19] io_uring: add file set registration Jens Axboe 2019-02-11 19:00 ` Jens Axboe 2019-02-19 16:12 ` Jann Horn 2019-02-19 16:12 ` Jann Horn 2019-02-22 22:29 ` Jens Axboe 2019-02-22 22:29 ` Jens Axboe
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20190208173423.27014-15-axboe@kernel.dk \ --to=axboe@kernel.dk \ --cc=avi@scylladb.com \ --cc=hch@lst.de \ --cc=jannh@google.com \ --cc=jmoyer@redhat.com \ --cc=linux-aio@kvack.org \ --cc=linux-api@vger.kernel.org \ --cc=linux-block@vger.kernel.org \ --cc=viro@ZenIV.linux.org.uk \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.