(These patches depend on IORING_OP_MKDIRAT going in first -- see the changelog entry for v2 below.)

These patches add support for IORING_OP_GETDENTS, which is a new io_uring opcode that more or less does an lseek(sqe->fd, sqe->off, SEEK_SET) followed by a getdents64(sqe->fd, (void *)sqe->addr, sqe->len).

A dumb test program which recursively scans through a directory tree and prints the names of all directories and files it encounters along the way is available here:

https://krautbox.wantstofly.org/~buytenh/uringfind-v3.c

Changes since v3 RFC:

- Made locking in io_getdents() unconditional, as the prior optimization was racy. (Pointed out by Pavel Begunkov.)

- Rebase onto for-5.13/io_uring as of 2021/03/12 plus a manually applied version of the mkdirat patch.

Changes since v2 RFC:

- Rebase onto io_uring-2021-02-17 plus a manually applied version of the mkdirat patch. The latter is needed because userland (liburing) has already merged the opcode for IORING_OP_MKDIRAT (in commit "io_uring.h: 5.12 pending kernel sync") while this opcode isn't in the kernel yet (as of io_uring-2021-02-17), which means that this series can't be merged until IORING_OP_MKDIRAT is merged.

- Adapt to changes made in "io_uring: replace force_nonblock with flags" that are in io_uring-2021-02-17.

Changes since v1 RFC:

- Drop the trailing '64' from IORING_OP_GETDENTS64 (suggested by Matthew Wilcox).

- Instead of requiring that sqe->off be zero, use this field to pass in a directory offset to start reading from. For the first IORING_OP_GETDENTS call on a directory, this can be set to zero, and for subsequent calls, it can be set to the ->d_off field of the last struct linux_dirent64 returned by the previous call.
Lennert Buytenhek (2):
  readdir: split the core of getdents64(2) out into vfs_getdents()
  io_uring: add support for IORING_OP_GETDENTS

 fs/io_uring.c | 66 ++++++++++++++++++++++++++++++++++++++++++
 fs/readdir.c | 25 ++++++++++-----
 include/linux/fs.h | 4 ++
 include/uapi/linux/io_uring.h | 1
 4 files changed, 88 insertions(+), 8 deletions(-)
So that IORING_OP_GETDENTS may use it, split out the core of the getdents64(2) syscall into a helper function, vfs_getdents(). vfs_getdents() calls into filesystems' ->iterate{,_shared}() which expect serialization on struct file, which means that callers of vfs_getdents() are responsible for either using fdget_pos() or performing the equivalent serialization by hand. Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org> --- fs/readdir.c | 25 +++++++++++++++++-------- include/linux/fs.h | 4 ++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/readdir.c b/fs/readdir.c index 19434b3c982c..f52167c1eb61 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -348,10 +348,9 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EFAULT; } -SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) +int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent, + unsigned int count) { - struct fd f; struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -359,11 +358,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); + error = iterate_dir(file, &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { @@ -376,6 +371,20 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } + return error; +} + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct fd f; + int error; + + f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + error = vfs_getdents(f.file, dirent, count); fdput_pos(f); return error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index ec8f3ddf4a6a..c03235883e18 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3227,6 +3227,10 @@ extern const struct inode_operations 
simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); +struct linux_dirent64; +int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent, + unsigned int count); + int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); -- 2.29.2
IORING_OP_GETDENTS behaves much like getdents64(2) and takes the same arguments, but with a small twist: it takes an additional offset argument, and reading from the specified directory starts at the given offset. For the first IORING_OP_GETDENTS call on a directory, the offset parameter can be set to zero, and for subsequent calls, it can be set to the ->d_off field of the last struct linux_dirent64 returned by the previous IORING_OP_GETDENTS call. Internally, if necessary, IORING_OP_GETDENTS will vfs_llseek() to the right directory position before calling vfs_getdents(). IORING_OP_GETDENTS may or may not update the specified directory's file offset, and the file offset should not be relied upon to have any particular value during or after an IORING_OP_GETDENTS call. Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org> --- fs/io_uring.c | 66 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 67 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index eef957139915..306e2bd9fd75 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -640,6 +640,13 @@ struct io_mkdir { struct filename *filename; }; +struct io_getdents { + struct file *file; + struct linux_dirent64 __user *dirent; + unsigned int count; + loff_t pos; +}; + struct io_completion { struct file *file; struct list_head list; @@ -774,6 +781,7 @@ struct io_kiocb { struct io_rename rename; struct io_unlink unlink; struct io_mkdir mkdir; + struct io_getdents getdents; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -988,6 +996,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_RENAMEAT] = {}, [IORING_OP_UNLINKAT] = {}, [IORING_OP_MKDIRAT] = {}, + [IORING_OP_GETDENTS] = { + .needs_file = 1, + }, }; static bool io_disarm_next(struct io_kiocb *req); @@ -4310,6 +4321,56 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_getdents_prep(struct io_kiocb *req, + 
const struct io_uring_sqe *sqe) +{ + struct io_getdents *getdents = &req->getdents; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + getdents->pos = READ_ONCE(sqe->off); + getdents->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); + getdents->count = READ_ONCE(sqe->len); + return 0; +} + +static int io_getdents(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_getdents *getdents = &req->getdents; + int ret = 0; + + /* getdents always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + /* for vfs_llseek and to serialize ->iterate_shared() on this file */ + mutex_lock(&req->file->f_pos_lock); + + if (req->file->f_pos != getdents->pos) { + loff_t res = vfs_llseek(req->file, getdents->pos, SEEK_SET); + if (res < 0) + ret = res; + } + + if (ret == 0) { + ret = vfs_getdents(req->file, getdents->dirent, + getdents->count); + } + + mutex_unlock(&req->file->f_pos_lock); + + if (ret < 0) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail_links(req); + } + io_req_complete(req, ret); + return 0; +} + #if defined(CONFIG_NET) static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) @@ -5813,6 +5874,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_unlinkat_prep(req, sqe); case IORING_OP_MKDIRAT: return io_mkdirat_prep(req, sqe); + case IORING_OP_GETDENTS: + return io_getdents_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6075,6 +6138,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MKDIRAT: ret = io_mkdirat(req, issue_flags); break; + case IORING_OP_GETDENTS: + ret = io_getdents(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 89b1225998c0..b12d49361022 100644 --- 
a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -138,6 +138,7 @@ enum { IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, IORING_OP_MKDIRAT, + IORING_OP_GETDENTS, /* this goes last, obviously */ IORING_OP_LAST, -- 2.29.2
On 3/12/21 8:49 AM, Lennert Buytenhek wrote:
> So that IORING_OP_GETDENTS may use it, split out the core of the
> getdents64(2) syscall into a helper function, vfs_getdents().
>
> vfs_getdents() calls into filesystems' ->iterate{,_shared}() which
> expect serialization on struct file, which means that callers of
> vfs_getdents() are responsible for either using fdget_pos() or
> performing the equivalent serialization by hand.
Al, how do you feel about this one?
--
Jens Axboe