From: Christoph Hellwig <hch@lst.de>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org, linux-aio@kvack.org,
linux-block@vger.kernel.org, linux-arch@vger.kernel.org,
hch@lst.de, jmoyer@redhat.com, avi@scylladb.com
Subject: Re: [PATCH 05/16] Add io_uring IO interface
Date: Wed, 9 Jan 2019 13:10:30 +0100 [thread overview]
Message-ID: <20190109121030.GA13779@lst.de> (raw)
In-Reply-To: <20190108165645.19311-6-axboe@kernel.dk>
> index 293733f61594..9ef9987b4192 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
> obj-$(CONFIG_TIMERFD) += timerfd.o
> obj-$(CONFIG_EVENTFD) += eventfd.o
> obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> -obj-$(CONFIG_AIO) += aio.o
> +obj-$(CONFIG_AIO) += aio.o io_uring.o
It is probably worth adding a new config symbol for the uring as no
code is shared with aio.
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> new file mode 100644
> index 000000000000..ae2b886282bb
> --- /dev/null
> +++ b/fs/io_uring.c
> @@ -0,0 +1,849 @@
> +/*
> + * Shared application/kernel submission and completion ring pairs, for
> + * supporting fast/efficient IO.
> + *
> + * Copyright (C) 2019 Jens Axboe
> + */
Add an SPDX header to all new files, please.
> +struct io_sq_ring {
> + struct io_uring r;
> + u32 ring_mask;
> + u32 ring_entries;
> + u32 dropped;
> + u32 flags;
> + u32 array[0];
> +};
field[0] is a legacy gcc extension, the proper C99+ way is field[].
> +
> +struct io_iocb_ring {
> + struct io_sq_ring *ring;
> + unsigned entries;
> + unsigned ring_mask;
> + struct io_uring_iocb *iocbs;
> +};
> +
> +struct io_event_ring {
> + struct io_cq_ring *ring;
> + unsigned entries;
> + unsigned ring_mask;
> +};
Btw, do we really need these structures? It would seem simpler
to just embed them into the containing structure as:
struct io_sq_ring *sq_ring;
unsigned sq_ring_entries;
unsigned sq_ring_mask;
struct io_uring_iocb *sq_ring_iocbs;
struct io_cq_ring *cq_ring;
unsigned cq_ring_entries;
unsigned cq_ring_mask;
> +struct io_ring_ctx {
> + struct percpu_ref refs;
> +
> + unsigned int flags;
> + unsigned int max_reqs;
max_reqs can probably go away in favour of the sq ring nr_entries
field.
> + struct io_iocb_ring sq_ring;
> + struct io_event_ring cq_ring;
> +
> + struct work_struct work;
> +
> + struct {
> + struct mutex uring_lock;
> + } ____cacheline_aligned_in_smp;
> +
> + struct {
> + struct mutex ring_lock;
> + wait_queue_head_t wait;
> + } ____cacheline_aligned_in_smp;
> +
> + struct {
> + spinlock_t completion_lock;
> + } ____cacheline_aligned_in_smp;
> +};
Can you take a deep look if we need to keep all of ring_lock,
completion_lock and the later added poll locking? From a quick look
it isn't entirely clear what the locking strategy on the completion
side is. It needs to be documented and can hopefully be simplified.
> +struct fsync_iocb {
> + struct work_struct work;
> + struct file *file;
> + bool datasync;
> +};
Do we actually need this? Can't we just reuse the later thread
offload for fsync? Maybe just add fsync support once everything else
is done to make that simpler.
> +static const struct file_operations io_scqring_fops;
> +
> +static void io_ring_ctx_free(struct work_struct *work);
> +static void io_ring_ctx_ref_free(struct percpu_ref *ref);
Can you try to avoid needing the forward declarations? (except for the
fops, where we probably need it).
>
> +
> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> +{
> + struct io_ring_ctx *ctx;
> +
> + ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL);
> + if (!ctx)
> + return NULL;
Do we really need an explicit slab for the contexts?
> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
Maybe replace the req name with something matching the structure
name? (and more on the structure name later).
> +{
> + struct io_kiocb *req;
> +
> + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
> + if (!req)
> + return NULL;
> +
> + percpu_ref_get(&ctx->refs);
> + req->ki_ctx = ctx;
> + INIT_LIST_HEAD(&req->ki_list);
We never do a list_empty check on ki_list, so there should be no need
to initialize it.
> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb,
> + long res, unsigned flags)
> +{
> + ev->index = kiocb->ki_index;
> + ev->res = res;
> + ev->flags = flags;
> +}
Probably no need for this helper.
> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags)
> +{
> + io_cqring_fill_event(iocb, res, flags);
> + io_complete_iocb(iocb->ki_ctx, iocb);
> +}
Probably no need for this helper either.
> + ret = kiocb_set_rw_flags(req, iocb->rw_flags);
> + if (unlikely(ret))
> + goto out_fput;
> +
> + /* no one is going to poll for this I/O */
> + req->ki_flags &= ~IOCB_HIPRI;
Now that we don't have the aio legacy to deal with should we just
reject IOCB_HIPRI on a non-polled context?
> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb,
> + struct iovec **iovec, struct iov_iter *iter)
> +{
> + void __user *buf = (void __user *)(uintptr_t)iocb->addr;
> + size_t ret;
> +
> + ret = import_single_range(rw, buf, iocb->len, *iovec, iter);
> + *iovec = NULL;
> + return ret;
> +}
Is there any point in supporting non-vectored operations here?
> + if (S_ISREG(file_inode(file)->i_mode)) {
> + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> + }
Overly long lines.
> +static int __io_submit_one(struct io_ring_ctx *ctx,
> + const struct io_uring_iocb *iocb,
> + unsigned long ki_index)
Maybe call this io_ring_submit_one? Or generally find a nice prefix
for all the functions in this file?
> + f = fdget(fd);
> + if (f.file) {
> + struct io_ring_ctx *ctx;
Please just return early on failure instead of forcing another level
of indentation.
> +
> + ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) *
> + p->sq_entries);
Use array_size().
> +/*
> + * sys_io_uring_setup:
> + * Sets up an aio uring context, and returns the fd. Applications asks
> + * for a ring size, we return the actual sq/cq ring sizes (among other
> + * things) in the params structure passed in.
> + */
Can we drop this odd aio-style comment format? In fact the syscall
documentation probably belongs only in the man page anyway.
Same for the uring_enter syscall.
> +struct io_uring_iocb {
Should we just call this io_uring_sqe?
> +/*
> + * IO completion data structure
> + */
> +struct io_uring_event {
> + __u64 index; /* what iocb this event came from */
> + __s32 res; /* result code for this event */
> + __u32 flags;
> +};
io_uring_cqe?
next prev parent reply other threads:[~2019-01-09 12:10 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-01-08 16:56 [PATCHSET v1] io_uring IO interface Jens Axboe
2019-01-08 16:56 ` [PATCH 01/16] fs: add an iopoll method to struct file_operations Jens Axboe
2019-01-08 16:56 ` [PATCH 02/16] block: wire up block device iopoll method Jens Axboe
2019-01-08 16:56 ` [PATCH 03/16] block: add bio_set_polled() helper Jens Axboe
2019-01-10 9:43 ` Ming Lei
2019-01-10 16:05 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 04/16] iomap: wire up the iopoll method Jens Axboe
2019-01-08 16:56 ` [PATCH 05/16] Add io_uring IO interface Jens Axboe
2019-01-09 12:10 ` Christoph Hellwig [this message]
2019-01-09 15:53 ` Jens Axboe
2019-01-09 18:30 ` Christoph Hellwig
2019-01-09 20:07 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 06/16] io_uring: support for IO polling Jens Axboe
2019-01-09 12:11 ` Christoph Hellwig
2019-01-09 15:53 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 07/16] io_uring: add submission side request cache Jens Axboe
2019-01-08 16:56 ` [PATCH 08/16] fs: add fget_many() and fput_many() Jens Axboe
2019-01-08 16:56 ` [PATCH 09/16] io_uring: use fget/fput_many() for file references Jens Axboe
2019-01-08 16:56 ` [PATCH 10/16] io_uring: split kiocb init from allocation Jens Axboe
2019-01-09 12:12 ` Christoph Hellwig
2019-01-09 16:56 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 11/16] io_uring: batch io_kiocb allocation Jens Axboe
2019-01-09 12:13 ` Christoph Hellwig
2019-01-09 16:57 ` Jens Axboe
2019-01-09 19:03 ` Christoph Hellwig
2019-01-09 20:08 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 12/16] block: implement bio helper to add iter bvec pages to bio Jens Axboe
2019-01-08 16:56 ` [PATCH 13/16] io_uring: add support for pre-mapped user IO buffers Jens Axboe
2019-01-09 12:16 ` Christoph Hellwig
2019-01-09 17:06 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 14/16] io_uring: support kernel side submission Jens Axboe
2019-01-09 19:06 ` Christoph Hellwig
2019-01-09 20:49 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 15/16] io_uring: add submission polling Jens Axboe
2019-01-08 16:56 ` [PATCH 16/16] io_uring: add io_uring_event cache hit information Jens Axboe
2019-01-09 16:00 ` [PATCHSET v1] io_uring IO interface Matthew Wilcox
2019-01-09 16:27 ` Chris Mason
2019-01-12 21:29 [PATCHSET v3] " Jens Axboe
2019-01-12 21:30 ` [PATCH 05/16] Add " Jens Axboe
[not found] <20190115025531.13985-1-axboe@kernel.dk>
2019-01-15 2:55 ` Jens Axboe
2019-01-15 16:51 ` Jonathan Corbet
2019-01-15 16:55 ` Jens Axboe
2019-01-15 17:26 ` Jens Axboe
2019-01-16 10:41 ` Arnd Bergmann
2019-01-16 11:00 ` Arnd Bergmann
2019-01-16 15:12 ` Jens Axboe
2019-01-16 15:16 ` Arnd Bergmann
2019-01-16 15:25 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190109121030.GA13779@lst.de \
--to=hch@lst.de \
--cc=avi@scylladb.com \
--cc=axboe@kernel.dk \
--cc=jmoyer@redhat.com \
--cc=linux-aio@kvack.org \
--cc=linux-arch@vger.kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).