Re: [PATCH 05/16] Add io_uring IO interface

From: Christoph Hellwig <hch@lst.de>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org, linux-aio@kvack.org,
	linux-block@vger.kernel.org, linux-arch@vger.kernel.org,
	hch@lst.de, jmoyer@redhat.com, avi@scylladb.com
Subject: Re: [PATCH 05/16] Add io_uring IO interface
Date: Wed, 9 Jan 2019 13:10:30 +0100	[thread overview]
Message-ID: <20190109121030.GA13779@lst.de> (raw)
In-Reply-To: <20190108165645.19311-6-axboe@kernel.dk>

> index 293733f61594..9ef9987b4192 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD)		+= signalfd.o
>  obj-$(CONFIG_TIMERFD)		+= timerfd.o
>  obj-$(CONFIG_EVENTFD)		+= eventfd.o
>  obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
> -obj-$(CONFIG_AIO)               += aio.o
> +obj-$(CONFIG_AIO)               += aio.o io_uring.o

It is probably worth adding a new config symbol for the uring as no
code is shared with aio.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> new file mode 100644
> index 000000000000..ae2b886282bb
> --- /dev/null
> +++ b/fs/io_uring.c
> @@ -0,0 +1,849 @@
> +/*
> + * Shared application/kernel submission and completion ring pairs, for
> + * supporting fast/efficient IO.
> + *
> + * Copyright (C) 2019 Jens Axboe
> + */

Add an SPDX header to all new files, please.

> +struct io_sq_ring {
> +	struct io_uring		r;
> +	u32			ring_mask;
> +	u32			ring_entries;
> +	u32			dropped;
> +	u32			flags;
> +	u32			array[0];
> +};

field[0] is a legacy gcc extension, the proper C99+ way is field[].

> +
> +struct io_iocb_ring {
> +	struct			io_sq_ring *ring;
> +	unsigned		entries;
> +	unsigned		ring_mask;
> +	struct io_uring_iocb	*iocbs;
> +};
> +
> +struct io_event_ring {
> +	struct io_cq_ring	*ring;
> +	unsigned		entries;
> +	unsigned		ring_mask;
> +};

Btw, do we really need these structures?  It would seem simpler
to just embed them into the containing structure as:

	struct io_sq_ring	*sq_ring;
	unsigned		sq_ring_entries;
	unsigned		sq_ring_mask;
	struct io_uring_iocb	*sq_ring_iocbs;

	struct io_cq_ring	*cq_ring;
	unsigned		cq_ring_entries;
	unsigned		cq_ring_mask;

> +struct io_ring_ctx {
> +	struct percpu_ref	refs;
> +
> +	unsigned int		flags;
> +	unsigned int		max_reqs;

max_reqs can probably go away in favour of the sq ring nr_entries
field.

> +	struct io_iocb_ring	sq_ring;
> +	struct io_event_ring	cq_ring;
> +
> +	struct work_struct	work;
> +
> +	struct {
> +		struct mutex uring_lock;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		struct mutex    ring_lock;
> +		wait_queue_head_t wait;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		spinlock_t      completion_lock;
> +	} ____cacheline_aligned_in_smp;
> +};

Can you take a deep look if we need to keep all of ring_lock,
completion_lock and the later added poll locking?  From a quick look
it isn't entirely clear what the locking strategy on the completion
side is.  It needs to be documented and can hopefully be simplified.

> +struct fsync_iocb {
> +	struct work_struct	work;
> +	struct file		*file;
> +	bool			datasync;
> +};

Do we actually need this?  Can't we just reuse the later thread
offload for fsync?  Maybe just add fsync support once everything else
is done to make that simpler.

> +static const struct file_operations io_scqring_fops;
> +
> +static void io_ring_ctx_free(struct work_struct *work);
> +static void io_ring_ctx_ref_free(struct percpu_ref *ref);

Can you try to avoid the need for the forward declarations?  (except for
the fops, where we probably need one).

>
> +
> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> +{
> +	struct io_ring_ctx *ctx;
> +
> +	ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL);
> +	if (!ctx)
> +		return NULL;

Do we really need an explicit slab for the contexts?

> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)

Maybe replace the req name with something matching the structure
name?  (and more on the structure name later).

> +{
> +	struct io_kiocb *req;
> +
> +	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
> +	if (!req)
> +		return NULL;
> +
> +	percpu_ref_get(&ctx->refs);
> +	req->ki_ctx = ctx;
> +	INIT_LIST_HEAD(&req->ki_list);

We never do a list_empty check on ki_list, so there should be no need
to initialize it.

> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb,
> +			  long res, unsigned flags)
> +{
> +	ev->index = kiocb->ki_index;
> +	ev->res = res;
> +	ev->flags = flags;
> +}

Probably no need for this helper.

> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags)
> +{
> +	io_cqring_fill_event(iocb, res, flags);
> +	io_complete_iocb(iocb->ki_ctx, iocb);
> +}

Probably no need for this helper either.

> +	ret = kiocb_set_rw_flags(req, iocb->rw_flags);
> +	if (unlikely(ret))
> +		goto out_fput;
> +
> +	/* no one is going to poll for this I/O */
> +	req->ki_flags &= ~IOCB_HIPRI;

Now that we don't have the aio legacy to deal with should we just
reject IOCB_HIPRI on a non-polled context?

> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb,
> +		       struct iovec **iovec, struct iov_iter *iter)
> +{
> +	void __user *buf = (void __user *)(uintptr_t)iocb->addr;
> +	size_t ret;
> +
> +	ret = import_single_range(rw, buf, iocb->len, *iovec, iter);
> +	*iovec = NULL;
> +	return ret;
> +}

Is there any point in supporting non-vectored operations here?

> +		if (S_ISREG(file_inode(file)->i_mode)) {
> +			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> +			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> +		}

Overly long lines.

> +static int __io_submit_one(struct io_ring_ctx *ctx,
> +			   const struct io_uring_iocb *iocb,
> +			   unsigned long ki_index)

Maybe call this io_ring_submit_one?  Or generally find a nice prefix
for all the functions in this file?

> +	f = fdget(fd);
> +	if (f.file) {
> +		struct io_ring_ctx *ctx;

Please just return early on failure instead of forcing another level
of indentation.

> +
> +	ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) *
> +						p->sq_entries);

Use array_size().

> +/*
> + * sys_io_uring_setup:
> + *	Sets up an aio uring context, and returns the fd. Applications asks
> + *	for a ring size, we return the actual sq/cq ring sizes (among other
> + *	things) in the params structure passed in.
> + */

Can we drop this odd aio-style comment format?  In fact the syscall
documentation probably just belongs in the man page only anyway.

Same for the uring_enter syscall.

> +struct io_uring_iocb {

Should we just call this io_uring_sqe?

> +/*
> + * IO completion data structure
> + */
> +struct io_uring_event {
> +	__u64	index;		/* what iocb this event came from */
> +	__s32	res;		/* result code for this event */
> +	__u32	flags;
> +};

io_uring_cqe?