All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch@lst.de>
To: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org, linux-aio@kvack.org,
	linux-block@vger.kernel.org, linux-arch@vger.kernel.org,
	hch@lst.de, jmoyer@redhat.com, avi@scylladb.com
Subject: Re: [PATCH 05/16] Add io_uring IO interface
Date: Wed, 9 Jan 2019 13:10:30 +0100	[thread overview]
Message-ID: <20190109121030.GA13779@lst.de> (raw)
In-Reply-To: <20190108165645.19311-6-axboe@kernel.dk>

> index 293733f61594..9ef9987b4192 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD)		+= signalfd.o
>  obj-$(CONFIG_TIMERFD)		+= timerfd.o
>  obj-$(CONFIG_EVENTFD)		+= eventfd.o
>  obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
> -obj-$(CONFIG_AIO)               += aio.o
> +obj-$(CONFIG_AIO)               += aio.o io_uring.o

It is probablt worth adding a new config symbol for the uring as no
code is shared with aio.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> new file mode 100644
> index 000000000000..ae2b886282bb
> --- /dev/null
> +++ b/fs/io_uring.c
> @@ -0,0 +1,849 @@
> +/*
> + * Shared application/kernel submission and completion ring pairs, for
> + * supporting fast/efficient IO.
> + *
> + * Copyright (C) 2019 Jens Axboe
> + */

Add an SPDX header to all new files, please.

> +struct io_sq_ring {
> +	struct io_uring		r;
> +	u32			ring_mask;
> +	u32			ring_entries;
> +	u32			dropped;
> +	u32			flags;
> +	u32			array[0];
> +};

field[0] is a legacy gcc extension, the proper C99+ way is field[].

> +
> +struct io_iocb_ring {
> +	struct			io_sq_ring *ring;
> +	unsigned		entries;
> +	unsigned		ring_mask;
> +	struct io_uring_iocb	*iocbs;
> +};
> +
> +struct io_event_ring {
> +	struct io_cq_ring	*ring;
> +	unsigned		entries;
> +	unsigned		ring_mask;
> +};

Btw, do we really need there structures?  It would seem simpler
to just embedd them into the containing structure as:

	struct io_sq_ring	*sq_ring;
	unsigned		sq_ring_entries;
	unsigned		sq_ring_mask;
	struct io_uring_iocb	*sq_ring_iocbs;

	struct io_cq_ring	*cq_ring;
	unsigned		cq_ring_entries;
	unsigned		cq_ring_mask;
	

> +struct io_ring_ctx {
> +	struct percpu_ref	refs;
> +
> +	unsigned int		flags;
> +	unsigned int		max_reqs;

max_reqs can probably go away in favour of the sq ring nr_entries
field.

> +	struct io_iocb_ring	sq_ring;
> +	struct io_event_ring	cq_ring;
> +
> +	struct work_struct	work;
> +
> +	struct {
> +		struct mutex uring_lock;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		struct mutex    ring_lock;
> +		wait_queue_head_t wait;
> +	} ____cacheline_aligned_in_smp;
> +
> +	struct {
> +		spinlock_t      completion_lock;
> +	} ____cacheline_aligned_in_smp;
> +};

Can you take a deep look if we need to keep all of ring_lock,
completion_lock and the later added poll locking?  From a quick look
is isn't entirely clear what the locking strategy on the completion
side is.  It needs to be documented and can hopefully be simplified.

> +struct fsync_iocb {
> +	struct work_struct	work;
> +	struct file		*file;
> +	bool			datasync;
> +};

Do we actually need this?  Can't we just reuse the later thread
offload for fsync?  Maybe just add fsync support once everything else
is done to make that simpler.

> +static const struct file_operations io_scqring_fops;
> +
> +static void io_ring_ctx_free(struct work_struct *work);
> +static void io_ring_ctx_ref_free(struct percpu_ref *ref);

Can you try to avoid to need the forward delcaration?  (except for the
fops, where we probably need it).

>
> +
> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> +{
> +	struct io_ring_ctx *ctx;
> +
> +	ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL);
> +	if (!ctx)
> +		return NULL;

Do we really need an explicit slab for the contexts?

> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)

Maybe replace the req name with something matching the structure
name?  (and more on the structure name later).

> +{
> +	struct io_kiocb *req;
> +
> +	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
> +	if (!req)
> +		return NULL;
> +
> +	percpu_ref_get(&ctx->refs);
> +	req->ki_ctx = ctx;
> +	INIT_LIST_HEAD(&req->ki_list);

We never do a list_empty ceck on ki_list, so there should be no need
to initialize it.

> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb,
> +			  long res, unsigned flags)
> +{
> +	ev->index = kiocb->ki_index;
> +	ev->res = res;
> +	ev->flags = flags;
> +}

Probably no need for this helper.

> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags)
> +{
> +	io_cqring_fill_event(iocb, res, flags);
> +	io_complete_iocb(iocb->ki_ctx, iocb);
> +}

Probably no need for this helper either.

> +	ret = kiocb_set_rw_flags(req, iocb->rw_flags);
> +	if (unlikely(ret))
> +		goto out_fput;
> +
> +	/* no one is going to poll for this I/O */
> +	req->ki_flags &= ~IOCB_HIPRI;

Now that we don't have the aio legacy to deal with should we just
reject IOCB_HIPRI on a non-polled context?

> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb,
> +		       struct iovec **iovec, struct iov_iter *iter)
> +{
> +	void __user *buf = (void __user *)(uintptr_t)iocb->addr;
> +	size_t ret;
> +
> +	ret = import_single_range(rw, buf, iocb->len, *iovec, iter);
> +	*iovec = NULL;
> +	return ret;
> +}

Is there any point in supporting non-vectored operations here?

> +		if (S_ISREG(file_inode(file)->i_mode)) {
> +			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> +			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> +		}

Overly long lines.

> +static int __io_submit_one(struct io_ring_ctx *ctx,
> +			   const struct io_uring_iocb *iocb,
> +			   unsigned long ki_index)

Maybe calls this io_ring_submit_one?  Or generally find a nice prefix
for all the functions in this file?

> +	f = fdget(fd);
> +	if (f.file) {
> +		struct io_ring_ctx *ctx;

Please just return early on fialure instead of forcing another level
of indentation.

> +
> +	ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) *
> +						p->sq_entries);

Use array_size().

> +/*
> + * sys_io_uring_setup:
> + *	Sets up an aio uring context, and returns the fd. Applications asks
> + *	for a ring size, we return the actual sq/cq ring sizes (among other
> + *	things) in the params structure passed in.
> + */

Can we drop this odd aio-style comment format?  In fact the syscall
documentation probably just belongs into the man page only anyway.

Same for the uring_enter syscall.

> +struct io_uring_iocb {

Should we just call this io_uring_sqe?

> +/*
> + * IO completion data structure
> + */
> +struct io_uring_event {
> +	__u64	index;		/* what iocb this event came from */
> +	__s32	res;		/* result code for this event */
> +	__u32	flags;
> +};

io_uring_cqe?

  reply	other threads:[~2019-01-09 12:10 UTC|newest]

Thread overview: 88+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-08 16:56 [PATCHSET v1] io_uring IO interface Jens Axboe
2019-01-08 16:56 ` Jens Axboe
2019-01-08 16:56 ` [PATCH 01/16] fs: add an iopoll method to struct file_operations Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 02/16] block: wire up block device iopoll method Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 03/16] block: add bio_set_polled() helper Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-10  9:43   ` Ming Lei
2019-01-10  9:43     ` Ming Lei
2019-01-10 16:05     ` Jens Axboe
2019-01-10 16:05       ` Jens Axboe
2019-01-08 16:56 ` [PATCH 04/16] iomap: wire up the iopoll method Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 05/16] Add io_uring IO interface Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 12:10   ` Christoph Hellwig [this message]
2019-01-09 15:53     ` Jens Axboe
2019-01-09 15:53       ` Jens Axboe
2019-01-09 18:30       ` Christoph Hellwig
2019-01-09 18:30         ` Christoph Hellwig
2019-01-09 20:07         ` Jens Axboe
2019-01-09 20:07           ` Jens Axboe
2019-01-08 16:56 ` [PATCH 06/16] io_uring: support for IO polling Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 12:11   ` Christoph Hellwig
2019-01-09 15:53     ` Jens Axboe
2019-01-09 15:53       ` Jens Axboe
2019-01-08 16:56 ` [PATCH 07/16] io_uring: add submission side request cache Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 08/16] fs: add fget_many() and fput_many() Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 09/16] io_uring: use fget/fput_many() for file references Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 10/16] io_uring: split kiocb init from allocation Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 12:12   ` Christoph Hellwig
2019-01-09 12:12     ` Christoph Hellwig
2019-01-09 16:56     ` Jens Axboe
2019-01-09 16:56       ` Jens Axboe
2019-01-08 16:56 ` [PATCH 11/16] io_uring: batch io_kiocb allocation Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 12:13   ` Christoph Hellwig
2019-01-09 16:57     ` Jens Axboe
2019-01-09 16:57       ` Jens Axboe
2019-01-09 19:03       ` Christoph Hellwig
2019-01-09 20:08         ` Jens Axboe
2019-01-09 20:08           ` Jens Axboe
2019-01-08 16:56 ` [PATCH 12/16] block: implement bio helper to add iter bvec pages to bio Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 13/16] io_uring: add support for pre-mapped user IO buffers Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 12:16   ` Christoph Hellwig
2019-01-09 17:06     ` Jens Axboe
2019-01-09 17:06       ` Jens Axboe
2019-01-08 16:56 ` [PATCH 14/16] io_uring: support kernel side submission Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 19:06   ` Christoph Hellwig
2019-01-09 20:49     ` Jens Axboe
2019-01-09 20:49       ` Jens Axboe
2019-01-08 16:56 ` [PATCH 15/16] io_uring: add submission polling Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-08 16:56 ` [PATCH 16/16] io_uring: add io_uring_event cache hit information Jens Axboe
2019-01-08 16:56   ` Jens Axboe
2019-01-09 16:00 ` [PATCHSET v1] io_uring IO interface Matthew Wilcox
2019-01-09 16:00   ` Matthew Wilcox
2019-01-09 16:27   ` Chris Mason
2019-01-09 16:27     ` Chris Mason
2019-01-12 21:29 [PATCHSET v3] " Jens Axboe
2019-01-12 21:30 ` [PATCH 05/16] Add " Jens Axboe
2019-01-12 21:30   ` Jens Axboe
2019-01-15  2:55 (unknown), Jens Axboe
2019-01-15  2:55 ` [PATCH 05/16] Add io_uring IO interface Jens Axboe
2019-01-15  2:55   ` Jens Axboe
2019-01-15 16:51   ` Jonathan Corbet
2019-01-15 16:51     ` Jonathan Corbet
2019-01-15 16:55     ` Jens Axboe
2019-01-15 16:55       ` Jens Axboe
2019-01-15 17:26       ` Jens Axboe
2019-01-15 17:26         ` Jens Axboe
2019-01-16 10:41   ` Arnd Bergmann
2019-01-16 10:41     ` Arnd Bergmann
2019-01-16 11:00     ` Arnd Bergmann
2019-01-16 11:00       ` Arnd Bergmann
2019-01-16 15:12     ` Jens Axboe
2019-01-16 15:12       ` Jens Axboe
2019-01-16 15:16       ` Arnd Bergmann
2019-01-16 15:16         ` Arnd Bergmann
2019-01-16 15:25         ` Jens Axboe
2019-01-16 15:25           ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190109121030.GA13779@lst.de \
    --to=hch@lst.de \
    --cc=avi@scylladb.com \
    --cc=axboe@kernel.dk \
    --cc=jmoyer@redhat.com \
    --cc=linux-aio@kvack.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.