linux-fsdevel.vger.kernel.org archive mirror
* [PATCHSET 0/3] io_uring: add sync_file_range and drains
@ 2019-04-11 15:06 Jens Axboe
  2019-04-11 15:06 ` [PATCH 1/3] io_uring: add support for marking commands as draining Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Jens Axboe @ 2019-04-11 15:06 UTC (permalink / raw)
  To: linux-fsdevel, linux-block; +Cc: hch, clm

In continuation of the fsync barrier patch from the other day, I
reworked that patch to turn it into a general primitive instead. This
means that any command can be flagged with IOSQE_IO_DRAIN, which will
insert a sequence point in the queue. If a request is marked with
IOSQE_IO_DRAIN, then previous commands must complete before this one
is issued. Subsequent requests are not started until the drain has
completed. The latter is a necessity since we track this through the
CQ index. If we allow later commands, then they could complete before
earlier commands and we'd mistakenly think that we have satisfied the
sequence point.
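
As an illustration, here is a rough userspace sketch of queueing an
fsync that drains all previously submitted requests. Only the SQE
fields used by this series are real; ring setup and the get_next_sqe()
/ submit_sqes() helpers are made-up placeholders:

	/* Sketch: fsync that is only issued once all prior SQEs complete */
	struct io_uring_sqe *sqe = get_next_sqe(ring);	/* placeholder helper */

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FSYNC;
	sqe->fd = file_fd;
	sqe->flags = IOSQE_IO_DRAIN;	/* sequence point: wait for prior requests */
	sqe->user_data = 42;

	submit_sqes(ring, 1);		/* placeholder wrapping io_uring_enter(2) */

SQEs queued after this one are held back until the drain has started,
as described above.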

Patch 2 is just a prep patch for patch 3, which adds support for
sync_file_range() through io_uring. sync_file_range() is heavily used
by RocksDB.

Patches are also in my io_uring-next branch:

git://git.kernel.dk/linux-block io_uring-next

 fs/io_uring.c                 | 142 +++++++++++++++++++++++++++++++++-
 fs/sync.c                     | 135 +++++++++++++++++---------------
 include/linux/fs.h            |   3 +
 include/uapi/linux/io_uring.h |   3 +
 4 files changed, 216 insertions(+), 67 deletions(-)

-- 
Jens Axboe




* [PATCH 1/3] io_uring: add support for marking commands as draining
  2019-04-11 15:06 [PATCHSET 0/3] io_uring: add sync_file_range and drains Jens Axboe
@ 2019-04-11 15:06 ` Jens Axboe
  2019-05-06 18:03   ` Andres Freund
  2019-04-11 15:06 ` [PATCH 2/3] fs: add sync_file_range() helper Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2019-04-11 15:06 UTC (permalink / raw)
  To: linux-fsdevel, linux-block; +Cc: hch, clm, Jens Axboe

There are no ordering constraints between the submission and completion
side of io_uring. But sometimes that would be useful to have. One common
example is doing an fsync, for instance, and having it ordered with
previous writes. Without support for that, the application must do this
tracking itself.

This adds a general SQE flag, IOSQE_IO_DRAIN. If a command is marked
with this flag, then it will not be issued before previous commands have
completed, and subsequent commands submitted after the drain will not be
issued before the drain is started. If there are no pending commands,
setting this flag does not change how the command is issued.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 91 +++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 07d6ef195d05..a10fd5900a17 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -121,6 +121,8 @@ struct io_ring_ctx {
 		unsigned		sq_mask;
 		unsigned		sq_thread_idle;
 		struct io_uring_sqe	*sq_sqes;
+
+		struct list_head	defer_list;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
@@ -226,8 +228,11 @@ struct io_kiocb {
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_PREPPED		16	/* prep already done */
+#define REQ_F_IO_DRAIN		32	/* drain existing IO first */
+#define REQ_F_IO_DRAINED	64	/* drain done */
 	u64			user_data;
-	u64			error;
+	u32			error;
+	u32			sequence;
 
 	struct work_struct	work;
 };
@@ -255,6 +260,8 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+static void io_sq_wq_submit_work(struct work_struct *work);
+
 static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
@@ -306,10 +313,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->cancel_list);
+	INIT_LIST_HEAD(&ctx->defer_list);
 	return ctx;
 }
 
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
+				     struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+		return false;
+
+	return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+}
+
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	if (list_empty(&ctx->defer_list))
+		return NULL;
+
+	req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
+	if (!io_sequence_defer(ctx, req)) {
+		list_del_init(&req->list);
+		return req;
+	}
+
+	return NULL;
+}
+
+static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
 
@@ -330,6 +363,18 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	__io_commit_cqring(ctx);
+
+	while ((req = io_get_deferred_req(ctx)) != NULL) {
+		req->flags |= REQ_F_IO_DRAINED;
+		queue_work(ctx->sqo_wq, &req->work);
+	}
+}
+
 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_cq_ring *ring = ctx->cq_ring;
@@ -1337,6 +1382,34 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ipt.error;
 }
 
+static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	struct io_uring_sqe *sqe_copy;
+
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+		return 0;
+
+	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+	if (!sqe_copy)
+		return -EAGAIN;
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		kfree(sqe_copy);
+		return 0;
+	}
+
+	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	req->submit.sqe = sqe_copy;
+
+	INIT_WORK(&req->work, io_sq_wq_submit_work);
+	list_add_tail(&req->list, &ctx->defer_list);
+	spin_unlock_irq(&ctx->completion_lock);
+	return -EIOCBQUEUED;
+}
+
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
@@ -1585,6 +1658,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	flags = READ_ONCE(s->sqe->flags);
 	fd = READ_ONCE(s->sqe->fd);
 
+	if (flags & IOSQE_IO_DRAIN) {
+		req->flags |= REQ_F_IO_DRAIN;
+		req->sequence = ctx->cached_sq_head - 1;
+	}
+
 	if (!io_op_needs_file(s->sqe)) {
 		req->file = NULL;
 		return 0;
@@ -1614,7 +1692,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	int ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
+	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
 		return -EINVAL;
 
 	req = io_get_req(ctx, state);
@@ -1625,6 +1703,13 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(ret))
 		goto out;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret == -EIOCBQUEUED)
+			ret = 0;
+		return ret;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
 		struct io_uring_sqe *sqe_copy;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e23408692118..a7a6384d0c70 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -38,6 +38,7 @@ struct io_uring_sqe {
  * sqe->flags
  */
 #define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
+#define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
 
 /*
  * io_uring_setup() flags
-- 
2.17.1



* [PATCH 2/3] fs: add sync_file_range() helper
  2019-04-11 15:06 [PATCHSET 0/3] io_uring: add sync_file_range and drains Jens Axboe
  2019-04-11 15:06 ` [PATCH 1/3] io_uring: add support for marking commands as draining Jens Axboe
@ 2019-04-11 15:06 ` Jens Axboe
  2019-04-11 15:06 ` [PATCH 3/3] io_uring: add support for IORING_OP_SYNC_FILE_RANGE Jens Axboe
  2019-04-11 15:16 ` [PATCHSET 0/3] io_uring: add sync_file_range and drains Matthew Wilcox
  3 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2019-04-11 15:06 UTC (permalink / raw)
  To: linux-fsdevel, linux-block; +Cc: hch, clm, Jens Axboe

This just pulls out the ksys_sync_file_range() code to work on a struct
file instead of an fd, so we can use it elsewhere.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/sync.c          | 135 ++++++++++++++++++++++++---------------------
 include/linux/fs.h |   3 +
 2 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index b54e0541ad89..01e82170545a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -234,58 +234,10 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
-/*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
- * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
- *
- * The flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
- * before performing the write.
- *
- * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
- * range which are not presently under writeback. Note that this may block for
- * significant periods due to exhaustion of disk request structures.
- *
- * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
- * after performing the write.
- *
- * Useful combinations of the flag bits are:
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
- * under writeout.  This is a start-write-for-data-integrity operation.
- *
- * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
- * are not presently under writeout.  This is an asynchronous flush-to-disk
- * operation.  Not suitable for data integrity operations.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
- * completion of writeout of all pages in the range.  This will be used after an
- * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
- * for that operation to complete and to return the result.
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
- * a traditional sync() operation.  This is a write-for-data-integrity operation
- * which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
- *
- *
- * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
- * I/O errors or ENOSPC conditions and will return those to the caller, after
- * clearing the EIO and ENOSPC flags in the address_space.
- *
- * It should be noted that none of these operations write out the file's
- * metadata.  So unless the application is strictly performing overwrites of
- * already-instantiated disk blocks, there are no guarantees here that the data
- * will be available after a crash.
- */
-int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-			 unsigned int flags)
+int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
+		    unsigned int flags)
 {
 	int ret;
-	struct fd f;
 	struct address_space *mapping;
 	loff_t endbyte;			/* inclusive */
 	umode_t i_mode;
@@ -325,41 +277,96 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 	else
 		endbyte--;		/* inclusive */
 
-	ret = -EBADF;
-	f = fdget(fd);
-	if (!f.file)
-		goto out;
-
-	i_mode = file_inode(f.file)->i_mode;
+	i_mode = file_inode(file)->i_mode;
 	ret = -ESPIPE;
 	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 			!S_ISLNK(i_mode))
-		goto out_put;
+		goto out;
 
-	mapping = f.file->f_mapping;
+	mapping = file->f_mapping;
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
 		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
 						 WB_SYNC_NONE);
 		if (ret < 0)
-			goto out_put;
+			goto out;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
-		ret = file_fdatawait_range(f.file, offset, endbyte);
+		ret = file_fdatawait_range(file, offset, endbyte);
 
-out_put:
-	fdput(f);
 out:
 	return ret;
 }
 
+/*
+ * sys_sync_file_range() permits finely controlled syncing over a segment of
+ * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
+ * zero then sys_sync_file_range() will operate from offset out to EOF.
+ *
+ * The flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
+ * before performing the write.
+ *
+ * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
+ * range which are not presently under writeback. Note that this may block for
+ * significant periods due to exhaustion of disk request structures.
+ *
+ * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
+ * after performing the write.
+ *
+ * Useful combinations of the flag bits are:
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
+ * in the range which were dirty on entry to sys_sync_file_range() are placed
+ * under writeout.  This is a start-write-for-data-integrity operation.
+ *
+ * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
+ * are not presently under writeout.  This is an asynchronous flush-to-disk
+ * operation.  Not suitable for data integrity operations.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
+ * completion of writeout of all pages in the range.  This will be used after an
+ * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
+ * for that operation to complete and to return the result.
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
+ * a traditional sync() operation.  This is a write-for-data-integrity operation
+ * which will ensure that all pages in the range which were dirty on entry to
+ * sys_sync_file_range() are committed to disk.
+ *
+ *
+ * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
+ * I/O errors or ENOSPC conditions and will return those to the caller, after
+ * clearing the EIO and ENOSPC flags in the address_space.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
+ */
+int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
+			 unsigned int flags)
+{
+	int ret;
+	struct fd f;
+
+	ret = -EBADF;
+	f = fdget(fd);
+	if (f.file)
+		ret = sync_file_range(f.file, offset, nbytes, flags);
+
+	fdput(f);
+	return ret;
+}
+
 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 				unsigned int, flags)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8b42df09b04c..84e2c45ff989 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2782,6 +2782,9 @@ extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
 
+extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
+				unsigned int flags);
+
 /*
  * Sync the bytes written if this was a synchronous write.  Expect ki_pos
  * to already be updated for the write, and will return either the amount
-- 
2.17.1



* [PATCH 3/3] io_uring: add support for IORING_OP_SYNC_FILE_RANGE
  2019-04-11 15:06 [PATCHSET 0/3] io_uring: add sync_file_range and drains Jens Axboe
  2019-04-11 15:06 ` [PATCH 1/3] io_uring: add support for marking commands as draining Jens Axboe
  2019-04-11 15:06 ` [PATCH 2/3] fs: add sync_file_range() helper Jens Axboe
@ 2019-04-11 15:06 ` Jens Axboe
  2019-04-11 15:16 ` [PATCHSET 0/3] io_uring: add sync_file_range and drains Matthew Wilcox
  3 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2019-04-11 15:06 UTC (permalink / raw)
  To: linux-fsdevel, linux-block; +Cc: hch, clm, Jens Axboe

This behaves just like sync_file_range(2) does.
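
For reference, a rough sketch of how an application might fill an SQE
for this opcode; the get_next_sqe() helper is a placeholder, while the
SQE fields match this patch:

	/* Sketch: start writeback of [off, off + len) on file_fd */
	struct io_uring_sqe *sqe = get_next_sqe(ring);	/* placeholder helper */

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
	sqe->fd = file_fd;
	sqe->off = off;				/* range start */
	sqe->len = len;				/* range length, 0 means to EOF */
	sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
	sqe->user_data = 43;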

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 51 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  2 ++
 2 files changed, 53 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a10fd5900a17..cc9854cd99f5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1167,6 +1167,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret = 0;
+
+	if (!req->file)
+		return -EBADF;
+	/* Prep already done (EAGAIN retry) */
+	if (req->flags & REQ_F_PREPPED)
+		return 0;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+		return -EINVAL;
+
+	req->flags |= REQ_F_PREPPED;
+	return ret;
+}
+
+static int io_sync_file_range(struct io_kiocb *req,
+			      const struct io_uring_sqe *sqe,
+			      bool force_nonblock)
+{
+	loff_t sqe_off;
+	loff_t sqe_len;
+	unsigned flags;
+	int ret;
+
+	ret = io_prep_sfr(req, sqe);
+	if (ret)
+		return ret;
+
+	/* sync_file_range always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	sqe_off = READ_ONCE(sqe->off);
+	sqe_len = READ_ONCE(sqe->len);
+	flags = READ_ONCE(sqe->sync_range_flags);
+
+	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
+	io_put_req(req);
+	return 0;
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1450,6 +1498,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_POLL_REMOVE:
 		ret = io_poll_remove(req, s->sqe);
 		break;
+	case IORING_OP_SYNC_FILE_RANGE:
+		ret = io_sync_file_range(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a7a6384d0c70..e707a17c6908 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -26,6 +26,7 @@ struct io_uring_sqe {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
 		__u16		poll_events;
+		__u32		sync_range_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -55,6 +56,7 @@ struct io_uring_sqe {
 #define IORING_OP_WRITE_FIXED	5
 #define IORING_OP_POLL_ADD	6
 #define IORING_OP_POLL_REMOVE	7
+#define IORING_OP_SYNC_FILE_RANGE	8
 
 /*
  * sqe->fsync_flags
-- 
2.17.1



* Re: [PATCHSET 0/3] io_uring: add sync_file_range and drains
  2019-04-11 15:06 [PATCHSET 0/3] io_uring: add sync_file_range and drains Jens Axboe
                   ` (2 preceding siblings ...)
  2019-04-11 15:06 ` [PATCH 3/3] io_uring: add support for IORING_OP_SYNC_FILE_RANGE Jens Axboe
@ 2019-04-11 15:16 ` Matthew Wilcox
  2019-04-11 15:23   ` Jens Axboe
  2019-04-11 16:19   ` Chris Mason
  3 siblings, 2 replies; 8+ messages in thread
From: Matthew Wilcox @ 2019-04-11 15:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, linux-block, hch, clm

On Thu, Apr 11, 2019 at 09:06:54AM -0600, Jens Axboe wrote:
> In continuation of the fsync barrier patch from the other day, I
> reworked that patch to turn it into a general primitive instead. This
> means that any command can be flagged with IOSQE_IO_DRAIN, which will
> insert a sequence point in the queue. If a request is marked with
> IOSQE_IO_DRAIN, then previous commands must complete before this one
> is issued. Subsequent requests are not started until the drain has
> completed. The latter is a necessity since we track this through the
> CQ index. If we allow later commands, then they could complete before
> earlier commands and we'd mistakenly think that we have satisfied the
> sequence point.

That's potentially going to cause quite the bubble in the pipeline of
commands being sent.

Do consumers know which writes they are going to want to fence?  We could
do something like tag each command with a stream ID and then fence a
particular stream.  We'd need one nr_pending counter per stream, but
that should be pretty cheap.


* Re: [PATCHSET 0/3] io_uring: add sync_file_range and drains
  2019-04-11 15:16 ` [PATCHSET 0/3] io_uring: add sync_file_range and drains Matthew Wilcox
@ 2019-04-11 15:23   ` Jens Axboe
  2019-04-11 16:19   ` Chris Mason
  1 sibling, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2019-04-11 15:23 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-block, hch, clm

On 4/11/19 9:16 AM, Matthew Wilcox wrote:
> On Thu, Apr 11, 2019 at 09:06:54AM -0600, Jens Axboe wrote:
>> In continuation of the fsync barrier patch from the other day, I
>> reworked that patch to turn it into a general primitive instead. This
>> means that any command can be flagged with IOSQE_IO_DRAIN, which will
>> insert a sequence point in the queue. If a request is marked with
>> IOSQE_IO_DRAIN, then previous commands must complete before this one
>> is issued. Subsequent requests are not started until the drain has
>> completed. The latter is a necessity since we track this through the
>> CQ index. If we allow later commands, then they could complete before
>> earlier commands and we'd mistakenly think that we have satisfied the
>> sequence point.
> 
> That's potentially going to cause quite the bubble in the pipeline of
> commands being sent.

Definitely.

> Do consumers know which writes they are going to want to fence?  We could
> do something like tag each command with a stream ID and then fence a
> particular stream.  We'd need one nr_pending counter per stream, but
> that should be pretty cheap.

Or you could just split your streams between io_urings. That has other
overhead of course in terms of resources, but it'd avoid having to do
any extra accounting on the kernel side. A pending counter is not
necessarily cheap, though it'd be acceptable if we required writes
that you want to fence to be tagged (hence it wouldn't happen for
"normal" IO).

-- 
Jens Axboe



* Re: [PATCHSET 0/3] io_uring: add sync_file_range and drains
  2019-04-11 15:16 ` [PATCHSET 0/3] io_uring: add sync_file_range and drains Matthew Wilcox
  2019-04-11 15:23   ` Jens Axboe
@ 2019-04-11 16:19   ` Chris Mason
  1 sibling, 0 replies; 8+ messages in thread
From: Chris Mason @ 2019-04-11 16:19 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Jens Axboe, linux-fsdevel, linux-block, hch



On 11 Apr 2019, at 11:16, Matthew Wilcox wrote:

> On Thu, Apr 11, 2019 at 09:06:54AM -0600, Jens Axboe wrote:
>> In continuation of the fsync barrier patch from the other day, I
>> reworked that patch to turn it into a general primitive instead. This
>> means that any command can be flagged with IOSQE_IO_DRAIN, which will
>> insert a sequence point in the queue. If a request is marked with
>> IOSQE_IO_DRAIN, then previous commands must complete before this one
>> is issued. Subsequent requests are not started until the drain has
>> completed. The latter is a necessity since we track this through the
>> CQ index. If we allow later commands, then they could complete before
>> earlier commands and we'd mistakenly think that we have satisfied the
>> sequence point.
>
> That's potentially going to cause quite the bubble in the pipeline of
> commands being sent.
>
> Do consumers know which writes they are going to want to fence?  We 
> could
> do something like tag each command with a stream ID and then fence a
> particular stream.  We'd need one nr_pending counter per stream, but
> that should be pretty cheap.

It'll be a bubble, but without the drain command, io_uring users would 
still have the same bubble while they wait for IO in order to enforce 
the ordering themselves.

I prefer Jens' suggestion to limit the drain's impact with multiple 
io_urings instead of adding a stream ID.  I don't have a really solid 
reason for this, but I'd hesitate to add complexity before we have more 
data from users.

-chris


* Re: [PATCH 1/3] io_uring: add support for marking commands as draining
  2019-04-11 15:06 ` [PATCH 1/3] io_uring: add support for marking commands as draining Jens Axboe
@ 2019-05-06 18:03   ` Andres Freund
  0 siblings, 0 replies; 8+ messages in thread
From: Andres Freund @ 2019-05-06 18:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, linux-block, hch, clm

Hi,

On 2019-04-11 09:06:55 -0600, Jens Axboe wrote:
> There are no ordering constraints between the submission and completion
> side of io_uring. But sometimes that would be useful to have. One common
> example is doing an fsync, for instance, and having it ordered with
> previous writes. Without support for that, the application must do this
> tracking itself.

The facility seems useful for at least this postgres developer playing
with optionally using io_uring in parts of postgres. As you say, I'd
otherwise need to manually implement drains in userland.


> This adds a general SQE flag, IOSQE_IO_DRAIN. If a command is marked
> with this flag, then it will not be issued before previous commands have
> completed, and subsequent commands submitted after the drain will not be
> issued before the drain is started. If there are no pending commands,
> setting this flag does not change how the command is issued.

I think it'd be good if there were some documentation about how io_uring
interacts with writes done via a different io_uring queue, or
traditional write(2) et al.  And whether IOSQE_IO_DRAIN influences
that.

None of the docs I read say whether an io_uring fsync guarantees that a
write(2) which finished before the IORING_OP_FSYNC op was submitted is
made durable. Given the current implementation that clearly
seems to be the case, but it's not great to rely on the current
implementation as a user of data integrity operations.

Similarly, it'd be good if there were docs about how traditional
read/write/fsync and multiple io_uring queues interact in the face of
concurrent operations. For plain read/write we have posix providing some
baseline guarantees, but obviously doesn't mean anything for io_uring.

I suspect that most people's intuition will be "it's obvious", but also
that such intuitions are likely to differ between people.

Greetings,

Andres Freund

