* [PATCH 0/2] zonefs: use zone-append for aio with rwf append
@ 2020-07-20 13:21 Johannes Thumshirn
  2020-07-20 13:21 ` [PATCH 1/2] fs: fix kiocb ki_complete interface Johannes Thumshirn
  2020-07-20 13:21 ` [PATCH 2/2] zonefs: use zone-append for AIO as well Johannes Thumshirn
  0 siblings, 2 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-20 13:21 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: linux-fsdevel, Jens Axboe, linux-block, Christoph Hellwig,
	Johannes Thumshirn

When an asynchronous I/O that has the RWF_APPEND flag set is submitted to a
sequential zone file in zonefs, issue this I/O using REQ_OP_ZONE_APPEND.

On a successful write, the location the data has been written to is returned
in the AIO's res2 field.
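For illustration only, here is a minimal user-space sketch (not part of this
series) of how an application could consume this with the legacy AIO syscalls,
assuming the proposed semantics where res2 carries the write position; the
mount path is made up and error handling is omitted:

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

#ifndef RWF_APPEND
#define RWF_APPEND 0x00000010	/* from include/uapi/linux/fs.h */
#endif

int main(void)
{
	static char buf[4096] __attribute__((aligned(4096)));
	aio_context_t ctx = 0;
	struct iocb cb = { 0 }, *cbs[1] = { &cb };
	struct io_event ev;
	int fd;

	/* zonefs sequential zone files must be written with O_DIRECT */
	fd = open("/mnt/zonefs/seq/0", O_WRONLY | O_DIRECT);
	memset(buf, 'A', sizeof(buf));

	syscall(SYS_io_setup, 1, &ctx);

	cb.aio_lio_opcode = IOCB_CMD_PWRITE;
	cb.aio_fildes = fd;
	cb.aio_buf = (uintptr_t)buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;		/* ignored for append writes */
	cb.aio_rw_flags = RWF_APPEND;	/* triggers a zone append on zonefs */

	syscall(SYS_io_submit, ctx, 1, cbs);
	syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);

	/* res is the written byte count, res2 the position the data landed at */
	printf("wrote %lld bytes at offset %lld\n",
	       (long long)ev.res, (long long)ev.res2);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}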

This series shows no regressions with the current zonefs-tests suite, and
dedicated test cases will be introduced as well.

Damien Le Moal (1):
  fs: fix kiocb ki_complete interface

Johannes Thumshirn (1):
  zonefs: use zone-append for AIO as well

 drivers/block/loop.c              |   3 +-
 drivers/nvme/target/io-cmd-file.c |   3 +-
 drivers/target/target_core_file.c |   3 +-
 fs/aio.c                          |   2 +-
 fs/io_uring.c                     |   5 +-
 fs/zonefs/super.c                 | 143 ++++++++++++++++++++++++++----
 fs/zonefs/zonefs.h                |   3 +
 include/linux/fs.h                |   2 +-
 8 files changed, 139 insertions(+), 25 deletions(-)

-- 
2.26.2



* [PATCH 1/2] fs: fix kiocb ki_complete interface
  2020-07-20 13:21 [PATCH 0/2] zonefs: use zone-append for aio with rwf append Johannes Thumshirn
@ 2020-07-20 13:21 ` Johannes Thumshirn
  2020-07-20 13:38   ` Christoph Hellwig
  2020-07-20 13:21 ` [PATCH 2/2] zonefs: use zone-append for AIO as well Johannes Thumshirn
  1 sibling, 1 reply; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-20 13:21 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: linux-fsdevel, Jens Axboe, linux-block, Christoph Hellwig,
	Damien Le Moal, Johannes Thumshirn

From: Damien Le Moal <damien.lemoal@wdc.com>

The res and res2 fields of struct io_event are signed 64-bit values
(__s64 type). Allow the ki_complete method of struct kiocb to set 64-bit
values in these fields by changing its interface from the long type
to long long.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 drivers/block/loop.c              | 3 ++-
 drivers/nvme/target/io-cmd-file.c | 3 ++-
 drivers/target/target_core_file.c | 3 ++-
 fs/aio.c                          | 2 +-
 fs/io_uring.c                     | 5 +++--
 include/linux/fs.h                | 2 +-
 6 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a943207705dd..2a1e0f248436 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -513,7 +513,8 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
 		blk_mq_complete_request(rq);
 }
 
-static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void lo_rw_aio_complete(struct kiocb *iocb,
+			       long long ret, long long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..302bfdff55e5 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -123,7 +123,8 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 	return call_iter(iocb, &iter);
 }
 
-static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
+static void nvmet_file_io_done(struct kiocb *iocb,
+			       long long ret, long long ret2)
 {
 	struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
 	u16 status = NVME_SC_SUCCESS;
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 7143d03f0e02..87eac2313758 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -243,7 +243,8 @@ struct target_core_file_cmd {
 	struct kiocb	iocb;
 };
 
-static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+static void cmd_rw_aio_complete(struct kiocb *iocb,
+				long long ret, long long ret2)
 {
 	struct target_core_file_cmd *cmd;
 
diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4a9f17..38bce07f9733 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1418,7 +1418,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long long res, long long res2)
 {
 	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce63e1389568..a2e861b13b1e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2261,14 +2261,15 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 		io_complete_rw_common(&req->rw.kiocb, res, cs);
 }
 
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long long res, long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
 	__io_complete_rw(req, res, res2, NULL);
 }
 
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long long res,
+				  long long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index da90323b9f92..aa8e82496179 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -328,7 +328,7 @@ struct kiocb {
 	randomized_struct_fields_start
 
 	loff_t			ki_pos;
-	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+	void (*ki_complete)(struct kiocb *iocb, long long ret, long long ret2);
 	void			*private;
 	int			ki_flags;
 	u16			ki_hint;
-- 
2.26.2



* [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-20 13:21 [PATCH 0/2] zonefs: use zone-append for aio with rwf append Johannes Thumshirn
  2020-07-20 13:21 ` [PATCH 1/2] fs: fix kiocb ki_complete interface Johannes Thumshirn
@ 2020-07-20 13:21 ` Johannes Thumshirn
  2020-07-20 13:45   ` Christoph Hellwig
  2020-07-21 12:43   ` Kanchan Joshi
  1 sibling, 2 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-20 13:21 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: linux-fsdevel, Jens Axboe, linux-block, Christoph Hellwig,
	Johannes Thumshirn

If we get an async I/O iocb with an O_APPEND or RWF_APPEND flag set,
submit it using REQ_OP_ZONE_APPEND to the block layer.

As a REQ_OP_ZONE_APPEND bio must not be split, this does come with an
additional constraint, namely that the buffer submitted to zonefs must not
be bigger than the max zone append size of the underlying device. For
synchronous I/O we don't care about this constraint, as we can return short
writes; for AIO we need to return an error on buffers that are too big.

On a successful completion, the position the data is written to is
returned via AIO's res2 field to the calling application.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/zonefs/super.c  | 143 +++++++++++++++++++++++++++++++++++++++------
 fs/zonefs/zonefs.h |   3 +
 2 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 5832e9f69268..f155a658675b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,8 @@
 
 #include "zonefs.h"
 
+static struct bio_set zonefs_dio_bio_set;
+
 static inline int zonefs_zone_mgmt(struct zonefs_inode_info *zi,
 				   enum req_opf op)
 {
@@ -700,16 +702,71 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
 	.end_io			= zonefs_file_write_dio_end_io,
 };
 
+struct zonefs_dio {
+	struct kiocb		*iocb;
+	struct task_struct	*waiter;
+	int			error;
+	struct work_struct	work;
+	size_t			size;
+	u64			sector;
+	struct completion	completion;
+	struct bio		bio;
+};
+
+static void zonefs_dio_complete_work(struct work_struct *work)
+{
+	struct zonefs_dio *dio = container_of(work, struct zonefs_dio, work);
+	struct kiocb *iocb = dio->iocb;
+	size_t size = dio->size;
+	int ret;
+
+	ret = zonefs_file_write_dio_end_io(iocb, size, dio->error, 0);
+	if (ret == 0)
+		iocb->ki_pos += size;
+
+	iocb->ki_complete(iocb, ret, dio->sector);
+
+	bio_put(&dio->bio);
+}
+
+static void zonefs_file_dio_append_end_io(struct bio *bio)
+{
+	struct zonefs_dio *dio = container_of(bio, struct zonefs_dio, bio);
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (bio->bi_status)
+		dio->error = blk_status_to_errno(bio->bi_status);
+	else
+		dio->sector = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+	if (is_sync_kiocb(iocb)) {
+		struct task_struct *waiter = dio->waiter;
+
+		blk_wake_io_task(waiter);
+		WRITE_ONCE(dio->waiter, NULL);
+	} else {
+		INIT_WORK(&dio->work, zonefs_dio_complete_work);
+		queue_work(ZONEFS_SB(inode->i_sb)->s_dio_done_wq, &dio->work);
+	}
+
+	bio_release_pages(bio, false);
+	bio_put(bio);
+}
+
 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct zonefs_dio *dio;
 	unsigned int max;
 	struct bio *bio;
-	ssize_t size;
 	int nr_pages;
 	ssize_t ret;
+	bool sync = is_sync_kiocb(iocb);
+	bool polled;
+	blk_qc_t qc;
 
 	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
 	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
@@ -720,15 +777,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 		return 0;
 
 
-	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &zonefs_dio_bio_set);
 	if (!bio)
 		return -ENOMEM;
 
+	dio = container_of(bio, struct zonefs_dio, bio);
+	dio->iocb = iocb;
+	dio->error = 0;
+	if (sync) {
+		dio->waiter = current;
+		init_completion(&dio->completion);
+	}
+
 	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = zi->i_zsector;
 	bio->bi_write_hint = iocb->ki_hint;
 	bio->bi_ioprio = iocb->ki_ioprio;
 	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	bio->bi_end_io = zonefs_file_dio_append_end_io;
 	if (iocb->ki_flags & IOCB_DSYNC)
 		bio->bi_opf |= REQ_FUA;
 
@@ -737,21 +803,41 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 		bio_io_error(bio);
 		return ret;
 	}
-	size = bio->bi_iter.bi_size;
+	dio->size = bio->bi_iter.bi_size;
 	task_io_account_write(ret);
 
-	if (iocb->ki_flags & IOCB_HIPRI)
+	if (iocb->ki_flags & IOCB_HIPRI) {
 		bio_set_polled(bio, iocb);
+		polled = true;
+	}
 
-	ret = submit_bio_wait(bio);
+	bio_get(bio);
+	qc = submit_bio(bio);
 
-	bio_put(bio);
+	if (polled)
+		WRITE_ONCE(iocb->ki_cookie, qc);
 
-	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
-	if (ret >= 0) {
-		iocb->ki_pos += size;
-		return size;
+	if (!sync)
+		return -EIOCBQUEUED;
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(dio->waiter))
+			break;
+
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
 	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = zonefs_file_write_dio_end_io(iocb, dio->size,
+					   dio->error, 0);
+	if (ret == 0) {
+		ret = dio->size;
+		iocb->ki_pos += dio->size;
+	}
+	bio_put(bio);
 
 	return ret;
 }
@@ -813,7 +899,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 			goto inode_unlock;
 		}
 		mutex_unlock(&zi->i_truncate_mutex);
-		append = sync;
+		append = sync || iocb->ki_flags & IOCB_APPEND;
 	}
 
 	if (append)
@@ -821,8 +907,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
 				   &zonefs_write_dio_ops, sync);
-	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
-	    (ret > 0 || ret == -EIOCBQUEUED)) {
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		if (ret > 0)
 			count = ret;
 		mutex_lock(&zi->i_truncate_mutex);
@@ -1580,6 +1666,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root)
 		goto cleanup;
 
+	sbi->s_dio_done_wq = alloc_workqueue("zonefs-dio/%s", WQ_MEM_RECLAIM,
+					     0, sb->s_id);
+	if (!sbi->s_dio_done_wq)
+		goto cleanup;
+
 	/* Create and populate files in zone groups directories */
 	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
 		ret = zonefs_create_zgroup(&zd, t);
@@ -1603,8 +1694,14 @@ static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 
-	if (sb->s_root)
+	if (sb->s_root) {
 		d_genocide(sb->s_root);
+
+		if (sbi->s_dio_done_wq) {
+			destroy_workqueue(sbi->s_dio_done_wq);
+			sbi->s_dio_done_wq = NULL;
+		}
+	}
 	kill_block_super(sb);
 	kfree(sbi);
 }
@@ -1651,17 +1748,27 @@ static int __init zonefs_init(void)
 	if (ret)
 		return ret;
 
+	ret = bioset_init(&zonefs_dio_bio_set, 4,
+			  offsetof(struct zonefs_dio, bio), BIOSET_NEED_BVECS);
+	if (ret)
+		goto destroy_inodecache;
+
 	ret = register_filesystem(&zonefs_type);
-	if (ret) {
-		zonefs_destroy_inodecache();
-		return ret;
-	}
+	if (ret)
+		goto exit_bioset;
 
 	return 0;
+
+exit_bioset:
+	bioset_exit(&zonefs_dio_bio_set);
+destroy_inodecache:
+	zonefs_destroy_inodecache();
+	return ret;
 }
 
 static void __exit zonefs_exit(void)
 {
+	bioset_exit(&zonefs_dio_bio_set);
 	zonefs_destroy_inodecache();
 	unregister_filesystem(&zonefs_type);
 }
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 51141907097c..fe91df5eeffe 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -185,6 +185,9 @@ struct zonefs_sb_info {
 
 	unsigned int		s_max_open_zones;
 	atomic_t		s_open_zones;
+
+	/* AIO completions deferred from interrupt context */
+	struct workqueue_struct *s_dio_done_wq;
 };
 
 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
-- 
2.26.2



* Re: [PATCH 1/2] fs: fix kiocb ki_complete interface
  2020-07-20 13:21 ` [PATCH 1/2] fs: fix kiocb ki_complete interface Johannes Thumshirn
@ 2020-07-20 13:38   ` Christoph Hellwig
  2020-07-20 13:43     ` Damien Le Moal
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-20 13:38 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block,
	Christoph Hellwig

On Mon, Jul 20, 2020 at 10:21:17PM +0900, Johannes Thumshirn wrote:
> From: Damien Le Moal <damien.lemoal@wdc.com>
> 
> The res and res2 fields of struct io_event are signed 64-bit values
> (__s64 type). Allow the ki_complete method of struct kiocb to set 64-bit
> values in these fields by changing its interface from the long type
> to long long.

Which doesn't help if the consumers can't deal with these values.
But that shouldn't even be required for using zone append anyway..


* Re: [PATCH 1/2] fs: fix kiocb ki_complete interface
  2020-07-20 13:38   ` Christoph Hellwig
@ 2020-07-20 13:43     ` Damien Le Moal
  2020-07-20 13:47       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Damien Le Moal @ 2020-07-20 13:43 UTC (permalink / raw)
  To: Christoph Hellwig, Johannes Thumshirn
  Cc: linux-fsdevel, Jens Axboe, linux-block

On 2020/07/20 22:38, Christoph Hellwig wrote:
> On Mon, Jul 20, 2020 at 10:21:17PM +0900, Johannes Thumshirn wrote:
>> From: Damien Le Moal <damien.lemoal@wdc.com>
>>
>> The res and res2 fields of struct io_event are signed 64-bit values
>> (__s64 type). Allow the ki_complete method of struct kiocb to set 64-bit
>> values in these fields by changing its interface from the long type
>> to long long.
> 
> Which doesn't help if the consumers can't deal with these values.
> But that shouldn't even be required for using zone append anyway..
> 

Not sure what you mean...

res2 is used to pass back to the user the written file offset, a 64-bit byte
value, for the aio case (io_submit()/io_getevents()). The change does not break
the user interface at all; no changes are needed to any system call. The patch
just enables passing that 64-bit byte offset. The consumer of it would be the
user application, and yes, it does need to know what it is doing. But if it is
using zonefs, the application likely knows.

-- 
Damien Le Moal
Western Digital Research


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-20 13:21 ` [PATCH 2/2] zonefs: use zone-append for AIO as well Johannes Thumshirn
@ 2020-07-20 13:45   ` Christoph Hellwig
  2020-07-20 16:48     ` Johannes Thumshirn
  2020-07-21 12:43   ` Kanchan Joshi
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-20 13:45 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block,
	Christoph Hellwig

On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
> On a successful completion, the position the data is written to is
> returned via AIO's res2 field to the calling application.

That is a major, and except for this changelog, undocumented ABI
change.  We had the whole discussion about reporting append results
in a few threads and the issues with that in io_uring.  So let's
have that discussion there and don't mix it up with how zonefs
writes data.  Without that a lot of the boilerplate code should
also go away.

> -	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
> -	    (ret > 0 || ret == -EIOCBQUEUED)) {
> +
> +	if (ret > 0 || ret == -EIOCBQUEUED) {
>  		if (ret > 0)
>  			count = ret;
>  		mutex_lock(&zi->i_truncate_mutex);

Don't we still need the ZONEFS_ZTYPE_SEQ check after updating count, but
before updating i_wpoffset?  Also how is this related to the rest
of the patch?

> @@ -1580,6 +1666,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
>  	if (!sb->s_root)
>  		goto cleanup;
>  
> +	sbi->s_dio_done_wq = alloc_workqueue("zonefs-dio/%s", WQ_MEM_RECLAIM,
> +					     0, sb->s_id);
> +	if (!sbi->s_dio_done_wq)
> +		goto cleanup;
> +

Can you reuse the sb->s_dio_done_wq pointer, and maybe even the helper
to initialize it?
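(For reference, a rough sketch of what that reuse might look like, assuming
the helper meant here is sb_init_dio_done_wq() from fs/internal.h; untested:)

	if (!sync && !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret)
			return ret;
	}
	...
	/* async completion then defers to the shared per-sb workqueue */
	queue_work(inode->i_sb->s_dio_done_wq, &dio->work);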


* Re: [PATCH 1/2] fs: fix kiocb ki_complete interface
  2020-07-20 13:43     ` Damien Le Moal
@ 2020-07-20 13:47       ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-20 13:47 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Christoph Hellwig, Johannes Thumshirn, linux-fsdevel, Jens Axboe,
	linux-block

On Mon, Jul 20, 2020 at 01:43:43PM +0000, Damien Le Moal wrote:
> On 2020/07/20 22:38, Christoph Hellwig wrote:
> > On Mon, Jul 20, 2020 at 10:21:17PM +0900, Johannes Thumshirn wrote:
> >> From: Damien Le Moal <damien.lemoal@wdc.com>
> >>
> >> The res and res2 fields of struct io_event are signed 64-bit values
> >> (__s64 type). Allow the ki_complete method of struct kiocb to set 64-bit
> >> values in these fields by changing its interface from the long type
> >> to long long.
> > 
> > Which doesn't help if the consumers can't deal with these values.
> > But that shouldn't even be required for using zone append anyway..
> > 
> 
> Not sure what you mean...
> 
> res2 is used to pass back to the user the written file offset, 64bits Bytes
> value, for aio case (io_submit()/io_getevent()). The change does not break user
> interface at all, no changes needed to any system call. The patch  just enables
> passing that 64bit byte offset. The consumer of it would be the user
> application, and yes, it does need to know what it is doing. But if it is using
> zonefs, likely, the application knows.

Please start a discussion on this ABI on the linux-aio and linux-api
lists.  If we support that for zonefs we should also support it for
other direct I/O writes.  And I'm not sure an API that only works
with aio and not io_uring is going to win a lot of friends these days.


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-20 13:45   ` Christoph Hellwig
@ 2020-07-20 16:48     ` Johannes Thumshirn
  2020-07-21  5:54       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-20 16:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block

On 20/07/2020 15:45, Christoph Hellwig wrote:
> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>> On a successful completion, the position the data is written to is
>> returned via AIO's res2 field to the calling application.
> 
> That is a major, and except for this changelog, undocumented ABI
> change.  We had the whole discussion about reporting append results
> in a few threads and the issues with that in io_uring.  So let's
> have that discussion there and don't mix it up with how zonefs
> writes data.  Without that a lot of the boilerplate code should
> also go away.
> 

OK maybe I didn't remember correctly, but wasn't this all around 
io_uring and how we'd report the location back for raw block device
access?

I'll re-read the threads.

>> -	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
>> -	    (ret > 0 || ret == -EIOCBQUEUED)) {
>> +
>> +	if (ret > 0 || ret == -EIOCBQUEUED) {
>>  		if (ret > 0)
>>  			count = ret;
>>  		mutex_lock(&zi->i_truncate_mutex);
> 
> Don't we still need the ZONEFS_ZTYPE_SEQ after updating count, but
> before updating i_wpoffset?  Also how is this related to the rest
> of the patch?

This looks like a leftover from development that I forgot to clean up.
Will be addressing it in the next round.

> 
>> @@ -1580,6 +1666,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
>>  	if (!sb->s_root)
>>  		goto cleanup;
>>  
>> +	sbi->s_dio_done_wq = alloc_workqueue("zonefs-dio/%s", WQ_MEM_RECLAIM,
>> +					     0, sb->s_id);
>> +	if (!sbi->s_dio_done_wq)
>> +		goto cleanup;
>> +
> 
> Can you reuse the sb->s_dio_done_wq pointer, and maybe even the helper
> to initialize it?
> 

IIRC I had some issues with that and then decided to just roll my own as
the s_dio_done_wq will be allocated for every IO if I read iomap correctly.
Zonefs on the other hand needs the dio for all file accesses on sequential 
files, so creating a dedicated wq didn't seem problematic for me.



* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-20 16:48     ` Johannes Thumshirn
@ 2020-07-21  5:54       ` Christoph Hellwig
  2020-07-22 12:43         ` Johannes Thumshirn
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-21  5:54 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Christoph Hellwig, Damien Le Moal, linux-fsdevel, Jens Axboe,
	linux-block

On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
> On 20/07/2020 15:45, Christoph Hellwig wrote:
> > On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
> >> On a successful completion, the position the data is written to is
> >> returned via AIO's res2 field to the calling application.
> > 
> > That is a major, and except for this changelog, undocumented ABI
> > change.  We had the whole discussion about reporting append results
> > in a few threads and the issues with that in io_uring.  So let's
> > have that discussion there and don't mix it up with how zonefs
> > writes data.  Without that a lot of the boilerplate code should
> > also go away.
> > 
> 
> OK maybe I didn't remember correctly, but wasn't this all around 
> io_uring and how we'd report the location back for raw block device
> access?

Report the write offset.  The author seems to be hell bent on making
it block device specific, but that is a horrible idea as it is just
as useful for normal file systems (or zonefs).


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-20 13:21 ` [PATCH 2/2] zonefs: use zone-append for AIO as well Johannes Thumshirn
  2020-07-20 13:45   ` Christoph Hellwig
@ 2020-07-21 12:43   ` Kanchan Joshi
  2020-07-22 14:32     ` Johannes Thumshirn
  1 sibling, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2020-07-21 12:43 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block,
	Christoph Hellwig

On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>If we get an async I/O iocb with an O_APPEND or RWF_APPEND flag set,
>submit it using REQ_OP_ZONE_APPEND to the block layer.
>
>As an REQ_OP_ZONE_APPEND bio must not be split, this does come with an
>additional constraint, namely the buffer submitted to zonefs must not be
>bigger than the max zone append size of the underlying device. For
>synchronous I/O we don't care about this constraint as we can return short
>writes, for AIO we need to return an error on too big buffers.

I wonder what part of the patch implements that constraint on large
buffers and avoids short writes.
The existing code seems to trim the iov_iter at the outset.

        max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
        max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
        iov_iter_truncate(from, max);

This will prevent a large buffer from seeing that error, and will lead to a partial write.

>On a successful completion, the position the data is written to is
>returned via AIO's res2 field to the calling application.
>
>Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>---
> fs/zonefs/super.c  | 143 +++++++++++++++++++++++++++++++++++++++------
> fs/zonefs/zonefs.h |   3 +
> 2 files changed, 128 insertions(+), 18 deletions(-)
>
>diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>index 5832e9f69268..f155a658675b 100644
>--- a/fs/zonefs/super.c
>+++ b/fs/zonefs/super.c
>@@ -24,6 +24,8 @@
>
> #include "zonefs.h"
>
>+static struct bio_set zonefs_dio_bio_set;
>+
> static inline int zonefs_zone_mgmt(struct zonefs_inode_info *zi,
> 				   enum req_opf op)
> {
>@@ -700,16 +702,71 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
> 	.end_io			= zonefs_file_write_dio_end_io,
> };
>
>+struct zonefs_dio {
>+	struct kiocb		*iocb;
>+	struct task_struct	*waiter;
>+	int			error;
>+	struct work_struct	work;
>+	size_t			size;
>+	u64			sector;
>+	struct completion	completion;
>+	struct bio		bio;
>+};

How about this (will save 32 bytes) - 
+struct zonefs_dio {
+       struct kiocb            *iocb;
+       struct task_struct      *waiter;
+       int                     error;
+	union {
+       	struct work_struct      work;   //only for async IO
+       	struct completion       completion; //only for sync IO
+	};
+       size_t                  size;
+       u64                     sector;
+       struct bio              bio;
+};
And the dio->error field is not required.
I see it being used in one place -
+       ret = zonefs_file_write_dio_end_io(iocb, dio->size,
+                                          dio->error, 0);
Here the error code can be picked up from dio->bio.
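Something like this, reusing the status already carried by the embedded bio
(just a sketch):
+       ret = zonefs_file_write_dio_end_io(iocb, dio->size,
+                                          blk_status_to_errno(dio->bio.bi_status), 0);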


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-21  5:54       ` Christoph Hellwig
@ 2020-07-22 12:43         ` Johannes Thumshirn
  2020-07-22 13:02           ` Damien Le Moal
  2020-07-22 14:51           ` Christoph Hellwig
  0 siblings, 2 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-22 12:43 UTC (permalink / raw)
  To: hch
  Cc: Christoph Hellwig, Damien Le Moal, linux-fsdevel, Jens Axboe,
	linux-block

On 21/07/2020 07:54, Christoph Hellwig wrote:
> On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
>> On 20/07/2020 15:45, Christoph Hellwig wrote:
>>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>>>> On a successful completion, the position the data is written to is
>>>> returned via AIO's res2 field to the calling application.
>>>
>>> That is a major, and except for this changelog, undocumented ABI
>>> change.  We had the whole discussion about reporting append results
>>> in a few threads and the issues with that in io_uring.  So let's
>>> have that discussion there and don't mix it up with how zonefs
>>> writes data.  Without that a lot of the boilerplate code should
>>> also go away.
>>>
>>
>> OK maybe I didn't remember correctly, but wasn't this all around 
>> io_uring and how we'd report the location back for raw block device
>> access?
> 
> Report the write offset.  The author seems to be hell bent on making
> it block device specific, but that is a horrible idea as it is just
> as useful for normal file systems (or zonefs).

After having looked into io_uring I don't think there is anything that
prevents io_uring from picking up the write offset from ki_complete's
res2 argument. As of now io_uring ignores the field but that can be
changed.

The reporting of the write offset to user-space still needs to be 
decided on from an io_uring PoV.

So the only thing that needs to be done from a zonefs perspective is 
documenting the use of res2 and CC linux-aio and linux-api (including
an update of the io_getevents man page).

Or am I completely off track now?

Thanks,
	Johannes


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-22 12:43         ` Johannes Thumshirn
@ 2020-07-22 13:02           ` Damien Le Moal
  2020-07-22 14:53             ` Christoph Hellwig
  2020-07-22 14:51           ` Christoph Hellwig
  1 sibling, 1 reply; 18+ messages in thread
From: Damien Le Moal @ 2020-07-22 13:02 UTC (permalink / raw)
  To: Johannes Thumshirn, hch
  Cc: Christoph Hellwig, linux-fsdevel, Jens Axboe, linux-block

On 2020/07/22 21:43, Johannes Thumshirn wrote:
> On 21/07/2020 07:54, Christoph Hellwig wrote:
>> On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
>>> On 20/07/2020 15:45, Christoph Hellwig wrote:
>>>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>>>>> On a successful completion, the position the data is written to is
>>>>> returned via AIO's res2 field to the calling application.
>>>>
>>>> That is a major, and except for this changelog, undocumented ABI
>>>> change.  We had the whole discussion about reporting append results
>>>> in a few threads and the issues with that in io_uring.  So let's
>>>> have that discussion there and don't mix it up with how zonefs
>>>> writes data.  Without that a lot of the boilerplate code should
>>>> also go away.
>>>>
>>>
>>> OK maybe I didn't remember correctly, but wasn't this all around 
>>> io_uring and how we'd report the location back for raw block device
>>> access?
>>
>> Report the write offset.  The author seems to be hell bent on making
>> it block device specific, but that is a horrible idea as it is just
>> as useful for normal file systems (or zonefs).
> 
> After having looked into io_uring I don't think there is anything that
> prevents io_uring from picking up the write offset from ki_complete's
> res2 argument. As of now io_uring ignores the field but that can be
> changed.
> 
> The reporting of the write offset to user-space still needs to be 
> decided on from an io_uring PoV.
> 
> So the only thing that needs to be done from a zonefs perspective is 
> documenting the use of res2 and CC linux-aio and linux-abi (including
> an update of the io_getevents man page).
> 
> Or am I completely off track now?

That is the general idea. But Christoph's point was that reporting the effective
write offset back to user space can be done not only for zone append, but also
for regular FS/files that are open with O_APPEND and being written with AIOs,
legacy or io_uring. Since, for this case, the aio->aio_offset field is ignored
and the kiocb pos is initialized with the file size, then incremented with the
size for the next AIO, the user never actually sees the actual write offset of
its AIOs. Reporting that back for regular files too can be useful, even though
current applications can do without this (or do not use O_APPEND because it is
lacking).

Christoph, please loudly shout at me if I misunderstood you :)

For the regular FS/file case, getting the written file offset is simple. We only
need to use kiocb->pos. That is not a per-FS change.

For the user interface, yes, I agree, res2 is the way to go. And we need to
decide for io_uring how to do it. That is an API change, backward compatible for
legacy AIO, but still a change. So linux-aio and linux-api lists should be
consulted. Ideally, for io_uring, something backward compatible would be nice
too. Not sure how to do it yet.

Whatever the interface, plugging zonefs into it is the trivial part as you
already did the heavy lifting by writing the async zone append path.


> 
> Thanks,
> 	Johannes
> 


-- 
Damien Le Moal
Western Digital Research


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-21 12:43   ` Kanchan Joshi
@ 2020-07-22 14:32     ` Johannes Thumshirn
  0 siblings, 0 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-22 14:32 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block,
	Christoph Hellwig

On 21/07/2020 15:02, Kanchan Joshi wrote:
> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>> If we get an async I/O iocb with an O_APPEND or RWF_APPEND flag set,
>> submit it using REQ_OP_ZONE_APPEND to the block layer.
>>
>> As an REQ_OP_ZONE_APPEND bio must not be split, this does come with an
>> additional constraint, namely the buffer submitted to zonefs must not be
>> bigger than the max zone append size of the underlying device. For
>> synchronous I/O we don't care about this constraint as we can return short
>> writes, for AIO we need to return an error on too big buffers.
> 
> I wonder what part of the patch implements that constraint on large
> buffer and avoids short-write.
> Existing code seems to trim iov_iter in the outset. 
> 
>         max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
>         max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
>         iov_iter_truncate(from, max);

This actually needs an 'if (sync)' before the iov_iter_truncate(), you're right.
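Roughly (a sketch of the idea; the exact error code for the async case is
still open):

	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
	if (iov_iter_count(from) > max) {
		/* AIO must not be silently shortened, fail it instead */
		if (!is_sync_kiocb(iocb))
			return -EINVAL;
		/* synchronous I/O may legally return a short write */
		iov_iter_truncate(from, max);
	}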

> 
> This will prevent large-buffer seeing that error, and will lead to partial write.
> 
>> On a successful completion, the position the data is written to is
>> returned via AIO's res2 field to the calling application.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>> fs/zonefs/super.c  | 143 +++++++++++++++++++++++++++++++++++++++------
>> fs/zonefs/zonefs.h |   3 +
>> 2 files changed, 128 insertions(+), 18 deletions(-)
>>
>> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>> index 5832e9f69268..f155a658675b 100644
>> --- a/fs/zonefs/super.c
>> +++ b/fs/zonefs/super.c
>> @@ -24,6 +24,8 @@
>>
>> #include "zonefs.h"
>>
>> +static struct bio_set zonefs_dio_bio_set;
>> +
>> static inline int zonefs_zone_mgmt(struct zonefs_inode_info *zi,
>> 				   enum req_opf op)
>> {
>> @@ -700,16 +702,71 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
>> 	.end_io			= zonefs_file_write_dio_end_io,
>> };
>>
>> +struct zonefs_dio {
>> +	struct kiocb		*iocb;
>> +	struct task_struct	*waiter;
>> +	int			error;
>> +	struct work_struct	work;
>> +	size_t			size;
>> +	u64			sector;
>> +	struct completion	completion;
>> +	struct bio		bio;
>> +};
> 
> How about this (will save 32 bytes) - 
> +struct zonefs_dio {
> +       struct kiocb            *iocb;
> +       struct task_struct      *waiter;
> +       int                     error;
> +	union {
> +       	struct work_struct      work;   //only for async IO
> +       	struct completion       completion; //only for sync IO
> +	};
> +       size_t                  size;
> +       u64                     sector;
> +       struct bio              bio;
> +};
> And dio->error field is not required.
> I see it being used at one place -
> +       ret = zonefs_file_write_dio_end_io(iocb, dio->size,
> +                                          dio->error, 0);
> Here error-code can be picked from dio->bio.
> 

Indeed, I did that and also removed the unused completion from
zonefs_dio and made a union of work and waiter.
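The revised structure would then presumably look something like this (sketch,
pending the next version of the series):

struct zonefs_dio {
	struct kiocb		*iocb;
	int			error;	/* could also be dropped for bio.bi_status */
	union {
		struct task_struct	*waiter;	/* sync I/O */
		struct work_struct	work;		/* async I/O */
	};
	size_t			size;
	u64			sector;
	struct bio		bio;
};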



* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-22 12:43         ` Johannes Thumshirn
  2020-07-22 13:02           ` Damien Le Moal
@ 2020-07-22 14:51           ` Christoph Hellwig
  2020-07-22 15:00             ` Johannes Thumshirn
  2020-07-24 13:57             ` Kanchan Joshi
  1 sibling, 2 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-22 14:51 UTC (permalink / raw)
  To: Johannes Thumshirn
  Cc: hch, Christoph Hellwig, Damien Le Moal, linux-fsdevel,
	Jens Axboe, linux-block

On Wed, Jul 22, 2020 at 12:43:21PM +0000, Johannes Thumshirn wrote:
> On 21/07/2020 07:54, Christoph Hellwig wrote:
> > On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
> >> On 20/07/2020 15:45, Christoph Hellwig wrote:
> >>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
> >>>> On a successful completion, the position the data is written to is
> >>>> returned via AIO's res2 field to the calling application.
> >>>
> >>> That is a major, and except for this changelog, undocumented ABI
> >>> change.  We had the whole discussion about reporting append results
> >>> in a few threads and the issues with that in io_uring.  So let's
> >>> have that discussion there and don't mix it up with how zonefs
> >>> writes data.  Without that a lot of the boilerplate code should
> >>> also go away.
> >>>
> >>
> >> OK maybe I didn't remember correctly, but wasn't this all around 
> >> io_uring and how we'd report the location back for raw block device
> >> access?
> > 
> > Report the write offset.  The author seems to be hell bent on making
> > it block device specific, but that is a horrible idea as it is just
> > as useful for normal file systems (or zonefs).
> 
> > After having looked into io_uring I don't think there is anything that
> > prevents io_uring from picking up the write offset from ki_complete's
> > res2 argument. As of now io_uring ignores the field but that can be
> changed.

Sure.  Except for the fact that the io_uring CQE doesn't have space
for it.  See the currently ongoing discussion on that..

> So the only thing that needs to be done from a zonefs perspective is 
> documenting the use of res2 and CC linux-aio and linux-abi (including
> an update of the io_getevents man page).
> 
> Or am I completely off track now?

Yes.  We should not have a different ABI just for zonefs.  We need to
support this feature in a generic way and not as a weird one off for
one filesystem and only with the legacy AIO interface.

Either way please make sure you properly separate the implementation
(using Write vs Zone Append in zonefs) from the interface (returning
the actually written offset from appending writes), as they are quite
separate issues.


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-22 13:02           ` Damien Le Moal
@ 2020-07-22 14:53             ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2020-07-22 14:53 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Johannes Thumshirn, hch, Christoph Hellwig, linux-fsdevel,
	Jens Axboe, linux-block

On Wed, Jul 22, 2020 at 01:02:14PM +0000, Damien Le Moal wrote:
> That is the general idea. But Christoph point was that reporting the effective
> write offset back to user space can be done not only for zone append, but also
> for regular FS/files that are open with O_APPEND and being written with AIOs,
> legacy or io_uring. Since for this case, the aio->aio_offset field is ignored
> and the kiocb pos is initialized with the file size, then incremented with size
> for the next AIO, the user never actually sees the actual write offset of its
> AIOs. Reporting that back for regular files too can be useful, even though
> current application can do without this (or do not use O_APPEND because it is
> lacking).
> 
> Christoph, please loudly shout at me if I misunderstood you :)

I'd never shout at you :)  But yes, this is correct.


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-22 14:51           ` Christoph Hellwig
@ 2020-07-22 15:00             ` Johannes Thumshirn
  2020-07-24 13:57             ` Kanchan Joshi
  1 sibling, 0 replies; 18+ messages in thread
From: Johannes Thumshirn @ 2020-07-22 15:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: hch, Damien Le Moal, linux-fsdevel, Jens Axboe, linux-block

On 22/07/2020 16:52, Christoph Hellwig wrote:
> On Wed, Jul 22, 2020 at 12:43:21PM +0000, Johannes Thumshirn wrote:
>> On 21/07/2020 07:54, Christoph Hellwig wrote:
>>> On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
>>>> On 20/07/2020 15:45, Christoph Hellwig wrote:
>>>>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>>>>>> On a successful completion, the position the data is written to is
>>>>>> returned via AIO's res2 field to the calling application.
>>>>>
>>>>> That is a major, and except for this changelog, undocumented ABI
>>>>> change.  We had the whole discussion about reporting append results
>>>>> in a few threads and the issues with that in io_uring.  So let's
>>>>> have that discussion there and don't mix it up with how zonefs
>>>>> writes data.  Without that a lot of the boilerplate code should
>>>>> also go away.
>>>>>
>>>>
>>>> OK maybe I didn't remember correctly, but wasn't this all around 
>>>> io_uring and how we'd report the location back for raw block device
>>>> access?
>>>
>>> Report the write offset.  The author seems to be hell bent on making
>>> it block device specific, but that is a horrible idea as it is just
>>> as useful for normal file systems (or zonefs).
>>
>> After having looked into io_uring I don't think there is anything that
>> prevents io_uring from picking up the write offset from ki_complete's
>> res2 argument. As of now io_uring ignores the field but that can be
>> changed.
> 
> Sure.  Except for the fact that the io_uring CQE doesn't have space
> for it.  See the currently ongoing discussion on that..

That one I was aware of, but I thought once that discussion has settled
the write offset can be copied from res2 into whatever people have agreed
on by then.

> 
>> So the only thing that needs to be done from a zonefs perspective is 
>> documenting the use of res2 and CC linux-aio and linux-abi (including
>> an update of the io_getevents man page).
>>
>> Or am I completely off track now?
> 
> Yes.  We should not have a different ABI just for zonefs.  We need to
> support this feature in a generic way and not as a weird one off for
> one filesystem and only with the legacy AIO interface.

OK, will have a look.

> Either way please make sure you properly separate the implementation
> (using Write vs Zone Append in zonefs) from the interface (returning
> the actually written offset from appending writes), as they are quite
> separate issues.

So doing async RWF_APPEND writes using Zone Append isn't the problem here,
it's "only" the reporting of the write offset back to user-space? So once
we have sorted this out we can start issuing zone appends for zonefs async
writes?

Thanks,
	Johannes



* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-22 14:51           ` Christoph Hellwig
  2020-07-22 15:00             ` Johannes Thumshirn
@ 2020-07-24 13:57             ` Kanchan Joshi
  2020-07-27  3:12               ` Damien Le Moal
  1 sibling, 1 reply; 18+ messages in thread
From: Kanchan Joshi @ 2020-07-24 13:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Johannes Thumshirn, hch, Damien Le Moal, linux-fsdevel,
	Jens Axboe, linux-block

On Wed, Jul 22, 2020 at 8:22 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Jul 22, 2020 at 12:43:21PM +0000, Johannes Thumshirn wrote:
> > On 21/07/2020 07:54, Christoph Hellwig wrote:
> > > On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
> > >> On 20/07/2020 15:45, Christoph Hellwig wrote:
> > >>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
> > >>>> On a successful completion, the position the data is written to is
> > >>>> returned via AIO's res2 field to the calling application.
> > >>>
> > >>> That is a major, and except for this changelog, undocumented ABI
> > >>> change.  We had the whole discussion about reporting append results
> > >>> in a few threads and the issues with that in io_uring.  So let's
> > >>> have that discussion there and don't mix it up with how zonefs
> > >>> writes data.  Without that a lot of the boilerplate code should
> > >>> also go away.
> > >>>
> > >>
> > >> OK maybe I didn't remember correctly, but wasn't this all around
> > >> io_uring and how we'd report the location back for raw block device
> > >> access?
> > >
> > > Report the write offset.  The author seems to be hell bent on making
> > > it block device specific, but that is a horrible idea as it is just
> > > as useful for normal file systems (or zonefs).

The patchset only made the feature opt-in, due to the constraints that we
had. ZoneFS was always considered and it fits just as well as block-IO.
You already know that we did not have enough room in io_uring, which
did not really allow thinking of other FSes (any-size cached-writes).
After working on multiple schemes in io_uring, we now have 64 bits, and
we will return the absolute offset in bytes (in V4).

But still, it comes at the cost of sacrificing the ability to do a
short write, which is fine for zone append but may trigger a
behavior change for regular file appends.
A write may become short if it is:
- spanning beyond end-of-file
- going beyond RLIMIT_FSIZE limit
- probably for MAX_NON_LFS as well

We need to fail all of the above cases if we extend the current model to
regular FSes, and that may break existing file-append users.
There is a class of applications which just append without caring about the
exact location - the attempt was not to affect these while we try to enable
the path for zone append.

The patches use O/RWF_APPEND, but try to isolate an appending write
(IOCB_APPEND) from an appending write that returns the location
(IOCB_ZONE_APPEND - it can be renamed when we actually have all that it
takes to apply the feature to regular FSes).
Enabling block-IO and zonefs now, and keeping regular FSes as future
work - I hope that does not sound too bad!

> > After having looked into io_uring I don't think there is anything that
> > prevents io_uring from picking up the write offset from ki_complete's
> > res2 argument. As of now io_uring ignores the field but that can be
> > changed.

We use ret2 of ki_complete to collect the append offset in io_uring too.
It's just that, unlike aio, it required some work to send it to user-space.


--
Kanchan Joshi


* Re: [PATCH 2/2] zonefs: use zone-append for AIO as well
  2020-07-24 13:57             ` Kanchan Joshi
@ 2020-07-27  3:12               ` Damien Le Moal
  0 siblings, 0 replies; 18+ messages in thread
From: Damien Le Moal @ 2020-07-27  3:12 UTC (permalink / raw)
  To: Kanchan Joshi, Christoph Hellwig
  Cc: Johannes Thumshirn, hch, linux-fsdevel, Jens Axboe, linux-block

On 2020/07/24 22:58, Kanchan Joshi wrote:
> On Wed, Jul 22, 2020 at 8:22 PM Christoph Hellwig <hch@lst.de> wrote:
>>
>> On Wed, Jul 22, 2020 at 12:43:21PM +0000, Johannes Thumshirn wrote:
>>> On 21/07/2020 07:54, Christoph Hellwig wrote:
>>>> On Mon, Jul 20, 2020 at 04:48:50PM +0000, Johannes Thumshirn wrote:
>>>>> On 20/07/2020 15:45, Christoph Hellwig wrote:
>>>>>> On Mon, Jul 20, 2020 at 10:21:18PM +0900, Johannes Thumshirn wrote:
>>>>>>> On a successful completion, the position the data is written to is
>>>>>>> returned via AIO's res2 field to the calling application.
>>>>>>
>>>>>> That is a major, and except for this changelog, undocumented ABI
>>>>>> change.  We had the whole discussion about reporting append results
>>>>>> in a few threads and the issues with that in io_uring.  So let's
>>>>>> have that discussion there and don't mix it up with how zonefs
>>>>>> writes data.  Without that a lot of the boilerplate code should
>>>>>> also go away.
>>>>>>
>>>>>
>>>>> OK maybe I didn't remember correctly, but wasn't this all around
>>>>> io_uring and how we'd report the location back for raw block device
>>>>> access?
>>>>
>>>> Report the write offset.  The author seems to be hell bent on making
>>>> it block device specific, but that is a horrible idea as it is just
>>>> as useful for normal file systems (or zonefs).
> 
> Patchset only made the feature opt-in, due to the constraints that we
> had. ZoneFS was always considered and it fits as fine as block-IO.
> You already know that  we did not have enough room in io-uring, which
> did not really allow to think of other FS (any-size cached-writes).
> After working on multiple schemes in io_uring, now we have 64bits, and
> we will return absolute offset in bytes now (in V4).
> 
> But still, it comes at the cost of sacrificing the ability to do
> short-write, which is fine for zone-append but may trigger
> behavior-change for regular file-append.
> Write may become short if
> - spanning beyond end-of-file

For an O_APPEND/RWF_APPEND write, the file offset written is exactly *at* EOF.
There is no "write spanning EOF", the write is always completely beyond EOF.
This is not a problem, this is the normal behavior of append writes to regular
files.

> - going beyond RLIMIT_FSIZE limit
> - probably for MAX_NON_LFS as well

These limits apply to all write operations, regardless of zone append being used
or not.

> 
> We need to fail all above cases if we extend the current model for
> regular FS. And that may break existing file-append users.
> Class of applications which just append without caring about the exact
> location - attempt was not to affect these while we try to enable the
> path for zone-append.

It seems like you are confusing the interface semantic with its
implementation... For a regular POSIX-compliant file system, the implementation
of asynchronous append IOs to a file has to comply with the POSIX-defined
behavior, regardless of the underlying command used for issuing the writes to
the device. We have btrfs running in the lab using zone append for *all* file
data writes, and that does not change the behavior of any system call. xfstests
still pass. (Note: patches coming soon).

Implemented correctly, the written offset reporting change will also be backward
compatible for regular file systems: applications using O_APPEND/RWF_APPEND AIOs
to regular files today will continue working. We should have the io_uring interface
backward compatible too. How to do that must first be fleshed out: we need to
clarify the interface and its semantic first. Then the implementation will
naturally follow on solid ground.

For the interface semantic, 3 cases must be considered:

(1) Regular files: the only change is that the written *file offset* is returned
to the application in the completion path. No other change is needed. Form the
application perspective, the asynchronous append writes will still result in the
same *file* data layout as before, that is, data is written sequentially at the
end of the file, in the same order a AIOs are issued by the application.

(2) zonefs: This is not a POSIX file system, that is, *file* data placement is
directly dependent on device data placement. This means that for asynchronous
append writes, we need different behaviors:
  (a) Writes at the end of the file without O_APPEND/RWF_APPEND: the data must
be written exactly at the application-specified offset, which excludes the use
of zone append writes.
  (b) Append writes with O_APPEND/RWF_APPEND: The plan is to use zone append for
these, with the result that file data may not end up being written in the same
order as the AIO issuing order. The other semantic change is that if one AIO is too
large, it will be failed. A write spanning the file zone capacity will be short
and any append write to a file with a zone already full will be failed (the file
maximum size is already reached when the zone is full).

(3) block device files: O_APPEND/RWF_APPEND is meaningless for these. So the
problems start here: this needs to be enabled in a sensible way for zoned block
devices to mean "the application wants to do a zone append". There should not be
any change for regular block devices. From there, the IO behavior is the same as
for zonefs case (2b) above.

Note: I may be forgetting some points in this list above. We need to complete
this into a coherent specification, including the io_uring interface, and get
linux-aio and linux-api ACKs to proceed.

> 
> Patches use O/RWF_APPEND, but try to isolate appending-write
> (IOCB_APPEND) from appending-write-that-returns-location
> (IOCB_ZONE_APPEND - can be renamed when we actually have all that it
> takes to apply the feature in regular FS).

And back to Christoph's point: this isolation is not necessary. For an
asynchronous append write, we can return the written *file offset* location in
all cases.

> Enabling block-IO and zoneFS now, and keeping regular-FS as future
> work - hope that does not sound too bad!

Implementing the written offset reporting interface will be done in the generic
VFS upper layer, and that will naturally enable regular file systems too. This
should not be a future work, especially if you consider zonefs, since that is a
file system (not a regular one, but the interface is the same as that of a
regular file system).

>>> After having looked into io_uring I don't this there is anything that
>>> prevents io_uring from picking up the write offset from ki_complete's
>>> res2 argument. As of now io_uring ignores the filed but that can be
>>> changed.
> 
> We use ret2 of ki_complete to collect append-offset in io_uring too.
> It's just that unlike aio it required some work to send it to user-space.
> 
> 
> --
> Kanchan Joshi
> 


-- 
Damien Le Moal
Western Digital Research
