All of lore.kernel.org
 help / color / mirror / Atom feed
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
To: Damien Le Moal <Damien.LeMoal@wdc.com>
Cc: linux-fsdevel@vger.kernel.org, Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org, Christoph Hellwig <hch@lst.de>,
	Johannes Thumshirn <johannes.thumshirn@wdc.com>
Subject: [PATCH 2/2] zonefs: use zone-append for AIO as well
Date: Mon, 20 Jul 2020 22:21:18 +0900	[thread overview]
Message-ID: <20200720132118.10934-3-johannes.thumshirn@wdc.com> (raw)
In-Reply-To: <20200720132118.10934-1-johannes.thumshirn@wdc.com>

If we get an async I/O iocb with an O_APPEND or RWF_APPEND flag set,
submit it using REQ_OP_ZONE_APPEND to the block layer.

As a REQ_OP_ZONE_APPEND bio must not be split, this comes with an
additional constraint: the buffer submitted to zonefs must not be bigger
than the max zone append size of the underlying device. For synchronous
I/O we don't care about this constraint, as we can return short writes;
for AIO we need to return an error if the buffer is too big.

On successful completion, the position the data was written to is
returned to the calling application via AIO's res2 field.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/zonefs/super.c  | 143 +++++++++++++++++++++++++++++++++++++++------
 fs/zonefs/zonefs.h |   3 +
 2 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 5832e9f69268..f155a658675b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,8 @@
 
 #include "zonefs.h"
 
+static struct bio_set zonefs_dio_bio_set;
+
 static inline int zonefs_zone_mgmt(struct zonefs_inode_info *zi,
 				   enum req_opf op)
 {
@@ -700,16 +702,71 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
 	.end_io			= zonefs_file_write_dio_end_io,
 };
 
+struct zonefs_dio {
+	struct kiocb		*iocb;
+	struct task_struct	*waiter;
+	int			error;
+	struct work_struct	work;
+	size_t			size;
+	u64			sector;
+	struct completion	completion;
+	struct bio		bio;
+};
+
+static void zonefs_dio_complete_work(struct work_struct *work)
+{
+	struct zonefs_dio *dio = container_of(work, struct zonefs_dio, work);
+	struct kiocb *iocb = dio->iocb;
+	size_t size = dio->size;
+	int ret;
+
+	ret = zonefs_file_write_dio_end_io(iocb, size, dio->error, 0);
+	if (ret == 0)
+		iocb->ki_pos += size;
+
+	iocb->ki_complete(iocb, ret, dio->sector);
+
+	bio_put(&dio->bio);
+}
+
+static void zonefs_file_dio_append_end_io(struct bio *bio)
+{
+	struct zonefs_dio *dio = container_of(bio, struct zonefs_dio, bio);
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (bio->bi_status)
+		dio->error = blk_status_to_errno(bio->bi_status);
+	else
+		dio->sector = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+	if (is_sync_kiocb(iocb)) {
+		struct task_struct *waiter = dio->waiter;
+
+		blk_wake_io_task(waiter);
+		WRITE_ONCE(dio->waiter, NULL);
+	} else {
+		INIT_WORK(&dio->work, zonefs_dio_complete_work);
+		queue_work(ZONEFS_SB(inode->i_sb)->s_dio_done_wq, &dio->work);
+	}
+
+	bio_release_pages(bio, false);
+	bio_put(bio);
+}
+
 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct zonefs_dio *dio;
 	unsigned int max;
 	struct bio *bio;
-	ssize_t size;
 	int nr_pages;
 	ssize_t ret;
+	bool sync = is_sync_kiocb(iocb);
+	bool polled;
+	blk_qc_t qc;
 
 	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
 	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
@@ -720,15 +777,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 		return 0;
 
 
-	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &zonefs_dio_bio_set);
 	if (!bio)
 		return -ENOMEM;
 
+	dio = container_of(bio, struct zonefs_dio, bio);
+	dio->iocb = iocb;
+	dio->error = 0;
+	if (sync) {
+		dio->waiter = current;
+		init_completion(&dio->completion);
+	}
+
 	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = zi->i_zsector;
 	bio->bi_write_hint = iocb->ki_hint;
 	bio->bi_ioprio = iocb->ki_ioprio;
 	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	bio->bi_end_io = zonefs_file_dio_append_end_io;
 	if (iocb->ki_flags & IOCB_DSYNC)
 		bio->bi_opf |= REQ_FUA;
 
@@ -737,21 +803,41 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 		bio_io_error(bio);
 		return ret;
 	}
-	size = bio->bi_iter.bi_size;
+	dio->size = bio->bi_iter.bi_size;
 	task_io_account_write(ret);
 
-	if (iocb->ki_flags & IOCB_HIPRI)
+	if (iocb->ki_flags & IOCB_HIPRI) {
 		bio_set_polled(bio, iocb);
+		polled = true;
+	}
 
-	ret = submit_bio_wait(bio);
+	bio_get(bio);
+	qc = submit_bio(bio);
 
-	bio_put(bio);
+	if (polled)
+		WRITE_ONCE(iocb->ki_cookie, qc);
 
-	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
-	if (ret >= 0) {
-		iocb->ki_pos += size;
-		return size;
+	if (!sync)
+		return -EIOCBQUEUED;
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(dio->waiter))
+			break;
+
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
 	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = zonefs_file_write_dio_end_io(iocb, dio->size,
+					   dio->error, 0);
+	if (ret == 0) {
+		ret = dio->size;
+		iocb->ki_pos += dio->size;
+	}
+	bio_put(bio);
 
 	return ret;
 }
@@ -813,7 +899,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 			goto inode_unlock;
 		}
 		mutex_unlock(&zi->i_truncate_mutex);
-		append = sync;
+		append = sync || iocb->ki_flags & IOCB_APPEND;
 	}
 
 	if (append)
@@ -821,8 +907,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
 				   &zonefs_write_dio_ops, sync);
-	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
-	    (ret > 0 || ret == -EIOCBQUEUED)) {
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		if (ret > 0)
 			count = ret;
 		mutex_lock(&zi->i_truncate_mutex);
@@ -1580,6 +1666,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root)
 		goto cleanup;
 
+	sbi->s_dio_done_wq = alloc_workqueue("zonefs-dio/%s", WQ_MEM_RECLAIM,
+					     0, sb->s_id);
+	if (!sbi->s_dio_done_wq)
+		goto cleanup;
+
 	/* Create and populate files in zone groups directories */
 	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
 		ret = zonefs_create_zgroup(&zd, t);
@@ -1603,8 +1694,14 @@ static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 
-	if (sb->s_root)
+	if (sb->s_root) {
 		d_genocide(sb->s_root);
+
+		if (sbi->s_dio_done_wq) {
+			destroy_workqueue(sbi->s_dio_done_wq);
+			sbi->s_dio_done_wq = NULL;
+		}
+	}
 	kill_block_super(sb);
 	kfree(sbi);
 }
@@ -1651,17 +1748,27 @@ static int __init zonefs_init(void)
 	if (ret)
 		return ret;
 
+	ret = bioset_init(&zonefs_dio_bio_set, 4,
+			  offsetof(struct zonefs_dio, bio), BIOSET_NEED_BVECS);
+	if (ret)
+		goto destroy_inodecache;
+
 	ret = register_filesystem(&zonefs_type);
-	if (ret) {
-		zonefs_destroy_inodecache();
-		return ret;
-	}
+	if (ret)
+		goto exit_bioset;
 
 	return 0;
+
+exit_bioset:
+	bioset_exit(&zonefs_dio_bio_set);
+destroy_inodecache:
+	zonefs_destroy_inodecache();
+	return ret;
 }
 
 static void __exit zonefs_exit(void)
 {
+	bioset_exit(&zonefs_dio_bio_set);
 	zonefs_destroy_inodecache();
 	unregister_filesystem(&zonefs_type);
 }
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 51141907097c..fe91df5eeffe 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -185,6 +185,9 @@ struct zonefs_sb_info {
 
 	unsigned int		s_max_open_zones;
 	atomic_t		s_open_zones;
+
+	/* AIO completions deferred from interrupt context */
+	struct workqueue_struct *s_dio_done_wq;
 };
 
 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
-- 
2.26.2


  parent reply	other threads:[~2020-07-20 13:21 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-07-20 13:21 [PATCH 0/2] zonefs: use zone-append for aio with rwf append Johannes Thumshirn
2020-07-20 13:21 ` [PATCH 1/2] fs: fix kiocb ki_complete interface Johannes Thumshirn
2020-07-20 13:38   ` Christoph Hellwig
2020-07-20 13:43     ` Damien Le Moal
2020-07-20 13:47       ` Christoph Hellwig
2020-07-20 13:21 ` Johannes Thumshirn [this message]
2020-07-20 13:45   ` [PATCH 2/2] zonefs: use zone-append for AIO as well Christoph Hellwig
2020-07-20 16:48     ` Johannes Thumshirn
2020-07-21  5:54       ` Christoph Hellwig
2020-07-22 12:43         ` Johannes Thumshirn
2020-07-22 13:02           ` Damien Le Moal
2020-07-22 14:53             ` Christoph Hellwig
2020-07-22 14:51           ` Christoph Hellwig
2020-07-22 15:00             ` Johannes Thumshirn
2020-07-24 13:57             ` Kanchan Joshi
2020-07-27  3:12               ` Damien Le Moal
2020-07-21 12:43   ` Kanchan Joshi
2020-07-22 14:32     ` Johannes Thumshirn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200720132118.10934-3-johannes.thumshirn@wdc.com \
    --to=johannes.thumshirn@wdc.com \
    --cc=Damien.LeMoal@wdc.com \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.