* iomap infrastructure and multipage writes V2
From: Christoph Hellwig @ 2016-04-12 20:52 UTC
  To: xfs; +Cc: rpeterso, linux-fsdevel

This series adds a new file system I/O path that uses the iomap structure
introduced for pNFS support, and supports multi-page buffered writes.

Dave Chinner first started this work a long time ago; I then beat it
into shape for production runs in a very constrained ARM NAS
environment for Tuxera almost as long ago, and now, half a dozen
rewrites later, it's back.

The basic idea is to avoid the crazy per-block get_blocks overhead
and make use of extents in the buffered write path by iterating over
them instead.
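
To sketch the idea (a simplified version of the iomap_apply() iterator
added in patch 2, with error handling, length trimming and the
surrounding function elided), the write path obtains one extent-sized
mapping and then works through every page it covers, instead of calling
get_blocks once per block:

	while (iov_iter_count(iter)) {
		struct iomap iomap = { 0 };

		/* map or reserve a single contiguous extent at pos */
		ret = ops->iomap_begin(inode, pos, iov_iter_count(iter),
				IOMAP_WRITE, &iomap);
		if (ret)
			break;

		/* copy data into every page cache page in the extent */
		written = iomap_write_actor(inode, pos, iomap.length,
				iter, &iomap);

		/* commit the written part, unreserve the rest */
		ops->iomap_end(inode, pos, iomap.length,
				written > 0 ? written : 0, IOMAP_WRITE,
				&iomap);

		pos += written;
	}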

Changes since V1:
 - add support for fiemap
 - fix a test failure with 1k block sizes
 - prepare for 64-bit lengths, which will be used in a follow-on patchset



* [PATCH 1/8] fs: move struct iomap from exportfs.h to a separate header
From: Christoph Hellwig @ 2016-04-12 20:52 UTC
  To: xfs; +Cc: rpeterso, linux-fsdevel

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfsd/blocklayout.c    |  1 +
 fs/nfsd/blocklayoutxdr.c |  1 +
 fs/xfs/xfs_pnfs.c        |  1 +
 include/linux/exportfs.h | 16 +---------------
 include/linux/iomap.h    | 21 +++++++++++++++++++++
 5 files changed, 25 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/iomap.h

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e55b524..4df16ae 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316..4ebaaf4 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>
 
 #include "nfsd.h"
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 3332bae..0a56787 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index d841450..b03c062 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
 struct dentry;
 struct iattr;
 struct inode;
+struct iomap;
 struct super_block;
 struct vfsmount;
 
@@ -187,21 +188,6 @@ struct fid {
  *    get_name is not (which is possibly inconsistent)
  */
 
-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
-#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
-
-struct iomap {
-	sector_t	blkno;	/* first sector of mapping */
-	loff_t		offset;	/* file offset of mapping, bytes */
-	u64		length;	/* length of mapping, bytes */
-	int		type;	/* type of mapping */
-};
-
 struct export_operations {
 	int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
 			struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 0000000..1b22197
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,21 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
+#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
+
+#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
+
+struct iomap {
+	sector_t	blkno;	/* first sector of mapping */
+	loff_t		offset;	/* file offset of mapping, bytes */
+	u64		length;	/* length of mapping, bytes */
+	int		type;	/* type of mapping */
+};
+
+#endif /* LINUX_IOMAP_H */
-- 
2.1.4



* [PATCH 2/8] fs: introduce iomap infrastructure
From: Christoph Hellwig @ 2016-04-12 20:52 UTC
  To: xfs; +Cc: rpeterso, linux-fsdevel

Add infrastructure for multipage buffered writes.  This is implemented
using a main iterator that applies an actor function to a range that
can be written.

This infrastructure is used to implement a buffered write helper, one
to zero file ranges and one to implement the ->page_mkwrite VM
operation.  All of them borrow a fair amount of code from fs/buffer.c
for now by using an internal version of __block_write_begin that
gets passed an iomap and builds the corresponding buffer head.

The file system gets a set of paired ->iomap_begin and ->iomap_end
calls which allow it to map/reserve a range and get a notification
once the write code is finished with it.
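
As an illustration of that contract (a hypothetical filesystem, not
part of this series), the two methods would be wired up roughly as
follows; this toy iomap_begin just reports the whole range as a
freshly reserved delayed allocation:

	static int foofs_iomap_begin(struct inode *inode, loff_t pos,
			loff_t length, unsigned flags, struct iomap *iomap)
	{
		/*
		 * Look up, or for IOMAP_WRITE/IOMAP_ZERO reserve, a
		 * single contiguous mapping covering pos.  This toy
		 * version pretends the whole requested range is a
		 * delayed allocation it just reserved.
		 */
		iomap->type = IOMAP_DELALLOC;
		iomap->blkno = IOMAP_NULL_BLOCK;
		iomap->offset = pos;
		iomap->length = length;
		iomap->bdev = inode->i_sb->s_bdev;
		return 0;
	}

	static int foofs_iomap_end(struct inode *inode, loff_t pos,
			loff_t length, ssize_t written, unsigned flags,
			struct iomap *iomap)
	{
		/* punch out any reservation beyond the bytes written */
		return 0;
	}

	static struct iomap_ops foofs_iomap_ops = {
		.iomap_begin	= foofs_iomap_begin,
		.iomap_end	= foofs_iomap_end,
	};

Such an ops table is what gets passed to iomap_file_buffered_write(),
iomap_zero_range() and the other helpers added below.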

Based on earlier code from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/Kconfig            |   3 +
 fs/Makefile           |   1 +
 fs/buffer.c           |  76 +++++++++-
 fs/internal.h         |   3 +
 fs/iomap.c            | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |  56 +++++++-
 6 files changed, 516 insertions(+), 10 deletions(-)
 create mode 100644 fs/iomap.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59..276fcfb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+	bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13..ed2b632 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 obj-y				+= quota/
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 33be296..a003fd4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+		struct iomap *iomap)
+{
+	loff_t offset = block << inode->i_blkbits;
+
+	bh->b_bdev = iomap->bdev;
+
+	/*
+	 * Block points to offset in file we need to map, iomap contains
+	 * the offset at which the map starts. If the map ends before the
+	 * current block, then do not map the buffer and let the caller
+	 * handle it.
+	 */
+	BUG_ON(offset >= iomap->offset + iomap->length);
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/*
+		 * If the buffer is not up to date or beyond the current EOF,
+		 * we need to mark it as new to ensure sub-block zeroing is
+		 * executed if necessary.
+		 */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		break;
+	case IOMAP_DELALLOC:
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_mapped(bh);
+		set_buffer_delay(bh);
+		break;
+	case IOMAP_UNWRITTEN:
+		/*
+		 * For unwritten regions, we always need to ensure that
+		 * sub-block writes zero the regions in the block we are
+		 * not writing to. Set the buffer as new to ensure this.
+		 */
+		set_buffer_new(bh);
+		set_buffer_unwritten(bh);
+		/* FALLTHRU */
+	case IOMAP_MAPPED:
+		if (offset >= i_size_read(inode))
+			set_buffer_new(bh);
+		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+				((offset - iomap->offset) >> inode->i_blkbits);
+		set_buffer_mapped(bh);
+		break;
+	}
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap)
 {
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned to = from + len;
@@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			WARN_ON(bh->b_size != blocksize);
-			err = get_block(inode, block, bh, 1);
-			if (err)
-				break;
+			if (get_block) {
+				err = get_block(inode, block, bh, 1);
+				if (err)
+					break;
+			} else {
+				iomap_to_bh(inode, block, bh, iomap);
+			}
+
 			if (buffer_new(bh)) {
 				unmap_underlying_metadata(bh->b_bdev,
 							bh->b_blocknr);
@@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	return err;
 }
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index b71deee..c0c6f49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap);
 
 /*
  * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 0000000..534cb37
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and, most importantly, avoids the need for filesystem-specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+	struct iomap iomap = { 0 };
+	loff_t written = 0, ret;
+
+	/*
+	 * Need to map a range from start position for count bytes. This can
+	 * span multiple pages - it is only guaranteed to return a range of a
+	 * single type of pages (e.g. all into a hole, all mapped or all
+	 * unwritten). Failure at this point has nothing to undo.
+	 *
+	 * If allocation is required for this range, reserve the space now so
+	 * that the allocation is guaranteed to succeed later on. Once we copy
+	 * the data into the page cache pages, then we cannot fail otherwise we
+	 * expose transient stale data. If the reserve fails, we can safely
+	 * back out at this point as there is nothing to undo.
+	 */
+	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+	if (ret)
+		return ret;
+	if (WARN_ON(iomap.offset > pos))
+		return -EIO;
+
+	/*
+	 * Cut down the length to the one actually provided by the filesystem,
+	 * as it might not be able to give us the whole size that we requested.
+	 */
+	if (iomap.offset + iomap.length < pos + length)
+		length = iomap.offset + iomap.length - pos;
+
+	/*
+	 * Now that we have guaranteed that the space allocation will succeed,
+	 * we can do the copy-in page by page without having to worry about
+	 * failures exposing transient data.
+	 */
+	written = actor(inode, pos, length, data, &iomap);
+
+	/*
+	 * Now the data has been copied, commit the range we've copied.  This
+	 * should not fail unless the filesystem has had a fatal error.
+	 */
+	ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+			flags, &iomap);
+
+	return written ? written : ret;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+	loff_t i_size = i_size_read(inode);
+
+	/*
+	 * Only truncate newly allocated pages beyond EOF, even if the
+	 * write started inside the existing inode size.
+	 */
+	if (pos + len > i_size)
+		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, struct iomap *iomap)
+{
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct page *page;
+	int status = 0;
+
+	BUG_ON(pos + len > iomap->offset + iomap->length);
+
+	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	if (unlikely(status)) {
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+
+		iomap_write_failed(inode, pos, len);
+	}
+
+	*pagep = page;
+	return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page)
+{
+	int ret;
+
+	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+			copied, page, NULL);
+	if (ret < len)
+		iomap_write_failed(inode, pos, len);
+	return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *i = data;
+	long status = 0;
+	ssize_t written = 0;
+	unsigned int flags = AOP_FLAG_NOFS;
+
+	/*
+	 * Copies from kernel address space cannot fail (NFSD is a big user).
+	 */
+	if (!iter_is_iovec(i))
+		flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	do {
+		struct page *page;
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+
+		offset = (pos & (PAGE_SIZE - 1));
+		bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_count(i));
+again:
+		if (bytes > length)
+			bytes = length;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = iomap_write_begin(inode, pos, bytes, flags, &page,
+				iomap);
+		if (unlikely(status))
+			break;
+
+		if (mapping_writably_mapped(inode->i_mapping))
+			flush_dcache_page(page);
+
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+
+		flush_dcache_page(page);
+		mark_page_accessed(page);
+
+		status = iomap_write_end(inode, pos, bytes, copied, page);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		iov_iter_advance(i, copied);
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		pos += copied;
+		written += copied;
+		length -= copied;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (iov_iter_count(i) && length);
+
+	return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter),
+				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		written += ret;
+	}
+
+	return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+		void *data, struct iomap *iomap)
+{
+	bool *did_zero = data;
+	struct page *page;
+	int status;
+	loff_t written = 0;
+
+	/* already zeroed?  we're done. */
+	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+	    	return count;
+
+	do {
+		unsigned offset, bytes;
+
+		offset = pos & (PAGE_SIZE - 1); /* Within page */
+		bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+		status = iomap_write_begin(inode, pos, bytes,
+				AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS,
+				&page, iomap);
+		if (status)
+			break;
+
+		zero_user(page, offset, bytes);
+		mark_page_accessed(page);
+
+		status = iomap_write_end(inode, pos, bytes, bytes, page);
+		if (status)
+			break;
+
+		pos += bytes;
+		count -= bytes;
+		written += bytes;
+		if (did_zero)
+			*did_zero = true;
+	} while (count > 0);
+
+	return status ? status : written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+		struct iomap_ops *ops)
+{
+	loff_t ret;
+
+	while (len > 0) {
+		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+				ops, did_zero, iomap_zero_range_actor);
+		if (ret <= 0)
+			return ret;
+
+		pos += ret;
+		len -= ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		struct iomap_ops *ops)
+{
+	unsigned blocksize = (1 << inode->i_blkbits);
+	unsigned off = pos & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!off)
+		return 0;
+	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct page *page = data;
+	int ret;
+
+	ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+			NULL, iomap);
+	if (ret)
+		return ret;
+
+	block_commit_write(page, 0, length);
+	return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	unsigned long length;
+	loff_t offset, size;
+	ssize_t ret;
+
+	lock_page(page);
+	size = i_size_read(inode);
+	if ((page->mapping != inode->i_mapping) ||
+	    (page_offset(page) > size)) {
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_SHIFT) > size)
+		length = size & ~PAGE_MASK;
+	else
+		length = PAGE_SIZE;
+
+	offset = page_offset(page);
+	while (length > 0) {
+		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+				ops, page, iomap_page_mkwrite_actor);
+		if (unlikely(ret <= 0))
+			goto out_unlock;
+		offset += ret;
+		length -= ret;
+	}
+
+	set_page_dirty(page);
+	wait_for_stable_page(page);
+	return 0;
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1b22197..854766f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,19 +3,65 @@
 
 #include <linux/types.h>
 
-/* types of block ranges for multipage write mappings. */
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
 #define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
 #define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
 #define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
 
+/*
+ * Magic value for blkno:
+ */
 #define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
 
 struct iomap {
-	sector_t	blkno;	/* first sector of mapping */
-	loff_t		offset;	/* file offset of mapping, bytes */
-	u64		length;	/* length of mapping, bytes */
-	int		type;	/* type of mapping */
+	sector_t		blkno;	/* first sector of mapping, fs blocks */
+	loff_t			offset;	/* file offset of mapping, bytes */
+	u64			length;	/* length of mapping, bytes */
+	int			type;	/* type of mapping */
+	struct block_device	*bdev; /* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE		(1 << 0)
+#define IOMAP_ZERO		(1 << 1)
+
+struct iomap_ops {
+	/*
+	 * Return the existing mapping at pos, or reserve space starting at
+	 * pos for up to length, as long as we can do it as a single mapping.
+	 * The actual length is returned in iomap->length.
+	 */
+	int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+			unsigned flags, struct iomap *iomap);
+
+	/*
+	 * Commit and/or unreserve space previously allocated using iomap_begin.
+	 * Written indicates the length of the successful write operation which
+	 * needs to be committed, while the rest needs to be unreserved.
+	 * Written might be zero if no data was written.
+	 */
+	int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+			ssize_t written, unsigned flags, struct iomap *iomap);
 };
 
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+		struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+		bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
+
 #endif /* LINUX_IOMAP_H */
-- 
2.1.4



* [PATCH 3/8] xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c
From: Christoph Hellwig @ 2016-04-12 20:52 UTC
  To: xfs; +Cc: rpeterso, linux-fsdevel

Also ensure it works for RT subvolume files and set the block device,
both of which will be needed to use the function in the buffered
write path.
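
For reference, a rough sketch (illustrative only, not taken from this
series) of how a caller outside xfs_pnfs.c might pair the now-exported
helper with an extent lookup; offset_fsb, end_fsb and the iomap pointer
are assumed to be set up by the caller:

	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			error;

	/* read the extent covering offset_fsb ... */
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			&imap, &nimaps, 0);
	if (error)
		return error;
	/* ... and translate it into the generic iomap representation */
	xfs_bmbt_to_iomap(ip, iomap, &imap);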

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 27 +++++++++++++++++++++++++++
 fs/xfs/xfs_iomap.h |  4 ++++
 fs/xfs/xfs_pnfs.c  | 26 --------------------------
 3 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5839135..2f37194 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@@ -940,3 +941,29 @@ error_on_bmapi_transaction:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
+
+void
+xfs_bmbt_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (imap->br_startblock == HOLESTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_DELALLOC;
+	} else {
+		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e66..718f07c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,7 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
+struct iomap;
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
@@ -29,4 +30,7 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+		struct xfs_bmbt_irec *);
+
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0a56787..dc1a71f 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -80,32 +80,6 @@ xfs_fs_get_uuid(
 	return 0;
 }
 
-static void
-xfs_bmbt_to_iomap(
-	struct xfs_inode	*ip,
-	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	if (imap->br_startblock == HOLESTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_HOLE;
-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_DELALLOC;
-	} else {
-		iomap->blkno =
-			XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
-		if (imap->br_state == XFS_EXT_UNWRITTEN)
-			iomap->type = IOMAP_UNWRITTEN;
-		else
-			iomap->type = IOMAP_MAPPED;
-	}
-	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
 /*
  * Get a layout for the pNFS client.
  */
-- 
2.1.4



* [PATCH 4/8] xfs: reorder zeroing and flushing sequence in truncate
From: Christoph Hellwig @ 2016-04-12 20:52 UTC
  To: xfs; +Cc: rpeterso, linux-fsdevel

Currently zeroing out blocks and waiting for writeout is a bit of a mess in
truncate.  This patch gives it a clear order in preparation for the iomap
path:

 (1) we first wait for any direct I/O to complete to prevent any races with it
 (2) we then perform the actual zeroing, and only use the truncate_page helpers
     for truncating down.  The truncate-up case is already handled by the
     separate call to xfs_zero_eof.
 (3) only then do we write back dirty data, as zeroing blocks may cause dirty
     pages when using either xfs_zero_eof or the new iomap infrastructure
     (see the sketch below).
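
Condensed, the resulting sequence in xfs_setattr_size looks like this
(a sketch of the state after this patch, not a verbatim quote of the
function):

	/* (1) no direct I/O can race with the zeroing from here on */
	inode_dio_wait(inode);

	/* (2) zero whatever the size change would otherwise expose */
	if (newsize > oldsize)
		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
	else if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize,
				xfs_get_blocks_direct);
	else
		error = block_truncate_page(inode->i_mapping, newsize,
				xfs_get_blocks);
	if (error)
		return error;

	/*
	 * (3) only now write back dirty data beyond the on-disk EOF,
	 * then proceed with the size update transaction.
	 */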

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iops.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba..1e2086d 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -801,20 +801,35 @@ xfs_setattr_size(
 		return error;
 
 	/*
+	 * Wait for all direct I/O to complete.
+	 */
+	inode_dio_wait(inode);
+
+	/*
 	 * File data changes must be complete before we start the transaction to
 	 * modify the inode.  This needs to be done before joining the inode to
 	 * the transaction because the inode cannot be unlocked once it is a
 	 * part of the transaction.
 	 *
-	 * Start with zeroing any data block beyond EOF that we may expose on
-	 * file extension.
+	 * Start with zeroing any data beyond EOF that we may expose on file
+	 * extension, or zeroing out the rest of the block on a downward
+	 * truncate.
 	 */
 	if (newsize > oldsize) {
 		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
-		if (error)
-			return error;
+	} else {
+		if (IS_DAX(inode)) {
+			error = dax_truncate_page(inode, newsize,
+					xfs_get_blocks_direct);
+		} else {
+			error = block_truncate_page(inode->i_mapping, newsize,
+					xfs_get_blocks);
+		}
 	}
 
+	if (error)
+		return error;
+
 	/*
 	 * We are going to log the inode size change in this transaction so
 	 * any previous writes that are beyond the on disk EOF and the new
@@ -831,9 +846,6 @@ xfs_setattr_size(
 			return error;
 	}
 
-	/* Now wait for all direct I/O to complete. */
-	inode_dio_wait(inode);
-
 	/*
 	 * We've already locked out new page faults, so now we can safely remove
 	 * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +863,6 @@ xfs_setattr_size(
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * operation.
 	 */
-	if (IS_DAX(inode))
-		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
-	else
-		error = block_truncate_page(inode->i_mapping, newsize,
-					    xfs_get_blocks);
-	if (error)
-		return error;
 	truncate_setsize(inode, newsize);
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 5/8] xfs: implement iomap based buffered write path
  2016-04-12 20:52 ` Christoph Hellwig
@ 2016-04-12 20:52   ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-04-12 20:52 UTC (permalink / raw)
  To: xfs; +Cc: rpeterso, linux-fsdevel

Convert XFS to use the new iomap based multipage write path. This involves
implementing the ->iomap_begin and ->iomap_end methods, and switching the
buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
helpers.

With this change __xfs_get_blocks will never be used for buffered writes,
and the code handling them can be removed.

Based on earlier code from Dave Chinner.
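
As a usage sketch (editor's illustration, not part of the patch): any
filesystem that supplies an iomap_ops gets the multipage buffered write
path from the generic helpers.  The myfs_* names below are placeholders;
the diff wires up the real xfs_iomap_ops in exactly this shape:

	static struct iomap_ops myfs_iomap_ops = {
		.iomap_begin	= myfs_iomap_begin,	/* map or allocate an extent */
		.iomap_end	= myfs_iomap_end,	/* e.g. trim unused delalloc */
	};

	static ssize_t
	myfs_buffered_write(struct kiocb *iocb, struct iov_iter *from)
	{
		/* iterates extents, copying many pages per ->iomap_begin call */
		return iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
	}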

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Kconfig     |   1 +
 fs/xfs/xfs_aops.c  | 212 -----------------------------------------------------
 fs/xfs/xfs_file.c  |  71 ++++++++----------
 fs/xfs/xfs_iomap.c | 144 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_iomap.h |   5 +-
 fs/xfs/xfs_iops.c  |   9 ++-
 fs/xfs/xfs_trace.h |   3 +
 7 files changed, 187 insertions(+), 258 deletions(-)

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4d..35faf12 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
 	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
+	select FS_IOMAP
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 40645a4..e481c80 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1429,216 +1429,6 @@ xfs_vm_direct_IO(
 			xfs_get_blocks_direct, endio, NULL, flags);
 }
 
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-	struct inode		*inode,
-	loff_t			start,
-	loff_t			end)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fileoff_t		start_fsb;
-	xfs_fileoff_t		end_fsb;
-	int			error;
-
-	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-	if (end_fsb <= start_fsb)
-		return;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-						end_fsb - start_fsb);
-	if (error) {
-		/* something screwed, just bail */
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-			xfs_alert(ip->i_mount,
-		"xfs_vm_write_failed: unable to clean up ino %lld",
-					ip->i_ino);
-		}
-	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			pos,
-	unsigned		len)
-{
-	loff_t			block_offset;
-	loff_t			block_start;
-	loff_t			block_end;
-	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
-	loff_t			to = from + len;
-	struct buffer_head	*bh, *head;
-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
-
-	/*
-	 * The request pos offset might be 32 or 64 bit, this is all fine
-	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-	 * platform, the high 32-bit will be masked off if we evaluate the
-	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-	 * 0xfffff000 as an unsigned long, hence the result is incorrect
-	 * which could cause the following ASSERT failed in most cases.
-	 * In order to avoid this, we can evaluate the block_offset of the
-	 * start of the page by using shifts rather than masks the mismatch
-	 * problem.
-	 */
-	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
-
-	ASSERT(block_offset + from == pos);
-
-	head = page_buffers(page);
-	block_start = 0;
-	for (bh = head; bh != head || !block_start;
-	     bh = bh->b_this_page, block_start = block_end,
-				   block_offset += bh->b_size) {
-		block_end = block_start + bh->b_size;
-
-		/* skip buffers before the write */
-		if (block_end <= from)
-			continue;
-
-		/* if the buffer is after the write, we're done */
-		if (block_start >= to)
-			break;
-
-		/*
-		 * Process delalloc and unwritten buffers beyond EOF. We can
-		 * encounter unwritten buffers in the event that a file has
-		 * post-EOF unwritten extents and an extending write happens to
-		 * fail (e.g., an unaligned write that also involves a delalloc
-		 * to the same page).
-		 */
-		if (!buffer_delay(bh) && !buffer_unwritten(bh))
-			continue;
-
-		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-		    block_offset < i_size_read(inode))
-			continue;
-
-		if (buffer_delay(bh))
-			xfs_vm_kill_delalloc_range(inode, block_offset,
-						   block_offset + bh->b_size);
-
-		/*
-		 * This buffer does not contain data anymore. make sure anyone
-		 * who finds it knows that for certain.
-		 */
-		clear_buffer_delay(bh);
-		clear_buffer_uptodate(bh);
-		clear_buffer_mapped(bh);
-		clear_buffer_new(bh);
-		clear_buffer_dirty(bh);
-		clear_buffer_unwritten(bh);
-	}
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		flags,
-	struct page		**pagep,
-	void			**fsdata)
-{
-	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
-	struct page		*page;
-	int			status;
-	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
-
-	ASSERT(len <= PAGE_CACHE_SIZE);
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-
-	status = __block_write_begin(page, pos, len, xfs_get_blocks);
-	if (xfs_mp_fail_writes(mp))
-		status = -EIO;
-	if (unlikely(status)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-
-		xfs_vm_write_failed(inode, page, pos, len);
-		unlock_page(page);
-
-		/*
-		 * If the write is beyond EOF, we only want to kill blocks
-		 * allocated in this write, not blocks that were previously
-		 * written successfully.
-		 */
-		if (xfs_mp_fail_writes(mp))
-			isize = 0;
-		if (pos + len > isize) {
-			ssize_t start = max_t(ssize_t, pos, isize);
-
-			truncate_pagecache_range(inode, start, pos + len);
-		}
-
-		page_cache_release(page);
-		page = NULL;
-	}
-
-	*pagep = page;
-	return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		copied,
-	struct page		*page,
-	void			*fsdata)
-{
-	int			ret;
-
-	ASSERT(len <= PAGE_CACHE_SIZE);
-
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (unlikely(ret < len)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-		loff_t		to = pos + len;
-
-		if (to > isize) {
-			/* only kill blocks in this write beyond EOF */
-			if (pos > isize)
-				isize = pos;
-			xfs_vm_kill_delalloc_range(inode, isize, to);
-			truncate_pagecache_range(inode, isize, to);
-		}
-	}
-	return ret;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,
@@ -1749,8 +1539,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
-	.write_begin		= xfs_vm_write_begin,
-	.write_end		= xfs_vm_write_end,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 98bbd8f..bcedd80 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -79,57 +80,27 @@ xfs_rw_ilock_demote(
 		inode_unlock(VFS_I(ip));
 }
 
-/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
- */
-int
-xfs_iozero(
-	struct xfs_inode	*ip,	/* inode			*/
-	loff_t			pos,	/* offset in file		*/
-	size_t			count)	/* size of data to zero		*/
+static int
+xfs_dax_zero_range(
+	struct inode		*inode,
+	loff_t			pos,
+	size_t			count)
 {
-	struct page		*page;
-	struct address_space	*mapping;
 	int			status = 0;
 
-
-	mapping = VFS_I(ip)->i_mapping;
 	do {
 		unsigned offset, bytes;
-		void *fsdata;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		bytes = PAGE_CACHE_SIZE - offset;
 		if (bytes > count)
 			bytes = count;
 
-		if (IS_DAX(VFS_I(ip))) {
-			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-						     xfs_get_blocks_direct);
-			if (status)
-				break;
-		} else {
-			status = pagecache_write_begin(NULL, mapping, pos, bytes,
-						AOP_FLAG_UNINTERRUPTIBLE,
-						&page, &fsdata);
-			if (status)
-				break;
-
-			zero_user(page, offset, bytes);
+		status = dax_zero_page_range(inode, pos, bytes,
+					     xfs_get_blocks_direct);
+		if (status)
+			break;
 
-			status = pagecache_write_end(NULL, mapping, pos, bytes,
-						bytes, page, fsdata);
-			WARN_ON(status <= 0); /* can't return less than zero! */
-			status = 0;
-		}
 		pos += bytes;
 		count -= bytes;
 	} while (count);
@@ -137,6 +108,24 @@ xfs_iozero(
 	return status;
 }
 
+/*
+ * Clear the specified range to zero through either the pagecache or DAX.
+ * Holes and unwritten extents are left as-is, as they already read as zeroes.
+ */
+int
+xfs_iozero(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	size_t			count)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	if (IS_DAX(VFS_I(ip)))
+		return xfs_dax_zero_range(inode, pos, count);
+	else
+		return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);
+}
+
 int
 xfs_update_prealloc_flags(
 	struct xfs_inode	*ip,
@@ -842,7 +831,7 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
 				      iocb->ki_pos, 0);
-	ret = generic_perform_write(file, from, iocb->ki_pos);
+	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
 		iocb->ki_pos += ret;
 
@@ -1558,7 +1547,7 @@ xfs_filemap_page_mkwrite(
 	if (IS_DAX(inode)) {
 		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
 	} else {
-		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
 	}
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2f37194..73de1ec 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -967,3 +967,147 @@ xfs_bmbt_to_iomap(
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 }
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+	return !nimaps ||
+		imap->br_startblock == HOLESTARTBLOCK ||
+		imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			nimaps = 1, error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+		length = mp->m_super->s_maxbytes - offset;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+			       &nimaps, XFS_BMAPI_ENTIRE);
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return error;
+	}
+
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+		/*
+		 * We cap the maximum length we map to MAX_WRITEBACK_PAGES pages
+		 * to keep the chunks of work done here somewhat symmetric with
+		 * the work writeback does.  This is a completely arbitrary
+		 * number pulled out of thin air as a best guess for initial
+		 * testing.
+		 *
+		 * Note that the value needs to be less than 32 bits wide until
+		 * the lower level functions are updated.
+		 */
+		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+		if (xfs_get_extsz_hint(ip)) {
+			/*
+			 * xfs_iomap_write_direct() expects the shared lock. It
+			 * is unlocked on return.
+			 */
+			xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+			error = xfs_iomap_write_direct(ip, offset, length, &imap,
+					nimaps);
+		} else {
+			error = xfs_iomap_write_delay(ip, offset, length, &imap);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+
+		if (error)
+			return error;
+
+		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else if (nimaps) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+		iomap->offset = offset;
+		iomap->length = length;
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error = 0;
+
+	start_fsb = XFS_B_TO_FSB(mp, offset + written);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length - written);
+
+	/*
+	 * Trim back delalloc blocks if we didn't manage to write the whole
+	 * range reserved.
+	 *
+	 * We don't need to care about racing delalloc as we hold i_mutex
+	 * across the reserve/allocate/unreserve calls. If there are delalloc
+	 * blocks in the range, they are ours.
+	 */
+	if (start_fsb < end_fsb) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+					       end_fsb - start_fsb);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+		if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_alert(mp, "%s: unable to clean up ino %lld",
+				__func__, ip->i_ino);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+				length, written);
+	return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+	.iomap_begin		= xfs_file_iomap_begin,
+	.iomap_end		= xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 718f07c..e066d04 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,7 +18,8 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-struct iomap;
+#include <linux/iomap.h>
+
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
@@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
 
+extern struct iomap_ops xfs_iomap_ops;
+
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1e2086d..6dfa10c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -822,8 +823,8 @@ xfs_setattr_size(
 			error = dax_truncate_page(inode, newsize,
 					xfs_get_blocks_direct);
 		} else {
-			error = block_truncate_page(inode->i_mapping, newsize,
-					xfs_get_blocks);
+			error = iomap_truncate_page(inode, newsize,
+					&did_zeroing, &xfs_iomap_ops);
 		}
 	}
 
@@ -838,8 +839,8 @@ xfs_setattr_size(
 	 * problem. Note that this includes any block zeroing we did above;
 	 * otherwise those blocks may not be zeroed after a crash.
 	 */
-	if (newsize > ip->i_d.di_size &&
-	    (oldsize != ip->i_d.di_size || did_zeroing)) {
+	if (did_zeroing ||
+	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 840d52e..86fb345 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,6 +1296,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 6/8] xfs: remove buffered write support from __xfs_get_blocks
  2016-04-12 20:52 ` Christoph Hellwig
@ 2016-04-12 20:53   ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-04-12 20:53 UTC (permalink / raw)
  To: xfs; +Cc: rpeterso, linux-fsdevel

Now that the iomap infrastructure handles all buffered writes,
__xfs_get_blocks is only called for reads, direct I/O and DAX faults, so
the buffered write special cases (taking the ilock exclusive, delalloc
reservations, and the buffer_delay state handling) can be removed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c | 77 +++++++++++++++----------------------------------------
 1 file changed, 21 insertions(+), 56 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e481c80..cb4f75c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1130,7 +1130,6 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	bool			direct,
 	bool			dax_fault)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1151,22 +1150,14 @@ __xfs_get_blocks(
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
 
-	if (!create && direct && offset >= i_size_read(inode))
+	if (!create && offset >= i_size_read(inode))
 		return 0;
 
 	/*
 	 * Direct I/O is usually done on preallocated files, so try getting
-	 * a block mapping without an exclusive lock first.  For buffered
-	 * writes we already have the exclusive iolock anyway, so avoiding
-	 * a lock roundtrip here by taking the ilock exclusive from the
-	 * beginning is a useful micro optimization.
+	 * a block mapping without an exclusive lock first.
 	 */
-	if (create && !direct) {
-		lockmode = XFS_ILOCK_EXCL;
-		xfs_ilock(ip, lockmode);
-	} else {
-		lockmode = xfs_ilock_data_map_shared(ip);
-	}
+	lockmode = xfs_ilock_data_map_shared(ip);
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1176,19 @@ __xfs_get_blocks(
 	     (imap.br_startblock == HOLESTARTBLOCK ||
 	      imap.br_startblock == DELAYSTARTBLOCK) ||
 	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-		if (direct || xfs_get_extsz_hint(ip)) {
-			/*
-			 * xfs_iomap_write_direct() expects the shared lock. It
-			 * is unlocked on return.
-			 */
-			if (lockmode == XFS_ILOCK_EXCL)
-				xfs_ilock_demote(ip, lockmode);
-
-			error = xfs_iomap_write_direct(ip, offset, size,
-						       &imap, nimaps);
-			if (error)
-				return error;
-			new = 1;
+		/*
+		 * xfs_iomap_write_direct() expects the shared lock. It
+		 * is unlocked on return.
+		 */
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);
 
-		} else {
-			/*
-			 * Delalloc reservations do not require a transaction,
-			 * we can go on without dropping the lock here. If we
-			 * are allocating a new delalloc block, make sure that
-			 * we set the new flag so that we mark the buffer new so
-			 * that we know that it is newly allocated if the write
-			 * fails.
-			 */
-			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-				new = 1;
-			error = xfs_iomap_write_delay(ip, offset, size, &imap);
-			if (error)
-				goto out_unlock;
+		error = xfs_iomap_write_direct(ip, offset, size,
+					       &imap, nimaps);
+		if (error)
+			return error;
+		new = 1;
 
-			xfs_iunlock(ip, lockmode);
-		}
 		trace_xfs_get_blocks_alloc(ip, offset, size,
 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
 						   : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1209,7 @@ __xfs_get_blocks(
 	}
 
 	/* trim mapping down to size requested */
-	if (direct || size > (1 << inode->i_blkbits))
-		xfs_map_trim_size(inode, iblock, bh_result,
-				  &imap, offset, size);
+	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
 	/*
 	 * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1222,7 @@ __xfs_get_blocks(
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
-		if (create && direct) {
+		if (create) {
 			if (dax_fault)
 				ASSERT(!ISUNWRITTEN(&imap));
 			else
@@ -1280,14 +1251,7 @@ __xfs_get_blocks(
 	     (new || ISUNWRITTEN(&imap))))
 		set_buffer_new(bh_result);
 
-	if (imap.br_startblock == DELAYSTARTBLOCK) {
-		BUG_ON(direct);
-		if (create) {
-			set_buffer_uptodate(bh_result);
-			set_buffer_mapped(bh_result);
-			set_buffer_delay(bh_result);
-		}
-	}
+	BUG_ON(imap.br_startblock == DELAYSTARTBLOCK);
 
 	return 0;
 
@@ -1303,7 +1267,8 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+	BUG_ON(create);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
 int
@@ -1313,7 +1278,7 @@ xfs_get_blocks_direct(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
 int
@@ -1323,7 +1288,7 @@ xfs_get_blocks_dax_fault(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
 /*
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 7/8] fs: iomap based fiemap implementation
  2016-04-12 20:52 ` Christoph Hellwig
@ 2016-04-12 20:53   ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-04-12 20:53 UTC (permalink / raw)
  To: xfs; +Cc: rpeterso, linux-fsdevel

Add a simple fiemap implementation on top of iomap_ops, drawing in part
on an earlier implementation from Bob Peterson <rpeterso@redhat.com>.
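
With this in place, a filesystem's ->fiemap method can be a thin wrapper
(sketch with placeholder myfs_* names; patch 8/8 does exactly this for
XFS, adding its own locking around the call):

	static int
	myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		    u64 start, u64 len)
	{
		return iomap_fiemap(inode, fi, start, len, &myfs_iomap_ops);
	}

One design note on the code below: the actor reports each mapping one
iteration late, stashing the current one in ctx->prev, so that the final
extent can be flagged FIEMAP_EXTENT_LAST once the walk finishes.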

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c            | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |  3 ++
 2 files changed, 93 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 534cb37..28a8fcb 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -385,3 +385,93 @@ out_unlock:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+struct fiemap_ctx {
+	struct fiemap_extent_info *fi;
+	struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+		struct iomap *iomap, u32 flags)
+{
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/* skip holes */
+		return 0;
+	case IOMAP_DELALLOC:
+		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+		break;
+	case IOMAP_UNWRITTEN:
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+		break;
+	case IOMAP_MAPPED:
+		break;
+	}
+
+	return fiemap_fill_next_extent(fi, iomap->offset,
+			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9 : 0,
+			iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct fiemap_ctx *ctx = data;
+	loff_t ret = length;
+
+	if (iomap->type == IOMAP_HOLE)
+		return length;
+
+	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+	ctx->prev = *iomap;
+	switch (ret) {
+	case 0:		/* success */
+		return length;
+	case 1:		/* extent array full */
+		return 0;
+	default:
+		return ret;
+	}
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+		loff_t start, loff_t len, struct iomap_ops *ops)
+{
+	struct fiemap_ctx ctx;
+	loff_t ret;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.fi = fi;
+	ctx.prev.type = IOMAP_HOLE;
+
+	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret)
+		return ret;
+
+	while (len > 0) {
+		ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+				iomap_fiemap_actor);
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			break;
+
+		start += ret;
+		len -= ret;
+	}
+
+	if (ctx.prev.type != IOMAP_HOLE) {
+		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 854766f..b3deee1 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 
+struct fiemap_extent_info;
 struct inode;
 struct iov_iter;
 struct kiocb;
@@ -63,5 +64,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		struct iomap_ops *ops);
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		struct iomap_ops *ops);
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		loff_t start, loff_t len, struct iomap_ops *ops);
 
 #endif /* LINUX_IOMAP_H */
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 8/8] xfs: use iomap fiemap implementation
  2016-04-12 20:52 ` Christoph Hellwig
@ 2016-04-12 20:53   ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-04-12 20:53 UTC (permalink / raw)
  To: xfs; +Cc: rpeterso, linux-fsdevel

Note that this removes support for the untested FIEMAP_FLAG_XATTR.  It
could be added relatively easily with iomap ops for the attr fork, but
without test coverage I don't feel safe doing this.
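
For illustration, a rough sketch of how xfs_vn_fiemap() might wire the
attr fork back up, assuming a hypothetical xfs_xattr_iomap_ops that maps
the attribute fork; the core helper would also need to accept
FIEMAP_FLAG_XATTR in its fiemap_check_flags() mask.  Not part of this
series:

	/* Hypothetical sketch only: xfs_xattr_iomap_ops does not exist here. */
	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
		xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
		error = iomap_fiemap(inode, fieinfo, start, length,
				&xfs_xattr_iomap_ops);
		xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
		return error;
	}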

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iops.c | 80 ++++---------------------------------------------------
 1 file changed, 5 insertions(+), 75 deletions(-)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 6dfa10c..6e3614a 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -44,7 +44,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>
 
 /*
@@ -1004,51 +1004,6 @@ xfs_vn_update_time(
 	return xfs_trans_commit(tp);
 }
 
-#define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
-	void			**arg,
-	struct getbmapx		*bmv,
-	int			*full)
-{
-	int			error;
-	struct fiemap_extent_info *fieinfo = *arg;
-	u32			fiemap_flags = 0;
-	u64			logical, physical, length;
-
-	/* Do nothing for a hole */
-	if (bmv->bmv_block == -1LL)
-		return 0;
-
-	logical = BBTOB(bmv->bmv_offset);
-	physical = BBTOB(bmv->bmv_block);
-	length = BBTOB(bmv->bmv_length);
-
-	if (bmv->bmv_oflags & BMV_OF_PREALLOC)
-		fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
-	else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-		fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
-				 FIEMAP_EXTENT_UNKNOWN);
-		physical = 0;   /* no block yet */
-	}
-	if (bmv->bmv_oflags & BMV_OF_LAST)
-		fiemap_flags |= FIEMAP_EXTENT_LAST;
-
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
-					length, fiemap_flags);
-	if (error > 0) {
-		error = 0;
-		*full = 1;	/* user array now full */
-	}
-
-	return error;
-}
-
 STATIC int
 xfs_vn_fiemap(
 	struct inode		*inode,
@@ -1056,38 +1011,13 @@ xfs_vn_fiemap(
 	u64			start,
 	u64			length)
 {
-	xfs_inode_t		*ip = XFS_I(inode);
-	struct getbmapx		bm;
 	int			error;
 
-	error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
-	if (error)
-		return error;
-
-	/* Set up bmap header for xfs internal routine */
-	bm.bmv_offset = BTOBBT(start);
-	/* Special case for whole file */
-	if (length == FIEMAP_MAX_OFFSET)
-		bm.bmv_length = -1LL;
-	else
-		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
-	/* We add one because in getbmap world count includes the header */
-	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
-					fieinfo->fi_extents_max + 1;
-	bm.bmv_count = min_t(__s32, bm.bmv_count,
-			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
-	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
-		bm.bmv_iflags |= BMV_IF_ATTRFORK;
-	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
-		bm.bmv_iflags |= BMV_IF_DELALLOC;
-
-	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
-	if (error)
-		return error;
+	xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+	error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+	xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
 
-	return 0;
+	return error;
 }
 
 STATIC int
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: iomap infrastructure and multipage writes V2
  2016-04-12 20:52 ` Christoph Hellwig
@ 2016-04-13 21:54   ` Dave Chinner
  -1 siblings, 0 replies; 30+ messages in thread
From: Dave Chinner @ 2016-04-13 21:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs, rpeterso, linux-fsdevel

On Tue, Apr 12, 2016 at 01:52:54PM -0700, Christoph Hellwig wrote:
> This series adds a new file system I/O path that uses the iomap structure
> introduced for the pNFS support and supports multi-page buffered writes.
> 
> This was first started by Dave Chinner a long time ago, then I beat
> it into shape for production runs in a very constrained ARM NAS
> environment for Tuxera almost as long ago, and now half a dozen rewrites
> later it's back.
> 
> The basic idea is to avoid the crazy per-block get_blocks overhead
> and make use of extents in the buffered write path by iterating over
> them instead.
> 
> Changes since V1:
>  - add support for fiemap
>  - fix a test fail on 1k block sizes
>  - prepare for 64-bit length, this will be used in a follow on patchset

This appears to pass xfstests on all my test setups without any
regressions. I haven't tested RT devices yet, but we can ignore that
for now. I guess now it's time to look harder at the code.

Christoph, have you done any perf testing of this patchset yet to
check that it does indeed reduce the CPU overhead of large write
operations? I'd also be interested to know if there is any change in
overhead for single page (4k) IOs as well, even though I suspect
there won't be.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/8] xfs: implement iomap based buffered write path
  2016-04-12 20:52   ` Christoph Hellwig
@ 2016-04-14 12:58     ` Brian Foster
  -1 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2016-04-14 12:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs, rpeterso, linux-fsdevel

On Tue, Apr 12, 2016 at 01:52:59PM -0700, Christoph Hellwig wrote:
> Convert XFS to use the new iomap based multipage write path. This involves
> implementing the ->iomap_begin and ->iomap_end methods, and switching the
> buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
> helpers.
> 
> With this change __xfs_get_blocks will never be used for buffered writes,
> and the code handling them can be removed.
> 
> Based on earlier code from Dave Chinner.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/Kconfig     |   1 +
>  fs/xfs/xfs_aops.c  | 212 -----------------------------------------------------
>  fs/xfs/xfs_file.c  |  71 ++++++++----------
>  fs/xfs/xfs_iomap.c | 144 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iomap.h |   5 +-
>  fs/xfs/xfs_iops.c  |   9 ++-
>  fs/xfs/xfs_trace.h |   3 +
>  7 files changed, 187 insertions(+), 258 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 2f37194..73de1ec 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -967,3 +967,147 @@ xfs_bmbt_to_iomap(
...
> +static int
> +xfs_file_iomap_end_delalloc(
> +	struct xfs_inode	*ip,
> +	loff_t			offset,
> +	loff_t			length,
> +	ssize_t			written)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		start_fsb;
> +	xfs_fileoff_t		end_fsb;
> +	int			error = 0;
> +
> +	start_fsb = XFS_B_TO_FSB(mp, offset + written);
> +	end_fsb = XFS_B_TO_FSB(mp, offset + length - written);
> +

Just skimming over this series... but shouldn't this be offset + length?
Why walk back from the end of the allocated range?

Brian

> +	/*
> +	 * Trim back delalloc blocks if we didn't manage to write the whole
> +	 * range reserved.
> +	 *
> +	 * We don't need to care about racing delalloc as we hold i_mutex
> +	 * across the reserve/allocate/unreserve calls. If there are delalloc
> +	 * blocks in the range, they are ours.
> +	 */
> +	if (start_fsb < end_fsb) {
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
> +					       end_fsb - start_fsb);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> +		if (error && !XFS_FORCED_SHUTDOWN(mp)) {
> +			xfs_alert(mp, "%s: unable to clean up ino %lld",
> +				__func__, ip->i_ino);
> +			return error;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +xfs_file_iomap_end(
> +	struct inode		*inode,
> +	loff_t			offset,
> +	loff_t			length,
> +	ssize_t			written,
> +	unsigned		flags,
> +	struct iomap		*iomap)
> +{
> +	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
> +		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
> +				length, written);
> +	return 0;
> +}
> +
> +struct iomap_ops xfs_iomap_ops = {
> +	.iomap_begin		= xfs_file_iomap_begin,
> +	.iomap_end		= xfs_file_iomap_end,
> +};
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 718f07c..e066d04 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -18,7 +18,8 @@
>  #ifndef __XFS_IOMAP_H__
>  #define __XFS_IOMAP_H__
>  
> -struct iomap;
> +#include <linux/iomap.h>
> +
>  struct xfs_inode;
>  struct xfs_bmbt_irec;
>  
> @@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
>  void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
>  		struct xfs_bmbt_irec *);
>  
> +extern struct iomap_ops xfs_iomap_ops;
> +
>  #endif /* __XFS_IOMAP_H__*/
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 1e2086d..6dfa10c 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -38,6 +38,7 @@
>  #include "xfs_dir2.h"
>  #include "xfs_trans_space.h"
>  #include "xfs_pnfs.h"
> +#include "xfs_iomap.h"
>  
>  #include <linux/capability.h>
>  #include <linux/xattr.h>
> @@ -822,8 +823,8 @@ xfs_setattr_size(
>  			error = dax_truncate_page(inode, newsize,
>  					xfs_get_blocks_direct);
>  		} else {
> -			error = block_truncate_page(inode->i_mapping, newsize,
> -					xfs_get_blocks);
> +			error = iomap_truncate_page(inode, newsize,
> +					&did_zeroing, &xfs_iomap_ops);
>  		}
>  	}
>  
> @@ -838,8 +839,8 @@ xfs_setattr_size(
>  	 * problem. Note that this includes any block zeroing we did above;
>  	 * otherwise those blocks may not be zeroed after a crash.
>  	 */
> -	if (newsize > ip->i_d.di_size &&
> -	    (oldsize != ip->i_d.di_size || did_zeroing)) {
> +	if (did_zeroing ||
> +	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
>  		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
>  						      ip->i_d.di_size, newsize);
>  		if (error)
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 840d52e..86fb345 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1296,6 +1296,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
>  DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
>  DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
>  DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
> +DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
> +DEFINE_IOMAP_EVENT(xfs_iomap_found);
> +DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
>  
>  DECLARE_EVENT_CLASS(xfs_simple_io_class,
>  	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
> -- 
> 2.1.4

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: iomap infrastructure and multipage writes V2
  2016-04-13 21:54   ` Dave Chinner
@ 2016-05-02 18:23     ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-05-02 18:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs, rpeterso, linux-fsdevel

Hi Dave,

sorry for taking forever to get back to this - travel to LSF and some
other meetings and a deadline last week didn't leave me any time for
XFS work.

On Thu, Apr 14, 2016 at 07:54:42AM +1000, Dave Chinner wrote:
> Christoph, have you done any perf testing of this patchset yet to
> check that it does indeed reduce the CPU overhead of large write
> operations? I'd also be interested to know if there is any change in
> overhead for single page (4k) IOs as well, even though I suspect
> there won't be.

I've done a lot of testing earlier, and this version also looks very
promising.  On the sort of hardware I have access to now, the 4k
numbers don't change much, but with 1M writes we both increase the
write bandwidth a little bit and significantly lower the CPU usage.

The simple test that demonstrates this is shown below; the runs are from
a 4p VM with 4G of RAM, access to a fast NVMe SSD and a small enough
data size so that writeback shouldn't throttle the buffered write
path:

MNT=/mnt
PERF="perf_3.16"        # soo smart to have tools in the kernel tree..

#BS=4k
#COUNT=65536
BS=1M
COUNT=256

$PERF stat dd if=/dev/zero of=$MNT/testfile bs=$BS count=$COUNT

with the baseline for-next tree I get the following bandwidth and
CPU utilization:

BS=4k: ~600MB/s			0.856 CPUs utilized ( +-  0.32% )
BS=1M: 1.45GB/s			0.820 CPUs utilized ( +-  0.77% )

with all patches applied:

BS=4k:	~610MB/s		0.848 CPUs utilized ( +-  0.36% )
BS=1M:	~1.55GB/s		0.615 CPUs utilized ( +-  0.80% )

This is also visible in the wall time:

baseline, 4k:

real	0m0.540s
user	0m0.000s
sys	0m0.533s

baseline, 1M:

real	0m0.310s
user	0m0.000s
sys	0m0.313s

multipage, 4k:

real	0m0.541s
user	0m0.010s
sys	0m0.527s

multipage, 1M:

real	0m0.272s
user	0m0.000s
sys	0m0.263s

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/8] xfs: implement iomap based buffered write path
  2016-04-14 12:58     ` Brian Foster
@ 2016-05-02 18:25       ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-05-02 18:25 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, xfs, rpeterso, linux-fsdevel

On Thu, Apr 14, 2016 at 08:58:14AM -0400, Brian Foster wrote:
> > +static int
> > +xfs_file_iomap_end_delalloc(
> > +	struct xfs_inode	*ip,
> > +	loff_t			offset,
> > +	loff_t			length,
> > +	ssize_t			written)
> > +{
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	xfs_fileoff_t		start_fsb;
> > +	xfs_fileoff_t		end_fsb;
> > +	int			error = 0;
> > +
> > +	start_fsb = XFS_B_TO_FSB(mp, offset + written);
> > +	end_fsb = XFS_B_TO_FSB(mp, offset + length - written);
> > +
> 
> Just skimming over this series... but shouldn't this be offset + length?
> Why walk back from the end of the allocated range?

Because the interface from the core iomap code needs to pass both the
length of the actually mapped range and the number of bytes successfully
written into it to the filesystem, as other filesystems will require
this for their locking.  We need to convert it back at some point,
and it seems more logical to do that here than in the caller.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/8] xfs: implement iomap based buffered write path
  2016-05-02 18:25       ` Christoph Hellwig
@ 2016-05-03 15:02         ` Brian Foster
  -1 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2016-05-03 15:02 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: rpeterso, linux-fsdevel, xfs

On Mon, May 02, 2016 at 08:25:23PM +0200, Christoph Hellwig wrote:
> On Thu, Apr 14, 2016 at 08:58:14AM -0400, Brian Foster wrote:
> > > +static int
> > > +xfs_file_iomap_end_delalloc(
> > > +	struct xfs_inode	*ip,
> > > +	loff_t			offset,
> > > +	loff_t			length,
> > > +	ssize_t			written)
> > > +{
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +	xfs_fileoff_t		start_fsb;
> > > +	xfs_fileoff_t		end_fsb;
> > > +	int			error = 0;
> > > +
> > > +	start_fsb = XFS_B_TO_FSB(mp, offset + written);
> > > +	end_fsb = XFS_B_TO_FSB(mp, offset + length - written);
> > > +
> > 
> > Just skimming over this series... but shouldn't this be offset + length?
> > Why walk back from the end of the allocated range?
> 
> Because the interface from the core iomap code need to pass the
> length of the actually mapped range, and the amount of bytes successfully
> written into it to the filesystem, as other filesystems will require
> this for their locking.  We need to convert it back at some point,
> and it seems more logical here than in the caller.
> 

I'm not asking about the interface... or at least I'm not following your
point. I'm just suggesting that the calculation of end_fsb is wrong.
E.g., if the intent is to punch out the range that was allocated but not
written to, shouldn't the range to punch be [offset + written, offset +
length]?

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/8] xfs: implement iomap based buffered write path
  2016-05-03 15:02         ` Brian Foster
@ 2016-05-03 18:15           ` Christoph Hellwig
  -1 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2016-05-03 18:15 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, rpeterso, linux-fsdevel, xfs

On Tue, May 03, 2016 at 11:02:19AM -0400, Brian Foster wrote:
> > Because the interface from the core iomap code need to pass the
> > length of the actually mapped range, and the amount of bytes successfully
> > written into it to the filesystem, as other filesystems will require
> > this for their locking.  We need to convert it back at some point,
> > and it seems more logical here than in the caller.
> > 
> 
> I'm not asking about the interface... or at least I'm not following your
> point. I'm just suggesting that the calculation of end_fsb is wrong.
> E.g., if the intent is to punch out the range that was allocated but not
> written to, shouldn't the range to punch be [offset + written, offset +
> length]?

Oh, yes.  It probably should - let me fix it and re-run xfstests..
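
To make it concrete, a minimal worked example, assuming 4k blocks and a
block-aligned short write into a 1M delalloc reservation at offset 0:

	offset = 0, length = 1048576 (1M), written = 524288 (512k)

	as posted:
		start_fsb = XFS_B_TO_FSB(mp, 0 + 524288)           = 128
		end_fsb   = XFS_B_TO_FSB(mp, 0 + 1048576 - 524288) = 128
	start_fsb == end_fsb, so nothing is punched and the unwritten half
	of the reservation leaks.

	with end_fsb = XFS_B_TO_FSB(mp, offset + length):
		end_fsb   = 256, so blocks 128..255 are punched as intended.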

^ permalink raw reply	[flat|nested] 30+ messages in thread
