All of lore.kernel.org
 help / color / mirror / Atom feed
* + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-01-12 23:11 akpm
  2015-01-15 12:41 ` Jan Kara
  0 siblings, 1 reply; 25+ messages in thread
From: akpm @ 2015-01-12 23:11 UTC (permalink / raw)
  To: ross.zwisler, andreas.dilger, axboe, boaz, david, hch, jack,
	kirill.shutemov, mathieu.desnoyers, matthew.r.wilcox, rdunlap,
	tytso, mm-commits


The patch titled
     Subject: ext4: add DAX functionality
has been added to the -mm tree.  Its filename is
     ext4-add-dax-functionality.patch

This patch should soon appear at
    http://ozlabs.org/~akpm/mmots/broken-out/ext4-add-dax-functionality.patch
and later at
    http://ozlabs.org/~akpm/mmotm/broken-out/ext4-add-dax-functionality.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/SubmitChecklist when testing your code ***

The -mm tree is included into linux-next and is updated
there every 3-4 working days

------------------------------------------------------
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Subject: ext4: add DAX functionality

This is a port of the DAX functionality found in the current version of
ext2.

[matthew.r.wilcox@intel.com: heavily tweaked]
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/filesystems/dax.txt  |    1 
 Documentation/filesystems/ext4.txt |    4 +
 fs/ext4/ext4.h                     |    6 +
 fs/ext4/file.c                     |   50 ++++++++++++++-
 fs/ext4/indirect.c                 |   18 +++--
 fs/ext4/inode.c                    |   89 ++++++++++++++++++---------
 fs/ext4/namei.c                    |   10 ++-
 fs/ext4/super.c                    |   39 +++++++++++
 8 files changed, 180 insertions(+), 37 deletions(-)

diff -puN Documentation/filesystems/dax.txt~ext4-add-dax-functionality Documentation/filesystems/dax.txt
--- a/Documentation/filesystems/dax.txt~ext4-add-dax-functionality
+++ a/Documentation/filesystems/dax.txt
@@ -73,6 +73,7 @@ or a write()) work correctly.
 
 These filesystems may be used for inspiration:
 - ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
+- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
 
 
 Shortcomings
diff -puN Documentation/filesystems/ext4.txt~ext4-add-dax-functionality Documentation/filesystems/ext4.txt
--- a/Documentation/filesystems/ext4.txt~ext4-add-dax-functionality
+++ a/Documentation/filesystems/ext4.txt
@@ -386,6 +386,10 @@ max_dir_size_kb=n	This limits the size o
 i_version		Enable 64-bit inode version support. This option is
 			off by default.
 
+dax			Use direct access (no page cache).  See
+			Documentation/filesystems/dax.txt.  Note that
+			this option is incompatible with data=journal.
+
 Data Mode
 =========
 There are 3 different data modes:
diff -puN fs/ext4/ext4.h~ext4-add-dax-functionality fs/ext4/ext4.h
--- a/fs/ext4/ext4.h~ext4-add-dax-functionality
+++ a/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX			0x00200	/* Direct Access */
+#else
+#define EXT4_MOUNT_DAX			0
+#endif
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
 #define EXT4_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
 #define EXT4_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4
 /* file.c */
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
+extern const struct file_operations ext4_dax_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 
 /* inline.c */
diff -puN fs/ext4/file.c~ext4-add-dax-functionality fs/ext4/file.c
--- a/fs/ext4/file.c~ext4-add-dax-functionality
+++ a/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb,
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct mutex *aio_mutex = NULL;
 	struct blk_plug plug;
-	int o_direct = file->f_flags & O_DIRECT;
+	int o_direct = io_is_direct(file);
 	int overwrite = 0;
 	size_t length = iov_iter_count(from);
 	ssize_t ret;
@@ -191,6 +191,27 @@ errout:
 	return ret;
 }
 
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext4_get_block);
+					/* Is this the right get_block? */
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+	.fault		= ext4_dax_fault,
+	.page_mkwrite	= ext4_dax_mkwrite,
+	.remap_pages	= generic_file_remap_pages,
+};
+#else
+#define ext4_dax_vm_ops	ext4_file_vm_ops
+#endif
+
 static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
@@ -201,7 +222,12 @@ static const struct vm_operations_struct
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
-	vma->vm_ops = &ext4_file_vm_ops;
+	if (IS_DAX(file_inode(file))) {
+		vma->vm_ops = &ext4_dax_vm_ops;
+		vma->vm_flags |= VM_MIXEDMAP;
+	} else {
+		vma->vm_ops = &ext4_file_vm_ops;
+	}
 	return 0;
 }
 
@@ -600,6 +626,26 @@ const struct file_operations ext4_file_o
 	.fallocate	= ext4_fallocate,
 };
 
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext4_dax_file_operations = {
+	.llseek		= ext4_llseek,
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= ext4_file_write_iter,
+	.unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.mmap		= ext4_file_mmap,
+	.open		= ext4_file_open,
+	.release	= ext4_release_file,
+	.fsync		= ext4_sync_file,
+	/* Splice not yet supported with DAX */
+	.fallocate	= ext4_fallocate,
+};
+#endif
+
 const struct inode_operations ext4_file_inode_operations = {
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
diff -puN fs/ext4/indirect.c~ext4-add-dax-functionality fs/ext4/indirect.c
--- a/fs/ext4/indirect.c~ext4-add-dax-functionality
+++ a/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
 			inode_dio_done(inode);
 			goto locked;
 		}
-		ret = __blockdev_direct_IO(rw, iocb, inode,
-				 inode->i_sb->s_bdev, iter, offset,
-				 ext4_get_block, NULL, NULL, 0);
+		if (IS_DAX(inode))
+			ret = dax_do_io(rw, iocb, inode, iter, offset,
+					ext4_get_block, NULL, 0);
+		else
+			ret = __blockdev_direct_IO(rw, iocb, inode,
+					inode->i_sb->s_bdev, iter, offset,
+					ext4_get_block, NULL, NULL, 0);
 		inode_dio_done(inode);
 	} else {
 locked:
-		ret = blockdev_direct_IO(rw, iocb, inode, iter,
-				 offset, ext4_get_block);
+		if (IS_DAX(inode))
+			ret = dax_do_io(rw, iocb, inode, iter, offset,
+					ext4_get_block, NULL, DIO_LOCKING);
+		else
+			ret = blockdev_direct_IO(rw, iocb, inode, iter,
+					offset, ext4_get_block);
 
 		if (unlikely((rw & WRITE) && ret < 0)) {
 			loff_t isize = i_size_read(inode);
diff -puN fs/ext4/inode.c~ext4-add-dax-functionality fs/ext4/inode.c
--- a/fs/ext4/inode.c~ext4-add-dax-functionality
+++ a/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
 	return retval;
 }
 
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+			bh->b_assoc_map = inode->i_mapping;
+			bh->b_private = (void *)(unsigned long)iblock;
+			bh->b_end_io = ext4_end_io_unwritten;
+		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw
 		get_block_func = ext4_get_block_write;
 		dio_flags = DIO_LOCKING;
 	}
-	ret = __blockdev_direct_IO(rw, iocb, inode,
-				   inode->i_sb->s_bdev, iter,
-				   offset,
-				   get_block_func,
-				   ext4_end_io_dio,
-				   NULL,
-				   dio_flags);
+	if (IS_DAX(inode))
+		ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+				ext4_end_io_dio, dio_flags);
+	else
+		ret = __blockdev_direct_IO(rw, iocb, inode,
+					   inode->i_sb->s_bdev, iter, offset,
+					   get_block_func,
+					   ext4_end_io_dio, NULL, dio_flags);
 
 	/*
 	 * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
 		inode->i_mapping->a_ops = &ext4_aops;
 }
 
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'.  The range to be zero'd must
- * be contained with in one block.  If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-static int ext4_block_zero_page_range(handle_t *handle,
+static int __ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize, max, pos;
+	unsigned blocksize, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(ha
 		return -ENOMEM;
 
 	blocksize = inode->i_sb->s_blocksize;
-	max = blocksize - (offset & (blocksize - 1));
-
-	/*
-	 * correct length if it does not fall between
-	 * 'from' and the end of the block
-	 */
-	if (length > max || length < 0)
-		length = max;
 
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
@@ -3278,6 +3281,33 @@ unlock:
 }
 
 /*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained with in one block.  If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
+{
+	struct inode *inode = mapping->host;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	unsigned max = blocksize - (offset & (blocksize - 1));
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the block
+	 */
+	if (length > max || length < 0)
+		length = max;
+
+	if (IS_DAX(inode))
+		return dax_zero_page_range(inode, from, length, ext4_get_block);
+	return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
  * up to the end of the block which corresponds to `from'.
  * This required during truncate. We need to physically zero the tail end
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, DAX))
+		new_fl |= S_DAX;
 	inode_set_flags(inode, new_fl,
-			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_blo
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
-		inode->i_fop = &ext4_file_operations;
+		if (test_opt(inode->i_sb, DAX))
+			inode->i_fop = &ext4_dax_file_operations;
+		else
+			inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations;
@@ -4534,7 +4569,7 @@ int ext4_setattr(struct dentry *dentry,
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
 		 */
-			truncate_pagecache(inode, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 	}
 	/*
 	 * We want to call ext4_truncate() even if attr->ia_size ==
diff -puN fs/ext4/namei.c~ext4-add-dax-functionality fs/ext4/namei.c
--- a/fs/ext4/namei.c~ext4-add-dax-functionality
+++ a/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
-		inode->i_fop = &ext4_file_operations;
+		if (test_opt(inode->i_sb, DAX))
+			inode->i_fop = &ext4_dax_file_operations;
+		else
+			inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 		err = ext4_add_nondir(handle, dentry, inode);
 		if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
-		inode->i_fop = &ext4_file_operations;
+		if (test_opt(inode->i_sb, DAX))
+			inode->i_fop = &ext4_dax_file_operations;
+		else
+			inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 		d_tmpfile(dentry, inode);
 		err = ext4_orphan_add(handle, inode);
diff -puN fs/ext4/super.c~ext4-add-dax-functionality fs/ext4/super.c
--- a/fs/ext4/super.c~ext4-add-dax-functionality
+++ a/fs/ext4/super.c
@@ -1137,7 +1137,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
-	Opt_usrquota, Opt_grpquota, Opt_i_version,
+	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1200,6 +1200,7 @@ static const match_table_t tokens = {
 	{Opt_barrier, "barrier"},
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_i_version, "i_version"},
+	{Opt_dax, "dax"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
@@ -1384,6 +1385,7 @@ static const struct mount_opts {
 	{Opt_min_batch_time, 0, MOPT_GTE0},
 	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
 	{Opt_init_itable, 0, MOPT_GTE0},
+	{Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
 	{Opt_stripe, 0, MOPT_GTE0},
 	{Opt_resuid, 0, MOPT_GTE0},
 	{Opt_resgid, 0, MOPT_GTE0},
@@ -1620,6 +1622,11 @@ static int handle_mount_opt(struct super
 		}
 		sbi->s_jquota_fmt = m->mount_opt;
 #endif
+#ifndef CONFIG_FS_DAX
+	} else if (token == Opt_dax) {
+		ext4_msg(sb, KERN_INFO, "dax option not supported");
+		return -1;
+#endif
 	} else {
 		if (!args->from)
 			arg = 1;
@@ -3602,6 +3609,11 @@ static int ext4_fill_super(struct super_
 				 "both data=journal and dioread_nolock");
 			goto failed_mount;
 		}
+		if (test_opt(sb, DAX)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			goto failed_mount;
+		}
 		if (test_opt(sb, DELALLOC))
 			clear_opt(sb, DELALLOC);
 	}
@@ -3665,6 +3677,19 @@ static int ext4_fill_super(struct super_
 		goto failed_mount;
 	}
 
+	if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+		if (blocksize != PAGE_SIZE) {
+			ext4_msg(sb, KERN_ERR,
+					"error: unsupported blocksize for dax");
+			goto failed_mount;
+		}
+		if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			ext4_msg(sb, KERN_ERR,
+					"error: device does not support dax");
+			goto failed_mount;
+		}
+	}
+
 	if (sb->s_blocksize != blocksize) {
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
@@ -4882,6 +4907,18 @@ static int ext4_remount(struct super_blo
 			err = -EINVAL;
 			goto restore_opts;
 		}
+		if (test_opt(sb, DAX)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				 "both data=journal and dax");
+			err = -EINVAL;
+			goto restore_opts;
+		}
+	}
+
+	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+			"dax flag with busy inodes while remounting");
+		sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
 	}
 
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
_

Patches currently in -mm which might be from ross.zwisler@linux.intel.com are

axonram-fix-bug-in-direct_access.patch
block-change-direct_access-calling-convention.patch
mm-fix-xip-fault-vs-truncate-race.patch
mm-fix-xip-fault-vs-truncate-race-fix.patch
mm-allow-page-fault-handlers-to-perform-the-cow.patch
mm-allow-page-fault-handlers-to-perform-the-cow-fix.patch
vfsext2-introduce-is_daxinode.patch
daxext2-replace-xip-read-and-write-with-dax-i-o.patch
daxext2-replace-ext2_clear_xip_target-with-dax_clear_blocks.patch
daxext2-replace-the-xip-page-fault-handler-with-the-dax-page-fault-handler.patch
daxext2-replace-the-xip-page-fault-handler-with-the-dax-page-fault-handler-fix.patch
daxext2-replace-xip_truncate_page-with-dax_truncate_page.patch
dax-replace-xip-documentation-with-dax-documentation.patch
vfs-remove-get_xip_mem.patch
ext2-remove-ext2_xip_verify_sb.patch
ext2-remove-ext2_use_xip.patch
ext2-remove-xipc-and-xiph.patch
vfsext2-remove-config_ext2_fs_xip-and-rename-config_fs_xip-to-config_fs_dax.patch
ext2-remove-ext2_aops_xip.patch
ext2-get-rid-of-most-mentions-of-xip-in-ext2.patch
dax-add-dax_zero_page_range.patch
dax-add-dax_zero_page_range-fix.patch
ext4-add-dax-functionality.patch
brd-rename-xip-to-dax.patch


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-01-12 23:11 + ext4-add-dax-functionality.patch added to -mm tree akpm
@ 2015-01-15 12:41 ` Jan Kara
  2015-01-16 21:16   ` Wilcox, Matthew R
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Kara @ 2015-01-15 12:41 UTC (permalink / raw)
  To: ross.zwisler
  Cc: akpm, andreas.dilger, axboe, boaz, david, hch, jack,
	kirill.shutemov, mathieu.desnoyers, matthew.r.wilcox, rdunlap,
	tytso, mm-commits, linux-ext4

On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> From: Ross Zwisler <ross.zwisler@linux.intel.com>
> Subject: ext4: add DAX functionality
> 
> This is a port of the DAX functionality found in the current version of
> ext2.
> 
> [matthew.r.wilcox@intel.com: heavily tweaked]
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
> Cc: Boaz Harrosh <boaz@plexistor.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Randy Dunlap <rdunlap@infradead.org>
> Cc: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  Documentation/filesystems/dax.txt  |    1 
>  Documentation/filesystems/ext4.txt |    4 +
>  fs/ext4/ext4.h                     |    6 +
>  fs/ext4/file.c                     |   50 ++++++++++++++-
>  fs/ext4/indirect.c                 |   18 +++--
>  fs/ext4/inode.c                    |   89 ++++++++++++++++++---------
>  fs/ext4/namei.c                    |   10 ++-
>  fs/ext4/super.c                    |   39 +++++++++++
>  8 files changed, 180 insertions(+), 37 deletions(-)
> 
> diff -puN Documentation/filesystems/dax.txt~ext4-add-dax-functionality Documentation/filesystems/dax.txt
> --- a/Documentation/filesystems/dax.txt~ext4-add-dax-functionality
> +++ a/Documentation/filesystems/dax.txt
> @@ -73,6 +73,7 @@ or a write()) work correctly.
>  
>  These filesystems may be used for inspiration:
>  - ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
> +- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
>  
>  
>  Shortcomings
> diff -puN Documentation/filesystems/ext4.txt~ext4-add-dax-functionality Documentation/filesystems/ext4.txt
> --- a/Documentation/filesystems/ext4.txt~ext4-add-dax-functionality
> +++ a/Documentation/filesystems/ext4.txt
> @@ -386,6 +386,10 @@ max_dir_size_kb=n	This limits the size o
>  i_version		Enable 64-bit inode version support. This option is
>  			off by default.
>  
> +dax			Use direct access (no page cache).  See
> +			Documentation/filesystems/dax.txt.  Note that
> +			this option is incompatible with data=journal.
> +
>  Data Mode
>  =========
>  There are 3 different data modes:
> diff -puN fs/ext4/ext4.h~ext4-add-dax-functionality fs/ext4/ext4.h
> --- a/fs/ext4/ext4.h~ext4-add-dax-functionality
> +++ a/fs/ext4/ext4.h
> @@ -965,6 +965,11 @@ struct ext4_inode_info {
>  #define EXT4_MOUNT_ERRORS_MASK		0x00070
>  #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
>  #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
> +#ifdef CONFIG_FS_DAX
> +#define EXT4_MOUNT_DAX			0x00200	/* Direct Access */
> +#else
> +#define EXT4_MOUNT_DAX			0
> +#endif
  Again, why do you make definition of EXT4_MOUNT_DAX dependent on
CONFIG_FS_DAX?

> diff -puN fs/ext4/file.c~ext4-add-dax-functionality fs/ext4/file.c
> --- a/fs/ext4/file.c~ext4-add-dax-functionality
> +++ a/fs/ext4/file.c
> @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb,
>  	struct inode *inode = file_inode(iocb->ki_filp);
>  	struct mutex *aio_mutex = NULL;
>  	struct blk_plug plug;
> -	int o_direct = file->f_flags & O_DIRECT;
> +	int o_direct = io_is_direct(file);
>  	int overwrite = 0;
>  	size_t length = iov_iter_count(from);
>  	ssize_t ret;
> @@ -191,6 +191,27 @@ errout:
>  	return ret;
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_fault(vma, vmf, ext4_get_block);
> +					/* Is this the right get_block? */
  You can remove the comment. It is the right get_block function.

...
> diff -puN fs/ext4/inode.c~ext4-add-dax-functionality fs/ext4/inode.c
> --- a/fs/ext4/inode.c~ext4-add-dax-functionality
> +++ a/fs/ext4/inode.c
> @@ -657,6 +657,18 @@ has_zeroout:
>  	return retval;
>  }
>  
> +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> +{
> +	struct inode *inode = bh->b_assoc_map->host;
> +	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
  That should be 16 TB if I'm doing the math right - 32-bit block number *
block size (4k) = 16 TB. And that's the max limit of ext4 (as logical file
offset in blocks has to fit in 32-bits for ext4). So I think you can just
remove the comment. But also see comment below.

> +	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> +	int err;
> +	if (!uptodate)
> +		return;
> +	WARN_ON(!buffer_unwritten(bh));
> +	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> +}
> +
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> +			bh->b_assoc_map = inode->i_mapping;
> +			bh->b_private = (void *)(unsigned long)iblock;
> +			bh->b_end_io = ext4_end_io_unwritten;
> +		}
  So why is this needed? It would deserve a comment. It confuses me in
particular because:
1) This is a often a phony bh used just as a container for passed data and
   b_end_io is just ignored.
2) Even if it was real bh attached to a page, for DAX we don't do any
   writeback and thus ->b_end_io will never get called?
3) And if it does get called, you certainly cannot call
   ext4_convert_unwritten_extents() from softirq context where ->b_end_io
   gets called.

>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;

								Honza

^ permalink raw reply	[flat|nested] 25+ messages in thread

* RE: + ext4-add-dax-functionality.patch added to -mm tree
  2015-01-15 12:41 ` Jan Kara
@ 2015-01-16 21:16   ` Wilcox, Matthew R
  2015-01-19 14:18       ` Jan Kara
  0 siblings, 1 reply; 25+ messages in thread
From: Wilcox, Matthew R @ 2015-01-16 21:16 UTC (permalink / raw)
  To: Jan Kara, ross.zwisler
  Cc: akpm, Dilger, Andreas, axboe, boaz, david, hch, kirill.shutemov,
	mathieu.desnoyers, rdunlap, tytso, mm-commits, linux-ext4,
	Matthew Wilcox

-----Original Message-----
From: Jan Kara [mailto:jack@suse.cz] 
Sent: Thursday, January 15, 2015 4:41 AM
To: ross.zwisler@linux.intel.com
Cc: akpm@linux-foundation.org; Dilger, Andreas; axboe@kernel.dk; boaz@plexistor.com; david@fromorbit.com; hch@lst.de; jack@suse.cz; kirill.shutemov@linux.intel.com; mathieu.desnoyers@efficios.com; Wilcox, Matthew R; rdunlap@infradead.org; tytso@mit.edu; mm-commits@vger.kernel.org; linux-ext4@vger.kernel.org
Subject: Re: + ext4-add-dax-functionality.patch added to -mm tree

On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> +#ifdef CONFIG_FS_DAX
> +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_fault(vma, vmf, ext4_get_block);
> +					/* Is this the right get_block? */
  You can remove the comment. It is the right get_block function.

Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?  According to the comments, ext4_get_block() doesn't allocate uninitialized extents, which we do want it to do.

> diff -puN fs/ext4/inode.c~ext4-add-dax-functionality fs/ext4/inode.c
> --- a/fs/ext4/inode.c~ext4-add-dax-functionality
> +++ a/fs/ext4/inode.c
> @@ -657,6 +657,18 @@ has_zeroout:
>  	return retval;
>  }
>  
> +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> +{
> +	struct inode *inode = bh->b_assoc_map->host;
> +	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
  That should be 16 TB if I'm doing the math right - 32-bit block number *
block size (4k) = 16 TB. And that's the max limit of ext4 (as logical file
offset in blocks has to fit in 32-bits for ext4). So I think you can just
remove the comment. But also see comment below.

Blargh, yes, you're right.

> @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> +			bh->b_assoc_map = inode->i_mapping;
> +			bh->b_private = (void *)(unsigned long)iblock;
> +			bh->b_end_io = ext4_end_io_unwritten;
> +		}
  So why is this needed? It would deserve a comment. It confuses me in
particular because:
1) This is a often a phony bh used just as a container for passed data and
   b_end_io is just ignored.
2) Even if it was real bh attached to a page, for DAX we don't do any
   writeback and thus ->b_end_io will never get called?
3) And if it does get called, you certainly cannot call
   ext4_convert_unwritten_extents() from softirq context where ->b_end_io
   gets called.

This got added to fix a problem that Dave Chinner pointed out.  We need the allocated extent to either be zeroed (as ext2 does), or marked as unwritten (ext4, XFS) so that a racing read/page fault doesn't return uninitialized data.  If it's marked as unwritten, we need to convert it to a written extent after we've initialised the contents.  We use the b_end_io() callback to do this, and it's called from the DAX code, not in softirq context.

>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;

								Honza

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-01-16 21:16   ` Wilcox, Matthew R
@ 2015-01-19 14:18       ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-01-19 14:18 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: Jan Kara, ross.zwisler, akpm, Dilger, Andreas, axboe, boaz,
	david, hch, kirill.shutemov, mathieu.desnoyers, rdunlap, tytso,
	mm-commits, linux-ext4, Matthew Wilcox, xfs

On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> > > +#ifdef CONFIG_FS_DAX
> > > +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > +{
> > > +	return dax_fault(vma, vmf, ext4_get_block);
> > > +					/* Is this the right get_block? */
> >   You can remove the comment. It is the right get_block function.
> 
> Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> According to the comments, ext4_get_block() doesn't allocate
> uninitialized extents, which we do want it to do.
  Hum, so if I understand the code right dax_fault() will allocate a block
(== page in persistent memory) for a faulted address and will map this
block directly into process' address space. Thus that block has to be
zeroed out before the fault finishes no matter what (so that userspace
doesn't see garbage) - unwritten block handling in the filesystem doesn't
really matter (and would only cause unnecessary overhead) because of the
direct mapping of the block to process' address space. So I would think
that it would be easiest if dax_fault() simply zeroed out blocks which got
allocated. You could rewrite part of dax_fault() to something like:

	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
	error = get_block(inode, block, &bh, create);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;
	if (error)
		goto unlock_page;

	if (buffer_new(&bh)) {
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
		major = VM_FAULT_MAJOR;
		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
	} else if (!buffer_mapped(&bh))
		return dax_load_hole(mapping, page, vmf);

Note, that we also avoided calling get_block() callback twice on major fault
as that's relatively expensive due to locking, extent tree lookups, etc.

Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
I understand the code right.

And with this change to dax_fault() using ext4_get_block() is the right
function to call. It will just allocate a block if necessary and attach it
to an appropriate place in the extent tree which is all you need.

> > > @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
> > >  
> > >  		map_bh(bh, inode->i_sb, map.m_pblk);
> > >  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> > > +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> > > +			bh->b_assoc_map = inode->i_mapping;
> > > +			bh->b_private = (void *)(unsigned long)iblock;
> > > +			bh->b_end_io = ext4_end_io_unwritten;
> > > +		}
> >   So why is this needed? It would deserve a comment. It confuses me in
> > particular because:
> > 1) This is a often a phony bh used just as a container for passed data and
> >    b_end_io is just ignored.
> > 2) Even if it was real bh attached to a page, for DAX we don't do any
> >    writeback and thus ->b_end_io will never get called?
> > 3) And if it does get called, you certainly cannot call
> >    ext4_convert_unwritten_extents() from softirq context where ->b_end_io
> >    gets called.
> 
> This got added to fix a problem that Dave Chinner pointed out.  We need
> the allocated extent to either be zeroed (as ext2 does), or marked as
> unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> uninitialized data.  If it's marked as unwritten, we need to convert it
> to a written extent after we've initialised the contents.  We use the
> b_end_io() callback to do this, and it's called from the DAX code, not in
> softirq context.
  OK, I see. But I didn't find where ->b_end_io gets called from dax code
(specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
point me please?

Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
trying to get away from passing phony bh around and this would entangle us
even more into that mess). Normally I would think that end_io() callback
passed into dax_do_io() should perform necessary conversions and for
dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-01-19 14:18       ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-01-19 14:18 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, hch,
	mathieu.desnoyers, tytso, mm-commits, Matthew Wilcox, Jan Kara,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> > > +#ifdef CONFIG_FS_DAX
> > > +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > +{
> > > +	return dax_fault(vma, vmf, ext4_get_block);
> > > +					/* Is this the right get_block? */
> >   You can remove the comment. It is the right get_block function.
> 
> Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> According to the comments, ext4_get_block() doesn't allocate
> uninitialized extents, which we do want it to do.
  Hum, so if I understand the code right dax_fault() will allocate a block
(== page in persistent memory) for a faulted address and will map this
block directly into process' address space. Thus that block has to be
zeroed out before the fault finishes no matter what (so that userspace
doesn't see garbage) - unwritten block handling in the filesystem doesn't
really matter (and would only cause unnecessary overhead) because of the
direct mapping of the block to process' address space. So I would think
that it would be easiest if dax_fault() simply zeroed out blocks which got
allocated. You could rewrite part of dax_fault() to something like:

	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
	error = get_block(inode, block, &bh, create);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;
	if (error)
		goto unlock_page;

	if (buffer_new(&bh)) {
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
		major = VM_FAULT_MAJOR;
		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
	} else if (!buffer_mapped(&bh))
		return dax_load_hole(mapping, page, vmf);

Note, that we also avoided calling get_block() callback twice on major fault
as that's relatively expensive due to locking, extent tree lookups, etc.

Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
I understand the code right.

And with this change to dax_fault() using ext4_get_block() is the right
function to call. It will just allocate a block if necessary and attach it
to an appropriate place in the extent tree which is all you need.

> > > @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
> > >  
> > >  		map_bh(bh, inode->i_sb, map.m_pblk);
> > >  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> > > +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> > > +			bh->b_assoc_map = inode->i_mapping;
> > > +			bh->b_private = (void *)(unsigned long)iblock;
> > > +			bh->b_end_io = ext4_end_io_unwritten;
> > > +		}
> >   So why is this needed? It would deserve a comment. It confuses me in
> > particular because:
> > 1) This is a often a phony bh used just as a container for passed data and
> >    b_end_io is just ignored.
> > 2) Even if it was real bh attached to a page, for DAX we don't do any
> >    writeback and thus ->b_end_io will never get called?
> > 3) And if it does get called, you certainly cannot call
> >    ext4_convert_unwritten_extents() from softirq context where ->b_end_io
> >    gets called.
> 
> This got added to fix a problem that Dave Chinner pointed out.  We need
> the allocated extent to either be zeroed (as ext2 does), or marked as
> unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> uninitialized data.  If it's marked as unwritten, we need to convert it
> to a written extent after we've initialised the contents.  We use the
> b_end_io() callback to do this, and it's called from the DAX code, not in
> softirq context.
  OK, I see. But I didn't find where ->b_end_io gets called from dax code
(specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
point me please?

Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
trying to get away from passing phony bh around and this would entangle us
even more into that mess). Normally I would think that end_io() callback
passed into dax_do_io() should perform necessary conversions and for
dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-01-19 14:18       ` Jan Kara
@ 2015-02-17  8:52         ` Jan Kara
  -1 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-17  8:52 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: Jan Kara, ross.zwisler, akpm, Dilger, Andreas, axboe, boaz,
	david, hch, kirill.shutemov, mathieu.desnoyers, rdunlap, tytso,
	mm-commits, linux-ext4, Matthew Wilcox, xfs

  Matthew, I think I still didn't see response to this. I think we can
fixup things after they are merged (since Andrew sent this patch to Linus)
but IMHO it needs some action...

								Honza
On Mon 19-01-15 15:18:58, Jan Kara wrote:
> On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> > > > +#ifdef CONFIG_FS_DAX
> > > > +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > > +{
> > > > +	return dax_fault(vma, vmf, ext4_get_block);
> > > > +					/* Is this the right get_block? */
> > >   You can remove the comment. It is the right get_block function.
> > 
> > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > According to the comments, ext4_get_block() doesn't allocate
> > uninitialized extents, which we do want it to do.
>   Hum, so if I understand the code right dax_fault() will allocate a block
> (== page in persistent memory) for a faulted address and will map this
> block directly into process' address space. Thus that block has to be
> zeroed out before the fault finishes no matter what (so that userspace
> doesn't see garbage) - unwritten block handling in the filesystem doesn't
> really matter (and would only cause unnecessary overhead) because of the
> direct mapping of the block to process' address space. So I would think
> that it would be easiest if dax_fault() simply zeroed out blocks which got
> allocated. You could rewrite part of dax_fault() to something like:
> 
> 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> 	error = get_block(inode, block, &bh, create);
> 	if (!error && (bh.b_size < PAGE_SIZE))
> 		error = -EIO;
> 	if (error)
> 		goto unlock_page;
> 
> 	if (buffer_new(&bh)) {
> 		count_vm_event(PGMAJFAULT);
> 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> 		major = VM_FAULT_MAJOR;
> 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> 	} else if (!buffer_mapped(&bh))
> 		return dax_load_hole(mapping, page, vmf);
> 
> Note, that we also avoided calling get_block() callback twice on major fault
> as that's relatively expensive due to locking, extent tree lookups, etc.
> 
> Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> I understand the code right.
> 
> And with this change to dax_fault() using ext4_get_block() is the right
> function to call. It will just allocate a block if necessary and attach it
> to an appropriate place in the extent tree which is all you need.
> 
> > > > @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
> > > >  
> > > >  		map_bh(bh, inode->i_sb, map.m_pblk);
> > > >  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> > > > +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> > > > +			bh->b_assoc_map = inode->i_mapping;
> > > > +			bh->b_private = (void *)(unsigned long)iblock;
> > > > +			bh->b_end_io = ext4_end_io_unwritten;
> > > > +		}
> > >   So why is this needed? It would deserve a comment. It confuses me in
> > > particular because:
> > > 1) This is a often a phony bh used just as a container for passed data and
> > >    b_end_io is just ignored.
> > > 2) Even if it was real bh attached to a page, for DAX we don't do any
> > >    writeback and thus ->b_end_io will never get called?
> > > 3) And if it does get called, you certainly cannot call
> > >    ext4_convert_unwritten_extents() from softirq context where ->b_end_io
> > >    gets called.
> > 
> > This got added to fix a problem that Dave Chinner pointed out.  We need
> > the allocated extent to either be zeroed (as ext2 does), or marked as
> > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > uninitialized data.  If it's marked as unwritten, we need to convert it
> > to a written extent after we've initialised the contents.  We use the
> > b_end_io() callback to do this, and it's called from the DAX code, not in
> > softirq context.
>   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> point me please?
> 
> Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> trying to get away from passing phony bh around and this would entangle us
> even more into that mess). Normally I would think that end_io() callback
> passed into dax_do_io() should perform necessary conversions and for
> dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> 
> 									Honza
> -- 
> Jan Kara <jack@suse.cz>
> SUSE Labs, CR
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-17  8:52         ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-17  8:52 UTC (permalink / raw)
  To: Wilcox, Matthew R
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, hch,
	mathieu.desnoyers, tytso, mm-commits, Matthew Wilcox, Jan Kara,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

  Matthew, I think I still didn't see response to this. I think we can
fixup things after they are merged (since Andrew sent this patch to Linus)
but IMHO it needs some action...

								Honza
On Mon 19-01-15 15:18:58, Jan Kara wrote:
> On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > On Mon 12-01-15 15:11:17, Andrew Morton wrote:
> > > > +#ifdef CONFIG_FS_DAX
> > > > +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > > +{
> > > > +	return dax_fault(vma, vmf, ext4_get_block);
> > > > +					/* Is this the right get_block? */
> > >   You can remove the comment. It is the right get_block function.
> > 
> > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > According to the comments, ext4_get_block() doesn't allocate
> > uninitialized extents, which we do want it to do.
>   Hum, so if I understand the code right dax_fault() will allocate a block
> (== page in persistent memory) for a faulted address and will map this
> block directly into process' address space. Thus that block has to be
> zeroed out before the fault finishes no matter what (so that userspace
> doesn't see garbage) - unwritten block handling in the filesystem doesn't
> really matter (and would only cause unnecessary overhead) because of the
> direct mapping of the block to process' address space. So I would think
> that it would be easiest if dax_fault() simply zeroed out blocks which got
> allocated. You could rewrite part of dax_fault() to something like:
> 
> 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> 	error = get_block(inode, block, &bh, create);
> 	if (!error && (bh.b_size < PAGE_SIZE))
> 		error = -EIO;
> 	if (error)
> 		goto unlock_page;
> 
> 	if (buffer_new(&bh)) {
> 		count_vm_event(PGMAJFAULT);
> 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> 		major = VM_FAULT_MAJOR;
> 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> 	} else if (!buffer_mapped(&bh))
> 		return dax_load_hole(mapping, page, vmf);
> 
> Note, that we also avoided calling get_block() callback twice on major fault
> as that's relatively expensive due to locking, extent tree lookups, etc.
> 
> Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> I understand the code right.
> 
> And with this change to dax_fault() using ext4_get_block() is the right
> function to call. It will just allocate a block if necessary and attach it
> to an appropriate place in the extent tree which is all you need.
> 
> > > > @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode
> > > >  
> > > >  		map_bh(bh, inode->i_sb, map.m_pblk);
> > > >  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> > > > +		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> > > > +			bh->b_assoc_map = inode->i_mapping;
> > > > +			bh->b_private = (void *)(unsigned long)iblock;
> > > > +			bh->b_end_io = ext4_end_io_unwritten;
> > > > +		}
> > >   So why is this needed? It would deserve a comment. It confuses me in
> > > particular because:
> > > 1) This is a often a phony bh used just as a container for passed data and
> > >    b_end_io is just ignored.
> > > 2) Even if it was real bh attached to a page, for DAX we don't do any
> > >    writeback and thus ->b_end_io will never get called?
> > > 3) And if it does get called, you certainly cannot call
> > >    ext4_convert_unwritten_extents() from softirq context where ->b_end_io
> > >    gets called.
> > 
> > This got added to fix a problem that Dave Chinner pointed out.  We need
> > the allocated extent to either be zeroed (as ext2 does), or marked as
> > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > uninitialized data.  If it's marked as unwritten, we need to convert it
> > to a written extent after we've initialised the contents.  We use the
> > b_end_io() callback to do this, and it's called from the DAX code, not in
> > softirq context.
>   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> point me please?
> 
> Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> trying to get away from passing phony bh around and this would entangle us
> even more into that mess). Normally I would think that end_io() callback
> passed into dax_do_io() should perform necessary conversions and for
> dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> 
> 									Honza
> -- 
> Jan Kara <jack@suse.cz>
> SUSE Labs, CR
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-17  8:52         ` Jan Kara
@ 2015-02-17 13:37           ` Matthew Wilcox
  -1 siblings, 0 replies; 25+ messages in thread
From: Matthew Wilcox @ 2015-02-17 13:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: Wilcox, Matthew R, ross.zwisler, akpm, Dilger, Andreas, axboe,
	boaz, david, hch, kirill.shutemov, mathieu.desnoyers, rdunlap,
	tytso, mm-commits, linux-ext4, Matthew Wilcox, xfs

On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
>   Matthew, I think I still didn't see response to this. I think we can
> fixup things after they are merged (since Andrew sent this patch to Linus)
> but IMHO it needs some action...

Sorry, I thought I'd replied to this.

> On Mon 19-01-15 15:18:58, Jan Kara wrote:
> > On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > > According to the comments, ext4_get_block() doesn't allocate
> > > uninitialized extents, which we do want it to do.
> >   Hum, so if I understand the code right dax_fault() will allocate a block
> > (== page in persistent memory) for a faulted address and will map this
> > block directly into process' address space. Thus that block has to be
> > zeroed out before the fault finishes no matter what (so that userspace
> > doesn't see garbage) - unwritten block handling in the filesystem doesn't
> > really matter (and would only cause unnecessary overhead) because of the
> > direct mapping of the block to process' address space. So I would think
> > that it would be easiest if dax_fault() simply zeroed out blocks which got
> > allocated. You could rewrite part of dax_fault() to something like:
> > 
> > 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> > 	error = get_block(inode, block, &bh, create);
> > 	if (!error && (bh.b_size < PAGE_SIZE))
> > 		error = -EIO;
> > 	if (error)
> > 		goto unlock_page;
> > 
> > 	if (buffer_new(&bh)) {
> > 		count_vm_event(PGMAJFAULT);
> > 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> > 		major = VM_FAULT_MAJOR;
> > 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> > 	} else if (!buffer_mapped(&bh))
> > 		return dax_load_hole(mapping, page, vmf);
> > 
> > Note, that we also avoided calling get_block() callback twice on major fault
> > as that's relatively expensive due to locking, extent tree lookups, etc.
> > 
> > Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> > I understand the code right.

I think you've missed the case where we lose power after ext2 has
allocated the block and before dax_clear_blocks() is called.  After power
returns, ext4 will show an unwritten extent in the tree, which will be
zeroed before being handed to a user.  ext2 must have zeroed the block
before linking it into the inode's data blocks.

I didn't realise that calling get_block() was an expensive operation;
I'm open to reworking this piece of code to only call it once.

> > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > to a written extent after we've initialised the contents.  We use the
> > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > softirq context.
> >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > point me please?

For faults, we call it in dax_insert_mapping(), the very last thing
before returning in the fault path.  The normal I/O path gets to use
the dio_iodone_t for the same purpose.

> > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > trying to get away from passing phony bh around and this would entangle us
> > even more into that mess). Normally I would think that end_io() callback
> > passed into dax_do_io() should perform necessary conversions and for
> > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...

Dave sees to be the one trying the hardest to get rid of the phony BHs
... and it was his idea to (ab)use b_end_io for this.  The problem with
doing the conversion in ext4_page_mkwrite() is that we don't know at
that point whether the BH is unwritten or not.



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-17 13:37           ` Matthew Wilcox
  0 siblings, 0 replies; 25+ messages in thread
From: Matthew Wilcox @ 2015-02-17 13:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Matthew Wilcox,
	tytso, ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
>   Matthew, I think I still didn't see response to this. I think we can
> fixup things after they are merged (since Andrew sent this patch to Linus)
> but IMHO it needs some action...

Sorry, I thought I'd replied to this.

> On Mon 19-01-15 15:18:58, Jan Kara wrote:
> > On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > > According to the comments, ext4_get_block() doesn't allocate
> > > uninitialized extents, which we do want it to do.
> >   Hum, so if I understand the code right dax_fault() will allocate a block
> > (== page in persistent memory) for a faulted address and will map this
> > block directly into process' address space. Thus that block has to be
> > zeroed out before the fault finishes no matter what (so that userspace
> > doesn't see garbage) - unwritten block handling in the filesystem doesn't
> > really matter (and would only cause unnecessary overhead) because of the
> > direct mapping of the block to process' address space. So I would think
> > that it would be easiest if dax_fault() simply zeroed out blocks which got
> > allocated. You could rewrite part of dax_fault() to something like:
> > 
> > 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> > 	error = get_block(inode, block, &bh, create);
> > 	if (!error && (bh.b_size < PAGE_SIZE))
> > 		error = -EIO;
> > 	if (error)
> > 		goto unlock_page;
> > 
> > 	if (buffer_new(&bh)) {
> > 		count_vm_event(PGMAJFAULT);
> > 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> > 		major = VM_FAULT_MAJOR;
> > 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> > 	} else if (!buffer_mapped(&bh))
> > 		return dax_load_hole(mapping, page, vmf);
> > 
> > Note, that we also avoided calling get_block() callback twice on major fault
> > as that's relatively expensive due to locking, extent tree lookups, etc.
> > 
> > Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> > I understand the code right.

I think you've missed the case where we lose power after ext2 has
allocated the block and before dax_clear_blocks() is called.  After power
returns, ext4 will show an unwritten extent in the tree, which will be
zeroed before being handed to a user.  ext2 must have zeroed the block
before linking it into the inode's data blocks.

I didn't realise that calling get_block() was an expensive operation;
I'm open to reworking this piece of code to only call it once.

> > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > to a written extent after we've initialised the contents.  We use the
> > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > softirq context.
> >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > point me please?

For faults, we call it in dax_insert_mapping(), the very last thing
before returning in the fault path.  The normal I/O path gets to use
the dio_iodone_t for the same purpose.

> > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > trying to get away from passing phony bh around and this would entangle us
> > even more into that mess). Normally I would think that end_io() callback
> > passed into dax_do_io() should perform necessary conversions and for
> > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...

Dave sees to be the one trying the hardest to get rid of the phony BHs
... and it was his idea to (ab)use b_end_io for this.  The problem with
doing the conversion in ext4_page_mkwrite() is that we don't know at
that point whether the BH is unwritten or not.


_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-17 13:37           ` Matthew Wilcox
@ 2015-02-18 10:40             ` Jan Kara
  -1 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-18 10:40 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jan Kara, Wilcox, Matthew R, ross.zwisler, akpm, Dilger, Andreas,
	axboe, boaz, david, hch, kirill.shutemov, mathieu.desnoyers,
	rdunlap, tytso, mm-commits, linux-ext4, xfs

On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> >   Matthew, I think I still didn't see response to this. I think we can
> > fixup things after they are merged (since Andrew sent this patch to Linus)
> > but IMHO it needs some action...
> 
> Sorry, I thought I'd replied to this.
> 
> > On Mon 19-01-15 15:18:58, Jan Kara wrote:
> > > On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > > > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > > > According to the comments, ext4_get_block() doesn't allocate
> > > > uninitialized extents, which we do want it to do.
> > >   Hum, so if I understand the code right dax_fault() will allocate a block
> > > (== page in persistent memory) for a faulted address and will map this
> > > block directly into process' address space. Thus that block has to be
> > > zeroed out before the fault finishes no matter what (so that userspace
> > > doesn't see garbage) - unwritten block handling in the filesystem doesn't
> > > really matter (and would only cause unnecessary overhead) because of the
> > > direct mapping of the block to process' address space. So I would think
> > > that it would be easiest if dax_fault() simply zeroed out blocks which got
> > > allocated. You could rewrite part of dax_fault() to something like:
> > > 
> > > 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> > > 	error = get_block(inode, block, &bh, create);
> > > 	if (!error && (bh.b_size < PAGE_SIZE))
> > > 		error = -EIO;
> > > 	if (error)
> > > 		goto unlock_page;
> > > 
> > > 	if (buffer_new(&bh)) {
> > > 		count_vm_event(PGMAJFAULT);
> > > 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> > > 		major = VM_FAULT_MAJOR;
> > > 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> > > 	} else if (!buffer_mapped(&bh))
> > > 		return dax_load_hole(mapping, page, vmf);
> > > 
> > > Note, that we also avoided calling get_block() callback twice on major fault
> > > as that's relatively expensive due to locking, extent tree lookups, etc.
> > > 
> > > Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> > > I understand the code right.
> 
> I think you've missed the case where we lose power after ext2 has
> allocated the block and before dax_clear_blocks() is called.  After power
> returns, ext4 will show an unwritten extent in the tree, which will be
> zeroed before being handed to a user.  ext2 must have zeroed the block
> before linking it into the inode's data blocks.
  So the way ext4 traditionally deals with this is that we remember inode
data needs to be flushed before transaction doing the allocation commits.

So to handle this it can start transaction in ext4_dax_fault() /
ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
after dax_fault() / dax_mkwrite() returns. Complete function will look
something like follows:

static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	handle_t *handle;
	int create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
	struct inode *inode = file_inode(vma->vm_file);
	int ret;
	int retries = 0;

	if (create) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
retry_alloc:
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					ext4_writepage_trans_blocks(inode));
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
	}
	ret = do_dax_fault(vma, vmf, ext4_get_block);
	if (create) {
		if (!ret &&
		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
			ret = ext4_jbd2_file_inode(handle, inode);
		ext4_journal_stop(handle);
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_alloc;
	}
out:
	if (create)
		sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}

This will be much faster than dances with unwritten extents. It seems more
complex but as a bonus you get proper retry logic on ENOSPC (ext4 can see
temporary ENOSPC errors due to pending transaction commit) and also you
won't rely on _ext4_get_block() to start a transaction for you - looking at
that current code has a lock ordering violation (at least for the case
where the page exists) because page lock ranks under transaction start but
you end up calling ext4_get_block() with create == 1 and without
transaction started under page lock (we don't have lockdep instrumentation
for page locks so these problems show up only as deadlocks under load).

You may notice that for this to work, I need do_dax_fault() exported
(that's due to lock ordering of sb_start_pagefault() which ranks above
transaction start). I also need do_dax_fault() to return standard errno
(similarly to __block_page_mkwrite()) which the caller then converts to
fault handler return value via block_page_mkwrite_return() (or you could
create your own dax_fault_return()).

> > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > to a written extent after we've initialised the contents.  We use the
> > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > softirq context.
> > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > point me please?
> 
> For faults, we call it in dax_insert_mapping(), the very last thing
> before returning in the fault path.  The normal I/O path gets to use
> the dio_iodone_t for the same purpose.
  I see. I didn't think of races with reads (hum, I actually wonder whether
we don't have this data exposure problem for ext4 for mmapped write into
a hole vs direct read as well). So I guess we do need those unwritten
extent dances after all (or we would need to have a page covering hole when
writing to it via mmap but I guess unwritten extent dances are somewhat
more standard).

So in that case you need ext4_get_block_write() in the above call to
do_dax_fault() (note that we still do need ext4 version of the fault
function like above due to lock ranking and retry on ENOSPC). And please
comment about the read races at that call site so that we have that
subtlety documented somewhere.

> > > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > > trying to get away from passing phony bh around and this would entangle us
> > > even more into that mess). Normally I would think that end_io() callback
> > > passed into dax_do_io() should perform necessary conversions and for
> > > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> 
> Dave sees to be the one trying the hardest to get rid of the phony BHs
> ... and it was his idea to (ab)use b_end_io for this.  The problem with
> doing the conversion in ext4_page_mkwrite() is that we don't know at
> that point whether the BH is unwritten or not.
  I see, thanks for explanation. So it would be enough to pass a bit of
information (unwritten / written) to a caller of do_dax_fault() and that
can then call conversion function. IMO doing that (either in a return value
or in a separate argument of do_dax_fault()) would be cleaner and more
readable than playing with b_private and b_end_io... Thoughts?

								Honza 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-18 10:40             ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-18 10:40 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Jan Kara,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> >   Matthew, I think I still didn't see response to this. I think we can
> > fixup things after they are merged (since Andrew sent this patch to Linus)
> > but IMHO it needs some action...
> 
> Sorry, I thought I'd replied to this.
> 
> > On Mon 19-01-15 15:18:58, Jan Kara wrote:
> > > On Fri 16-01-15 21:16:03, Wilcox, Matthew R wrote:
> > > > Are you sure it shouldn't be ext4_get_block_write, or _write_nolock?
> > > > According to the comments, ext4_get_block() doesn't allocate
> > > > uninitialized extents, which we do want it to do.
> > >   Hum, so if I understand the code right dax_fault() will allocate a block
> > > (== page in persistent memory) for a faulted address and will map this
> > > block directly into process' address space. Thus that block has to be
> > > zeroed out before the fault finishes no matter what (so that userspace
> > > doesn't see garbage) - unwritten block handling in the filesystem doesn't
> > > really matter (and would only cause unnecessary overhead) because of the
> > > direct mapping of the block to process' address space. So I would think
> > > that it would be easiest if dax_fault() simply zeroed out blocks which got
> > > allocated. You could rewrite part of dax_fault() to something like:
> > > 
> > > 	create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
> > > 	error = get_block(inode, block, &bh, create);
> > > 	if (!error && (bh.b_size < PAGE_SIZE))
> > > 		error = -EIO;
> > > 	if (error)
> > > 		goto unlock_page;
> > > 
> > > 	if (buffer_new(&bh)) {
> > > 		count_vm_event(PGMAJFAULT);
> > > 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> > > 		major = VM_FAULT_MAJOR;
> > > 		dax_clear_blocks(inode, bh->b_blocknr, PAGE_SIZE);
> > > 	} else if (!buffer_mapped(&bh))
> > > 		return dax_load_hole(mapping, page, vmf);
> > > 
> > > Note, that we also avoided calling get_block() callback twice on major fault
> > > as that's relatively expensive due to locking, extent tree lookups, etc.
> > > 
> > > Also note that ext2 then doesn't have to call dax_clear_blocks() at all if
> > > I understand the code right.
> 
> I think you've missed the case where we lose power after ext2 has
> allocated the block and before dax_clear_blocks() is called.  After power
> returns, ext4 will show an unwritten extent in the tree, which will be
> zeroed before being handed to a user.  ext2 must have zeroed the block
> before linking it into the inode's data blocks.
  So the way ext4 traditionally deals with this is that we remember inode
data needs to be flushed before transaction doing the allocation commits.

So to handle this it can start transaction in ext4_dax_fault() /
ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
after dax_fault() / dax_mkwrite() returns. Complete function will look
something like follows:

static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	handle_t *handle;
	int create = !vmf->cow_page && (vmf->flags & FAULT_FLAG_WRITE);
	struct inode *inode = file_inode(vma->vm_file);
	int ret;
	int retries = 0;

	if (create) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
retry_alloc:
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					ext4_writepage_trans_blocks(inode));
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
	}
	ret = do_dax_fault(vma, vmf, ext4_get_block);
	if (create) {
		if (!ret &&
		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
			ret = ext4_jbd2_file_inode(handle, inode);
		ext4_journal_stop(handle);
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_alloc;
	}
out:
	if (create)
		sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}

This will be much faster than dances with unwritten extents. It seems more
complex but as a bonus you get proper retry logic on ENOSPC (ext4 can see
temporary ENOSPC errors due to pending transaction commit) and also you
won't rely on _ext4_get_block() to start a transaction for you - looking at
that current code has a lock ordering violation (at least for the case
where the page exists) because page lock ranks under transaction start but
you end up calling ext4_get_block() with create == 1 and without
transaction started under page lock (we don't have lockdep instrumentation
for page locks so these problems show up only as deadlocks under load).

You may notice that for this to work, I need do_dax_fault() exported
(that's due to lock ordering of sb_start_pagefault() which ranks above
transaction start). I also need do_dax_fault() to return standard errno
(similarly to __block_page_mkwrite()) which the caller then converts to
fault handler return value via block_page_mkwrite_return() (or you could
create your own dax_fault_return()).

> > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > to a written extent after we've initialised the contents.  We use the
> > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > softirq context.
> > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > point me please?
> 
> For faults, we call it in dax_insert_mapping(), the very last thing
> before returning in the fault path.  The normal I/O path gets to use
> the dio_iodone_t for the same purpose.
  I see. I didn't think of races with reads (hum, I actually wonder whether
we don't have this data exposure problem for ext4 for mmapped write into
a hole vs direct read as well). So I guess we do need those unwritten
extent dances after all (or we would need to have a page covering hole when
writing to it via mmap but I guess unwritten extent dances are somewhat
more standard).

So in that case you need ext4_get_block_write() in the above call to
do_dax_fault() (note that we still do need ext4 version of the fault
function like above due to lock ranking and retry on ENOSPC). And please
comment about the read races at that call site so that we have that
subtlety documented somewhere.

> > > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > > trying to get away from passing phony bh around and this would entangle us
> > > even more into that mess). Normally I would think that end_io() callback
> > > passed into dax_do_io() should perform necessary conversions and for
> > > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> 
> Dave sees to be the one trying the hardest to get rid of the phony BHs
> ... and it was his idea to (ab)use b_end_io for this.  The problem with
> doing the conversion in ext4_page_mkwrite() is that we don't know at
> that point whether the BH is unwritten or not.
  I see, thanks for explanation. So it would be enough to pass a bit of
information (unwritten / written) to a caller of do_dax_fault() and that
can then call conversion function. IMO doing that (either in a return value
or in a separate argument of do_dax_fault()) would be cleaner and more
readable than playing with b_private and b_end_io... Thoughts?

								Honza 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-18 10:40             ` Jan Kara
  (?)
@ 2015-02-18 21:55             ` Dave Chinner
  2015-02-18 21:59                 ` hch
  2015-02-19 15:42               ` Jan Kara
  -1 siblings, 2 replies; 25+ messages in thread
From: Dave Chinner @ 2015-02-18 21:55 UTC (permalink / raw)
  To: Jan Kara
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Matthew Wilcox,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > to a written extent after we've initialised the contents.  We use the
> > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > softirq context.
> > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > point me please?
> > 
> > For faults, we call it in dax_insert_mapping(), the very last thing
> > before returning in the fault path.  The normal I/O path gets to use
> > the dio_iodone_t for the same purpose.
>   I see. I didn't think of races with reads (hum, I actually wonder whether
> we don't have this data exposure problem for ext4 for mmapped write into
> a hole vs direct read as well). So I guess we do need those unwritten
> extent dances after all (or we would need to have a page covering hole when
> writing to it via mmap but I guess unwritten extent dances are somewhat
> more standard).

Right, that was the reason for doing it that way - it leveraged all
the existing methods we have for avoiding data exposure races in
XFS. but it's also not just for races - it's for ensuring that if we
crash between the allocation and the write to the persistent store
we don't expose the underlying contents when the system next comes
up.

These problems were found long ago on XFS and that's one of the
reasons why all direct IO block allocation uses unwritten extents -
until the data is on disk there is a window for stale data exposure
and things like crashing systems running high throughput IO are
very good at exposing such small windows of exposure. Hence I
thought it best to have DAX close that hole for everyone.

> So in that case you need ext4_get_block_write() in the above call to
> do_dax_fault() (note that we still do need ext4 version of the fault
> function like above due to lock ranking and retry on ENOSPC). And please
> comment about the read races at that call site so that we have that
> subtlety documented somewhere.

Actually, I thought it was obvious that  ;)

> 
> > > > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > > > trying to get away from passing phony bh around and this would entangle us
> > > > even more into that mess). Normally I would think that end_io() callback
> > > > passed into dax_do_io() should perform necessary conversions and for
> > > > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> > 
> > Dave sees to be the one trying the hardest to get rid of the phony BHs
> > ... and it was his idea to (ab)use b_end_io for this.  The problem with
> > doing the conversion in ext4_page_mkwrite() is that we don't know at
> > that point whether the BH is unwritten or not.
>   I see, thanks for explanation. So it would be enough to pass a bit of
> information (unwritten / written) to a caller of do_dax_fault() and that
> can then call conversion function. IMO doing that (either in a return value
> or in a separate argument of do_dax_fault()) would be cleaner and more
> readable than playing with b_private and b_end_io... Thoughts?

I'm happy for a better mechanism to be thought up. The current one
was expedient, but not pretty. The need for the end_io function was
because unwritten conversion needed to happen after the page was
zeroed. If we change dax_fault() to also take a "end_io" function
callback (already takes a get_blocks callback), then we can avoid
the need to use the phony bh for this purpose. i.e filesystems that
allocate unwritten extents can pass a completion function

I've got to update the XFS DAX patches I have for the next merge
cycle, so I'll take another at it when I page that information back
into my brain.

And speaking of phony BHs, the pnfs block layout changes introduce
an struct iomap and a "map_blocks" method to the export_ops in
exportfs.h. This is the model what we should be using instead of
phony BHs for block mapping/allocation operations...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-18 21:55             ` Dave Chinner
@ 2015-02-18 21:59                 ` hch
  2015-02-19 15:42               ` Jan Kara
  1 sibling, 0 replies; 25+ messages in thread
From: hch @ 2015-02-18 21:59 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Matthew Wilcox, Wilcox, Matthew R, ross.zwisler, akpm,
	Dilger, Andreas, axboe, boaz, hch, kirill.shutemov,
	mathieu.desnoyers, rdunlap, tytso, mm-commits, linux-ext4, xfs

On Thu, Feb 19, 2015 at 08:55:23AM +1100, Dave Chinner wrote:
> And speaking of phony BHs, the pnfs block layout changes introduce
> an struct iomap and a "map_blocks" method to the export_ops in
> exportfs.h. This is the model what we should be using instead of
> phony BHs for block mapping/allocation operations...

I've got some WIP patches to move the direct I/O code over to that,
DAX would be another natural fit.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-18 21:59                 ` hch
  0 siblings, 0 replies; 25+ messages in thread
From: hch @ 2015-02-18 21:59 UTC (permalink / raw)
  To: Dave Chinner
  Cc: axboe, boaz, xfs, Jan Kara, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Dilger,
	Andreas, Matthew Wilcox, ross.zwisler, linux-ext4, akpm,
	kirill.shutemov

On Thu, Feb 19, 2015 at 08:55:23AM +1100, Dave Chinner wrote:
> And speaking of phony BHs, the pnfs block layout changes introduce
> an struct iomap and a "map_blocks" method to the export_ops in
> exportfs.h. This is the model what we should be using instead of
> phony BHs for block mapping/allocation operations...

I've got some WIP patches to move the direct I/O code over to that,
DAX would be another natural fit.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-18 21:55             ` Dave Chinner
  2015-02-18 21:59                 ` hch
@ 2015-02-19 15:42               ` Jan Kara
  2015-02-19 21:12                   ` Dave Chinner
  1 sibling, 1 reply; 25+ messages in thread
From: Jan Kara @ 2015-02-19 15:42 UTC (permalink / raw)
  To: Dave Chinner
  Cc: axboe, boaz, xfs, Jan Kara, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Dilger,
	Andreas, Matthew Wilcox, ross.zwisler, linux-ext4, akpm,
	kirill.shutemov

On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > softirq context.
> > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > point me please?
> > > 
> > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > before returning in the fault path.  The normal I/O path gets to use
> > > the dio_iodone_t for the same purpose.
> >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > we don't have this data exposure problem for ext4 for mmapped write into
> > a hole vs direct read as well). So I guess we do need those unwritten
> > extent dances after all (or we would need to have a page covering hole when
> > writing to it via mmap but I guess unwritten extent dances are somewhat
> > more standard).
> 
> Right, that was the reason for doing it that way - it leveraged all
> the existing methods we have for avoiding data exposure races in
> XFS. but it's also not just for races - it's for ensuring that if we
> crash between the allocation and the write to the persistent store
> we don't expose the underlying contents when the system next comes
> up.
  Well, ext3/4 handles the crash situation differently - we make sure we
flush data to allocated blocks before committing a transaction that
allocates them. That works perfectly for crashes but doesn't avoid the
race with DIO.

> > > > > Also abusing b_end_io of a phony buffer for that looks ugly to me (we are
> > > > > trying to get away from passing phony bh around and this would entangle us
> > > > > even more into that mess). Normally I would think that end_io() callback
> > > > > passed into dax_do_io() should perform necessary conversions and for
> > > > > dax_fault() we could do necessary conversions inside foofs_page_mkwrite()...
> > > 
> > > Dave sees to be the one trying the hardest to get rid of the phony BHs
> > > ... and it was his idea to (ab)use b_end_io for this.  The problem with
> > > doing the conversion in ext4_page_mkwrite() is that we don't know at
> > > that point whether the BH is unwritten or not.
> >   I see, thanks for explanation. So it would be enough to pass a bit of
> > information (unwritten / written) to a caller of do_dax_fault() and that
> > can then call conversion function. IMO doing that (either in a return value
> > or in a separate argument of do_dax_fault()) would be cleaner and more
> > readable than playing with b_private and b_end_io... Thoughts?
> 
> I'm happy for a better mechanism to be thought up. The current one
> was expedient, but not pretty. The need for the end_io function was
> because unwritten conversion needed to happen after the page was
> zeroed. If we change dax_fault() to also take a "end_io" function
> callback (already takes a get_blocks callback), then we can avoid
> the need to use the phony bh for this purpose. i.e filesystems that
> allocate unwritten extents can pass a completion function
  Yeah, that's probably even better.

> And speaking of phony BHs, the pnfs block layout changes introduce
> an struct iomap and a "map_blocks" method to the export_ops in
> exportfs.h. This is the model what we should be using instead of
> phony BHs for block mapping/allocation operations...
  Yup, that'd be nice.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-19 15:42               ` Jan Kara
@ 2015-02-19 21:12                   ` Dave Chinner
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Chinner @ 2015-02-19 21:12 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, Wilcox, Matthew R, ross.zwisler, akpm, Dilger,
	Andreas, axboe, boaz, hch, kirill.shutemov, mathieu.desnoyers,
	rdunlap, tytso, mm-commits, linux-ext4, xfs

On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > softirq context.
> > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > point me please?
> > > > 
> > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > before returning in the fault path.  The normal I/O path gets to use
> > > > the dio_iodone_t for the same purpose.
> > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > we don't have this data exposure problem for ext4 for mmapped write into
> > > a hole vs direct read as well). So I guess we do need those unwritten
> > > extent dances after all (or we would need to have a page covering hole when
> > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > more standard).
> > 
> > Right, that was the reason for doing it that way - it leveraged all
> > the existing methods we have for avoiding data exposure races in
> > XFS. but it's also not just for races - it's for ensuring that if we
> > crash between the allocation and the write to the persistent store
> > we don't expose the underlying contents when the system next comes
> > up.
>   Well, ext3/4 handles the crash situation differently - we make sure we
> flush data to allocated blocks before committing a transaction that
> allocates them. That works perfectly for crashes but doesn't avoid the
> race with DIO.

I was talking about direct IO, not buffered IO. DAX is modeled on
the direct IO stack, not buffered IO. I did go and look at the ext4
IO completion path, and I can see where ext4_end_io_dio() triggers a
commit outside of doing unwritten extent conversion. Can you clue me
in - IO completion in ext4 is a maze of twisty passages...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-19 21:12                   ` Dave Chinner
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Chinner @ 2015-02-19 21:12 UTC (permalink / raw)
  To: Jan Kara
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Matthew Wilcox,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > softirq context.
> > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > point me please?
> > > > 
> > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > before returning in the fault path.  The normal I/O path gets to use
> > > > the dio_iodone_t for the same purpose.
> > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > we don't have this data exposure problem for ext4 for mmapped write into
> > > a hole vs direct read as well). So I guess we do need those unwritten
> > > extent dances after all (or we would need to have a page covering hole when
> > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > more standard).
> > 
> > Right, that was the reason for doing it that way - it leveraged all
> > the existing methods we have for avoiding data exposure races in
> > XFS. but it's also not just for races - it's for ensuring that if we
> > crash between the allocation and the write to the persistent store
> > we don't expose the underlying contents when the system next comes
> > up.
>   Well, ext3/4 handles the crash situation differently - we make sure we
> flush data to allocated blocks before committing a transaction that
> allocates them. That works perfectly for crashes but doesn't avoid the
> race with DIO.

I was talking about direct IO, not buffered IO. DAX is modeled on
the direct IO stack, not buffered IO. I did go and look at the ext4
IO completion path, and I can see where ext4_end_io_dio() triggers a
commit outside of doing unwritten extent conversion. Can you clue me
in - IO completion in ext4 is a maze of twisty passages...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-19 21:12                   ` Dave Chinner
@ 2015-02-19 23:08                     ` Dave Chinner
  -1 siblings, 0 replies; 25+ messages in thread
From: Dave Chinner @ 2015-02-19 23:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, Wilcox, Matthew R, ross.zwisler, akpm, Dilger,
	Andreas, axboe, boaz, hch, kirill.shutemov, mathieu.desnoyers,
	rdunlap, tytso, mm-commits, linux-ext4, xfs

On Fri, Feb 20, 2015 at 08:12:10AM +1100, Dave Chinner wrote:
> On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> > On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > > softirq context.
> > > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > > point me please?
> > > > > 
> > > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > > before returning in the fault path.  The normal I/O path gets to use
> > > > > the dio_iodone_t for the same purpose.
> > > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > > we don't have this data exposure problem for ext4 for mmapped write into
> > > > a hole vs direct read as well). So I guess we do need those unwritten
> > > > extent dances after all (or we would need to have a page covering hole when
> > > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > > more standard).
> > > 
> > > Right, that was the reason for doing it that way - it leveraged all
> > > the existing methods we have for avoiding data exposure races in
> > > XFS. but it's also not just for races - it's for ensuring that if we
> > > crash between the allocation and the write to the persistent store
> > > we don't expose the underlying contents when the system next comes
> > > up.
> >   Well, ext3/4 handles the crash situation differently - we make sure we
> > flush data to allocated blocks before committing a transaction that
> > allocates them. That works perfectly for crashes but doesn't avoid the
> > race with DIO.
> 
> I was talking about direct IO, not buffered IO. DAX is modeled on
> the direct IO stack, not buffered IO. I did go and look at the ext4
> IO completion path, and I can see where ext4_end_io_dio() triggers a
                            ^^^^^^^
                            can't see.

Stupid fingers can type what I'm thinking. :/

> commit outside of doing unwritten extent conversion. Can you clue me
> in - IO completion in ext4 is a maze of twisty passages...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-19 23:08                     ` Dave Chinner
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Chinner @ 2015-02-19 23:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Matthew Wilcox,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Fri, Feb 20, 2015 at 08:12:10AM +1100, Dave Chinner wrote:
> On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> > On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > > softirq context.
> > > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > > point me please?
> > > > > 
> > > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > > before returning in the fault path.  The normal I/O path gets to use
> > > > > the dio_iodone_t for the same purpose.
> > > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > > we don't have this data exposure problem for ext4 for mmapped write into
> > > > a hole vs direct read as well). So I guess we do need those unwritten
> > > > extent dances after all (or we would need to have a page covering hole when
> > > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > > more standard).
> > > 
> > > Right, that was the reason for doing it that way - it leveraged all
> > > the existing methods we have for avoiding data exposure races in
> > > XFS. but it's also not just for races - it's for ensuring that if we
> > > crash between the allocation and the write to the persistent store
> > > we don't expose the underlying contents when the system next comes
> > > up.
> >   Well, ext3/4 handles the crash situation differently - we make sure we
> > flush data to allocated blocks before committing a transaction that
> > allocates them. That works perfectly for crashes but doesn't avoid the
> > race with DIO.
> 
> I was talking about direct IO, not buffered IO. DAX is modeled on
> the direct IO stack, not buffered IO. I did go and look at the ext4
> IO completion path, and I can see where ext4_end_io_dio() triggers a
                            ^^^^^^^
                            can't see.

Stupid fingers can type what I'm thinking. :/

> commit outside of doing unwritten extent conversion. Can you clue me
> in - IO completion in ext4 is a maze of twisty passages...
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-19 21:12                   ` Dave Chinner
@ 2015-02-20 12:05                     ` Jan Kara
  -1 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-20 12:05 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Jan Kara, Matthew Wilcox, Wilcox, Matthew R, ross.zwisler, akpm,
	Dilger, Andreas, axboe, boaz, hch, kirill.shutemov,
	mathieu.desnoyers, rdunlap, tytso, mm-commits, linux-ext4, xfs

On Fri 20-02-15 08:12:10, Dave Chinner wrote:
> On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> > On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > > softirq context.
> > > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > > point me please?
> > > > > 
> > > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > > before returning in the fault path.  The normal I/O path gets to use
> > > > > the dio_iodone_t for the same purpose.
> > > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > > we don't have this data exposure problem for ext4 for mmapped write into
> > > > a hole vs direct read as well). So I guess we do need those unwritten
> > > > extent dances after all (or we would need to have a page covering hole when
> > > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > > more standard).
> > > 
> > > Right, that was the reason for doing it that way - it leveraged all
> > > the existing methods we have for avoiding data exposure races in
> > > XFS. but it's also not just for races - it's for ensuring that if we
> > > crash between the allocation and the write to the persistent store
> > > we don't expose the underlying contents when the system next comes
> > > up.
> >   Well, ext3/4 handles the crash situation differently - we make sure we
> > flush data to allocated blocks before committing a transaction that
> > allocates them. That works perfectly for crashes but doesn't avoid the
> > race with DIO.
> 
> I was talking about direct IO, not buffered IO. DAX is modeled on
  Ah, OK. For DIO writes ext4 uses unwritten extents as well. But the race
I was talking about is between mmap allocating write (i.e. going through
page cache) and DIO read of the same location.

> the direct IO stack, not buffered IO. I did go and look at the ext4
> IO completion path, and I can see where ext4_end_io_dio() triggers a
> commit outside of doing unwritten extent conversion. Can you clue me
> in - IO completion in ext4 is a maze of twisty passages...
  I don't quite follow you. Why should ext4_end_io_dio() trigger a commit?

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-20 12:05                     ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-20 12:05 UTC (permalink / raw)
  To: Dave Chinner
  Cc: axboe, boaz, xfs, Jan Kara, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Dilger,
	Andreas, Matthew Wilcox, ross.zwisler, linux-ext4, akpm,
	kirill.shutemov

On Fri 20-02-15 08:12:10, Dave Chinner wrote:
> On Thu, Feb 19, 2015 at 04:42:41PM +0100, Jan Kara wrote:
> > On Thu 19-02-15 08:55:23, Dave Chinner wrote:
> > > On Wed, Feb 18, 2015 at 11:40:09AM +0100, Jan Kara wrote:
> > > > On Tue 17-02-15 08:37:45, Matthew Wilcox wrote:
> > > > > On Tue, Feb 17, 2015 at 09:52:00AM +0100, Jan Kara wrote:
> > > > > > > > This got added to fix a problem that Dave Chinner pointed out.  We need
> > > > > > > > the allocated extent to either be zeroed (as ext2 does), or marked as
> > > > > > > > unwritten (ext4, XFS) so that a racing read/page fault doesn't return
> > > > > > > > uninitialized data.  If it's marked as unwritten, we need to convert it
> > > > > > > > to a written extent after we've initialised the contents.  We use the
> > > > > > > > b_end_io() callback to do this, and it's called from the DAX code, not in
> > > > > > > > softirq context.
> > > > > > >   OK, I see. But I didn't find where ->b_end_io gets called from dax code
> > > > > > > (specifically I don't see it anywhere in dax_do_IO() or dax_io()). Can you
> > > > > > > point me please?
> > > > > 
> > > > > For faults, we call it in dax_insert_mapping(), the very last thing
> > > > > before returning in the fault path.  The normal I/O path gets to use
> > > > > the dio_iodone_t for the same purpose.
> > > >   I see. I didn't think of races with reads (hum, I actually wonder whether
> > > > we don't have this data exposure problem for ext4 for mmapped write into
> > > > a hole vs direct read as well). So I guess we do need those unwritten
> > > > extent dances after all (or we would need to have a page covering hole when
> > > > writing to it via mmap but I guess unwritten extent dances are somewhat
> > > > more standard).
> > > 
> > > Right, that was the reason for doing it that way - it leveraged all
> > > the existing methods we have for avoiding data exposure races in
> > > XFS. but it's also not just for races - it's for ensuring that if we
> > > crash between the allocation and the write to the persistent store
> > > we don't expose the underlying contents when the system next comes
> > > up.
> >   Well, ext3/4 handles the crash situation differently - we make sure we
> > flush data to allocated blocks before committing a transaction that
> > allocates them. That works perfectly for crashes but doesn't avoid the
> > race with DIO.
> 
> I was talking about direct IO, not buffered IO. DAX is modeled on
  Ah, OK. For DIO writes ext4 uses unwritten extents as well. But the race
I was talking about is between mmap allocating write (i.e. going through
page cache) and DIO read of the same location.

> the direct IO stack, not buffered IO. I did go and look at the ext4
> IO completion path, and I can see where ext4_end_io_dio() triggers a
> commit outside of doing unwritten extent conversion. Can you clue me
> in - IO completion in ext4 is a maze of twisty passages...
  I don't quite follow you. Why should ext4_end_io_dio() trigger a commit?

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-18 10:40             ` Jan Kara
@ 2015-02-20 22:15               ` Matthew Wilcox
  -1 siblings, 0 replies; 25+ messages in thread
From: Matthew Wilcox @ 2015-02-20 22:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, Wilcox, Matthew R, ross.zwisler, akpm, Dilger,
	Andreas, axboe, boaz, david, hch, kirill.shutemov,
	mathieu.desnoyers, rdunlap, tytso, mm-commits, linux-ext4, xfs

> So to handle this it can start transaction in ext4_dax_fault() /
> ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> after dax_fault() / dax_mkwrite() returns. Complete function will look
> something like follows:

How about this?  I tried to encompass both the unwritten extent conversion
as well as starting the journal at the right point in the locking hierarchy.

If we're going to expose do_dax_fault(), I think it needs to be called
__dax_fault().

I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
__dax_fault(), rather than convert it to return an errno.

P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
were any files that weren't already being rebuilt.

diff --git a/fs/dax.c b/fs/dax.c
index 556238f..81dbdaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block)
 {
 	struct file *file = vma->vm_file;
@@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	sector_t block;
 	pgoff_t size;
 	int error;
-	int major = 0;
+	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		error = -EIO;		/* fs corruption? */
 	if (error)
 		goto unlock_page;
+	if (buffer_unwritten(&bh))
+		ret |= VM_FAULT_UNWRITTEN;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
 			error = get_block(inode, block, &bh, 1);
 			count_vm_event(PGMAJFAULT);
 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
+			ret = VM_FAULT_MAJOR;
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
@@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 
 	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
+	if (!page && (ret & VM_FAULT_MAJOR))
 		page = find_lock_page(mapping, vmf->pgoff);
 
 	if (page) {
@@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
 
  out:
+	if (error == -ENOSPC)
+		return VM_FAULT_RETRY | ret;
 	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
+		return VM_FAULT_OOM | ret;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if ((error < 0) && (error != -EBUSY))
-		return VM_FAULT_SIGBUS | major;
-	return VM_FAULT_NOPAGE | major;
+		return VM_FAULT_SIGBUS | ret;
+	return VM_FAULT_NOPAGE | ret;
 
  unlock_page:
 	if (page) {
@@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL_GPL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4340e38..84b4f1c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -194,7 +194,58 @@ errout:
 #ifdef CONFIG_FS_DAX
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block_write);
+	handle_t *handle;
+	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+	struct inode *inode = file_inode(vma->vm_file);
+	int ret, err = 0;
+	int retries = 0;
+
+	if (create) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+ retry_alloc:
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+					ext4_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err;
+		}
+	}
+
+	ret = __dax_fault(vma, vmf, ext4_get_block);
+
+	if (create) {
+		if (ret & VM_FAULT_UNWRITTEN) {
+			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
+			err = ext4_convert_unwritten_extents(NULL, inode,
+							offset, PAGE_SIZE);
+			ret &= ~VM_FAULT_UNWRITTEN;
+		}
+		if (!err &&
+		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+
+		if (err == -ENOSPC) {
+			ret |= VM_FAULT_RETRY;
+			err = 0;
+		}
+
+		ext4_journal_stop(handle);
+		if (err < 0)
+			goto err;
+		if ((ret & VM_FAULT_RETRY) &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_alloc;
+		ret &= ~VM_FAULT_RETRY;
+	}
+
+ out:
+	if (create)
+		sb_end_pagefault(inode->i_sb);
+	return ret;
+ err:
+	ret = block_page_mkwrite_return(err);
+	goto out;
 }
 
 static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..8f1ea7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,18 +657,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
-			bh->b_assoc_map = inode->i_mapping;
-			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
-		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 239c89c..2af5050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 					unsigned int flags, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ceb50ec..ffc9947 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 #define VM_FAULT_SIGSEGV 0x0040

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-20 22:15               ` Matthew Wilcox
  0 siblings, 0 replies; 25+ messages in thread
From: Matthew Wilcox @ 2015-02-20 22:15 UTC (permalink / raw)
  To: Jan Kara
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Matthew Wilcox,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

> So to handle this it can start transaction in ext4_dax_fault() /
> ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> after dax_fault() / dax_mkwrite() returns. Complete function will look
> something like follows:

How about this?  I tried to encompass both the unwritten extent conversion
as well as starting the journal at the right point in the locking hierarchy.

If we're going to expose do_dax_fault(), I think it needs to be called
__dax_fault().

I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
__dax_fault(), rather than convert it to return an errno.

P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
were any files that weren't already being rebuilt.

diff --git a/fs/dax.c b/fs/dax.c
index 556238f..81dbdaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block)
 {
 	struct file *file = vma->vm_file;
@@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	sector_t block;
 	pgoff_t size;
 	int error;
-	int major = 0;
+	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		error = -EIO;		/* fs corruption? */
 	if (error)
 		goto unlock_page;
+	if (buffer_unwritten(&bh))
+		ret |= VM_FAULT_UNWRITTEN;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
 			error = get_block(inode, block, &bh, 1);
 			count_vm_event(PGMAJFAULT);
 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
+			ret = VM_FAULT_MAJOR;
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
@@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 
 	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
+	if (!page && (ret & VM_FAULT_MAJOR))
 		page = find_lock_page(mapping, vmf->pgoff);
 
 	if (page) {
@@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
 
  out:
+	if (error == -ENOSPC)
+		return VM_FAULT_RETRY | ret;
 	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
+		return VM_FAULT_OOM | ret;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if ((error < 0) && (error != -EBUSY))
-		return VM_FAULT_SIGBUS | major;
-	return VM_FAULT_NOPAGE | major;
+		return VM_FAULT_SIGBUS | ret;
+	return VM_FAULT_NOPAGE | ret;
 
  unlock_page:
 	if (page) {
@@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL_GPL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4340e38..84b4f1c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -194,7 +194,58 @@ errout:
 #ifdef CONFIG_FS_DAX
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block_write);
+	handle_t *handle;
+	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+	struct inode *inode = file_inode(vma->vm_file);
+	int ret, err = 0;
+	int retries = 0;
+
+	if (create) {
+		sb_start_pagefault(inode->i_sb);
+		file_update_time(vma->vm_file);
+ retry_alloc:
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+					ext4_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err;
+		}
+	}
+
+	ret = __dax_fault(vma, vmf, ext4_get_block);
+
+	if (create) {
+		if (ret & VM_FAULT_UNWRITTEN) {
+			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
+			err = ext4_convert_unwritten_extents(NULL, inode,
+							offset, PAGE_SIZE);
+			ret &= ~VM_FAULT_UNWRITTEN;
+		}
+		if (!err &&
+		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+
+		if (err == -ENOSPC) {
+			ret |= VM_FAULT_RETRY;
+			err = 0;
+		}
+
+		ext4_journal_stop(handle);
+		if (err < 0)
+			goto err;
+		if ((ret & VM_FAULT_RETRY) &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_alloc;
+		ret &= ~VM_FAULT_RETRY;
+	}
+
+ out:
+	if (create)
+		sb_end_pagefault(inode->i_sb);
+	return ret;
+ err:
+	ret = block_page_mkwrite_return(err);
+	goto out;
 }
 
 static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..8f1ea7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,18 +657,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
-			bh->b_assoc_map = inode->i_mapping;
-			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
-		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 239c89c..2af5050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
 					unsigned int flags, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ceb50ec..ffc9947 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 #define VM_FAULT_SIGSEGV 0x0040
-
+#define VM_FAULT_UNWRITTEN 0x0080	/* Unwritten extent needs conversion */
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
  2015-02-20 22:15               ` Matthew Wilcox
@ 2015-02-23 12:52                 ` Jan Kara
  -1 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-23 12:52 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jan Kara, Wilcox, Matthew R, ross.zwisler, akpm, Dilger, Andreas,
	axboe, boaz, david, hch, kirill.shutemov, mathieu.desnoyers,
	rdunlap, tytso, mm-commits, linux-ext4, xfs

On Fri 20-02-15 17:15:51, Matthew Wilcox wrote:
> > So to handle this it can start transaction in ext4_dax_fault() /
> > ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> > after dax_fault() / dax_mkwrite() returns. Complete function will look
> > something like follows:
> 
> How about this?  I tried to encompass both the unwritten extent conversion
> as well as starting the journal at the right point in the locking hierarchy.
> 
> If we're going to expose do_dax_fault(), I think it needs to be called
> __dax_fault().
> 
> I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
> __dax_fault(), rather than convert it to return an errno.
  I don't like using VM_FAULT_RETRY for ENOSPC. Different filesystems may
want different things on this condition. In particular, if a filesystem
decides to use dax_fault(), VM_FAULT_RETRY will get propagated up into mm
code which just retries the fault (or gets confused if FAULT_FLAG_ALLOW_RETRY
wasn't set).

If you want to stay with VM_FAULT_XXX return values (which makes some sense),
then I guess you need something like VM_FAULT_ENOSPC and convert that to
VM_FAULT_SIGBUS in dax_fault().

Otherwise the patch looks good.

								Honza

> P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
> were any files that weren't already being rebuilt.
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 556238f..81dbdaa 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	return error;
>  }
>  
> -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			get_block_t get_block)
>  {
>  	struct file *file = vma->vm_file;
> @@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	sector_t block;
>  	pgoff_t size;
>  	int error;
> -	int major = 0;
> +	int ret = 0;
>  
>  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		error = -EIO;		/* fs corruption? */
>  	if (error)
>  		goto unlock_page;
> +	if (buffer_unwritten(&bh))
> +		ret |= VM_FAULT_UNWRITTEN;
>  
>  	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
>  		if (vmf->flags & FAULT_FLAG_WRITE) {
>  			error = get_block(inode, block, &bh, 1);
>  			count_vm_event(PGMAJFAULT);
>  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> -			major = VM_FAULT_MAJOR;
> +			ret = VM_FAULT_MAJOR;
>  			if (!error && (bh.b_size < PAGE_SIZE))
>  				error = -EIO;
>  			if (error)
> @@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  
>  	/* Check we didn't race with a read fault installing a new page */
> -	if (!page && major)
> +	if (!page && (ret & VM_FAULT_MAJOR))
>  		page = find_lock_page(mapping, vmf->pgoff);
>  
>  	if (page) {
> @@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	error = dax_insert_mapping(inode, &bh, vma, vmf);
>  
>   out:
> +	if (error == -ENOSPC)
> +		return VM_FAULT_RETRY | ret;
>  	if (error == -ENOMEM)
> -		return VM_FAULT_OOM | major;
> +		return VM_FAULT_OOM | ret;
>  	/* -EBUSY is fine, somebody else faulted on the same PTE */
>  	if ((error < 0) && (error != -EBUSY))
> -		return VM_FAULT_SIGBUS | major;
> -	return VM_FAULT_NOPAGE | major;
> +		return VM_FAULT_SIGBUS | ret;
> +	return VM_FAULT_NOPAGE | ret;
>  
>   unlock_page:
>  	if (page) {
> @@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  	goto out;
>  }
> +EXPORT_SYMBOL_GPL(__dax_fault);
>  
>  /**
>   * dax_fault - handle a page fault on a DAX file
> @@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		sb_start_pagefault(sb);
>  		file_update_time(vma->vm_file);
>  	}
> -	result = do_dax_fault(vma, vmf, get_block);
> +	result = __dax_fault(vma, vmf, get_block);
>  	if (vmf->flags & FAULT_FLAG_WRITE)
>  		sb_end_pagefault(sb);
>  
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 4340e38..84b4f1c 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -194,7 +194,58 @@ errout:
>  #ifdef CONFIG_FS_DAX
>  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_fault(vma, vmf, ext4_get_block_write);
> +	handle_t *handle;
> +	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
> +	struct inode *inode = file_inode(vma->vm_file);
> +	int ret, err = 0;
> +	int retries = 0;
> +
> +	if (create) {
> +		sb_start_pagefault(inode->i_sb);
> +		file_update_time(vma->vm_file);
> + retry_alloc:
> +		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
> +					ext4_writepage_trans_blocks(inode));
> +		if (IS_ERR(handle)) {
> +			err = PTR_ERR(handle);
> +			goto err;
> +		}
> +	}
> +
> +	ret = __dax_fault(vma, vmf, ext4_get_block);
> +
> +	if (create) {
> +		if (ret & VM_FAULT_UNWRITTEN) {
> +			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
> +			err = ext4_convert_unwritten_extents(NULL, inode,
> +							offset, PAGE_SIZE);
> +			ret &= ~VM_FAULT_UNWRITTEN;
> +		}
> +		if (!err &&
> +		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
> +			err = ext4_jbd2_file_inode(handle, inode);
> +
> +		if (err == -ENOSPC) {
> +			ret |= VM_FAULT_RETRY;
> +			err = 0;
> +		}
> +
> +		ext4_journal_stop(handle);
> +		if (err < 0)
> +			goto err;
> +		if ((ret & VM_FAULT_RETRY) &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry_alloc;
> +		ret &= ~VM_FAULT_RETRY;
> +	}
> +
> + out:
> +	if (create)
> +		sb_end_pagefault(inode->i_sb);
> +	return ret;
> + err:
> +	ret = block_page_mkwrite_return(err);
> +	goto out;
>  }
>  
>  static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 85404f1..8f1ea7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -657,18 +657,6 @@ has_zeroout:
>  	return retval;
>  }
>  
> -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> -{
> -	struct inode *inode = bh->b_assoc_map->host;
> -	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> -	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> -	int err;
> -	if (!uptodate)
> -		return;
> -	WARN_ON(!buffer_unwritten(bh));
> -	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> -}
> -
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> -		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> -			bh->b_assoc_map = inode->i_mapping;
> -			bh->b_private = (void *)(unsigned long)iblock;
> -			bh->b_end_io = ext4_end_io_unwritten;
> -		}
>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 239c89c..2af5050 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
>  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
>  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
>  int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
> +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
>  int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
>  					unsigned int flags, get_block_t);
>  #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ceb50ec..ffc9947 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
>  #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
>  #define VM_FAULT_SIGSEGV 0x0040
> -
> +#define VM_FAULT_UNWRITTEN 0x0080	/* Unwritten extent needs conversion */
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
>  #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: + ext4-add-dax-functionality.patch added to -mm tree
@ 2015-02-23 12:52                 ` Jan Kara
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Kara @ 2015-02-23 12:52 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: axboe, boaz, xfs, Dilger, Andreas, rdunlap, tytso, hch,
	mathieu.desnoyers, Wilcox, Matthew R, mm-commits, Jan Kara,
	ross.zwisler, linux-ext4, akpm, kirill.shutemov

On Fri 20-02-15 17:15:51, Matthew Wilcox wrote:
> > So to handle this it can start transaction in ext4_dax_fault() /
> > ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> > after dax_fault() / dax_mkwrite() returns. Complete function will look
> > something like follows:
> 
> How about this?  I tried to encompass both the unwritten extent conversion
> as well as starting the journal at the right point in the locking hierarchy.
> 
> If we're going to expose do_dax_fault(), I think it needs to be called
> __dax_fault().
> 
> I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
> __dax_fault(), rather than convert it to return an errno.
  I don't like using VM_FAULT_RETRY for ENOSPC. Different filesystems may
want different things on this condition. In particular, if a filesystem
decides to use dax_fault(), VM_FAULT_RETRY will get propagated up into mm
code which just retries the fault (or gets confused if FAULT_FLAG_ALLOW_RETRY
wasn't set).

If you want to stay with VM_FAULT_XXX return values (which makes some sense),
then I guess you need something like VM_FAULT_ENOSPC and convert that to
VM_FAULT_SIGBUS in dax_fault().

Otherwise the patch looks good.

								Honza

> P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
> were any files that weren't already being rebuilt.
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 556238f..81dbdaa 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
>  	return error;
>  }
>  
> -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  			get_block_t get_block)
>  {
>  	struct file *file = vma->vm_file;
> @@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	sector_t block;
>  	pgoff_t size;
>  	int error;
> -	int major = 0;
> +	int ret = 0;
>  
>  	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		error = -EIO;		/* fs corruption? */
>  	if (error)
>  		goto unlock_page;
> +	if (buffer_unwritten(&bh))
> +		ret |= VM_FAULT_UNWRITTEN;
>  
>  	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
>  		if (vmf->flags & FAULT_FLAG_WRITE) {
>  			error = get_block(inode, block, &bh, 1);
>  			count_vm_event(PGMAJFAULT);
>  			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> -			major = VM_FAULT_MAJOR;
> +			ret = VM_FAULT_MAJOR;
>  			if (!error && (bh.b_size < PAGE_SIZE))
>  				error = -EIO;
>  			if (error)
> @@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  
>  	/* Check we didn't race with a read fault installing a new page */
> -	if (!page && major)
> +	if (!page && (ret & VM_FAULT_MAJOR))
>  		page = find_lock_page(mapping, vmf->pgoff);
>  
>  	if (page) {
> @@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	error = dax_insert_mapping(inode, &bh, vma, vmf);
>  
>   out:
> +	if (error == -ENOSPC)
> +		return VM_FAULT_RETRY | ret;
>  	if (error == -ENOMEM)
> -		return VM_FAULT_OOM | major;
> +		return VM_FAULT_OOM | ret;
>  	/* -EBUSY is fine, somebody else faulted on the same PTE */
>  	if ((error < 0) && (error != -EBUSY))
> -		return VM_FAULT_SIGBUS | major;
> -	return VM_FAULT_NOPAGE | major;
> +		return VM_FAULT_SIGBUS | ret;
> +	return VM_FAULT_NOPAGE | ret;
>  
>   unlock_page:
>  	if (page) {
> @@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	}
>  	goto out;
>  }
> +EXPORT_SYMBOL_GPL(__dax_fault);
>  
>  /**
>   * dax_fault - handle a page fault on a DAX file
> @@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		sb_start_pagefault(sb);
>  		file_update_time(vma->vm_file);
>  	}
> -	result = do_dax_fault(vma, vmf, get_block);
> +	result = __dax_fault(vma, vmf, get_block);
>  	if (vmf->flags & FAULT_FLAG_WRITE)
>  		sb_end_pagefault(sb);
>  
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 4340e38..84b4f1c 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -194,7 +194,58 @@ errout:
>  #ifdef CONFIG_FS_DAX
>  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -	return dax_fault(vma, vmf, ext4_get_block_write);
> +	handle_t *handle;
> +	int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
> +	struct inode *inode = file_inode(vma->vm_file);
> +	int ret, err = 0;
> +	int retries = 0;
> +
> +	if (create) {
> +		sb_start_pagefault(inode->i_sb);
> +		file_update_time(vma->vm_file);
> + retry_alloc:
> +		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
> +					ext4_writepage_trans_blocks(inode));
> +		if (IS_ERR(handle)) {
> +			err = PTR_ERR(handle);
> +			goto err;
> +		}
> +	}
> +
> +	ret = __dax_fault(vma, vmf, ext4_get_block);
> +
> +	if (create) {
> +		if (ret & VM_FAULT_UNWRITTEN) {
> +			loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
> +			err = ext4_convert_unwritten_extents(NULL, inode,
> +							offset, PAGE_SIZE);
> +			ret &= ~VM_FAULT_UNWRITTEN;
> +		}
> +		if (!err &&
> +		    ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
> +			err = ext4_jbd2_file_inode(handle, inode);
> +
> +		if (err == -ENOSPC) {
> +			ret |= VM_FAULT_RETRY;
> +			err = 0;
> +		}
> +
> +		ext4_journal_stop(handle);
> +		if (err < 0)
> +			goto err;
> +		if ((ret & VM_FAULT_RETRY) &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry_alloc;
> +		ret &= ~VM_FAULT_RETRY;
> +	}
> +
> + out:
> +	if (create)
> +		sb_end_pagefault(inode->i_sb);
> +	return ret;
> + err:
> +	ret = block_page_mkwrite_return(err);
> +	goto out;
>  }
>  
>  static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 85404f1..8f1ea7d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -657,18 +657,6 @@ has_zeroout:
>  	return retval;
>  }
>  
> -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> -{
> -	struct inode *inode = bh->b_assoc_map->host;
> -	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> -	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> -	int err;
> -	if (!uptodate)
> -		return;
> -	WARN_ON(!buffer_unwritten(bh));
> -	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> -}
> -
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
>  
>  		map_bh(bh, inode->i_sb, map.m_pblk);
>  		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> -		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> -			bh->b_assoc_map = inode->i_mapping;
> -			bh->b_private = (void *)(unsigned long)iblock;
> -			bh->b_end_io = ext4_end_io_unwritten;
> -		}
>  		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>  			set_buffer_defer_completion(bh);
>  		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 239c89c..2af5050 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
>  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
>  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
>  int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
> +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
>  int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
>  					unsigned int flags, get_block_t);
>  #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ceb50ec..ffc9947 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
>  #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
>  #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
>  #define VM_FAULT_SIGSEGV 0x0040
> -
> +#define VM_FAULT_UNWRITTEN 0x0080	/* Unwritten extent needs conversion */
>  #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
>  #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
>  #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2015-02-23 12:52 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-12 23:11 + ext4-add-dax-functionality.patch added to -mm tree akpm
2015-01-15 12:41 ` Jan Kara
2015-01-16 21:16   ` Wilcox, Matthew R
2015-01-19 14:18     ` Jan Kara
2015-01-19 14:18       ` Jan Kara
2015-02-17  8:52       ` Jan Kara
2015-02-17  8:52         ` Jan Kara
2015-02-17 13:37         ` Matthew Wilcox
2015-02-17 13:37           ` Matthew Wilcox
2015-02-18 10:40           ` Jan Kara
2015-02-18 10:40             ` Jan Kara
2015-02-18 21:55             ` Dave Chinner
2015-02-18 21:59               ` hch
2015-02-18 21:59                 ` hch
2015-02-19 15:42               ` Jan Kara
2015-02-19 21:12                 ` Dave Chinner
2015-02-19 21:12                   ` Dave Chinner
2015-02-19 23:08                   ` Dave Chinner
2015-02-19 23:08                     ` Dave Chinner
2015-02-20 12:05                   ` Jan Kara
2015-02-20 12:05                     ` Jan Kara
2015-02-20 22:15             ` Matthew Wilcox
2015-02-20 22:15               ` Matthew Wilcox
2015-02-23 12:52               ` Jan Kara
2015-02-23 12:52                 ` Jan Kara

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.