* [PATCH v8 1/1] f2fs: dax: implement direct access
From: sunqiuyang @ 2017-07-20 12:10 UTC
  To: linux-kernel, linux-fsdevel, linux-f2fs-devel; +Cc: jaegeuk, sunqiuyang

From: Qiuyang Sun <sunqiuyang@huawei.com>

This patch implements Direct Access (DAX) in F2FS, including:
 - a mount option to enable or disable DAX
 - read/write and mmap of regular files in the DAX way
 - zero-out of unaligned partial blocks in the DAX way
 - garbage collection of DAX files, by mapping both the old and the new
   physical addresses of a data page into memory and copying data between
   them directly
 - incompatibility of DAX with inline data, atomic or volatile writes,
   collapse_range/insert_range, etc.

Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com>
---
Changelog v7 -> v8:
 - Introduce the macro f2fs_dax_file() to check whether a file is DAX,
   whether or not CONFIG_FS_DAX is set
 - Return -ENOTSUPP when an operation does not support DAX
 - In f2fs_iomap_begin(), convert the inline data of an inode (if any)
   before mapping blocks
 - Minor cleanups
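
For reference, a minimal userspace sketch (not part of this patch) that
exercises the DAX mmap path described above. The mount point /mnt/f2fs and
the test file name are placeholders, and it assumes the filesystem was
mounted with "-o dax" on a DAX-capable device:

/* Assumes an f2fs instance mounted with "-o dax" at /mnt/f2fs (hypothetical
 * path) on a DAX-capable device such as a pmem region. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/f2fs/dax_test", O_CREAT | O_RDWR, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/* With S_DAX set on the inode, this mapping goes straight to the
	 * device, bypassing the page cache. */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (p == MAP_FAILED)
		return 1;

	strcpy(p, "hello dax");
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	close(fd);
	return 0;
}
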
---
 Documentation/filesystems/f2fs.txt |   2 +
 fs/f2fs/data.c                     | 132 +++++++++++++++++++++++++-
 fs/f2fs/f2fs.h                     |  15 +++
 fs/f2fs/file.c                     | 183 ++++++++++++++++++++++++++++++++++++-
 fs/f2fs/gc.c                       | 103 ++++++++++++++++++++-
 fs/f2fs/inline.c                   |   3 +
 fs/f2fs/inode.c                    |   8 +-
 fs/f2fs/namei.c                    |   5 +
 fs/f2fs/super.c                    |  15 +++
 9 files changed, 454 insertions(+), 12 deletions(-)
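
The GC approach in the gc.c hunk below boils down to mapping the old and the
new page-sized regions and copying between them directly. A rough userspace
analogue of just that copy step (purely illustrative; the backing file and
offsets are made up, whereas the real code resolves physical block addresses
through bdev_dax_pgoff()/dax_direct_access() and uses copy_page()):

/* Userspace analogue of the copy step in dax_move_data_page(): map two
 * 4 KiB regions and copy one page of data from the old to the new one. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PAGE_SZ 4096UL

int main(void)
{
	int fd = open("/tmp/backing_file", O_CREAT | O_RDWR, 0644);

	if (fd < 0 || ftruncate(fd, 2 * PAGE_SZ) < 0)
		return 1;

	void *old_blk = mmap(NULL, PAGE_SZ, PROT_READ | PROT_WRITE,
			     MAP_SHARED, fd, 0);
	void *new_blk = mmap(NULL, PAGE_SZ, PROT_READ | PROT_WRITE,
			     MAP_SHARED, fd, PAGE_SZ);

	if (old_blk == MAP_FAILED || new_blk == MAP_FAILED)
		return 1;

	memcpy(new_blk, old_blk, PAGE_SZ);	/* the copy_page() analogue */

	munmap(old_blk, PAGE_SZ);
	munmap(new_blk, PAGE_SZ);
	close(fd);
	return 0;
}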

diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 273ccb2..c86c421 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -164,6 +164,8 @@ io_bits=%u             Set the bit size of write IO requests. It should be set
                        with "mode=lfs".
 usrquota               Enable plain user disk quota accounting.
 grpquota               Enable plain group disk quota accounting.
+dax                    Use direct access (no page cache). See
+                       Documentation/filesystems/dax.txt.
 
 ================================================================================
 DEBUGFS ENTRIES
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 87c1f41..4eb4b76 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 				err = -EIO;
 				goto sync_out;
 			}
+			/*
+			 * If newly allocated blocks are to be zeroed out later,
+			 * a single f2fs_map_blocks must not contain both old
+			 * and new blocks at the same time.
+			 */
+			if (flag == F2FS_GET_BLOCK_ZERO
+					&& (map->m_flags & F2FS_MAP_MAPPED)
+					&& !(map->m_flags & F2FS_MAP_NEW))
+				goto sync_out;
 			if (flag == F2FS_GET_BLOCK_PRE_AIO) {
 				if (blkaddr == NULL_ADDR) {
 					prealloc++;
@@ -938,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 						blkaddr != NEW_ADDR)
 				goto sync_out;
 		}
+	} else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+		goto sync_out;
 	}
 
 	if (flag == F2FS_GET_BLOCK_PRE_AIO)
@@ -996,6 +1007,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 	goto next_dnode;
 
 sync_out:
+	if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+		clean_bdev_aliases(inode->i_sb->s_bdev,
+				map->m_pblk, map->m_len);
+		err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
+				map->m_len, GFP_NOFS);
+	}
 	f2fs_put_dnode(&dn);
 unlock_out:
 	if (create) {
@@ -1808,16 +1825,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	return 0;
 }
 
-static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+static void f2fs_write_failed(struct address_space *mapping, loff_t to,
+								bool lock)
 {
 	struct inode *inode = mapping->host;
 	loff_t i_size = i_size_read(inode);
 
 	if (to > i_size) {
-		down_write(&F2FS_I(inode)->i_mmap_sem);
+		if (lock)
+			down_write(&F2FS_I(inode)->i_mmap_sem);
 		truncate_pagecache(inode, i_size);
 		truncate_blocks(inode, i_size, true);
-		up_write(&F2FS_I(inode)->i_mmap_sem);
+		if (lock)
+			up_write(&F2FS_I(inode)->i_mmap_sem);
 	}
 }
 
@@ -2000,7 +2020,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
 fail:
 	f2fs_put_page(page, 1);
-	f2fs_write_failed(mapping, pos + len);
+	f2fs_write_failed(mapping, pos + len, true);
 	return err;
 }
 
@@ -2077,7 +2097,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		if (err > 0)
 			set_inode_flag(inode, FI_UPDATE_WRITE);
 		else if (err < 0)
-			f2fs_write_failed(mapping, offset + count);
+			f2fs_write_failed(mapping, offset + count, true);
 	}
 
 	trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
@@ -2274,3 +2294,105 @@ int f2fs_migrate_page(struct address_space *mapping,
 	.migratepage    = f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+#include <linux/iomap.h>
+#include <linux/dax.h>
+
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
+	loff_t length, unsigned int flags, struct iomap *iomap)
+{
+	struct block_device *bdev;
+	unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
+	unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
+	struct f2fs_map_blocks map;
+	int ret;
+
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
+
+	map.m_lblk = first_block;
+	map.m_len = last_block - first_block + 1;
+	map.m_next_pgofs = NULL;
+
+	if (!(flags & IOMAP_WRITE)) {
+		ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+	} else {
+	/* i_size should be kept here and changed later in f2fs_iomap_end */
+		loff_t original_i_size = i_size_read(inode);
+
+		ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
+		if (i_size_read(inode) > original_i_size) {
+			f2fs_i_size_write(inode, original_i_size);
+			if (ret)
+				f2fs_write_failed(inode->i_mapping,
+						offset + length,
+						!(flags & IOMAP_FAULT));
+		}
+	}
+
+	if (ret)
+		return ret;
+
+	iomap->flags = 0;
+	bdev = inode->i_sb->s_bdev;
+	iomap->bdev = bdev;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
+	iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
+
+	if (map.m_len == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = F2FS_BLKSIZE;
+	} else {
+		if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+			iomap->type = IOMAP_UNWRITTEN;
+		} else if (map.m_flags & F2FS_MAP_MAPPED) {
+			iomap->type = IOMAP_MAPPED;
+		} else {
+			WARN_ON_ONCE(1);
+			return -EIO;
+		}
+		iomap->blkno =
+			(sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
+		iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
+	}
+
+	if (map.m_flags & F2FS_MAP_NEW)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
+}
+
+static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+	ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+	put_dax(iomap->dax_dev);
+	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+		return 0;
+
+	if (offset + written > i_size_read(inode))
+		f2fs_i_size_write(inode, offset + written);
+
+	if (iomap->offset + iomap->length >
+			ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
+		block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
+		block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
+
+		if (written_blk < end_blk)
+			f2fs_write_failed(inode->i_mapping, offset + length,
+									true);
+	}
+
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+	return 0;
+}
+
+struct iomap_ops f2fs_iomap_ops = {
+	.iomap_begin	= f2fs_iomap_begin,
+	.iomap_end	= f2fs_iomap_end,
+};
+#endif
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 70777a8..b6d629a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -91,6 +91,11 @@ struct f2fs_fault_info {
 #define F2FS_MOUNT_LFS			0x00040000
 #define F2FS_MOUNT_USRQUOTA		0x00080000
 #define F2FS_MOUNT_GRPQUOTA		0x00100000
+#ifdef CONFIG_FS_DAX
+#define F2FS_MOUNT_DAX			0x00400000 /* Direct Access */
+#else
+#define F2FS_MOUNT_DAX			0
+#endif
 
 #define clear_opt(sbi, option)	((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -482,6 +487,7 @@ struct f2fs_map_blocks {
 #define F2FS_GET_BLOCK_BMAP		3
 #define F2FS_GET_BLOCK_PRE_DIO		4
 #define F2FS_GET_BLOCK_PRE_AIO		5
+#define F2FS_GET_BLOCK_ZERO		6
 
 /*
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
@@ -506,6 +512,12 @@ struct f2fs_map_blocks {
 #define file_keep_isize(inode)	is_file(inode, FADVISE_KEEP_SIZE_BIT)
 #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
 
+#ifdef CONFIG_FS_DAX
+#define f2fs_dax_file(inode)	IS_DAX(inode)
+#else
+#define f2fs_dax_file(inode)	false
+#endif
+
 #define DEF_DIR_LEVEL		0
 
 struct f2fs_inode_info {
@@ -2439,6 +2451,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
 			struct page *page, enum migrate_mode mode);
 #endif
+#ifdef CONFIG_FS_DAX
+extern struct iomap_ops f2fs_iomap_ops;
+#endif
 
 /*
  * gc.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2706130..e26114f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -23,6 +23,10 @@
 #include <linux/uio.h>
 #include <linux/uuid.h>
 #include <linux/file.h>
+#ifdef CONFIG_FS_DAX
+#include <linux/dax.h>
+#include <linux/iomap.h>
+#endif
 
 #include "f2fs.h"
 #include "node.h"
@@ -121,6 +125,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
 };
 
+#ifdef CONFIG_FS_DAX
+static int f2fs_dax_huge_fault(struct vm_fault *vmf,
+	enum page_entry_size pe_size)
+{
+	int result;
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+	if (write) {
+		sb_start_pagefault(sb);
+		file_update_time(vmf->vma->vm_file);
+	}
+	down_read(&F2FS_I(inode)->i_mmap_sem);
+	result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
+	up_read(&F2FS_I(inode)->i_mmap_sem);
+	if (write)
+		sb_end_pagefault(sb);
+
+	return result;
+}
+
+static int f2fs_dax_fault(struct vm_fault *vmf)
+{
+	return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	loff_t size;
+	int ret;
+
+	sb_start_pagefault(sb);
+	file_update_time(vmf->vma->vm_file);
+	down_read(&F2FS_I(inode)->i_mmap_sem);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vmf);
+	up_read(&F2FS_I(inode)->i_mmap_sem);
+	sb_end_pagefault(sb);
+
+	return ret;
+}
+
+static const struct vm_operations_struct f2fs_dax_vm_ops = {
+	.fault		= f2fs_dax_fault,
+	.huge_fault	= f2fs_dax_huge_fault,
+	.page_mkwrite	= f2fs_dax_fault,
+	.pfn_mkwrite	= f2fs_dax_pfn_mkwrite,
+};
+#else
+#define f2fs_dax_vm_ops f2fs_file_vm_ops
+#endif
+
 static int get_parent_ino(struct inode *inode, nid_t *pino)
 {
 	struct dentry *dentry;
@@ -436,7 +498,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		return err;
 
 	file_accessed(file);
-	vma->vm_ops = &f2fs_file_vm_ops;
+
+	if (f2fs_dax_file(inode)) {
+		vma->vm_ops = &f2fs_dax_vm_ops;
+		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	} else {
+		vma->vm_ops = &f2fs_file_vm_ops;
+	}
 	return 0;
 }
 
@@ -519,6 +587,16 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 	if (!offset && !cache_only)
 		return 0;
 
+	if (f2fs_dax_file(inode)) {
+		int ret;
+
+		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+		ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
+			NULL, &f2fs_iomap_ops);
+		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+		return ret;
+	}
+
 	if (cache_only) {
 		page = find_lock_page(mapping, index);
 		if (page && PageUptodate(page))
@@ -799,6 +877,17 @@ static int fill_zero(struct inode *inode, pgoff_t index,
 	if (!len)
 		return 0;
 
+	if (f2fs_dax_file(inode)) {
+		int ret;
+
+		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+		ret = iomap_zero_range(inode,
+			F2FS_BLK_TO_BYTES((loff_t)index) + start,
+			len, NULL, &f2fs_iomap_ops);
+		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+		return ret;
+	}
+
 	f2fs_balance_fs(sbi, true);
 
 	f2fs_lock_op(sbi);
@@ -1121,6 +1210,10 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret;
 
+	/* The current implementation does not apply to DAX files. */
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (offset + len >= i_size_read(inode))
 		return -EINVAL;
 
@@ -1311,6 +1404,10 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	loff_t new_size;
 	int ret = 0;
 
+	/* The current implementation does not apply to DAX files. */
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	new_size = i_size_read(inode) + len;
 	ret = inode_newsize_ok(inode, new_size);
 	if (ret)
@@ -1578,6 +1675,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
@@ -1627,6 +1727,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
@@ -1663,6 +1766,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
@@ -1698,6 +1804,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
@@ -1727,6 +1836,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
@@ -2141,6 +2253,9 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
 	struct f2fs_defragment range;
 	int err;
 
+	if (f2fs_dax_file(inode))
+		return -ENOTSUPP;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -2190,6 +2305,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 	size_t dst_osize;
 	int ret;
 
+	if (f2fs_dax_file(src) || f2fs_dax_file(dst))
+		return -ENOTSUPP;
+
 	if (file_in->f_path.mnt != file_out->f_path.mnt ||
 				src->i_sb != dst->i_sb)
 		return -EXDEV;
@@ -2431,6 +2549,62 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
+#ifdef CONFIG_FS_DAX
+static ssize_t f2fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	inode_lock_shared(inode);
+
+	if (!IS_DAX(inode)) {
+		inode_unlock_shared(inode);
+		return generic_file_read_iter(iocb, to);
+	}
+
+	down_read(&F2FS_I(inode)->dio_rwsem[READ]);
+	ret = dax_iomap_rw(iocb, to, &f2fs_iomap_ops);
+	up_read(&F2FS_I(inode)->dio_rwsem[READ]);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t f2fs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		return ret;
+	ret = file_update_time(iocb->ki_filp);
+	if (ret)
+		return ret;
+
+	down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+	ret = dax_iomap_rw(iocb, from, &f2fs_iomap_ops);
+	up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
+	return ret;
+}
+#else
+#define f2fs_dax_read_iter	generic_file_read_iter
+#define f2fs_dax_write_iter	__generic_file_write_iter
+#endif
+
+static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	if (f2fs_dax_file(file_inode(iocb->ki_filp)))
+		return f2fs_dax_read_iter(iocb, to);
+
+	return generic_file_read_iter(iocb, to);
+}
+
 static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -2452,7 +2626,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			return err;
 		}
 		blk_start_plug(&plug);
-		ret = __generic_file_write_iter(iocb, from);
+		if (f2fs_dax_file(inode))
+			ret = f2fs_dax_write_iter(iocb, from);
+		else
+			ret = __generic_file_write_iter(iocb, from);
 		blk_finish_plug(&plug);
 		clear_inode_flag(inode, FI_NO_PREALLOC);
 	}
@@ -2501,7 +2678,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 const struct file_operations f2fs_file_operations = {
 	.llseek		= f2fs_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= f2fs_file_read_iter,
 	.write_iter	= f2fs_file_write_iter,
 	.open		= f2fs_file_open,
 	.release	= f2fs_release_file,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fa3d2e2..06b6859 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -700,6 +700,100 @@ static void move_encrypted_block(struct inode *inode, block_t bidx,
 	f2fs_put_page(page, 1);
 }
 
+#ifdef CONFIG_FS_DAX
+#include <linux/dax.h>
+
+static void dax_move_data_page(struct inode *inode, block_t bidx,
+				unsigned int segno, int off)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct dax_device *dax_dev;
+	struct dnode_of_data dn;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_summary sum;
+	struct node_info ni;
+	block_t old_blkaddr, new_blkaddr;
+	int err, id;
+	long map_len;
+	pgoff_t pgoff;
+	void *kaddr_old, *kaddr_new;
+	pfn_t pfn;
+
+	f2fs_bug_on(sbi, f2fs_is_atomic_file(inode));
+
+	if (!check_valid_map(sbi, segno, off))
+		return;
+
+	f2fs_bug_on(sbi, !blk_queue_dax(bdev->bd_queue));
+
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+
+	if (!down_write_trylock(&F2FS_I(inode)->i_mmap_sem))
+		goto release;
+
+	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
+			PAGE_SIZE, 1);
+	/* find the old block address */
+	set_new_dnode(&dn, inode, NULL, NULL, 0);
+	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
+	if (err)
+		goto out;
+	old_blkaddr = dn.data_blkaddr;
+	/* This page is already truncated */
+	if (old_blkaddr == NULL_ADDR)
+		goto put_dn;
+
+	/* allocate a new block address */
+	get_node_info(sbi, dn.nid, &ni);
+	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+	allocate_data_block(sbi, NULL, old_blkaddr, &new_blkaddr,
+			&sum, CURSEG_COLD_DATA, NULL, false);
+
+	/* copy data page from old to new address in dax_bdev */
+	id = dax_read_lock();
+	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(old_blkaddr),
+			PAGE_SIZE, &pgoff);
+	if (err)
+		goto recover;
+	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_old, &pfn);
+	if (map_len < 0)
+		goto recover;
+	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(new_blkaddr),
+			PAGE_SIZE, &pgoff);
+	if (err)
+		goto recover;
+	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_new, &pfn);
+	if (map_len < 0)
+		goto recover;
+	copy_page((void __force *)kaddr_new, (void __force *)kaddr_old);
+
+	f2fs_update_data_blkaddr(&dn, new_blkaddr);
+	set_inode_flag(inode, FI_APPEND_WRITE);
+	if (bidx == 0)
+		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+recover:
+	if (err || map_len < 0)
+		__f2fs_replace_block(sbi, &sum, new_blkaddr, old_blkaddr,
+							true, true);
+	dax_read_unlock(id);
+put_dn:
+	f2fs_put_dnode(&dn);
+out:
+	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
+			PAGE_SIZE, 1);
+	up_write(&F2FS_I(inode)->i_mmap_sem);
+release:
+	put_dax(dax_dev);
+}
+#else
+static void dax_move_data_page(struct inode *inode, block_t bidx,
+				unsigned int segno, int off)
+{
+	BUG_ON(1);
+}
+#endif
+
 static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
 							unsigned int segno, int off)
 {
@@ -818,9 +912,10 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 			if (IS_ERR(inode) || is_bad_inode(inode))
 				continue;
 
-			/* if encrypted inode, let's go phase 3 */
+			/* if DAX or encrypted inode, let's go phase 3 */
 			if (f2fs_encrypted_inode(inode) &&
-						S_ISREG(inode->i_mode)) {
+						S_ISREG(inode->i_mode) ||
+						f2fs_dax_file(inode)) {
 				add_gc_inode(gc_list, inode);
 				continue;
 			}
@@ -858,7 +953,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
 			start_bidx = start_bidx_of_node(nofs, inode)
 								+ ofs_in_node;
-			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+			if (f2fs_dax_file(inode))
+				dax_move_data_page(inode, start_bidx, segno, off);
+			else if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 				move_encrypted_block(inode, start_bidx, segno, off);
 			else
 				move_data_page(inode, start_bidx, gc_type, segno, off);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 2082816..b8c9116 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -28,6 +28,9 @@ bool f2fs_may_inline_data(struct inode *inode)
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 		return false;
 
+	if (f2fs_dax_file(inode))
+		return false;
+
 	return true;
 }
 
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 6cd312a..7741461 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -43,8 +43,14 @@ void f2fs_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & FS_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
+#ifdef CONFIG_FS_DAX
+	if (test_opt(F2FS_I_SB(inode), DAX) && S_ISREG(inode->i_mode) &&
+		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
+		!f2fs_is_atomic_file(inode) && !f2fs_is_volatile_file(inode))
+		new_fl |= S_DAX;
+#endif
 	inode_set_flags(inode, new_fl,
-			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
 }
 
 static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 760d852..afc52e0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -70,6 +70,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
 		f2fs_set_encrypted_inode(inode);
 
+#ifdef CONFIG_FS_DAX
+	if (test_opt(sbi, DAX) && S_ISREG(inode->i_mode) &&
+		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode))
+		inode->i_flags |= S_DAX;
+#endif
 	set_inode_flag(inode, FI_NEW_INODE);
 
 	if (test_opt(sbi, INLINE_XATTR))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 32e4c02..aefe931 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -109,6 +109,7 @@ enum {
 	Opt_nolazytime,
 	Opt_usrquota,
 	Opt_grpquota,
+	Opt_dax,
 	Opt_err,
 };
 
@@ -146,6 +147,7 @@ enum {
 	{Opt_nolazytime, "nolazytime"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_grpquota, "grpquota"},
+	{Opt_dax, "dax"},
 	{Opt_err, NULL},
 };
 
@@ -399,6 +401,15 @@ static int parse_options(struct super_block *sb, char *options)
 					"quota operations not supported");
 			break;
 #endif
+#ifdef CONFIG_FS_DAX
+		case Opt_dax:
+			set_opt(sbi, DAX);
+			break;
+#else
+		case Opt_dax:
+			f2fs_msg(sb, KERN_INFO, "dax option not supported");
+			break;
+#endif
 		default:
 			f2fs_msg(sb, KERN_ERR,
 				"Unrecognized mount option \"%s\" or missing value",
@@ -814,6 +825,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 	if (test_opt(sbi, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 #endif
+#ifdef CONFIG_FS_DAX
+	if (test_opt(sbi, DAX))
+		seq_puts(seq, ",dax");
+#endif
 
 	return 0;
 }
-- 
1.8.3.1

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
From: Jaegeuk Kim @ 2017-07-22  0:34 UTC
  To: sunqiuyang; +Cc: linux-kernel, linux-fsdevel, linux-f2fs-devel

Hi Qiuyang,

This fails xfstests/generic/413.

Thanks,

On 07/20, sunqiuyang wrote:
> From: Qiuyang Sun <sunqiuyang@huawei.com>
> 
> This patch implements Direct Access (DAX) in F2FS, including:
>  - a mount option to choose whether to enable DAX or not
>  - read/write and mmap of regular files in the DAX way
>  - zero-out of unaligned partial blocks in the DAX way
>  - garbage collection of DAX files, by mapping both old and new physical
>    addresses of a data page into memory and copy data between them directly
>  - incompatibility of DAX with inline data, atomic or volatile write, 
>    collapse|insert_range, etc.
> 
> Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com>
> ---
> Changelog v7 -> v8:
>  - Introduce the macro f2fs_dax_file() to judge if a file is DAX for cases
>    when CONFIG_FS_DAX is set or not
>  - Return -ENOTSUPP when an operation does not support DAX
>  - In f2fs_iomap_begin(), convert the inline data of an inode (if any) 
>    before mapping blocks
>  - Minor cleanups
> ---
>  Documentation/filesystems/f2fs.txt |   2 +
>  fs/f2fs/data.c                     | 132 +++++++++++++++++++++++++-
>  fs/f2fs/f2fs.h                     |  15 +++
>  fs/f2fs/file.c                     | 183 ++++++++++++++++++++++++++++++++++++-
>  fs/f2fs/gc.c                       | 103 ++++++++++++++++++++-
>  fs/f2fs/inline.c                   |   3 +
>  fs/f2fs/inode.c                    |   8 +-
>  fs/f2fs/namei.c                    |   5 +
>  fs/f2fs/super.c                    |  15 +++
>  9 files changed, 454 insertions(+), 12 deletions(-)
> 
>  		return err;
>  
>  	file_accessed(file);
> -	vma->vm_ops = &f2fs_file_vm_ops;
> +
> +	if (f2fs_dax_file(inode)) {
> +		vma->vm_ops = &f2fs_dax_vm_ops;
> +		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> +	} else {
> +		vma->vm_ops = &f2fs_file_vm_ops;
> +	}
>  	return 0;
>  }
>  
> @@ -519,6 +587,16 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
>  	if (!offset && !cache_only)
>  		return 0;
>  
> +	if (f2fs_dax_file(inode)) {
> +		int ret;
> +
> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +		ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
> +			NULL, &f2fs_iomap_ops);
> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +		return ret;
> +	}
> +
>  	if (cache_only) {
>  		page = find_lock_page(mapping, index);
>  		if (page && PageUptodate(page))
> @@ -799,6 +877,17 @@ static int fill_zero(struct inode *inode, pgoff_t index,
>  	if (!len)
>  		return 0;
>  
> +	if (f2fs_dax_file(inode)) {
> +		int ret;
> +
> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +		ret = iomap_zero_range(inode,
> +			F2FS_BLK_TO_BYTES((loff_t)index) + start,
> +			len, NULL, &f2fs_iomap_ops);
> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +		return ret;
> +	}
> +
>  	f2fs_balance_fs(sbi, true);
>  
>  	f2fs_lock_op(sbi);
> @@ -1121,6 +1210,10 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>  	loff_t new_size;
>  	int ret;
>  
> +	/* The current implementation does not apply to DAX files. */
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (offset + len >= i_size_read(inode))
>  		return -EINVAL;
>  
> @@ -1311,6 +1404,10 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
>  	loff_t new_size;
>  	int ret = 0;
>  
> +	/* The current implementation does not apply to DAX files. */
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	new_size = i_size_read(inode) + len;
>  	ret = inode_newsize_ok(inode, new_size);
>  	if (ret)
> @@ -1578,6 +1675,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
>  	struct inode *inode = file_inode(filp);
>  	int ret;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!inode_owner_or_capable(inode))
>  		return -EACCES;
>  
> @@ -1627,6 +1727,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
>  	struct inode *inode = file_inode(filp);
>  	int ret;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!inode_owner_or_capable(inode))
>  		return -EACCES;
>  
> @@ -1663,6 +1766,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
>  	struct inode *inode = file_inode(filp);
>  	int ret;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!inode_owner_or_capable(inode))
>  		return -EACCES;
>  
> @@ -1698,6 +1804,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
>  	struct inode *inode = file_inode(filp);
>  	int ret;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!inode_owner_or_capable(inode))
>  		return -EACCES;
>  
> @@ -1727,6 +1836,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>  	struct inode *inode = file_inode(filp);
>  	int ret;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!inode_owner_or_capable(inode))
>  		return -EACCES;
>  
> @@ -2141,6 +2253,9 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
>  	struct f2fs_defragment range;
>  	int err;
>  
> +	if (f2fs_dax_file(inode))
> +		return -ENOTSUPP;
> +
>  	if (!capable(CAP_SYS_ADMIN))
>  		return -EPERM;
>  
> @@ -2190,6 +2305,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
>  	size_t dst_osize;
>  	int ret;
>  
> +	if (f2fs_dax_file(src) || f2fs_dax_file(dst))
> +		return -ENOTSUPP;
> +
>  	if (file_in->f_path.mnt != file_out->f_path.mnt ||
>  				src->i_sb != dst->i_sb)
>  		return -EXDEV;
> @@ -2431,6 +2549,62 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  	}
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +static ssize_t f2fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret;
> +
> +	inode_lock_shared(inode);
> +
> +	if (!IS_DAX(inode)) {
> +		inode_unlock_shared(inode);
> +		return generic_file_read_iter(iocb, to);
> +	}
> +
> +	down_read(&F2FS_I(inode)->dio_rwsem[READ]);
> +	ret = dax_iomap_rw(iocb, to, &f2fs_iomap_ops);
> +	up_read(&F2FS_I(inode)->dio_rwsem[READ]);
> +	inode_unlock_shared(inode);
> +
> +	file_accessed(iocb->ki_filp);
> +	return ret;
> +}
> +
> +static ssize_t f2fs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret;
> +
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		return ret;
> +	ret = file_update_time(iocb->ki_filp);
> +	if (ret)
> +		return ret;
> +
> +	down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +	ret = dax_iomap_rw(iocb, from, &f2fs_iomap_ops);
> +	up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
> +
> +	return ret;
> +}
> +#else
> +#define f2fs_dax_read_iter	generic_file_read_iter
> +#define f2fs_dax_write_iter	__generic_file_write_iter
> +#endif
> +
> +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	if (!iov_iter_count(to))
> +		return 0; /* skip atime */
> +
> +	if (f2fs_dax_file(file_inode(iocb->ki_filp)))
> +		return f2fs_dax_read_iter(iocb, to);
> +
> +	return generic_file_read_iter(iocb, to);
> +}
> +
>  static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
> @@ -2452,7 +2626,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  			return err;
>  		}
>  		blk_start_plug(&plug);
> -		ret = __generic_file_write_iter(iocb, from);
> +		if (f2fs_dax_file(inode))
> +			ret = f2fs_dax_write_iter(iocb, from);
> +		else
> +			ret = __generic_file_write_iter(iocb, from);
>  		blk_finish_plug(&plug);
>  		clear_inode_flag(inode, FI_NO_PREALLOC);
>  	}
> @@ -2501,7 +2678,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  
>  const struct file_operations f2fs_file_operations = {
>  	.llseek		= f2fs_llseek,
> -	.read_iter	= generic_file_read_iter,
> +	.read_iter	= f2fs_file_read_iter,
>  	.write_iter	= f2fs_file_write_iter,
>  	.open		= f2fs_file_open,
>  	.release	= f2fs_release_file,
> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
> index fa3d2e2..06b6859 100644
> --- a/fs/f2fs/gc.c
> +++ b/fs/f2fs/gc.c
> @@ -700,6 +700,100 @@ static void move_encrypted_block(struct inode *inode, block_t bidx,
>  	f2fs_put_page(page, 1);
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +#include <linux/dax.h>
> +
> +static void dax_move_data_page(struct inode *inode, block_t bidx,
> +				unsigned int segno, int off)
> +{
> +	struct block_device *bdev = inode->i_sb->s_bdev;
> +	struct dax_device *dax_dev;
> +	struct dnode_of_data dn;
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> +	struct f2fs_summary sum;
> +	struct node_info ni;
> +	block_t old_blkaddr, new_blkaddr;
> +	int err, id;
> +	long map_len;
> +	pgoff_t pgoff;
> +	void *kaddr_old, *kaddr_new;
> +	pfn_t pfn;
> +
> +	f2fs_bug_on(sbi, f2fs_is_atomic_file(inode));
> +
> +	if (!check_valid_map(sbi, segno, off))
> +		return;
> +
> +	f2fs_bug_on(sbi, !blk_queue_dax(bdev->bd_queue));
> +
> +	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
> +
> +	if (!down_write_trylock(&F2FS_I(inode)->i_mmap_sem))
> +		goto release;
> +
> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
> +			PAGE_SIZE, 1);
> +	/* find the old block address */
> +	set_new_dnode(&dn, inode, NULL, NULL, 0);
> +	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
> +	if (err)
> +		goto out;
> +	old_blkaddr = dn.data_blkaddr;
> +	/* This page is already truncated */
> +	if (old_blkaddr == NULL_ADDR)
> +		goto put_dn;
> +
> +	/* allocate a new block address */
> +	get_node_info(sbi, dn.nid, &ni);
> +	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
> +	allocate_data_block(sbi, NULL, old_blkaddr, &new_blkaddr,
> +			&sum, CURSEG_COLD_DATA, NULL, false);
> +
> +	/* copy data page from old to new address in dax_bdev */
> +	id = dax_read_lock();
> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(old_blkaddr),
> +			PAGE_SIZE, &pgoff);
> +	if (err)
> +		goto recover;
> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_old, &pfn);
> +	if (map_len < 0)
> +		goto recover;
> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(new_blkaddr),
> +			PAGE_SIZE, &pgoff);
> +	if (err)
> +		goto recover;
> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_new, &pfn);
> +	if (map_len < 0)
> +		goto recover;
> +	copy_page((void __force *)kaddr_new, (void __force *)kaddr_old);
> +
> +	f2fs_update_data_blkaddr(&dn, new_blkaddr);
> +	set_inode_flag(inode, FI_APPEND_WRITE);
> +	if (bidx == 0)
> +		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
> +
> +recover:
> +	if (err || map_len < 0)
> +		__f2fs_replace_block(sbi, &sum, new_blkaddr, old_blkaddr,
> +							true, true);
> +	dax_read_unlock(id);
> +put_dn:
> +	f2fs_put_dnode(&dn);
> +out:
> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
> +			PAGE_SIZE, 1);
> +	up_write(&F2FS_I(inode)->i_mmap_sem);
> +release:
> +	put_dax(dax_dev);
> +}
> +#else
> +static void dax_move_data_page(struct inode *inode, block_t bidx,
> +				unsigned int segno, int off)
> +{
> +	BUG_ON(1);
> +}
> +#endif
> +
>  static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
>  							unsigned int segno, int off)
>  {
> @@ -818,9 +912,10 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>  			if (IS_ERR(inode) || is_bad_inode(inode))
>  				continue;
>  
> -			/* if encrypted inode, let's go phase 3 */
> +			/* if DAX or encrypted inode, let's go phase 3 */
>  			if (f2fs_encrypted_inode(inode) &&
> -						S_ISREG(inode->i_mode)) {
> +						S_ISREG(inode->i_mode) ||
> +						f2fs_dax_file(inode)) {
>  				add_gc_inode(gc_list, inode);
>  				continue;
>  			}
> @@ -858,7 +953,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>  
>  			start_bidx = start_bidx_of_node(nofs, inode)
>  								+ ofs_in_node;
> -			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
> +			if (f2fs_dax_file(inode))
> +				dax_move_data_page(inode, start_bidx, segno, off);
> +			else if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>  				move_encrypted_block(inode, start_bidx, segno, off);
>  			else
>  				move_data_page(inode, start_bidx, gc_type, segno, off);
> diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
> index 2082816..b8c9116 100644
> --- a/fs/f2fs/inline.c
> +++ b/fs/f2fs/inline.c
> @@ -28,6 +28,9 @@ bool f2fs_may_inline_data(struct inode *inode)
>  	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>  		return false;
>  
> +	if (f2fs_dax_file(inode))
> +		return false;
> +
>  	return true;
>  }
>  
> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
> index 6cd312a..7741461 100644
> --- a/fs/f2fs/inode.c
> +++ b/fs/f2fs/inode.c
> @@ -43,8 +43,14 @@ void f2fs_set_inode_flags(struct inode *inode)
>  		new_fl |= S_NOATIME;
>  	if (flags & FS_DIRSYNC_FL)
>  		new_fl |= S_DIRSYNC;
> +#ifdef CONFIG_FS_DAX
> +	if (test_opt(F2FS_I_SB(inode), DAX) && S_ISREG(inode->i_mode) &&
> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
> +		!f2fs_is_atomic_file(inode) && !f2fs_is_volatile_file(inode))
> +		new_fl |= S_DAX;
> +#endif
>  	inode_set_flags(inode, new_fl,
> -			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
> +			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
>  }
>  
>  static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
> diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
> index 760d852..afc52e0 100644
> --- a/fs/f2fs/namei.c
> +++ b/fs/f2fs/namei.c
> @@ -70,6 +70,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
>  	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
>  		f2fs_set_encrypted_inode(inode);
>  
> +#ifdef CONFIG_FS_DAX
> +	if (test_opt(sbi, DAX) && S_ISREG(inode->i_mode) &&
> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode))
> +		inode->i_flags |= S_DAX;
> +#endif
>  	set_inode_flag(inode, FI_NEW_INODE);
>  
>  	if (test_opt(sbi, INLINE_XATTR))
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index 32e4c02..aefe931 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -109,6 +109,7 @@ enum {
>  	Opt_nolazytime,
>  	Opt_usrquota,
>  	Opt_grpquota,
> +	Opt_dax,
>  	Opt_err,
>  };
>  
> @@ -146,6 +147,7 @@ enum {
>  	{Opt_nolazytime, "nolazytime"},
>  	{Opt_usrquota, "usrquota"},
>  	{Opt_grpquota, "grpquota"},
> +	{Opt_dax, "dax"},
>  	{Opt_err, NULL},
>  };
>  
> @@ -399,6 +401,15 @@ static int parse_options(struct super_block *sb, char *options)
>  					"quota operations not supported");
>  			break;
>  #endif
> +#ifdef CONFIG_FS_DAX
> +		case Opt_dax:
> +			set_opt(sbi, DAX);
> +			break;
> +#else
> +		case Opt_dax:
> +			f2fs_msg(sb, KERN_INFO, "dax option not supported");
> +			break;
> +#endif
>  		default:
>  			f2fs_msg(sb, KERN_ERR,
>  				"Unrecognized mount option \"%s\" or missing value",
> @@ -814,6 +825,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
>  	if (test_opt(sbi, GRPQUOTA))
>  		seq_puts(seq, ",grpquota");
>  #endif
> +#ifdef CONFIG_FS_DAX
> +	if (test_opt(sbi, DAX))
> +		seq_puts(seq, ",dax");
> +#endif
>  
>  	return 0;
>  }
> -- 
> 1.8.3.1

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-22  0:34 ` Jaegeuk Kim
@ 2017-07-24 12:03     ` Sun Qiuyang
  0 siblings, 0 replies; 33+ messages in thread
From: Sun Qiuyang @ 2017-07-24 12:03 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-fsdevel, linux-f2fs-devel

Hi Jaegeuk,

Below is the error message I got from this testcase:
---
write (Invalid argument) len 1024 dio [dax to nondax | both nondax]
read (Bad address) len [4096 | 16777216 | 67108864] dio dax to nondax
---
The write error is expected, as F2FS does not support unaligned direct 
IO (the 1024-byte length is not aligned to the 4 KB block size).
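
Just for reference, a minimal stand-alone sketch of the failing write 
(the mount point, file name and 4 KB block size below are assumptions 
of mine, not taken from the test script):

#define _GNU_SOURCE	/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	/* hypothetical file on a non-DAX f2fs mount */
	int fd = open("/mnt/f2fs/dst", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0, 4096);

	/* 1024-byte direct write: not aligned to the 4 KB block size,
	 * so -EINVAL ("Invalid argument") is expected here */
	if (write(fd, buf, 1024) < 0)
		perror("write");

	free(buf);
	close(fd);
	return 0;
}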

The read error is more complex. In the test script, when we mmap the src 
file (dax), the flags (VM_MIXEDMAP | VM_HUGEPAGE) are added to 
vma->vm_flags. Later on, when we write to the dest file (nondax) and 
then read it back by direct IO with the DAX mapping as the user buffer, 
we fail to get pages from this "special" vma.

Functions involved:
f2fs_direct_IO
blockdev_direct_IO
__blockdev_direct_IO
do_direct_IO
dio_get_page
dio_refill_pages
iov_iter_get_pages
get_user_pages_unlocked
__get_user_pages_fast
__get_user_pages_unlocked
__get_user_pages_locked
__get_user_pages
follow_page_mask
follow_p4d_mask
follow_pud_mask
follow_pmd_mask
follow_page_pte
	vm_normal_page
	follow_pfn_pte

In my test environment HAVE_PTE_SPECIAL is true, and vm_normal_page() 
returns NULL because VM_MIXEDMAP is set in vm_flags. follow_page_pte() 
then calls follow_pfn_pte(), which returns -EFAULT. This is where the 
"Bad address" error ultimately comes from.

This error also occurs in EXT4-DAX for similar reasons.

Thanks,


> Hi Qiuyang,
>
> This fails xfstests/generic/413.
>
> Thanks,
>
> On 07/20, sunqiuyang wrote:
>> From: Qiuyang Sun <sunqiuyang@huawei.com>
>>
>> This patch implements Direct Access (DAX) in F2FS, including:
>>  - a mount option to choose whether to enable DAX or not
>>  - read/write and mmap of regular files in the DAX way
>>  - zero-out of unaligned partial blocks in the DAX way
>>  - garbage collection of DAX files, by mapping both old and new physical
>>    addresses of a data page into memory and copy data between them directly
>>  - incompatibility of DAX with inline data, atomic or volatile write,
>>    collapse|insert_range, etc.
>>
>> Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com>
>> ---
>> Changelog v7 -> v8:
>>  - Introduce the macro f2fs_dax_file() to judge if a file is DAX for cases
>>    when CONFIG_FS_DAX is set or not
>>  - Return -ENOTSUPP when an operation does not support DAX
>>  - In f2fs_iomap_begin(), convert the inline data of an inode (if any)
>>    before mapping blocks
>>  - Minor cleanups
>> ---
>>  Documentation/filesystems/f2fs.txt |   2 +
>>  fs/f2fs/data.c                     | 132 +++++++++++++++++++++++++-
>>  fs/f2fs/f2fs.h                     |  15 +++
>>  fs/f2fs/file.c                     | 183 ++++++++++++++++++++++++++++++++++++-
>>  fs/f2fs/gc.c                       | 103 ++++++++++++++++++++-
>>  fs/f2fs/inline.c                   |   3 +
>>  fs/f2fs/inode.c                    |   8 +-
>>  fs/f2fs/namei.c                    |   5 +
>>  fs/f2fs/super.c                    |  15 +++
>>  9 files changed, 454 insertions(+), 12 deletions(-)
>>
>> diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
>> index 273ccb2..c86c421 100644
>> --- a/Documentation/filesystems/f2fs.txt
>> +++ b/Documentation/filesystems/f2fs.txt
>> @@ -164,6 +164,8 @@ io_bits=%u             Set the bit size of write IO requests. It should be set
>>                         with "mode=lfs".
>>  usrquota               Enable plain user disk quota accounting.
>>  grpquota               Enable plain group disk quota accounting.
>> +dax                    Use direct access (no page cache). See
>> +                       Documentation/filesystems/dax.txt.
>>
>>  ================================================================================
>>  DEBUGFS ENTRIES
>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>> index 87c1f41..4eb4b76 100644
>> --- a/fs/f2fs/data.c
>> +++ b/fs/f2fs/data.c
>> @@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  				err = -EIO;
>>  				goto sync_out;
>>  			}
>> +			/*
>> +			 * If newly allocated blocks are to be zeroed out later,
>> +			 * a single f2fs_map_blocks must not contain both old
>> +			 * and new blocks at the same time.
>> +			 */
>> +			if (flag == F2FS_GET_BLOCK_ZERO
>> +					&& (map->m_flags & F2FS_MAP_MAPPED)
>> +					&& !(map->m_flags & F2FS_MAP_NEW))
>> +				goto sync_out;
>>  			if (flag == F2FS_GET_BLOCK_PRE_AIO) {
>>  				if (blkaddr == NULL_ADDR) {
>>  					prealloc++;
>> @@ -938,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  						blkaddr != NEW_ADDR)
>>  				goto sync_out;
>>  		}
>> +	} else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
>> +		goto sync_out;
>>  	}
>>
>>  	if (flag == F2FS_GET_BLOCK_PRE_AIO)
>> @@ -996,6 +1007,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  	goto next_dnode;
>>
>>  sync_out:
>> +	if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
>> +		clean_bdev_aliases(inode->i_sb->s_bdev,
>> +				map->m_pblk, map->m_len);
>> +		err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
>> +				map->m_len, GFP_NOFS);
>> +	}
>>  	f2fs_put_dnode(&dn);
>>  unlock_out:
>>  	if (create) {
>> @@ -1808,16 +1825,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
>>  	return 0;
>>  }
>>
>> -static void f2fs_write_failed(struct address_space *mapping, loff_t to)
>> +static void f2fs_write_failed(struct address_space *mapping, loff_t to,
>> +								bool lock)
>>  {
>>  	struct inode *inode = mapping->host;
>>  	loff_t i_size = i_size_read(inode);
>>
>>  	if (to > i_size) {
>> -		down_write(&F2FS_I(inode)->i_mmap_sem);
>> +		if (lock)
>> +			down_write(&F2FS_I(inode)->i_mmap_sem);
>>  		truncate_pagecache(inode, i_size);
>>  		truncate_blocks(inode, i_size, true);
>> -		up_write(&F2FS_I(inode)->i_mmap_sem);
>> +		if (lock)
>> +			up_write(&F2FS_I(inode)->i_mmap_sem);
>>  	}
>>  }
>>
>> @@ -2000,7 +2020,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
>>
>>  fail:
>>  	f2fs_put_page(page, 1);
>> -	f2fs_write_failed(mapping, pos + len);
>> +	f2fs_write_failed(mapping, pos + len, true);
>>  	return err;
>>  }
>>
>> @@ -2077,7 +2097,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>>  		if (err > 0)
>>  			set_inode_flag(inode, FI_UPDATE_WRITE);
>>  		else if (err < 0)
>> -			f2fs_write_failed(mapping, offset + count);
>> +			f2fs_write_failed(mapping, offset + count, true);
>>  	}
>>
>>  	trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
>> @@ -2274,3 +2294,105 @@ int f2fs_migrate_page(struct address_space *mapping,
>>  	.migratepage    = f2fs_migrate_page,
>>  #endif
>>  };
>> +
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/iomap.h>
>> +#include <linux/dax.h>
>> +
>> +static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
>> +	loff_t length, unsigned int flags, struct iomap *iomap)
>> +{
>> +	struct block_device *bdev;
>> +	unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
>> +	unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
>> +	struct f2fs_map_blocks map;
>> +	int ret;
>> +
>> +	ret = f2fs_convert_inline_inode(inode);
>> +	if (ret)
>> +		return ret;
>> +
>> +	map.m_lblk = first_block;
>> +	map.m_len = last_block - first_block + 1;
>> +	map.m_next_pgofs = NULL;
>> +
>> +	if (!(flags & IOMAP_WRITE)) {
>> +		ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
>> +	} else {
>> +	/* i_size should be kept here and changed later in f2fs_iomap_end */
>> +		loff_t original_i_size = i_size_read(inode);
>> +
>> +		ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
>> +		if (i_size_read(inode) > original_i_size) {
>> +			f2fs_i_size_write(inode, original_i_size);
>> +			if (ret)
>> +				f2fs_write_failed(inode->i_mapping,
>> +						offset + length,
>> +						!(flags & IOMAP_FAULT));
>> +		}
>> +	}
>> +
>> +	if (ret)
>> +		return ret;
>> +
>> +	iomap->flags = 0;
>> +	bdev = inode->i_sb->s_bdev;
>> +	iomap->bdev = bdev;
>> +	if (blk_queue_dax(bdev->bd_queue))
>> +		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>> +	else
>> +		iomap->dax_dev = NULL;
>> +	iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
>> +
>> +	if (map.m_len == 0) {
>> +		iomap->type = IOMAP_HOLE;
>> +		iomap->blkno = IOMAP_NULL_BLOCK;
>> +		iomap->length = F2FS_BLKSIZE;
>> +	} else {
>> +		if (map.m_flags & F2FS_MAP_UNWRITTEN) {
>> +			iomap->type = IOMAP_UNWRITTEN;
>> +		} else if (map.m_flags & F2FS_MAP_MAPPED) {
>> +			iomap->type = IOMAP_MAPPED;
>> +		} else {
>> +			WARN_ON_ONCE(1);
>> +			return -EIO;
>> +		}
>> +		iomap->blkno =
>> +			(sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
>> +		iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
>> +	}
>> +
>> +	if (map.m_flags & F2FS_MAP_NEW)
>> +		iomap->flags |= IOMAP_F_NEW;
>> +	return 0;
>> +}
>> +
>> +static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>> +	ssize_t written, unsigned int flags, struct iomap *iomap)
>> +{
>> +	put_dax(iomap->dax_dev);
>> +	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
>> +		return 0;
>> +
>> +	if (offset + written > i_size_read(inode))
>> +		f2fs_i_size_write(inode, offset + written);
>> +
>> +	if (iomap->offset + iomap->length >
>> +			ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
>> +		block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
>> +		block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
>> +
>> +		if (written_blk < end_blk)
>> +			f2fs_write_failed(inode->i_mapping, offset + length,
>> +									true);
>> +	}
>> +
>> +	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
>> +	return 0;
>> +}
>> +
>> +struct iomap_ops f2fs_iomap_ops = {
>> +	.iomap_begin	= f2fs_iomap_begin,
>> +	.iomap_end	= f2fs_iomap_end,
>> +};
>> +#endif
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 70777a8..b6d629a 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -91,6 +91,11 @@ struct f2fs_fault_info {
>>  #define F2FS_MOUNT_LFS			0x00040000
>>  #define F2FS_MOUNT_USRQUOTA		0x00080000
>>  #define F2FS_MOUNT_GRPQUOTA		0x00100000
>> +#ifdef CONFIG_FS_DAX
>> +#define F2FS_MOUNT_DAX			0x00400000 /* Direct Access */
>> +#else
>> +#define F2FS_MOUNT_DAX			0
>> +#endif
>>
>>  #define clear_opt(sbi, option)	((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
>>  #define set_opt(sbi, option)	((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
>> @@ -482,6 +487,7 @@ struct f2fs_map_blocks {
>>  #define F2FS_GET_BLOCK_BMAP		3
>>  #define F2FS_GET_BLOCK_PRE_DIO		4
>>  #define F2FS_GET_BLOCK_PRE_AIO		5
>> +#define F2FS_GET_BLOCK_ZERO		6
>>
>>  /*
>>   * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
>> @@ -506,6 +512,12 @@ struct f2fs_map_blocks {
>>  #define file_keep_isize(inode)	is_file(inode, FADVISE_KEEP_SIZE_BIT)
>>  #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
>>
>> +#ifdef CONFIG_FS_DAX
>> +#define f2fs_dax_file(inode)	IS_DAX(inode)
>> +#else
>> +#define f2fs_dax_file(inode)	false
>> +#endif
>> +
>>  #define DEF_DIR_LEVEL		0
>>
>>  struct f2fs_inode_info {
>> @@ -2439,6 +2451,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
>>  int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
>>  			struct page *page, enum migrate_mode mode);
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +extern struct iomap_ops f2fs_iomap_ops;
>> +#endif
>>
>>  /*
>>   * gc.c
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 2706130..e26114f 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -23,6 +23,10 @@
>>  #include <linux/uio.h>
>>  #include <linux/uuid.h>
>>  #include <linux/file.h>
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/dax.h>
>> +#include <linux/iomap.h>
>> +#endif
>>
>>  #include "f2fs.h"
>>  #include "node.h"
>> @@ -121,6 +125,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	.page_mkwrite	= f2fs_vm_page_mkwrite,
>>  };
>>
>> +#ifdef CONFIG_FS_DAX
>> +static int f2fs_dax_huge_fault(struct vm_fault *vmf,
>> +	enum page_entry_size pe_size)
>> +{
>> +	int result;
>> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>> +	struct super_block *sb = inode->i_sb;
>> +	bool write = vmf->flags & FAULT_FLAG_WRITE;
>> +
>> +	if (write) {
>> +		sb_start_pagefault(sb);
>> +		file_update_time(vmf->vma->vm_file);
>> +	}
>> +	down_read(&F2FS_I(inode)->i_mmap_sem);
>> +	result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->i_mmap_sem);
>> +	if (write)
>> +		sb_end_pagefault(sb);
>> +
>> +	return result;
>> +}
>> +
>> +static int f2fs_dax_fault(struct vm_fault *vmf)
>> +{
>> +	return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
>> +}
>> +
>> +static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
>> +{
>> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>> +	struct super_block *sb = inode->i_sb;
>> +	loff_t size;
>> +	int ret;
>> +
>> +	sb_start_pagefault(sb);
>> +	file_update_time(vmf->vma->vm_file);
>> +	down_read(&F2FS_I(inode)->i_mmap_sem);
>> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
>> +	if (vmf->pgoff >= size)
>> +		ret = VM_FAULT_SIGBUS;
>> +	else
>> +		ret = dax_pfn_mkwrite(vmf);
>> +	up_read(&F2FS_I(inode)->i_mmap_sem);
>> +	sb_end_pagefault(sb);
>> +
>> +	return ret;
>> +}
>> +
>> +static const struct vm_operations_struct f2fs_dax_vm_ops = {
>> +	.fault		= f2fs_dax_fault,
>> +	.huge_fault	= f2fs_dax_huge_fault,
>> +	.page_mkwrite	= f2fs_dax_fault,
>> +	.pfn_mkwrite	= f2fs_dax_pfn_mkwrite,
>> +};
>> +#else
>> +#define f2fs_dax_vm_ops f2fs_file_vm_ops
>> +#endif
>> +
>>  static int get_parent_ino(struct inode *inode, nid_t *pino)
>>  {
>>  	struct dentry *dentry;
>> @@ -436,7 +498,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
>>  		return err;
>>
>>  	file_accessed(file);
>> -	vma->vm_ops = &f2fs_file_vm_ops;
>> +
>> +	if (f2fs_dax_file(inode)) {
>> +		vma->vm_ops = &f2fs_dax_vm_ops;
>> +		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
>> +	} else {
>> +		vma->vm_ops = &f2fs_file_vm_ops;
>> +	}
>>  	return 0;
>>  }
>>
>> @@ -519,6 +587,16 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
>>  	if (!offset && !cache_only)
>>  		return 0;
>>
>> +	if (f2fs_dax_file(inode)) {
>> +		int ret;
>> +
>> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
>> +			NULL, &f2fs_iomap_ops);
>> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		return ret;
>> +	}
>> +
>>  	if (cache_only) {
>>  		page = find_lock_page(mapping, index);
>>  		if (page && PageUptodate(page))
>> @@ -799,6 +877,17 @@ static int fill_zero(struct inode *inode, pgoff_t index,
>>  	if (!len)
>>  		return 0;
>>
>> +	if (f2fs_dax_file(inode)) {
>> +		int ret;
>> +
>> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		ret = iomap_zero_range(inode,
>> +			F2FS_BLK_TO_BYTES((loff_t)index) + start,
>> +			len, NULL, &f2fs_iomap_ops);
>> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		return ret;
>> +	}
>> +
>>  	f2fs_balance_fs(sbi, true);
>>
>>  	f2fs_lock_op(sbi);
>> @@ -1121,6 +1210,10 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>>  	loff_t new_size;
>>  	int ret;
>>
>> +	/* The current implementation does not apply to DAX files. */
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (offset + len >= i_size_read(inode))
>>  		return -EINVAL;
>>
>> @@ -1311,6 +1404,10 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
>>  	loff_t new_size;
>>  	int ret = 0;
>>
>> +	/* The current implementation does not apply to DAX files. */
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	new_size = i_size_read(inode) + len;
>>  	ret = inode_newsize_ok(inode, new_size);
>>  	if (ret)
>> @@ -1578,6 +1675,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1627,6 +1727,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1663,6 +1766,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1698,6 +1804,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1727,6 +1836,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -2141,6 +2253,9 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
>>  	struct f2fs_defragment range;
>>  	int err;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!capable(CAP_SYS_ADMIN))
>>  		return -EPERM;
>>
>> @@ -2190,6 +2305,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
>>  	size_t dst_osize;
>>  	int ret;
>>
>> +	if (f2fs_dax_file(src) || f2fs_dax_file(dst))
>> +		return -ENOTSUPP;
>> +
>>  	if (file_in->f_path.mnt != file_out->f_path.mnt ||
>>  				src->i_sb != dst->i_sb)
>>  		return -EXDEV;
>> @@ -2431,6 +2549,62 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>  	}
>>  }
>>
>> +#ifdef CONFIG_FS_DAX
>> +static ssize_t f2fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	ssize_t ret;
>> +
>> +	inode_lock_shared(inode);
>> +
>> +	if (!IS_DAX(inode)) {
>> +		inode_unlock_shared(inode);
>> +		return generic_file_read_iter(iocb, to);
>> +	}
>> +
>> +	down_read(&F2FS_I(inode)->dio_rwsem[READ]);
>> +	ret = dax_iomap_rw(iocb, to, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->dio_rwsem[READ]);
>> +	inode_unlock_shared(inode);
>> +
>> +	file_accessed(iocb->ki_filp);
>> +	return ret;
>> +}
>> +
>> +static ssize_t f2fs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	ssize_t ret;
>> +
>> +	ret = file_remove_privs(iocb->ki_filp);
>> +	if (ret)
>> +		return ret;
>> +	ret = file_update_time(iocb->ki_filp);
>> +	if (ret)
>> +		return ret;
>> +
>> +	down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +	ret = dax_iomap_rw(iocb, from, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +
>> +	return ret;
>> +}
>> +#else
>> +#define f2fs_dax_read_iter	generic_file_read_iter
>> +#define f2fs_dax_write_iter	__generic_file_write_iter
>> +#endif
>> +
>> +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> +{
>> +	if (!iov_iter_count(to))
>> +		return 0; /* skip atime */
>> +
>> +	if (f2fs_dax_file(file_inode(iocb->ki_filp)))
>> +		return f2fs_dax_read_iter(iocb, to);
>> +
>> +	return generic_file_read_iter(iocb, to);
>> +}
>> +
>>  static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  {
>>  	struct file *file = iocb->ki_filp;
>> @@ -2452,7 +2626,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  			return err;
>>  		}
>>  		blk_start_plug(&plug);
>> -		ret = __generic_file_write_iter(iocb, from);
>> +		if (f2fs_dax_file(inode))
>> +			ret = f2fs_dax_write_iter(iocb, from);
>> +		else
>> +			ret = __generic_file_write_iter(iocb, from);
>>  		blk_finish_plug(&plug);
>>  		clear_inode_flag(inode, FI_NO_PREALLOC);
>>  	}
>> @@ -2501,7 +2678,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>>
>>  const struct file_operations f2fs_file_operations = {
>>  	.llseek		= f2fs_llseek,
>> -	.read_iter	= generic_file_read_iter,
>> +	.read_iter	= f2fs_file_read_iter,
>>  	.write_iter	= f2fs_file_write_iter,
>>  	.open		= f2fs_file_open,
>>  	.release	= f2fs_release_file,
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index fa3d2e2..06b6859 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -700,6 +700,100 @@ static void move_encrypted_block(struct inode *inode, block_t bidx,
>>  	f2fs_put_page(page, 1);
>>  }
>>
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/dax.h>
>> +
>> +static void dax_move_data_page(struct inode *inode, block_t bidx,
>> +				unsigned int segno, int off)
>> +{
>> +	struct block_device *bdev = inode->i_sb->s_bdev;
>> +	struct dax_device *dax_dev;
>> +	struct dnode_of_data dn;
>> +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>> +	struct f2fs_summary sum;
>> +	struct node_info ni;
>> +	block_t old_blkaddr, new_blkaddr;
>> +	int err, id;
>> +	long map_len;
>> +	pgoff_t pgoff;
>> +	void *kaddr_old, *kaddr_new;
>> +	pfn_t pfn;
>> +
>> +	f2fs_bug_on(sbi, f2fs_is_atomic_file(inode));
>> +
>> +	if (!check_valid_map(sbi, segno, off))
>> +		return;
>> +
>> +	f2fs_bug_on(sbi, !blk_queue_dax(bdev->bd_queue));
>> +
>> +	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>> +
>> +	if (!down_write_trylock(&F2FS_I(inode)->i_mmap_sem))
>> +		goto release;
>> +
>> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
>> +			PAGE_SIZE, 1);
>> +	/* find the old block address */
>> +	set_new_dnode(&dn, inode, NULL, NULL, 0);
>> +	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
>> +	if (err)
>> +		goto out;
>> +	old_blkaddr = dn.data_blkaddr;
>> +	/* This page is already truncated */
>> +	if (old_blkaddr == NULL_ADDR)
>> +		goto put_dn;
>> +
>> +	/* allocate a new block address */
>> +	get_node_info(sbi, dn.nid, &ni);
>> +	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
>> +	allocate_data_block(sbi, NULL, old_blkaddr, &new_blkaddr,
>> +			&sum, CURSEG_COLD_DATA, NULL, false);
>> +
>> +	/* copy data page from old to new address in dax_bdev */
>> +	id = dax_read_lock();
>> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(old_blkaddr),
>> +			PAGE_SIZE, &pgoff);
>> +	if (err)
>> +		goto recover;
>> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_old, &pfn);
>> +	if (map_len < 0)
>> +		goto recover;
>> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(new_blkaddr),
>> +			PAGE_SIZE, &pgoff);
>> +	if (err)
>> +		goto recover;
>> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_new, &pfn);
>> +	if (map_len < 0)
>> +		goto recover;
>> +	copy_page((void __force *)kaddr_new, (void __force *)kaddr_old);
>> +
>> +	f2fs_update_data_blkaddr(&dn, new_blkaddr);
>> +	set_inode_flag(inode, FI_APPEND_WRITE);
>> +	if (bidx == 0)
>> +		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
>> +
>> +recover:
>> +	if (err || map_len < 0)
>> +		__f2fs_replace_block(sbi, &sum, new_blkaddr, old_blkaddr,
>> +							true, true);
>> +	dax_read_unlock(id);
>> +put_dn:
>> +	f2fs_put_dnode(&dn);
>> +out:
>> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
>> +			PAGE_SIZE, 1);
>> +	up_write(&F2FS_I(inode)->i_mmap_sem);
>> +release:
>> +	put_dax(dax_dev);
>> +}
>> +#else
>> +static void dax_move_data_page(struct inode *inode, block_t bidx,
>> +				unsigned int segno, int off)
>> +{
>> +	BUG_ON(1);
>> +}
>> +#endif
>> +
>>  static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
>>  							unsigned int segno, int off)
>>  {
>> @@ -818,9 +912,10 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>>  			if (IS_ERR(inode) || is_bad_inode(inode))
>>  				continue;
>>
>> -			/* if encrypted inode, let's go phase 3 */
>> +			/* if DAX or encrypted inode, let's go phase 3 */
>>  			if (f2fs_encrypted_inode(inode) &&
>> -						S_ISREG(inode->i_mode)) {
>> +						S_ISREG(inode->i_mode) ||
>> +						f2fs_dax_file(inode)) {
>>  				add_gc_inode(gc_list, inode);
>>  				continue;
>>  			}
>> @@ -858,7 +953,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>>
>>  			start_bidx = start_bidx_of_node(nofs, inode)
>>  								+ ofs_in_node;
>> -			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>> +			if (f2fs_dax_file(inode))
>> +				dax_move_data_page(inode, start_bidx, segno, off);
>> +			else if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>>  				move_encrypted_block(inode, start_bidx, segno, off);
>>  			else
>>  				move_data_page(inode, start_bidx, gc_type, segno, off);
>> diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
>> index 2082816..b8c9116 100644
>> --- a/fs/f2fs/inline.c
>> +++ b/fs/f2fs/inline.c
>> @@ -28,6 +28,9 @@ bool f2fs_may_inline_data(struct inode *inode)
>>  	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>>  		return false;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return false;
>> +
>>  	return true;
>>  }
>>
>> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
>> index 6cd312a..7741461 100644
>> --- a/fs/f2fs/inode.c
>> +++ b/fs/f2fs/inode.c
>> @@ -43,8 +43,14 @@ void f2fs_set_inode_flags(struct inode *inode)
>>  		new_fl |= S_NOATIME;
>>  	if (flags & FS_DIRSYNC_FL)
>>  		new_fl |= S_DIRSYNC;
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(F2FS_I_SB(inode), DAX) && S_ISREG(inode->i_mode) &&
>> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
>> +		!f2fs_is_atomic_file(inode) && !f2fs_is_volatile_file(inode))
>> +		new_fl |= S_DAX;
>> +#endif
>>  	inode_set_flags(inode, new_fl,
>> -			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
>> +			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
>>  }
>>
>>  static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
>> diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
>> index 760d852..afc52e0 100644
>> --- a/fs/f2fs/namei.c
>> +++ b/fs/f2fs/namei.c
>> @@ -70,6 +70,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
>>  	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
>>  		f2fs_set_encrypted_inode(inode);
>>
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(sbi, DAX) && S_ISREG(inode->i_mode) &&
>> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode))
>> +		inode->i_flags |= S_DAX;
>> +#endif
>>  	set_inode_flag(inode, FI_NEW_INODE);
>>
>>  	if (test_opt(sbi, INLINE_XATTR))
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index 32e4c02..aefe931 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -109,6 +109,7 @@ enum {
>>  	Opt_nolazytime,
>>  	Opt_usrquota,
>>  	Opt_grpquota,
>> +	Opt_dax,
>>  	Opt_err,
>>  };
>>
>> @@ -146,6 +147,7 @@ enum {
>>  	{Opt_nolazytime, "nolazytime"},
>>  	{Opt_usrquota, "usrquota"},
>>  	{Opt_grpquota, "grpquota"},
>> +	{Opt_dax, "dax"},
>>  	{Opt_err, NULL},
>>  };
>>
>> @@ -399,6 +401,15 @@ static int parse_options(struct super_block *sb, char *options)
>>  					"quota operations not supported");
>>  			break;
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +		case Opt_dax:
>> +			set_opt(sbi, DAX);
>> +			break;
>> +#else
>> +		case Opt_dax:
>> +			f2fs_msg(sb, KERN_INFO, "dax option not supported");
>> +			break;
>> +#endif
>>  		default:
>>  			f2fs_msg(sb, KERN_ERR,
>>  				"Unrecognized mount option \"%s\" or missing value",
>> @@ -814,6 +825,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
>>  	if (test_opt(sbi, GRPQUOTA))
>>  		seq_puts(seq, ",grpquota");
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(sbi, DAX))
>> +		seq_puts(seq, ",dax");
>> +#endif
>>
>>  	return 0;
>>  }
>> --
>> 1.8.3.1
>
> .
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
@ 2017-07-24 12:03     ` Sun Qiuyang
  0 siblings, 0 replies; 33+ messages in thread
From: Sun Qiuyang @ 2017-07-24 12:03 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-fsdevel, linux-f2fs-devel

Hi Jaegeuk,

Below is the error message I got from this testcase:
---
write (Invalid argument) len 1024 dio [dax to nondax | both nondax]
read (Bad address) len [4096 | 16777216 | 67108864] dio dax to nondax
---
The write error is expected, as F2FS does not support unaligned direct 
IO (the 1024-byte length is not aligned to the 4 KB block size).

The read error is more complex. In the test script, when we mmap the src 
file (dax), the flags (VM_MIXEDMAP | VM_HUGEPAGE) are added to 
vma->vm_flags. Later on, when we write to the dest file (nondax) and 
then read it back by direct IO with the DAX mapping as the user buffer, 
we fail to get pages from this "special" vma.

Functions involved:
f2fs_direct_IO
blockdev_direct_IO
__blockdev_direct_IO
do_direct_IO
dio_get_page
dio_refill_pages
iov_iter_get_pages
get_user_pages_unlocked
__get_user_pages_fast
__get_user_pages_unlocked
__get_user_pages_locked
__get_user_pages
follow_page_mask
follow_p4d_mask
follow_pud_mask
follow_pmd_mask
follow_page_pte
	vm_normal_page
	follow_pfn_pte

In my test environment HAVE_PTE_SPECIAL is true, and vm_normal_page() 
returns NULL because VM_MIXEDMAP is set in vm_flags. follow_page_pte() 
then calls follow_pfn_pte(), which returns -EFAULT. This is where the 
"Bad address" error ultimately comes from.

This error also occurs in EXT4-DAX for similar reasons.

Thanks,


> Hi Qiuyang,
>
> This fails xfstests/generic/413.
>
> Thanks,
>
> On 07/20, sunqiuyang wrote:
>> From: Qiuyang Sun <sunqiuyang@huawei.com>
>>
>> This patch implements Direct Access (DAX) in F2FS, including:
>>  - a mount option to choose whether to enable DAX or not
>>  - read/write and mmap of regular files in the DAX way
>>  - zero-out of unaligned partial blocks in the DAX way
>>  - garbage collection of DAX files, by mapping both old and new physical
>>    addresses of a data page into memory and copy data between them directly
>>  - incompatibility of DAX with inline data, atomic or volatile write,
>>    collapse|insert_range, etc.
>>
>> Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com>
>> ---
>> Changelog v7 -> v8:
>>  - Introduce the macro f2fs_dax_file() to judge if a file is DAX for cases
>>    when CONFIG_FS_DAX is set or not
>>  - Return -ENOTSUPP when an operation does not support DAX
>>  - In f2fs_iomap_begin(), convert the inline data of an inode (if any)
>>    before mapping blocks
>>  - Minor cleanups
>> ---
>>  Documentation/filesystems/f2fs.txt |   2 +
>>  fs/f2fs/data.c                     | 132 +++++++++++++++++++++++++-
>>  fs/f2fs/f2fs.h                     |  15 +++
>>  fs/f2fs/file.c                     | 183 ++++++++++++++++++++++++++++++++++++-
>>  fs/f2fs/gc.c                       | 103 ++++++++++++++++++++-
>>  fs/f2fs/inline.c                   |   3 +
>>  fs/f2fs/inode.c                    |   8 +-
>>  fs/f2fs/namei.c                    |   5 +
>>  fs/f2fs/super.c                    |  15 +++
>>  9 files changed, 454 insertions(+), 12 deletions(-)
>>
>> diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
>> index 273ccb2..c86c421 100644
>> --- a/Documentation/filesystems/f2fs.txt
>> +++ b/Documentation/filesystems/f2fs.txt
>> @@ -164,6 +164,8 @@ io_bits=%u             Set the bit size of write IO requests. It should be set
>>                         with "mode=lfs".
>>  usrquota               Enable plain user disk quota accounting.
>>  grpquota               Enable plain group disk quota accounting.
>> +dax                    Use direct access (no page cache). See
>> +                       Documentation/filesystems/dax.txt.
>>
>>  ================================================================================
>>  DEBUGFS ENTRIES
>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>> index 87c1f41..4eb4b76 100644
>> --- a/fs/f2fs/data.c
>> +++ b/fs/f2fs/data.c
>> @@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  				err = -EIO;
>>  				goto sync_out;
>>  			}
>> +			/*
>> +			 * If newly allocated blocks are to be zeroed out later,
>> +			 * a single f2fs_map_blocks must not contain both old
>> +			 * and new blocks at the same time.
>> +			 */
>> +			if (flag == F2FS_GET_BLOCK_ZERO
>> +					&& (map->m_flags & F2FS_MAP_MAPPED)
>> +					&& !(map->m_flags & F2FS_MAP_NEW))
>> +				goto sync_out;
>>  			if (flag == F2FS_GET_BLOCK_PRE_AIO) {
>>  				if (blkaddr == NULL_ADDR) {
>>  					prealloc++;
>> @@ -938,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  						blkaddr != NEW_ADDR)
>>  				goto sync_out;
>>  		}
>> +	} else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
>> +		goto sync_out;
>>  	}
>>
>>  	if (flag == F2FS_GET_BLOCK_PRE_AIO)
>> @@ -996,6 +1007,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>>  	goto next_dnode;
>>
>>  sync_out:
>> +	if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
>> +		clean_bdev_aliases(inode->i_sb->s_bdev,
>> +				map->m_pblk, map->m_len);
>> +		err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
>> +				map->m_len, GFP_NOFS);
>> +	}
>>  	f2fs_put_dnode(&dn);
>>  unlock_out:
>>  	if (create) {
>> @@ -1808,16 +1825,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
>>  	return 0;
>>  }
>>
>> -static void f2fs_write_failed(struct address_space *mapping, loff_t to)
>> +static void f2fs_write_failed(struct address_space *mapping, loff_t to,
>> +								bool lock)
>>  {
>>  	struct inode *inode = mapping->host;
>>  	loff_t i_size = i_size_read(inode);
>>
>>  	if (to > i_size) {
>> -		down_write(&F2FS_I(inode)->i_mmap_sem);
>> +		if (lock)
>> +			down_write(&F2FS_I(inode)->i_mmap_sem);
>>  		truncate_pagecache(inode, i_size);
>>  		truncate_blocks(inode, i_size, true);
>> -		up_write(&F2FS_I(inode)->i_mmap_sem);
>> +		if (lock)
>> +			up_write(&F2FS_I(inode)->i_mmap_sem);
>>  	}
>>  }
>>
>> @@ -2000,7 +2020,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
>>
>>  fail:
>>  	f2fs_put_page(page, 1);
>> -	f2fs_write_failed(mapping, pos + len);
>> +	f2fs_write_failed(mapping, pos + len, true);
>>  	return err;
>>  }
>>
>> @@ -2077,7 +2097,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>>  		if (err > 0)
>>  			set_inode_flag(inode, FI_UPDATE_WRITE);
>>  		else if (err < 0)
>> -			f2fs_write_failed(mapping, offset + count);
>> +			f2fs_write_failed(mapping, offset + count, true);
>>  	}
>>
>>  	trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
>> @@ -2274,3 +2294,105 @@ int f2fs_migrate_page(struct address_space *mapping,
>>  	.migratepage    = f2fs_migrate_page,
>>  #endif
>>  };
>> +
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/iomap.h>
>> +#include <linux/dax.h>
>> +
>> +static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
>> +	loff_t length, unsigned int flags, struct iomap *iomap)
>> +{
>> +	struct block_device *bdev;
>> +	unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
>> +	unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
>> +	struct f2fs_map_blocks map;
>> +	int ret;
>> +
>> +	ret = f2fs_convert_inline_inode(inode);
>> +	if (ret)
>> +		return ret;
>> +
>> +	map.m_lblk = first_block;
>> +	map.m_len = last_block - first_block + 1;
>> +	map.m_next_pgofs = NULL;
>> +
>> +	if (!(flags & IOMAP_WRITE)) {
>> +		ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
>> +	} else {
>> +	/* i_size should be kept here and changed later in f2fs_iomap_end */
>> +		loff_t original_i_size = i_size_read(inode);
>> +
>> +		ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
>> +		if (i_size_read(inode) > original_i_size) {
>> +			f2fs_i_size_write(inode, original_i_size);
>> +			if (ret)
>> +				f2fs_write_failed(inode->i_mapping,
>> +						offset + length,
>> +						!(flags & IOMAP_FAULT));
>> +		}
>> +	}
>> +
>> +	if (ret)
>> +		return ret;
>> +
>> +	iomap->flags = 0;
>> +	bdev = inode->i_sb->s_bdev;
>> +	iomap->bdev = bdev;
>> +	if (blk_queue_dax(bdev->bd_queue))
>> +		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>> +	else
>> +		iomap->dax_dev = NULL;
>> +	iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
>> +
>> +	if (map.m_len == 0) {
>> +		iomap->type = IOMAP_HOLE;
>> +		iomap->blkno = IOMAP_NULL_BLOCK;
>> +		iomap->length = F2FS_BLKSIZE;
>> +	} else {
>> +		if (map.m_flags & F2FS_MAP_UNWRITTEN) {
>> +			iomap->type = IOMAP_UNWRITTEN;
>> +		} else if (map.m_flags & F2FS_MAP_MAPPED) {
>> +			iomap->type = IOMAP_MAPPED;
>> +		} else {
>> +			WARN_ON_ONCE(1);
>> +			return -EIO;
>> +		}
>> +		iomap->blkno =
>> +			(sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
>> +		iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
>> +	}
>> +
>> +	if (map.m_flags & F2FS_MAP_NEW)
>> +		iomap->flags |= IOMAP_F_NEW;
>> +	return 0;
>> +}
>> +
>> +static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>> +	ssize_t written, unsigned int flags, struct iomap *iomap)
>> +{
>> +	put_dax(iomap->dax_dev);
>> +	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
>> +		return 0;
>> +
>> +	if (offset + written > i_size_read(inode))
>> +		f2fs_i_size_write(inode, offset + written);
>> +
>> +	if (iomap->offset + iomap->length >
>> +			ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
>> +		block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
>> +		block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
>> +
>> +		if (written_blk < end_blk)
>> +			f2fs_write_failed(inode->i_mapping, offset + length,
>> +									true);
>> +	}
>> +
>> +	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
>> +	return 0;
>> +}
>> +
>> +struct iomap_ops f2fs_iomap_ops = {
>> +	.iomap_begin	= f2fs_iomap_begin,
>> +	.iomap_end	= f2fs_iomap_end,
>> +};
>> +#endif
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 70777a8..b6d629a 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -91,6 +91,11 @@ struct f2fs_fault_info {
>>  #define F2FS_MOUNT_LFS			0x00040000
>>  #define F2FS_MOUNT_USRQUOTA		0x00080000
>>  #define F2FS_MOUNT_GRPQUOTA		0x00100000
>> +#ifdef CONFIG_FS_DAX
>> +#define F2FS_MOUNT_DAX			0x00400000 /* Direct Access */
>> +#else
>> +#define F2FS_MOUNT_DAX			0
>> +#endif
>>
>>  #define clear_opt(sbi, option)	((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
>>  #define set_opt(sbi, option)	((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
>> @@ -482,6 +487,7 @@ struct f2fs_map_blocks {
>>  #define F2FS_GET_BLOCK_BMAP		3
>>  #define F2FS_GET_BLOCK_PRE_DIO		4
>>  #define F2FS_GET_BLOCK_PRE_AIO		5
>> +#define F2FS_GET_BLOCK_ZERO		6
>>
>>  /*
>>   * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
>> @@ -506,6 +512,12 @@ struct f2fs_map_blocks {
>>  #define file_keep_isize(inode)	is_file(inode, FADVISE_KEEP_SIZE_BIT)
>>  #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
>>
>> +#ifdef CONFIG_FS_DAX
>> +#define f2fs_dax_file(inode)	IS_DAX(inode)
>> +#else
>> +#define f2fs_dax_file(inode)	false
>> +#endif
>> +
>>  #define DEF_DIR_LEVEL		0
>>
>>  struct f2fs_inode_info {
>> @@ -2439,6 +2451,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
>>  int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
>>  			struct page *page, enum migrate_mode mode);
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +extern struct iomap_ops f2fs_iomap_ops;
>> +#endif
>>
>>  /*
>>   * gc.c
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 2706130..e26114f 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -23,6 +23,10 @@
>>  #include <linux/uio.h>
>>  #include <linux/uuid.h>
>>  #include <linux/file.h>
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/dax.h>
>> +#include <linux/iomap.h>
>> +#endif
>>
>>  #include "f2fs.h"
>>  #include "node.h"
>> @@ -121,6 +125,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  	.page_mkwrite	= f2fs_vm_page_mkwrite,
>>  };
>>
>> +#ifdef CONFIG_FS_DAX
>> +static int f2fs_dax_huge_fault(struct vm_fault *vmf,
>> +	enum page_entry_size pe_size)
>> +{
>> +	int result;
>> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>> +	struct super_block *sb = inode->i_sb;
>> +	bool write = vmf->flags & FAULT_FLAG_WRITE;
>> +
>> +	if (write) {
>> +		sb_start_pagefault(sb);
>> +		file_update_time(vmf->vma->vm_file);
>> +	}
>> +	down_read(&F2FS_I(inode)->i_mmap_sem);
>> +	result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->i_mmap_sem);
>> +	if (write)
>> +		sb_end_pagefault(sb);
>> +
>> +	return result;
>> +}
>> +
>> +static int f2fs_dax_fault(struct vm_fault *vmf)
>> +{
>> +	return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
>> +}
>> +
>> +static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
>> +{
>> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>> +	struct super_block *sb = inode->i_sb;
>> +	loff_t size;
>> +	int ret;
>> +
>> +	sb_start_pagefault(sb);
>> +	file_update_time(vmf->vma->vm_file);
>> +	down_read(&F2FS_I(inode)->i_mmap_sem);
>> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
>> +	if (vmf->pgoff >= size)
>> +		ret = VM_FAULT_SIGBUS;
>> +	else
>> +		ret = dax_pfn_mkwrite(vmf);
>> +	up_read(&F2FS_I(inode)->i_mmap_sem);
>> +	sb_end_pagefault(sb);
>> +
>> +	return ret;
>> +}
>> +
>> +static const struct vm_operations_struct f2fs_dax_vm_ops = {
>> +	.fault		= f2fs_dax_fault,
>> +	.huge_fault	= f2fs_dax_huge_fault,
>> +	.page_mkwrite	= f2fs_dax_fault,
>> +	.pfn_mkwrite	= f2fs_dax_pfn_mkwrite,
>> +};
>> +#else
>> +#define f2fs_dax_vm_ops f2fs_file_vm_ops
>> +#endif
>> +
>>  static int get_parent_ino(struct inode *inode, nid_t *pino)
>>  {
>>  	struct dentry *dentry;
>> @@ -436,7 +498,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
>>  		return err;
>>
>>  	file_accessed(file);
>> -	vma->vm_ops = &f2fs_file_vm_ops;
>> +
>> +	if (f2fs_dax_file(inode)) {
>> +		vma->vm_ops = &f2fs_dax_vm_ops;
>> +		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
>> +	} else {
>> +		vma->vm_ops = &f2fs_file_vm_ops;
>> +	}
>>  	return 0;
>>  }
>>
>> @@ -519,6 +587,16 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
>>  	if (!offset && !cache_only)
>>  		return 0;
>>
>> +	if (f2fs_dax_file(inode)) {
>> +		int ret;
>> +
>> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
>> +			NULL, &f2fs_iomap_ops);
>> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		return ret;
>> +	}
>> +
>>  	if (cache_only) {
>>  		page = find_lock_page(mapping, index);
>>  		if (page && PageUptodate(page))
>> @@ -799,6 +877,17 @@ static int fill_zero(struct inode *inode, pgoff_t index,
>>  	if (!len)
>>  		return 0;
>>
>> +	if (f2fs_dax_file(inode)) {
>> +		int ret;
>> +
>> +		down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		ret = iomap_zero_range(inode,
>> +			F2FS_BLK_TO_BYTES((loff_t)index) + start,
>> +			len, NULL, &f2fs_iomap_ops);
>> +		up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +		return ret;
>> +	}
>> +
>>  	f2fs_balance_fs(sbi, true);
>>
>>  	f2fs_lock_op(sbi);
>> @@ -1121,6 +1210,10 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>>  	loff_t new_size;
>>  	int ret;
>>
>> +	/* The current implementation does not apply to DAX files. */
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (offset + len >= i_size_read(inode))
>>  		return -EINVAL;
>>
>> @@ -1311,6 +1404,10 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
>>  	loff_t new_size;
>>  	int ret = 0;
>>
>> +	/* The current implementation does not apply to DAX files. */
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	new_size = i_size_read(inode) + len;
>>  	ret = inode_newsize_ok(inode, new_size);
>>  	if (ret)
>> @@ -1578,6 +1675,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1627,6 +1727,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1663,6 +1766,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1698,6 +1804,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -1727,6 +1836,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>>  	struct inode *inode = file_inode(filp);
>>  	int ret;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!inode_owner_or_capable(inode))
>>  		return -EACCES;
>>
>> @@ -2141,6 +2253,9 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
>>  	struct f2fs_defragment range;
>>  	int err;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return -ENOTSUPP;
>> +
>>  	if (!capable(CAP_SYS_ADMIN))
>>  		return -EPERM;
>>
>> @@ -2190,6 +2305,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
>>  	size_t dst_osize;
>>  	int ret;
>>
>> +	if (f2fs_dax_file(src) || f2fs_dax_file(dst))
>> +		return -ENOTSUPP;
>> +
>>  	if (file_in->f_path.mnt != file_out->f_path.mnt ||
>>  				src->i_sb != dst->i_sb)
>>  		return -EXDEV;
>> @@ -2431,6 +2549,62 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>  	}
>>  }
>>
>> +#ifdef CONFIG_FS_DAX
>> +static ssize_t f2fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	ssize_t ret;
>> +
>> +	inode_lock_shared(inode);
>> +
>> +	if (!IS_DAX(inode)) {
>> +		inode_unlock_shared(inode);
>> +		return generic_file_read_iter(iocb, to);
>> +	}
>> +
>> +	down_read(&F2FS_I(inode)->dio_rwsem[READ]);
>> +	ret = dax_iomap_rw(iocb, to, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->dio_rwsem[READ]);
>> +	inode_unlock_shared(inode);
>> +
>> +	file_accessed(iocb->ki_filp);
>> +	return ret;
>> +}
>> +
>> +static ssize_t f2fs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	ssize_t ret;
>> +
>> +	ret = file_remove_privs(iocb->ki_filp);
>> +	if (ret)
>> +		return ret;
>> +	ret = file_update_time(iocb->ki_filp);
>> +	if (ret)
>> +		return ret;
>> +
>> +	down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +	ret = dax_iomap_rw(iocb, from, &f2fs_iomap_ops);
>> +	up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
>> +
>> +	return ret;
>> +}
>> +#else
>> +#define f2fs_dax_read_iter	generic_file_read_iter
>> +#define f2fs_dax_write_iter	__generic_file_write_iter
>> +#endif
>> +
>> +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>> +{
>> +	if (!iov_iter_count(to))
>> +		return 0; /* skip atime */
>> +
>> +	if (f2fs_dax_file(file_inode(iocb->ki_filp)))
>> +		return f2fs_dax_read_iter(iocb, to);
>> +
>> +	return generic_file_read_iter(iocb, to);
>> +}
>> +
>>  static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  {
>>  	struct file *file = iocb->ki_filp;
>> @@ -2452,7 +2626,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  			return err;
>>  		}
>>  		blk_start_plug(&plug);
>> -		ret = __generic_file_write_iter(iocb, from);
>> +		if (f2fs_dax_file(inode))
>> +			ret = f2fs_dax_write_iter(iocb, from);
>> +		else
>> +			ret = __generic_file_write_iter(iocb, from);
>>  		blk_finish_plug(&plug);
>>  		clear_inode_flag(inode, FI_NO_PREALLOC);
>>  	}
>> @@ -2501,7 +2678,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>>
>>  const struct file_operations f2fs_file_operations = {
>>  	.llseek		= f2fs_llseek,
>> -	.read_iter	= generic_file_read_iter,
>> +	.read_iter	= f2fs_file_read_iter,
>>  	.write_iter	= f2fs_file_write_iter,
>>  	.open		= f2fs_file_open,
>>  	.release	= f2fs_release_file,
>> diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
>> index fa3d2e2..06b6859 100644
>> --- a/fs/f2fs/gc.c
>> +++ b/fs/f2fs/gc.c
>> @@ -700,6 +700,100 @@ static void move_encrypted_block(struct inode *inode, block_t bidx,
>>  	f2fs_put_page(page, 1);
>>  }
>>
>> +#ifdef CONFIG_FS_DAX
>> +#include <linux/dax.h>
>> +
>> +static void dax_move_data_page(struct inode *inode, block_t bidx,
>> +				unsigned int segno, int off)
>> +{
>> +	struct block_device *bdev = inode->i_sb->s_bdev;
>> +	struct dax_device *dax_dev;
>> +	struct dnode_of_data dn;
>> +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>> +	struct f2fs_summary sum;
>> +	struct node_info ni;
>> +	block_t old_blkaddr, new_blkaddr;
>> +	int err, id;
>> +	long map_len;
>> +	pgoff_t pgoff;
>> +	void *kaddr_old, *kaddr_new;
>> +	pfn_t pfn;
>> +
>> +	f2fs_bug_on(sbi, f2fs_is_atomic_file(inode));
>> +
>> +	if (!check_valid_map(sbi, segno, off))
>> +		return;
>> +
>> +	f2fs_bug_on(sbi, !blk_queue_dax(bdev->bd_queue));
>> +
>> +	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>> +
>> +	if (!down_write_trylock(&F2FS_I(inode)->i_mmap_sem))
>> +		goto release;
>> +
>> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
>> +			PAGE_SIZE, 1);
>> +	/* find the old block address */
>> +	set_new_dnode(&dn, inode, NULL, NULL, 0);
>> +	err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
>> +	if (err)
>> +		goto out;
>> +	old_blkaddr = dn.data_blkaddr;
>> +	/* This page is already truncated */
>> +	if (old_blkaddr == NULL_ADDR)
>> +		goto put_dn;
>> +
>> +	/* allocate a new block address */
>> +	get_node_info(sbi, dn.nid, &ni);
>> +	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
>> +	allocate_data_block(sbi, NULL, old_blkaddr, &new_blkaddr,
>> +			&sum, CURSEG_COLD_DATA, NULL, false);
>> +
>> +	/* copy data page from old to new address in dax_bdev */
>> +	id = dax_read_lock();
>> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(old_blkaddr),
>> +			PAGE_SIZE, &pgoff);
>> +	if (err)
>> +		goto recover;
>> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_old, &pfn);
>> +	if (map_len < 0)
>> +		goto recover;
>> +	err = bdev_dax_pgoff(bdev, SECTOR_FROM_BLOCK(new_blkaddr),
>> +			PAGE_SIZE, &pgoff);
>> +	if (err)
>> +		goto recover;
>> +	map_len = dax_direct_access(dax_dev, pgoff, 1, &kaddr_new, &pfn);
>> +	if (map_len < 0)
>> +		goto recover;
>> +	copy_page((void __force *)kaddr_new, (void __force *)kaddr_old);
>> +
>> +	f2fs_update_data_blkaddr(&dn, new_blkaddr);
>> +	set_inode_flag(inode, FI_APPEND_WRITE);
>> +	if (bidx == 0)
>> +		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
>> +
>> +recover:
>> +	if (err || map_len < 0)
>> +		__f2fs_replace_block(sbi, &sum, new_blkaddr, old_blkaddr,
>> +							true, true);
>> +	dax_read_unlock(id);
>> +put_dn:
>> +	f2fs_put_dnode(&dn);
>> +out:
>> +	unmap_mapping_range(inode->i_mapping, (loff_t)bidx << PAGE_SHIFT,
>> +			PAGE_SIZE, 1);
>> +	up_write(&F2FS_I(inode)->i_mmap_sem);
>> +release:
>> +	put_dax(dax_dev);
>> +}
>> +#else
>> +static void dax_move_data_page(struct inode *inode, block_t bidx,
>> +				unsigned int segno, int off)
>> +{
>> +	BUG_ON(1);
>> +}
>> +#endif
>> +
>>  static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
>>  							unsigned int segno, int off)
>>  {
>> @@ -818,9 +912,10 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>>  			if (IS_ERR(inode) || is_bad_inode(inode))
>>  				continue;
>>
>> -			/* if encrypted inode, let's go phase 3 */
>> +			/* if DAX or encrypted inode, let's go phase 3 */
>>  			if (f2fs_encrypted_inode(inode) &&
>> -						S_ISREG(inode->i_mode)) {
>> +						S_ISREG(inode->i_mode) ||
>> +						f2fs_dax_file(inode)) {
>>  				add_gc_inode(gc_list, inode);
>>  				continue;
>>  			}
>> @@ -858,7 +953,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
>>
>>  			start_bidx = start_bidx_of_node(nofs, inode)
>>  								+ ofs_in_node;
>> -			if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>> +			if (f2fs_dax_file(inode))
>> +				dax_move_data_page(inode, start_bidx, segno, off);
>> +			else if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>>  				move_encrypted_block(inode, start_bidx, segno, off);
>>  			else
>>  				move_data_page(inode, start_bidx, gc_type, segno, off);
>> diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
>> index 2082816..b8c9116 100644
>> --- a/fs/f2fs/inline.c
>> +++ b/fs/f2fs/inline.c
>> @@ -28,6 +28,9 @@ bool f2fs_may_inline_data(struct inode *inode)
>>  	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
>>  		return false;
>>
>> +	if (f2fs_dax_file(inode))
>> +		return false;
>> +
>>  	return true;
>>  }
>>
>> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
>> index 6cd312a..7741461 100644
>> --- a/fs/f2fs/inode.c
>> +++ b/fs/f2fs/inode.c
>> @@ -43,8 +43,14 @@ void f2fs_set_inode_flags(struct inode *inode)
>>  		new_fl |= S_NOATIME;
>>  	if (flags & FS_DIRSYNC_FL)
>>  		new_fl |= S_DIRSYNC;
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(F2FS_I_SB(inode), DAX) && S_ISREG(inode->i_mode) &&
>> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
>> +		!f2fs_is_atomic_file(inode) && !f2fs_is_volatile_file(inode))
>> +		new_fl |= S_DAX;
>> +#endif
>>  	inode_set_flags(inode, new_fl,
>> -			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
>> +			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
>>  }
>>
>>  static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
>> diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
>> index 760d852..afc52e0 100644
>> --- a/fs/f2fs/namei.c
>> +++ b/fs/f2fs/namei.c
>> @@ -70,6 +70,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
>>  	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
>>  		f2fs_set_encrypted_inode(inode);
>>
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(sbi, DAX) && S_ISREG(inode->i_mode) &&
>> +		!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode))
>> +		inode->i_flags |= S_DAX;
>> +#endif
>>  	set_inode_flag(inode, FI_NEW_INODE);
>>
>>  	if (test_opt(sbi, INLINE_XATTR))
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index 32e4c02..aefe931 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -109,6 +109,7 @@ enum {
>>  	Opt_nolazytime,
>>  	Opt_usrquota,
>>  	Opt_grpquota,
>> +	Opt_dax,
>>  	Opt_err,
>>  };
>>
>> @@ -146,6 +147,7 @@ enum {
>>  	{Opt_nolazytime, "nolazytime"},
>>  	{Opt_usrquota, "usrquota"},
>>  	{Opt_grpquota, "grpquota"},
>> +	{Opt_dax, "dax"},
>>  	{Opt_err, NULL},
>>  };
>>
>> @@ -399,6 +401,15 @@ static int parse_options(struct super_block *sb, char *options)
>>  					"quota operations not supported");
>>  			break;
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +		case Opt_dax:
>> +			set_opt(sbi, DAX);
>> +			break;
>> +#else
>> +		case Opt_dax:
>> +			f2fs_msg(sb, KERN_INFO, "dax option not supported");
>> +			break;
>> +#endif
>>  		default:
>>  			f2fs_msg(sb, KERN_ERR,
>>  				"Unrecognized mount option \"%s\" or missing value",
>> @@ -814,6 +825,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
>>  	if (test_opt(sbi, GRPQUOTA))
>>  		seq_puts(seq, ",grpquota");
>>  #endif
>> +#ifdef CONFIG_FS_DAX
>> +	if (test_opt(sbi, DAX))
>> +		seq_puts(seq, ",dax");
>> +#endif
>>
>>  	return 0;
>>  }
>> --
>> 1.8.3.1
>
> .
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-20 12:10 ` sunqiuyang
  (?)
@ 2017-07-26  0:15   ` Dan Williams
  -1 siblings, 0 replies; 33+ messages in thread
From: Dan Williams @ 2017-07-26  0:15 UTC (permalink / raw)
  To: sunqiuyang
  Cc: linux-fsdevel, Jaegeuk Kim, linux-nvdimm@lists.01.org,
	Linux Kernel Mailing List, linux-f2fs-devel

[ adding linux-nvdimm ]

On Thu, Jul 20, 2017 at 5:10 AM, sunqiuyang <sunqiuyang@huawei.com> wrote:
> From: Qiuyang Sun <sunqiuyang@huawei.com>
>
> This patch implements Direct Access (DAX) in F2FS, including:
>  - a mount option to choose whether to enable DAX or not

We're in the process of walking back and potentially deprecating the
use of the dax mount option for xfs and ext4 since dax can have
negative performance implications if page cache memory happens to be
faster than pmem. It should be limited to applications that
specifically want the semantic, not globally enabled for the entire
mount. xfs has gone ahead and added the XFS_DIFLAG2_DAX inode flag for
per-inode enabling of dax.

I'm wondering if any new filesystem that adds dax support at this
point should do so with inode flags and not a mount option?
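
As a rough illustration, a per-inode opt-in could be driven from userspace
through the generic inode-flag ioctls. FS_IOC_GETFLAGS/FS_IOC_SETFLAGS exist
today; the FS_DAX_FL bit below is an assumption for illustration only, and it
is not the mechanism xfs uses for XFS_DIFLAG2_DAX:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

#ifndef FS_DAX_FL
#define FS_DAX_FL 0x02000000    /* assumed bit, not defined by this patch */
#endif

/* Set the (assumed) per-file DAX flag on an existing file. */
static int enable_dax(const char *path)
{
        int fd, flags, ret = -1;

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
                flags |= FS_DAX_FL;
                ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
        }
        close(fd);
        return ret;
}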

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26  0:15   ` Dan Williams
@ 2017-07-26  2:16     ` Jaegeuk Kim
  -1 siblings, 0 replies; 33+ messages in thread
From: Jaegeuk Kim @ 2017-07-26  2:16 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-fsdevel, linux-nvdimm@lists.01.org, linux-f2fs-devel,
	sunqiuyang, Linux Kernel Mailing List

Hi Dan,

On 07/25, Dan Williams wrote:
> [ adding linux-nvdimm ]
> 
> On Thu, Jul 20, 2017 at 5:10 AM, sunqiuyang <sunqiuyang@huawei.com> wrote:
> > From: Qiuyang Sun <sunqiuyang@huawei.com>
> >
> > This patch implements Direct Access (DAX) in F2FS, including:
> >  - a mount option to choose whether to enable DAX or not
> 
> We're in the process of walking back and potentially deprecating the
> use of the dax mount option for xfs and ext4 since dax can have
> negative performance implications if page cache memory happens to be
> faster than pmem. It should be limited to applications that
> specifically want the semantic, not globally enabled for the entire
> mount. xfs has gone ahead and added the XFS_DIFLAG2_DAX inode flag for
> per-inode enabling of dax.

Thank you so much for pointing this out. So, is there a plan to define a
generic inode flag to enable dax via inode_set_flags()? Or does each filesystem
need to handle it individually, like xfs does?

> 
> I'm wondering if any new filesystem that adds dax support at this
> point should do so with inode flags and not a mount option?

Anyway, in that case, I have to postpone merging this patch for a while.

Thanks,

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26  2:16     ` Jaegeuk Kim
  (?)
@ 2017-07-26  6:47       ` sunqiuyang
  -1 siblings, 0 replies; 33+ messages in thread
From: sunqiuyang @ 2017-07-26  6:47 UTC (permalink / raw)
  To: Jaegeuk Kim, Dan Williams
  Cc: linux-fsdevel, linux-nvdimm@lists.01.org,
	Linux Kernel Mailing List, linux-f2fs-devel

Hi, 

Considering the current interfaces of F2FS and EXT4, my thought is that we can define a generic user-modifiable flag FS_DAX_FL, which can be included in the i_flags field of [f2fs | ext4]_inode_info. Thus, DAX can be enabled in either of the two ways below: 

1) mount the FS with a "dax" option, so that all files created will have the flag S_DAX set in the VFS inode, and the flag FS_DAX_FL set in [f2fs | ext4]_inode_info, by default.

2) mount the FS without "dax", and enable DAX per-inode from
f2fs_ioctl_setflags() => f2fs_set_inode_flags(), as sketched below.
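
A minimal sketch of what 2) could look like on the f2fs side, assuming the
FS_DAX_FL bit above is stored in f2fs_inode_info->i_flags (the flag and the
helper name are assumptions; inode_set_flags(), S_DAX and the f2fs predicates
used here already exist):

/*
 * Sketch only: FS_DAX_FL is the proposed user-visible flag, not something
 * defined by this patch.
 */
static void f2fs_sync_dax_flag(struct inode *inode)
{
        unsigned int new_fl = 0;

        if ((F2FS_I(inode)->i_flags & FS_DAX_FL) &&
            S_ISREG(inode->i_mode) &&
            !f2fs_has_inline_data(inode) &&
            !f2fs_encrypted_inode(inode))
                new_fl |= S_DAX;

        inode_set_flags(inode, new_fl, S_DAX);
}

f2fs_set_inode_flags() could call this after handling the existing flags, so
the mount-wide default and the per-inode setting would funnel through the
same place.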
 
Thanks,

________________________________________
From: Jaegeuk Kim [jaegeuk@kernel.org]
Sent: Wednesday, July 26, 2017 10:16
To: Dan Williams
Cc: sunqiuyang; Linux Kernel Mailing List; linux-fsdevel; linux-f2fs-devel@lists.sourceforge.net; linux-nvdimm@lists.01.org
Subject: Re: [PATCH v8 1/1] f2fs: dax: implement direct access

Hi Dan,

On 07/25, Dan Williams wrote:
> [ adding linux-nvdimm ]
>
> On Thu, Jul 20, 2017 at 5:10 AM, sunqiuyang <sunqiuyang@huawei.com> wrote:
> > From: Qiuyang Sun <sunqiuyang@huawei.com>
> >
> > This patch implements Direct Access (DAX) in F2FS, including:
> >  - a mount option to choose whether to enable DAX or not
>
> We're in the process of walking back and potentially deprecating the
> use of the dax mount option for xfs and ext4 since dax can have
> negative performance implications if page cache memory happens to be
> faster than pmem. It should be limited to applications that
> specifically want the semantic, not globally enabled for the entire
> mount. xfs has gone ahead and added the XFS_DIFLAG2_DAX inode flag for
> per-inode enabling of dax.

Thank you so much for pointing this out. So, is there a plan to define a
generic inode flag to enable dax via inode_set_flags()? Or does each filesystem
need to handle it individually, like xfs does?

>
> I'm wondering if any new filesystem that adds dax support at this
> point should do so with inode flags and not a mount option?

Anyway, in that case, I have to postpone merging this patch for a while.

Thanks,

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26  0:15   ` Dan Williams
  (?)
@ 2017-07-26  7:26     ` Christoph Hellwig
  -1 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2017-07-26  7:26 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, sunqiuyang

On Tue, Jul 25, 2017 at 05:15:10PM -0700, Dan Williams wrote:
> We're in the process of walking back and potentially deprecating the
> use of the dax mount option for xfs and ext4 since dax can have
> negative performance implications if page cache memory happens to be
> faster than pmem. It should be limited to applications that
> specifically want the semantic, not globally enabled for the entire
> mount. xfs has gone ahead and added the XFS_DIFLAG2_DAX inode flag for
> per-inode enabling of dax.
> 
> I'm wondering if any new filesystem that adds dax support at this
> point should do so with inode flags and not a mount option?

That tradeoff is not one that the application should make, but one that
should depend on the storage medium.  To make things worse it might also
depend on the type of access.  E.g. with certain media it makes a lot of
sense to cache writes in the page cache, but generally not reads.
I've been spending some time analyzing how that could be done, but
I haven't made real progress on it.

XFS_DIFLAG2_DAX is unfortunately totally unhelpful with that.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26  7:26     ` Christoph Hellwig
@ 2017-07-26 16:53       ` Dan Williams
  -1 siblings, 0 replies; 33+ messages in thread
From: Dan Williams @ 2017-07-26 16:53 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, sunqiuyang

On Wed, Jul 26, 2017 at 12:26 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, Jul 25, 2017 at 05:15:10PM -0700, Dan Williams wrote:
>> We're in the process of walking back and potentially deprecating the
>> use of the dax mount option for xfs and ext4 since dax can have
>> negative performance implications if page cache memory happens to be
>> faster than pmem. It should be limited to applications that
>> specifically want the semantic, not globally enabled for the entire
>> mount. xfs has gone ahead and added the XFS_DIFLAG2_DAX inode flag for
>> per-inode enabling of dax.
>>
>> I'm wondering if any new filesystem that adds dax support at this
>> point should do so with inode flags and not a mount option?
>
> That tradeoff is not one that the application should make, but one that
> should depend on the storage medium. To make things worse it might also
> depend on the type of access. E.g. with certain media it makes a lot of
> sense to cache writes in the page cache, but generally not reads.
> I've been spending some time to analyze how that could be done, but
> I've not made real progress on it.
>
> XFS_DIFLAG2_DAX is unfortunately totally unhelpful with that.

It allows opt-in for applications, or administrators of those
applications, that know the type of access. There's also the new HMAT
(Heterogeneous Memory Attribute Table) in ACPI that can indicate the
relative performance of pmem to system-ram if userspace needs data to
make a decision. It would be interesting to have an automatic policy
in the kernel, but we also need a mechanism for explicit
configurations.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 16:53       ` Dan Williams
@ 2017-07-26 17:01         ` Christoph Hellwig
  -1 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2017-07-26 17:01 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, Christoph Hellwig, linux-fsdevel, Jaegeuk Kim,
	sunqiuyang

On Wed, Jul 26, 2017 at 09:53:07AM -0700, Dan Williams wrote:
> It allows for opt-in for applications, or administrators of those
> applications, that know the type of access.

That's BS.  We need to provide the best possible way to access the
media to an application.  And whether that's DAX or the page cache
is an implementation detail that should not matter to the application.

Which doesn't mean there shouldn't be ways to override the default
that the kernel chose based on hardware details, but it's certainly
not something for the application to hardcode, but something for
the administrator to decide.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 17:01         ` Christoph Hellwig
  (?)
@ 2017-07-26 17:11           ` Dan Williams
  -1 siblings, 0 replies; 33+ messages in thread
From: Dan Williams @ 2017-07-26 17:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, sunqiuyang

On Wed, Jul 26, 2017 at 10:01 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Jul 26, 2017 at 09:53:07AM -0700, Dan Williams wrote:
>> It allows for opt-in for applications, or administrators of those
>> applications, that know the type of access.
>
> That's BS.  We need to provide the best possible way to access the
> media to an application.  And whether that's DAX or the page cache
> is an implementation detail that should not matter to the application.
>
> Which doesn't mean there shouldn't be ways to override the default
> that the kernel chose based on hardware details, but it's certainly
> not something for the application to hardcode, but something for
> the administrator to decide.

Until HMAT came along we had no data in the kernel on how to pick a sane
default, but we could now very easily make a "if pmem performance <
dram, disable dax by default" policy in the kernel.

The question for this patch is do we want to add yet another
filesystem that adds "-o dax" or require use of per-inode flags to
enable dax.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 17:11           ` Dan Williams
  (?)
@ 2017-07-26 17:20             ` Christoph Hellwig
  -1 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2017-07-26 17:20 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, Christoph Hellwig, linux-fsdevel, Jaegeuk Kim,
	sunqiuyang

On Wed, Jul 26, 2017 at 10:11:08AM -0700, Dan Williams wrote:
> Until HMAT came along we had no data in the kernel on how to pick a sane
> default, but we could now very easily make a "if pmem performance <
> dram, disable dax by default" policy in the kernel.

I'd rather do it the other way around - if HMAT is present and
pmem performance >= dram use dax.  Else require the explicit -o dax
for now to enable it.  If an explicit -o nodax is specified disable
DAX even if HMAT says it is faster.

> The question for this patch is do we want to add yet another
> filesystem that adds "-o dax" or require use of per-inode flags to
> enable dax.

Please stick to the mount option.  After spending a lot of time with
DAX and various memory technologies I'm pretty confident that the inode
flag is the wrong thing to do.
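
As a rough sketch of the default policy described above (nothing here exists
in the kernel as-is; pmem_is_faster_than_dram() is an assumed HMAT-derived
helper, and the option names mirror the ones discussed in this thread):

/* Sketch only; neither the helper nor the predicate exists today. */
enum dax_opt { DAX_AUTO, DAX_ON, DAX_OFF };

static bool fs_default_to_dax(struct block_device *bdev, enum dax_opt opt)
{
        if (opt == DAX_OFF)             /* explicit -o nodax always wins */
                return false;
        if (opt == DAX_ON)              /* explicit -o dax forces it on */
                return true;
        /* auto: bypass the page cache only when the medium is fast enough */
        return pmem_is_faster_than_dram(bdev);
}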

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 17:20             ` Christoph Hellwig
@ 2017-07-26 19:16               ` Dan Williams
  -1 siblings, 0 replies; 33+ messages in thread
From: Dan Williams @ 2017-07-26 19:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, sunqiuyang

On Wed, Jul 26, 2017 at 10:20 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Jul 26, 2017 at 10:11:08AM -0700, Dan Williams wrote:
>> Until HMAT came along we had no data in the kernel how to pick a sane
>> default, but we could now very easily make a "if pmem performance <
>> dram, disable dax by default" policy in the kernel.
>
> I'd rather do it the other way around - if HMAT is present and
> pmem performance >= dram use dax.  Else require the explicit -o dax
> for now to enable it.  If an explicit -o nodax is specified disable
> DAX even if HMAT says it is faster.

Silently turn on DAX if HMAT says it's ok? I think we would instead
want a "-o autodax" for that case and then "-o dax" and "-o nodax" for
the force cases.

>> The question for this patch is do we want to add yet another
>> filesystem that adds "-o dax" or require use of per-inode flags to
>> enable dax.
>
> Please stick to the mount option.  After spending a lot of time with
> DAX and various memory technologies I'm pretty confident that the inode
> flag is the wrong thing to do.

I think it's easier to administer than the dax mount option. If
someone wants dax on only in a sub-tree they can set the flag on that
parent directory and have a policy in dax filesystems that children
inherit the dax policy from the parent. That seems a better
administrative model than trying to get it all right globally at mount
time.
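
A sketch of that inheritance model, reusing the assumed FS_DAX_FL bit from
earlier in the thread (no such code exists in f2fs today; this is only to
illustrate the administrative model being argued for):

/* Sketch only: children pick up the DAX policy from their parent directory. */
static void f2fs_inherit_dax(struct inode *dir, struct inode *inode)
{
        if (!(F2FS_I(dir)->i_flags & FS_DAX_FL))        /* assumed flag */
                return;

        F2FS_I(inode)->i_flags |= FS_DAX_FL;    /* directories pass it on */
        if (S_ISREG(inode->i_mode))
                inode->i_flags |= S_DAX;        /* regular files get DAX I/O */
}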

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 19:16               ` Dan Williams
  (?)
@ 2017-07-26 20:11                 ` Christoph Hellwig
  -1 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2017-07-26 20:11 UTC (permalink / raw)
  To: Dan Williams
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, Christoph Hellwig, linux-fsdevel, Jaegeuk Kim,
	sunqiuyang

On Wed, Jul 26, 2017 at 12:16:11PM -0700, Dan Williams wrote:
> Silently turn on DAX if HMAT says it's ok?

Yes, absolutely.  I want my system to do the right thing by default,
and if HMAT says bypassing the page cache is a clear advantage it
should be the default.

> I think we would instead
> want a "-o autodax" for that case and then "-o dax" and "-o nodax" for
> the force cases.

Why?

> I think it's easier to administer than the dax mount option. If
> someone wants dax on only in a sub-tree they can set the flag on that
> parent directory and have a policy in dax filesystems that children
> inherit the dax policy from the parent. That seems a better
> administrative model than trying to get it all right globally at mount
> time.

And why exactly? If DAX is faster for file a in directory X it will
be just as fast for a file b in directory Y.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v8 1/1] f2fs: dax: implement direct access
  2017-07-26 20:11                 ` Christoph Hellwig
@ 2017-07-26 20:29                   ` Dan Williams
  -1 siblings, 0 replies; 33+ messages in thread
From: Dan Williams @ 2017-07-26 20:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvdimm@lists.01.org, Linux Kernel Mailing List,
	linux-f2fs-devel, linux-fsdevel, Jaegeuk Kim, sunqiuyang

On Wed, Jul 26, 2017 at 1:11 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, Jul 26, 2017 at 12:16:11PM -0700, Dan Williams wrote:
>> Silently turn on DAX if HMAT says it's ok?
>
> Yes, absolutely.  I want my system to do the right thing by default,
> and if HMAT says bypassing the page cache is a clear advantage it
> should be the default.
>
>> I think we would instead
>> want a "-o autodax" for that case and then "-o dax" and "-o nodax" for
>> the force cases.
>
> Why?
>

I'm worried about the case where HMAT says pmem >= dram performance,
but dax semantics like disabling delayed allocation and
dirty-cacheline tracking end up hurting performance. I guess we can
handle that on a case-by-case basis with targeted kernel
optimizations.

>> I think it's easier to administer than the dax mount option. If
>> someone wants dax enabled only in a sub-tree, they can set the flag on
>> that parent directory and have a policy in dax filesystems that
>> children inherit the dax policy from the parent. That seems like a
>> better administrative model than trying to get it all right globally
>> at mount time.
>
> And why exactly? If DAX is faster for file a in directory X, it will
> be just as fast for file b in directory Y.

So I want the inode setting for the pmem < dram performance case where
I know that the access patterns of the application using file b in
directory Y can still yield better performance without the page cache,
for example when the working set is larger than dram capacity.

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-07-26 20:29 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-20 12:10 [PATCH v8 1/1] f2fs: dax: implement direct access sunqiuyang
2017-07-20 12:10 ` sunqiuyang
2017-07-22  0:34 ` Jaegeuk Kim
2017-07-24 12:03   ` Sun Qiuyang
2017-07-24 12:03     ` Sun Qiuyang
2017-07-26  0:15 ` Dan Williams
2017-07-26  0:15   ` Dan Williams
2017-07-26  0:15   ` Dan Williams
2017-07-26  2:16   ` Jaegeuk Kim
2017-07-26  2:16     ` Jaegeuk Kim
2017-07-26  6:47     ` sunqiuyang
2017-07-26  6:47       ` sunqiuyang
2017-07-26  6:47       ` sunqiuyang
2017-07-26  7:26   ` Christoph Hellwig
2017-07-26  7:26     ` Christoph Hellwig
2017-07-26  7:26     ` Christoph Hellwig
2017-07-26 16:53     ` Dan Williams
2017-07-26 16:53       ` Dan Williams
2017-07-26 17:01       ` Christoph Hellwig
2017-07-26 17:01         ` Christoph Hellwig
2017-07-26 17:11         ` Dan Williams
2017-07-26 17:11           ` Dan Williams
2017-07-26 17:11           ` Dan Williams
2017-07-26 17:20           ` Christoph Hellwig
2017-07-26 17:20             ` Christoph Hellwig
2017-07-26 17:20             ` Christoph Hellwig
2017-07-26 19:16             ` Dan Williams
2017-07-26 19:16               ` Dan Williams
2017-07-26 20:11               ` Christoph Hellwig
2017-07-26 20:11                 ` Christoph Hellwig
2017-07-26 20:11                 ` Christoph Hellwig
2017-07-26 20:29                 ` Dan Williams
2017-07-26 20:29                   ` Dan Williams
