All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 0/2] erofs: dio/dax support for non-tailpacking cases
@ 2021-07-04 13:50 ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: linux-fsdevel, LKML, nvdimm, Darrick J. Wong, Liu Bo, Joseph Qi,
	Liu Jiang, Gao Xiang

Hi folks,

This patchset mainly adds preliminary EROFS iomap dio/dax support
for non-tailpacking uncompressed cases.

Direct I/O is useful in certain scenarios for uncompressed files.
For example, double page caching can be avoided by direct I/O when
a loop device is used for uncompressed files containing an upper-layer
compressed filesystem.

Also, DAX is quite useful for some VM use cases, since it can save a
significant amount of guest memory with the minimal, lightweight EROFS.

Tail-packing inline iomap support will be handled later, since
iomap currently doesn't support such a data pattern; that work is
independent of the non-tailpacking cases.

Comments are welcome. Thanks for your time on reading this!

Thanks,
Gao Xiang

Gao Xiang (1):
  erofs: dax support for non-tailpacking regular file

Huang Jianan (1):
  erofs: iomap support for non-tailpacking DIO

 fs/erofs/Kconfig    |   1 +
 fs/erofs/data.c     | 143 +++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/inode.c    |  10 +++-
 fs/erofs/internal.h |   3 +
 fs/erofs/super.c    |  20 ++++++-
 5 files changed, 173 insertions(+), 4 deletions(-)

-- 
2.24.4


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC PATCH 0/2] erofs: dio/dax support for non-tailpacking cases
@ 2021-07-04 13:50 ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: nvdimm, Darrick J. Wong, LKML, Joseph Qi, Liu Bo, linux-fsdevel,
	Gao Xiang, Liu Jiang

Hi folks,

This patchset mainly adds preliminary EROFS iomap dio/dax support
for non-tailpacking uncompressed cases.

Direct I/O is useful in certain scenarios for uncompressed files.
For example, double page caching can be avoided by direct I/O when
a loop device is used for uncompressed files containing an upper-layer
compressed filesystem.

Also, DAX is quite useful for some VM use cases, since it can save a
significant amount of guest memory with the minimal, lightweight EROFS.

Tail-packing inline iomap support will be handled later, since
iomap currently doesn't support such a data pattern; that work is
independent of the non-tailpacking cases.

Comments are welcome. Thanks for your time on reading this!

Thanks,
Gao Xiang

Gao Xiang (1):
  erofs: dax support for non-tailpacking regular file

Huang Jianan (1):
  erofs: iomap support for non-tailpacking DIO

 fs/erofs/Kconfig    |   1 +
 fs/erofs/data.c     | 143 +++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/inode.c    |  10 +++-
 fs/erofs/internal.h |   3 +
 fs/erofs/super.c    |  20 ++++++-
 5 files changed, 173 insertions(+), 4 deletions(-)

-- 
2.24.4


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC PATCH 1/2] erofs: iomap support for non-tailpacking DIO
  2021-07-04 13:50 ` Gao Xiang
@ 2021-07-04 13:50   ` Gao Xiang
  -1 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: linux-fsdevel, LKML, nvdimm, Darrick J. Wong, Liu Bo, Joseph Qi,
	Liu Jiang, Huang Jianan, Gao Xiang

From: Huang Jianan <huangjianan@oppo.com>

Add iomap support for non-tailpacking uncompressed data in order to
support DIO and DAX.

Direct I/O is useful in certain scenarios for uncompressed files.
For example, double page caching can be avoided by direct I/O when
a loop device is used for uncompressed files containing an upper-layer
compressed filesystem.

This adds iomap DIO support for non-tailpacking cases first and
tail-packing inline files can be handled later.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Huang Jianan <huangjianan@oppo.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/Kconfig    |   1 +
 fs/erofs/data.c     | 102 ++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/inode.c    |   5 ++-
 fs/erofs/internal.h |   1 +
 4 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 906af0c1998c..14b747026742 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,6 +3,7 @@
 config EROFS_FS
 	tristate "EROFS filesystem support"
 	depends on BLOCK
+	select FS_IOMAP
 	select LIBCRC32C
 	help
 	  EROFS (Enhanced Read-Only File System) is a lightweight
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3787a5fb0a42..38e9439c2510 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
  */
 #include "internal.h"
 #include <linux/prefetch.h>
+#include <linux/iomap.h>
 
 #include <trace/events/erofs.h>
 
@@ -308,9 +309,110 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
 	return 0;
 }
 
+static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct erofs_map_blocks map;
+
+	map.m_la = offset;
+	map.m_llen = length;
+
+	ret = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
+	if (ret < 0)
+		return ret;
+
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = map.m_la;
+	iomap->length = map.m_llen;
+
+	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+		iomap->type = IOMAP_HOLE;
+		iomap->addr = IOMAP_NULL_ADDR;
+		if (!iomap->length)
+			iomap->length = length;
+		return 0;
+	}
+
+	/* that shouldn't happen for now */
+	if (map.m_flags & EROFS_MAP_META) {
+		DBG_BUGON(1);
+		return -ENOTBLK;
+	}
+	iomap->type = IOMAP_MAPPED;
+	iomap->addr = map.m_pa;
+	iomap->flags = 0;
+	return 0;
+}
+
+const struct iomap_ops erofs_iomap_ops = {
+	.iomap_begin = erofs_iomap_begin,
+};
+
+static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	unsigned int blksize_mask = (1 << inode->i_blkbits) - 1;
+	loff_t align = iocb->ki_pos | iov_iter_count(to) |
+		iov_iter_alignment(to);
+	struct block_device *bdev = inode->i_sb->s_bdev;
+
+	if (bdev)
+		blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
+	else
+		blksize_mask = (1 << inode->i_blkbits) - 1;
+
+	if (align & blksize_mask)
+		return -EINVAL;
+
+	/*
+	 * Tail-packing inline data is not supported for iomap for now.
+	 * Temporarily fall back this to buffered I/O instead.
+	 */
+	if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE &&
+	    iocb->ki_pos + iov_iter_count(to) >
+			rounddown(inode->i_size, EROFS_BLKSIZ))
+		return 1;
+	return 0;
+}
+
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	/* no need taking (shared) inode lock since it's a ro filesystem */
+	if (!iov_iter_count(to))
+		return 0;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		int err = erofs_prepare_dio(iocb, to);
+
+		if (!err)
+			return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+					    NULL, 0);
+		if (err < 0)
+			return err;
+		/*
+		 * Fallback to buffered I/O if the operation being performed on
+		 * the inode is not supported by direct I/O. The IOCB_DIRECT
+		 * flag needs to be cleared here in order to ensure that the
+		 * direct I/O path within generic_file_read_iter() is not
+		 * taken.
+		 */
+		iocb->ki_flags &= ~IOCB_DIRECT;
+	}
+	return generic_file_read_iter(iocb, to);
+}
+
 /* for uncompressed (aligned) files and raw access for other files */
 const struct address_space_operations erofs_raw_access_aops = {
 	.readpage = erofs_raw_access_readpage,
 	.readahead = erofs_raw_access_readahead,
 	.bmap = erofs_bmap,
+	.direct_IO = noop_direct_IO,
+};
+
+const struct file_operations erofs_file_fops = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= erofs_file_read_iter,
+	.mmap		= generic_file_readonly_mmap,
+	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index aa8a0d770ba3..00edb7562fea 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -247,7 +247,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_op = &erofs_generic_iops;
-		inode->i_fop = &generic_ro_fops;
+		if (!erofs_inode_is_data_compressed(vi->datalayout))
+			inode->i_fop = &erofs_file_fops;
+		else
+			inode->i_fop = &generic_ro_fops;
 		break;
 	case S_IFDIR:
 		inode->i_op = &erofs_dir_iops;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 543c2ff97d30..2669c785d548 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -371,6 +371,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 #endif	/* !CONFIG_EROFS_FS_ZIP */
 
 /* data.c */
+extern const struct file_operations erofs_file_fops;
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
 
 /* inode.c */
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 1/2] erofs: iomap support for non-tailpacking DIO
@ 2021-07-04 13:50   ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: nvdimm, Darrick J. Wong, LKML, Joseph Qi, Liu Bo, linux-fsdevel,
	Gao Xiang, Liu Jiang

From: Huang Jianan <huangjianan@oppo.com>

Add iomap support for non-tailpacking uncompressed data in order to
support DIO and DAX.

Direct I/O is useful in certain scenarios for uncompressed files.
For example, double page caching can be avoided by direct I/O when
a loop device is used for uncompressed files containing an upper-layer
compressed filesystem.

This adds iomap DIO support for non-tailpacking cases first and
tail-packing inline files can be handled later.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Huang Jianan <huangjianan@oppo.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/Kconfig    |   1 +
 fs/erofs/data.c     | 102 ++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/inode.c    |   5 ++-
 fs/erofs/internal.h |   1 +
 4 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 906af0c1998c..14b747026742 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,6 +3,7 @@
 config EROFS_FS
 	tristate "EROFS filesystem support"
 	depends on BLOCK
+	select FS_IOMAP
 	select LIBCRC32C
 	help
 	  EROFS (Enhanced Read-Only File System) is a lightweight
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3787a5fb0a42..38e9439c2510 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
  */
 #include "internal.h"
 #include <linux/prefetch.h>
+#include <linux/iomap.h>
 
 #include <trace/events/erofs.h>
 
@@ -308,9 +309,110 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
 	return 0;
 }
 
+static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct erofs_map_blocks map;
+
+	map.m_la = offset;
+	map.m_llen = length;
+
+	ret = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
+	if (ret < 0)
+		return ret;
+
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = map.m_la;
+	iomap->length = map.m_llen;
+
+	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+		iomap->type = IOMAP_HOLE;
+		iomap->addr = IOMAP_NULL_ADDR;
+		if (!iomap->length)
+			iomap->length = length;
+		return 0;
+	}
+
+	/* that shouldn't happen for now */
+	if (map.m_flags & EROFS_MAP_META) {
+		DBG_BUGON(1);
+		return -ENOTBLK;
+	}
+	iomap->type = IOMAP_MAPPED;
+	iomap->addr = map.m_pa;
+	iomap->flags = 0;
+	return 0;
+}
+
+const struct iomap_ops erofs_iomap_ops = {
+	.iomap_begin = erofs_iomap_begin,
+};
+
+static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	unsigned int blksize_mask = (1 << inode->i_blkbits) - 1;
+	loff_t align = iocb->ki_pos | iov_iter_count(to) |
+		iov_iter_alignment(to);
+	struct block_device *bdev = inode->i_sb->s_bdev;
+
+	if (bdev)
+		blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
+	else
+		blksize_mask = (1 << inode->i_blkbits) - 1;
+
+	if (align & blksize_mask)
+		return -EINVAL;
+
+	/*
+	 * Tail-packing inline data is not supported for iomap for now.
+	 * Temporarily fall back this to buffered I/O instead.
+	 */
+	if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE &&
+	    iocb->ki_pos + iov_iter_count(to) >
+			rounddown(inode->i_size, EROFS_BLKSIZ))
+		return 1;
+	return 0;
+}
+
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	/* no need taking (shared) inode lock since it's a ro filesystem */
+	if (!iov_iter_count(to))
+		return 0;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		int err = erofs_prepare_dio(iocb, to);
+
+		if (!err)
+			return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+					    NULL, 0);
+		if (err < 0)
+			return err;
+		/*
+		 * Fallback to buffered I/O if the operation being performed on
+		 * the inode is not supported by direct I/O. The IOCB_DIRECT
+		 * flag needs to be cleared here in order to ensure that the
+		 * direct I/O path within generic_file_read_iter() is not
+		 * taken.
+		 */
+		iocb->ki_flags &= ~IOCB_DIRECT;
+	}
+	return generic_file_read_iter(iocb, to);
+}
+
 /* for uncompressed (aligned) files and raw access for other files */
 const struct address_space_operations erofs_raw_access_aops = {
 	.readpage = erofs_raw_access_readpage,
 	.readahead = erofs_raw_access_readahead,
 	.bmap = erofs_bmap,
+	.direct_IO = noop_direct_IO,
+};
+
+const struct file_operations erofs_file_fops = {
+	.llseek		= generic_file_llseek,
+	.read_iter	= erofs_file_read_iter,
+	.mmap		= generic_file_readonly_mmap,
+	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index aa8a0d770ba3..00edb7562fea 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -247,7 +247,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_op = &erofs_generic_iops;
-		inode->i_fop = &generic_ro_fops;
+		if (!erofs_inode_is_data_compressed(vi->datalayout))
+			inode->i_fop = &erofs_file_fops;
+		else
+			inode->i_fop = &generic_ro_fops;
 		break;
 	case S_IFDIR:
 		inode->i_op = &erofs_dir_iops;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 543c2ff97d30..2669c785d548 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -371,6 +371,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 #endif	/* !CONFIG_EROFS_FS_ZIP */
 
 /* data.c */
+extern const struct file_operations erofs_file_fops;
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
 
 /* inode.c */
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 2/2] erofs: dax support for non-tailpacking regular file
  2021-07-04 13:50 ` Gao Xiang
@ 2021-07-04 13:50   ` Gao Xiang
  -1 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: linux-fsdevel, LKML, nvdimm, Darrick J. Wong, Liu Bo, Joseph Qi,
	Liu Jiang, Gao Xiang

DAX is quite useful for some VM use cases, since it can save a
significant amount of guest memory with the minimal, lightweight EROFS.

In order to prepare for such use cases, add preliminary dax support
for non-tailpacking regular files for now.

Tested with the DRAM-emulated PMEM and the EROFS image generated by
"mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"

Cc: nvdimm@lists.linux.dev
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/data.c     | 41 ++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/inode.c    |  5 +++++
 fs/erofs/internal.h |  2 ++
 fs/erofs/super.c    | 20 ++++++++++++++++++--
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 38e9439c2510..edeee8ccb22a 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,7 +6,7 @@
 #include "internal.h"
 #include <linux/prefetch.h>
 #include <linux/iomap.h>
-
+#include <linux/dax.h>
 #include <trace/events/erofs.h>
 
 static void erofs_readendio(struct bio *bio)
@@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
 	iomap->offset = map.m_la;
 	iomap->length = map.m_llen;
 
@@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))
 		return 0;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		int err = erofs_prepare_dio(iocb, to);
 
@@ -410,6 +416,39 @@ const struct address_space_operations erofs_raw_access_aops = {
 	.direct_IO = noop_direct_IO,
 };
 
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+	.fault		= erofs_dax_fault,
+	.huge_fault	= erofs_dax_huge_fault,
+};
+
+int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_readonly_mmap(file, vma);
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		return -EINVAL;
+
+	vma->vm_ops = &erofs_dax_vm_ops;
+	vma->vm_flags |= VM_HUGEPAGE;
+	return 0;
+}
+#else
+#define erofs_file_mmap	generic_file_readonly_mmap
+#endif
+
 const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 00edb7562fea..695b97acb9a6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
 	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
+	inode->i_flags &= ~S_DAX;
+	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
+	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+		inode->i_flags |= S_DAX;
+
 	if (!nblks)
 		/* measure inode.i_blocks as generic filesystems */
 		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2669c785d548..8b0542d35148 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -83,6 +83,7 @@ struct erofs_sb_info {
 
 	struct erofs_sb_lz4_info lz4;
 #endif	/* CONFIG_EROFS_FS_ZIP */
+	struct dax_device *dax_dev;
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -115,6 +116,7 @@ struct erofs_sb_info {
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
+#define EROFS_MOUNT_DAX			0x00000040
 
 #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8fc6c04b54f4..9aa385e22e1a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
 #include <linux/crc32c.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/dax.h>
 #include "xattr.h"
 
 #define CREATE_TRACE_POINTS
@@ -355,6 +356,7 @@ enum {
 	Opt_user_xattr,
 	Opt_acl,
 	Opt_cache_strategy,
+	Opt_dax,
 	Opt_err
 };
 
@@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
 	fsparam_flag_no("acl",		Opt_acl),
 	fsparam_enum("cache_strategy",	Opt_cache_strategy,
 		     erofs_param_cache_strategy),
+	fsparam_flag("dax",             Opt_dax),
 	{}
 };
 
@@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
 		ctx->cache_strategy = result.uint_32;
 #else
 		errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+		break;
+	case Opt_dax:
+#ifdef CONFIG_FS_DAX
+		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		set_opt(ctx, DAX);
+#else
+		errorfc(fc, "dax options not supported");
 #endif
 		break;
 	default:
@@ -496,6 +507,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	err = erofs_read_superblock(sb);
 	if (err)
 		return err;
@@ -609,6 +621,8 @@ static void erofs_kill_sb(struct super_block *sb)
 	sbi = EROFS_SB(sb);
 	if (!sbi)
 		return;
+	if (sbi->dax_dev)
+		fs_put_dax(sbi->dax_dev);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -711,8 +725,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
-	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
+	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+	struct erofs_fs_context *ctx = &sbi->ctx;
 
 #ifdef CONFIG_EROFS_FS_XATTR
 	if (test_opt(ctx, XATTR_USER))
@@ -734,6 +748,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
 		seq_puts(seq, ",cache_strategy=readaround");
 #endif
+	if (test_opt(ctx, DAX))
+		seq_puts(seq, ",dax");
 	return 0;
 }
 
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH 2/2] erofs: dax support for non-tailpacking regular file
@ 2021-07-04 13:50   ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-04 13:50 UTC (permalink / raw)
  To: linux-erofs
  Cc: nvdimm, Darrick J. Wong, LKML, Joseph Qi, Liu Bo, linux-fsdevel,
	Gao Xiang, Liu Jiang

DAX is quite useful for some VM use cases, since it can save a
significant amount of guest memory with the minimal, lightweight EROFS.

In order to prepare for such use cases, add preliminary dax support
for non-tailpacking regular files for now.

Tested with the DRAM-emulated PMEM and the EROFS image generated by
"mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"

Cc: nvdimm@lists.linux.dev
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/data.c     | 41 ++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/inode.c    |  5 +++++
 fs/erofs/internal.h |  2 ++
 fs/erofs/super.c    | 20 ++++++++++++++++++--
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 38e9439c2510..edeee8ccb22a 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,7 +6,7 @@
 #include "internal.h"
 #include <linux/prefetch.h>
 #include <linux/iomap.h>
-
+#include <linux/dax.h>
 #include <trace/events/erofs.h>
 
 static void erofs_readendio(struct bio *bio)
@@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
 	iomap->offset = map.m_la;
 	iomap->length = map.m_llen;
 
@@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))
 		return 0;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		int err = erofs_prepare_dio(iocb, to);
 
@@ -410,6 +416,39 @@ const struct address_space_operations erofs_raw_access_aops = {
 	.direct_IO = noop_direct_IO,
 };
 
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+	.fault		= erofs_dax_fault,
+	.huge_fault	= erofs_dax_huge_fault,
+};
+
+int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_readonly_mmap(file, vma);
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		return -EINVAL;
+
+	vma->vm_ops = &erofs_dax_vm_ops;
+	vma->vm_flags |= VM_HUGEPAGE;
+	return 0;
+}
+#else
+#define erofs_file_mmap	generic_file_readonly_mmap
+#endif
+
 const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 00edb7562fea..695b97acb9a6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
 	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
+	inode->i_flags &= ~S_DAX;
+	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
+	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+		inode->i_flags |= S_DAX;
+
 	if (!nblks)
 		/* measure inode.i_blocks as generic filesystems */
 		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2669c785d548..8b0542d35148 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -83,6 +83,7 @@ struct erofs_sb_info {
 
 	struct erofs_sb_lz4_info lz4;
 #endif	/* CONFIG_EROFS_FS_ZIP */
+	struct dax_device *dax_dev;
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -115,6 +116,7 @@ struct erofs_sb_info {
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
+#define EROFS_MOUNT_DAX			0x00000040
 
 #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8fc6c04b54f4..9aa385e22e1a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
 #include <linux/crc32c.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/dax.h>
 #include "xattr.h"
 
 #define CREATE_TRACE_POINTS
@@ -355,6 +356,7 @@ enum {
 	Opt_user_xattr,
 	Opt_acl,
 	Opt_cache_strategy,
+	Opt_dax,
 	Opt_err
 };
 
@@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
 	fsparam_flag_no("acl",		Opt_acl),
 	fsparam_enum("cache_strategy",	Opt_cache_strategy,
 		     erofs_param_cache_strategy),
+	fsparam_flag("dax",             Opt_dax),
 	{}
 };
 
@@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
 		ctx->cache_strategy = result.uint_32;
 #else
 		errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+		break;
+	case Opt_dax:
+#ifdef CONFIG_FS_DAX
+		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		set_opt(ctx, DAX);
+#else
+		errorfc(fc, "dax options not supported");
 #endif
 		break;
 	default:
@@ -496,6 +507,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	err = erofs_read_superblock(sb);
 	if (err)
 		return err;
@@ -609,6 +621,8 @@ static void erofs_kill_sb(struct super_block *sb)
 	sbi = EROFS_SB(sb);
 	if (!sbi)
 		return;
+	if (sbi->dax_dev)
+		fs_put_dax(sbi->dax_dev);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -711,8 +725,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
-	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
+	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+	struct erofs_fs_context *ctx = &sbi->ctx;
 
 #ifdef CONFIG_EROFS_FS_XATTR
 	if (test_opt(ctx, XATTR_USER))
@@ -734,6 +748,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
 		seq_puts(seq, ",cache_strategy=readaround");
 #endif
+	if (test_opt(ctx, DAX))
+		seq_puts(seq, ",dax");
 	return 0;
 }
 
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 2/2] erofs: dax support for non-tailpacking regular file
  2021-07-04 13:50   ` Gao Xiang
  (?)
@ 2021-07-04 16:00   ` kernel test robot
  -1 siblings, 0 replies; 13+ messages in thread
From: kernel test robot @ 2021-07-04 16:00 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 2812 bytes --]

Hi Gao,

[FYI, it's a private test report for your RFC patch.]
[auto build test WARNING on xiang-erofs/dev-test]
[also build test WARNING on v5.13 next-20210701]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Gao-Xiang/erofs-dio-dax-support-for-non-tailpacking-cases/20210704-215229
base:   https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
config: csky-randconfig-r005-20210704 (attached as .config)
compiler: csky-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/e5df28a5e6dbbcf409d0f7a7976a6263ac3de64a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Gao-Xiang/erofs-dio-dax-support-for-non-tailpacking-cases/20210704-215229
        git checkout e5df28a5e6dbbcf409d0f7a7976a6263ac3de64a
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=csky 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> fs/erofs/data.c:436:5: warning: no previous prototype for 'erofs_file_mmap' [-Wmissing-prototypes]
     436 | int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
         |     ^~~~~~~~~~~~~~~

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for LOCKDEP
   Depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT && (FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86)
   Selected by
   - PROVE_LOCKING && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - LOCK_STAT && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - DEBUG_LOCK_ALLOC && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT


vim +/erofs_file_mmap +436 fs/erofs/data.c

   435	
 > 436	int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
   437	{
   438		if (!IS_DAX(file_inode(file)))
   439			return generic_file_readonly_mmap(file, vma);
   440	
   441		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
   442			return -EINVAL;
   443	
   444		vma->vm_ops = &erofs_dax_vm_ops;
   445		vma->vm_flags |= VM_HUGEPAGE;
   446		return 0;
   447	}
   448	#else
   449	#define erofs_file_mmap	generic_file_readonly_mmap
   450	#endif
   451	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 23816 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
  2021-07-04 13:50   ` Gao Xiang
@ 2021-07-05 13:21     ` Gao Xiang
  -1 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-05 13:21 UTC (permalink / raw)
  To: linux-erofs
  Cc: linux-fsdevel, LKML, nvdimm, Liu Bo, Darrick J. Wong, Joseph Qi,
	Liu Jiang, Gao Xiang

DAX is quite useful for some VM use cases, as it can save a large
amount of guest memory when used with the minimal, lightweight EROFS.

In order to prepare for such use cases, add preliminary dax support
for non-tailpacking regular files for now.

Tested with the DRAM-emulated PMEM and the EROFS image generated by
"mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"

Cc: nvdimm@lists.linux.dev
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
change since v1:
 - update missing hunks due to patch splitting...
    bdev_dax_supported(...)
    erofs_file_mmap(...)   

 fs/erofs/data.c     | 43 +++++++++++++++++++++++++++++++++++++++++--
 fs/erofs/inode.c    |  5 +++++
 fs/erofs/internal.h |  2 ++
 fs/erofs/super.c    | 26 ++++++++++++++++++++++++--
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0f82b4cb474c..c188c629be45 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,7 +6,7 @@
 #include "internal.h"
 #include <linux/prefetch.h>
 #include <linux/iomap.h>
-
+#include <linux/dax.h>
 #include <trace/events/erofs.h>
 
 static void erofs_readendio(struct bio *bio)
@@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
 	iomap->offset = map.m_la;
 	iomap->length = map.m_llen;
 
@@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))
 		return 0;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		int err = erofs_prepare_dio(iocb, to);
 
@@ -410,9 +416,42 @@ const struct address_space_operations erofs_raw_access_aops = {
 	.direct_IO = noop_direct_IO,
 };
 
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+	.fault		= erofs_dax_fault,
+	.huge_fault	= erofs_dax_huge_fault,
+};
+
+static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_readonly_mmap(file, vma);
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		return -EINVAL;
+
+	vma->vm_ops = &erofs_dax_vm_ops;
+	vma->vm_flags |= VM_HUGEPAGE;
+	return 0;
+}
+#else
+#define erofs_file_mmap	generic_file_readonly_mmap
+#endif
+
 const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
-	.mmap		= generic_file_readonly_mmap,
+	.mmap		= erofs_file_mmap,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 00edb7562fea..695b97acb9a6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
 	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
+	inode->i_flags &= ~S_DAX;
+	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
+	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+		inode->i_flags |= S_DAX;
+
 	if (!nblks)
 		/* measure inode.i_blocks as generic filesystems */
 		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2669c785d548..8b0542d35148 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -83,6 +83,7 @@ struct erofs_sb_info {
 
 	struct erofs_sb_lz4_info lz4;
 #endif	/* CONFIG_EROFS_FS_ZIP */
+	struct dax_device *dax_dev;
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -115,6 +116,7 @@ struct erofs_sb_info {
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
+#define EROFS_MOUNT_DAX			0x00000040
 
 #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8fc6c04b54f4..b44a964ab24f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
 #include <linux/crc32c.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/dax.h>
 #include "xattr.h"
 
 #define CREATE_TRACE_POINTS
@@ -355,6 +356,7 @@ enum {
 	Opt_user_xattr,
 	Opt_acl,
 	Opt_cache_strategy,
+	Opt_dax,
 	Opt_err
 };
 
@@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
 	fsparam_flag_no("acl",		Opt_acl),
 	fsparam_enum("cache_strategy",	Opt_cache_strategy,
 		     erofs_param_cache_strategy),
+	fsparam_flag("dax",             Opt_dax),
 	{}
 };
 
@@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
 		ctx->cache_strategy = result.uint_32;
 #else
 		errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+		break;
+	case Opt_dax:
+#ifdef CONFIG_FS_DAX
+		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		set_opt(ctx, DAX);
+#else
+		errorfc(fc, "dax options not supported");
 #endif
 		break;
 	default:
@@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	err = erofs_read_superblock(sb);
 	if (err)
 		return err;
 
+	if (test_opt(ctx, DAX) &&
+	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
+		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+		clear_opt(ctx, DAX);
+	}
+
 	sb->s_flags |= SB_RDONLY | SB_NOATIME;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_time_gran = 1;
@@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
 	sbi = EROFS_SB(sb);
 	if (!sbi)
 		return;
+	if (sbi->dax_dev)
+		fs_put_dax(sbi->dax_dev);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
-	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
+	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+	struct erofs_fs_context *ctx = &sbi->ctx;
 
 #ifdef CONFIG_EROFS_FS_XATTR
 	if (test_opt(ctx, XATTR_USER))
@@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
 		seq_puts(seq, ",cache_strategy=readaround");
 #endif
+	if (test_opt(ctx, DAX))
+		seq_puts(seq, ",dax");
 	return 0;
 }
 
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
@ 2021-07-05 13:21     ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-05 13:21 UTC (permalink / raw)
  To: linux-erofs
  Cc: nvdimm, Darrick J. Wong, LKML, Joseph Qi, Liu Bo, linux-fsdevel,
	Gao Xiang, Liu Jiang

DAX is quite useful for some VM use cases, as it can save a large
amount of guest memory when used with the minimal, lightweight EROFS.

In order to prepare for such use cases, add preliminary dax support
for non-tailpacking regular files for now.

Tested with the DRAM-emulated PMEM and the EROFS image generated by
"mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"

Cc: nvdimm@lists.linux.dev
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
change since v1:
 - update missing hunks due to patch splitting...
    bdev_dax_supported(...)
    erofs_file_mmap(...)   

 fs/erofs/data.c     | 43 +++++++++++++++++++++++++++++++++++++++++--
 fs/erofs/inode.c    |  5 +++++
 fs/erofs/internal.h |  2 ++
 fs/erofs/super.c    | 26 ++++++++++++++++++++++++--
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0f82b4cb474c..c188c629be45 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,7 +6,7 @@
 #include "internal.h"
 #include <linux/prefetch.h>
 #include <linux/iomap.h>
-
+#include <linux/dax.h>
 #include <trace/events/erofs.h>
 
 static void erofs_readendio(struct bio *bio)
@@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
 	iomap->offset = map.m_la;
 	iomap->length = map.m_llen;
 
@@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iov_iter_count(to))
 		return 0;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		int err = erofs_prepare_dio(iocb, to);
 
@@ -410,9 +416,42 @@ const struct address_space_operations erofs_raw_access_aops = {
 	.direct_IO = noop_direct_IO,
 };
 
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
+{
+	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+	.fault		= erofs_dax_fault,
+	.huge_fault	= erofs_dax_huge_fault,
+};
+
+static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_readonly_mmap(file, vma);
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		return -EINVAL;
+
+	vma->vm_ops = &erofs_dax_vm_ops;
+	vma->vm_flags |= VM_HUGEPAGE;
+	return 0;
+}
+#else
+#define erofs_file_mmap	generic_file_readonly_mmap
+#endif
+
 const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
-	.mmap		= generic_file_readonly_mmap,
+	.mmap		= erofs_file_mmap,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 00edb7562fea..695b97acb9a6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
 	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
 
+	inode->i_flags &= ~S_DAX;
+	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
+	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+		inode->i_flags |= S_DAX;
+
 	if (!nblks)
 		/* measure inode.i_blocks as generic filesystems */
 		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 2669c785d548..8b0542d35148 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -83,6 +83,7 @@ struct erofs_sb_info {
 
 	struct erofs_sb_lz4_info lz4;
 #endif	/* CONFIG_EROFS_FS_ZIP */
+	struct dax_device *dax_dev;
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -115,6 +116,7 @@ struct erofs_sb_info {
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
+#define EROFS_MOUNT_DAX			0x00000040
 
 #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8fc6c04b54f4..b44a964ab24f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -11,6 +11,7 @@
 #include <linux/crc32c.h>
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
+#include <linux/dax.h>
 #include "xattr.h"
 
 #define CREATE_TRACE_POINTS
@@ -355,6 +356,7 @@ enum {
 	Opt_user_xattr,
 	Opt_acl,
 	Opt_cache_strategy,
+	Opt_dax,
 	Opt_err
 };
 
@@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
 	fsparam_flag_no("acl",		Opt_acl),
 	fsparam_enum("cache_strategy",	Opt_cache_strategy,
 		     erofs_param_cache_strategy),
+	fsparam_flag("dax",             Opt_dax),
 	{}
 };
 
@@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
 		ctx->cache_strategy = result.uint_32;
 #else
 		errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+		break;
+	case Opt_dax:
+#ifdef CONFIG_FS_DAX
+		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		set_opt(ctx, DAX);
+#else
+		errorfc(fc, "dax options not supported");
 #endif
 		break;
 	default:
@@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		return -ENOMEM;
 
 	sb->s_fs_info = sbi;
+	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	err = erofs_read_superblock(sb);
 	if (err)
 		return err;
 
+	if (test_opt(ctx, DAX) &&
+	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
+		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+		clear_opt(ctx, DAX);
+	}
+
 	sb->s_flags |= SB_RDONLY | SB_NOATIME;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_time_gran = 1;
@@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
 	sbi = EROFS_SB(sb);
 	if (!sbi)
 		return;
+	if (sbi->dax_dev)
+		fs_put_dax(sbi->dax_dev);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
-	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
+	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+	struct erofs_fs_context *ctx = &sbi->ctx;
 
 #ifdef CONFIG_EROFS_FS_XATTR
 	if (test_opt(ctx, XATTR_USER))
@@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
 		seq_puts(seq, ",cache_strategy=readaround");
 #endif
+	if (test_opt(ctx, DAX))
+		seq_puts(seq, ",dax");
 	return 0;
 }
 
-- 
2.24.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
  2021-07-05 13:21     ` Gao Xiang
@ 2021-07-09  1:47       ` Darrick J. Wong
  -1 siblings, 0 replies; 13+ messages in thread
From: Darrick J. Wong @ 2021-07-09  1:47 UTC (permalink / raw)
  To: Gao Xiang
  Cc: linux-erofs, linux-fsdevel, LKML, nvdimm, Liu Bo, Joseph Qi, Liu Jiang

On Mon, Jul 05, 2021 at 09:21:53PM +0800, Gao Xiang wrote:
> DAX is quite useful for some VM use cases in order to save guest
> memory extremely with minimal lightweight EROFS.
> 
> In order to prepare for such use cases, add preliminary dax support
> for non-tailpacking regular files for now.
> 
> Tested with the DRAM-emulated PMEM and the EROFS image generated by
> "mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"
> 
> Cc: nvdimm@lists.linux.dev
> Cc: linux-fsdevel@vger.kernel.org
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> change since v1:
>  - update missing hunks due to patch spliting...
>     bdev_dax_supported(...)
>     erofs_file_mmap(...)   
> 
>  fs/erofs/data.c     | 43 +++++++++++++++++++++++++++++++++++++++++--
>  fs/erofs/inode.c    |  5 +++++
>  fs/erofs/internal.h |  2 ++
>  fs/erofs/super.c    | 26 ++++++++++++++++++++++++--
>  4 files changed, 72 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0f82b4cb474c..c188c629be45 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -6,7 +6,7 @@
>  #include "internal.h"
>  #include <linux/prefetch.h>
>  #include <linux/iomap.h>
> -
> +#include <linux/dax.h>
>  #include <trace/events/erofs.h>
>  
>  static void erofs_readendio(struct bio *bio)
> @@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  		return ret;
>  
>  	iomap->bdev = inode->i_sb->s_bdev;
> +	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
>  	iomap->offset = map.m_la;
>  	iomap->length = map.m_llen;
>  
> @@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	if (!iov_iter_count(to))
>  		return 0;
>  
> +#ifdef CONFIG_FS_DAX
> +	if (IS_DAX(iocb->ki_filp->f_mapping->host))
> +		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
> +#endif
> +
>  	if (iocb->ki_flags & IOCB_DIRECT) {
>  		int err = erofs_prepare_dio(iocb, to);
>  
> @@ -410,9 +416,42 @@ const struct address_space_operations erofs_raw_access_aops = {
>  	.direct_IO = noop_direct_IO,
>  };
>  
> +#ifdef CONFIG_FS_DAX
> +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
> +		enum page_entry_size pe_size)
> +{
> +	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
> +}
> +
> +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
> +{
> +	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
> +}
> +
> +static const struct vm_operations_struct erofs_dax_vm_ops = {
> +	.fault		= erofs_dax_fault,
> +	.huge_fault	= erofs_dax_huge_fault,
> +};
> +
> +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	if (!IS_DAX(file_inode(file)))
> +		return generic_file_readonly_mmap(file, vma);
> +
> +	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
> +		return -EINVAL;
> +
> +	vma->vm_ops = &erofs_dax_vm_ops;
> +	vma->vm_flags |= VM_HUGEPAGE;
> +	return 0;
> +}
> +#else
> +#define erofs_file_mmap	generic_file_readonly_mmap
> +#endif
> +
>  const struct file_operations erofs_file_fops = {
>  	.llseek		= generic_file_llseek,
>  	.read_iter	= erofs_file_read_iter,
> -	.mmap		= generic_file_readonly_mmap,
> +	.mmap		= erofs_file_mmap,
>  	.splice_read	= generic_file_splice_read,
>  };
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index 00edb7562fea..695b97acb9a6 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
>  	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
>  	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
>  
> +	inode->i_flags &= ~S_DAX;
> +	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
> +	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
> +		inode->i_flags |= S_DAX;
> +
>  	if (!nblks)
>  		/* measure inode.i_blocks as generic filesystems */
>  		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 2669c785d548..8b0542d35148 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -83,6 +83,7 @@ struct erofs_sb_info {
>  
>  	struct erofs_sb_lz4_info lz4;
>  #endif	/* CONFIG_EROFS_FS_ZIP */
> +	struct dax_device *dax_dev;
>  	u32 blocks;
>  	u32 meta_blkaddr;
>  #ifdef CONFIG_EROFS_FS_XATTR
> @@ -115,6 +116,7 @@ struct erofs_sb_info {
>  /* Mount flags set via mount options or defaults */
>  #define EROFS_MOUNT_XATTR_USER		0x00000010
>  #define EROFS_MOUNT_POSIX_ACL		0x00000020
> +#define EROFS_MOUNT_DAX			0x00000040
>  
>  #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
>  #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index 8fc6c04b54f4..b44a964ab24f 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -11,6 +11,7 @@
>  #include <linux/crc32c.h>
>  #include <linux/fs_context.h>
>  #include <linux/fs_parser.h>
> +#include <linux/dax.h>
>  #include "xattr.h"
>  
>  #define CREATE_TRACE_POINTS
> @@ -355,6 +356,7 @@ enum {
>  	Opt_user_xattr,
>  	Opt_acl,
>  	Opt_cache_strategy,
> +	Opt_dax,
>  	Opt_err
>  };
>  
> @@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
>  	fsparam_flag_no("acl",		Opt_acl),
>  	fsparam_enum("cache_strategy",	Opt_cache_strategy,
>  		     erofs_param_cache_strategy),
> +	fsparam_flag("dax",             Opt_dax),
>  	{}
>  };
>  
> @@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
>  		ctx->cache_strategy = result.uint_32;
>  #else
>  		errorfc(fc, "compression not supported, cache_strategy ignored");
> +#endif
> +		break;
> +	case Opt_dax:
> +#ifdef CONFIG_FS_DAX
> +		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> +		set_opt(ctx, DAX);

You might want to allow 'dax=always' and 'dax=never' to maintain parity
with xfs/ext4's mount options...

--D

> +#else
> +		errorfc(fc, "dax options not supported");
>  #endif
>  		break;
>  	default:
> @@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
>  		return -ENOMEM;
>  
>  	sb->s_fs_info = sbi;
> +	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
>  	err = erofs_read_superblock(sb);
>  	if (err)
>  		return err;
>  
> +	if (test_opt(ctx, DAX) &&
> +	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
> +		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> +		clear_opt(ctx, DAX);
> +	}
> +
>  	sb->s_flags |= SB_RDONLY | SB_NOATIME;
>  	sb->s_maxbytes = MAX_LFS_FILESIZE;
>  	sb->s_time_gran = 1;
> @@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
>  	sbi = EROFS_SB(sb);
>  	if (!sbi)
>  		return;
> +	if (sbi->dax_dev)
> +		fs_put_dax(sbi->dax_dev);
>  	kfree(sbi);
>  	sb->s_fs_info = NULL;
>  }
> @@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
>  
>  static int erofs_show_options(struct seq_file *seq, struct dentry *root)
>  {
> -	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
> -	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
> +	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
> +	struct erofs_fs_context *ctx = &sbi->ctx;
>  
>  #ifdef CONFIG_EROFS_FS_XATTR
>  	if (test_opt(ctx, XATTR_USER))
> @@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
>  	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
>  		seq_puts(seq, ",cache_strategy=readaround");
>  #endif
> +	if (test_opt(ctx, DAX))
> +		seq_puts(seq, ",dax");
>  	return 0;
>  }
>  
> -- 
> 2.24.4
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
@ 2021-07-09  1:47       ` Darrick J. Wong
  0 siblings, 0 replies; 13+ messages in thread
From: Darrick J. Wong @ 2021-07-09  1:47 UTC (permalink / raw)
  To: Gao Xiang
  Cc: nvdimm, LKML, Joseph Qi, Liu Bo, linux-fsdevel, Liu Jiang, linux-erofs

On Mon, Jul 05, 2021 at 09:21:53PM +0800, Gao Xiang wrote:
> DAX is quite useful for some VM use cases in order to save guest
> memory extremely with minimal lightweight EROFS.
> 
> In order to prepare for such use cases, add preliminary dax support
> for non-tailpacking regular files for now.
> 
> Tested with the DRAM-emulated PMEM and the EROFS image generated by
> "mkfs.erofs -Enoinline_data enwik9.fsdax.img enwik9"
> 
> Cc: nvdimm@lists.linux.dev
> Cc: linux-fsdevel@vger.kernel.org
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> change since v1:
>  - update missing hunks due to patch spliting...
>     bdev_dax_supported(...)
>     erofs_file_mmap(...)   
> 
>  fs/erofs/data.c     | 43 +++++++++++++++++++++++++++++++++++++++++--
>  fs/erofs/inode.c    |  5 +++++
>  fs/erofs/internal.h |  2 ++
>  fs/erofs/super.c    | 26 ++++++++++++++++++++++++--
>  4 files changed, 72 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0f82b4cb474c..c188c629be45 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
> @@ -6,7 +6,7 @@
>  #include "internal.h"
>  #include <linux/prefetch.h>
>  #include <linux/iomap.h>
> -
> +#include <linux/dax.h>
>  #include <trace/events/erofs.h>
>  
>  static void erofs_readendio(struct bio *bio)
> @@ -323,6 +323,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  		return ret;
>  
>  	iomap->bdev = inode->i_sb->s_bdev;
> +	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
>  	iomap->offset = map.m_la;
>  	iomap->length = map.m_llen;
>  
> @@ -382,6 +383,11 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	if (!iov_iter_count(to))
>  		return 0;
>  
> +#ifdef CONFIG_FS_DAX
> +	if (IS_DAX(iocb->ki_filp->f_mapping->host))
> +		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
> +#endif
> +
>  	if (iocb->ki_flags & IOCB_DIRECT) {
>  		int err = erofs_prepare_dio(iocb, to);
>  
> @@ -410,9 +416,42 @@ const struct address_space_operations erofs_raw_access_aops = {
>  	.direct_IO = noop_direct_IO,
>  };
>  
> +#ifdef CONFIG_FS_DAX
> +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
> +		enum page_entry_size pe_size)
> +{
> +	return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
> +}
> +
> +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
> +{
> +	return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
> +}
> +
> +static const struct vm_operations_struct erofs_dax_vm_ops = {
> +	.fault		= erofs_dax_fault,
> +	.huge_fault	= erofs_dax_huge_fault,
> +};
> +
> +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	if (!IS_DAX(file_inode(file)))
> +		return generic_file_readonly_mmap(file, vma);
> +
> +	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
> +		return -EINVAL;
> +
> +	vma->vm_ops = &erofs_dax_vm_ops;
> +	vma->vm_flags |= VM_HUGEPAGE;
> +	return 0;
> +}
> +#else
> +#define erofs_file_mmap	generic_file_readonly_mmap
> +#endif
> +
>  const struct file_operations erofs_file_fops = {
>  	.llseek		= generic_file_llseek,
>  	.read_iter	= erofs_file_read_iter,
> -	.mmap		= generic_file_readonly_mmap,
> +	.mmap		= erofs_file_mmap,
>  	.splice_read	= generic_file_splice_read,
>  };
> diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
> index 00edb7562fea..695b97acb9a6 100644
> --- a/fs/erofs/inode.c
> +++ b/fs/erofs/inode.c
> @@ -174,6 +174,11 @@ static struct page *erofs_read_inode(struct inode *inode,
>  	inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
>  	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
>  
> +	inode->i_flags &= ~S_DAX;
> +	if (test_opt(&sbi->ctx, DAX) && S_ISREG(inode->i_mode) &&
> +	    vi->datalayout == EROFS_INODE_FLAT_PLAIN)
> +		inode->i_flags |= S_DAX;
> +
>  	if (!nblks)
>  		/* measure inode.i_blocks as generic filesystems */
>  		inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 2669c785d548..8b0542d35148 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -83,6 +83,7 @@ struct erofs_sb_info {
>  
>  	struct erofs_sb_lz4_info lz4;
>  #endif	/* CONFIG_EROFS_FS_ZIP */
> +	struct dax_device *dax_dev;
>  	u32 blocks;
>  	u32 meta_blkaddr;
>  #ifdef CONFIG_EROFS_FS_XATTR
> @@ -115,6 +116,7 @@ struct erofs_sb_info {
>  /* Mount flags set via mount options or defaults */
>  #define EROFS_MOUNT_XATTR_USER		0x00000010
>  #define EROFS_MOUNT_POSIX_ACL		0x00000020
> +#define EROFS_MOUNT_DAX			0x00000040
>  
>  #define clear_opt(ctx, option)	((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
>  #define set_opt(ctx, option)	((ctx)->mount_opt |= EROFS_MOUNT_##option)
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index 8fc6c04b54f4..b44a964ab24f 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -11,6 +11,7 @@
>  #include <linux/crc32c.h>
>  #include <linux/fs_context.h>
>  #include <linux/fs_parser.h>
> +#include <linux/dax.h>
>  #include "xattr.h"
>  
>  #define CREATE_TRACE_POINTS
> @@ -355,6 +356,7 @@ enum {
>  	Opt_user_xattr,
>  	Opt_acl,
>  	Opt_cache_strategy,
> +	Opt_dax,
>  	Opt_err
>  };
>  
> @@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
>  	fsparam_flag_no("acl",		Opt_acl),
>  	fsparam_enum("cache_strategy",	Opt_cache_strategy,
>  		     erofs_param_cache_strategy),
> +	fsparam_flag("dax",             Opt_dax),
>  	{}
>  };
>  
> @@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
>  		ctx->cache_strategy = result.uint_32;
>  #else
>  		errorfc(fc, "compression not supported, cache_strategy ignored");
> +#endif
> +		break;
> +	case Opt_dax:
> +#ifdef CONFIG_FS_DAX
> +		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> +		set_opt(ctx, DAX);

You might want to allow 'dax=always' and 'dax=never' to maintain parity
with xfs/ext4's mount options...

--D

> +#else
> +		errorfc(fc, "dax options not supported");
>  #endif
>  		break;
>  	default:
> @@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
>  		return -ENOMEM;
>  
>  	sb->s_fs_info = sbi;
> +	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
>  	err = erofs_read_superblock(sb);
>  	if (err)
>  		return err;
>  
> +	if (test_opt(ctx, DAX) &&
> +	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
> +		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> +		clear_opt(ctx, DAX);
> +	}
> +
>  	sb->s_flags |= SB_RDONLY | SB_NOATIME;
>  	sb->s_maxbytes = MAX_LFS_FILESIZE;
>  	sb->s_time_gran = 1;
> @@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
>  	sbi = EROFS_SB(sb);
>  	if (!sbi)
>  		return;
> +	if (sbi->dax_dev)
> +		fs_put_dax(sbi->dax_dev);
>  	kfree(sbi);
>  	sb->s_fs_info = NULL;
>  }
> @@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
>  
>  static int erofs_show_options(struct seq_file *seq, struct dentry *root)
>  {
> -	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
> -	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
> +	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
> +	struct erofs_fs_context *ctx = &sbi->ctx;
>  
>  #ifdef CONFIG_EROFS_FS_XATTR
>  	if (test_opt(ctx, XATTR_USER))
> @@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
>  	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
>  		seq_puts(seq, ",cache_strategy=readaround");
>  #endif
> +	if (test_opt(ctx, DAX))
> +		seq_puts(seq, ",dax");
>  	return 0;
>  }
>  
> -- 
> 2.24.4
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
  2021-07-09  1:47       ` Darrick J. Wong
@ 2021-07-09  2:28         ` Gao Xiang
  -1 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-09  2:28 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-erofs, linux-fsdevel, LKML, nvdimm, Liu Bo, Joseph Qi, Liu Jiang

Hi Darrick,

On Thu, Jul 08, 2021 at 06:47:19PM -0700, Darrick J. Wong wrote:
> On Mon, Jul 05, 2021 at 09:21:53PM +0800, Gao Xiang wrote:

...

> >  	Opt_cache_strategy,
> > +	Opt_dax,
> >  	Opt_err
> >  };
> >  
> > @@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
> >  	fsparam_flag_no("acl",		Opt_acl),
> >  	fsparam_enum("cache_strategy",	Opt_cache_strategy,
> >  		     erofs_param_cache_strategy),
> > +	fsparam_flag("dax",             Opt_dax),
> >  	{}
> >  };
> >  
> > @@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
> >  		ctx->cache_strategy = result.uint_32;
> >  #else
> >  		errorfc(fc, "compression not supported, cache_strategy ignored");
> > +#endif
> > +		break;
> > +	case Opt_dax:
> > +#ifdef CONFIG_FS_DAX
> > +		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> > +		set_opt(ctx, DAX);
> 
> You might want to allow 'dax=always' and 'dax=never' to maintain parity
> with xfs/ext4's mount options...

Yeah, thanks for your suggestion. Will revise in the next version..

(Also, more use case details and development status about this scenario
 will be shown in the following months...)

Thanks,
Gao Xiang


> 
> --D
> 
> > +#else
> > +		errorfc(fc, "dax options not supported");
> >  #endif
> >  		break;
> >  	default:
> > @@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> >  		return -ENOMEM;
> >  
> >  	sb->s_fs_info = sbi;
> > +	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
> >  	err = erofs_read_superblock(sb);
> >  	if (err)
> >  		return err;
> >  
> > +	if (test_opt(ctx, DAX) &&
> > +	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
> > +		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> > +		clear_opt(ctx, DAX);
> > +	}
> > +
> >  	sb->s_flags |= SB_RDONLY | SB_NOATIME;
> >  	sb->s_maxbytes = MAX_LFS_FILESIZE;
> >  	sb->s_time_gran = 1;
> > @@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
> >  	sbi = EROFS_SB(sb);
> >  	if (!sbi)
> >  		return;
> > +	if (sbi->dax_dev)
> > +		fs_put_dax(sbi->dax_dev);
> >  	kfree(sbi);
> >  	sb->s_fs_info = NULL;
> >  }
> > @@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
> >  
> >  static int erofs_show_options(struct seq_file *seq, struct dentry *root)
> >  {
> > -	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
> > -	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
> > +	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
> > +	struct erofs_fs_context *ctx = &sbi->ctx;
> >  
> >  #ifdef CONFIG_EROFS_FS_XATTR
> >  	if (test_opt(ctx, XATTR_USER))
> > @@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
> >  	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
> >  		seq_puts(seq, ",cache_strategy=readaround");
> >  #endif
> > +	if (test_opt(ctx, DAX))
> > +		seq_puts(seq, ",dax");
> >  	return 0;
> >  }
> >  
> > -- 
> > 2.24.4
> > 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH v1.1 2/2] erofs: dax support for non-tailpacking regular file
@ 2021-07-09  2:28         ` Gao Xiang
  0 siblings, 0 replies; 13+ messages in thread
From: Gao Xiang @ 2021-07-09  2:28 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: nvdimm, LKML, Joseph Qi, Liu Bo, linux-fsdevel, Liu Jiang, linux-erofs

Hi Darrick,

On Thu, Jul 08, 2021 at 06:47:19PM -0700, Darrick J. Wong wrote:
> On Mon, Jul 05, 2021 at 09:21:53PM +0800, Gao Xiang wrote:

...

> >  	Opt_cache_strategy,
> > +	Opt_dax,
> >  	Opt_err
> >  };
> >  
> > @@ -370,6 +372,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
> >  	fsparam_flag_no("acl",		Opt_acl),
> >  	fsparam_enum("cache_strategy",	Opt_cache_strategy,
> >  		     erofs_param_cache_strategy),
> > +	fsparam_flag("dax",             Opt_dax),
> >  	{}
> >  };
> >  
> > @@ -410,6 +413,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
> >  		ctx->cache_strategy = result.uint_32;
> >  #else
> >  		errorfc(fc, "compression not supported, cache_strategy ignored");
> > +#endif
> > +		break;
> > +	case Opt_dax:
> > +#ifdef CONFIG_FS_DAX
> > +		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> > +		set_opt(ctx, DAX);
> 
> You might want to allow 'dax=always' and 'dax=never' to maintain parity
> with xfs/ext4's mount options...

Yeah, thanks for your suggestion. Will revise in the next version..

(Also, more use case details and development status about this scenario
 will be shown in the following months...)

Thanks,
Gao Xiang


> 
> --D
> 
> > +#else
> > +		errorfc(fc, "dax options not supported");
> >  #endif
> >  		break;
> >  	default:
> > @@ -496,10 +507,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> >  		return -ENOMEM;
> >  
> >  	sb->s_fs_info = sbi;
> > +	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
> >  	err = erofs_read_superblock(sb);
> >  	if (err)
> >  		return err;
> >  
> > +	if (test_opt(ctx, DAX) &&
> > +	    !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
> > +		errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> > +		clear_opt(ctx, DAX);
> > +	}
> > +
> >  	sb->s_flags |= SB_RDONLY | SB_NOATIME;
> >  	sb->s_maxbytes = MAX_LFS_FILESIZE;
> >  	sb->s_time_gran = 1;
> > @@ -609,6 +627,8 @@ static void erofs_kill_sb(struct super_block *sb)
> >  	sbi = EROFS_SB(sb);
> >  	if (!sbi)
> >  		return;
> > +	if (sbi->dax_dev)
> > +		fs_put_dax(sbi->dax_dev);
> >  	kfree(sbi);
> >  	sb->s_fs_info = NULL;
> >  }
> > @@ -711,8 +731,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
> >  
> >  static int erofs_show_options(struct seq_file *seq, struct dentry *root)
> >  {
> > -	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
> > -	struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
> > +	struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
> > +	struct erofs_fs_context *ctx = &sbi->ctx;
> >  
> >  #ifdef CONFIG_EROFS_FS_XATTR
> >  	if (test_opt(ctx, XATTR_USER))
> > @@ -734,6 +754,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
> >  	else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
> >  		seq_puts(seq, ",cache_strategy=readaround");
> >  #endif
> > +	if (test_opt(ctx, DAX))
> > +		seq_puts(seq, ",dax");
> >  	return 0;
> >  }
> >  
> > -- 
> > 2.24.4
> > 

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-07-09  2:28 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-04 13:50 [RFC PATCH 0/2] erofs: dio/dax support for non-tailpacking cases Gao Xiang
2021-07-04 13:50 ` Gao Xiang
2021-07-04 13:50 ` [RFC PATCH 1/2] erofs: iomap support for non-tailpacking DIO Gao Xiang
2021-07-04 13:50   ` Gao Xiang
2021-07-04 13:50 ` [RFC PATCH 2/2] erofs: dax support for non-tailpacking regular file Gao Xiang
2021-07-04 13:50   ` Gao Xiang
2021-07-04 16:00   ` kernel test robot
2021-07-05 13:21   ` [RFC PATCH v1.1 " Gao Xiang
2021-07-05 13:21     ` Gao Xiang
2021-07-09  1:47     ` Darrick J. Wong
2021-07-09  1:47       ` Darrick J. Wong
2021-07-09  2:28       ` Gao Xiang
2021-07-09  2:28         ` Gao Xiang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.