* [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
From: Dave Jiang @ 2016-12-15 23:40 UTC
  To: akpm; +Cc: jack, linux-nvdimm, david, linux-mm, tytso, hch

The caller into dax needs to clear the __GFP_FS bit in the fault
gfp_mask, since the caller is the one holding the locks / transactions
that a __GFP_FS allocation could recurse into through reclaim and
deadlock against.  The caller restores the original mask once the dax
function returns.
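
As an illustration, a minimal sketch of the pattern every handler in
this patch follows (example_dax_fault, example_mmap_sem and
example_iomap_ops are placeholder names, not symbols from this series):

static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	gfp_t old_gfp = vmf->gfp_mask;	/* save the mask we entered with */
	int ret;

	vmf->gfp_mask &= ~__GFP_FS;	/* fs locks held below: forbid fs reclaim */
	down_read(&example_mmap_sem);	/* placeholder for the fs mmap lock */
	ret = dax_iomap_fault(vma, vmf, &example_iomap_ops);
	up_read(&example_mmap_sem);
	vmf->gfp_mask = old_gfp;	/* restore before returning to the mm core */
	return ret;
}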

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/dax.c          |    1 +
 fs/ext2/file.c    |    9 ++++++++-
 fs/ext4/file.c    |   10 +++++++++-
 fs/xfs/xfs_file.c |   14 +++++++++++++-
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index d3fe880..6395bc6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1380,6 +1380,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+	vmf.gfp_mask &= ~__GFP_FS;
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b0f2415..8422d5f 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -92,16 +92,19 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	int ret;
+	gfp_t old_gfp = vmf->gfp_mask;
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
 		file_update_time(vma->vm_file);
 	}
+	vmf->gfp_mask &= ~__GFP_FS;
 	down_read(&ei->dax_sem);
 
 	ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
+	vmf->gfp_mask = old_gfp;
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(inode->i_sb);
 	return ret;
@@ -114,6 +117,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	loff_t size;
 	int ret;
+	gfp_t old_gfp = vmf->gfp_mask;
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
@@ -123,8 +127,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
-	else
+	else {
+		vmf->gfp_mask &= ~__GFP_FS;
 		ret = dax_pfn_mkwrite(vma, vmf);
+		vmf->gfp_mask = old_gfp;
+	}
 
 	up_read(&ei->dax_sem);
 	sb_end_pagefault(inode->i_sb);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d663d3d..a3f2bf0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -261,14 +261,17 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	gfp_t old_gfp = vmf->gfp_mask;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
+	vmf->gfp_mask &= ~__GFP_FS;
 	down_read(&EXT4_I(inode)->i_mmap_sem);
 	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
+	vmf->gfp_mask = old_gfp;
 	if (write)
 		sb_end_pagefault(sb);
 
@@ -320,8 +323,13 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
-	else
+	else {
+		gfp_t old_gfp = vmf->gfp_mask;
+
+		vmf->gfp_mask &= ~__GFP_FS;
 		ret = dax_pfn_mkwrite(vma, vmf);
+		vmf->gfp_mask = old_gfp;
+	}
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(sb);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d818c16..52202b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1474,7 +1474,11 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
+		gfp_t old_gfp = vmf->gfp_mask;
+
+		vmf->gfp_mask &= ~__GFP_FS;
 		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+		vmf->gfp_mask = old_gfp;
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1502,13 +1506,16 @@ xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
+		gfp_t old_gfp = vmf->gfp_mask;
 		/*
 		 * we do not want to trigger unwritten extent conversion on read
 		 * faults - that is unnecessary overhead and would also require
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
+		vmf->gfp_mask &= ~__GFP_FS;
 		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+		vmf->gfp_mask = old_gfp;
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1581,8 +1588,13 @@ xfs_filemap_pfn_mkwrite(
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
-	else if (IS_DAX(inode))
+	else if (IS_DAX(inode)) {
+		gfp_t old_gfp = vmf->gfp_mask;
+
+		vmf->gfp_mask &= ~__GFP_FS;
 		ret = dax_pfn_mkwrite(vma, vmf);
+		vmf->gfp_mask = old_gfp;
+	}
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;


* [PATCH v4 2/3] mm, dax: make pmd_fault() and friends be the same as fault()
From: Dave Jiang @ 2016-12-15 23:40 UTC
  To: akpm
  Cc: jack, linux-nvdimm, david, hch, linux-mm, tytso, ross.zwisler,
	dan.j.williams

Instead of passing multiple parameters into the pmd_fault() handler, a
vmf can be passed in, just as it is for the fault() handler.  This
simplifies the code and removes the need for the pmd fault handlers to
allocate a vmf of their own.  Related functions are modified to do the
same; the sketch below shows the resulting prototype.
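
For reference, the shape of the change to the vm_operations_struct hook
(a sketch of the before/after prototypes, not a further diff):

/* before: address, pmd and flags passed alongside the vma */
int (*pmd_fault)(struct vm_area_struct *vma, unsigned long address,
		 pmd_t *pmd, unsigned int flags);

/* after: everything travels in the vm_fault, as with fault() */
int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
/* callers fill in vmf->address, vmf->pmd and vmf->flags */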

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 drivers/dax/dax.c             |   16 ++++++---------
 fs/dax.c                      |   45 ++++++++++++++++++-----------------------
 fs/ext4/file.c                |   13 +++++++-----
 fs/xfs/xfs_file.c             |   13 ++++++------
 include/linux/dax.h           |    7 +++---
 include/linux/mm.h            |    3 +--
 include/trace/events/fs_dax.h |   15 ++++++--------
 mm/memory.c                   |    6 ++---
 8 files changed, 55 insertions(+), 63 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index c753a4c..947e49a 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -379,10 +379,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
-		unsigned int flags)
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	unsigned long pmd_addr = addr & PMD_MASK;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	phys_addr_t phys;
@@ -414,23 +413,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
-			flags & FAULT_FLAG_WRITE);
+	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int rc;
 	struct file *filp = vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-			current->comm, (flags & FAULT_FLAG_WRITE)
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read", vma->vm_start, vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 6395bc6..157f77f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1310,18 +1310,17 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
-	pgoff_t max_pgoff, pgoff;
-	struct vm_fault vmf;
+	pgoff_t max_pgoff, old_pgoff;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1331,10 +1330,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * supposed to hold locks serializing us with truncate / punch hole so
 	 * this is a reliable test.
 	 */
-	pgoff = linear_page_index(vma, pmd_addr);
+	old_pgoff = vmf->pgoff;
+	vmf->pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1346,13 +1346,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (vmf->pgoff > max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((vmf->pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
 	/*
@@ -1360,7 +1360,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
 	 */
-	pos = (loff_t)pgoff << PAGE_SHIFT;
+	pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
 		goto fallback;
@@ -1370,29 +1370,24 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 * the tree, for instance), it will return -EEXIST and we just fall
 	 * back to 4k entries.
 	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, RADIX_DAX_PMD);
 	if (IS_ERR(entry))
 		goto finish_iomap;
 
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto unlock_entry;
 
-	vmf.pgoff = pgoff;
-	vmf.flags = flags;
-	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
-	vmf.gfp_mask &= ~__GFP_FS;
-
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
-				&iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
+				vmf->address, &iomap, pos, write, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
-				&entry);
+		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
+				&iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1400,7 +1395,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
 		int copied = PMD_SIZE;
@@ -1418,12 +1413,12 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
-		split_huge_pmd(vma, pmd, address);
+		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff,
-			result);
+	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
+	vmf->pgoff = old_pgoff;
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a3f2bf0..ad56f90 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -278,22 +278,25 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return result;
 }
 
-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-						pmd_t *pmd, unsigned int flags)
+static int
+ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	bool write = flags & FAULT_FLAG_WRITE;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	gfp_t old_gfp = vmf->gfp_mask;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
+
+	vmf->gfp_mask &= ~__GFP_FS;
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-				     &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
+	vmf->gfp_mask = old_gfp;
 	if (write)
 		sb_end_pagefault(sb);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52202b4..27d395b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1533,29 +1533,30 @@ xfs_filemap_fault(
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
-	unsigned long		addr,
-	pmd_t			*pmd,
-	unsigned int		flags)
+	struct vm_fault *vmf)
 {
 	struct inode		*inode = file_inode(vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
+	gfp_t			old_gfp = vmf->gfp_mask;
 
 	if (!IS_DAX(inode))
 		return VM_FAULT_FALLBACK;
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	if (flags & FAULT_FLAG_WRITE) {
+	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
 		file_update_time(vma->vm_file);
 	}
 
+	vmf->gfp_mask &= ~__GFP_FS;
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	vmf->gfp_mask = old_gfp;
 
-	if (flags & FAULT_FLAG_WRITE)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(inode->i_sb);
 
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6e36b11..9761c90 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
 static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd, unsigned int flags,
-		struct iomap_ops *ops)
+		struct vm_fault *vmf, struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 30f416a..aef645b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -347,8 +347,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
-						pmd_t *, unsigned int flags);
+	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index c3b0aae..a98665b 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -8,9 +8,8 @@
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags, pgoff_t pgoff,
-		pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result),
+		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vma, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 		__entry->vm_start = vma->vm_start;
 		__entry->vm_end = vma->vm_end;
 		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->flags = flags;
-		__entry->pgoff = pgoff;
+		__entry->address = vmf->address;
+		__entry->flags = vmf->flags;
+		__entry->pgoff = vmf->pgoff;
 		__entry->max_pgoff = max_pgoff;
 		__entry->result = result;
 	),
@@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, unsigned int flags, pgoff_t pgoff, \
+		struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result))
+	TP_ARGS(inode, vma, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
diff --git a/mm/memory.c b/mm/memory.c
index e37250f..8ec36cf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3447,8 +3447,7 @@ static int create_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
-				vmf->flags);
+		return vma->vm_ops->pmd_fault(vma, vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3457,8 +3456,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
-				vmf->pmd, vmf->flags);
+		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);


* [PATCH v4 3/3] mm, dax: move pmd_fault() to take only vmf parameter
From: Dave Jiang @ 2016-12-15 23:40 UTC
  To: akpm; +Cc: jack, linux-nvdimm, david, linux-mm, tytso, hch

pmd_fault() and related functions really only need the vmf parameter,
since the additional parameters are all reachable through the vmf
struct.  Remove the extra parameters and simplify pmd_fault() and
friends; a handler then takes the shape sketched below.
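
A minimal handler shape after this change (example_pmd_fault and
example_iomap_ops are illustrative placeholders, not symbols added by
the patch):

static int example_pmd_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;	/* the vma now rides in vmf */
	struct inode *inode = file_inode(vma->vm_file);

	if (!IS_DAX(inode))
		return VM_FAULT_FALLBACK;
	return dax_iomap_pmd_fault(vmf, &example_iomap_ops);
}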

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/dax/dax.c             |   18 +++++++-------
 fs/dax.c                      |   54 +++++++++++++++++++----------------------
 fs/ext4/file.c                |    8 +++---
 fs/xfs/xfs_file.c             |    7 ++---
 include/linux/dax.h           |    7 ++---
 include/linux/mm.h            |    2 +-
 include/trace/events/fs_dax.h |   54 +++++++++++++++++++----------------------
 mm/memory.c                   |    9 +++----
 8 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 947e49a..55160f8 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -378,8 +378,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return rc;
 }
 
-static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
@@ -388,7 +387,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 	pgoff_t pgoff;
 	pfn_t pfn;
 
-	if (check_vma(dax_dev, vma, __func__))
+	if (check_vma(dax_dev, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
 	dax_region = dax_dev->region;
@@ -403,7 +402,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pgoff = linear_page_index(vma, pmd_addr);
+	pgoff = linear_page_index(vmf->vma, pmd_addr);
 	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
@@ -413,22 +412,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
 			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int dax_dev_pmd_fault(struct vm_fault *vmf)
 {
 	int rc;
-	struct file *filp = vma->vm_file;
+	struct file *filp = vmf->vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-			? "write" : "read", vma->vm_start, vma->vm_end);
+			? "write" : "read",
+			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
+	rc = __dax_dev_pmd_fault(dax_dev, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 157f77f..bc39809 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1226,11 +1226,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+		loff_t pos, void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
 	struct blk_dax_ctl dax = {
@@ -1257,31 +1256,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
-			dax.pfn, ret);
-	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
 
  unmap_fallback:
 	dax_unmap_atomic(bdev, &dax);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
-			length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
+			dax.pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, void **entryp)
+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+		void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 
-	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
@@ -1292,27 +1290,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (!pmd_none(*pmd)) {
+	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+	if (!pmd_none(*(vmf->pmd))) {
 		spin_unlock(ptl);
 		goto fallback;
 	}
 
-	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
-	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1334,7 +1332,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	vmf->pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1379,15 +1377,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
-				vmf->address, &iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
-				&iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1417,7 +1413,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
+	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
 	vmf->pgoff = old_pgoff;
 	return result;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ad56f90..14a8753 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -279,22 +279,22 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int
-ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ext4_dax_pmd_fault(struct vm_fault *vmf)
 {
 	int result;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	gfp_t old_gfp = vmf->gfp_mask;
 
 	if (write) {
 		sb_start_pagefault(sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	vmf->gfp_mask &= ~__GFP_FS;
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	vmf->gfp_mask = old_gfp;
 	if (write)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27d395b..8359b8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1532,10 +1532,9 @@ xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_pmd_fault(
-	struct vm_area_struct	*vma,
 	struct vm_fault *vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
 	gfp_t			old_gfp = vmf->gfp_mask;
@@ -1547,12 +1546,12 @@ xfs_filemap_pmd_fault(
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	vmf->gfp_mask &= ~__GFP_FS;
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	vmf->gfp_mask = old_gfp;
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9761c90..1ffdb4d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		struct vm_fault *vmf, struct iomap_ops *ops)
+static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aef645b..795f03e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -347,7 +347,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*pmd_fault)(struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index a98665b..c566ddc 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -7,9 +7,9 @@
 #include <linux/tracepoint.h>
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, vmf, max_pgoff, result),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_start = vma->vm_start;
-		__entry->vm_end = vma->vm_end;
-		__entry->vm_flags = vma->vm_flags;
+		__entry->vm_start = vmf->vma->vm_start;
+		__entry->vm_end = vmf->vma->vm_end;
+		__entry->vm_flags = vmf->vma->vm_flags;
 		__entry->address = vmf->address;
 		__entry->flags = vmf->flags;
 		__entry->pgoff = vmf->pgoff;
@@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		struct vm_fault *vmf, \
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, vmf, max_pgoff, result))
+	TP_ARGS(inode, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
 
 DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, struct page *zero_page,
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		struct page *zero_page,
 		void *radix_entry),
-	TP_ARGS(inode, vma, address, zero_page, radix_entry),
+	TP_ARGS(inode, vmf, zero_page, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
 		__entry->zero_page = zero_page;
 		__entry->radix_entry = radix_entry;
 	),
@@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 
 #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
 DEFINE_EVENT(dax_pmd_load_hole_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, struct page *zero_page, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, zero_page, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		struct page *zero_page, void *radix_entry), \
+	TP_ARGS(inode, vmf, zero_page, radix_entry))
 
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
 
 DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, int write, long length, pfn_t pfn,
-		void *radix_entry),
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		long length, pfn_t pfn, void *radix_entry),
+	TP_ARGS(inode, vmf, length, pfn, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->write = write;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
+		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
 		__entry->length = length;
 		__entry->pfn_val = pfn.val;
 		__entry->radix_entry = radix_entry;
@@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 
 #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
 DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, int write, long length, pfn_t pfn, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		long length, pfn_t pfn, void *radix_entry), \
+	TP_ARGS(inode, vmf, length, pfn, radix_entry))
 
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
diff --git a/mm/memory.c b/mm/memory.c
index 8ec36cf..e929c41 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3443,11 +3443,10 @@ static int do_numa_page(struct vm_fault *vmf)
 
 static int create_huge_pmd(struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	if (vma_is_anonymous(vma))
+	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf);
+	if (vmf->vma->vm_ops->pmd_fault)
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3456,7 +3455,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v4 3/3] mm, dax: move pmd_fault() to take only vmf parameter
@ 2016-12-15 23:40   ` Dave Jiang
  0 siblings, 0 replies; 23+ messages in thread
From: Dave Jiang @ 2016-12-15 23:40 UTC (permalink / raw)
  To: akpm
  Cc: jack, linux-nvdimm, david, hch, linux-mm, tytso, ross.zwisler,
	dan.j.williams

pmd_fault() and related functions really only need the vmf parameter since
the additional parameters are all included in the vmf struct. Removing
additional parameter and simplify pmd_fault() and friends.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/dax/dax.c             |   18 +++++++-------
 fs/dax.c                      |   54 +++++++++++++++++++----------------------
 fs/ext4/file.c                |    8 +++---
 fs/xfs/xfs_file.c             |    7 ++---
 include/linux/dax.h           |    7 ++---
 include/linux/mm.h            |    2 +-
 include/trace/events/fs_dax.h |   54 +++++++++++++++++++----------------------
 mm/memory.c                   |    9 +++----
 8 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 947e49a..55160f8 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -378,8 +378,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return rc;
 }
 
-static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 {
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
@@ -388,7 +387,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 	pgoff_t pgoff;
 	pfn_t pfn;
 
-	if (check_vma(dax_dev, vma, __func__))
+	if (check_vma(dax_dev, vmf->vma, __func__))
 		return VM_FAULT_SIGBUS;
 
 	dax_region = dax_dev->region;
@@ -403,7 +402,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pgoff = linear_page_index(vma, pmd_addr);
+	pgoff = linear_page_index(vmf->vma, pmd_addr);
 	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
 	if (phys == -1) {
 		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
@@ -413,22 +412,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
 			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int dax_dev_pmd_fault(struct vm_fault *vmf)
 {
 	int rc;
-	struct file *filp = vma->vm_file;
+	struct file *filp = vmf->vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
 			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
-			? "write" : "read", vma->vm_start, vma->vm_end);
+			? "write" : "read",
+			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
+	rc = __dax_dev_pmd_fault(dax_dev, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index 157f77f..bc39809 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1226,11 +1226,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+		loff_t pos, void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
 	struct blk_dax_ctl dax = {
@@ -1257,31 +1256,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
-			dax.pfn, ret);
-	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
 
  unmap_fallback:
 	dax_unmap_atomic(bdev, &dax);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
-			length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
+			dax.pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, void **entryp)
+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+		void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 
-	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
@@ -1292,27 +1290,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 		goto fallback;
 	*entryp = ret;
 
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (!pmd_none(*pmd)) {
+	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+	if (!pmd_none(*(vmf->pmd))) {
 		spin_unlock(ptl);
 		goto fallback;
 	}
 
-	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
-	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1334,7 +1332,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	vmf->pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1379,15 +1377,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
-				vmf->address, &iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
-				&iomap, &entry);
+		result = dax_pmd_load_hole(vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1417,7 +1413,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
+	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
 	vmf->pgoff = old_pgoff;
 	return result;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ad56f90..14a8753 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -279,22 +279,22 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int
-ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ext4_dax_pmd_fault(struct vm_fault *vmf)
 {
 	int result;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	gfp_t old_gfp = vmf->gfp_mask;
 
 	if (write) {
 		sb_start_pagefault(sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	vmf->gfp_mask &= ~__GFP_FS;
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	vmf->gfp_mask = old_gfp;
 	if (write)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27d395b..8359b8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1532,10 +1532,9 @@ xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_pmd_fault(
-	struct vm_area_struct	*vma,
 	struct vm_fault *vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
 	gfp_t			old_gfp = vmf->gfp_mask;
@@ -1547,12 +1546,12 @@ xfs_filemap_pmd_fault(
 
 	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	vmf->gfp_mask &= ~__GFP_FS;
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	vmf->gfp_mask = old_gfp;
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9761c90..1ffdb4d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
-static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		struct vm_fault *vmf, struct iomap_ops *ops)
+static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aef645b..795f03e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -347,7 +347,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	int (*pmd_fault)(struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index a98665b..c566ddc 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -7,9 +7,9 @@
 #include <linux/tracepoint.h>
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, vmf, max_pgoff, result),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_start = vma->vm_start;
-		__entry->vm_end = vma->vm_end;
-		__entry->vm_flags = vma->vm_flags;
+		__entry->vm_start = vmf->vma->vm_start;
+		__entry->vm_end = vmf->vma->vm_end;
+		__entry->vm_flags = vmf->vma->vm_flags;
 		__entry->address = vmf->address;
 		__entry->flags = vmf->flags;
 		__entry->pgoff = vmf->pgoff;
@@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		struct vm_fault *vmf, \
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, vmf, max_pgoff, result))
+	TP_ARGS(inode, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
 
 DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, struct page *zero_page,
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		struct page *zero_page,
 		void *radix_entry),
-	TP_ARGS(inode, vma, address, zero_page, radix_entry),
+	TP_ARGS(inode, vmf, zero_page, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
 		__entry->zero_page = zero_page;
 		__entry->radix_entry = radix_entry;
 	),
@@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 
 #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
 DEFINE_EVENT(dax_pmd_load_hole_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, struct page *zero_page, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, zero_page, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		struct page *zero_page, void *radix_entry), \
+	TP_ARGS(inode, vmf, zero_page, radix_entry))
 
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
 
 DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, int write, long length, pfn_t pfn,
-		void *radix_entry),
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
+		long length, pfn_t pfn, void *radix_entry),
+	TP_ARGS(inode, vmf, length, pfn, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
@@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 	TP_fast_assign(
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
-		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->write = write;
+		__entry->vm_flags = vmf->vma->vm_flags;
+		__entry->address = vmf->address;
+		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
 		__entry->length = length;
 		__entry->pfn_val = pfn.val;
 		__entry->radix_entry = radix_entry;
@@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 
 #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
 DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
-	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, int write, long length, pfn_t pfn, \
-		void *radix_entry), \
-	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
+		long length, pfn_t pfn, void *radix_entry), \
+	TP_ARGS(inode, vmf, length, pfn, radix_entry))
 
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
diff --git a/mm/memory.c b/mm/memory.c
index 8ec36cf..e929c41 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3443,11 +3443,10 @@ static int do_numa_page(struct vm_fault *vmf)
 
 static int create_huge_pmd(struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	if (vma_is_anonymous(vma))
+	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf);
+	if (vmf->vma->vm_ops->pmd_fault)
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3456,7 +3455,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
+		return vmf->vma->vm_ops->pmd_fault(vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
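
For readers following the diff above: with the new prototype, everything a
handler needs now travels in struct vm_fault. As a minimal sketch of a
conforming handler (example_pmd_fault and example_vm_ops are hypothetical
names, not part of this series):

	#include <linux/mm.h>

	static int example_pmd_fault(struct vm_fault *vmf)
	{
		/* the vma is no longer passed separately; it lives in vmf */
		struct vm_area_struct *vma = vmf->vma;

		/* COW of huge pages is handled at PTE level, so fall back */
		if ((vmf->flags & FAULT_FLAG_WRITE) &&
		    !(vma->vm_flags & VM_SHARED))
			return VM_FAULT_FALLBACK;

		/* device- or fs-specific PMD mapping work would go here */
		return VM_FAULT_NOPAGE;
	}

	static const struct vm_operations_struct example_vm_ops = {
		.pmd_fault	= example_pmd_fault,
	};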

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-15 23:40 ` Dave Jiang
@ 2016-12-16  1:07   ` Dave Chinner
  -1 siblings, 0 replies; 23+ messages in thread
From: Dave Chinner @ 2016-12-16  1:07 UTC (permalink / raw)
  To: Dave Jiang; +Cc: tytso, linux-nvdimm, linux-mm, jack, akpm, hch

On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> The caller into dax needs to clear __GFP_FS mask bit since it's
> responsible for acquiring locks / transactions that blocks __GFP_FS
> allocation.  The caller will restore the original mask when dax function
> returns.

What's the allocation problem you're working around here? Can you
please describe the call chain that is the problem?

>  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  
>  	if (IS_DAX(inode)) {
> +		gfp_t old_gfp = vmf->gfp_mask;
> +
> +		vmf->gfp_mask &= ~__GFP_FS;
>  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> +		vmf->gfp_mask = old_gfp;

I really have to say that I hate code that clears and restores flags
without any explanation of why the code needs to play flag tricks. I
take one look at the XFS fault handling code and ask myself now "why
the hell do we need to clear those flags?" Especially as the other
paths into generic fault handlers /don't/ require us to do this.
What does DAX do that requires us to treat memory allocation contexts
differently to the filemap_fault() path?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-16  1:07   ` Dave Chinner
@ 2016-12-16 16:19     ` Ross Zwisler
  -1 siblings, 0 replies; 23+ messages in thread
From: Ross Zwisler @ 2016-12-16 16:19 UTC (permalink / raw)
  To: Dave Chinner; +Cc: jack, linux-nvdimm, linux-mm, tytso, akpm, hch

On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > The caller into dax needs to clear __GFP_FS mask bit since it's
> > responsible for acquiring locks / transactions that blocks __GFP_FS
> > allocation.  The caller will restore the original mask when dax function
> > returns.
> 
> What's the allocation problem you're working around here? Can you
> please describe the call chain that is the problem?
> 
> >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> >  
> >  	if (IS_DAX(inode)) {
> > +		gfp_t old_gfp = vmf->gfp_mask;
> > +
> > +		vmf->gfp_mask &= ~__GFP_FS;
> >  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> > +		vmf->gfp_mask = old_gfp;
> 
> I really have to say that I hate code that clears and restores flags
> without any explanation of why the code needs to play flag tricks. I
> take one look at the XFS fault handling code and ask myself now "why
> the hell do we need to clear those flags?" Especially as the other
> paths into generic fault handlers /don't/ require us to do this.
> What does DAX do that require us to treat memory allocation contexts
> differently to the filemap_fault() path?

This was done in response to Jan Kara's concern:

  The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
  because at that point it is correct. But once we grab filesystem locks which
  are not reclaim safe, we should update vmf->gfp_mask we pass further down
  into DAX code to not contain __GFP_FS (that's a bug we apparently have
  there). And inside DAX code, we definitely are not generally safe to add
  __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
  vm_fault into this function, using passed gfp_mask there and make sure
  callers update gfp_mask as appropriate.

https://lkml.org/lkml/2016/10/4/37

IIUC I think the concern is that, for example, in xfs_filemap_page_mkwrite()
we take a read lock on the struct inode's i_rwsem before we call
dax_iomap_fault().

dax_iomap_fault() then calls find_or_create_page(), etc. with the
vmf->gfp_mask we were given.

I believe the concern is that if that memory allocation tries to do FS
operations to free memory because __GFP_FS is part of the gfp mask, then we
could end up deadlocking because we are already holding FS locks.
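
As a rough sketch of that pattern (example_fs_lock and example_iomap_ops are
made-up placeholders, not real kernel symbols), the save/clear/restore dance
around the DAX call looks like this:

	#include <linux/dax.h>
	#include <linux/mm.h>
	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_fs_lock);		/* stand-in fs lock */
	extern struct iomap_ops example_iomap_ops;	/* stand-in ops table */

	static int example_dax_fault(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
	{
		gfp_t old_gfp = vmf->gfp_mask;
		int ret;

		/* fs lock held: reclaim below must not re-enter the fs */
		down_read(&example_fs_lock);

		vmf->gfp_mask &= ~__GFP_FS;	/* no fs reclaim recursion */
		ret = dax_iomap_fault(vma, vmf, &example_iomap_ops);
		vmf->gfp_mask = old_gfp;	/* restore the caller's mask */

		up_read(&example_fs_lock);
		return ret;
	}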

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-16 16:19     ` Ross Zwisler
@ 2016-12-16 22:04       ` Dave Chinner
  -1 siblings, 0 replies; 23+ messages in thread
From: Dave Chinner @ 2016-12-16 22:04 UTC (permalink / raw)
  To: Ross Zwisler; +Cc: jack, linux-nvdimm, linux-mm, tytso, akpm, hch

On Fri, Dec 16, 2016 at 09:19:16AM -0700, Ross Zwisler wrote:
> On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> > On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > > The caller into dax needs to clear __GFP_FS mask bit since it's
> > > responsible for acquiring locks / transactions that blocks __GFP_FS
> > > allocation.  The caller will restore the original mask when dax function
> > > returns.
> > 
> > What's the allocation problem you're working around here? Can you
> > please describe the call chain that is the problem?
> > 
> > >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > >  
> > >  	if (IS_DAX(inode)) {
> > > +		gfp_t old_gfp = vmf->gfp_mask;
> > > +
> > > +		vmf->gfp_mask &= ~__GFP_FS;
> > >  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> > > +		vmf->gfp_mask = old_gfp;
> > 
> > I really have to say that I hate code that clears and restores flags
> > without any explanation of why the code needs to play flag tricks. I
> > take one look at the XFS fault handling code and ask myself now "why
> > the hell do we need to clear those flags?" Especially as the other
> > paths into generic fault handlers /don't/ require us to do this.
> > What does DAX do that require us to treat memory allocation contexts
> > differently to the filemap_fault() path?
> 
> This was done in response to Jan Kara's concern:
> 
>   The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
>   because at that point it is correct. But once we grab filesystem locks which
>   are not reclaim safe, we should update vmf->gfp_mask we pass further down
>   into DAX code to not contain __GFP_FS (that's a bug we apparently have
>   there). And inside DAX code, we definitely are not generally safe to add
>   __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
>   vm_fault into this function, using passed gfp_mask there and make sure
>   callers update gfp_mask as appropriate.
> 
> https://lkml.org/lkml/2016/10/4/37
> 
> IIUC I think the concern is that, for example, in xfs_filemap_page_mkwrite()
> we take a read lock on the struct inode.i_rwsem before we call
> dax_iomap_fault().

That, my friends, is exactly the problem that mapping_gfp_mask() is
meant to solve. This:

> > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS |  __GFP_IO;

Is just so wrong it's not funny.

The whole point of mapping_gfp_mask() is to remove flags from the
gfp_mask used to do mapping+page cache related allocations that the
mapping->host considers dangerous when the host may be holding locks.
This includes mapping tree allocations, and anything else required
to set up a new entry in the mapping during IO path operations. That
includes page fault operations...

e.g. in xfs_setup_inode():

        /*
         * Ensure all page cache allocations are done from GFP_NOFS context to
         * prevent direct reclaim recursion back into the filesystem and blowing
         * stacks or deadlocking.
         */
        gfp_mask = mapping_gfp_mask(inode->i_mapping);
        mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));

i.e. XFS considers it invalid to use GFP_FS at all for mapping
allocations in the io path, because we *know* that we hold
filesystem locks over those allocations.

> dax_iomap_fault() then calls find_or_create_page(), etc. with the
> vfm->gfp_mask we were given.

Yup. Precisely why we should be using mapping_gfp_mask() as it was
intended for vmf.gfp_mask....

> I believe the concern is that if that memory allocation tries to do FS
> operations to free memory because __GFP_FS is part of the gfp mask, then we
> could end up deadlocking because we are already holding FS locks.

Which is a problem with the filesystem mapping mask setup, not a
reason to sprinkle random gfp_mask clear/set pairs around the code.
i.e. For DAX inodes, the mapping mask should clear __GFP_FS as XFS
does above, and the mapping_gfp_mask() should be used unadulterated
by the DAX page fault code....
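
A rough sketch of that alternative, assuming a hypothetical helper run once
at inode setup time (example_setup_dax_inode is not a real kernel function):

	#include <linux/pagemap.h>

	/* at inode initialisation, mirroring xfs_setup_inode() above */
	static void example_setup_dax_inode(struct inode *inode)
	{
		gfp_t gfp_mask = mapping_gfp_mask(inode->i_mapping);

		/* page cache allocations for this mapping never use __GFP_FS */
		mapping_set_gfp_mask(inode->i_mapping, gfp_mask & ~__GFP_FS);
	}

	/*
	 * The generic DAX fault setup could then take the mask as-is,
	 * with no per-call-site clearing and restoring:
	 *
	 *	vmf.gfp_mask = mapping_gfp_mask(mapping);
	 */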

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [PATCH v4 3/3] mm, dax: move pmd_fault() to take only vmf parameter
  2016-12-15 23:40   ` Dave Jiang
@ 2016-12-19 17:41     ` Jan Kara
  -1 siblings, 0 replies; 23+ messages in thread
From: Jan Kara @ 2016-12-19 17:41 UTC (permalink / raw)
  To: Dave Jiang; +Cc: jack, linux-nvdimm, david, linux-mm, tytso, akpm, hch

On Thu 15-12-16 16:40:54, Dave Jiang wrote:
> pmd_fault() and related functions really only need the vmf parameter since
> the additional parameters are all included in the vmf struct. Remove the
> additional parameter and simplify pmd_fault() and friends.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

I think I already gave you my reviewed-by tag for this but whatever. Here
it is again:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
> ---
>  drivers/dax/dax.c             |   18 +++++++-------
>  fs/dax.c                      |   54 +++++++++++++++++++----------------------
>  fs/ext4/file.c                |    8 +++---
>  fs/xfs/xfs_file.c             |    7 ++---
>  include/linux/dax.h           |    7 ++---
>  include/linux/mm.h            |    2 +-
>  include/trace/events/fs_dax.h |   54 +++++++++++++++++++----------------------
>  mm/memory.c                   |    9 +++----
>  8 files changed, 74 insertions(+), 85 deletions(-)
> 
> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> index 947e49a..55160f8 100644
> --- a/drivers/dax/dax.c
> +++ b/drivers/dax/dax.c
> @@ -378,8 +378,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	return rc;
>  }
>  
> -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
> -		struct vm_area_struct *vma, struct vm_fault *vmf)
> +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
>  {
>  	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	struct device *dev = &dax_dev->dev;
> @@ -388,7 +387,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  	pgoff_t pgoff;
>  	pfn_t pfn;
>  
> -	if (check_vma(dax_dev, vma, __func__))
> +	if (check_vma(dax_dev, vmf->vma, __func__))
>  		return VM_FAULT_SIGBUS;
>  
>  	dax_region = dax_dev->region;
> @@ -403,7 +402,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  		return VM_FAULT_SIGBUS;
>  	}
>  
> -	pgoff = linear_page_index(vma, pmd_addr);
> +	pgoff = linear_page_index(vmf->vma, pmd_addr);
>  	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
>  	if (phys == -1) {
>  		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
> @@ -413,22 +412,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  
>  	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>  
> -	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> +	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
>  			vmf->flags & FAULT_FLAG_WRITE);
>  }
>  
> -static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +static int dax_dev_pmd_fault(struct vm_fault *vmf)
>  {
>  	int rc;
> -	struct file *filp = vma->vm_file;
> +	struct file *filp = vmf->vma->vm_file;
>  	struct dax_dev *dax_dev = filp->private_data;
>  
>  	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
>  			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
> -			? "write" : "read", vma->vm_start, vma->vm_end);
> +			? "write" : "read",
> +			vmf->vma->vm_start, vmf->vma->vm_end);
>  
>  	rcu_read_lock();
> -	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
> +	rc = __dax_dev_pmd_fault(dax_dev, vmf);
>  	rcu_read_unlock();
>  
>  	return rc;
> diff --git a/fs/dax.c b/fs/dax.c
> index 157f77f..bc39809 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1226,11 +1226,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
>   */
>  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
>  
> -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
> -		struct vm_fault *vmf, unsigned long address,
> -		struct iomap *iomap, loff_t pos, bool write, void **entryp)
> +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
> +		loff_t pos, void **entryp)
>  {
> -	struct address_space *mapping = vma->vm_file->f_mapping;
> +	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
>  	struct block_device *bdev = iomap->bdev;
>  	struct inode *inode = mapping->host;
>  	struct blk_dax_ctl dax = {
> @@ -1257,31 +1256,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto fallback;
>  	*entryp = ret;
>  
> -	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
> -			dax.pfn, ret);
> -	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
> +	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
> +	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
> +			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
>  
>   unmap_fallback:
>  	dax_unmap_atomic(bdev, &dax);
>  fallback:
> -	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
> -			length, dax.pfn, ret);
> +	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
> +			dax.pfn, ret);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
> -		struct vm_fault *vmf, unsigned long address,
> -		struct iomap *iomap, void **entryp)
> +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
> +		void **entryp)
>  {
> -	struct address_space *mapping = vma->vm_file->f_mapping;
> -	unsigned long pmd_addr = address & PMD_MASK;
> +	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> +	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	struct inode *inode = mapping->host;
>  	struct page *zero_page;
>  	void *ret = NULL;
>  	spinlock_t *ptl;
>  	pmd_t pmd_entry;
>  
> -	zero_page = mm_get_huge_zero_page(vma->vm_mm);
> +	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
>  
>  	if (unlikely(!zero_page))
>  		goto fallback;
> @@ -1292,27 +1290,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto fallback;
>  	*entryp = ret;
>  
> -	ptl = pmd_lock(vma->vm_mm, pmd);
> -	if (!pmd_none(*pmd)) {
> +	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
> +	if (!pmd_none(*(vmf->pmd))) {
>  		spin_unlock(ptl);
>  		goto fallback;
>  	}
>  
> -	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
> +	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
>  	pmd_entry = pmd_mkhuge(pmd_entry);
> -	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
> +	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
>  	spin_unlock(ptl);
> -	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
> +	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
>  	return VM_FAULT_NOPAGE;
>  
>  fallback:
> -	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
> +	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -		struct iomap_ops *ops)
> +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
>  {
> +	struct vm_area_struct *vma = vmf->vma;
>  	struct address_space *mapping = vma->vm_file->f_mapping;
>  	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> @@ -1334,7 +1332,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	vmf->pgoff = linear_page_index(vma, pmd_addr);
>  	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
>  
> -	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
> +	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
>  
>  	/* Fall back to PTEs if we're going to COW */
>  	if (write && !(vma->vm_flags & VM_SHARED))
> @@ -1379,15 +1377,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  
>  	switch (iomap.type) {
>  	case IOMAP_MAPPED:
> -		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
> -				vmf->address, &iomap, pos, write, &entry);
> +		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
>  		break;
>  	case IOMAP_UNWRITTEN:
>  	case IOMAP_HOLE:
>  		if (WARN_ON_ONCE(write))
>  			goto unlock_entry;
> -		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
> -				&iomap, &entry);
> +		result = dax_pmd_load_hole(vmf, &iomap, &entry);
>  		break;
>  	default:
>  		WARN_ON_ONCE(1);
> @@ -1417,7 +1413,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		count_vm_event(THP_FAULT_FALLBACK);
>  	}
>  out:
> -	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
> +	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
>  	vmf->pgoff = old_pgoff;
>  	return result;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index ad56f90..14a8753 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -279,22 +279,22 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  }
>  
>  static int
> -ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +ext4_dax_pmd_fault(struct vm_fault *vmf)
>  {
>  	int result;
> -	struct inode *inode = file_inode(vma->vm_file);
> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct super_block *sb = inode->i_sb;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	gfp_t old_gfp = vmf->gfp_mask;
>  
>  	if (write) {
>  		sb_start_pagefault(sb);
> -		file_update_time(vma->vm_file);
> +		file_update_time(vmf->vma->vm_file);
>  	}
>  
>  	vmf->gfp_mask &= ~__GFP_FS;
>  	down_read(&EXT4_I(inode)->i_mmap_sem);
> -	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
> +	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
>  	up_read(&EXT4_I(inode)->i_mmap_sem);
>  	vmf->gfp_mask = old_gfp;
>  	if (write)
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 27d395b..8359b8f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1532,10 +1532,9 @@ xfs_filemap_fault(
>   */
>  STATIC int
>  xfs_filemap_pmd_fault(
> -	struct vm_area_struct	*vma,
>  	struct vm_fault *vmf)
>  {
> -	struct inode		*inode = file_inode(vma->vm_file);
> +	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			ret;
>  	gfp_t			old_gfp = vmf->gfp_mask;
> @@ -1547,12 +1546,12 @@ xfs_filemap_pmd_fault(
>  
>  	if (vmf->flags & FAULT_FLAG_WRITE) {
>  		sb_start_pagefault(inode->i_sb);
> -		file_update_time(vma->vm_file);
> +		file_update_time(vmf->vma->vm_file);
>  	}
>  
>  	vmf->gfp_mask &= ~__GFP_FS;
>  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
> +	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
>  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	vmf->gfp_mask = old_gfp;
>  
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 9761c90..1ffdb4d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry)
>  		return PMD_SHIFT - PAGE_SHIFT;
>  	return 0;
>  }
> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -		struct iomap_ops *ops);
> +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops);
>  #else
>  static inline unsigned int dax_radix_order(void *entry)
>  {
>  	return 0;
>  }
> -static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
> -		struct vm_fault *vmf, struct iomap_ops *ops)
> +static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
> +		struct iomap_ops *ops)
>  {
>  	return VM_FAULT_FALLBACK;
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index aef645b..795f03e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -347,7 +347,7 @@ struct vm_operations_struct {
>  	void (*close)(struct vm_area_struct * area);
>  	int (*mremap)(struct vm_area_struct * area);
>  	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
> -	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
> +	int (*pmd_fault)(struct vm_fault *vmf);
>  	void (*map_pages)(struct vm_fault *vmf,
>  			pgoff_t start_pgoff, pgoff_t end_pgoff);
>  
> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
> index a98665b..c566ddc 100644
> --- a/include/trace/events/fs_dax.h
> +++ b/include/trace/events/fs_dax.h
> @@ -7,9 +7,9 @@
>  #include <linux/tracepoint.h>
>  
>  DECLARE_EVENT_CLASS(dax_pmd_fault_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
> -	TP_ARGS(inode, vma, vmf, max_pgoff, result),
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		pgoff_t max_pgoff, int result),
> +	TP_ARGS(inode, vmf, max_pgoff, result),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_start)
> @@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_start = vma->vm_start;
> -		__entry->vm_end = vma->vm_end;
> -		__entry->vm_flags = vma->vm_flags;
> +		__entry->vm_start = vmf->vma->vm_start;
> +		__entry->vm_end = vmf->vma->vm_end;
> +		__entry->vm_flags = vmf->vma->vm_flags;
>  		__entry->address = vmf->address;
>  		__entry->flags = vmf->flags;
>  		__entry->pgoff = vmf->pgoff;
> @@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
>  
>  #define DEFINE_PMD_FAULT_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_fault_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		struct vm_fault *vmf, \
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
>  		pgoff_t max_pgoff, int result), \
> -	TP_ARGS(inode, vma, vmf, max_pgoff, result))
> +	TP_ARGS(inode, vmf, max_pgoff, result))
>  
>  DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
>  DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
>  
>  DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		unsigned long address, struct page *zero_page,
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		struct page *zero_page,
>  		void *radix_entry),
> -	TP_ARGS(inode, vma, address, zero_page, radix_entry),
> +	TP_ARGS(inode, vmf, zero_page, radix_entry),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_flags)
> @@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_flags = vma->vm_flags;
> -		__entry->address = address;
> +		__entry->vm_flags = vmf->vma->vm_flags;
> +		__entry->address = vmf->address;
>  		__entry->zero_page = zero_page;
>  		__entry->radix_entry = radix_entry;
>  	),
> @@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
>  
>  #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_load_hole_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		unsigned long address, struct page *zero_page, \
> -		void *radix_entry), \
> -	TP_ARGS(inode, vma, address, zero_page, radix_entry))
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
> +		struct page *zero_page, void *radix_entry), \
> +	TP_ARGS(inode, vmf, zero_page, radix_entry))
>  
>  DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
>  DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
>  
>  DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		unsigned long address, int write, long length, pfn_t pfn,
> -		void *radix_entry),
> -	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		long length, pfn_t pfn, void *radix_entry),
> +	TP_ARGS(inode, vmf, length, pfn, radix_entry),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_flags)
> @@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_flags = vma->vm_flags;
> -		__entry->address = address;
> -		__entry->write = write;
> +		__entry->vm_flags = vmf->vma->vm_flags;
> +		__entry->address = vmf->address;
> +		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
>  		__entry->length = length;
>  		__entry->pfn_val = pfn.val;
>  		__entry->radix_entry = radix_entry;
> @@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
>  
>  #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		unsigned long address, int write, long length, pfn_t pfn, \
> -		void *radix_entry), \
> -	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
> +		long length, pfn_t pfn, void *radix_entry), \
> +	TP_ARGS(inode, vmf, length, pfn, radix_entry))
>  
>  DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
>  DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
> diff --git a/mm/memory.c b/mm/memory.c
> index 8ec36cf..e929c41 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3443,11 +3443,10 @@ static int do_numa_page(struct vm_fault *vmf)
>  
>  static int create_huge_pmd(struct vm_fault *vmf)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
> -	if (vma_is_anonymous(vma))
> +	if (vma_is_anonymous(vmf->vma))
>  		return do_huge_pmd_anonymous_page(vmf);
> -	if (vma->vm_ops->pmd_fault)
> -		return vma->vm_ops->pmd_fault(vma, vmf);
> +	if (vmf->vma->vm_ops->pmd_fault)
> +		return vmf->vma->vm_ops->pmd_fault(vmf);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> @@ -3456,7 +3455,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
>  	if (vma_is_anonymous(vmf->vma))
>  		return do_huge_pmd_wp_page(vmf, orig_pmd);
>  	if (vmf->vma->vm_ops->pmd_fault)
> -		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
> +		return vmf->vma->vm_ops->pmd_fault(vmf);
>  
>  	/* COW handled on pte level: split pmd */
>  	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v4 3/3] mm, dax: move pmd_fault() to take only vmf parameter
@ 2016-12-19 17:41     ` Jan Kara
  0 siblings, 0 replies; 23+ messages in thread
From: Jan Kara @ 2016-12-19 17:41 UTC (permalink / raw)
  To: Dave Jiang
  Cc: akpm, jack, linux-nvdimm, david, hch, linux-mm, tytso,
	ross.zwisler, dan.j.williams

On Thu 15-12-16 16:40:54, Dave Jiang wrote:
> pmd_fault() and related functions really only need the vmf parameter since
> the additional parameters are all included in the vmf struct. Removing
> additional parameter and simplify pmd_fault() and friends.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

I think I already gave you my reviewed-by tag for this but whatever. Here
it is again:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
> ---
>  drivers/dax/dax.c             |   18 +++++++-------
>  fs/dax.c                      |   54 +++++++++++++++++++----------------------
>  fs/ext4/file.c                |    8 +++---
>  fs/xfs/xfs_file.c             |    7 ++---
>  include/linux/dax.h           |    7 ++---
>  include/linux/mm.h            |    2 +-
>  include/trace/events/fs_dax.h |   54 +++++++++++++++++++----------------------
>  mm/memory.c                   |    9 +++----
>  8 files changed, 74 insertions(+), 85 deletions(-)
> 
> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> index 947e49a..55160f8 100644
> --- a/drivers/dax/dax.c
> +++ b/drivers/dax/dax.c
> @@ -378,8 +378,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	return rc;
>  }
>  
> -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
> -		struct vm_area_struct *vma, struct vm_fault *vmf)
> +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
>  {
>  	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	struct device *dev = &dax_dev->dev;
> @@ -388,7 +387,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  	pgoff_t pgoff;
>  	pfn_t pfn;
>  
> -	if (check_vma(dax_dev, vma, __func__))
> +	if (check_vma(dax_dev, vmf->vma, __func__))
>  		return VM_FAULT_SIGBUS;
>  
>  	dax_region = dax_dev->region;
> @@ -403,7 +402,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  		return VM_FAULT_SIGBUS;
>  	}
>  
> -	pgoff = linear_page_index(vma, pmd_addr);
> +	pgoff = linear_page_index(vmf->vma, pmd_addr);
>  	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
>  	if (phys == -1) {
>  		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
> @@ -413,22 +412,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  
>  	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>  
> -	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> +	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
>  			vmf->flags & FAULT_FLAG_WRITE);
>  }
>  
> -static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +static int dax_dev_pmd_fault(struct vm_fault *vmf)
>  {
>  	int rc;
> -	struct file *filp = vma->vm_file;
> +	struct file *filp = vmf->vma->vm_file;
>  	struct dax_dev *dax_dev = filp->private_data;
>  
>  	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
>  			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
> -			? "write" : "read", vma->vm_start, vma->vm_end);
> +			? "write" : "read",
> +			vmf->vma->vm_start, vmf->vma->vm_end);
>  
>  	rcu_read_lock();
> -	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
> +	rc = __dax_dev_pmd_fault(dax_dev, vmf);
>  	rcu_read_unlock();
>  
>  	return rc;
> diff --git a/fs/dax.c b/fs/dax.c
> index 157f77f..bc39809 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1226,11 +1226,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
>   */
>  #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
>  
> -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
> -		struct vm_fault *vmf, unsigned long address,
> -		struct iomap *iomap, loff_t pos, bool write, void **entryp)
> +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
> +		loff_t pos, void **entryp)
>  {
> -	struct address_space *mapping = vma->vm_file->f_mapping;
> +	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
>  	struct block_device *bdev = iomap->bdev;
>  	struct inode *inode = mapping->host;
>  	struct blk_dax_ctl dax = {
> @@ -1257,31 +1256,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto fallback;
>  	*entryp = ret;
>  
> -	trace_dax_pmd_insert_mapping(inode, vma, address, write, length,
> -			dax.pfn, ret);
> -	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
> +	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
> +	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
> +			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
>  
>   unmap_fallback:
>  	dax_unmap_atomic(bdev, &dax);
>  fallback:
> -	trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write,
> -			length, dax.pfn, ret);
> +	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
> +			dax.pfn, ret);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
> -		struct vm_fault *vmf, unsigned long address,
> -		struct iomap *iomap, void **entryp)
> +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
> +		void **entryp)
>  {
> -	struct address_space *mapping = vma->vm_file->f_mapping;
> -	unsigned long pmd_addr = address & PMD_MASK;
> +	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> +	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	struct inode *inode = mapping->host;
>  	struct page *zero_page;
>  	void *ret = NULL;
>  	spinlock_t *ptl;
>  	pmd_t pmd_entry;
>  
> -	zero_page = mm_get_huge_zero_page(vma->vm_mm);
> +	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
>  
>  	if (unlikely(!zero_page))
>  		goto fallback;
> @@ -1292,27 +1290,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto fallback;
>  	*entryp = ret;
>  
> -	ptl = pmd_lock(vma->vm_mm, pmd);
> -	if (!pmd_none(*pmd)) {
> +	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
> +	if (!pmd_none(*(vmf->pmd))) {
>  		spin_unlock(ptl);
>  		goto fallback;
>  	}
>  
> -	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
> +	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
>  	pmd_entry = pmd_mkhuge(pmd_entry);
> -	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
> +	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
>  	spin_unlock(ptl);
> -	trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret);
> +	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
>  	return VM_FAULT_NOPAGE;
>  
>  fallback:
> -	trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret);
> +	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -		struct iomap_ops *ops)
> +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
>  {
> +	struct vm_area_struct *vma = vmf->vma;
>  	struct address_space *mapping = vma->vm_file->f_mapping;
>  	unsigned long pmd_addr = vmf->address & PMD_MASK;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> @@ -1334,7 +1332,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  	vmf->pgoff = linear_page_index(vma, pmd_addr);
>  	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
>  
> -	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
> +	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
>  
>  	/* Fall back to PTEs if we're going to COW */
>  	if (write && !(vma->vm_flags & VM_SHARED))
> @@ -1379,15 +1377,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  
>  	switch (iomap.type) {
>  	case IOMAP_MAPPED:
> -		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
> -				vmf->address, &iomap, pos, write, &entry);
> +		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
>  		break;
>  	case IOMAP_UNWRITTEN:
>  	case IOMAP_HOLE:
>  		if (WARN_ON_ONCE(write))
>  			goto unlock_entry;
> -		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
> -				&iomap, &entry);
> +		result = dax_pmd_load_hole(vmf, &iomap, &entry);
>  		break;
>  	default:
>  		WARN_ON_ONCE(1);
> @@ -1417,7 +1413,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  		count_vm_event(THP_FAULT_FALLBACK);
>  	}
>  out:
> -	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
> +	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
>  	vmf->pgoff = old_pgoff;
>  	return result;
>  }
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index ad56f90..14a8753 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -279,22 +279,22 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  }
>  
>  static int
> -ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +ext4_dax_pmd_fault(struct vm_fault *vmf)
>  {
>  	int result;
> -	struct inode *inode = file_inode(vma->vm_file);
> +	struct inode *inode = file_inode(vmf->vma->vm_file);
>  	struct super_block *sb = inode->i_sb;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	gfp_t old_gfp = vmf->gfp_mask;
>  
>  	if (write) {
>  		sb_start_pagefault(sb);
> -		file_update_time(vma->vm_file);
> +		file_update_time(vmf->vma->vm_file);
>  	}
>  
>  	vmf->gfp_mask &= ~__GFP_FS;
>  	down_read(&EXT4_I(inode)->i_mmap_sem);
> -	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
> +	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
>  	up_read(&EXT4_I(inode)->i_mmap_sem);
>  	vmf->gfp_mask = old_gfp;
>  	if (write)
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 27d395b..8359b8f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1532,10 +1532,9 @@ xfs_filemap_fault(
>   */
>  STATIC int
>  xfs_filemap_pmd_fault(
> -	struct vm_area_struct	*vma,
>  	struct vm_fault *vmf)
>  {
> -	struct inode		*inode = file_inode(vma->vm_file);
> +	struct inode		*inode = file_inode(vmf->vma->vm_file);
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	int			ret;
>  	gfp_t			old_gfp = vmf->gfp_mask;
> @@ -1547,12 +1546,12 @@ xfs_filemap_pmd_fault(
>  
>  	if (vmf->flags & FAULT_FLAG_WRITE) {
>  		sb_start_pagefault(inode->i_sb);
> -		file_update_time(vma->vm_file);
> +		file_update_time(vmf->vma->vm_file);
>  	}
>  
>  	vmf->gfp_mask &= ~__GFP_FS;
>  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> -	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
> +	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
>  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>  	vmf->gfp_mask = old_gfp;
>  
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 9761c90..1ffdb4d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry)
>  		return PMD_SHIFT - PAGE_SHIFT;
>  	return 0;
>  }
> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -		struct iomap_ops *ops);
> +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops);
>  #else
>  static inline unsigned int dax_radix_order(void *entry)
>  {
>  	return 0;
>  }
> -static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
> -		struct vm_fault *vmf, struct iomap_ops *ops)
> +static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
> +		struct iomap_ops *ops)
>  {
>  	return VM_FAULT_FALLBACK;
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index aef645b..795f03e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -347,7 +347,7 @@ struct vm_operations_struct {
>  	void (*close)(struct vm_area_struct * area);
>  	int (*mremap)(struct vm_area_struct * area);
>  	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
> -	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
> +	int (*pmd_fault)(struct vm_fault *vmf);
>  	void (*map_pages)(struct vm_fault *vmf,
>  			pgoff_t start_pgoff, pgoff_t end_pgoff);
>  
> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
> index a98665b..c566ddc 100644
> --- a/include/trace/events/fs_dax.h
> +++ b/include/trace/events/fs_dax.h
> @@ -7,9 +7,9 @@
>  #include <linux/tracepoint.h>
>  
>  DECLARE_EVENT_CLASS(dax_pmd_fault_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
> -	TP_ARGS(inode, vma, vmf, max_pgoff, result),
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		pgoff_t max_pgoff, int result),
> +	TP_ARGS(inode, vmf, max_pgoff, result),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_start)
> @@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_start = vma->vm_start;
> -		__entry->vm_end = vma->vm_end;
> -		__entry->vm_flags = vma->vm_flags;
> +		__entry->vm_start = vmf->vma->vm_start;
> +		__entry->vm_end = vmf->vma->vm_end;
> +		__entry->vm_flags = vmf->vma->vm_flags;
>  		__entry->address = vmf->address;
>  		__entry->flags = vmf->flags;
>  		__entry->pgoff = vmf->pgoff;
> @@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
>  
>  #define DEFINE_PMD_FAULT_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_fault_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		struct vm_fault *vmf, \
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
>  		pgoff_t max_pgoff, int result), \
> -	TP_ARGS(inode, vma, vmf, max_pgoff, result))
> +	TP_ARGS(inode, vmf, max_pgoff, result))
>  
>  DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
>  DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
>  
>  DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		unsigned long address, struct page *zero_page,
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		struct page *zero_page,
>  		void *radix_entry),
> -	TP_ARGS(inode, vma, address, zero_page, radix_entry),
> +	TP_ARGS(inode, vmf, zero_page, radix_entry),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_flags)
> @@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_flags = vma->vm_flags;
> -		__entry->address = address;
> +		__entry->vm_flags = vmf->vma->vm_flags;
> +		__entry->address = vmf->address;
>  		__entry->zero_page = zero_page;
>  		__entry->radix_entry = radix_entry;
>  	),
> @@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
>  
>  #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_load_hole_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		unsigned long address, struct page *zero_page, \
> -		void *radix_entry), \
> -	TP_ARGS(inode, vma, address, zero_page, radix_entry))
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
> +		struct page *zero_page, void *radix_entry), \
> +	TP_ARGS(inode, vmf, zero_page, radix_entry))
>  
>  DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
>  DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
>  
>  DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
> -		unsigned long address, int write, long length, pfn_t pfn,
> -		void *radix_entry),
> -	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry),
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
> +		long length, pfn_t pfn, void *radix_entry),
> +	TP_ARGS(inode, vmf, length, pfn, radix_entry),
>  	TP_STRUCT__entry(
>  		__field(unsigned long, ino)
>  		__field(unsigned long, vm_flags)
> @@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
>  	TP_fast_assign(
>  		__entry->dev = inode->i_sb->s_dev;
>  		__entry->ino = inode->i_ino;
> -		__entry->vm_flags = vma->vm_flags;
> -		__entry->address = address;
> -		__entry->write = write;
> +		__entry->vm_flags = vmf->vma->vm_flags;
> +		__entry->address = vmf->address;
> +		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
>  		__entry->length = length;
>  		__entry->pfn_val = pfn.val;
>  		__entry->radix_entry = radix_entry;
> @@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
>  
>  #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
>  DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
> -	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
> -		unsigned long address, int write, long length, pfn_t pfn, \
> -		void *radix_entry), \
> -	TP_ARGS(inode, vma, address, write, length, pfn, radix_entry))
> +	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
> +		long length, pfn_t pfn, void *radix_entry), \
> +	TP_ARGS(inode, vmf, length, pfn, radix_entry))
>  
>  DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
>  DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
> diff --git a/mm/memory.c b/mm/memory.c
> index 8ec36cf..e929c41 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3443,11 +3443,10 @@ static int do_numa_page(struct vm_fault *vmf)
>  
>  static int create_huge_pmd(struct vm_fault *vmf)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
> -	if (vma_is_anonymous(vma))
> +	if (vma_is_anonymous(vmf->vma))
>  		return do_huge_pmd_anonymous_page(vmf);
> -	if (vma->vm_ops->pmd_fault)
> -		return vma->vm_ops->pmd_fault(vma, vmf);
> +	if (vmf->vma->vm_ops->pmd_fault)
> +		return vmf->vma->vm_ops->pmd_fault(vmf);
>  	return VM_FAULT_FALLBACK;
>  }
>  
> @@ -3456,7 +3455,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
>  	if (vma_is_anonymous(vmf->vma))
>  		return do_huge_pmd_wp_page(vmf, orig_pmd);
>  	if (vmf->vma->vm_ops->pmd_fault)
> -		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
> +		return vmf->vma->vm_ops->pmd_fault(vmf);
>  
>  	/* COW handled on pte level: split pmd */
>  	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


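The conversion in the quoted hunks is mechanical at the call sites as
well. As a minimal sketch derived from the TP_PROTO changes above (not
a quote of the patch), a tracepoint caller changes like this:

	/* before: vma, address and the write flag are separate arguments */
	trace_dax_pmd_insert_mapping(inode, vma, address, write,
			length, pfn, radix_entry);

	/* after: all three are derived from the vm_fault */
	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, radix_entry);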

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-16 22:04       ` Dave Chinner
@ 2016-12-19 17:56         ` Jiang, Dave
  -1 siblings, 0 replies; 23+ messages in thread
From: Jiang, Dave @ 2016-12-19 17:56 UTC (permalink / raw)
  To: ross.zwisler, david; +Cc: jack, akpm, linux-nvdimm, linux-mm, tytso, hch


On Sat, 2016-12-17 at 09:04 +1100, Dave Chinner wrote:
> On Fri, Dec 16, 2016 at 09:19:16AM -0700, Ross Zwisler wrote:
> > 
> > On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> > > 
> > > On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > > > 
> > > > The caller into dax needs to clear __GFP_FS mask bit since it's
> > > > responsible for acquiring locks / transactions that blocks
> > > > __GFP_FS
> > > > allocation.  The caller will restore the original mask when dax
> > > > function
> > > > returns.
> > > 
> > > What's the allocation problem you're working around here? Can you
> > > please describe the call chain that is the problem?
> > > 
> > > > 
> > > >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > > >  
> > > >  	if (IS_DAX(inode)) {
> > > > +		gfp_t old_gfp = vmf->gfp_mask;
> > > > +
> > > > +		vmf->gfp_mask &= ~__GFP_FS;
> > > >  		ret = dax_iomap_fault(vma, vmf,
> > > > &xfs_iomap_ops);
> > > > +		vmf->gfp_mask = old_gfp;
> > > 
> > > I really have to say that I hate code that clears and restores
> > > flags
> > > without any explanation of why the code needs to play flag
> > > tricks. I
> > > take one look at the XFS fault handling code and ask myself now
> > > "why
> > > the hell do we need to clear those flags?" Especially as the
> > > other
> > > paths into generic fault handlers /don't/ require us to do this.
> > > What does DAX do that require us to treat memory allocation
> > > contexts
> > > differently to the filemap_fault() path?
> > 
> > This was done in response to Jan Kara's concern:
> > 
> >   The gfp_mask that propagates from __do_fault() or
> > do_page_mkwrite() is fine
> >   because at that point it is correct. But once we grab filesystem
> > locks which
> >   are not reclaim safe, we should update vmf->gfp_mask we pass
> > further down
> >   into DAX code to not contain __GFP_FS (that's a bug we apparently
> > have
> >   there). And inside DAX code, we definitely are not generally safe
> > to add
> >   __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off
> > propagating struct
> >   vm_fault into this function, using passed gfp_mask there and make
> > sure
> >   callers update gfp_mask as appropriate.
> > 
> > https://lkml.org/lkml/2016/10/4/37
> > 
> > IIUC I think the concern is that, for example, in
> > xfs_filemap_page_mkwrite()
> > we take a read lock on the struct inode.i_rwsem before we call
> > dax_iomap_fault().
> 
> That, my friends, is exactly the problem that mapping_gfp_mask() is
> meant to solve. This:
> 
> > 
> > > 
> > > > 
> > > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS
> > > > |  __GFP_IO;
> 
> Is just so wrong it's not funny.
> 
> The whole point of mapping_gfp_mask() is to remove flags from the
> gfp_mask used to do mapping+page cache related allocations that the
> mapping->host considers dangerous when the host may be holding locks.
> This includes mapping tree allocations, and anything else required
> to set up a new entry in the mapping during IO path operations. That
> includes page fault operations...
> 
> e.g. in xfs_setup_inode():
> 
>         /*
>          * Ensure all page cache allocations are done from GFP_NOFS
> context to
>          * prevent direct reclaim recursion back into the filesystem
> and blowing
>          * stacks or deadlocking.
>          */
>         gfp_mask = mapping_gfp_mask(inode->i_mapping);
>         mapping_set_gfp_mask(inode->i_mapping, (gfp_mask &
> ~(__GFP_FS)));
> 
> i.e. XFS considers it invalid to use GFP_FS at all for mapping
> allocations in the io path, because we *know* that we hold
> filesystems locks over those allocations.
> 
> > 
> > dax_iomap_fault() then calls find_or_create_page(), etc. with the
> > vfm->gfp_mask we were given.
> 
> Yup. Precisely why we should be using mapping_gfp_mask() as it was
> intended for vmf.gfp_mask....
> 
> > 
> > I believe the concern is that if that memory allocation tries to do
> > FS
> > operations to free memory because __GFP_FS is part of the gfp mask,
> > then we
> > could end up deadlocking because we are already holding FS locks.
> 
> Which is a problem with the filesystem mapping mask setup, not a
> reason to sprinkle random gfpmask clear/set pairs around the code.
> i.e. For DAX inodes, the mapping mask should clear __GFP_FS as XFS
> does above, and the mapping_gfp_mask() should be used unadulterated
> by the DAX page fault code....


I'll drop this patch. We can address the issue separately from the
pmd_fault changes.

> 
> Cheers,
> 
> Dave.

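Chinner's suggestion boils down to two lines; a sketch, assuming a DAX
inode whose filesystem masks __GFP_FS at inode setup the way
xfs_setup_inode() does above:

	/* at inode setup time: declare __GFP_FS unsafe for this mapping */
	mapping_set_gfp_mask(inode->i_mapping,
			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);

	/* in the DAX fault path: honour the mapping mask rather than
	 * re-adding __GFP_FS; only __GFP_IO is OR-ed back in */
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;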

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-16 22:04       ` Dave Chinner
@ 2016-12-19 19:53         ` Jan Kara
  -1 siblings, 0 replies; 23+ messages in thread
From: Jan Kara @ 2016-12-19 19:53 UTC (permalink / raw)
  To: Dave Chinner; +Cc: mhocko, jack, akpm, linux-nvdimm, linux-mm, tytso, hch

On Sat 17-12-16 09:04:50, Dave Chinner wrote:
> On Fri, Dec 16, 2016 at 09:19:16AM -0700, Ross Zwisler wrote:
> > On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> > > On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > > > The caller into dax needs to clear __GFP_FS mask bit since it's
> > > > responsible for acquiring locks / transactions that blocks __GFP_FS
> > > > allocation.  The caller will restore the original mask when dax function
> > > > returns.
> > > 
> > > What's the allocation problem you're working around here? Can you
> > > please describe the call chain that is the problem?
> > > 
> > > >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > > >  
> > > >  	if (IS_DAX(inode)) {
> > > > +		gfp_t old_gfp = vmf->gfp_mask;
> > > > +
> > > > +		vmf->gfp_mask &= ~__GFP_FS;
> > > >  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> > > > +		vmf->gfp_mask = old_gfp;
> > > 
> > > I really have to say that I hate code that clears and restores flags
> > > without any explanation of why the code needs to play flag tricks. I
> > > take one look at the XFS fault handling code and ask myself now "why
> > > the hell do we need to clear those flags?" Especially as the other
> > > paths into generic fault handlers /don't/ require us to do this.
> > > What does DAX do that require us to treat memory allocation contexts
> > > differently to the filemap_fault() path?
> > 
> > This was done in response to Jan Kara's concern:
> > 
> >   The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
> >   because at that point it is correct. But once we grab filesystem locks which
> >   are not reclaim safe, we should update vmf->gfp_mask we pass further down
> >   into DAX code to not contain __GFP_FS (that's a bug we apparently have
> >   there). And inside DAX code, we definitely are not generally safe to add
> >   __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
> >   vm_fault into this function, using passed gfp_mask there and make sure
> >   callers update gfp_mask as appropriate.
> > 
> > https://lkml.org/lkml/2016/10/4/37
> > 
> > IIUC I think the concern is that, for example, in xfs_filemap_page_mkwrite()
> > we take a read lock on the struct inode.i_rwsem before we call
> > dax_iomap_fault().
> 
> That, my friends, is exactly the problem that mapping_gfp_mask() is
> meant to solve. This:
> 
> > > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS |  __GFP_IO;
> 
> Is just so wrong it's not funny.

You mean like in mm/memory.c: __get_fault_gfp_mask()?

Which was introduced by commit c20cd45eb017 "mm: allow GFP_{FS,IO} for
page_cache_read page cache allocation" by Michal (added to CC) and you were
even on CC ;).

The code here was replicating __get_fault_gfp_mask(); in fact the idea
of the cleanup is to get rid of this code, take whatever is in
vmf.gfp_mask, and mask off __GFP_FS in the filesystem if it deems that
necessary (e.g. ext4 really needs this, as inode reclaim depends on
being able to force a transaction commit).

I agree with your point about comments; we should add them when
changing the gfp_mask.

> The whole point of mapping_gfp_mask() is to remove flags from the
> gfp_mask used to do mapping+page cache related allocations that the
> mapping->host considers dangerous when the host may be holding locks.
> This includes mapping tree allocations, and anything else required
> to set up a new entry in the mapping during IO path operations. That
> includes page fault operations...
> 
> e.g. in xfs_setup_inode():
> 
>         /*
>          * Ensure all page cache allocations are done from GFP_NOFS context to
>          * prevent direct reclaim recursion back into the filesystem and blowing
>          * stacks or deadlocking.
>          */
>         gfp_mask = mapping_gfp_mask(inode->i_mapping);
>         mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
> 
> i.e. XFS considers it invalid to use GFP_FS at all for mapping
> allocations in the io path, because we *know* that we hold
> filesystems locks over those allocations.

Well, this is a discussion you should probably have with Michal. DAX code
was just mirroring what the generic code does. Michal had valid points
about why the page fault path is special and why allocating pages for a
page fault should be fine with __GFP_FS - but if those assumptions are
wrong for XFS, the generic code needs to be fixed.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

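For reference, the generic helper Jan cites above looks roughly like
this in mm/memory.c as of commit c20cd45eb017 (a sketch with a
paraphrased comment, not an exact quote):

	static inline gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
	{
		struct file *vm_file = vma->vm_file;

		if (vm_file)
			return mapping_gfp_mask(vm_file->f_mapping)
				| __GFP_FS | __GFP_IO;

		/* special mappings have no file; fall back to GFP_KERNEL */
		return GFP_KERNEL;
	}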

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-19 19:53         ` Jan Kara
@ 2016-12-19 21:17           ` Dave Chinner
  -1 siblings, 0 replies; 23+ messages in thread
From: Dave Chinner @ 2016-12-19 21:17 UTC (permalink / raw)
  To: Jan Kara; +Cc: mhocko, akpm, linux-nvdimm, linux-mm, tytso, hch

On Mon, Dec 19, 2016 at 08:53:02PM +0100, Jan Kara wrote:
> On Sat 17-12-16 09:04:50, Dave Chinner wrote:
> > On Fri, Dec 16, 2016 at 09:19:16AM -0700, Ross Zwisler wrote:
> > > On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> > > > On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > > > > The caller into dax needs to clear __GFP_FS mask bit since it's
> > > > > responsible for acquiring locks / transactions that blocks __GFP_FS
> > > > > allocation.  The caller will restore the original mask when dax function
> > > > > returns.
> > > > 
> > > > What's the allocation problem you're working around here? Can you
> > > > please describe the call chain that is the problem?
> > > > 
> > > > >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > > > >  
> > > > >  	if (IS_DAX(inode)) {
> > > > > +		gfp_t old_gfp = vmf->gfp_mask;
> > > > > +
> > > > > +		vmf->gfp_mask &= ~__GFP_FS;
> > > > >  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> > > > > +		vmf->gfp_mask = old_gfp;
> > > > 
> > > > I really have to say that I hate code that clears and restores flags
> > > > without any explanation of why the code needs to play flag tricks. I
> > > > take one look at the XFS fault handling code and ask myself now "why
> > > > the hell do we need to clear those flags?" Especially as the other
> > > > paths into generic fault handlers /don't/ require us to do this.
> > > > What does DAX do that require us to treat memory allocation contexts
> > > > differently to the filemap_fault() path?
> > > 
> > > This was done in response to Jan Kara's concern:
> > > 
> > >   The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
> > >   because at that point it is correct. But once we grab filesystem locks which
> > >   are not reclaim safe, we should update vmf->gfp_mask we pass further down
> > >   into DAX code to not contain __GFP_FS (that's a bug we apparently have
> > >   there). And inside DAX code, we definitely are not generally safe to add
> > >   __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
> > >   vm_fault into this function, using passed gfp_mask there and make sure
> > >   callers update gfp_mask as appropriate.
> > > 
> > > https://lkml.org/lkml/2016/10/4/37
> > > 
> > > IIUC I think the concern is that, for example, in xfs_filemap_page_mkwrite()
> > > we take a read lock on the struct inode.i_rwsem before we call
> > > dax_iomap_fault().
> > 
> > That, my friends, is exactly the problem that mapping_gfp_mask() is
> > meant to solve. This:
> > 
> > > > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS |  __GFP_IO;
> > 
> > Is just so wrong it's not funny.
> 
> You mean like in mm/memory.c: __get_fault_gfp_mask()?
> 
> Which was introduced by commit c20cd45eb017 "mm: allow GFP_{FS,IO} for
> page_cache_read page cache allocation" by Michal (added to CC) and you were
> even on CC ;).

Sure, I was on the cc list, but that doesn't mean I /liked/ the
patch. It also doesn't mean I had the time or patience to argue
whether it was the right way to address whatever whacky OOM/reclaim
deficiency was being reported....

Oh, and this is a write fault, not a read fault. There's a big
difference in filesystem behaviour between those two types of
faults, so what might be fine for a page cache read (i.e. no
transactions) isn't necessarily correct for a write operation...

> The code here was replicating __get_fault_gfp_mask() and in fact the idea
> of the cleanup is to get rid of this code and take whatever is in
> vmf.gfp_mask and mask off __GFP_FS in the filesystem if it deems it is
> needed (e.g. ext4 really needs this as inode reclaim is depending on being
> able to force a transaction commit).

And so now we add a flag to the fault that the filesystem says not
to add to mapping masks, and then the filesystem has to mask off
that flag /again/ because its mapping gfp mask guidelines are
essentially being ignored.

Remind me again why we even have the mapping gfp_mask if we just
ignore it like this?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-19 21:17           ` Dave Chinner
@ 2016-12-20 10:13             ` Michal Hocko
  -1 siblings, 0 replies; 23+ messages in thread
From: Michal Hocko @ 2016-12-20 10:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: akpm, linux-nvdimm, linux-mm, tytso, Jan Kara, hch

On Tue 20-12-16 08:17:11, Dave Chinner wrote:
> On Mon, Dec 19, 2016 at 08:53:02PM +0100, Jan Kara wrote:
> > On Sat 17-12-16 09:04:50, Dave Chinner wrote:
> > > On Fri, Dec 16, 2016 at 09:19:16AM -0700, Ross Zwisler wrote:
> > > > On Fri, Dec 16, 2016 at 12:07:30PM +1100, Dave Chinner wrote:
> > > > > On Thu, Dec 15, 2016 at 04:40:41PM -0700, Dave Jiang wrote:
> > > > > > The caller into dax needs to clear __GFP_FS mask bit since it's
> > > > > > responsible for acquiring locks / transactions that blocks __GFP_FS
> > > > > > allocation.  The caller will restore the original mask when dax function
> > > > > > returns.
> > > > > 
> > > > > What's the allocation problem you're working around here? Can you
> > > > > please describe the call chain that is the problem?
> > > > > 
> > > > > >  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> > > > > >  
> > > > > >  	if (IS_DAX(inode)) {
> > > > > > +		gfp_t old_gfp = vmf->gfp_mask;
> > > > > > +
> > > > > > +		vmf->gfp_mask &= ~__GFP_FS;
> > > > > >  		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
> > > > > > +		vmf->gfp_mask = old_gfp;
> > > > > 
> > > > > I really have to say that I hate code that clears and restores flags
> > > > > without any explanation of why the code needs to play flag tricks. I
> > > > > take one look at the XFS fault handling code and ask myself now "why
> > > > > the hell do we need to clear those flags?" Especially as the other
> > > > > paths into generic fault handlers /don't/ require us to do this.
> > > > > What does DAX do that require us to treat memory allocation contexts
> > > > > differently to the filemap_fault() path?
> > > > 
> > > > This was done in response to Jan Kara's concern:
> > > > 
> > > >   The gfp_mask that propagates from __do_fault() or do_page_mkwrite() is fine
> > > >   because at that point it is correct. But once we grab filesystem locks which
> > > >   are not reclaim safe, we should update vmf->gfp_mask we pass further down
> > > >   into DAX code to not contain __GFP_FS (that's a bug we apparently have
> > > >   there). And inside DAX code, we definitely are not generally safe to add
> > > >   __GFP_FS to mapping_gfp_mask(). Maybe we'd be better off propagating struct
> > > >   vm_fault into this function, using passed gfp_mask there and make sure
> > > >   callers update gfp_mask as appropriate.
> > > > 
> > > > https://lkml.org/lkml/2016/10/4/37
> > > > 
> > > > IIUC I think the concern is that, for example, in xfs_filemap_page_mkwrite()
> > > > we take a read lock on the struct inode.i_rwsem before we call
> > > > dax_iomap_fault().
> > > 
> > > That, my friends, is exactly the problem that mapping_gfp_mask() is
> > > meant to solve. This:
> > > 
> > > > > > +	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_FS |  __GFP_IO;
> > > 
> > > Is just so wrong it's not funny.
> > 
> > You mean like in mm/memory.c: __get_fault_gfp_mask()?
> > 
> > Which was introduced by commit c20cd45eb017 "mm: allow GFP_{FS,IO} for
> > page_cache_read page cache allocation" by Michal (added to CC) and you were
> > even on CC ;).
> 
> Sure, I was on the cc list, but that doesn't mean I /liked/ the
> patch. It also doesn't mean I had the time or patience to argue
> whether it was the right way to address whatever whacky OOM/reclaim
> deficiency was being reported....
> 
> Oh, and this is a write fault, not a read fault. There's a big
> difference in filesystem behaviour between those two types of
> faults, so what might be fine for a page cache read (i.e. no
> transactions) isn't necessarily correct for a write operation...
> 
> > The code here was replicating __get_fault_gfp_mask() and in fact the idea
> > of the cleanup is to get rid of this code and take whatever is in
> > vmf.gfp_mask and mask off __GFP_FS in the filesystem if it deems it is
> > needed (e.g. ext4 really needs this as inode reclaim is depending on being
> > able to force a transaction commit).
> 
> And so now we add a flag to the fault that the filesystem says not
> to add to mapping masks, and now the filesystem has to mask off
> thati flag /again/ because it's mapping gfp mask guidelines are
> essentially being ignored.
> 
> Remind me again why we even have the mapping gfp_mask if we just
> ignore it like this?

mapping mask still serves its _main_ purpose - the allocation
placement/movability properties. This is something only the owner of
the mapping knows. The (ab)use of the mapping gfp_mask to drop GFP_FS
was imho a bad decision. As the above mentioned commit noted, we were
doing a lot of GFP_NOFS allocations from paths which are inherently
GFP_KERNEL, so they couldn't have been preventing any recursion
problems while they still restricted the direct reclaim behavior. On
the other hand, I do understand why the mapping's mask was used at the
time: we simply lacked a better api back then. But I believe that with
the scope nofs api [1] we can do much better and finally get rid of
~__GFP_FS in the mapping's mask. c20cd45eb017 was an intermediate step
until we get there.

I am not fully familiar with the DAX changes which started this
discussion, but if there is a reclaim recursion problem from within
the fault path, then the scope api sounds like a good fit here.

[1] http://lkml.kernel.org/r/20161215140715.12732-1-mhocko@kernel.org

-- 
Michal Hocko
SUSE Labs

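A sketch of what using the scope api from [1] would look like in the
XFS fault handler quoted above, assuming the memalloc_nofs_save() /
memalloc_nofs_restore() pair that series proposes (hypothetical
placement, not code from this thread):

	unsigned int nofs_flags;

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	/* every allocation in this section implicitly behaves as GFP_NOFS */
	nofs_flags = memalloc_nofs_save();
	ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
	memalloc_nofs_restore(nofs_flags);

	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);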

* Re: [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers
  2016-12-20 10:13             ` Michal Hocko
@ 2016-12-21 12:36               ` Jan Kara
  -1 siblings, 0 replies; 23+ messages in thread
From: Jan Kara @ 2016-12-21 12:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Jan Kara, linux-nvdimm, Dave Chinner, hch, linux-mm, tytso, akpm

On Tue 20-12-16 11:13:52, Michal Hocko wrote:
> I am not fully familiar with the DAX changes which started this
> discussion but if there is a reclaim recursion problem from within the
> fault path then the scope api sounds like a good fit here.
> 
> [1] http://lkml.kernel.org/r/20161215140715.12732-1-mhocko@kernel.org

Yes, once your scope API and the associated ext4 changes are in, we can
stop playing tricks with gfp_mask in the DAX code, at least for ext4.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


Thread overview:
2016-12-15 23:40 [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers Dave Jiang
2016-12-15 23:40 ` [PATCH v4 2/3] mm, dax: make pmd_fault() and friends to be the same as fault() Dave Jiang
2016-12-15 23:40 ` [PATCH v4 3/3] mm, dax: move pmd_fault() to take only vmf parameter Dave Jiang
2016-12-19 17:41   ` Jan Kara
2016-12-16  1:07 ` [PATCH v4 1/3] dax: masking off __GFP_FS in fs DAX handlers Dave Chinner
2016-12-16 16:19   ` Ross Zwisler
2016-12-16 22:04     ` Dave Chinner
2016-12-19 17:56       ` Jiang, Dave
2016-12-19 19:53       ` Jan Kara
2016-12-19 21:17         ` Dave Chinner
2016-12-20 10:13           ` Michal Hocko
2016-12-21 12:36             ` Jan Kara