linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] fs: dax: Adding new return type vm_fault_t
@ 2018-04-21 21:05 Souptick Joarder
  2018-04-21 21:34 ` Matthew Wilcox
  2018-04-22 23:09 ` Jan Kara
  0 siblings, 2 replies; 9+ messages in thread
From: Souptick Joarder @ 2018-04-21 21:05 UTC (permalink / raw)
  To: viro, mawilcox, ross.zwisler, akpm, dan.j.williams, mhocko, jack,
	kirill.shutemov
  Cc: linux-fsdevel, linux-mm, linux-kernel

Use new return type vm_fault_t for fault handler. For
now, this is just documenting that the function returns
a VM_FAULT value rather than an errno. Once all instances
are converted, vm_fault_t will become a distinct type.

commit 1c8f422059ae ("mm: change return type to vm_fault_t")

There was an existing bug inside dax_load_hole():
if vm_insert_mixed had failed to allocate a page table,
we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM.
With the new vmf_insert_mixed() this issue is addressed.

vm_insert_mixed_mkwrite has an inefficiency: when it returns
an error value, the driver has to convert it to the vm_fault_t
type. With the new vmf_insert_mixed_mkwrite() this limitation
will be addressed.

As the new function vmf_insert_mixed_mkwrite() is only called
from fs/dax.c, both changes are kept in a single
patch.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
---
v2: vm_insert_mixed_mkwrite is replaced by new
    vmf_insert_mixed_mkwrite

v3: Addressed Matthew's comment. One patch which
    changes both at the same time. The history
    should be bisectable so that it compiles and
    works at every point.

 fs/dax.c            | 78 +++++++++++++++++++++++++----------------------------
 include/linux/dax.h |  4 +--
 include/linux/mm.h  |  4 +--
 mm/memory.c         | 15 ++++++++---
 4 files changed, 52 insertions(+), 49 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index aaec72de..821986c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -905,12 +905,12 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
 			 struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
-	int ret = VM_FAULT_NOPAGE;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
 	void *entry2;
 	pfn_t pfn;
@@ -929,7 +929,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 		goto out;
 	}
 
-	vm_insert_mixed(vmf->vma, vaddr, pfn);
+	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1112,7 +1112,7 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
-static int dax_fault_return(int error)
+static vm_fault_t dax_fault_return(int error)
 {
 	if (error == 0)
 		return VM_FAULT_NOPAGE;
@@ -1132,7 +1132,7 @@ static bool dax_fault_is_synchronous(unsigned long flags,
 		&& (iomap->flags & IOMAP_F_DIRTY);
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       int *iomap_errp, const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1145,18 +1145,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	int error, major = 0;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;
-	int vmf_ret = 0;
+	vm_fault_t ret = 0;
 	void *entry;
 	pfn_t pfn;
 
-	trace_dax_pte_fault(inode, vmf, vmf_ret);
+	trace_dax_pte_fault(inode, vmf, ret);
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is supposed
 	 * to hold locks serializing us with truncate / punch hole so this is
 	 * a reliable test.
 	 */
 	if (pos >= i_size_read(inode)) {
-		vmf_ret = VM_FAULT_SIGBUS;
+		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
@@ -1165,7 +1165,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
-		vmf_ret = dax_fault_return(PTR_ERR(entry));
+		ret = dax_fault_return(PTR_ERR(entry));
 		goto out;
 	}
 
@@ -1176,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * retried.
 	 */
 	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
-		vmf_ret = VM_FAULT_NOPAGE;
+		ret = VM_FAULT_NOPAGE;
 		goto unlock_entry;
 	}
 
@@ -1189,7 +1189,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (iomap_errp)
 		*iomap_errp = error;
 	if (error) {
-		vmf_ret = dax_fault_return(error);
+		ret = dax_fault_return(error);
 		goto unlock_entry;
 	}
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
@@ -1219,9 +1219,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			goto error_finish_iomap;
 
 		__SetPageUptodate(vmf->cow_page);
-		vmf_ret = finish_fault(vmf);
-		if (!vmf_ret)
-			vmf_ret = VM_FAULT_DONE_COW;
+		ret = finish_fault(vmf);
+		if (!ret)
+			ret = VM_FAULT_DONE_COW;
 		goto finish_iomap;
 	}
 
@@ -1257,23 +1257,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 				goto error_finish_iomap;
 			}
 			*pfnp = pfn;
-			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			ret = VM_FAULT_NEEDDSYNC | major;
 			goto finish_iomap;
 		}
 		trace_dax_insert_mapping(inode, vmf, entry);
 		if (write)
-			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+			ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
 		else
-			error = vm_insert_mixed(vma, vaddr, pfn);
+			ret = vmf_insert_mixed(vma, vaddr, pfn);
 
-		/* -EBUSY is fine, somebody else faulted on the same PTE */
-		if (error == -EBUSY)
-			error = 0;
-		break;
+		goto finish_iomap;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1284,12 +1281,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	}
 
  error_finish_iomap:
-	vmf_ret = dax_fault_return(error) | major;
+	ret = dax_fault_return(error);
  finish_iomap:
 	if (ops->iomap_end) {
 		int copied = PAGE_SIZE;
 
-		if (vmf_ret & VM_FAULT_ERROR)
+		if (ret & VM_FAULT_ERROR)
 			copied = 0;
 		/*
 		 * The fault is done by now and there's no way back (other
@@ -1302,12 +1299,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff);
  out:
-	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
-	return vmf_ret;
+	trace_dax_pte_fault_done(inode, vmf, ret);
+	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1348,7 +1345,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1358,7 +1355,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
-	int result = VM_FAULT_FALLBACK;
+	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
 	pgoff_t max_pgoff, pgoff;
 	void *entry;
@@ -1509,7 +1506,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1529,7 +1526,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
  * has done all the necessary locking for page fault to proceed
  * successfully.
  */
-int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
@@ -1553,14 +1550,14 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
  * DAX file.  It takes care of marking corresponding radix tree entry as dirty
  * as well.
  */
-static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 				  enum page_entry_size pe_size,
 				  pfn_t pfn)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	void *entry, **slot;
 	pgoff_t index = vmf->pgoff;
-	int vmf_ret, error;
+	vm_fault_t ret;
 
 	xa_lock_irq(&mapping->i_pages);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
@@ -1579,21 +1576,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 	xa_unlock_irq(&mapping->i_pages);
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-		vmf_ret = dax_fault_return(error);
+		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
 		break;
 #ifdef CONFIG_FS_DAX_PMD
 	case PE_SIZE_PMD:
-		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+		ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
 			pfn, true);
 		break;
 #endif
 	default:
-		vmf_ret = VM_FAULT_FALLBACK;
+		ret = VM_FAULT_FALLBACK;
 	}
 	put_locked_mapping_entry(mapping, index);
-	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
-	return vmf_ret;
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+	return ret;
 }
 
 /**
@@ -1606,8 +1602,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
  * stored persistently on the media and handles inserting of appropriate page
  * table entry.
  */
-int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-			  pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size, pfn_t pfn)
 {
 	int err;
 	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f9eb22a..7fddea8 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -124,8 +124,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 		    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
-int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-			  pfn_t pfn);
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size, pfn_t pfn);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ac1f06..9fe441c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2423,8 +2423,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			pfn_t pfn);
-int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn);
+vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
+		unsigned long addr, pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464..721cfd5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1955,12 +1955,19 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
-int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn)
+vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
+		unsigned long addr, pfn_t pfn)
 {
-	return __vm_insert_mixed(vma, addr, pfn, true);
+	int err;
+
+	err =  __vm_insert_mixed(vma, addr, pfn, true);
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err < 0 && err != -EBUSY)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_NOPAGE;
 }
-EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
 
 /*
  * maps a range of physical memory into the requested pages. the old
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-21 21:05 [PATCH v3] fs: dax: Adding new return type vm_fault_t Souptick Joarder
@ 2018-04-21 21:34 ` Matthew Wilcox
  2018-04-21 21:54   ` Souptick Joarder
  2018-04-22 23:09 ` Jan Kara
  1 sibling, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2018-04-21 21:34 UTC (permalink / raw)
  To: Souptick Joarder
  Cc: viro, mawilcox, ross.zwisler, akpm, dan.j.williams, mhocko, jack,
	kirill.shutemov, linux-fsdevel, linux-mm, linux-kernel

On Sun, Apr 22, 2018 at 02:35:29AM +0530, Souptick Joarder wrote:
> Use new return type vm_fault_t for fault handler. For
> now, this is just documenting that the function returns
> a VM_FAULT value rather than an errno. Once all instances
> are converted, vm_fault_t will become a distinct type.
> 
> commit 1c8f422059ae ("mm: change return type to vm_fault_t")
> 
> There was an existing bug inside dax_load_hole()
> if vm_insert_mixed had failed to allocate a page table,
> we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM.
> With new vmf_insert_mixed() this issue is addressed.
> 
> vm_insert_mixed_mkwrite has inefficiency when it returns
> an error value, driver has to convert it to vm_fault_t
> type. With new vmf_insert_mixed_mkwrite() this limitation
> will be addressed.
> 
> As new function vmf_insert_mixed_mkwrite() only called
> from fs/dax.c, so keeping both the changes in a single
> patch.
> 
> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>

Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>

There's a couple of minor things which could be tidied up, but not worth
doing them as a revision to this patch.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-21 21:34 ` Matthew Wilcox
@ 2018-04-21 21:54   ` Souptick Joarder
  2018-04-21 23:56     ` Dan Williams
  0 siblings, 1 reply; 9+ messages in thread
From: Souptick Joarder @ 2018-04-21 21:54 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Al Viro, mawilcox, Ross Zwisler, Andrew Morton, Dan Williams,
	Michal Hocko, jack, kirill.shutemov, linux-fsdevel, Linux-MM,
	linux-kernel

On Sun, Apr 22, 2018 at 3:04 AM, Matthew Wilcox <willy@infradead.org> wrote:
> On Sun, Apr 22, 2018 at 02:35:29AM +0530, Souptick Joarder wrote:
>> Use new return type vm_fault_t for fault handler. For
>> now, this is just documenting that the function returns
>> a VM_FAULT value rather than an errno. Once all instances
>> are converted, vm_fault_t will become a distinct type.
>>
>> commit 1c8f422059ae ("mm: change return type to vm_fault_t")
>>
>> There was an existing bug inside dax_load_hole()
>> if vm_insert_mixed had failed to allocate a page table,
>> we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM.
>> With new vmf_insert_mixed() this issue is addressed.
>>
>> vm_insert_mixed_mkwrite has inefficiency when it returns
>> an error value, driver has to convert it to vm_fault_t
>> type. With new vmf_insert_mixed_mkwrite() this limitation
>> will be addressed.
>>
>> As new function vmf_insert_mixed_mkwrite() only called
>> from fs/dax.c, so keeping both the changes in a single
>> patch.
>>
>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
>
> Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
>
> There's a couple of minor things which could be tidied up, but not worth
> doing them as a revision to this patch.

Which tree will this patch go through? mm or fsdevel?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-21 21:54   ` Souptick Joarder
@ 2018-04-21 23:56     ` Dan Williams
  0 siblings, 0 replies; 9+ messages in thread
From: Dan Williams @ 2018-04-21 23:56 UTC (permalink / raw)
  To: Souptick Joarder
  Cc: Matthew Wilcox, Al Viro, Matthew Wilcox, Ross Zwisler,
	Andrew Morton, Michal Hocko, Jan Kara, Kirill A. Shutemov,
	linux-fsdevel, Linux-MM, Linux Kernel Mailing List

On Sat, Apr 21, 2018 at 2:54 PM, Souptick Joarder <jrdr.linux@gmail.com> wrote:
> On Sun, Apr 22, 2018 at 3:04 AM, Matthew Wilcox <willy@infradead.org> wrote:
>> On Sun, Apr 22, 2018 at 02:35:29AM +0530, Souptick Joarder wrote:
>>> Use new return type vm_fault_t for fault handler. For
>>> now, this is just documenting that the function returns
>>> a VM_FAULT value rather than an errno. Once all instances
>>> are converted, vm_fault_t will become a distinct type.
>>>
>>> commit 1c8f422059ae ("mm: change return type to vm_fault_t")
>>>
>>> There was an existing bug inside dax_load_hole()
>>> if vm_insert_mixed had failed to allocate a page table,
>>> we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM.
>>> With new vmf_insert_mixed() this issue is addressed.
>>>
>>> vm_insert_mixed_mkwrite has inefficiency when it returns
>>> an error value, driver has to convert it to vm_fault_t
>>> type. With new vmf_insert_mixed_mkwrite() this limitation
>>> will be addressed.
>>>
>>> As new function vmf_insert_mixed_mkwrite() only called
>>> from fs/dax.c, so keeping both the changes in a single
>>> patch.
>>>
>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
>>
>> Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
>>
>> There's a couple of minor things which could be tidied up, but not worth
>> doing them as a revision to this patch.
>
> Which tree this patch will go through ? mm or fsdevel ?

nvdimm, since that tree has some pending reworks for dax-vs-truncate.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-21 21:05 [PATCH v3] fs: dax: Adding new return type vm_fault_t Souptick Joarder
  2018-04-21 21:34 ` Matthew Wilcox
@ 2018-04-22 23:09 ` Jan Kara
  2018-04-23  2:25   ` Matthew Wilcox
  1 sibling, 1 reply; 9+ messages in thread
From: Jan Kara @ 2018-04-22 23:09 UTC (permalink / raw)
  To: Souptick Joarder
  Cc: viro, mawilcox, ross.zwisler, akpm, dan.j.williams, mhocko, jack,
	kirill.shutemov, linux-fsdevel, linux-mm, linux-kernel

On Sun 22-04-18 02:35:29, Souptick Joarder wrote:
> Use new return type vm_fault_t for fault handler. For
> now, this is just documenting that the function returns
> a VM_FAULT value rather than an errno. Once all instances
> are converted, vm_fault_t will become a distinct type.
> 
> commit 1c8f422059ae ("mm: change return type to vm_fault_t")
> 
> There was an existing bug inside dax_load_hole()
> if vm_insert_mixed had failed to allocate a page table,
> we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM.
> With new vmf_insert_mixed() this issue is addressed.
> 
> vm_insert_mixed_mkwrite has inefficiency when it returns
> an error value, driver has to convert it to vm_fault_t
> type. With new vmf_insert_mixed_mkwrite() this limitation
> will be addressed.
> 
> As new function vmf_insert_mixed_mkwrite() only called
> from fs/dax.c, so keeping both the changes in a single
> patch.
> 
> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>

The patch looks good to me. Just one question:

> diff --git a/mm/memory.c b/mm/memory.c
> index 01f5464..721cfd5 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1955,12 +1955,19 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>  }
>  EXPORT_SYMBOL(vm_insert_mixed);
>  
> -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> -			pfn_t pfn)
> +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> +		unsigned long addr, pfn_t pfn)
>  {
> -	return __vm_insert_mixed(vma, addr, pfn, true);
> +	int err;
> +
> +	err =  __vm_insert_mixed(vma, addr, pfn, true);
> +	if (err == -ENOMEM)
> +		return VM_FAULT_OOM;
> +	if (err < 0 && err != -EBUSY)
> +		return VM_FAULT_SIGBUS;
> +	return VM_FAULT_NOPAGE;
>  }
> -EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
> +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

So are we sure that all the callers of this function (and also of
vmf_insert_mixed()) are OK with EBUSY? Because especially in the
vmf_insert_mixed() case other page than the caller provided is in page
tables and thus possibly the caller needs to do some error recovery (such
as drop page refcount) in such case...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-22 23:09 ` Jan Kara
@ 2018-04-23  2:25   ` Matthew Wilcox
  2018-04-23 13:59     ` Jan Kara
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2018-04-23  2:25 UTC (permalink / raw)
  To: Jan Kara
  Cc: Souptick Joarder, viro, mawilcox, ross.zwisler, akpm,
	dan.j.williams, mhocko, kirill.shutemov, linux-fsdevel, linux-mm,
	linux-kernel

On Mon, Apr 23, 2018 at 01:09:48AM +0200, Jan Kara wrote:
> > -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> > -			pfn_t pfn)
> > +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> > +		unsigned long addr, pfn_t pfn)
> >  {
> > -	return __vm_insert_mixed(vma, addr, pfn, true);
> > +	int err;
> > +
> > +	err =  __vm_insert_mixed(vma, addr, pfn, true);
> > +	if (err == -ENOMEM)
> > +		return VM_FAULT_OOM;
> > +	if (err < 0 && err != -EBUSY)
> > +		return VM_FAULT_SIGBUS;
> > +	return VM_FAULT_NOPAGE;
> >  }
> > -EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
> > +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
> 
> So are we sure that all the callers of this function (and also of
> vmf_insert_mixed()) are OK with EBUSY? Because especially in the
> vmf_insert_mixed() case other page than the caller provided is in page
> tables and thus possibly the caller needs to do some error recovery (such
> as drop page refcount) in such case...

I went through all the users and didn't find any that did anything
with -EBUSY other than turn it into VM_FAULT_NOPAGE.  I agree that it's
possible that there might have been someone who wanted to do that, but
we tend to rely on mapcount (through rmap) rather than refcount (ie we
use refcount to mean the number of kernel references to the page and then
use mapcount for the number of times it's mapped into a process' address
space).  All the drivers I audited would allocate the page first, store
it in their own data structures, then try to insert it into the virtual
address space.  So an EBUSY always meant "the same page was inserted".

If we did want to support "This happened already" in the future, we
could define a VM_FAULT flag for that.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-23  2:25   ` Matthew Wilcox
@ 2018-04-23 13:59     ` Jan Kara
  2018-04-23 16:12       ` Souptick Joarder
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Kara @ 2018-04-23 13:59 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jan Kara, Souptick Joarder, viro, mawilcox, ross.zwisler, akpm,
	dan.j.williams, mhocko, kirill.shutemov, linux-fsdevel, linux-mm,
	linux-kernel

On Sun 22-04-18 19:25:05, Matthew Wilcox wrote:
> On Mon, Apr 23, 2018 at 01:09:48AM +0200, Jan Kara wrote:
> > > -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> > > -			pfn_t pfn)
> > > +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> > > +		unsigned long addr, pfn_t pfn)
> > >  {
> > > -	return __vm_insert_mixed(vma, addr, pfn, true);
> > > +	int err;
> > > +
> > > +	err =  __vm_insert_mixed(vma, addr, pfn, true);
> > > +	if (err == -ENOMEM)
> > > +		return VM_FAULT_OOM;
> > > +	if (err < 0 && err != -EBUSY)
> > > +		return VM_FAULT_SIGBUS;
> > > +	return VM_FAULT_NOPAGE;
> > >  }
> > > -EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
> > > +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
> > 
> > So are we sure that all the callers of this function (and also of
> > vmf_insert_mixed()) are OK with EBUSY? Because especially in the
> > vmf_insert_mixed() case other page than the caller provided is in page
> > tables and thus possibly the caller needs to do some error recovery (such
> > as drop page refcount) in such case...
> 
> I went through all the users and didn't find any that did anything
> with -EBUSY other than turn it into VM_FAULT_NOPAGE.  I agree that it's
> possible that there might have been someone who wanted to do that, but
> we tend to rely on mapcount (through rmap) rather than refcount (ie we
> use refcount to mean the number of kernel references to the page and then
> use mapcount for the number of times it's mapped into a process' address
> space).  All the drivers I audited would allocate the page first, store
> it in their own data structures, then try to insert it into the virtual
> address space.  So an EBUSY always meant "the same page was inserted".
> 
> If we did want to support "This happened already" in the future, we
> could define a VM_FAULT flag for that.

OK, fair enough and thanks for doing an audit! So possibly just add a
comment above vmf_insert_mixed() and vmf_insert_mixed_mkwrite() like:

/*
 * If the insertion of PTE failed because someone else already added a
 * different entry in the mean time, we treat that as success as we assume
 * the same entry was actually inserted.
 */

After that feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

to the patch.

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-23 13:59     ` Jan Kara
@ 2018-04-23 16:12       ` Souptick Joarder
  2018-04-23 17:28         ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Souptick Joarder @ 2018-04-23 16:12 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox, Al Viro, mawilcox, Ross Zwisler, Andrew Morton,
	Dan Williams, Michal Hocko, kirill.shutemov, linux-fsdevel,
	Linux-MM, linux-kernel

On Mon, Apr 23, 2018 at 7:29 PM, Jan Kara <jack@suse.cz> wrote:
> On Sun 22-04-18 19:25:05, Matthew Wilcox wrote:
>> On Mon, Apr 23, 2018 at 01:09:48AM +0200, Jan Kara wrote:
>> > > -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
>> > > -                 pfn_t pfn)
>> > > +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
>> > > +         unsigned long addr, pfn_t pfn)
>> > >  {
>> > > - return __vm_insert_mixed(vma, addr, pfn, true);
>> > > + int err;
>> > > +
>> > > + err =  __vm_insert_mixed(vma, addr, pfn, true);
>> > > + if (err == -ENOMEM)
>> > > +         return VM_FAULT_OOM;
>> > > + if (err < 0 && err != -EBUSY)
>> > > +         return VM_FAULT_SIGBUS;
>> > > + return VM_FAULT_NOPAGE;
>> > >  }
>> > > -EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
>> > > +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
>> >
>> > So are we sure that all the callers of this function (and also of
>> > vmf_insert_mixed()) are OK with EBUSY? Because especially in the
>> > vmf_insert_mixed() case other page than the caller provided is in page
>> > tables and thus possibly the caller needs to do some error recovery (such
>> > as drop page refcount) in such case...
>>
>> I went through all the users and didn't find any that did anything
>> with -EBUSY other than turn it into VM_FAULT_NOPAGE.  I agree that it's
>> possible that there might have been someone who wanted to do that, but
>> we tend to rely on mapcount (through rmap) rather than refcount (ie we
>> use refcount to mean the number of kernel references to the page and then
>> use mapcount for the number of times it's mapped into a process' address
>> space).  All the drivers I audited would allocate the page first, store
>> it in their own data structures, then try to insert it into the virtual
>> address space.  So an EBUSY always meant "the same page was inserted".
>>
>> If we did want to support "This happened already" in the future, we
>> could define a VM_FAULT flag for that.
>
> OK, fair enough and thanks for doing an audit! So possibly just add a
> comment above vmf_insert_mixed() and vmf_insert_mixed_mkwrite() like:
>
> /*
>  * If the insertion of PTE failed because someone else already added a
>  * different entry in the mean time, we treat that as success as we assume
>  * the same entry was actually inserted.
>  */
>
> After that feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
> to the patch.
>

Thanks, will add this to the change log and send v4.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v3] fs: dax: Adding new return type vm_fault_t
  2018-04-23 16:12       ` Souptick Joarder
@ 2018-04-23 17:28         ` Matthew Wilcox
  0 siblings, 0 replies; 9+ messages in thread
From: Matthew Wilcox @ 2018-04-23 17:28 UTC (permalink / raw)
  To: Souptick Joarder
  Cc: Jan Kara, Al Viro, mawilcox, Ross Zwisler, Andrew Morton,
	Dan Williams, Michal Hocko, kirill.shutemov, linux-fsdevel,
	Linux-MM, linux-kernel

On Mon, Apr 23, 2018 at 09:42:30PM +0530, Souptick Joarder wrote:
> > OK, fair enough and thanks for doing an audit! So possibly just add a
> > comment above vmf_insert_mixed() and vmf_insert_mixed_mkwrite() like:
> >
> > /*
> >  * If the insertion of PTE failed because someone else already added a
> >  * different entry in the mean time, we treat that as success as we assume
> >  * the same entry was actually inserted.
> >  */
> >
> > After that feel free to add:
> >
> > Reviewed-by: Jan Kara <jack@suse.cz>
> >
> > to the patch.
> >
> 
> Thanks , will add this in change log and send v4.

Jan asked you to add this comment above vmf_insert_mixed_mkwrite()
(I don't think it needs to be added above vmf_insert_mixed() because
this comment will get moved in a later commit once we have no more callers
of vm_insert_mixed()).

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-04-23 17:28 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-21 21:05 [PATCH v3] fs: dax: Adding new return type vm_fault_t Souptick Joarder
2018-04-21 21:34 ` Matthew Wilcox
2018-04-21 21:54   ` Souptick Joarder
2018-04-21 23:56     ` Dan Williams
2018-04-22 23:09 ` Jan Kara
2018-04-23  2:25   ` Matthew Wilcox
2018-04-23 13:59     ` Jan Kara
2018-04-23 16:12       ` Souptick Joarder
2018-04-23 17:28         ` Matthew Wilcox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).