All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
@ 2021-05-13 23:43 ` Mina Almasry
  0 siblings, 0 replies; 30+ messages in thread
From: Mina Almasry @ 2021-05-13 23:43 UTC (permalink / raw)
  Cc: Mina Almasry, Axel Rasmussen, Peter Xu, linux-mm, Mike Kravetz,
	Andrew Morton, linux-kernel

When hugetlb_mcopy_atomic_pte() is called with:
- mode==MCOPY_ATOMIC_NORMAL and,
- we already have a page in the page cache corresponding to the
associated address,

We will allocate a huge page from the reserves, and then fail to insert it
into the cache and return -EEXIST. In this case, we need to return -EEXIST
without allocating a new page as the page already exists in the cache.
Allocating the extra page causes the resv_huge_pages to underflow temporarily
until the extra page is freed.

To fix this we check if a page exists in the cache, and allocate it and
insert it in the cache immediately while holding the lock. After that we
copy the contents into the page.

As a side effect of this, pages may exist in the cache for which the
copy failed and for these pages PageUptodate(page) == false. Modify code
that query the cache to handle this correctly.

Tested using:
./tools/testing/selftests/vm/userfaultfd hugetlb_shared 1024 200 \
/mnt/huge

Test passes, and dmesg shows no underflow warnings.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: linux-mm@kvack.org
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org

---
 fs/hugetlbfs/inode.c |   2 +-
 mm/hugetlb.c         | 109 +++++++++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2a42335e8fd..cc027c335242 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -346,7 +346,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)

 		/* Find the page */
 		page = find_lock_page(mapping, index);
-		if (unlikely(page == NULL)) {
+		if (unlikely(page == NULL || !PageUptodate(page))) {
 			/*
 			 * We have a HOLE, zero out the user-buffer for the
 			 * length of the hole or request.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..a5a5fbf7ac25 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4543,7 +4543,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,

 retry:
 	page = find_lock_page(mapping, idx);
-	if (!page) {
+	if (!page || !PageUptodate(page)) {
 		/* Check for page in userfault range */
 		if (userfaultfd_missing(vma)) {
 			ret = hugetlb_handle_userfault(vma, mapping, idx,
@@ -4552,26 +4552,30 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			goto out;
 		}

-		page = alloc_huge_page(vma, haddr, 0);
-		if (IS_ERR(page)) {
-			/*
-			 * Returning error will result in faulting task being
-			 * sent SIGBUS.  The hugetlb fault mutex prevents two
-			 * tasks from racing to fault in the same page which
-			 * could result in false unable to allocate errors.
-			 * Page migration does not take the fault mutex, but
-			 * does a clear then write of pte's under page table
-			 * lock.  Page fault code could race with migration,
-			 * notice the clear pte and try to allocate a page
-			 * here.  Before returning error, get ptl and make
-			 * sure there really is no pte entry.
-			 */
-			ptl = huge_pte_lock(h, mm, ptep);
-			ret = 0;
-			if (huge_pte_none(huge_ptep_get(ptep)))
-				ret = vmf_error(PTR_ERR(page));
-			spin_unlock(ptl);
-			goto out;
+		if (!page) {
+			page = alloc_huge_page(vma, haddr, 0);
+			if (IS_ERR(page)) {
+				/*
+				 * Returning error will result in faulting task
+				 * being sent SIGBUS.  The hugetlb fault mutex
+				 * prevents two tasks from racing to fault in
+				 * the same page which could result in false
+				 * unable to allocate errors.  Page migration
+				 * does not take the fault mutex, but does a
+				 * clear then write of pte's under page table
+				 * lock.  Page fault code could race with
+				 * migration, notice the clear pte and try to
+				 * allocate a page here.  Before returning
+				 * error, get ptl and make sure there really is
+				 * no pte entry.
+				 */
+				ptl = huge_pte_lock(h, mm, ptep);
+				ret = 0;
+				if (huge_pte_none(huge_ptep_get(ptep)))
+					ret = vmf_error(PTR_ERR(page));
+				spin_unlock(ptl);
+				goto out;
+			}
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
@@ -4868,31 +4872,55 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-	struct address_space *mapping;
-	pgoff_t idx;
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
-	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
-	int ret;
+	int ret = -ENOMEM;
 	struct page *page;
 	int writable;

-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
 	if (is_continue) {
 		ret = -EFAULT;
-		page = find_lock_page(mapping, idx);
-		if (!page)
+		page = hugetlbfs_pagecache_page(h, dst_vma, dst_addr);
+		if (!page) {
+			ret = -ENOMEM;
 			goto out;
+		}
 	} else if (!*pagep) {
-		ret = -ENOMEM;
+		/* If a page already exists, then it's UFFDIO_COPY for
+		 * a non-missing case. Return -EEXIST.
+		 */
+		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			ret = -EEXIST;
+			goto out;
+		}
+
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
 		if (IS_ERR(page))
 			goto out;

+		/* Add shared, newly allocated pages to the page cache. */
+		if (vm_shared) {
+			size = i_size_read(mapping->host) >> huge_page_shift(h);
+			ret = -EFAULT;
+			if (idx >= size)
+				goto out;
+
+			/*
+			 * Serialization between remove_inode_hugepages() and
+			 * huge_add_to_page_cache() below happens through the
+			 * hugetlb_fault_mutex_table that here must be hold by
+			 * the caller.
+			 */
+			ret = huge_add_to_page_cache(page, mapping, idx);
+			if (ret)
+				goto out;
+		}
+
 		ret = copy_huge_page_from_user(page,
 						(const void __user *) src_addr,
 						pages_per_huge_page(h), false);
@@ -4916,24 +4944,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	__SetPageUptodate(page);

-	/* Add shared, newly allocated pages to the page cache. */
-	if (vm_shared && !is_continue) {
-		size = i_size_read(mapping->host) >> huge_page_shift(h);
-		ret = -EFAULT;
-		if (idx >= size)
-			goto out_release_nounlock;
-
-		/*
-		 * Serialization between remove_inode_hugepages() and
-		 * huge_add_to_page_cache() below happens through the
-		 * hugetlb_fault_mutex_table that here must be hold by
-		 * the caller.
-		 */
-		ret = huge_add_to_page_cache(page, mapping, idx);
-		if (ret)
-			goto out_release_nounlock;
-	}
-
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
 	spin_lock(ptl);

@@ -4994,7 +5004,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	spin_unlock(ptl);
 	if (vm_shared || is_continue)
 		unlock_page(page);
-out_release_nounlock:
 	put_page(page);
 	goto out;
 }
--
2.31.1.751.gd2f1c929bd-goog

^ permalink raw reply related	[flat|nested] 30+ messages in thread
* Re: resv_huge_page underflow with userfaultfd test
@ 2021-05-12  3:06 Mike Kravetz
       [not found] ` <20210512065813.89270-1-almasrymina@google.com>
  0 siblings, 1 reply; 30+ messages in thread
From: Mike Kravetz @ 2021-05-12  3:06 UTC (permalink / raw)
  To: Peter Xu; +Cc: Mina Almasry, Linux-MM, Axel Rasmussen, aarcange

On 5/11/21 7:35 PM, Peter Xu wrote:
> Mike,
> 
> On Tue, May 11, 2021 at 07:25:39PM -0700, Mike Kravetz wrote:
>> I looked at this a bit more today and am not exactly sure of the
>> expected behavior.  The situation is:
>> - UFFDIO_COPY is called for hugetlb mapping
>>   - the dest address is in a shared mapping
>>   - there is a page in the cache associated with the address in the
>>     shared mapping
>>
>> Currently, the code will fail when trying to update the page cache as
>> the entry already exists.  The shm code appears to do the same.
>>
>> Quick question.  Is this the expected behavior?  Or, would you expect
>> the UFFDIO_COPY to update the page in the page cache, and then resolve
>> the fault/update the pte?
> 
> AFAICT that's the expected behavior, and it need to be like that so as to avoid
> silent data corruption (if the page cache existed, it means the page is not
> "missing" at all, then it does not suite for a UFFDIO_COPY as it's only used
> for uffd page missing case).  Thanks,

Thanks Peter.

Making it fail in that case is pretty straight forward.

BTW, in this case the page was not in the cache at the time of the
original fault which is why it is being handled as missing by userfaultfd.
By the time the UFFDIO_COPY is requested, a page has been added to the cache.
-- 
Mike Kravetz


^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-05-21  2:06 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-13 23:43 [PATCH] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY Mina Almasry
2021-05-13 23:43 ` Mina Almasry
2021-05-13 23:49 ` Mina Almasry
2021-05-13 23:49   ` Mina Almasry
2021-05-14  0:14   ` Mike Kravetz
2021-05-14  0:23     ` Mina Almasry
2021-05-14  0:23       ` Mina Almasry
2021-05-14  4:02       ` Mike Kravetz
2021-05-14 12:31         ` Peter Xu
2021-05-14 17:56           ` Mike Kravetz
2021-05-14 18:30             ` Axel Rasmussen
2021-05-14 18:30               ` Axel Rasmussen
2021-05-14 19:16             ` Peter Xu
2021-05-20 19:18     ` Mina Almasry
2021-05-20 19:18       ` Mina Almasry
2021-05-20 19:21       ` Mina Almasry
2021-05-20 19:21         ` Mina Almasry
2021-05-20 20:00         ` Mike Kravetz
2021-05-20 20:31           ` Mina Almasry
2021-05-20 20:31             ` Mina Almasry
2021-05-21  2:05             ` Mina Almasry
2021-05-21  2:05               ` Mina Almasry
  -- strict thread matches above, loose matches on Subject: below --
2021-05-12  3:06 resv_huge_page underflow with userfaultfd test Mike Kravetz
     [not found] ` <20210512065813.89270-1-almasrymina@google.com>
2021-05-12  7:44   ` [PATCH] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY Mina Almasry
2021-05-12  7:44     ` Mina Almasry
     [not found]   ` <CAJHvVch0ZMapPVEc0Ge5V4KDgNDNhECbqwDi0y9XxsxFXQZ-gg@mail.gmail.com>
     [not found]     ` <c455d241-11f6-95a6-eb29-0ddd94eedbd7@oracle.com>
2021-05-12 19:42       ` Mina Almasry
2021-05-12 19:42         ` Mina Almasry
2021-05-12 20:14         ` Peter Xu
2021-05-12 21:31           ` Mike Kravetz
2021-05-12 21:52             ` Mina Almasry
2021-05-12 21:52               ` Mina Almasry

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.