linux-kernel.vger.kernel.org archive mirror
* [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
@ 2021-05-21  7:44 Mina Almasry
  2021-05-22 21:19 ` Andrew Morton
  2021-05-24 18:07 ` Mike Kravetz
  0 siblings, 2 replies; 16+ messages in thread
From: Mina Almasry @ 2021-05-21  7:44 UTC (permalink / raw)
  Cc: Mina Almasry, Axel Rasmussen, Peter Xu, linux-mm, Mike Kravetz,
	Andrew Morton, linux-kernel

The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
an index for which we already have a page in the cache. When this
happens, we allocate a second page, double consuming the reservation,
and then fail to insert the page into the cache and return -EEXIST.

To fix this, we first check if there exists a page in the cache which already
consumed the reservation, and return -EEXIST immediately if so.

Secondly, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller
after allocating a page that consumed a reservation. In this case there
may be a fault that double consumes the reservation. To handle this, we
free the allocated page, fix the reservations, and allocate a temporary
hugetlb page and return that to the caller. When the caller does the
copy outside of the lock, we again check the cache, and allocate a page
consuming the reservation, and copy over the contents.
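The double consumption and the fix can be sketched as a small userspace
model (hypothetical helper names and a toy counter, not the kernel code):

```c
#include <assert.h>
#include <stdbool.h>

/* Toy model: resv stands in for resv_huge_pages, cached[] for whether
 * the page cache already holds a page at a given index. */
struct sim { unsigned long resv; bool cached[16]; };

/* Old behaviour: allocate first (consuming a reservation), then fail
 * the cache insert with -EEXIST.  Across two calls for the same index
 * the single reservation is consumed twice. */
static int uffdio_copy_old(struct sim *s, int idx)
{
	s->resv--;			/* alloc_huge_page() consumes reserve */
	if (s->cached[idx])
		return -17;		/* -EEXIST, but reserve already gone */
	s->cached[idx] = true;
	return 0;
}

/* Fixed behaviour: check the cache before touching the reserve count. */
static int uffdio_copy_new(struct sim *s, int idx)
{
	if (s->cached[idx])
		return -17;		/* -EEXIST, reserve untouched */
	s->resv--;
	s->cached[idx] = true;
	return 0;
}
```

With one reservation available, calling the old variant twice for the same
index wraps the unsigned counter, while the fixed variant leaves it at zero.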

Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings. After the test runs, the
number of free/resv hugepages is correct.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: linux-mm@kvack.org
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org

---
 include/linux/hugetlb.h |   4 ++
 mm/hugetlb.c            | 103 ++++++++++++++++++++++++++++++++++++----
 mm/migrate.c            |  39 +++------------
 3 files changed, 103 insertions(+), 43 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b92f25ccef58..427974510965 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -194,6 +194,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

+void hugetlb_copy_page(struct page *dst, struct page *src);
+
 #else /* !CONFIG_HUGETLB_PAGE */

 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -379,6 +381,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

+static inline void hugetlb_copy_page(struct page *dst, struct page *src) { }
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..cb041c97a558 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -81,6 +81,45 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);

+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page.  We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+				 int nr_pages)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < nr_pages;) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void hugetlb_copy_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+	int nr_pages = pages_per_huge_page(h);
+
+	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+		__copy_gigantic_page(dst, src, nr_pages);
+		return;
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
 	if (spool->count)
@@ -4868,19 +4907,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-	struct address_space *mapping;
-	pgoff_t idx;
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
-	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
-	int ret;
+	int ret = -ENOMEM;
 	struct page *page;
 	int writable;
-
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	gfp_t gfp_mask = htlb_alloc_mask(h);
+	int node = huge_node(dst_vma, dst_addr, gfp_mask, &mpol, &nodemask);

 	if (is_continue) {
 		ret = -EFAULT;
@@ -4888,7 +4928,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		if (!page)
 			goto out;
 	} else if (!*pagep) {
-		ret = -ENOMEM;
+		/* If a page already exists, then it's UFFDIO_COPY for
+		 * a non-missing case. Return -EEXIST.
+		 */
+		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			ret = -EEXIST;
+			goto out;
+		}
+
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
 		if (IS_ERR(page))
 			goto out;
@@ -4900,12 +4947,48 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
 			ret = -ENOENT;
+			/* Free the allocated page which may have
+			 * consumed a reservation.
+			 */
+			restore_reserve_on_error(h, dst_vma, dst_addr, page);
+			if (!HPageRestoreReserve(page)) {
+				if (unlikely(hugetlb_unreserve_pages(
+					    mapping->host, idx, idx + 1, 1)))
+					hugetlb_fix_reserve_counts(
+						mapping->host);
+			}
+			put_page(page);
+
+			/* Allocate a temporary page to hold the copied
+			 * contents.
+			 */
+			page = alloc_migrate_huge_page(h, gfp_mask, node,
+						       nodemask);
+			if (IS_ERR(page)) {
+				ret = -ENOMEM;
+				goto out;
+			}
 			*pagep = page;
-			/* don't free the page */
+			/* Set the outparam pagep and return to the caller to
+			 * copy the contents outside the lock. Don't free the
+			 * page.
+			 */
 			goto out;
 		}
 	} else {
-		page = *pagep;
+		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			put_page(*pagep);
+			ret = -EEXIST;
+			goto out;
+		}
+
+		page = alloc_huge_page(dst_vma, dst_addr, 0);
+		if (IS_ERR(page)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		__copy_gigantic_page(page, *pagep, pages_per_huge_page(h));
+		put_page(*pagep);
 		*pagep = NULL;
 	}

diff --git a/mm/migrate.c b/mm/migrate.c
index 6b37d00890ca..d3437f9a608d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -528,28 +528,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 	return MIGRATEPAGE_SUCCESS;
 }

-/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page.  We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
-				int nr_pages)
-{
-	int i;
-	struct page *dst_base = dst;
-	struct page *src_base = src;
-
-	for (i = 0; i < nr_pages; ) {
-		cond_resched();
-		copy_highpage(dst, src);
-
-		i++;
-		dst = mem_map_next(dst, dst_base, i);
-		src = mem_map_next(src, src_base, i);
-	}
-}
-
 static void copy_huge_page(struct page *dst, struct page *src)
 {
 	int i;
@@ -557,19 +535,14 @@ static void copy_huge_page(struct page *dst, struct page *src)

 	if (PageHuge(src)) {
 		/* hugetlbfs page */
-		struct hstate *h = page_hstate(src);
-		nr_pages = pages_per_huge_page(h);
-
-		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-			__copy_gigantic_page(dst, src, nr_pages);
-			return;
-		}
-	} else {
-		/* thp page */
-		BUG_ON(!PageTransHuge(src));
-		nr_pages = thp_nr_pages(src);
+		hugetlb_copy_page(dst, src);
+		return;
 	}

+	/* thp page */
+	BUG_ON(!PageTransHuge(src));
+	nr_pages = thp_nr_pages(src);
+
 	for (i = 0; i < nr_pages; i++) {
 		cond_resched();
 		copy_highpage(dst + i, src + i);
--
2.31.1.818.g46aad6cb9e-goog


* Re: [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
  2021-05-21  7:44 [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY Mina Almasry
@ 2021-05-22 21:19 ` Andrew Morton
  2021-05-22 21:32   ` Mina Almasry
  2021-05-24 18:07 ` Mike Kravetz
  1 sibling, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2021-05-22 21:19 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Axel Rasmussen, Peter Xu, linux-mm, Mike Kravetz, linux-kernel

On Fri, 21 May 2021 00:44:33 -0700 Mina Almasry <almasrymina@google.com> wrote:

> The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
> happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
> an index for which we already have a page in the cache. When this
> happens, we allocate a second page, double consuming the reservation,
> and then fail to insert the page into the cache and return -EEXIST.
> 
> To fix this, we first if there exists a page in the cache which already

                       ^ check

> consumed the reservation, and return -EEXIST immediately if so.
> 
> Secondly, if we fail to copy the page contents while holding the
> hugetlb_fault_mutex, we will drop the mutex and return to the caller
> after allocating a page that consumed a reservation. In this case there
> may be a fault that double consumes the reservation. To handle this, we
> free the allocated page, fix the reservations, and allocate a temporary
> hugetlb page and return that to the caller. When the caller does the
> copy outside of the lock, we again check the cache, and allocate a page
> consuming the reservation, and copy over the contents.
> 
> Test:
> Hacked the code locally such that resv_huge_pages underflows produce
> a warning and the copy_huge_page_from_user() always fails, then:
> 
> ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
> 	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> ./tools/testing/selftests/vm/userfaultfd hugetlb 10
> 	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> 
> Both tests succeed and produce no warnings. After the test runs
> number of free/resv hugepages is correct.
>
> ...
>
>  include/linux/hugetlb.h |   4 ++
>  mm/hugetlb.c            | 103 ++++++++++++++++++++++++++++++++++++----
>  mm/migrate.c            |  39 +++------------
>  3 files changed, 103 insertions(+), 43 deletions(-)

I'm assuming we want this in -stable?

Are we able to identify a Fixes: for this?

It's a large change.  Can we come up with some smaller and easier to
review and integrate version which we can feed into 5.13 and -stable
and do the fancier version for 5.14?

If you don't think -stable needs this then this version will be OK as-is.


* Re: [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
  2021-05-22 21:19 ` Andrew Morton
@ 2021-05-22 21:32   ` Mina Almasry
  0 siblings, 0 replies; 16+ messages in thread
From: Mina Almasry @ 2021-05-22 21:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Axel Rasmussen, Peter Xu, Linux-MM, Mike Kravetz, open list

On Sat, May 22, 2021 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Fri, 21 May 2021 00:44:33 -0700 Mina Almasry <almasrymina@google.com> wrote:
>
> > The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
> > happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
> > an index for which we already have a page in the cache. When this
> > happens, we allocate a second page, double consuming the reservation,
> > and then fail to insert the page into the cache and return -EEXIST.
> >
> > To fix this, we first if there exists a page in the cache which already
>
>                        ^ check
>
> > consumed the reservation, and return -EEXIST immediately if so.
> >
> > Secondly, if we fail to copy the page contents while holding the
> > hugetlb_fault_mutex, we will drop the mutex and return to the caller
> > after allocating a page that consumed a reservation. In this case there
> > may be a fault that double consumes the reservation. To handle this, we
> > free the allocated page, fix the reservations, and allocate a temporary
> > hugetlb page and return that to the caller. When the caller does the
> > copy outside of the lock, we again check the cache, and allocate a page
> > consuming the reservation, and copy over the contents.
> >
> > Test:
> > Hacked the code locally such that resv_huge_pages underflows produce
> > a warning and the copy_huge_page_from_user() always fails, then:
> >
> > ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
> >       2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> > ./tools/testing/selftests/vm/userfaultfd hugetlb 10
> >       2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> >
> > Both tests succeed and produce no warnings. After the test runs
> > number of free/resv hugepages is correct.
> >
> > ...
> >
> >  include/linux/hugetlb.h |   4 ++
> >  mm/hugetlb.c            | 103 ++++++++++++++++++++++++++++++++++++----
> >  mm/migrate.c            |  39 +++------------
> >  3 files changed, 103 insertions(+), 43 deletions(-)
>
> I'm assuming we want this in -stable?
>

Umm I'll yield to Mike. This is a transient underflow; not actually
THAT serious of an issue. Sorry, I'll clarify that in the commit
message for the next version.

> Are we able to identify a Fixes: for this?
>

No, this issue has been latent for some time. It repros as far back
as 5.11 at least, which is why maybe it's not serious enough to
require -stable.

> It's a large change.  Can we come up with some smaller and easier to
> review and integrate version which we can feed into 5.13 and -stable
> and do the fancier version for 5.14?
>

Yes. If we only do the hugetlbfs_pagecache_present() check then that
gets us some 90% of the way there, the rest of the patch addresses an
unlikely race.

> If you don't think -stable needs this then this version will be OK as-is.


* Re: [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
  2021-05-21  7:44 [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY Mina Almasry
  2021-05-22 21:19 ` Andrew Morton
@ 2021-05-24 18:07 ` Mike Kravetz
  2021-05-25  0:11   ` Mina Almasry
  1 sibling, 1 reply; 16+ messages in thread
From: Mike Kravetz @ 2021-05-24 18:07 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Axel Rasmussen, Peter Xu, linux-mm, Andrew Morton, linux-kernel

On 5/21/21 12:44 AM, Mina Almasry wrote:
> The userfaultfd hugetlb tests detect a resv_huge_pages underflow. This
> happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on
> an index for which we already have a page in the cache. When this
> happens, we allocate a second page, double consuming the reservation,
> and then fail to insert the page into the cache and return -EEXIST.
> 
> To fix this, we first if there exists a page in the cache which already
> consumed the reservation, and return -EEXIST immediately if so.

Thanks Mina.

Andrew brought up the possibility of two separate fixes:
1) A simple one to address the common cause of the underflow
2) A more extensive modification to address all the error path code

I think this is a good idea.  You mentioned that the underflow is
transient and will correct itself.  That is true, however code assumes
the reserve count is an unsigned value always >= the number of free
pages.  If code like the following is run while in this transient state,
we will run into more serious issues:

	/*
	 * A child process with MAP_PRIVATE mappings created by their parent
	 * have no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;
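The danger in those guards comes from unsigned arithmetic: both counters
are unsigned, so if resv_huge_pages transiently exceeds free_huge_pages
the subtraction wraps to a huge value instead of comparing as negative,
and the "== 0" checks are silently skipped. A minimal illustration:

```c
#include <assert.h>

/* Both counters are unsigned long in struct hstate.  When a transient
 * underflow makes the reserve count exceed the free count, the
 * difference wraps around rather than going negative, so a guard of
 * the form (free - resv == 0) is not taken when it should be. */
static unsigned long avail(unsigned long free_huge_pages,
			   unsigned long resv_huge_pages)
{
	return free_huge_pages - resv_huge_pages;
}
```

When resv exceeds free by one, the result is ULONG_MAX, not -1 or 0.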

For this reason, I think the simple change sent to stable would be a
good idea.

Do note that the call to hugetlbfs_pagecache_present() which would be
used in the stable fix is not 100% correct.  See below.

> Secondly, if we fail to copy the page contents while holding the
> hugetlb_fault_mutex, we will drop the mutex and return to the caller
> after allocating a page that consumed a reservation. In this case there
> may be a fault that double consumes the reservation. To handle this, we
> free the allocated page, fix the reservations, and allocate a temporary
> hugetlb page and return that to the caller. When the caller does the
> copy outside of the lock, we again check the cache, and allocate a page
> consuming the reservation, and copy over the contents.
> 
> Test:
> Hacked the code locally such that resv_huge_pages underflows produce
> a warning and the copy_huge_page_from_user() always fails, then:
> 
> ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
> 	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> ./tools/testing/selftests/vm/userfaultfd hugetlb 10
> 	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> 
> Both tests succeed and produce no warnings. After the test runs
> number of free/resv hugepages is correct.
> 
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> Cc: Axel Rasmussen <axelrasmussen@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: linux-mm@kvack.org
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> 
> ---
>  include/linux/hugetlb.h |   4 ++
>  mm/hugetlb.c            | 103 ++++++++++++++++++++++++++++++++++++----
>  mm/migrate.c            |  39 +++------------
>  3 files changed, 103 insertions(+), 43 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index b92f25ccef58..427974510965 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -194,6 +194,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
>  bool is_hugetlb_entry_migration(pte_t pte);
>  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
> 
> +void hugetlb_copy_page(struct page *dst, struct page *src);
> +
>  #else /* !CONFIG_HUGETLB_PAGE */
> 
>  static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> @@ -379,6 +381,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
> 
>  static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
> 
> +static inline void hugetlb_copy_page(struct page *dst, struct page *src);
> +
>  #endif /* !CONFIG_HUGETLB_PAGE */
>  /*
>   * hugepages at page global directory. If arch support

How about just making the existing routine copy_huge_page() callable from
outside migrate.c and avoid all this code movement?  If we want to do
any code movement, I would suggest moving copy_huge_page and routines it
depends on to mm/huge_memory.c.  That seems like a more appropriate
location.  But, I am fine with leaving them in migrate.c

> index 629aa4c2259c..cb041c97a558 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
< snip code dealing with movement of copy routines >
> @@ -4868,19 +4907,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  			    struct page **pagep)
>  {
>  	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> -	struct address_space *mapping;
> -	pgoff_t idx;
> +	struct hstate *h = hstate_vma(dst_vma);
> +	struct address_space *mapping = dst_vma->vm_file->f_mapping;
> +	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
>  	unsigned long size;
>  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
> -	struct hstate *h = hstate_vma(dst_vma);
>  	pte_t _dst_pte;
>  	spinlock_t *ptl;
> -	int ret;
> +	int ret = -ENOMEM;
>  	struct page *page;
>  	int writable;
> -
> -	mapping = dst_vma->vm_file->f_mapping;
> -	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> +	struct mempolicy *mpol;
> +	nodemask_t *nodemask;
> +	gfp_t gfp_mask = htlb_alloc_mask(h);
> +	int node = huge_node(dst_vma, dst_addr, gfp_mask, &mpol, &nodemask);
> 
>  	if (is_continue) {
>  		ret = -EFAULT;
> @@ -4888,7 +4928,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  		if (!page)
>  			goto out;
>  	} else if (!*pagep) {
> -		ret = -ENOMEM;
> +		/* If a page already exists, then it's UFFDIO_COPY for
> +		 * a non-missing case. Return -EEXIST.
> +		 */
> +		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {

We only want to call hugetlbfs_pagecache_present() if vm_shared.
Something like:

		if (vm_shared &&
		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {

This could be a private mapping of a file in which case we do not care
about the contents of the cache/file.  In fact, it could provide a false
positive.

> +			ret = -EEXIST;
> +			goto out;
> +		}
> +
>  		page = alloc_huge_page(dst_vma, dst_addr, 0);
>  		if (IS_ERR(page))
>  			goto out;
> @@ -4900,12 +4947,48 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>  		/* fallback to copy_from_user outside mmap_lock */
>  		if (unlikely(ret)) {
>  			ret = -ENOENT;
> +			/* Free the allocated page which may have
> +			 * consumed a reservation.
> +			 */
> +			restore_reserve_on_error(h, dst_vma, dst_addr, page);

Good!

> +			if (!HPageRestoreReserve(page)) {
> +				if (unlikely(hugetlb_unreserve_pages(
> +					    mapping->host, idx, idx + 1, 1)))
> +					hugetlb_fix_reserve_counts(
> +						mapping->host);
> +			}

I do not understand the need to call hugetlb_unreserve_pages().  The
call to restore_reserve_on_error 'should' fix up the reserve map to
align with restoring the reserve count in put_page/free_huge_page.
Can you explain why that is there?

> +			put_page(page);
> +
> +			/* Allocate a temporary page to hold the copied
> +			 * contents.
> +			 */
> +			page = alloc_migrate_huge_page(h, gfp_mask, node,
> +						       nodemask);

I would suggest calling alloc_huge_page_vma to allocate the page here.
alloc_huge_page_vma will allocate a huge page from the pool if one is
available, and fall back to the buddy allocator.  This is the way the
migration code works.

alloc_huge_page_vma has the advantage that it 'can' work for gigantic
pages if some are available in the pool.  Note that alloc_migrate_huge_page
immediately checks for and fails if a gigantic page is needed.

If we use alloc_migrate_huge_page, the userfaultfd selftest you are
using will fail when tested with 'hacked code' to ALWAYS take this
error path.  That is because we will need two huge pages (one
temporary) and only one is available.  I believe this is OK.  As
mentioned, this is how the migration code works.

> +			if (IS_ERR(page)) {
> +				ret = -ENOMEM;
> +				goto out;
> +			}
>  			*pagep = page;
> -			/* don't free the page */
> +			/* Set the outparam pagep and return to the caller to
> +			 * copy the contents outside the lock. Don't free the
> +			 * page.
> +			 */
>  			goto out;
>  		}
>  	} else {
> -		page = *pagep;
> +		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {

Again, check for vm_shared before calling hugetlbfs_pagecache_present.

> +			put_page(*pagep);
> +			ret = -EEXIST;
> +			goto out;

We need to set *pagep = NULL before returning.  The calling routine
__mcopy_atomic_hugetlb will BUG() if we return with an error code other
than -ENOENT and *pagep not NULL.

> +		}
> +
> +		page = alloc_huge_page(dst_vma, dst_addr, 0);
> +		if (IS_ERR(page)) {
> +			ret = -ENOMEM;

Again, clear *pagep before returning.

> +			goto out;
> +		}
> +		__copy_gigantic_page(page, *pagep, pages_per_huge_page(h));

See previous discussion about copy routine.

> +		put_page(*pagep);
>  		*pagep = NULL;
>  	}

Later in hugetlb_mcopy_atomic_pte, we will update the page cache and
page table.  There are error checks when performing these operations.
It is unlikely we will ever hit one of these errors.  However, if we
do encounter an error, the code should call restore_reserve_on_error
before the put_page.  Perhaps something like:

@@ -4996,6 +4996,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (vm_shared || is_continue)
 		unlock_page(page);
 out_release_nounlock:
+	restore_reserve_on_error(h, dst_vma, dst_addr, page);
 	put_page(page);
 	goto out;
 }


> 
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6b37d00890ca..d3437f9a608d 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
< snip code dealing with movement of copy routines >

With changes like those above in hugetlb_mcopy_atomic_pte, we should be
able to remove the following code in __mcopy_atomic_hugetlb.  We can do
this because any page returned/passed to __mcopy_atomic_hugetlb will NOT
have consumed a reserve.  It would be a nice cleanup as that big comment
explains how we currently guess what is the right thing to do in this
situation.

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d55..e13a0492b7ba 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -346,54 +346,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 out_unlock:
 	mmap_read_unlock(dst_mm);
 out:
-	if (page) {
-		/*
-		 * We encountered an error and are about to free a newly
-		 * allocated huge page.
-		 *
-		 * Reservation handling is very subtle, and is different for
-		 * private and shared mappings.  See the routine
-		 * restore_reserve_on_error for details.  Unfortunately, we
-		 * can not call restore_reserve_on_error now as it would
-		 * require holding mmap_lock.
-		 *
-		 * If a reservation for the page existed in the reservation
-		 * map of a private mapping, the map was modified to indicate
-		 * the reservation was consumed when the page was allocated.
-		 * We clear the HPageRestoreReserve flag now so that the global
-		 * reserve count will not be incremented in free_huge_page.
-		 * The reservation map will still indicate the reservation
-		 * was consumed and possibly prevent later page allocation.
-		 * This is better than leaking a global reservation.  If no
-		 * reservation existed, it is still safe to clear
-		 * HPageRestoreReserve as no adjustments to reservation counts
-		 * were made during allocation.
-		 *
-		 * The reservation map for shared mappings indicates which
-		 * pages have reservations.  When a huge page is allocated
-		 * for an address with a reservation, no change is made to
-		 * the reserve map.  In this case HPageRestoreReserve will be
-		 * set to indicate that the global reservation count should be
-		 * incremented when the page is freed.  This is the desired
-		 * behavior.  However, when a huge page is allocated for an
-		 * address without a reservation a reservation entry is added
-		 * to the reservation map, and HPageRestoreReserve will not be
-		 * set. When the page is freed, the global reserve count will
-		 * NOT be incremented and it will appear as though we have
-		 * leaked reserved page.  In this case, set HPageRestoreReserve
-		 * so that the global reserve count will be incremented to
-		 * match the reservation map entry which was created.
-		 *
-		 * Note that vm_alloc_shared is based on the flags of the vma
-		 * for which the page was originally allocated.  dst_vma could
-		 * be different or NULL on error.
-		 */
-		if (vm_alloc_shared)
-			SetHPageRestoreReserve(page);
-		else
-			ClearHPageRestoreReserve(page);
+	if (page)
 		put_page(page);
-	}
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);

-- 
Mike Kravetz


* Re: [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
  2021-05-24 18:07 ` Mike Kravetz
@ 2021-05-25  0:11   ` Mina Almasry
  2021-05-25  0:45     ` Mike Kravetz
  0 siblings, 1 reply; 16+ messages in thread
From: Mina Almasry @ 2021-05-25  0:11 UTC (permalink / raw)
  To: Mike Kravetz; +Cc: Axel Rasmussen, Peter Xu, Linux-MM, Andrew Morton, open list

> > +                     if (!HPageRestoreReserve(page)) {
> > +                             if (unlikely(hugetlb_unreserve_pages(
> > +                                         mapping->host, idx, idx + 1, 1)))
> > +                                     hugetlb_fix_reserve_counts(
> > +                                             mapping->host);
> > +                     }
>
> I do not understand the need to call hugetlb_unreserve_pages().  The
> call to restore_reserve_on_error 'should' fix up the reserve map to
> align with restoring the reserve count in put_page/free_huge_page.
> Can you explain why that is there?
>

AFAICT here is what happens for a given index *without* the call to
hugetlb_unreserve_pages():

1. hugetlb_no_page() allocates a page consuming the reservation,
resv_huge_pages decrements.
2. remove_inode_hugepages() does remove_huge_page() and
hugetlb_unreserve_pages(). This removes the entry from the resv_map,
but does NOT increment back the resv_huge_pages. Because we removed
the entry, it looks like we have no reservation for this index.
free_huge_page() gets called on this page, and resv_huge_pages is not
incremented, I'm not sure why. This page should have come from the
reserves.
3. hugetlb_mcopy_atomic_pte() gets called for this index. Because of
the prior call to hugetlb_unreserve_pages(), there is no entry in the
resv_map for this index, which means it looks like we don't have a
reservation for this index. We allocate a page outside the reserves
(deferred_reservation=1, HPageRestoreReserve=0), add an entry into
resv_map, and don't modify resv_huge_pages.
4. The copy fails and we deallocate the page, since
HPageRestoreReserve==0 for this page, restore_reserve_on_error() does
nothing.
5. hugetlb_mcopy_atomic_pte() gets recalled with the temporary page,
and we allocate another page. Now, since we added an entry in the
resv_map in the previous allocation, it looks like we have a
reservation for this allocation. We allocate a page with
deferred_reserve=0 && HPageRestoreReserve=1, we decrement
resv_huge_pages. Boom, we decremented resv_huge_pages twice for this
index, never incremented it.

To fix this, in step 4, when I deallocate a page, I check
HPageRestoreReserve(page). If HPageRestoreReserve=0, then this
reservation was consumed and deallocated before, and so I need to
remove the entry from the resv_map.
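The five steps above can be modelled with a toy state machine (a userspace
sketch with made-up names, not kernel code), where has_entry stands in for
the resv_map entry at one index and resv for resv_huge_pages:

```c
#include <assert.h>
#include <stdbool.h>

struct state { bool has_entry; long resv; };

/* Step 3: allocate without a resv_map entry -> an entry is added but
 * the reserve count is untouched (the page came from outside the
 * reserves). */
static void alloc_no_reserve(struct state *s)
{
	s->has_entry = true;
}

/* Step 4: the copy fails and the page is freed.  Without the fix the
 * stale resv_map entry is left behind; with the fix (the
 * hugetlb_unreserve_pages() call in the patch) it is removed. */
static void free_on_error(struct state *s, bool fixed)
{
	if (fixed)
		s->has_entry = false;
}

/* Step 5: reallocate.  An existing resv_map entry now looks like a
 * reservation, so resv_huge_pages is decremented. */
static void realloc_page(struct state *s)
{
	if (s->has_entry)
		s->resv--;
}
```

Running the sequence without the fix leaves resv one lower than it should
be, which is the decrement-without-matching-increment Mina describes.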


* Re: [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY
  2021-05-25  0:11   ` Mina Almasry
@ 2021-05-25  0:45     ` Mike Kravetz
  2021-05-25 23:31       ` [PATCH 0/2] Track reserve map changes to restore on error Mike Kravetz
  0 siblings, 1 reply; 16+ messages in thread
From: Mike Kravetz @ 2021-05-25  0:45 UTC (permalink / raw)
  To: Mina Almasry; +Cc: Axel Rasmussen, Peter Xu, Linux-MM, Andrew Morton, open list

On 5/24/21 5:11 PM, Mina Almasry wrote:
>>> +                     if (!HPageRestoreReserve(page)) {
>>> +                             if (unlikely(hugetlb_unreserve_pages(
>>> +                                         mapping->host, idx, idx + 1, 1)))
>>> +                                     hugetlb_fix_reserve_counts(
>>> +                                             mapping->host);
>>> +                     }
>>
>> I do not understand the need to call hugetlb_unreserve_pages().  The
>> call to restore_reserve_on_error 'should' fix up the reserve map to
>> align with restoring the reserve count in put_page/free_huge_page.
>> Can you explain why that is there?
>>
> 
> AFAICT here is what happens for a given index *without* the call to
> hugetlb_unreserve_pages():
> 
> 1. hugetlb_no_page() allocates a page consuming the reservation,
> resv_huge_pages decrements.

Ok

> 2. remove_inode_hugepages() does remove_huge_page() and
> hugetlb_unreserve_pages(). This removes the entry from the resv_map,
> but does NOT increment back the resv_huge_pages. Because we removed
> the entry, it looks like we have no reservation for this index.
> free_huge_page() gets called on this page, and resv_huge_pages is not
> incremented, I'm not sure why. This page should have come from the
> reserves.

This corresponds to a 'hole punch' operation.  When hugetlbfs hole punch
was added, a design decision was made to not try to restore reservations.
IIRC, this is because the hole punch is associated with the file and
reservations are associated with mappings.  Trying to 'go back' to
associated mappings and determine if a reservation should be restored
would be a difficult exercise.

> 3. hugetlb_mcopy_atomic_pte() gets called for this index. Because of
> the prior call to hugetlb_unreserve_pages(), there is no entry in the
> resv_map for this index, which means it looks like we don't have a
> reservation for this index. We allocate a page outside the reserves
> (deferred_reserve=1, HPageRestoreReserve=0), add an entry into
> resv_map, and don't modify resv_huge_pages.

Ok

> 4. The copy fails and we deallocate the page, since
> HPageRestoreReserve==0 for this page, restore_reserve_on_error() does
> nothing.

Yes.  And this may be something we want to fix in the more general case.
i.e. I believe this can happen in code paths other than
hugetlb_mcopy_atomic_pte().  Rather than add special code here, I'll look
into updating restore_reserve_on_error to handle this.  Currently
restore_reserve_on_error only handles the case where HPageRestoreReserve
flags is set.

Thanks for explaining this!  I forgot about this 'special case' where
there is not an entry in the reserve map for a shared mapping.

-- 
Mike Kravetz

> 5. hugetlb_mcopy_atomic_pte() gets called again with the temporary page,
> and we allocate another page. Now, since we added an entry in the
> resv_map in the previous allocation, it looks like we have a
> reservation for this allocation. We allocate a page with
> deferred_reserve=0 && HPageRestoreReserve=1, we decrement
> resv_huge_pages. Boom, we decremented resv_huge_pages twice for this
> index, never incremented it.
> 
> To fix this, in step 4, when I deallocate a page, I check
> HPageRestoreReserve(page). If HPageRestoreReserve=0, then this
> reservation was consumed and deallocated before, and so I need to
> remove the entry from the resv_map.


* [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-25  0:45     ` Mike Kravetz
@ 2021-05-25 23:31       ` Mike Kravetz
  2021-05-25 23:31         ` [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt Mike Kravetz
                           ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Mike Kravetz @ 2021-05-25 23:31 UTC (permalink / raw)
  To: Mina Almasry, Linux-MM, open list
  Cc: Axel Rasmussen, Peter Xu, Andrew Morton, Mike Kravetz

Here is a modification to the reservation tracking for fixup on errors.
It is a more general change, but should work for the hugetlb_mcopy_atomic_pte
case as well.

Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
will eliminate the need for the call to hugetlb_unreserve_pages.

Mike Kravetz (2):
  hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt
  hugetlb: add new hugetlb specific flag HPG_restore_rsv_map

 fs/hugetlbfs/inode.c    |   3 +-
 include/linux/hugetlb.h |  17 +++++--
 mm/hugetlb.c            | 108 ++++++++++++++++++++++++++++------------
 mm/userfaultfd.c        |  14 +++---
 4 files changed, 99 insertions(+), 43 deletions(-)

-- 
2.31.1



* [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt
  2021-05-25 23:31       ` [PATCH 0/2] Track reserve map changes to restore on error Mike Kravetz
@ 2021-05-25 23:31         ` Mike Kravetz
  2021-05-27  2:49           ` Mina Almasry
  2021-05-25 23:31         ` [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map Mike Kravetz
  2021-05-26  3:19         ` [External] [PATCH 0/2] Track reserve map changes to restore on error Muchun Song
  2 siblings, 1 reply; 16+ messages in thread
From: Mike Kravetz @ 2021-05-25 23:31 UTC (permalink / raw)
  To: Mina Almasry, Linux-MM, open list
  Cc: Axel Rasmussen, Peter Xu, Andrew Morton, Mike Kravetz

The hugetlb page specific flag HPageRestoreReserve is used to indicate
that a reservation was consumed and should be restored on error.  More
specifically, this is related to the global reservation count.

It is also possible to have a hugetlb page allocated where the global
count is not modified, but the reservation map is modified.  In such
cases, the reserve map needs modification in error paths.  Code
performing such modifications will be introduced in a subsequent patch.

Rename the flag HPageRestoreReserve to HPageRestoreRsvCnt so that it
is more clearly associated with the global reservation count.  No
functional changes in this patch, just renaming.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c    |  2 +-
 include/linux/hugetlb.h |  6 +++---
 mm/hugetlb.c            | 32 ++++++++++++++++----------------
 mm/userfaultfd.c        | 14 +++++++-------
 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 55efd3dd04f6..bb4de5dcd652 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -529,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			 * the subpool and global reserve usage count can need
 			 * to be adjusted.
 			 */
-			VM_BUG_ON(HPageRestoreReserve(page));
+			VM_BUG_ON(HPageRestoreRsvCnt(page));
 			remove_huge_page(page);
 			freed++;
 			if (!truncate_op) {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03ca83db0a3e..e5e363fa5d02 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,7 +511,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * of the hugetlb head page.  Functions created via the below macros should be
  * used to manipulate these flags.
  *
- * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
+ * HPG_restore_rsv_cnt - Set when a hugetlb page consumes a reservation at
  *	allocation time.  Cleared when page is fully instantiated.  Free
  *	routine checks flag to restore a reservation on error paths.
  *	Synchronization:  Examined or modified by code that knows it has
@@ -535,7 +535,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
  */
 enum hugetlb_page_flags {
-	HPG_restore_reserve = 0,
+	HPG_restore_rsv_cnt = 0,
 	HPG_migratable,
 	HPG_temporary,
 	HPG_freed,
@@ -581,7 +581,7 @@ static inline void ClearHPage##uname(struct page *page)		\
 /*
  * Create functions associated with hugetlb page flags
  */
-HPAGEFLAG(RestoreReserve, restore_reserve)
+HPAGEFLAG(RestoreRsvCnt, restore_rsv_cnt)
 HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c3b2a8a494d6..2a8cea253388 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1165,7 +1165,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
-		SetHPageRestoreReserve(page);
+		SetHPageRestoreRsvCnt(page);
 		h->resv_huge_pages--;
 	}
 
@@ -1549,11 +1549,11 @@ void free_huge_page(struct page *page)
 
 	hugetlb_set_page_subpool(page, NULL);
 	page->mapping = NULL;
-	restore_reserve = HPageRestoreReserve(page);
-	ClearHPageRestoreReserve(page);
+	restore_reserve = HPageRestoreRsvCnt(page);
+	ClearHPageRestoreRsvCnt(page);
 
 	/*
-	 * If HPageRestoreReserve was set on page, page allocation consumed a
+	 * If HPageRestoreRsvCnt was set on page, page allocation consumed a
 	 * reservation.  If the page was associated with a subpool, there
 	 * would have been a page reserved in the subpool before allocation
 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
@@ -2364,9 +2364,9 @@ static long vma_add_reservation(struct hstate *h,
  * specific error paths, a huge page was allocated (via alloc_huge_page)
  * and is about to be freed.  If a reservation for the page existed,
  * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page.  When the page is freed
+ * HPageRestoreRsvCnt in the newly allocated page.  When the page is freed
  * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
+ * HPageRestoreRsvCnt is set.  However, free_huge_page can not adjust the
  * reserve map.  Adjust the reserve map here to be consistent with global
  * reserve count adjustments to be made by free_huge_page.
  */
@@ -2374,13 +2374,13 @@ static void restore_reserve_on_error(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address,
 			struct page *page)
 {
-	if (unlikely(HPageRestoreReserve(page))) {
+	if (unlikely(HPageRestoreRsvCnt(page))) {
 		long rc = vma_needs_reservation(h, vma, address);
 
 		if (unlikely(rc < 0)) {
 			/*
 			 * Rare out of memory condition in reserve map
-			 * manipulation.  Clear HPageRestoreReserve so that
+			 * manipulation.  Clear HPageRestoreRsvCnt so that
 			 * global reserve count will not be incremented
 			 * by free_huge_page.  This will make it appear
 			 * as though the reservation for this page was
@@ -2389,7 +2389,7 @@ static void restore_reserve_on_error(struct hstate *h,
 			 * is better than inconsistent global huge page
 			 * accounting of reserve counts.
 			 */
-			ClearHPageRestoreReserve(page);
+			ClearHPageRestoreRsvCnt(page);
 		} else if (rc) {
 			rc = vma_add_reservation(h, vma, address);
 			if (unlikely(rc < 0))
@@ -2397,7 +2397,7 @@ static void restore_reserve_on_error(struct hstate *h,
 				 * See above comment about rare out of
 				 * memory condition.
 				 */
-				ClearHPageRestoreReserve(page);
+				ClearHPageRestoreRsvCnt(page);
 		} else
 			vma_end_reservation(h, vma, address);
 	}
@@ -2602,7 +2602,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 		if (!page)
 			goto out_uncharge_cgroup;
 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
-			SetHPageRestoreReserve(page);
+			SetHPageRestoreRsvCnt(page);
 			h->resv_huge_pages--;
 		}
 		spin_lock_irq(&hugetlb_lock);
@@ -4052,7 +4052,7 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
 	set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
 	hugepage_add_new_anon_rmap(new_page, vma, addr);
 	hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
-	ClearHPageRestoreReserve(new_page);
+	ClearHPageRestoreRsvCnt(new_page);
 	SetHPageMigratable(new_page);
 }
 
@@ -4525,7 +4525,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
-		ClearHPageRestoreReserve(new_page);
+		ClearHPageRestoreRsvCnt(new_page);
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4592,7 +4592,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 
 	if (err)
 		return err;
-	ClearHPageRestoreReserve(page);
+	ClearHPageRestoreRsvCnt(page);
 
 	/*
 	 * set page dirty so that it will not be removed from cache/file
@@ -4775,7 +4775,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		goto backout;
 
 	if (anon_rmap) {
-		ClearHPageRestoreReserve(page);
+		ClearHPageRestoreRsvCnt(page);
 		hugepage_add_new_anon_rmap(page, vma, haddr);
 	} else
 		page_dup_rmap(page, true);
@@ -5096,7 +5096,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (vm_shared) {
 		page_dup_rmap(page, true);
 	} else {
-		ClearHPageRestoreReserve(page);
+		ClearHPageRestoreRsvCnt(page);
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9ce5a3793ad4..58c706697b17 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -432,27 +432,27 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		 * If a reservation for the page existed in the reservation
 		 * map of a private mapping, the map was modified to indicate
 		 * the reservation was consumed when the page was allocated.
-		 * We clear the HPageRestoreReserve flag now so that the global
+		 * We clear the HPageRestoreRsvCnt flag now so that the global
 		 * reserve count will not be incremented in free_huge_page.
 		 * The reservation map will still indicate the reservation
 		 * was consumed and possibly prevent later page allocation.
 		 * This is better than leaking a global reservation.  If no
 		 * reservation existed, it is still safe to clear
-		 * HPageRestoreReserve as no adjustments to reservation counts
+		 * HPageRestoreRsvCnt as no adjustments to reservation counts
 		 * were made during allocation.
 		 *
 		 * The reservation map for shared mappings indicates which
 		 * pages have reservations.  When a huge page is allocated
 		 * for an address with a reservation, no change is made to
-		 * the reserve map.  In this case HPageRestoreReserve will be
+		 * the reserve map.  In this case HPageRestoreRsvCnt will be
 		 * set to indicate that the global reservation count should be
 		 * incremented when the page is freed.  This is the desired
 		 * behavior.  However, when a huge page is allocated for an
 		 * address without a reservation a reservation entry is added
-		 * to the reservation map, and HPageRestoreReserve will not be
+		 * to the reservation map, and HPageRestoreRsvCnt will not be
 		 * set. When the page is freed, the global reserve count will
 		 * NOT be incremented and it will appear as though we have
-		 * leaked reserved page.  In this case, set HPageRestoreReserve
+		 * leaked reserved page.  In this case, set HPageRestoreRsvCnt
 		 * so that the global reserve count will be incremented to
 		 * match the reservation map entry which was created.
 		 *
@@ -461,9 +461,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		 * be different or NULL on error.
 		 */
 		if (vm_alloc_shared)
-			SetHPageRestoreReserve(page);
+			SetHPageRestoreRsvCnt(page);
 		else
-			ClearHPageRestoreReserve(page);
+			ClearHPageRestoreRsvCnt(page);
 		put_page(page);
 	}
 	BUG_ON(copied < 0);
-- 
2.31.1



* [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map
  2021-05-25 23:31       ` [PATCH 0/2] Track reserve map changes to restore on error Mike Kravetz
  2021-05-25 23:31         ` [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt Mike Kravetz
@ 2021-05-25 23:31         ` Mike Kravetz
  2021-05-27  2:58           ` Mina Almasry
  2021-05-26  3:19         ` [External] [PATCH 0/2] Track reserve map changes to restore on error Muchun Song
  2 siblings, 1 reply; 16+ messages in thread
From: Mike Kravetz @ 2021-05-25 23:31 UTC (permalink / raw)
  To: Mina Almasry, Linux-MM, open list
  Cc: Axel Rasmussen, Peter Xu, Andrew Morton, Mike Kravetz

When a hugetlb page is allocated via alloc_huge_page, the reserve map
as well as global reservation count may be modified.  In case of error
after allocation, the count and map should be restored to their previous
state if possible.  The flag HPageRestoreRsvCnt indicates the global
count was modified.  Add a new flag HPG_restore_rsv_map to indicate the
reserve map was modified.  Note that during hugetlb page allocation the
global count and reserve map could be modified independently.
Therefore, two specific flags are needed.

The routine restore_reserve_on_error is called to restore reserve data
on error paths.  Modify the routine to check for the HPG_restore_rsv_map
flag and adjust the reserve map accordingly.

Add missing calls to restore_reserve_on_error to error paths of code
calling alloc_huge_page.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c    |  1 +
 include/linux/hugetlb.h | 11 ++++++
 mm/hugetlb.c            | 82 +++++++++++++++++++++++++++++++----------
 3 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index bb4de5dcd652..9d846a2edc4b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		__SetPageUptodate(page);
 		error = huge_add_to_page_cache(page, mapping, index);
 		if (unlikely(error)) {
+			restore_reserve_on_error(h, &pseudo_vma, addr, page);
 			put_page(page);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e5e363fa5d02..da2251b0c609 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -517,6 +517,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	Synchronization:  Examined or modified by code that knows it has
  *	the only reference to page.  i.e. After allocation but before use
  *	or when the page is being freed.
+ * HPG_restore_rsv_map - Set when a hugetlb page allocation results in adding
+ *	an entry to the reserve map.  This can happen without adjustment of
+ *	the global reserve count.  Cleared when page is fully instantiated.
+ *	Error paths (restore_reserve_on_error) check this flag to make
+ *	adjustments to the reserve map.
+ *	Synchronization:  Examined or modified by code that knows it has
+ *	the only reference to page.  i.e. After allocation but before use.
  * HPG_migratable  - Set after a newly allocated page is added to the page
  *	cache and/or page tables.  Indicates the page is a candidate for
  *	migration.
@@ -536,6 +543,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  */
 enum hugetlb_page_flags {
 	HPG_restore_rsv_cnt = 0,
+	HPG_restore_rsv_map,
 	HPG_migratable,
 	HPG_temporary,
 	HPG_freed,
@@ -582,6 +590,7 @@ static inline void ClearHPage##uname(struct page *page)		\
  * Create functions associated with hugetlb page flags
  */
 HPAGEFLAG(RestoreRsvCnt, restore_rsv_cnt)
+HPAGEFLAG(RestoreRsvMap, restore_rsv_map)
 HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
@@ -633,6 +642,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+		unsigned long address, struct page *page);
 
 /* arch callback */
 int __init __alloc_bootmem_huge_page(struct hstate *h);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2a8cea253388..1c3a68d70ab5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1551,6 +1551,7 @@ void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	restore_reserve = HPageRestoreRsvCnt(page);
 	ClearHPageRestoreRsvCnt(page);
+	ClearHPageRestoreRsvMap(page);
 
 	/*
 	 * If HPageRestoreRsvCnt was set on page, page allocation consumed a
@@ -2360,24 +2361,26 @@ static long vma_add_reservation(struct hstate *h,
 }
 
 /*
- * This routine is called to restore a reservation on error paths.  In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed.  If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreRsvCnt in the newly allocated page.  When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreRsvCnt is set.  However, free_huge_page can not adjust the
- * reserve map.  Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation data on error paths.
+ * It handles two specific cases for pages allocated via alloc_huge_page:
+ * 1) A reservation was in place and page consumed the reservation.
+ *    HPageRestoreRsvCnt is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreRsvCnt is
+ *    not set.  However, the reserve map was updated.
+ * In case 1, free_huge_page will increment the global reserve count.  But,
+ * free_huge_page does not have enough context to adjust the reservation map.
+ * This case deals primarily with private mappings.  Adjust the reserve map
+ * here to be consistent with global reserve count adjustments to be made
+ * by free_huge_page.
+ * In case 2, simply undo any reserve map modifications done by alloc_huge_page.
  */
-static void restore_reserve_on_error(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address,
-			struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+				unsigned long address, struct page *page)
 {
 	if (unlikely(HPageRestoreRsvCnt(page))) {
 		long rc = vma_needs_reservation(h, vma, address);
 
-		if (unlikely(rc < 0)) {
+		if (unlikely(rc < 0))
 			/*
 			 * Rare out of memory condition in reserve map
 			 * manipulation.  Clear HPageRestoreRsvCnt so that
@@ -2390,16 +2393,47 @@ static void restore_reserve_on_error(struct hstate *h,
 			 * accounting of reserve counts.
 			 */
 			ClearHPageRestoreRsvCnt(page);
-		} else if (rc) {
-			rc = vma_add_reservation(h, vma, address);
-			if (unlikely(rc < 0))
+		else if (rc)
+			vma_add_reservation(h, vma, address);
+		else
+			vma_end_reservation(h, vma, address);
+	} else if (unlikely(HPageRestoreRsvMap(page))) {
+		struct resv_map *resv = vma_resv_map(vma);
+		pgoff_t idx = vma_hugecache_offset(h, vma, address);
+		long rc;
+
+		/*
+		 * This handles the specific case where the reserve count
+		 * was not updated during the page allocation process, but
+		 * the reserve map was updated.  We need to undo the reserve
+		 * map update.
+		 *
+		 * The presence of an entry in the reserve map has opposite
+		 * meanings for shared and private mappings.
+		 */
+		if (vma->vm_flags & VM_MAYSHARE) {
+			rc = region_del(resv, idx, idx + 1);
+			if (rc < 0)
+				/*
+				 * Rare out of memory condition.  Since we can
+				 * not delete the reserve entry, set
+				 * HPageRestoreRsvCnt so that the global count
+				 * will be consistent with the reserve map.
+				 */
+				SetHPageRestoreRsvCnt(page);
+		} else {
+			rc = vma_needs_reservation(h, vma, address);
+			if (rc < 0)
 				/*
 				 * See above comment about rare out of
 				 * memory condition.
 				 */
-				ClearHPageRestoreRsvCnt(page);
-		} else
-			vma_end_reservation(h, vma, address);
+				SetHPageRestoreRsvCnt(page);
+			else if (rc)
+				vma_add_reservation(h, vma, address);
+			else
+				vma_end_reservation(h, vma, address);
+		}
 	}
 }
 
@@ -2641,6 +2675,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 			hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
 					pages_per_huge_page(h), page);
 	}
+	if (map_commit)
+		SetHPageRestoreRsvMap(page);
 	return page;
 
 out_uncharge_cgroup:
@@ -4053,6 +4089,7 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
 	hugepage_add_new_anon_rmap(new_page, vma, addr);
 	hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
 	ClearHPageRestoreRsvCnt(new_page);
+	ClearHPageRestoreRsvMap(new_page);
 	SetHPageMigratable(new_page);
 }
 
@@ -4174,6 +4211,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 				entry = huge_ptep_get(src_pte);
 				if (!pte_same(src_pte_old, entry)) {
+					restore_reserve_on_error(h, vma, addr,
+								new);
 					put_page(new);
 					/* dst_entry won't change as in child */
 					goto again;
@@ -4526,6 +4565,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		ClearHPageRestoreRsvCnt(new_page);
+		ClearHPageRestoreRsvMap(new_page);
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4593,6 +4633,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 	if (err)
 		return err;
 	ClearHPageRestoreRsvCnt(page);
+	ClearHPageRestoreRsvMap(page);
 
 	/*
 	 * set page dirty so that it will not be removed from cache/file
@@ -4776,6 +4817,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 
 	if (anon_rmap) {
 		ClearHPageRestoreRsvCnt(page);
+		ClearHPageRestoreRsvMap(page);
 		hugepage_add_new_anon_rmap(page, vma, haddr);
 	} else
 		page_dup_rmap(page, true);
@@ -5097,6 +5139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		page_dup_rmap(page, true);
 	} else {
 		ClearHPageRestoreRsvCnt(page);
+		ClearHPageRestoreRsvMap(page);
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
@@ -5133,6 +5176,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (vm_shared || is_continue)
 		unlock_page(page);
 out_release_nounlock:
+	restore_reserve_on_error(h, dst_vma, dst_addr, page);
 	put_page(page);
 	goto out;
 }
-- 
2.31.1



* Re: [External] [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-25 23:31       ` [PATCH 0/2] Track reserve map changes to restore on error Mike Kravetz
  2021-05-25 23:31         ` [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt Mike Kravetz
  2021-05-25 23:31         ` [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map Mike Kravetz
@ 2021-05-26  3:19         ` Muchun Song
  2021-05-26 17:17           ` Mike Kravetz
  2 siblings, 1 reply; 16+ messages in thread
From: Muchun Song @ 2021-05-26  3:19 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Mina Almasry, Linux-MM, open list, Axel Rasmussen, Peter Xu,
	Andrew Morton

On Wed, May 26, 2021 at 7:31 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> Here is a modification to the reservation tracking for fixup on errors.
> It is a more general change, but should work for the hugetlb_mcopy_atomic_pte
> case as well.
>
> Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
> will eliminate the need for the call to hugetlb_unreserve_pages.

Hi Mike,

It seems like this series is fixing a bug, right? Maybe a link should be
placed in the cover letter so that readers can know what issue
we are facing.

Thanks.

>
> Mike Kravetz (2):
>   hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt
>   hugetlb: add new hugetlb specific flag HPG_restore_rsv_map
>
>  fs/hugetlbfs/inode.c    |   3 +-
>  include/linux/hugetlb.h |  17 +++++--
>  mm/hugetlb.c            | 108 ++++++++++++++++++++++++++++------------
>  mm/userfaultfd.c        |  14 +++---
>  4 files changed, 99 insertions(+), 43 deletions(-)
>
> --
> 2.31.1
>


* Re: [External] [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-26  3:19         ` [External] [PATCH 0/2] Track reserve map changes to restore on error Muchun Song
@ 2021-05-26 17:17           ` Mike Kravetz
  2021-05-26 23:19             ` Mina Almasry
  0 siblings, 1 reply; 16+ messages in thread
From: Mike Kravetz @ 2021-05-26 17:17 UTC (permalink / raw)
  To: Muchun Song
  Cc: Mina Almasry, Linux-MM, open list, Axel Rasmussen, Peter Xu,
	Andrew Morton

On 5/25/21 8:19 PM, Muchun Song wrote:
> On Wed, May 26, 2021 at 7:31 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> Here is a modification to the reservation tracking for fixup on errors.
>> It is a more general change, but should work for the hugetlb_mcopy_atomic_pte
>> case as well.
>>
>> Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
>> will eliminate the need for the call to hugetlb_unreserve_pages.
> 
> Hi Mike,
> 
> It seems like this series is fixing a bug, right? Maybe a link should be
> placed in the cover letter so that readers can know what issue
> we are facing.
> 

Thanks Muchun,

I wanted to first see if these patches would work in the code Mina is
modifying.  If this works for Mina, then a more formal patch and request
for inclusion will be sent.

I believe this issue has existed since the introduction of hugetlb
reservations in v2.6.18.  Since the bug only shows up when we take error
paths, the issue may not have been observed.  Mina found a similar issue
in an error path which could also expose this issue.
-- 
Mike Kravetz


* Re: [External] [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-26 17:17           ` Mike Kravetz
@ 2021-05-26 23:19             ` Mina Almasry
  2021-05-27  2:48               ` Mina Almasry
  0 siblings, 1 reply; 16+ messages in thread
From: Mina Almasry @ 2021-05-26 23:19 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Muchun Song, Linux-MM, open list, Axel Rasmussen, Peter Xu,
	Andrew Morton

On Wed, May 26, 2021 at 10:17 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 5/25/21 8:19 PM, Muchun Song wrote:
> > On Wed, May 26, 2021 at 7:31 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> Here is a modification to the reservation tracking for fixup on errors.
> >> It is a more general change, but should work for the hugetlb_mcopy_atomic_pte
> >> case as well.
> >>
> >> Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
> >> will eliminate the need for the call to hugetlb_unreserve_pages.
> >
> > Hi Mike,
> >
> > It seems like this series is fixing a bug, right? Maybe a link should be
> > placed in the cover letter so that readers can know what issue
> > we are facing.
> >
>
> Thanks Muchun,
>
> I wanted to first see if these patches would work in the code Mina is
> modifying.  If this works for Mina, then a more formal patch and request
> for inclusion will be sent.
>

So a quick test: I applied my patches and yours on top of linus/master,
removed the hugetlb_unreserve_pages() call that triggered this
conversation, and ran the userfaultfd test. resv_huge_pages underflows
again, so on the surface this doesn't quite seem to work as is.

Not quite sure what to do off the top of my head. I think I will try
to debug why the 3 patches don't work together and I will fix either
your patch or mine. I haven't taken a deep look yet; I just ran a
quick test.

> I believe this issue has existed since the introduction of hugetlb
> reservations in v2.6.18.  Since the bug only shows up when we take error
> paths, the issue may not have been observed.  Mina found a similar issue
> in an error path which could also expose this issue.
> --
> Mike Kravetz


* Re: [External] [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-26 23:19             ` Mina Almasry
@ 2021-05-27  2:48               ` Mina Almasry
  2021-05-27 16:08                 ` Mike Kravetz
  0 siblings, 1 reply; 16+ messages in thread
From: Mina Almasry @ 2021-05-27  2:48 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Muchun Song, Linux-MM, open list, Axel Rasmussen, Peter Xu,
	Andrew Morton

On Wed, May 26, 2021 at 4:19 PM Mina Almasry <almasrymina@google.com> wrote:
>
> On Wed, May 26, 2021 at 10:17 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > On 5/25/21 8:19 PM, Muchun Song wrote:
> > > On Wed, May 26, 2021 at 7:31 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >>
> > >> Here is a modification to the reservation tracking for fixup on errors.
> > >> It is a more general change, but should work for the hugetlb_mcopy_atomic_pte
> > >> case as well.
> > >>
> > >> Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
> > >> will eliminate the need for the call to hugetlb_unreserve_pages.
> > >
> > > Hi Mike,
> > >
> > > It seems like someone is fixing a bug, right? Maybe a link should be
> > > placed in the cover letter so that someone can know what issue
> > > we are facing.
> > >
> >
> > Thanks Muchun,
> >
> > I wanted to first see if these patches would work in the code Mina is
> > modifying.  If this works for Mina, then a more formal patch and request
> > for inclusion will be sent.
> >
>
> So a quick test: I applied my patches and yours on top of linus/master,
> removed the hugetlb_unreserve_pages() call that triggered this
> conversation, and ran the userfaultfd test. resv_huge_pages underflows
> again, so on the surface this doesn't quite work as is.
>
> Not quite sure what to do off the top of my head. I think I will try
> to debug why the 3 patches don't work together and I will fix either
> your patch or mine. I haven't taken a deep look yet; I just ran a
> quick test.
>

OK, found the issue. With the setup I described above, the
hugetlb_shared test case passes:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2
/tmp/kokonut_test/huge/userfaultfd_test && echo test success

The non-shared test case is the one that underflows:

./tools/testing/selftests/vm/userfaultfd hugetlb 10 2
/tmp/kokonut_test/huge/userfaultfd_test && echo test success

I've debugged a bit, and this messy hunk 'fixes' the underflow in the
non-shared case. (Sorry for the messiness.)

@@ -2329,17 +2340,14 @@ void restore_reserve_on_error(struct hstate
*h, struct vm_area_struct *vma,
                                 */
                                SetHPageRestoreRsvCnt(page);
                } else {
-                       rc = vma_needs_reservation(h, vma, address);
-                       if (rc < 0)
-                               /*
-                                * See above comment about rare out of
-                                * memory condition.
-                                */
-                               SetHPageRestoreRsvCnt(page);
-                       else if (rc)
-                               vma_add_reservation(h, vma, address);
-                       else
-                               vma_end_reservation(h, vma, address);
+                       resv = inode_resv_map(vma->vm_file->f_mapping->host);
+                       if (resv) {
+                               int chg = region_del(resv, idx, idx+1);
+                               VM_BUG_ON(chg);
+                       }

The reason is that on page allocation we region_add() an entry
into the resv_map regardless of whether this is a shared mapping or
not (vma_needs_reservation() + vma_commit_reservation(), which amounts
to a region_add() at the end of the day).

To roll back this change on error, we need a region_del() to undo the region_add().

The code removed above never ends up calling region_del(), because
vma_needs_reservation() returns 0: region_chg() sees there is already
an entry in the resv_map and returns 0.

The VM_BUG_ON() is just because I'm not sure how to handle that error.

> > I believe this issue has existed since the introduction of hugetlb
> > reservations in v2.6.18.  Since the bug only shows up when we take error
> > paths, the issue may not have been observed.  Mina found a similar issue
> > in an error path which could also expose this issue.
> > --
> > Mike Kravetz

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt
  2021-05-25 23:31         ` [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt Mike Kravetz
@ 2021-05-27  2:49           ` Mina Almasry
  0 siblings, 0 replies; 16+ messages in thread
From: Mina Almasry @ 2021-05-27  2:49 UTC (permalink / raw)
  To: Mike Kravetz; +Cc: Linux-MM, open list, Axel Rasmussen, Peter Xu, Andrew Morton

On Tue, May 25, 2021 at 4:31 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> The hugetlb page specific flag HPageRestoreReserve is used to indicate
> that a reservation was consumed and should be restored on error.  More
> specifically, this is related to the global reservation count.
>
> It is also possible to have a hugetlb page allocated where the global
> count is not modified, but the reservation map is modified.  In such
> cases, the reserve map needs modification in error paths.  Code
> performing such modifications will be introduced in a subsequent patch.
>
> Rename the flag HPageRestoreReserve to HPageRestoreRsvCnt so that it
> is more clearly associated with the global reservation count.  No
> functional changes in this patch, just renaming.
>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c    |  2 +-
>  include/linux/hugetlb.h |  6 +++---
>  mm/hugetlb.c            | 32 ++++++++++++++++----------------
>  mm/userfaultfd.c        | 14 +++++++-------
>  4 files changed, 27 insertions(+), 27 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 55efd3dd04f6..bb4de5dcd652 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -529,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>                          * the subpool and global reserve usage count can need
>                          * to be adjusted.
>                          */
> -                       VM_BUG_ON(HPageRestoreReserve(page));
> +                       VM_BUG_ON(HPageRestoreRsvCnt(page));
>                         remove_huge_page(page);
>                         freed++;
>                         if (!truncate_op) {
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 03ca83db0a3e..e5e363fa5d02 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -511,7 +511,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>   * of the hugetlb head page.  Functions created via the below macros should be
>   * used to manipulate these flags.
>   *
> - * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
> + * HPG_restore_rsv_cnt - Set when a hugetlb page consumes a reservation at
>   *     allocation time.  Cleared when page is fully instantiated.  Free
>   *     routine checks flag to restore a reservation on error paths.
>   *     Synchronization:  Examined or modified by code that knows it has
> @@ -535,7 +535,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>   * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
>   */
>  enum hugetlb_page_flags {
> -       HPG_restore_reserve = 0,
> +       HPG_restore_rsv_cnt = 0,
>         HPG_migratable,
>         HPG_temporary,
>         HPG_freed,
> @@ -581,7 +581,7 @@ static inline void ClearHPage##uname(struct page *page)             \
>  /*
>   * Create functions associated with hugetlb page flags
>   */
> -HPAGEFLAG(RestoreReserve, restore_reserve)
> +HPAGEFLAG(RestoreRsvCnt, restore_rsv_cnt)
>  HPAGEFLAG(Migratable, migratable)
>  HPAGEFLAG(Temporary, temporary)
>  HPAGEFLAG(Freed, freed)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index c3b2a8a494d6..2a8cea253388 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1165,7 +1165,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
>         nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
>         page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
>         if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
> -               SetHPageRestoreReserve(page);
> +               SetHPageRestoreRsvCnt(page);
>                 h->resv_huge_pages--;
>         }
>
> @@ -1549,11 +1549,11 @@ void free_huge_page(struct page *page)
>
>         hugetlb_set_page_subpool(page, NULL);
>         page->mapping = NULL;
> -       restore_reserve = HPageRestoreReserve(page);
> -       ClearHPageRestoreReserve(page);
> +       restore_reserve = HPageRestoreRsvCnt(page);
> +       ClearHPageRestoreRsvCnt(page);
>
>         /*
> -        * If HPageRestoreReserve was set on page, page allocation consumed a
> +        * If HPageRestoreRsvCnt was set on page, page allocation consumed a
>          * reservation.  If the page was associated with a subpool, there
>          * would have been a page reserved in the subpool before allocation
>          * via hugepage_subpool_get_pages().  Since we are 'restoring' the
> @@ -2364,9 +2364,9 @@ static long vma_add_reservation(struct hstate *h,
>   * specific error paths, a huge page was allocated (via alloc_huge_page)
>   * and is about to be freed.  If a reservation for the page existed,
>   * alloc_huge_page would have consumed the reservation and set
> - * HPageRestoreReserve in the newly allocated page.  When the page is freed
> + * HPageRestoreRsvCnt in the newly allocated page.  When the page is freed
>   * via free_huge_page, the global reservation count will be incremented if
> - * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
> + * HPageRestoreRsvCnt is set.  However, free_huge_page can not adjust the
>   * reserve map.  Adjust the reserve map here to be consistent with global
>   * reserve count adjustments to be made by free_huge_page.
>   */
> @@ -2374,13 +2374,13 @@ static void restore_reserve_on_error(struct hstate *h,
>                         struct vm_area_struct *vma, unsigned long address,
>                         struct page *page)
>  {
> -       if (unlikely(HPageRestoreReserve(page))) {
> +       if (unlikely(HPageRestoreRsvCnt(page))) {
>                 long rc = vma_needs_reservation(h, vma, address);
>
>                 if (unlikely(rc < 0)) {
>                         /*
>                          * Rare out of memory condition in reserve map
> -                        * manipulation.  Clear HPageRestoreReserve so that
> +                        * manipulation.  Clear HPageRestoreRsvCnt so that
>                          * global reserve count will not be incremented
>                          * by free_huge_page.  This will make it appear
>                          * as though the reservation for this page was
> @@ -2389,7 +2389,7 @@ static void restore_reserve_on_error(struct hstate *h,
>                          * is better than inconsistent global huge page
>                          * accounting of reserve counts.
>                          */
> -                       ClearHPageRestoreReserve(page);
> +                       ClearHPageRestoreRsvCnt(page);
>                 } else if (rc) {
>                         rc = vma_add_reservation(h, vma, address);
>                         if (unlikely(rc < 0))
> @@ -2397,7 +2397,7 @@ static void restore_reserve_on_error(struct hstate *h,
>                                  * See above comment about rare out of
>                                  * memory condition.
>                                  */
> -                               ClearHPageRestoreReserve(page);
> +                               ClearHPageRestoreRsvCnt(page);
>                 } else
>                         vma_end_reservation(h, vma, address);
>         }
> @@ -2602,7 +2602,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>                 if (!page)
>                         goto out_uncharge_cgroup;
>                 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
> -                       SetHPageRestoreReserve(page);
> +                       SetHPageRestoreRsvCnt(page);
>                         h->resv_huge_pages--;
>                 }
>                 spin_lock_irq(&hugetlb_lock);
> @@ -4052,7 +4052,7 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
>         set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
>         hugepage_add_new_anon_rmap(new_page, vma, addr);
>         hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
> -       ClearHPageRestoreReserve(new_page);
> +       ClearHPageRestoreRsvCnt(new_page);
>         SetHPageMigratable(new_page);
>  }
>
> @@ -4525,7 +4525,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
>         spin_lock(ptl);
>         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
>         if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
> -               ClearHPageRestoreReserve(new_page);
> +               ClearHPageRestoreRsvCnt(new_page);
>
>                 /* Break COW */
>                 huge_ptep_clear_flush(vma, haddr, ptep);
> @@ -4592,7 +4592,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>
>         if (err)
>                 return err;
> -       ClearHPageRestoreReserve(page);
> +       ClearHPageRestoreRsvCnt(page);
>
>         /*
>          * set page dirty so that it will not be removed from cache/file
> @@ -4775,7 +4775,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>                 goto backout;
>
>         if (anon_rmap) {
> -               ClearHPageRestoreReserve(page);
> +               ClearHPageRestoreRsvCnt(page);
>                 hugepage_add_new_anon_rmap(page, vma, haddr);
>         } else
>                 page_dup_rmap(page, true);
> @@ -5096,7 +5096,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         if (vm_shared) {
>                 page_dup_rmap(page, true);
>         } else {
> -               ClearHPageRestoreReserve(page);
> +               ClearHPageRestoreRsvCnt(page);
>                 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
>         }
>
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 9ce5a3793ad4..58c706697b17 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -432,27 +432,27 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>                  * If a reservation for the page existed in the reservation
>                  * map of a private mapping, the map was modified to indicate
>                  * the reservation was consumed when the page was allocated.
> -                * We clear the HPageRestoreReserve flag now so that the global
> +                * We clear the HPageRestoreRsvCnt flag now so that the global
>                  * reserve count will not be incremented in free_huge_page.
>                  * The reservation map will still indicate the reservation
>                  * was consumed and possibly prevent later page allocation.
>                  * This is better than leaking a global reservation.  If no
>                  * reservation existed, it is still safe to clear
> -                * HPageRestoreReserve as no adjustments to reservation counts
> +                * HPageRestoreRsvCnt as no adjustments to reservation counts
>                  * were made during allocation.
>                  *
>                  * The reservation map for shared mappings indicates which
>                  * pages have reservations.  When a huge page is allocated
>                  * for an address with a reservation, no change is made to
> -                * the reserve map.  In this case HPageRestoreReserve will be
> +                * the reserve map.  In this case HPageRestoreRsvCnt will be
>                  * set to indicate that the global reservation count should be
>                  * incremented when the page is freed.  This is the desired
>                  * behavior.  However, when a huge page is allocated for an
>                  * address without a reservation a reservation entry is added
> -                * to the reservation map, and HPageRestoreReserve will not be
> +                * to the reservation map, and HPageRestoreRsvCnt will not be
>                  * set. When the page is freed, the global reserve count will
>                  * NOT be incremented and it will appear as though we have
> -                * leaked reserved page.  In this case, set HPageRestoreReserve
> +                * leaked reserved page.  In this case, set HPageRestoreRsvCnt
>                  * so that the global reserve count will be incremented to
>                  * match the reservation map entry which was created.
>                  *
> @@ -461,9 +461,9 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
>                  * be different or NULL on error.
>                  */
>                 if (vm_alloc_shared)
> -                       SetHPageRestoreReserve(page);
> +                       SetHPageRestoreRsvCnt(page);
>                 else
> -                       ClearHPageRestoreReserve(page);
> +                       ClearHPageRestoreRsvCnt(page);
>                 put_page(page);
>         }
>         BUG_ON(copied < 0);
> --
> 2.31.1
>

Reviewed-by: Mina Almasry <almasrymina@google.com>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map
  2021-05-25 23:31         ` [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map Mike Kravetz
@ 2021-05-27  2:58           ` Mina Almasry
  0 siblings, 0 replies; 16+ messages in thread
From: Mina Almasry @ 2021-05-27  2:58 UTC (permalink / raw)
  To: Mike Kravetz; +Cc: Linux-MM, open list, Axel Rasmussen, Peter Xu, Andrew Morton

On Tue, May 25, 2021 at 4:31 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> When a hugetlb page is allocated via alloc_huge_page, the reserve map
> as well as global reservation count may be modified.  In case of error
> after allocation, the count and map should be restored to their previous
> state if possible.  The flag HPageRestoreRsvCnt indicates the global
> count was modified.  Add a new flag HPG_restore_rsv_map to indicate the
> reserve map was modified.  Note that during hugetlb page allocation the
> global count and reserve map could be modified independently.
> Therefore, two specific flags are needed.
>
> The routine restore_reserve_on_error is called to restore reserve data
> on error paths.  Modify the routine to check for the HPG_restore_rsv_map
> flag and adjust the reserve map accordingly.
>

Shouldn't there be an equivalent function that fixes the reservation on
page freeing? restore_reserve_on_put_page() or something? I'm confused
that we need to restore the reservation on error, but seemingly there is
no function that restores the reservation on normal page freeing.

> Add missing calls to restore_reserve_on_error to error paths of code
> calling alloc_huge_page.
>

Would it be a good idea to add a comment above alloc_huge_page() noting
that to unroll it we need to put_page() *and* call restore_reserve_on_error()?

> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c    |  1 +
>  include/linux/hugetlb.h | 11 ++++++
>  mm/hugetlb.c            | 82 +++++++++++++++++++++++++++++++----------
>  3 files changed, 75 insertions(+), 19 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index bb4de5dcd652..9d846a2edc4b 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
>                 __SetPageUptodate(page);
>                 error = huge_add_to_page_cache(page, mapping, index);
>                 if (unlikely(error)) {
> +                       restore_reserve_on_error(h, &pseudo_vma, addr, page);
>                         put_page(page);
>                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
>                         goto out;
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index e5e363fa5d02..da2251b0c609 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -517,6 +517,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>   *     Synchronization:  Examined or modified by code that knows it has
>   *     the only reference to page.  i.e. After allocation but before use
>   *     or when the page is being freed.
> + * HPG_restore_rsv_map - Set when a hugetlb page allocation results in adding
> + *     an entry to the reserve map.  This can happen without adjustment of
> + *     the global reserve count.  Cleared when page is fully instantiated.
> + *     Error paths (restore_reserve_on_error) check this flag to make
> + *     adjustments to the reserve map.
> + *     Synchronization:  Examined or modified by code that knows it has
> + *     the only reference to page.  i.e. After allocation but before use.
>   * HPG_migratable  - Set after a newly allocated page is added to the page
>   *     cache and/or page tables.  Indicates the page is a candidate for
>   *     migration.
> @@ -536,6 +543,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>   */
>  enum hugetlb_page_flags {
>         HPG_restore_rsv_cnt = 0,
> +       HPG_restore_rsv_map,
>         HPG_migratable,
>         HPG_temporary,
>         HPG_freed,
> @@ -582,6 +590,7 @@ static inline void ClearHPage##uname(struct page *page)             \
>   * Create functions associated with hugetlb page flags
>   */
>  HPAGEFLAG(RestoreRsvCnt, restore_rsv_cnt)
> +HPAGEFLAG(RestoreRsvMap, restore_rsv_map)
>  HPAGEFLAG(Migratable, migratable)
>  HPAGEFLAG(Temporary, temporary)
>  HPAGEFLAG(Freed, freed)
> @@ -633,6 +642,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
>                                 unsigned long address);
>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>                         pgoff_t idx);
> +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
> +               unsigned long address, struct page *page);
>
>  /* arch callback */
>  int __init __alloc_bootmem_huge_page(struct hstate *h);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 2a8cea253388..1c3a68d70ab5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1551,6 +1551,7 @@ void free_huge_page(struct page *page)
>         page->mapping = NULL;
>         restore_reserve = HPageRestoreRsvCnt(page);
>         ClearHPageRestoreRsvCnt(page);
> +       ClearHPageRestoreRsvMap(page);
>
>         /*
>          * If HPageRestoreRsvCnt was set on page, page allocation consumed a
> @@ -2360,24 +2361,26 @@ static long vma_add_reservation(struct hstate *h,
>  }
>
>  /*
> - * This routine is called to restore a reservation on error paths.  In the
> - * specific error paths, a huge page was allocated (via alloc_huge_page)
> - * and is about to be freed.  If a reservation for the page existed,
> - * alloc_huge_page would have consumed the reservation and set
> - * HPageRestoreRsvCnt in the newly allocated page.  When the page is freed
> - * via free_huge_page, the global reservation count will be incremented if
> - * HPageRestoreRsvCnt is set.  However, free_huge_page can not adjust the
> - * reserve map.  Adjust the reserve map here to be consistent with global
> - * reserve count adjustments to be made by free_huge_page.
> + * This routine is called to restore reservation data on error paths.
> + * It handles two specific cases for pages allocated via alloc_huge_page:
> + * 1) A reservation was in place and page consumed the reservation.
> + *    HPageRestoreRsvCnt is set in the page.
> + * 2) No reservation was in place for the page, so HPageRestoreRsvCnt is
> + *    not set.  However, the reserve map was updated.
> + * In case 1, free_huge_page will increment the global reserve count.  But,
> + * free_huge_page does not have enough context to adjust the reservation map.
> + * This case deals primarily with private mappings.  Adjust the reserve map
> + * here to be consistent with global reserve count adjustments to be made
> + * by free_huge_page.
> + * In case 2, simply undo any reserve map modifications done by alloc_huge_page.
>   */
> -static void restore_reserve_on_error(struct hstate *h,
> -                       struct vm_area_struct *vma, unsigned long address,
> -                       struct page *page)
> +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
> +                               unsigned long address, struct page *page)
>  {
>         if (unlikely(HPageRestoreRsvCnt(page))) {
>                 long rc = vma_needs_reservation(h, vma, address);
>
> -               if (unlikely(rc < 0)) {
> +               if (unlikely(rc < 0))
>                         /*
>                          * Rare out of memory condition in reserve map
>                          * manipulation.  Clear HPageRestoreRsvCnt so that
> @@ -2390,16 +2393,47 @@ static void restore_reserve_on_error(struct hstate *h,
>                          * accounting of reserve counts.
>                          */
>                         ClearHPageRestoreRsvCnt(page);
> -               } else if (rc) {
> -                       rc = vma_add_reservation(h, vma, address);
> -                       if (unlikely(rc < 0))
> +               else if (rc)
> +                       vma_add_reservation(h, vma, address);
> +               else
> +                       vma_end_reservation(h, vma, address);
> +       } else if (unlikely(HPageRestoreRsvMap(page))) {
> +               struct resv_map *resv = vma_resv_map(vma);
> +               pgoff_t idx = vma_hugecache_offset(h, vma, address);
> +               long rc;
> +
> +               /*
> +                * This handles the specific case where the reserve count
> +                * was not updated during the page allocation process, but
> +                * the reserve map was updated.  We need to undo the reserve
> +                * map update.
> +                *
> +                * The presence of an entry in the reserve map has opposite
> +                * meanings for shared and private mappings.
> +                */
> +               if (vma->vm_flags & VM_MAYSHARE) {
> +                       rc = region_del(resv, idx, idx + 1);
> +                       if (rc < 0)
> +                               /*
> +                                * Rare out of memory condition.  Since we can
> +                                * not delete the reserve entry, set
> +                                * HPageRestoreRsvCnt so that the global count
> +                                * will be consistent with the reserve map.
> +                                */
> +                               SetHPageRestoreRsvCnt(page);
> +               } else {
> +                       rc = vma_needs_reservation(h, vma, address);
> +                       if (rc < 0)
>                                 /*
>                                  * See above comment about rare out of
>                                  * memory condition.
>                                  */
> -                               ClearHPageRestoreRsvCnt(page);
> -               } else
> -                       vma_end_reservation(h, vma, address);
> +                               SetHPageRestoreRsvCnt(page);
> +                       else if (rc)
> +                               vma_add_reservation(h, vma, address);
> +                       else
> +                               vma_end_reservation(h, vma, address);
> +               }

As I mentioned in the other email, this call sequence does not result
in the region_del() call that we really need here. Calling
region_del() directly would be one fix; another would be to call
vma_end_reservation() even if !rc. Not sure which is more semantically
correct. hugetlb_unreserve_pages() calls region_del()
indiscriminately.

>         }
>  }
>
> @@ -2641,6 +2675,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
>                         hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
>                                         pages_per_huge_page(h), page);
>         }
> +       if (map_commit)
> +               SetHPageRestoreRsvMap(page);
>         return page;
>
>  out_uncharge_cgroup:
> @@ -4053,6 +4089,7 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
>         hugepage_add_new_anon_rmap(new_page, vma, addr);
>         hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
>         ClearHPageRestoreRsvCnt(new_page);
> +       ClearHPageRestoreRsvMap(new_page);
>         SetHPageMigratable(new_page);
>  }
>
> @@ -4174,6 +4211,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>                                 entry = huge_ptep_get(src_pte);
>                                 if (!pte_same(src_pte_old, entry)) {
> +                                       restore_reserve_on_error(h, vma, addr,
> +                                                               new);
>                                         put_page(new);
>                                         /* dst_entry won't change as in child */
>                                         goto again;
> @@ -4526,6 +4565,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
>         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
>         if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
>                 ClearHPageRestoreRsvCnt(new_page);
> +               ClearHPageRestoreRsvMap(new_page);
>
>                 /* Break COW */
>                 huge_ptep_clear_flush(vma, haddr, ptep);
> @@ -4593,6 +4633,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>         if (err)
>                 return err;
>         ClearHPageRestoreRsvCnt(page);
> +       ClearHPageRestoreRsvMap(page);
>
>         /*
>          * set page dirty so that it will not be removed from cache/file
> @@ -4776,6 +4817,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>
>         if (anon_rmap) {
>                 ClearHPageRestoreRsvCnt(page);
> +               ClearHPageRestoreRsvMap(page);
>                 hugepage_add_new_anon_rmap(page, vma, haddr);
>         } else
>                 page_dup_rmap(page, true);
> @@ -5097,6 +5139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                 page_dup_rmap(page, true);
>         } else {
>                 ClearHPageRestoreRsvCnt(page);
> +               ClearHPageRestoreRsvMap(page);
>                 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
>         }
>
> @@ -5133,6 +5176,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>         if (vm_shared || is_continue)
>                 unlock_page(page);
>  out_release_nounlock:
> +       restore_reserve_on_error(h, dst_vma, dst_addr, page);
>         put_page(page);
>         goto out;
>  }
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [External] [PATCH 0/2] Track reserve map changes to restore on error
  2021-05-27  2:48               ` Mina Almasry
@ 2021-05-27 16:08                 ` Mike Kravetz
  0 siblings, 0 replies; 16+ messages in thread
From: Mike Kravetz @ 2021-05-27 16:08 UTC (permalink / raw)
  To: Mina Almasry
  Cc: Muchun Song, Linux-MM, open list, Axel Rasmussen, Peter Xu,
	Andrew Morton

On 5/26/21 7:48 PM, Mina Almasry wrote:
> On Wed, May 26, 2021 at 4:19 PM Mina Almasry <almasrymina@google.com> wrote:
>>
>> On Wed, May 26, 2021 at 10:17 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>
>>> On 5/25/21 8:19 PM, Muchun Song wrote:
>>>> On Wed, May 26, 2021 at 7:31 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>>
>>>>> Here is a modification to the reservation tracking for fixup on errors.
>>>>> It is a more general change, but should work for the hugetlb_mcopy_pte_atomic
>>>>> case as well.
>>>>>
>>>>> Perhaps use this as a prerequisite for your fix(es)?  Pretty sure this
>>>>> will eliminate the need for the call to hugetlb_unreserve_pages.
>>>>
>>>> Hi Mike,
>>>>
>>>> It seems like this series is fixing a bug, right? Maybe a link should be
>>>> placed in the cover letter so that readers can know what issue
>>>> we are facing.
>>>>
>>>
>>> Thanks Muchun,
>>>
>>> I wanted to first see if these patches would work in the code Mina is
>>> modifying.  If this works for Mina, then a more formal patch and request
>>> for inclusion will be sent.
>>>
>>
>> So a quick test: I applied my patch and yours on top of linus/master,
>> removed the hugetlb_unreserve_pages() call that triggered this
>> conversation, and ran the userfaultfd test. resv_huge_pages underflows
>> again, so on the surface this doesn't quite work as is.
>>
>> Not quite sure what to do off the top of my head. I think I will try
>> to debug why the 3 patches don't work together and I will fix either
>> your patch or mine. I haven't taken a deep look yet; I just ran a
>> quick test.
>>
> 
> Ok found the issue. With the setup I described above, the
> hugetlb_shared test case passes:
> 
> ./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2
> /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> 
> The non-shared test case is the one that underflows:
> 
> ./tools/testing/selftests/vm/userfaultfd hugetlb 10 2
> /tmp/kokonut_test/huge/userfaultfd_test && echo test success
> 
> I've debugged a bit, and this messy hunk 'fixes' the underflow with
> the non-shared case. (Sorry for the messiness).
> 
> @@ -2329,17 +2340,14 @@ void restore_reserve_on_error(struct hstate
> *h, struct vm_area_struct *vma,
>                                  */
>                                 SetHPageRestoreRsvCnt(page);
>                 } else {
> -                       rc = vma_needs_reservation(h, vma, address);
> -                       if (rc < 0)
> -                               /*
> -                                * See above comment about rare out of
> -                                * memory condition.
> -                                */
> -                               SetHPageRestoreRsvCnt(page);
> -                       else if (rc)
> -                               vma_add_reservation(h, vma, address);
> -                       else
> -                               vma_end_reservation(h, vma, address);
> +                       resv = inode_resv_map(vma->vm_file->f_mapping->host);
> +                       if (resv) {
> +                               int chg = region_del(resv, idx, idx+1);
> +                               VM_BUG_ON(chg);
> +                       }
> 
> The reason is that on page allocation we region_add() an entry into
> the resv_map regardless of whether this is a shared mapping or
> not (vma_needs_reservation() + vma_commit_reservation(), which amounts
> to region_add() at the end of the day).
> 
> To unroll back this change on error, we need to region_del() the region_add().
> 
> The code removed above never ends up calling region_del():
> vma_needs_reservation() returns 0 because region_chg() sees there is
> already an entry in the resv_map.
> 
> The VM_BUG_ON() is just because I'm not sure how to handle that error.
> 

Thanks Mina!

Yes, that new block of code in restore_reserve_on_error is incorrect for
the private mapping case.  Since alloc_huge_page does the region_add for
both shared and private mappings, it seems we should just do the
region_del for both.  I'll update this patch to fix this and take your
other comments into account.

-- 
Mike Kravetz


Thread overview: 16+ messages
2021-05-21  7:44 [PATCH v3] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY Mina Almasry
2021-05-22 21:19 ` Andrew Morton
2021-05-22 21:32   ` Mina Almasry
2021-05-24 18:07 ` Mike Kravetz
2021-05-25  0:11   ` Mina Almasry
2021-05-25  0:45     ` Mike Kravetz
2021-05-25 23:31       ` [PATCH 0/2] Track reserve map changes to restore on error Mike Kravetz
2021-05-25 23:31         ` [PATCH 1/2] hugetlb: rename HPageRestoreReserve flag to HPageRestoreRsvCnt Mike Kravetz
2021-05-27  2:49           ` Mina Almasry
2021-05-25 23:31         ` [PATCH 2/2] hugetlb: add new hugetlb specific flag HPG_restore_rsv_map Mike Kravetz
2021-05-27  2:58           ` Mina Almasry
2021-05-26  3:19         ` [External] [PATCH 0/2] Track reserve map changes to restore on error Muchun Song
2021-05-26 17:17           ` Mike Kravetz
2021-05-26 23:19             ` Mina Almasry
2021-05-27  2:48               ` Mina Almasry
2021-05-27 16:08                 ` Mike Kravetz
