From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>,
	Hugh Dickins <hughd@google.com>, Mel Gorman <mgorman@suse.de>,
	Rik van Riel <riel@redhat.com>, Vlastimil Babka <vbabka@suse.cz>,
	Christoph Lameter <cl@gentwo.org>,
	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	Steve Capper <steve.capper@linaro.org>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@suse.cz>,
	Jerome Marchand <jmarchan@redhat.com>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv4 14/24] thp: implement new split_huge_page()
Date: Wed,  4 Mar 2015 18:33:02 +0200
Message-ID: <1425486792-93161-15-git-send-email-kirill.shutemov@linux.intel.com>
In-Reply-To: <1425486792-93161-1-git-send-email-kirill.shutemov@linux.intel.com>

The new split_huge_page() can fail if the compound page is pinned: we
expect the caller to hold the only extra reference to the head page. If
the page is pinned, split_huge_page() returns -EBUSY and the caller must
handle this correctly.
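
A minimal sketch (not part of this patch) of how a caller would deal with
the new failure mode; the helper name, the follow_page() origin of the
reference, and the retry policy are all hypothetical:

	static int try_split(struct page *page)
	{
		int ret;

		/*
		 * "page" is assumed to come with one reference held
		 * (e.g. from follow_page()): the single extra pin on
		 * the head page that split_huge_page() expects.
		 */
		ret = split_huge_page(page);
		if (ret)	/* -EBUSY: the compound page is pinned */
			pr_debug("thp: split failed, retry later\n");
		put_page(page);
		return ret;
	}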

We no longer need to mark PMDs as splitting, since we can now split one
PMD at a time with split_huge_pmd().
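
For example (a sketch only: the page-table walk and locking that produce
vma, pmd and address are omitted, and the call site is hypothetical), a
path that only needs to drop the huge mapping in a single VMA can split
just that PMD while the compound page itself stays intact:

	/* a walk found a huge PMD mapping the page in this VMA */
	if (pmd_trans_huge(*pmd))
		split_huge_pmd(vma, pmd, address);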

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/hugetlb_inline.h |   9 +-
 include/linux/mm.h             |  22 +++--
 mm/huge_memory.c               | 183 +++++++++++++++++++++++------------------
 mm/swap.c                      | 126 +++++++++++++++++++++++++++-
 4 files changed, 244 insertions(+), 96 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681fbeb35..c5cd37479731 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -10,6 +10,8 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 	return !!(vma->vm_flags & VM_HUGETLB);
 }
 
+int PageHeadHuge(struct page *page_head);
+
 #else
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
@@ -17,6 +19,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 	return 0;
 }
 
-#endif
+static inline int PageHeadHuge(struct page *page_head)
+{
+	return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
 
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 020dbbe1563c..28aeae6e553b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -485,20 +485,18 @@ static inline int page_count(struct page *page)
 	return atomic_read(&compound_head(page)->_count);
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-extern int PageHeadHuge(struct page *page_head);
-#else /* CONFIG_HUGETLB_PAGE */
-static inline int PageHeadHuge(struct page *page_head)
-{
-	return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
+void __get_page_tail(struct page *page);
 static inline void get_page(struct page *page)
 {
-	struct page *page_head = compound_head(page);
-	VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page);
-	atomic_inc(&page_head->_count);
+	if (unlikely(PageTail(page)))
+		return __get_page_tail(page);
+
+	/*
+	 * Getting a normal page or the head of a compound page
+	 * requires an already elevated page->_count.
+	 */
+	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+	atomic_inc(&page->_count);
 }
 
 static inline struct page *virt_to_head_page(const void *x)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3409a5c7dbb8..6f6429426edb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1737,31 +1737,52 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
 	__split_huge_pmd(vma, pmd, address);
 }
 
-#if 0
-static void __split_huge_page_refcount(struct page *page,
+static int __split_huge_page_refcount(struct page *page,
 				       struct list_head *list)
 {
 	int i;
 	struct zone *zone = page_zone(page);
 	struct lruvec *lruvec;
-	int tail_count = 0;
+	int tail_mapcount = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
 	lruvec = mem_cgroup_page_lruvec(page, zone);
 
 	compound_lock(page);
+
+	/*
+	 * We cannot split a pinned THP page: we expect the page count to
+	 * equal the sum of the mapcounts of all sub-pages plus one (the
+	 * split_huge_page() caller must hold a reference to the head page).
+	 *
+	 * The compound lock only prevents page->_count from being updated by
+	 * get_page() or put_page() on a tail page. This means page_count()
+	 * can change under us after the check, but that's okay: all new
+	 * references will stay on the head page after the split.
+	 */
+	tail_mapcount = 0;
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		tail_mapcount += page_mapcount(page + i);
+	if (tail_mapcount != page_count(page) - 1) {
+		BUG_ON(tail_mapcount > page_count(page) - 1);
+		compound_unlock(page);
+		spin_unlock_irq(&zone->lru_lock);
+		return -EBUSY;
+	}
+
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(page);
 
+	tail_mapcount = 0;
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		struct page *page_tail = page + i;
 
 		/* tail_page->_mapcount cannot change */
-		BUG_ON(atomic_read(&page_tail->_mapcount) + 1 < 0);
-		tail_count += atomic_read(&page_tail->_mapcount) + 1;
+		BUG_ON(page_mapcount(page_tail) < 0);
+		tail_mapcount += page_mapcount(page_tail);
 		/* check for overflow */
-		BUG_ON(tail_count < 0);
+		BUG_ON(tail_mapcount < 0);
 		BUG_ON(atomic_read(&page_tail->_count) != 0);
 		/*
 		 * tail_page->_count is zero and not changing from
@@ -1799,28 +1820,9 @@ static void __split_huge_page_refcount(struct page *page,
 		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
-		/*
-		 * __split_huge_page_splitting() already set the
-		 * splitting bit in all pmd that could map this
-		 * hugepage, that will ensure no CPU can alter the
-		 * mapcount on the head page. The mapcount is only
-		 * accounted in the head page and it has to be
-		 * transferred to all tail pages in the below code. So
-		 * for this code to be safe, the split the mapcount
-		 * can't change. But that doesn't mean userland can't
-		 * keep changing and reading the page contents while
-		 * we transfer the mapcount, so the pmd splitting
-		 * status is achieved setting a reserved bit in the
-		 * pmd, not by clearing the present bit.
-		*/
-		atomic_set(&page_tail->_mapcount, compound_mapcount(page) - 1);
-
 		/* ->mapping in first tail page is compound_mapcount */
-		if (i != 1) {
-			BUG_ON(page_tail->mapping);
-			page_tail->mapping = page->mapping;
-			BUG_ON(!PageAnon(page_tail));
-		}
+		BUG_ON(i != 1 && page_tail->mapping);
+		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
 		page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
@@ -1831,12 +1833,9 @@ static void __split_huge_page_refcount(struct page *page,
 
 		lru_add_page_tail(page, page_tail, lruvec, list);
 	}
-	atomic_sub(tail_count, &page->_count);
+	atomic_sub(tail_mapcount, &page->_count);
 	BUG_ON(atomic_read(&page->_count) <= 0);
 
-	page->_mapcount = *compound_mapcount_ptr(page);
-	page[1].mapping = page->mapping;
-
 	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
 
 	ClearPageCompound(page);
@@ -1861,71 +1860,95 @@ static void __split_huge_page_refcount(struct page *page,
 	 * to be pinned by the caller.
 	 */
 	BUG_ON(page_count(page) <= 0);
+	return 0;
 }
 
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
-			      struct anon_vma *anon_vma,
-			      struct list_head *list)
+/*
+ * Split a hugepage into normal pages. This doesn't change the position of
+ * the head page. If @list is null, tail pages will be added to the LRU
+ * list; otherwise, to @list. Both head page and tail pages will inherit
+ * mapping, flags, and so on from the hugepage.
+ * Return 0 if the hugepage is split successfully, otherwise return -errno.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
-	int mapcount, mapcount2;
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct anon_vma *anon_vma;
 	struct anon_vma_chain *avc;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	int i, tail_mapcount;
+	int ret = -EBUSY;
 
-	BUG_ON(!PageHead(page));
-	BUG_ON(PageTail(page));
+	BUG_ON(is_huge_zero_page(page));
+	BUG_ON(!PageAnon(page));
 
-	mapcount = 0;
-	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-		struct vm_area_struct *vma = avc->vma;
-		unsigned long addr = vma_address(page, vma);
-		BUG_ON(is_vma_temporary_stack(vma));
-		mapcount += __split_huge_page_splitting(page, vma, addr);
-	}
 	/*
-	 * It is critical that new vmas are added to the tail of the
-	 * anon_vma list. This guarantes that if copy_huge_pmd() runs
-	 * and establishes a child pmd before
-	 * __split_huge_page_splitting() freezes the parent pmd (so if
-	 * we fail to prevent copy_huge_pmd() from running until the
-	 * whole __split_huge_page() is complete), we will still see
-	 * the newly established pmd of the child later during the
-	 * walk, to be able to set it as pmd_trans_splitting too.
+	 * The caller does not necessarily hold an mmap_sem that would prevent
+	 * the anon_vma disappearing, so we first take a reference to it
+	 * and then lock the anon_vma for write. This is similar to
+	 * page_lock_anon_vma_read except the write lock is taken to serialise
+	 * against parallel split or collapse operations.
 	 */
-	if (mapcount != page_mapcount(page)) {
-		pr_err("mapcount %d page_mapcount %d\n",
-			mapcount, page_mapcount(page));
-		BUG();
+	anon_vma = page_get_anon_vma(page);
+	if (!anon_vma)
+		goto out;
+	anon_vma_lock_write(anon_vma);
+
+	if (!PageCompound(page)) {
+		ret = 0;
+		goto out_unlock;
 	}
 
-	__split_huge_page_refcount(page, list);
+	BUG_ON(!PageSwapBacked(page));
+
+	/*
+	 * Racy check whether __split_huge_page_refcount() can succeed, before
+	 * splitting PMDs.
+	 */
+	tail_mapcount = compound_mapcount(page);
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		tail_mapcount += atomic_read(&page[i]._mapcount) + 1;
+	if (tail_mapcount != page_count(page) - 1) {
+		VM_BUG_ON_PAGE(tail_mapcount > page_count(page) - 1, page);
+		ret = -EBUSY;
+		goto out_unlock;
+	}
 
-	mapcount2 = 0;
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
-		BUG_ON(is_vma_temporary_stack(vma));
-		mapcount2 += __split_huge_page_map(page, vma, addr);
-	}
-	if (mapcount != mapcount2) {
-		pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
-			mapcount, mapcount2, page_mapcount(page));
-		BUG();
+		spinlock_t *ptl;
+		pmd_t *pmd;
+		unsigned long haddr = addr & HPAGE_PMD_MASK;
+		unsigned long mmun_start;	/* For mmu_notifiers */
+		unsigned long mmun_end;		/* For mmu_notifiers */
+
+		mmun_start = haddr;
+		mmun_end   = haddr + HPAGE_PMD_SIZE;
+		mmu_notifier_invalidate_range_start(vma->vm_mm,
+				mmun_start, mmun_end);
+		pmd = page_check_address_pmd(page, vma->vm_mm, addr,
+				PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+		if (pmd) {
+			__split_huge_pmd_locked(vma, pmd, addr);
+			spin_unlock(ptl);
+		}
+		mmu_notifier_invalidate_range_end(vma->vm_mm,
+				mmun_start, mmun_end);
 	}
-}
-#endif
 
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return -errno.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
-	count_vm_event(THP_SPLIT_PAGE_FAILED);
-	return -EBUSY;
+	BUG_ON(compound_mapcount(page));
+	ret = __split_huge_page_refcount(page, list);
+	BUG_ON(!ret && PageCompound(page));
+
+out_unlock:
+	anon_vma_unlock_write(anon_vma);
+	put_anon_vma(anon_vma);
+out:
+	if (ret)
+		count_vm_event(THP_SPLIT_PAGE_FAILED);
+	else
+		count_vm_event(THP_SPLIT_PAGE);
+	return ret;
 }
 
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
diff --git a/mm/swap.c b/mm/swap.c
index 2e647d4dc6bb..7b4fbb26cc2c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -80,12 +80,86 @@ static void __put_compound_page(struct page *page)
 	(*dtor)(page);
 }
 
+static inline bool compound_lock_needed(struct page *page)
+{
+	return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+		!PageSlab(page) && !PageHeadHuge(page);
+}
+
 static void put_compound_page(struct page *page)
 {
-	struct page *page_head = compound_head(page);
+	struct page *page_head;
+	unsigned long flags;
+
+	if (likely(!PageTail(page))) {
+		if (put_page_testzero(page)) {
+			/*
+			 * By the time all refcounts have been released,
+			 * split_huge_page cannot run from under us anymore.
+			 */
+			if (PageHead(page))
+				__put_compound_page(page);
+			else
+				__put_single_page(page);
+		}
+		return;
+	}
+
+	/* __split_huge_page_refcount can run under us */
+	page_head = compound_head(page);
+
+	if (!compound_lock_needed(page_head)) {
+		/*
+		 * If "page" is a THP tail, we must read the tail page flags
+		 * after the head page flags. The split_huge_page side enforces
+		 * a write memory barrier between clearing PageTail and the
+		 * point where the head page can be freed and reallocated.
+		 */
+		smp_rmb();
+		if (likely(PageTail(page))) {
+			/* __split_huge_page_refcount cannot race here. */
+			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+			VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+			if (put_page_testzero(page_head)) {
+				/*
+				 * If this is the tail of a slab compound page,
+				 * the tail pin must not be the last reference
+				 * held on the page, because PG_slab cannot be
+				 * cleared before all tail pins (which skip the
+				 * _mapcount tail refcounting) have been
+				 * released. For hugetlbfs the tail pin may be
+				 * the last reference on the page instead,
+				 * because PageHeadHuge will not go away until
+				 * the compound page enters the buddy
+				 * allocator.
+				 */
+				VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+				__put_compound_page(page_head);
+			}
+		} else if (put_page_testzero(page))
+			__put_single_page(page);
+		return;
+	}
 
-	if (put_page_testzero(page_head))
-			__put_compound_page(page_head);
+	flags = compound_lock_irqsave(page_head);
+	/* here __split_huge_page_refcount won't run anymore */
+	if (likely(page != page_head && PageTail(page))) {
+		bool free;
+
+		free = put_page_testzero(page_head);
+		compound_unlock_irqrestore(page_head, flags);
+		if (free) {
+			if (PageHead(page_head))
+				__put_compound_page(page_head);
+			else
+				__put_single_page(page_head);
+		}
+	} else {
+		compound_unlock_irqrestore(page_head, flags);
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		if (put_page_testzero(page))
+			__put_single_page(page);
+	}
 }
 
 void put_page(struct page *page)
@@ -97,6 +171,52 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ */
+void __get_page_tail(struct page *page)
+{
+	struct page *page_head = compound_head(page);
+	unsigned long flags;
+
+	if (!compound_lock_needed(page_head)) {
+		smp_rmb();
+		if (likely(PageTail(page))) {
+			/*
+			 * This is a hugetlbfs page or a slab page.
+			 * __split_huge_page_refcount cannot race here.
+			 */
+			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+			VM_BUG_ON(page_head != page->first_page);
+			VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0,
+					page);
+			atomic_inc(&page_head->_count);
+		} else {
+			/*
+			 * __split_huge_page_refcount ran before us: "page" was
+			 * a THP tail. The split page_head has been freed and
+			 * reallocated as a slab or hugetlbfs page of smaller
+			 * order (only possible if reallocated as slab on x86).
+			 */
+			VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+			atomic_inc(&page->_count);
+		}
+		return;
+	}
+
+	flags = compound_lock_irqsave(page_head);
+	/* here __split_huge_page_refcount won't run anymore */
+	if (unlikely(page == page_head || !PageTail(page) ||
+				!get_page_unless_zero(page_head))) {
+		/* page is not part of THP page anymore */
+		VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+		atomic_inc(&page->_count);
+	}
+	compound_unlock_irqrestore(page_head, flags);
+}
+EXPORT_SYMBOL(__get_page_tail);
+
 /**
  * put_pages_list() - release a list of pages
  * @pages: list of pages threaded on page->lru
-- 
2.1.4

