All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Huang, Ying" <ying.huang@intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Huang Ying <ying.huang@intel.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Michal Hocko <mhocko@suse.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Shaohua Li <shli@kernel.org>, Hugh Dickins <hughd@google.com>,
	Minchan Kim <minchan@kernel.org>, Rik van Riel <riel@redhat.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	Zi Yan <zi.yan@cs.rutgers.edu>,
	Daniel Jordan <daniel.m.jordan@oracle.com>
Subject: [PATCH -mm -v4 09/21] mm, THP, swap: Swapin a THP as a whole
Date: Fri, 22 Jun 2018 11:51:39 +0800	[thread overview]
Message-ID: <20180622035151.6676-10-ying.huang@intel.com> (raw)
In-Reply-To: <20180622035151.6676-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

With this patch, when the page fault handler finds a PMD swap mapping,
it will swap in a THP as a whole.  This avoids the overhead of
splitting/collapsing before/after the THP swapping, and greatly
improves swap performance thanks to the reduced page fault count, etc.

do_huge_pmd_swap_page() is added in the patch to implement this.  It
is similar to do_swap_page() for normal page swapin.

If allocating a THP fails, the huge swap cluster and the PMD swap
mapping will be split to fall back to normal page swapin.

If the huge swap cluster has been split already, the PMD swap mapping
will be split to fall back to normal page swapin.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 include/linux/huge_mm.h |   9 +++
 include/linux/swap.h    |   9 +++
 mm/huge_memory.c        | 170 ++++++++++++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  16 +++--
 4 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c5b8af173f67..42117b75de2d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -403,4 +403,13 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifdef CONFIG_THP_SWAP
+extern int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd);
+#else /* CONFIG_THP_SWAP */
+static inline int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	return 0;
+}
+#endif /* CONFIG_THP_SWAP */
+
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d2e017dd7bbd..5832a750baed 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -560,6 +560,15 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp,
 	return NULL;
 }
 
+static inline struct page *read_swap_cache_async(swp_entry_t swp,
+						 gfp_t gft_mask,
+						 struct vm_area_struct *vma,
+						 unsigned long addr,
+						 bool do_poll)
+{
+	return NULL;
+}
+
 static inline int add_to_swap(struct page *page)
 {
 	return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 275a4e616ec9..ac79ae2ab257 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,8 @@
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
 #include <linux/oom.h>
+#include <linux/delayacct.h>
+#include <linux/swap.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1609,6 +1611,174 @@ static void __split_huge_swap_pmd(struct vm_area_struct *vma,
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
 }
+
+static int split_huge_swap_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+			       unsigned long address, pmd_t orig_pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, orig_pmd))
+		__split_huge_swap_pmd(vma, address & HPAGE_PMD_MASK, pmd);
+	else
+		ret = -ENOENT;
+	spin_unlock(ptl);
+
+	return ret;
+}
+
+int do_huge_pmd_swap_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	struct page *page;
+	struct mem_cgroup *memcg;
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	swp_entry_t entry;
+	pmd_t pmd;
+	int i, locked, exclusive = 0, ret = 0;
+
+	entry = pmd_to_swp_entry(orig_pmd);
+	VM_BUG_ON(non_swap_entry(entry));
+	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+retry:
+	page = lookup_swap_cache(entry, NULL, vmf->address);
+	if (!page) {
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma,
+					     haddr, false);
+		if (!page) {
+			/*
+			 * Back out if somebody else faulted in this pmd
+			 * while we released the pmd lock.
+			 */
+			if (likely(pmd_same(*vmf->pmd, orig_pmd))) {
+				ret = split_swap_cluster(entry, false);
+				/*
+				 * Retry if somebody else swap in the swap
+				 * entry
+				 */
+				if (ret == -EEXIST) {
+					ret = 0;
+					goto retry;
+				/* swapoff occurs under us */
+				} else if (ret == -EINVAL)
+					ret = 0;
+				else
+					goto fallback;
+			}
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			goto out;
+		}
+
+		/* Had to read the page from swap area: Major fault */
+		ret = VM_FAULT_MAJOR;
+		count_vm_event(PGMAJFAULT);
+		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+	} else if (!PageTransCompound(page))
+		goto fallback;
+
+	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	if (!locked) {
+		ret |= VM_FAULT_RETRY;
+		goto out_release;
+	}
+
+	/*
+	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+	 * release the swapcache from under us.  The page pin, and pmd_same
+	 * test below, are not enough to exclude that.  Even if it is still
+	 * swapcache, we need to check that the page's swap has not changed.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+		goto out_page;
+
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+				  &memcg, true)) {
+		ret = VM_FAULT_OOM;
+		goto out_page;
+	}
+
+	/*
+	 * Back out if somebody else already faulted in this pmd.
+	 */
+	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+	spin_lock(vmf->ptl);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+		goto out_nomap;
+
+	if (unlikely(!PageUptodate(page))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_nomap;
+	}
+
+	/*
+	 * The page isn't present yet, go ahead with the fault.
+	 *
+	 * Be careful about the sequence of operations here.
+	 * To get its accounting right, reuse_swap_page() must be called
+	 * while the page is counted on swap but not yet in mapcount i.e.
+	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+	 * must be called after the swap_free(), or it will never succeed.
+	 */
+
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+	pmd = mk_huge_pmd(page, vma->vm_page_prot);
+	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+		pmd = maybe_pmd_mkwrite(pmd_mkdirty(pmd), vma);
+		vmf->flags &= ~FAULT_FLAG_WRITE;
+		ret |= VM_FAULT_WRITE;
+		exclusive = RMAP_EXCLUSIVE;
+	}
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		flush_icache_page(vma, page + i);
+	if (pmd_swp_soft_dirty(orig_pmd))
+		pmd = pmd_mksoft_dirty(pmd);
+	do_page_add_anon_rmap(page, vma, haddr,
+			      exclusive | RMAP_COMPOUND);
+	mem_cgroup_commit_charge(page, memcg, true, true);
+	activate_page(page);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+
+	swap_free(entry, true);
+	if (mem_cgroup_swap_full(page) ||
+	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+		try_to_free_swap(page);
+	unlock_page(page);
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		ret |= do_huge_pmd_wp_page(vmf, pmd);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
+		goto out;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+	spin_unlock(vmf->ptl);
+out:
+	return ret;
+out_nomap:
+	mem_cgroup_cancel_charge(page, memcg, true);
+	spin_unlock(vmf->ptl);
+out_page:
+	unlock_page(page);
+out_release:
+	put_page(page);
+	return ret;
+fallback:
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	if (!split_huge_swap_pmd(vmf->vma, vmf->pmd, vmf->address, orig_pmd))
+		ret = VM_FAULT_FALLBACK;
+	else
+		ret = 0;
+	if (page)
+		put_page(page);
+	return ret;
+}
 #else
 static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
 					 unsigned long haddr,
diff --git a/mm/memory.c b/mm/memory.c
index 55e278bb59ee..2125035b6a70 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4072,13 +4072,17 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 		barrier();
 		if (unlikely(is_swap_pmd(orig_pmd))) {
-			VM_BUG_ON(thp_migration_supported() &&
-					  !is_pmd_migration_entry(orig_pmd));
-			if (is_pmd_migration_entry(orig_pmd))
+			if (thp_migration_supported() &&
+			    is_pmd_migration_entry(orig_pmd)) {
 				pmd_migration_entry_wait(mm, vmf.pmd);
-			return 0;
-		}
-		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+				return 0;
+			} else if (thp_swap_supported()) {
+				ret = do_huge_pmd_swap_page(&vmf, orig_pmd);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else
+				VM_BUG_ON(1);
+		} else if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
-- 
2.16.4


  parent reply	other threads:[~2018-06-22  3:58 UTC|newest]

Thread overview: 100+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-22  3:51 [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece Huang, Ying
2018-06-22  3:51 ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 01/21] mm, THP, swap: Enable PMD swap operations for CONFIG_THP_SWAP Huang, Ying
2018-07-07 21:11   ` Dan Williams
2018-07-09  5:40     ` Huang, Ying
2018-07-09  5:40       ` Huang, Ying
2018-07-09  6:08       ` Dan Williams
2018-07-09  6:34         ` Huang, Ying
2018-07-09  6:34           ` Huang, Ying
2018-07-09 15:59   ` Dave Hansen
2018-07-10  1:08     ` Huang, Ying
2018-07-10  1:08       ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP Huang, Ying
2018-07-07 21:12   ` Dan Williams
2018-07-09  6:34     ` Huang, Ying
2018-07-09  6:34       ` Huang, Ying
2018-07-09 16:00   ` Dave Hansen
2018-07-10  1:19     ` Huang, Ying
2018-07-10  1:19       ` Huang, Ying
2018-07-10  1:59       ` Dave Hansen
2018-07-10  5:26         ` Huang, Ying
2018-07-10  5:26           ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate() Huang, Ying
2018-06-29  6:04   ` Matthew Wilcox
2018-07-02  5:19     ` Huang, Ying
2018-07-02  5:19       ` Huang, Ying
2018-07-07 23:22   ` Dan Williams
2018-07-09  7:38     ` Huang, Ying
2018-07-09  7:38       ` Huang, Ying
2018-07-09 16:51   ` Dave Hansen
2018-07-10  6:44     ` Huang, Ying
2018-07-10  6:44       ` Huang, Ying
2018-07-10 13:50       ` Dave Hansen
2018-07-11  0:59         ` Huang, Ying
2018-07-11  0:59           ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 04/21] mm, THP, swap: Support PMD swap mapping in swapcache_free_cluster() Huang, Ying
2018-07-09 17:11   ` Dave Hansen
2018-07-10  6:53     ` Huang, Ying
2018-07-10  6:53       ` Huang, Ying
2018-07-10 13:54       ` Dave Hansen
2018-07-11  1:08         ` Huang, Ying
2018-07-11  1:08           ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 05/21] mm, THP, swap: Support PMD swap mapping in free_swap_and_cache()/swap_free() Huang, Ying
2018-07-05 18:33   ` Daniel Jordan
2018-07-06 12:49     ` Huang, Ying
2018-07-06 12:49       ` Huang, Ying
2018-07-09 17:19   ` Dave Hansen
2018-07-10  7:13     ` Huang, Ying
2018-07-10  7:13       ` Huang, Ying
2018-07-10 14:07       ` Dave Hansen
2018-07-11  1:28         ` Huang, Ying
2018-07-11  1:28           ` Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 06/21] mm, THP, swap: Support PMD swap mapping when splitting huge PMD Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 07/21] mm, THP, swap: Support PMD swap mapping in split_swap_cluster() Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 08/21] mm, THP, swap: Support to read a huge swap cluster for swapin a THP Huang, Ying
2018-06-29  6:21   ` Matthew Wilcox
2018-07-02  6:02     ` Huang, Ying
2018-07-02  6:02       ` Huang, Ying
2018-07-04  0:12   ` Daniel Jordan
2018-07-04  2:24     ` Huang, Ying
2018-07-04  2:24       ` Huang, Ying
2018-06-22  3:51 ` Huang, Ying [this message]
2018-06-22  3:51 ` [PATCH -mm -v4 10/21] mm, THP, swap: Support to count THP swapin and its fallback Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 11/21] mm, THP, swap: Add sysfs interface to configure THP swapin Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 12/21] mm, THP, swap: Support PMD swap mapping in swapoff Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 13/21] mm, THP, swap: Support PMD swap mapping in madvise_free() Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 14/21] mm, cgroup, THP, swap: Support to move swap account for PMD swap mapping Huang, Ying
2018-07-09 17:20   ` Daniel Jordan
2018-07-10  7:49     ` Huang, Ying
2018-07-10  7:49       ` Huang, Ying
2018-07-10 22:49       ` Daniel Jordan
2018-06-22  3:51 ` [PATCH -mm -v4 15/21] mm, THP, swap: Support to copy PMD swap mapping when fork() Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 16/21] mm, THP, swap: Free PMD swap mapping when zap_huge_pmd() Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 17/21] mm, THP, swap: Support PMD swap mapping for MADV_WILLNEED Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 18/21] mm, THP, swap: Support PMD swap mapping in mincore() Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 19/21] mm, THP, swap: Support PMD swap mapping in common path Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 20/21] mm, THP, swap: create PMD swap mapping when unmap the THP Huang, Ying
2018-06-22  3:51 ` [PATCH -mm -v4 21/21] mm, THP: Avoid to split THP when reclaim MADV_FREE THP Huang, Ying
2018-06-28  4:51 ` [PATCH -mm -v4 00/21] mm, THP, swap: Swapout/swapin THP in one piece Andrew Morton
2018-06-28  5:29   ` Huang, Ying
2018-06-28  5:29     ` Huang, Ying
2018-06-28  5:31     ` Andrew Morton
2018-06-28  5:31       ` Andrew Morton
2018-06-28  5:35       ` Huang, Ying
2018-06-28  5:35         ` Huang, Ying
2018-06-28  6:18         ` Andrew Morton
2018-06-28  6:18           ` Andrew Morton
2018-06-28  9:03           ` Matthew Wilcox
2018-06-28  9:03             ` Matthew Wilcox
2018-06-29  1:17             ` Huang, Ying
2018-06-29  1:17               ` Huang, Ying
2018-06-29  5:57               ` Matthew Wilcox
2018-07-02  5:19                 ` Huang, Ying
2018-07-02  5:19                   ` Huang, Ying
2018-07-04  2:11   ` Sergey Senozhatsky
2018-07-04  2:20     ` Huang, Ying
2018-07-04  2:20       ` Huang, Ying
2018-07-04  2:27       ` Sergey Senozhatsky
2018-07-04  2:59         ` Huang, Ying
2018-07-04  2:59           ` Huang, Ying

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180622035151.6676-10-ying.huang@intel.com \
    --to=ying.huang@intel.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=daniel.m.jordan@oracle.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.com \
    --cc=minchan@kernel.org \
    --cc=n-horiguchi@ah.jp.nec.com \
    --cc=riel@redhat.com \
    --cc=shli@kernel.org \
    --cc=zi.yan@cs.rutgers.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.