From: Huang Ying <ying.huang@intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Huang Ying <ying.huang@intel.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Michal Hocko <mhocko@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Shaohua Li <shli@kernel.org>, Hugh Dickins <hughd@google.com>,
	Minchan Kim <minchan@kernel.org>, Rik van Riel <riel@redhat.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	Zi Yan <zi.yan@cs.rutgers.edu>,
	Daniel Jordan <daniel.m.jordan@oracle.com>
Subject: [PATCH -V6 08/21] swap: Support to read a huge swap cluster for swapin a THP
Date: Wed, 10 Oct 2018 15:19:11 +0800
Message-ID: <20181010071924.18767-9-ying.huang@intel.com>
In-Reply-To: <20181010071924.18767-1-ying.huang@intel.com>

To swapin a THP in one piece, we need to read a huge swap cluster from
the swap device.  This patch revises __read_swap_cache_async() and its
callers and callees to support this.  If __read_swap_cache_async()
finds that the swap cluster of the specified swap entry is huge, it
will try to allocate a THP and add it into the swap cache, so that the
contents of the huge swap cluster can later be read into the THP.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
---
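A condensed sketch of the new allocation path in
__read_swap_cache_async(), simplified from the mm/swap_state.c hunk
below; locking, the swap cache insertion, and error handling are
elided, and the diff hunk remains the authoritative code:

	swp_entry_t hentry;

	if (IS_ENABLED(CONFIG_THP_SWAP) && entry_size == HPAGE_PMD_NR) {
		gfp_t gfp = alloc_hugepage_direct_gfpmask(vma, addr);

		/* Allocate a THP covering the whole huge swap cluster */
		new_page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER,
					   vma, addr, numa_node_id());
		if (new_page)
			prep_transhuge_page(new_page);
		/* Align the entry to the first entry of its cluster */
		hentry = swp_entry(swp_type(entry),
				   round_down(swp_offset(entry),
					      HPAGE_PMD_NR));
	} else {
		new_page = alloc_page_vma(gfp_mask, vma, addr);
		hentry = entry;
	}

The readahead paths are adjusted to match: swap_readpage() is passed
compound_head(page) so that a THP is read in as a whole, and
PageReadahead is only set on normal (non-compound) readahead pages.
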
 include/linux/huge_mm.h |  8 +++++++
 include/linux/swap.h    |  4 ++--
 mm/huge_memory.c        |  3 ++-
 mm/swap_state.c         | 59 ++++++++++++++++++++++++++++++++++++++++---------
 mm/swapfile.c           |  9 +++++---
 5 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0f3e1739986f..a0e7f4f9c12b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -250,6 +250,8 @@ static inline bool thp_migration_supported(void)
 	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+				    unsigned long addr);
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -363,6 +365,12 @@ static inline bool thp_migration_supported(void)
 {
 	return false;
 }
+
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+						  unsigned long addr)
+{
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 60fd5189fde9..f2daf3fbdd4b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
-extern int __swp_swapcount(swp_entry_t entry);
+extern int __swp_swapcount(swp_entry_t entry, int *entry_size);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
@@ -585,7 +585,7 @@ static inline int __swap_count(swp_entry_t entry)
 	return 0;
 }
 
-static inline int __swp_swapcount(swp_entry_t entry)
+static inline int __swp_swapcount(swp_entry_t entry, int *entry_size)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92e0cdb99c5a..a025494dd828 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,7 +629,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 	gfp_t this_node = 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bca34fc7a5e5..784ad6388da0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -361,7 +361,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 {
 	struct page *found_page = NULL, *new_page = NULL;
 	struct swap_info_struct *si;
-	int err;
+	int err, entry_size = 1;
+	swp_entry_t hentry;
+
 	*new_page_allocated = false;
 
 	do {
@@ -387,14 +389,42 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * as SWAP_HAS_CACHE.  That's done in later part of code or
 		 * else swap_off will be aborted if we return NULL.
 		 */
-		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+		if (!__swp_swapcount(entry, &entry_size) &&
+		    swap_slot_cache_enabled)
 			break;
 
 		/*
 		 * Get a new page to read into from swap.
 		 */
-		if (!new_page) {
-			new_page = alloc_page_vma(gfp_mask, vma, addr);
+		if (!new_page ||
+		    (IS_ENABLED(CONFIG_THP_SWAP) &&
+		     hpage_nr_pages(new_page) != entry_size)) {
+			if (new_page)
+				put_page(new_page);
+			if (IS_ENABLED(CONFIG_THP_SWAP) &&
+			    entry_size == HPAGE_PMD_NR) {
+				gfp_t gfp;
+
+				gfp = alloc_hugepage_direct_gfpmask(vma, addr);
+				/*
+				 * Make sure huge page allocation flags are
+				 * compatible with that of normal page
+				 */
+				VM_WARN_ONCE(gfp_mask & ~(gfp | __GFP_RECLAIM),
+					     "ignoring gfp_mask bits: %x",
+					     gfp_mask & ~(gfp | __GFP_RECLAIM));
+				new_page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER,
+							   vma, addr,
+							   numa_node_id());
+				if (new_page)
+					prep_transhuge_page(new_page);
+				hentry = swp_entry(swp_type(entry),
+						   round_down(swp_offset(entry),
+							      HPAGE_PMD_NR));
+			} else {
+				new_page = alloc_page_vma(gfp_mask, vma, addr);
+				hentry = entry;
+			}
 			if (!new_page)
 				break;		/* Out of memory */
 		}
@@ -402,7 +432,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/*
 		 * Swap entry may have been freed since our caller observed it.
 		 */
-		err = swapcache_prepare(entry, 1);
+		err = swapcache_prepare(hentry, entry_size);
 		if (err == -EEXIST) {
 			/*
 			 * We might race against get_swap_page() and stumble
@@ -411,6 +441,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			 */
 			cond_resched();
 			continue;
+		} else if (err == -ENOTDIR) {
+			/* huge swap cluster has been split under us */
+			continue;
 		} else if (err) {	/* swp entry is obsolete ? */
 			break;
 		}
@@ -424,6 +457,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			SetPageWorkingset(new_page);
 			lru_cache_add_anon(new_page);
 			*new_page_allocated = true;
+			if (IS_ENABLED(CONFIG_THP_SWAP))
+				new_page += swp_offset(entry) &
+					(entry_size - 1);
 			return new_page;
 		}
 		__ClearPageLocked(new_page);
@@ -431,7 +467,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		put_swap_page(new_page, entry);
+		put_swap_page(new_page, hentry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
@@ -453,7 +489,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			vma, addr, &page_was_allocated);
 
 	if (page_was_allocated)
-		swap_readpage(retpage, do_poll);
+		swap_readpage(compound_head(retpage), do_poll);
 
 	return retpage;
 }
@@ -572,8 +608,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
-			if (offset != entry_offset) {
+			swap_readpage(compound_head(page), false);
+			if (offset != entry_offset &&
+			    !PageTransCompound(page)) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
 			}
@@ -734,8 +771,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
-			if (i != ra_info.offset) {
+			swap_readpage(compound_head(page), false);
+			if (i != ra_info.offset && !PageTransCompound(page)) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
 			}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2020bd494419..2ca013df35e1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1542,7 +1542,8 @@ int __swap_count(swp_entry_t entry)
 	return count;
 }
 
-static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry,
+			  int *entry_size)
 {
 	int count = 0;
 	pgoff_t offset = swp_offset(entry);
@@ -1550,6 +1551,8 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
 
 	ci = lock_cluster_or_swap_info(si, offset);
 	count = swap_count(si->swap_map[offset]);
+	if (entry_size)
+		*entry_size = ci && cluster_is_huge(ci) ? SWAPFILE_CLUSTER : 1;
 	unlock_cluster_or_swap_info(si, ci);
 	return count;
 }
@@ -1559,14 +1562,14 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
  * This does not give an exact answer when swap count is continued,
  * but does include the high COUNT_CONTINUED flag to allow for that.
  */
-int __swp_swapcount(swp_entry_t entry)
+int __swp_swapcount(swp_entry_t entry, int *entry_size)
 {
 	int count = 0;
 	struct swap_info_struct *si;
 
 	si = get_swap_device(entry);
 	if (si) {
-		count = swap_swapcount(si, entry);
+		count = swap_swapcount(si, entry, entry_size);
 		put_swap_device(si);
 	}
 	return count;
-- 
2.16.4
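
As a worked illustration of the cluster alignment above (hypothetical
numbers, assuming HPAGE_PMD_NR == 512, i.e. x86_64 with 4KB pages): a
fault on swap offset 0x30a falls inside the huge cluster starting at
offset 0x200, so

	round_down(0x30a, HPAGE_PMD_NR) == 0x200	/* hentry: cluster start */
	0x30a & (HPAGE_PMD_NR - 1)      == 0x10a	/* index within the THP  */

swapcache_prepare() and put_swap_page() then operate on the whole
cluster via hentry, while the page returned to the caller is
new_page + 0x10a, the sub-page backing the faulting entry.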


Thread overview: 38+ messages
2018-10-10  7:19 [PATCH -V6 00/21] swap: Swapout/swapin THP in one piece Huang Ying
2018-10-10  7:19 ` [PATCH -V6 01/21] swap: Enable PMD swap operations for CONFIG_THP_SWAP Huang Ying
2018-10-10  7:19 ` [PATCH -V6 02/21] swap: Add __swap_duplicate_locked() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 03/21] swap: Support PMD swap mapping in swap_duplicate() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 04/21] swap: Support PMD swap mapping in put_swap_page() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 05/21] swap: Support PMD swap mapping in free_swap_and_cache()/swap_free() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 06/21] swap: Support PMD swap mapping when splitting huge PMD Huang Ying
2018-10-24 17:25   ` Daniel Jordan
2018-10-25  0:54     ` Huang, Ying
2018-10-25 15:00       ` Daniel Jordan
2018-10-10  7:19 ` [PATCH -V6 07/21] swap: Support PMD swap mapping in split_swap_cluster() Huang Ying
2018-10-10  7:19 ` Huang Ying [this message]
2018-10-10  7:19 ` [PATCH -V6 09/21] swap: Swapin a THP in one piece Huang Ying
2018-10-10  7:19 ` [PATCH -V6 10/21] swap: Support to count THP swapin and its fallback Huang Ying
2018-10-10  7:19 ` [PATCH -V6 11/21] swap: Add sysfs interface to configure THP swapin Huang Ying
2018-10-10  7:19 ` [PATCH -V6 12/21] swap: Support PMD swap mapping in swapoff Huang Ying
2018-10-10  7:19 ` [PATCH -V6 13/21] swap: Support PMD swap mapping in madvise_free() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 14/21] swap: Support to move swap account for PMD swap mapping Huang Ying
2018-10-24 17:27   ` Daniel Jordan
2018-10-25  1:06     ` Huang, Ying
2018-10-10  7:19 ` [PATCH -V6 15/21] swap: Support to copy PMD swap mapping when fork() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 16/21] swap: Free PMD swap mapping when zap_huge_pmd() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 17/21] swap: Support PMD swap mapping for MADV_WILLNEED Huang Ying
2018-10-10  7:19 ` [PATCH -V6 18/21] swap: Support PMD swap mapping in mincore() Huang Ying
2018-10-10  7:19 ` [PATCH -V6 19/21] swap: Support PMD swap mapping in common path Huang Ying
2018-10-10  7:19 ` [PATCH -V6 20/21] swap: create PMD swap mapping when unmap the THP Huang Ying
2018-10-10  7:19 ` [PATCH -V6 21/21] swap: Update help of CONFIG_THP_SWAP Huang Ying
2018-10-23 12:27 ` [PATCH -V6 00/21] swap: Swapout/swapin THP in one piece Daniel Jordan
2018-10-24  3:31   ` Huang, Ying
2018-10-24 17:24     ` Daniel Jordan
2018-10-25  0:42       ` Huang, Ying
2018-11-09  1:12 ` Daniel Jordan
