Linux-mm Archive on lore.kernel.org
From: Zi Yan <zi.yan@sent.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Dave Hansen <dave.hansen@linux.intel.com>,
	Michal Hocko <mhocko@kernel.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@techsingularity.net>,
	John Hubbard <jhubbard@nvidia.com>,
	Mark Hairgrove <mhairgrove@nvidia.com>,
	Nitin Gupta <nigupta@nvidia.com>,
	David Nellans <dnellans@nvidia.com>, Zi Yan <ziy@nvidia.com>
Subject: [RFC PATCH 01/31] mm: migrate: Add exchange_pages to exchange two lists of pages.
Date: Fri, 15 Feb 2019 14:08:26 -0800
Message-ID: <20190215220856.29749-2-zi.yan@sent.com>
In-Reply-To: <20190215220856.29749-1-zi.yan@sent.com>

From: Zi Yan <ziy@nvidia.com>

Instead of using two migrate_pages() calls, a single exchange_pages() is
sufficient and avoids allocating new pages.
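
A minimal illustrative sketch (not part of the diff below) of how a caller
might use the new interface: the wrapper name swap_two_anon_pages() is made
up for illustration, while exchange_two_pages() is the helper this patch
declares in mm/internal.h. Both pages are assumed to be anonymous,
non-compound, and on the LRU.

	#include <linux/mm.h>
	#include "internal.h"	/* exchange_two_pages(), added by this patch */

	/*
	 * Swap the data, flags, and mappings of two in-use pages in place,
	 * without allocating any new pages.
	 */
	static int swap_two_anon_pages(struct page *p1, struct page *p2)
	{
		return exchange_two_pages(p1, p2);
	}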

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 include/linux/ksm.h |   5 +
 mm/Makefile         |   1 +
 mm/exchange.c       | 846 ++++++++++++++++++++++++++++++++++++++++++++
 mm/internal.h       |   6 +
 mm/ksm.c            |  35 ++
 mm/migrate.c        |   4 +-
 6 files changed, 895 insertions(+), 2 deletions(-)
 create mode 100644 mm/exchange.c

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 161e8164abcf..87c5b943a73c 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -53,6 +53,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 
 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+void ksm_exchange_page(struct page *to_page, struct page *from_page);
 
 #else  /* !CONFIG_KSM */
 
@@ -86,6 +87,10 @@ static inline void rmap_walk_ksm(struct page *page,
 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 {
 }
+static inline void ksm_exchange_page(struct page *to_page,
+				struct page *from_page)
+{
+}
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
 
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..1574ea5743e4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -43,6 +43,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 
 obj-y += init-mm.o
 obj-y += memblock.o
+obj-y += exchange.o
 
 ifdef CONFIG_MMU
 	obj-$(CONFIG_ADVISE_SYSCALLS)	+= madvise.o
diff --git a/mm/exchange.c b/mm/exchange.c
new file mode 100644
index 000000000000..a607348cc6f4
--- /dev/null
+++ b/mm/exchange.c
@@ -0,0 +1,846 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 NVIDIA, Zi Yan <ziy@nvidia.com>
+ *
+ * Exchange two in-use pages. Page flags and page->mapping are exchanged
+ * as well. from_page must be anonymous; to_page may also be file-backed.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/migrate.h>
+#include <linux/security.h>
+#include <linux/cpuset.h>
+#include <linux/hugetlb.h>
+#include <linux/mm_inline.h>
+#include <linux/page_idle.h>
+#include <linux/page-flags.h>
+#include <linux/ksm.h>
+#include <linux/memcontrol.h>
+#include <linux/balloon_compaction.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h> /* buffer_migrate_page  */
+#include <linux/backing-dev.h>
+
+
+#include "internal.h"
+
+struct exchange_page_info {
+	struct page *from_page;
+	struct page *to_page;
+
+	struct anon_vma *from_anon_vma;
+	struct anon_vma *to_anon_vma;
+
+	struct list_head list;
+};
+
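+/* Snapshot of one page's flag bits, to be applied to the other page. */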
+struct page_flags {
+	unsigned int page_error :1;
+	unsigned int page_referenced:1;
+	unsigned int page_uptodate:1;
+	unsigned int page_active:1;
+	unsigned int page_unevictable:1;
+	unsigned int page_checked:1;
+	unsigned int page_mappedtodisk:1;
+	unsigned int page_dirty:1;
+	unsigned int page_is_young:1;
+	unsigned int page_is_idle:1;
+	unsigned int page_swapcache:1;
+	unsigned int page_writeback:1;
+	unsigned int page_private:1;
+	unsigned int __pad:3;
+};
+
+
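+/* Swap PAGE_SIZE bytes between two kmapped pages, one u64 at a time. */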
+static void exchange_page(char *to, char *from)
+{
+	u64 tmp;
+	int i;
+
+	for (i = 0; i < PAGE_SIZE; i += sizeof(tmp)) {
+		tmp = *((u64 *)(from + i));
+		*((u64 *)(from + i)) = *((u64 *)(to + i));
+		*((u64 *)(to + i)) = tmp;
+	}
+}
+
+static inline void exchange_highpage(struct page *to, struct page *from)
+{
+	char *vfrom, *vto;
+
+	vfrom = kmap_atomic(from);
+	vto = kmap_atomic(to);
+	exchange_page(vto, vfrom);
+	kunmap_atomic(vto);
+	kunmap_atomic(vfrom);
+}
+
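+/* Swap gigantic page data; sub-pages are walked with mem_map_next(). */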
+static void __exchange_gigantic_page(struct page *dst, struct page *src,
+				int nr_pages)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < nr_pages; ) {
+		cond_resched();
+		exchange_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
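+/* Swap the data of two huge pages (hugetlbfs or THP) sub-page by sub-page. */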
+static void exchange_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	int nr_pages;
+
+	if (PageHuge(src)) {
+		/* hugetlbfs page */
+		struct hstate *h = page_hstate(src);
+
+		nr_pages = pages_per_huge_page(h);
+
+		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+			__exchange_gigantic_page(dst, src, nr_pages);
+			return;
+		}
+	} else {
+		/* thp page */
+		VM_BUG_ON(!PageTransHuge(src));
+		nr_pages = hpage_nr_pages(src);
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		cond_resched();
+		exchange_highpage(dst + i, src + i);
+	}
+}
+
+/*
+ * Exchange page flags and other per-page state between the two pages.
+ */
+static void exchange_page_flags(struct page *to_page, struct page *from_page)
+{
+	int from_cpupid, to_cpupid;
+	struct page_flags from_page_flags, to_page_flags;
+	struct mem_cgroup *to_memcg = page_memcg(to_page),
+					  *from_memcg = page_memcg(from_page);
+
+	from_cpupid = page_cpupid_xchg_last(from_page, -1);
+
+	from_page_flags.page_error = TestClearPageError(from_page);
+	from_page_flags.page_referenced = TestClearPageReferenced(from_page);
+	from_page_flags.page_uptodate = PageUptodate(from_page);
+	ClearPageUptodate(from_page);
+	from_page_flags.page_active = TestClearPageActive(from_page);
+	from_page_flags.page_unevictable = TestClearPageUnevictable(from_page);
+	from_page_flags.page_checked = PageChecked(from_page);
+	ClearPageChecked(from_page);
+	from_page_flags.page_mappedtodisk = PageMappedToDisk(from_page);
+	ClearPageMappedToDisk(from_page);
+	from_page_flags.page_dirty = PageDirty(from_page);
+	ClearPageDirty(from_page);
+	from_page_flags.page_is_young = test_and_clear_page_young(from_page);
+	from_page_flags.page_is_idle = page_is_idle(from_page);
+	clear_page_idle(from_page);
+	from_page_flags.page_swapcache = PageSwapCache(from_page);
+	from_page_flags.page_writeback = test_clear_page_writeback(from_page);
+
+
+	to_cpupid = page_cpupid_xchg_last(to_page, -1);
+
+	to_page_flags.page_error = TestClearPageError(to_page);
+	to_page_flags.page_referenced = TestClearPageReferenced(to_page);
+	to_page_flags.page_uptodate = PageUptodate(to_page);
+	ClearPageUptodate(to_page);
+	to_page_flags.page_active = TestClearPageActive(to_page);
+	to_page_flags.page_unevictable = TestClearPageUnevictable(to_page);
+	to_page_flags.page_checked = PageChecked(to_page);
+	ClearPageChecked(to_page);
+	to_page_flags.page_mappedtodisk = PageMappedToDisk(to_page);
+	ClearPageMappedToDisk(to_page);
+	to_page_flags.page_dirty = PageDirty(to_page);
+	ClearPageDirty(to_page);
+	to_page_flags.page_is_young = test_and_clear_page_young(to_page);
+	to_page_flags.page_is_idle = page_is_idle(to_page);
+	clear_page_idle(to_page);
+	to_page_flags.page_swapcache = PageSwapCache(to_page);
+	to_page_flags.page_writeback = test_clear_page_writeback(to_page);
+
+	/* set to_page */
+	if (from_page_flags.page_error)
+		SetPageError(to_page);
+	if (from_page_flags.page_referenced)
+		SetPageReferenced(to_page);
+	if (from_page_flags.page_uptodate)
+		SetPageUptodate(to_page);
+	if (from_page_flags.page_active) {
+		VM_BUG_ON_PAGE(from_page_flags.page_unevictable, from_page);
+		SetPageActive(to_page);
+	} else if (from_page_flags.page_unevictable)
+		SetPageUnevictable(to_page);
+	if (from_page_flags.page_checked)
+		SetPageChecked(to_page);
+	if (from_page_flags.page_mappedtodisk)
+		SetPageMappedToDisk(to_page);
+
+	/* Move dirty on pages not done by exchange_page_move_mapping() */
+	if (from_page_flags.page_dirty)
+		SetPageDirty(to_page);
+
+	if (from_page_flags.page_is_young)
+		set_page_young(to_page);
+	if (from_page_flags.page_is_idle)
+		set_page_idle(to_page);
+
+	/* set from_page */
+	if (to_page_flags.page_error)
+		SetPageError(from_page);
+	if (to_page_flags.page_referenced)
+		SetPageReferenced(from_page);
+	if (to_page_flags.page_uptodate)
+		SetPageUptodate(from_page);
+	if (to_page_flags.page_active) {
+		VM_BUG_ON_PAGE(to_page_flags.page_unevictable, from_page);
+		SetPageActive(from_page);
+	} else if (to_page_flags.page_unevictable)
+		SetPageUnevictable(from_page);
+	if (to_page_flags.page_checked)
+		SetPageChecked(from_page);
+	if (to_page_flags.page_mappedtodisk)
+		SetPageMappedToDisk(from_page);
+
+	/* Move dirty on pages not done by exchange_page_move_mapping() */
+	if (to_page_flags.page_dirty)
+		SetPageDirty(from_page);
+
+	if (to_page_flags.page_is_young)
+		set_page_young(from_page);
+	if (to_page_flags.page_is_idle)
+		set_page_idle(from_page);
+
+	/*
+	 * Exchange NUMA information between the two pages, to prevent
+	 * over-eager future migrations of either page.
+	 */
+	page_cpupid_xchg_last(to_page, from_cpupid);
+	page_cpupid_xchg_last(from_page, to_cpupid);
+
+	ksm_exchange_page(to_page, from_page);
+	/*
+	 * Please do not reorder this without considering how mm/ksm.c's
+	 * get_ksm_page() depends upon ksm_exchange_page() and PageSwapCache().
+	 */
+	ClearPageSwapCache(to_page);
+	ClearPageSwapCache(from_page);
+	if (from_page_flags.page_swapcache)
+		SetPageSwapCache(to_page);
+	if (to_page_flags.page_swapcache)
+		SetPageSwapCache(from_page);
+
+
+#ifdef CONFIG_PAGE_OWNER
+	/* exchange page owner  */
+	BUILD_BUG();
+#endif
+	/* exchange mem cgroup  */
+	to_page->mem_cgroup = from_memcg;
+	from_page->mem_cgroup = to_memcg;
+
+}
+
+/*
+ * Replace the page in the mapping.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
+ */
+
+static int exchange_page_move_mapping(struct address_space *to_mapping,
+			struct address_space *from_mapping,
+			struct page *to_page, struct page *from_page,
+			struct buffer_head *to_head,
+			struct buffer_head *from_head,
+			enum migrate_mode mode,
+			int to_extra_count, int from_extra_count)
+{
+	int to_expected_count = 1 + to_extra_count,
+		from_expected_count = 1 + from_extra_count;
+	unsigned long from_page_index = from_page->index;
+	unsigned long to_page_index = to_page->index;
+	int to_swapbacked = PageSwapBacked(to_page),
+		from_swapbacked = PageSwapBacked(from_page);
+	struct address_space *to_mapping_value = to_page->mapping;
+	struct address_space *from_mapping_value = from_page->mapping;
+
+	VM_BUG_ON_PAGE(to_mapping != page_mapping(to_page), to_page);
+	VM_BUG_ON_PAGE(from_mapping != page_mapping(from_page), from_page);
+
+	if (!to_mapping) {
+		/* Anonymous page without mapping */
+		if (page_count(to_page) != to_expected_count)
+			return -EAGAIN;
+	}
+
+	if (!from_mapping) {
+		/* Anonymous page without mapping */
+		if (page_count(from_page) != from_expected_count)
+			return -EAGAIN;
+	}
+
+	/* both are anonymous pages  */
+	if (!from_mapping && !to_mapping) {
+		/* from_page  */
+		from_page->index = to_page_index;
+		from_page->mapping = to_mapping_value;
+
+		ClearPageSwapBacked(from_page);
+		if (to_swapbacked)
+			SetPageSwapBacked(from_page);
+
+
+		/* to_page  */
+		to_page->index = from_page_index;
+		to_page->mapping = from_mapping_value;
+
+		ClearPageSwapBacked(to_page);
+		if (from_swapbacked)
+			SetPageSwapBacked(to_page);
+	} else if (!from_mapping && to_mapping) {
+		/* from is anonymous, to is file-backed  */
+		struct zone *from_zone, *to_zone;
+		void **to_pslot;
+		int dirty;
+
+		from_zone = page_zone(from_page);
+		to_zone = page_zone(to_page);
+
+		xa_lock_irq(&to_mapping->i_pages);
+
+		to_pslot = radix_tree_lookup_slot(&to_mapping->i_pages,
+			page_index(to_page));
+
+		to_expected_count += 1 + page_has_private(to_page);
+		if (page_count(to_page) != to_expected_count ||
+			radix_tree_deref_slot_protected(to_pslot,
+				&to_mapping->i_pages.xa_lock) != to_page) {
+			xa_unlock_irq(&to_mapping->i_pages);
+			return -EAGAIN;
+		}
+
+		if (!page_ref_freeze(to_page, to_expected_count)) {
+			xa_unlock_irq(&to_mapping->i_pages);
+			pr_debug("cannot freeze page count\n");
+			return -EAGAIN;
+		}
+
+		if (mode == MIGRATE_ASYNC && to_head &&
+				!buffer_migrate_lock_buffers(to_head, mode)) {
+			page_ref_unfreeze(to_page, to_expected_count);
+			xa_unlock_irq(&to_mapping->i_pages);
+
+			pr_debug("cannot lock buffer head\n");
+			return -EAGAIN;
+		}
+
+		if (!page_ref_freeze(from_page, from_expected_count)) {
+			page_ref_unfreeze(to_page, to_expected_count);
+			xa_unlock_irq(&to_mapping->i_pages);
+
+			return -EAGAIN;
+		}
+		/*
+		 * Now we know that no one else is looking at the page:
+		 * no turning back from here.
+		 */
+		ClearPageSwapBacked(from_page);
+		ClearPageSwapBacked(to_page);
+
+		/* from_page  */
+		from_page->index = to_page_index;
+		from_page->mapping = to_mapping_value;
+		/* to_page  */
+		to_page->index = from_page_index;
+		to_page->mapping = from_mapping_value;
+
+		if (to_swapbacked)
+			__SetPageSwapBacked(from_page);
+		else
+			VM_BUG_ON_PAGE(PageSwapCache(to_page), to_page);
+
+		if (from_swapbacked)
+			__SetPageSwapBacked(to_page);
+		else
+			VM_BUG_ON_PAGE(PageSwapCache(from_page), from_page);
+
+		dirty = PageDirty(to_page);
+
+		radix_tree_replace_slot(&to_mapping->i_pages,
+				to_pslot, from_page);
+
+		/* move cache reference */
+		page_ref_unfreeze(to_page, to_expected_count - 1);
+		page_ref_unfreeze(from_page, from_expected_count + 1);
+
+		xa_unlock(&to_mapping->i_pages);
+
+		/*
+		 * If moved to a different zone then also account
+		 * the page for that zone. Other VM counters will be
+		 * taken care of when we establish references to the
+		 * new page and drop references to the old page.
+		 *
+		 * Note that anonymous pages are accounted for
+		 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
+		 * are mapped to swap space.
+		 */
+		if (to_zone != from_zone) {
+			__dec_node_state(to_zone->zone_pgdat, NR_FILE_PAGES);
+			__inc_node_state(from_zone->zone_pgdat, NR_FILE_PAGES);
+			if (PageSwapBacked(to_page) && !PageSwapCache(to_page)) {
+				__dec_node_state(to_zone->zone_pgdat, NR_SHMEM);
+				__inc_node_state(from_zone->zone_pgdat, NR_SHMEM);
+			}
+			if (dirty && mapping_cap_account_dirty(to_mapping)) {
+				__dec_node_state(to_zone->zone_pgdat, NR_FILE_DIRTY);
+				__dec_zone_state(to_zone, NR_ZONE_WRITE_PENDING);
+				__inc_node_state(from_zone->zone_pgdat, NR_FILE_DIRTY);
+				__inc_zone_state(from_zone, NR_ZONE_WRITE_PENDING);
+			}
+		}
+		local_irq_enable();
+
+	} else {
+		/* from is file-backed, to is anonymous: fold this into the case above */
+		/* both are file-backed  */
+		VM_BUG_ON(1);
+	}
+
+	return MIGRATEPAGE_SUCCESS;
+}
+
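+/* Exchange mappings, data, and flags of two locked pages. */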
+static int exchange_from_to_pages(struct page *to_page, struct page *from_page,
+				enum migrate_mode mode)
+{
+	int rc = -EBUSY;
+	struct address_space *to_page_mapping, *from_page_mapping;
+	struct buffer_head *to_head = NULL, *to_bh = NULL;
+
+	VM_BUG_ON_PAGE(!PageLocked(from_page), from_page);
+	VM_BUG_ON_PAGE(!PageLocked(to_page), to_page);
+
+	/* get the actual mappings via page_mapping()  */
+	to_page_mapping = page_mapping(to_page);
+	from_page_mapping = page_mapping(from_page);
+
+	/* from_page has to be an anonymous page  */
+	VM_BUG_ON(from_page_mapping);
+	VM_BUG_ON(PageWriteback(from_page));
+	/* writeback has to finish */
+	BUG_ON(PageWriteback(to_page));
+
+
+	/* to_page is anonymous  */
+	if (!to_page_mapping) {
+exchange_mappings:
+		/* actual page mapping exchange */
+		rc = exchange_page_move_mapping(to_page_mapping, from_page_mapping,
+					to_page, from_page, NULL, NULL, mode, 0, 0);
+	} else {
+		if (to_page_mapping->a_ops->migratepage == buffer_migrate_page) {
+
+			if (!page_has_buffers(to_page))
+				goto exchange_mappings;
+
+			to_head = page_buffers(to_page);
+
+			rc = exchange_page_move_mapping(to_page_mapping,
+					from_page_mapping, to_page, from_page,
+					to_head, NULL, mode, 0, 0);
+
+			if (rc != MIGRATEPAGE_SUCCESS)
+				return rc;
+
+			/*
+			 * In the async case, exchange_page_move_mapping() locked the
+			 * buffers with an IRQ-safe spinlock held. In the sync case,
+			 * the buffers need to be locked now.
+			 */
+			if (mode != MIGRATE_ASYNC)
+				VM_BUG_ON(!buffer_migrate_lock_buffers(to_head, mode));
+
+			ClearPagePrivate(to_page);
+			set_page_private(from_page, page_private(to_page));
+			set_page_private(to_page, 0);
+			/* transfer private page count  */
+			put_page(to_page);
+			get_page(from_page);
+
+			to_bh = to_head;
+			do {
+				set_bh_page(to_bh, from_page, bh_offset(to_bh));
+				to_bh = to_bh->b_this_page;
+
+			} while (to_bh != to_head);
+
+			SetPagePrivate(from_page);
+
+			to_bh = to_head;
+		} else if (!to_page_mapping->a_ops->migratepage) {
+			/* fallback_migrate_page  */
+			if (PageDirty(to_page)) {
+				if (mode != MIGRATE_SYNC)
+					return -EBUSY;
+				return writeout(to_page_mapping, to_page);
+			}
+			if (page_has_private(to_page) &&
+				!try_to_release_page(to_page, GFP_KERNEL))
+				return -EAGAIN;
+
+			goto exchange_mappings;
+		}
+	}
+	/* actual page data exchange  */
+	if (rc != MIGRATEPAGE_SUCCESS)
+		return rc;
+
+
+	if (PageHuge(from_page) || PageTransHuge(from_page))
+		exchange_huge_page(to_page, from_page);
+	else
+		exchange_highpage(to_page, from_page);
+	rc = 0;
+
+	/*
+	 * 1. buffer_migrate_page:
+	 *   private flag should be transferred from to_page to from_page
+	 *
+	 * 2. anon<->anon, fallback_migrate_page:
+	 *   neither has the private flag set, or to_page's has been cleared.
+	 */
+	VM_BUG_ON(!((page_has_private(from_page) && !page_has_private(to_page)) ||
+				(!page_has_private(from_page) && !page_has_private(to_page))));
+
+	exchange_page_flags(to_page, from_page);
+
+	if (to_bh) {
+		VM_BUG_ON(to_bh != to_head);
+		do {
+			unlock_buffer(to_bh);
+			put_bh(to_bh);
+			to_bh = to_bh->b_this_page;
+
+		} while (to_bh != to_head);
+	}
+
+	return rc;
+}
+
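+/* Lock and unmap both pages, exchange them, then remove the migration PTEs. */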
+static int unmap_and_exchange(struct page *from_page,
+		struct page *to_page, enum migrate_mode mode)
+{
+	int rc = -EAGAIN;
+	struct anon_vma *from_anon_vma = NULL;
+	struct anon_vma *to_anon_vma = NULL;
+	int from_page_was_mapped = 0;
+	int to_page_was_mapped = 0;
+	int from_page_count = 0, to_page_count = 0;
+	int from_map_count = 0, to_map_count = 0;
+	unsigned long from_flags, to_flags;
+	pgoff_t from_index, to_index;
+	struct address_space *from_mapping, *to_mapping;
+
+	if (!trylock_page(from_page)) {
+		if (mode == MIGRATE_ASYNC)
+			goto out;
+		lock_page(from_page);
+	}
+
+	if (!trylock_page(to_page)) {
+		if (mode == MIGRATE_ASYNC)
+			goto out_unlock;
+		lock_page(to_page);
+	}
+
+	/* from_page is supposed to be an anonymous page */
+	VM_BUG_ON_PAGE(PageWriteback(from_page), from_page);
+
+	if (PageWriteback(to_page)) {
+		/*
+		 * Only in the case of a full synchronous migration is it
+		 * necessary to wait for PageWriteback. In the async case,
+		 * the retry loop is too short and in the sync-light case,
+		 * the overhead of stalling is too much
+		 */
+		if (mode != MIGRATE_SYNC) {
+			rc = -EBUSY;
+			goto out_unlock_both;
+		}
+		wait_on_page_writeback(to_page);
+	}
+
+	if (PageAnon(from_page) && !PageKsm(from_page))
+		from_anon_vma = page_get_anon_vma(from_page);
+
+	if (PageAnon(to_page) && !PageKsm(to_page))
+		to_anon_vma = page_get_anon_vma(to_page);
+
+	from_page_count = page_count(from_page);
+	from_map_count = page_mapcount(from_page);
+	to_page_count = page_count(to_page);
+	to_map_count = page_mapcount(to_page);
+	from_flags = from_page->flags;
+	to_flags = to_page->flags;
+	from_mapping = from_page->mapping;
+	to_mapping = to_page->mapping;
+	from_index = from_page->index;
+	to_index = to_page->index;
+
+	/*
+	 * Corner case handling:
+	 * 1. When a new swap-cache page is read into, it is added to the LRU
+	 * and treated as swapcache but it has no rmap yet.
+	 * Calling try_to_unmap() against a page->mapping==NULL page will
+	 * trigger a BUG.  So handle it here.
+	 * 2. An orphaned page (see truncate_complete_page) might have
+	 * fs-private metadata. The page can be picked up due to memory
+	 * offlining.  Everywhere else except page reclaim, the page is
+	 * invisible to the vm, so the page can not be migrated.  So try to
+	 * free the metadata, so the page can be freed.
+	 */
+	if (!from_page->mapping) {
+		VM_BUG_ON_PAGE(PageAnon(from_page), from_page);
+		if (page_has_private(from_page)) {
+			try_to_free_buffers(from_page);
+			goto out_unlock_both;
+		}
+	} else if (page_mapped(from_page)) {
+		/* Establish migration ptes */
+		VM_BUG_ON_PAGE(PageAnon(from_page) && !PageKsm(from_page) &&
+					   !from_anon_vma, from_page);
+		try_to_unmap(from_page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		from_page_was_mapped = 1;
+	}
+
+	if (!to_page->mapping) {
+		VM_BUG_ON_PAGE(PageAnon(to_page), to_page);
+		if (page_has_private(to_page)) {
+			try_to_free_buffers(to_page);
+			goto out_unlock_both_remove_from_migration_pte;
+		}
+	} else if (page_mapped(to_page)) {
+		/* Establish migration ptes */
+		VM_BUG_ON_PAGE(PageAnon(to_page) && !PageKsm(to_page) &&
+						!to_anon_vma, to_page);
+		try_to_unmap(to_page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		to_page_was_mapped = 1;
+	}
+
+	if (!page_mapped(from_page) && !page_mapped(to_page))
+		rc = exchange_from_to_pages(to_page, from_page, mode);
+
+
+	if (to_page_was_mapped) {
+		/* swap back to_page->index to be compatible with
+		 * remove_migration_ptes(), which assumes both from_page and to_page
+		 * below have the same index.
+		 */
+		if (rc == MIGRATEPAGE_SUCCESS)
+			swap(to_page->index, to_index);
+
+		remove_migration_ptes(to_page,
+			rc == MIGRATEPAGE_SUCCESS ? from_page : to_page, false);
+
+		if (rc == MIGRATEPAGE_SUCCESS)
+			swap(to_page->index, to_index);
+	}
+
+out_unlock_both_remove_from_migration_pte:
+	if (from_page_was_mapped) {
+		/* swap back from_page->index to be compatible with
+		 * remove_migration_ptes(), which assumes both from_page and to_page
+		 * below have the same index.
+		 */
+		if (rc == MIGRATEPAGE_SUCCESS)
+			swap(from_page->index, from_index);
+
+		remove_migration_ptes(from_page,
+			rc == MIGRATEPAGE_SUCCESS ? to_page : from_page, false);
+
+		if (rc == MIGRATEPAGE_SUCCESS)
+			swap(from_page->index, from_index);
+	}
+
+out_unlock_both:
+	if (to_anon_vma)
+		put_anon_vma(to_anon_vma);
+	unlock_page(to_page);
+out_unlock:
+	/* Drop an anon_vma reference if we took one */
+	if (from_anon_vma)
+		put_anon_vma(from_anon_vma);
+	unlock_page(from_page);
+out:
+	return rc;
+}
+
+/*
+ * Exchange pages in the exchange_list
+ *
+ * Caller should release the exchange_list resource.
+ *
+ */
+static int exchange_pages(struct list_head *exchange_list,
+			enum migrate_mode mode,
+			int reason)
+{
+	struct exchange_page_info *one_pair, *one_pair2;
+	int failed = 0;
+
+	list_for_each_entry_safe(one_pair, one_pair2, exchange_list, list) {
+		struct page *from_page = one_pair->from_page;
+		struct page *to_page = one_pair->to_page;
+		int rc;
+		int retry = 0;
+
+again:
+		if (page_count(from_page) == 1) {
+			/* page was freed from under us. So we are done  */
+			ClearPageActive(from_page);
+			ClearPageUnevictable(from_page);
+
+			mod_node_page_state(page_pgdat(from_page), NR_ISOLATED_ANON +
+					page_is_file_cache(from_page),
+					-hpage_nr_pages(from_page));
+			put_page(from_page);
+
+			if (page_count(to_page) == 1) {
+				ClearPageActive(to_page);
+				ClearPageUnevictable(to_page);
+				put_page(to_page);
+				mod_node_page_state(page_pgdat(to_page), NR_ISOLATED_ANON +
+						page_is_file_cache(to_page),
+						-hpage_nr_pages(to_page));
+			} else
+				goto putback_to_page;
+
+			continue;
+		}
+
+		if (page_count(to_page) == 1) {
+			/* page was freed from under us. So we are done  */
+			ClearPageActive(to_page);
+			ClearPageUnevictable(to_page);
+
+			mod_node_page_state(page_pgdat(to_page), NR_ISOLATED_ANON +
+					page_is_file_cache(to_page),
+					-hpage_nr_pages(to_page));
+			put_page(to_page);
+
+			mod_node_page_state(page_pgdat(from_page), NR_ISOLATED_ANON +
+					page_is_file_cache(from_page),
+					-hpage_nr_pages(from_page));
+			putback_lru_page(from_page);
+			continue;
+		}
+
+		/* TODO: compound pages are not supported */
+		/* to_page can be a file-backed page  */
+		if (PageCompound(from_page) || page_mapping(from_page)) {
+			++failed;
+			goto putback;
+		}
+
+		rc = unmap_and_exchange(from_page, to_page, mode);
+
+		if (rc == -EAGAIN && retry < 3) {
+			++retry;
+			goto again;
+		}
+
+		if (rc != MIGRATEPAGE_SUCCESS)
+			++failed;
+
+putback:
+		mod_node_page_state(page_pgdat(from_page), NR_ISOLATED_ANON +
+				page_is_file_cache(from_page),
+				-hpage_nr_pages(from_page));
+
+		putback_lru_page(from_page);
+putback_to_page:
+		mod_node_page_state(page_pgdat(to_page), NR_ISOLATED_ANON +
+				page_is_file_cache(to_page),
+				-hpage_nr_pages(to_page));
+
+		putback_lru_page(to_page);
+	}
+	return failed;
+}
+
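+/* Isolate both pages from the LRU and exchange them synchronously. */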
+int exchange_two_pages(struct page *page1, struct page *page2)
+{
+	struct exchange_page_info page_info;
+	LIST_HEAD(exchange_list);
+	int err = -EFAULT;
+	int pagevec_flushed = 0;
+
+	VM_BUG_ON_PAGE(PageTail(page1), page1);
+	VM_BUG_ON_PAGE(PageTail(page2), page2);
+
+	if (!(PageLRU(page1) && PageLRU(page2)))
+		return -EBUSY;
+
+retry_isolate1:
+	if (!get_page_unless_zero(page1))
+		return -EBUSY;
+	err = isolate_lru_page(page1);
+	put_page(page1);
+	if (err) {
+		if (!pagevec_flushed) {
+			migrate_prep();
+			pagevec_flushed = 1;
+			goto retry_isolate1;
+		}
+		return err;
+	}
+	mod_node_page_state(page_pgdat(page1),
+			NR_ISOLATED_ANON + page_is_file_cache(page1),
+			hpage_nr_pages(page1));
+
+retry_isolate2:
+	if (!get_page_unless_zero(page2)) {
+		putback_lru_page(page1);
+		return -EBUSY;
+	}
+	err = isolate_lru_page(page2);
+	put_page(page2);
+	if (err) {
+		if (!pagevec_flushed) {
+			migrate_prep();
+			pagevec_flushed = 1;
+			goto retry_isolate2;
+		}
+		return err;
+	}
+	mod_node_page_state(page_pgdat(page2),
+			NR_ISOLATED_ANON + page_is_file_cache(page2),
+			hpage_nr_pages(page2));
+
+	page_info.from_page = page1;
+	page_info.to_page = page2;
+	INIT_LIST_HEAD(&page_info.list);
+	list_add(&page_info.list, &exchange_list);
+
+
+	return exchange_pages(&exchange_list, MIGRATE_SYNC, 0);
+
+}
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb02decf..77e205c423ce 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -543,4 +543,10 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 
 void setup_zone_pageset(struct zone *zone);
 extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+bool buffer_migrate_lock_buffers(struct buffer_head *head,
+							enum migrate_mode mode);
+int writeout(struct address_space *mapping, struct page *page);
+extern int exchange_two_pages(struct page *page1, struct page *page2);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c48ad13b4c9..dc1ec06b71a0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2665,6 +2665,41 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 		set_page_stable_node(oldpage, NULL);
 	}
 }
+
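+/* Update KSM stable-tree kpfn entries after two pages have exchanged mappings. */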
+void ksm_exchange_page(struct page *to_page, struct page *from_page)
+{
+	struct stable_node *to_stable_node, *from_stable_node;
+
+	VM_BUG_ON_PAGE(!PageLocked(to_page), to_page);
+	VM_BUG_ON_PAGE(!PageLocked(from_page), from_page);
+
+	to_stable_node = page_stable_node(to_page);
+	from_stable_node = page_stable_node(from_page);
+	if (to_stable_node) {
+		VM_BUG_ON_PAGE(to_stable_node->kpfn != page_to_pfn(from_page),
+					from_page);
+		to_stable_node->kpfn = page_to_pfn(to_page);
+		/*
+		 * to_page->mapping was set in advance; now we need smp_wmb()
+		 * to make sure that the new stable_node->kpfn is visible
+		 * to get_ksm_page() before it can see that from_page->mapping
+		 * has gone stale (or that PageSwapCache has been cleared).
+		 */
+		smp_wmb();
+	}
+	if (from_stable_node) {
+		VM_BUG_ON_PAGE(from_stable_node->kpfn != page_to_pfn(to_page),
+					to_page);
+		from_stable_node->kpfn = page_to_pfn(from_page);
+		/*
+		 * from_page->mapping was set in advance; now we need smp_wmb()
+		 * to make sure that the new stable_node->kpfn is visible
+		 * to get_ksm_page() before it can see that to_page->mapping
+		 * has gone stale (or that PageSwapCache has been cleared).
+		 */
+		smp_wmb();
+	}
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/mm/migrate.c b/mm/migrate.c
index d4fd680be3b0..b8c79aa62134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -701,7 +701,7 @@ EXPORT_SYMBOL(migrate_page);
 
 #ifdef CONFIG_BLOCK
 /* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+bool buffer_migrate_lock_buffers(struct buffer_head *head,
 							enum migrate_mode mode)
 {
 	struct buffer_head *bh = head;
@@ -849,7 +849,7 @@ int buffer_migrate_page_norefs(struct address_space *mapping,
 /*
  * Writeback a page to clean the dirty state
  */
-static int writeout(struct address_space *mapping, struct page *page)
+int writeout(struct address_space *mapping, struct page *page)
 {
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
-- 
2.20.1


Thread overview: 50+ messages
2019-02-15 22:08 [RFC PATCH 00/31] Generating physically contiguous memory after page allocation Zi Yan
2019-02-15 22:08 ` Zi Yan [this message]
2019-02-17 11:29   ` [RFC PATCH 01/31] mm: migrate: Add exchange_pages to exchange two lists of pages Matthew Wilcox
2019-02-18 17:31     ` Zi Yan
2019-02-18 17:42       ` Vlastimil Babka
2019-02-18 17:51         ` Zi Yan
2019-02-18 17:52           ` Matthew Wilcox
2019-02-18 17:59             ` Zi Yan
2019-02-19  7:42               ` Anshuman Khandual
2019-02-19 12:56                 ` Matthew Wilcox
2019-02-20  4:38                   ` Anshuman Khandual
2019-03-14  2:39                     ` Zi Yan
2019-02-21 21:10   ` Jerome Glisse
2019-02-21 21:25     ` Zi Yan
2019-02-15 22:08 ` [RFC PATCH 02/31] mm: migrate: Add THP exchange support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 03/31] mm: migrate: Add tmpfs " Zi Yan
2019-02-15 22:08 ` [RFC PATCH 04/31] mm: add mem_defrag functionality Zi Yan
2019-02-15 22:08 ` [RFC PATCH 05/31] mem_defrag: split a THP if either src or dst is THP only Zi Yan
2019-02-15 22:08 ` [RFC PATCH 06/31] mm: Make MAX_ORDER configurable in Kconfig for buddy allocator Zi Yan
2019-02-15 22:08 ` [RFC PATCH 07/31] mm: deallocate pages with order > MAX_ORDER Zi Yan
2019-02-15 22:08 ` [RFC PATCH 08/31] mm: add pagechain container for storing multiple pages Zi Yan
2019-02-15 22:08 ` [RFC PATCH 09/31] mm: thp: 1GB anonymous page implementation Zi Yan
2019-02-15 22:08 ` [RFC PATCH 10/31] mm: proc: add 1GB THP kpageflag Zi Yan
2019-02-15 22:08 ` [RFC PATCH 11/31] mm: debug: print compound page order in dump_page() Zi Yan
2019-02-15 22:08 ` [RFC PATCH 12/31] mm: stats: Separate PMD THP and PUD THP stats Zi Yan
2019-02-15 22:08 ` [RFC PATCH 13/31] mm: thp: 1GB THP copy on write implementation Zi Yan
2019-02-15 22:08 ` [RFC PATCH 14/31] mm: thp: handling 1GB THP reference bit Zi Yan
2019-02-15 22:08 ` [RFC PATCH 15/31] mm: thp: add 1GB THP split_huge_pud_page() function Zi Yan
2019-02-15 22:08 ` [RFC PATCH 16/31] mm: thp: check compound_mapcount of PMD-mapped PUD THPs at free time Zi Yan
2019-02-15 22:08 ` [RFC PATCH 17/31] mm: thp: split properly PMD-mapped PUD THP to PTE-mapped PUD THP Zi Yan
2019-02-15 22:08 ` [RFC PATCH 18/31] mm: page_vma_walk: teach it about PMD-mapped " Zi Yan
2019-02-15 22:08 ` [RFC PATCH 19/31] mm: thp: 1GB THP support in try_to_unmap() Zi Yan
2019-02-15 22:08 ` [RFC PATCH 20/31] mm: thp: split 1GB THPs at page reclaim Zi Yan
2019-02-15 22:08 ` [RFC PATCH 21/31] mm: thp: 1GB zero page shrinker Zi Yan
2019-02-15 22:08 ` [RFC PATCH 22/31] mm: thp: 1GB THP follow_p*d_page() support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 23/31] mm: support 1GB THP pagemap support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 24/31] sysctl: add an option to only print the head page virtual address Zi Yan
2019-02-15 22:08 ` [RFC PATCH 25/31] mm: thp: add a knob to enable/disable 1GB THPs Zi Yan
2019-02-15 22:08 ` [RFC PATCH 26/31] mm: thp: promote PTE-mapped THP to PMD-mapped THP Zi Yan
2019-02-15 22:08 ` [RFC PATCH 27/31] mm: thp: promote PMD-mapped PUD pages to PUD-mapped PUD pages Zi Yan
2019-02-15 22:08 ` [RFC PATCH 28/31] mm: vmstats: add page promotion stats Zi Yan
2019-02-15 22:08 ` [RFC PATCH 29/31] mm: madvise: add madvise options to split PMD and PUD THPs Zi Yan
2019-02-15 22:08 ` [RFC PATCH 30/31] mm: mem_defrag: thp: PMD THP and PUD THP in-place promotion support Zi Yan
2019-02-15 22:08 ` [RFC PATCH 31/31] sysctl: toggle to promote PUD-mapped 1GB THP or not Zi Yan
2019-02-20  1:42 ` [RFC PATCH 00/31] Generating physically contiguous memory after page allocation Mike Kravetz
2019-02-20  2:33   ` Zi Yan
2019-02-20  3:18     ` Mike Kravetz
2019-02-20  5:19       ` Zi Yan
2019-02-20  5:27         ` Mike Kravetz
