From: "Jérôme Glisse" <jglisse@redhat.com>
To: akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org
Cc: "John Hubbard" <jhubbard@nvidia.com>,
	"Dan Williams" <dan.j.williams@intel.com>,
	"Naoya Horiguchi" <n-horiguchi@ah.jp.nec.com>,
	"David Nellans" <dnellans@nvidia.com>,
	"Jérôme Glisse" <jglisse@redhat.com>
Subject: [HMM 13/16] mm/migrate: allow migrate_vma() to alloc new page on empty entry
Date: Wed,  5 Apr 2017 16:40:23 -0400
Message-ID: <20170405204026.3940-14-jglisse@redhat.com>
In-Reply-To: <20170405204026.3940-1-jglisse@redhat.com>

This allows the caller of migrate_vma() to allocate a new page for an empty
CPU page table entry. It only supports anonymous memory and will not
instantiate a new page if userfaultfd is armed.

This is useful for device drivers that want to migrate a range of virtual
addresses and would rather allocate new memory up front than fault later on.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
---
 mm/migrate.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 127 insertions(+), 4 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index f7a7661..cbaa4f2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -41,6 +41,7 @@
 #include <linux/page_idle.h>
 #include <linux/page_owner.h>
 #include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlbflush.h>
 
@@ -2111,9 +2112,10 @@ static int migrate_vma_collect_hole(unsigned long start,
 				    struct mm_walk *walk)
 {
 	struct migrate_vma *migrate = walk->private;
-	unsigned long addr, next;
+	unsigned long addr;
 
 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+		migrate->cpages++;
 		migrate->dst[migrate->npages] = 0;
 		migrate->src[migrate->npages++] = 0;
 	}
@@ -2150,6 +2152,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		pfn = pte_pfn(pte);
 
 		if (pte_none(pte)) {
+			migrate->cpages++;
 			mpfn = pfn = 0;
 			goto next;
 		}
@@ -2463,6 +2466,114 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
 	}
 }
 
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+				    unsigned long addr,
+				    struct page *page,
+				    unsigned long *src,
+				    unsigned long *dst)
+{
+	struct vm_area_struct *vma = migrate->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	struct mem_cgroup *memcg;
+	spinlock_t *ptl;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+	pte_t entry;
+
+	/* Only allow populating anonymous memory */
+	if (!vma_is_anonymous(vma))
+		goto abort;
+
+	pgdp = pgd_offset(mm, addr);
+	pudp = pud_alloc(mm, pgdp, addr);
+	if (!pudp)
+		goto abort;
+	pmdp = pmd_alloc(mm, pudp, addr);
+	if (!pmdp)
+		goto abort;
+
+	if (pmd_trans_unstable(pmdp) || pmd_devmap(*pmdp))
+		goto abort;
+
+	/*
+	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
+	 * pte_offset_map() on pmds where a huge pmd might be created
+	 * from a different thread.
+	 *
+	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+	 * parallel threads are excluded by other means.
+	 *
+	 * Here we only have down_read(mmap_sem).
+	 */
+	if (pte_alloc(mm, pmdp, addr))
+		goto abort;
+
+	/* See the comment in pte_alloc_one_map() */
+	if (unlikely(pmd_trans_unstable(pmdp)))
+		goto abort;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto abort;
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+		goto abort;
+
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__SetPageUptodate(page);
+
+	if (is_zone_device_page(page) && is_device_unaddressable_page(page)) {
+		swp_entry_t swp_entry;
+
+		swp_entry = make_device_entry(page, vma->vm_flags & VM_WRITE);
+		entry = swp_entry_to_pte(swp_entry);
+	} else {
+		entry = mk_pte(page, vma->vm_page_prot);
+		if (vma->vm_flags & VM_WRITE)
+			entry = pte_mkwrite(pte_mkdirty(entry));
+	}
+
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	if (!pte_none(*ptep)) {
+		pte_unmap_unlock(ptep, ptl);
+		mem_cgroup_cancel_charge(page, memcg, false);
+		goto abort;
+	}
+
+	/*
+	 * Check for userfaultfd but do not deliver the fault. Instead,
+	 * just back off.
+	 */
+	if (userfaultfd_missing(vma)) {
+		pte_unmap_unlock(ptep, ptl);
+		mem_cgroup_cancel_charge(page, memcg, false);
+		goto abort;
+	}
+
+	inc_mm_counter(mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, vma, addr, false);
+	mem_cgroup_commit_charge(page, memcg, false, false);
+	if (!is_zone_device_page(page))
+		lru_cache_add_active_or_unevictable(page, vma);
+	set_pte_at(mm, addr, ptep, entry);
+
+	/* Take a reference on the page */
+	get_page(page);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, ptep);
+	pte_unmap_unlock(ptep, ptl);
+	*src = MIGRATE_PFN_MIGRATE;
+	return;
+
+abort:
+	*src &= ~MIGRATE_PFN_MIGRATE;
+}
+
 /*
  * migrate_vma_pages() - migrate meta-data from src page to dst page
  * @migrate: migrate struct containing all migration information
@@ -2483,10 +2594,16 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 		struct address_space *mapping;
 		int r;
 
-		if (!page || !newpage)
+		if (!newpage) {
+			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 			continue;
-		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+		} else if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+			if (!page)
+				migrate_vma_insert_page(migrate, addr, newpage,
+							&migrate->src[i],
+							&migrate->dst[i]);
 			continue;
+		}
 
 		mapping = page_mapping(page);
 
@@ -2536,8 +2653,14 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
 
-		if (!page)
+		if (!page) {
+			if (newpage) {
+				unlock_page(newpage);
+				put_page(newpage);
+			}
 			continue;
+		}
+
 		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
 			if (newpage) {
 				unlock_page(newpage);
-- 
2.9.3
