All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Jérôme Glisse" <jglisse@redhat.com>
To: akpm@linux-foundation.org, <linux-kernel@vger.kernel.org>,
	linux-mm@kvack.org
Cc: "Linus Torvalds" <torvalds@linux-foundation.org>,
	joro@8bytes.org, "Mel Gorman" <mgorman@suse.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Andrea Arcangeli" <aarcange@redhat.com>,
	"Johannes Weiner" <jweiner@redhat.com>,
	"Larry Woodman" <lwoodman@redhat.com>,
	"Rik van Riel" <riel@redhat.com>,
	"Dave Airlie" <airlied@redhat.com>,
	"Brendan Conoboy" <blc@redhat.com>,
	"Joe Donohue" <jdonohue@redhat.com>,
	"Christophe Harle" <charle@nvidia.com>,
	"Duncan Poole" <dpoole@nvidia.com>,
	"Sherry Cheung" <SCheung@nvidia.com>,
	"Subhash Gutti" <sgutti@nvidia.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Mark Hairgrove" <mhairgrove@nvidia.com>,
	"Lucien Dunning" <ldunning@nvidia.com>,
	"Cameron Buschardt" <cabuschardt@nvidia.com>,
	"Arvind Gopalakrishnan" <arvindg@nvidia.com>,
	"Haggai Eran" <haggaie@mellanox.com>,
	"Shachar Raindel" <raindel@mellanox.com>,
	"Liran Liss" <liranl@mellanox.com>,
	"Roland Dreier" <roland@purestorage.com>,
	"Ben Sander" <ben.sander@amd.com>,
	"Greg Stoner" <Greg.Stoner@amd.com>,
	"John Bridgman" <John.Bridgman@amd.com>,
	"Michael Mantor" <Michael.Mantor@amd.com>,
	"Paul Blinzer" <Paul.Blinzer@amd.com>,
	"Leonid Shamis" <Leonid.Shamis@amd.com>,
	"Laurent Morichetti" <Laurent.Morichetti@amd.com>,
	"Alexander Deucher" <Alexander.Deucher@amd.com>,
	"Jérôme Glisse" <jglisse@redhat.com>
Subject: [PATCH v12 09/29] HMM: add mm page table iterator helpers.
Date: Tue,  8 Mar 2016 15:43:02 -0500	[thread overview]
Message-ID: <1457469802-11850-10-git-send-email-jglisse@redhat.com> (raw)
In-Reply-To: <1457469802-11850-1-git-send-email-jglisse@redhat.com>

Because inside the mmu_notifier callback we do not have access to the
vma, nor do we know which lock we are holding (the mmap semaphore or
the i_mmap_lock), we can not rely on the regular page table walk (nor
do we want to, as we have to be careful not to split huge pages).

So this patch introduces a helper to iterate over the CPU page table
content in an efficient way for the situation we are in, which is that
we know that none of the page table entries can vanish from under us
and thus it is safe to walk the page table.

The only added value of the iterator is that it keeps the page table
entry level mapped across calls, which fits well with the HMM mirror
page table update code.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
---
 mm/hmm.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index a9bdab5..74e429a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -406,6 +406,107 @@ static struct mmu_notifier_ops hmm_notifier_ops = {
 };
 
 
+struct mm_pt_iter {
+	struct mm_struct	*mm;	/* mm whose page table is walked */
+	pte_t			*ptep;	/* mapped pte page, NULL if none */
+	unsigned long		addr;	/* PMD-aligned base of ptep, or -1UL */
+};
+
+static void mm_pt_iter_init(struct mm_pt_iter *pt_iter, struct mm_struct *mm)
+{
+	pt_iter->mm = mm;
+	pt_iter->ptep = NULL;		/* nothing mapped yet */
+	pt_iter->addr = -1UL;		/* sentinel: no range cached */
+}
+
+static void mm_pt_iter_fini(struct mm_pt_iter *pt_iter)
+{
+	pte_unmap(pt_iter->ptep);	/* NOTE(review): ptep can be NULL */
+	pt_iter->ptep = NULL;		/* back to the empty state */
+	pt_iter->addr = -1UL;
+	pt_iter->mm = NULL;
+}
+
+static inline bool mm_pt_iter_in_range(struct mm_pt_iter *pt_iter,
+				       unsigned long addr)
+{	/* Is addr inside [pt_iter->addr, pt_iter->addr + PMD_SIZE)? */
+	return (addr >= pt_iter->addr && addr < (pt_iter->addr + PMD_SIZE));
+}
+
+static struct page *mm_pt_iter_page(struct mm_pt_iter *pt_iter,
+				    unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+
+again:
+	/*
+	 * Returns the struct page mapped at addr, or NULL (see checks below).
+	 * This is only valid if we hold either the mmap semaphore or the
+	 * i_mmap_lock of the vma->address_space the address belongs to.
+	 * Sadly, because we can not easily get the vma struct, we can not
+	 * sanity test that either of those locks is taken.
+	 * We have to rely on people using this code knowing what they do.
+	 */
+	if (mm_pt_iter_in_range(pt_iter, addr) && likely(pt_iter->ptep)) {
+		pte_t pte = *(pt_iter->ptep + pte_index(addr));
+		unsigned long pfn;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return NULL;	/* empty or non-present entry */
+		if (unlikely(pte_special(pte)))
+			return NULL;	/* special pte: not reported */
+
+		pfn = pte_pfn(pte);
+		if (is_zero_pfn(pfn))
+			return NULL;	/* zero pfn: not reported */
+		return pfn_to_page(pfn);
+	}
+
+	if (pt_iter->ptep) {	/* cached range does not cover addr */
+		pte_unmap(pt_iter->ptep);
+		pt_iter->ptep = NULL;
+		pt_iter->addr = -1UL;
+	}
+
+	pgdp = pgd_offset(pt_iter->mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		return NULL;
+	pudp = pud_offset(pgdp, addr);
+	if (pud_none_or_clear_bad(pudp))
+		return NULL;
+	pmdp = pmd_offset(pudp, addr);
+	/*
+	 * Because we either have the mmap semaphore or the i_mmap_lock we know
+	 * that the pmd can not vanish from under us, thus if the pmd exists it
+	 * is either a huge page or a valid pmd. It might also be in the
+	 * splitting transitory state.
+	 */
+	if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp)))
+		return NULL;
+	if (pmd_trans_huge(*pmdp)) {
+		spinlock_t *ptl;
+
+		ptl = pmd_lock(pt_iter->mm, pmdp);
+		if (pmd_trans_huge(*pmdp)) {	/* recheck under the lock */
+			struct page *page;
+
+			page = pmd_page(*pmdp) + pte_index(addr);
+			spin_unlock(ptl);
+			return page;
+		}
+		/* It was morphing from thp to regular, try again. */
+		spin_unlock(ptl);
+		goto again;
+	}
+	/* Regular pmd and it can not morph. */
+	pt_iter->ptep = pte_offset_map(pmdp, addr & PMD_MASK);
+	pt_iter->addr = addr & PMD_MASK;
+	goto again;
+}
+
+
 /* hmm_mirror - per device mirroring functions.
  *
  * Each device that mirror a process has a uniq hmm_mirror struct. A process
-- 
2.4.3

WARNING: multiple messages have this Message-ID (diff)
From: "Jérôme Glisse" <jglisse@redhat.com>
To: akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org
Cc: "Linus Torvalds" <torvalds@linux-foundation.org>,
	joro@8bytes.org, "Mel Gorman" <mgorman@suse.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Andrea Arcangeli" <aarcange@redhat.com>,
	"Johannes Weiner" <jweiner@redhat.com>,
	"Larry Woodman" <lwoodman@redhat.com>,
	"Rik van Riel" <riel@redhat.com>,
	"Dave Airlie" <airlied@redhat.com>,
	"Brendan Conoboy" <blc@redhat.com>,
	"Joe Donohue" <jdonohue@redhat.com>,
	"Christophe Harle" <charle@nvidia.com>,
	"Duncan Poole" <dpoole@nvidia.com>,
	"Sherry Cheung" <SCheung@nvidia.com>,
	"Subhash Gutti" <sgutti@nvidia.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Mark Hairgrove" <mhairgrove@nvidia.com>,
	"Lucien Dunning" <ldunning@nvidia.com>,
	"Cameron Buschardt" <cabuschardt@nvidia.com>,
	"Arvind Gopalakrishnan" <arvindg@nvidia.com>,
	"Haggai Eran" <haggaie@mellanox.com>,
	"Shachar Raindel" <raindel@mellanox.com>,
	"Liran Liss" <liranl@mellanox.com>,
	"Roland Dreier" <roland@purestorage.com>,
	"Ben Sander" <ben.sander@amd.com>,
	"Greg Stoner" <Greg.Stoner@amd.com>,
	"John Bridgman" <John.Bridgman@amd.com>,
	"Michael Mantor" <Michael.Mantor@amd.com>,
	"Paul Blinzer" <Paul.Blinzer@amd.com>,
	"Leonid Shamis" <Leonid.Shamis@amd.com>,
	"Laurent Morichetti" <Laurent.Morichetti@amd.com>,
	"Alexander Deucher" <Alexander.Deucher@amd.com>,
	"Jérôme Glisse" <jglisse@redhat.com>
Subject: [PATCH v12 09/29] HMM: add mm page table iterator helpers.
Date: Tue,  8 Mar 2016 15:43:02 -0500	[thread overview]
Message-ID: <1457469802-11850-10-git-send-email-jglisse@redhat.com> (raw)
In-Reply-To: <1457469802-11850-1-git-send-email-jglisse@redhat.com>

Because inside the mmu_notifier callback we do not have access to the
vma, nor do we know which lock we are holding (the mmap semaphore or
the i_mmap_lock), we can not rely on the regular page table walk (nor
do we want to, as we have to be careful not to split huge pages).

So this patch introduces a helper to iterate over the CPU page table
content in an efficient way for the situation we are in, which is that
we know that none of the page table entries can vanish from under us
and thus it is safe to walk the page table.

The only added value of the iterator is that it keeps the page table
entry level mapped across calls, which fits well with the HMM mirror
page table update code.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
---
 mm/hmm.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index a9bdab5..74e429a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -406,6 +406,107 @@ static struct mmu_notifier_ops hmm_notifier_ops = {
 };
 
 
+struct mm_pt_iter {
+	struct mm_struct	*mm;	/* mm whose page table is walked */
+	pte_t			*ptep;	/* mapped pte page, NULL if none */
+	unsigned long		addr;	/* PMD-aligned base of ptep, or -1UL */
+};
+
+static void mm_pt_iter_init(struct mm_pt_iter *pt_iter, struct mm_struct *mm)
+{
+	pt_iter->mm = mm;
+	pt_iter->ptep = NULL;		/* nothing mapped yet */
+	pt_iter->addr = -1UL;		/* sentinel: no range cached */
+}
+
+static void mm_pt_iter_fini(struct mm_pt_iter *pt_iter)
+{
+	pte_unmap(pt_iter->ptep);	/* NOTE(review): ptep can be NULL */
+	pt_iter->ptep = NULL;		/* back to the empty state */
+	pt_iter->addr = -1UL;
+	pt_iter->mm = NULL;
+}
+
+static inline bool mm_pt_iter_in_range(struct mm_pt_iter *pt_iter,
+				       unsigned long addr)
+{	/* Is addr inside [pt_iter->addr, pt_iter->addr + PMD_SIZE)? */
+	return (addr >= pt_iter->addr && addr < (pt_iter->addr + PMD_SIZE));
+}
+
+static struct page *mm_pt_iter_page(struct mm_pt_iter *pt_iter,
+				    unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+
+again:
+	/*
+	 * Returns the struct page mapped at addr, or NULL (see checks below).
+	 * This is only valid if we hold either the mmap semaphore or the
+	 * i_mmap_lock of the vma->address_space the address belongs to.
+	 * Sadly, because we can not easily get the vma struct, we can not
+	 * sanity test that either of those locks is taken.
+	 * We have to rely on people using this code knowing what they do.
+	 */
+	if (mm_pt_iter_in_range(pt_iter, addr) && likely(pt_iter->ptep)) {
+		pte_t pte = *(pt_iter->ptep + pte_index(addr));
+		unsigned long pfn;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return NULL;	/* empty or non-present entry */
+		if (unlikely(pte_special(pte)))
+			return NULL;	/* special pte: not reported */
+
+		pfn = pte_pfn(pte);
+		if (is_zero_pfn(pfn))
+			return NULL;	/* zero pfn: not reported */
+		return pfn_to_page(pfn);
+	}
+
+	if (pt_iter->ptep) {	/* cached range does not cover addr */
+		pte_unmap(pt_iter->ptep);
+		pt_iter->ptep = NULL;
+		pt_iter->addr = -1UL;
+	}
+
+	pgdp = pgd_offset(pt_iter->mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		return NULL;
+	pudp = pud_offset(pgdp, addr);
+	if (pud_none_or_clear_bad(pudp))
+		return NULL;
+	pmdp = pmd_offset(pudp, addr);
+	/*
+	 * Because we either have the mmap semaphore or the i_mmap_lock we know
+	 * that the pmd can not vanish from under us, thus if the pmd exists it
+	 * is either a huge page or a valid pmd. It might also be in the
+	 * splitting transitory state.
+	 */
+	if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp)))
+		return NULL;
+	if (pmd_trans_huge(*pmdp)) {
+		spinlock_t *ptl;
+
+		ptl = pmd_lock(pt_iter->mm, pmdp);
+		if (pmd_trans_huge(*pmdp)) {	/* recheck under the lock */
+			struct page *page;
+
+			page = pmd_page(*pmdp) + pte_index(addr);
+			spin_unlock(ptl);
+			return page;
+		}
+		/* It was morphing from thp to regular, try again. */
+		spin_unlock(ptl);
+		goto again;
+	}
+	/* Regular pmd and it can not morph. */
+	pt_iter->ptep = pte_offset_map(pmdp, addr & PMD_MASK);
+	pt_iter->addr = addr & PMD_MASK;
+	goto again;
+}
+
+
 /* hmm_mirror - per device mirroring functions.
  *
  * Each device that mirror a process has a uniq hmm_mirror struct. A process
-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2016-03-08 19:47 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-08 20:42 HMM (Heterogeneous Memory Management) Jérôme Glisse
2016-03-08 20:42 ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 01/29] mmu_notifier: add event information to address invalidation v9 Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 02/29] mmu_notifier: keep track of active invalidation ranges v5 Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 03/29] mmu_notifier: pass page pointer to mmu_notifier_invalidate_page() v2 Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 04/29] mmu_notifier: allow range invalidation to exclude a specific mmu_notifier Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 05/29] HMM: introduce heterogeneous memory management v5 Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:42 ` [PATCH v12 06/29] HMM: add HMM page table v4 Jérôme Glisse
2016-03-08 20:42   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 07/29] HMM: add per mirror " Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-29 22:58   ` John Hubbard
2016-03-29 22:58     ` John Hubbard
2016-03-08 20:43 ` [PATCH v12 08/29] HMM: add device page fault support v6 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-23  6:52   ` Aneesh Kumar K.V
2016-03-23  6:52     ` Aneesh Kumar K.V
2016-03-23 10:09     ` Jerome Glisse
2016-03-23 10:09       ` Jerome Glisse
2016-03-23 10:29       ` Aneesh Kumar K.V
2016-03-23 10:29         ` Aneesh Kumar K.V
2016-03-23 11:25         ` Jerome Glisse
2016-03-23 11:25           ` Jerome Glisse
2016-03-08 20:43 ` Jérôme Glisse [this message]
2016-03-08 20:43   ` [PATCH v12 09/29] HMM: add mm page table iterator helpers Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 10/29] HMM: use CPU page table during invalidation Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 11/29] HMM: add discard range helper (to clear and free resources for a range) Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 12/29] HMM: add dirty range helper (toggle dirty bit inside mirror page table) v2 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 13/29] HMM: DMA map memory on behalf of device driver v2 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 14/29] HMM: Add support for hugetlb Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 15/29] HMM: add documentation explaining HMM internals and how to use it Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 16/29] fork: pass the dst vma to copy_page_range() and its sub-functions Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 17/29] HMM: add special swap filetype for memory migrated to device v2 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 18/29] HMM: add new HMM page table flag (valid device memory) Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 19/29] HMM: add new HMM page table flag (select flag) Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 20/29] HMM: handle HMM device page table entry on mirror page table fault and update Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 21/29] HMM: mm add helper to update page table when migrating memory back v2 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-21 11:27   ` Aneesh Kumar K.V
2016-03-21 11:27     ` Aneesh Kumar K.V
2016-03-21 12:02     ` Jerome Glisse
2016-03-21 12:02       ` Jerome Glisse
2016-03-21 13:48       ` Aneesh Kumar K.V
2016-03-21 13:48         ` Aneesh Kumar K.V
2016-03-21 14:30         ` Jerome Glisse
2016-03-21 14:30           ` Jerome Glisse
2016-03-08 20:43 ` [PATCH v12 22/29] HMM: mm add helper to update page table when migrating memory v3 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-21 14:24   ` Aneesh Kumar K.V
2016-03-21 14:24     ` Aneesh Kumar K.V
2016-03-08 20:43 ` [PATCH v12 23/29] HMM: new callback for copying memory from and to device memory v2 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 24/29] HMM: allow to get pointer to spinlock protecting a directory Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 25/29] HMM: split DMA mapping function in two Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 26/29] HMM: add helpers for migration back to system memory v3 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 27/29] HMM: fork copy migrated memory into system memory for child process Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 28/29] HMM: CPU page fault on migrated memory Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 20:43 ` [PATCH v12 29/29] HMM: add mirror fault support for system to device memory migration v3 Jérôme Glisse
2016-03-08 20:43   ` Jérôme Glisse
2016-03-08 22:02 ` HMM (Heterogeneous Memory Management) John Hubbard

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1457469802-11850-10-git-send-email-jglisse@redhat.com \
    --to=jglisse@redhat.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Greg.Stoner@amd.com \
    --cc=John.Bridgman@amd.com \
    --cc=Laurent.Morichetti@amd.com \
    --cc=Leonid.Shamis@amd.com \
    --cc=Michael.Mantor@amd.com \
    --cc=Paul.Blinzer@amd.com \
    --cc=SCheung@nvidia.com \
    --cc=aarcange@redhat.com \
    --cc=airlied@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=arvindg@nvidia.com \
    --cc=ben.sander@amd.com \
    --cc=blc@redhat.com \
    --cc=cabuschardt@nvidia.com \
    --cc=charle@nvidia.com \
    --cc=dpoole@nvidia.com \
    --cc=haggaie@mellanox.com \
    --cc=hpa@zytor.com \
    --cc=jdonohue@redhat.com \
    --cc=jhubbard@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=jweiner@redhat.com \
    --cc=ldunning@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=liranl@mellanox.com \
    --cc=lwoodman@redhat.com \
    --cc=mgorman@suse.de \
    --cc=mhairgrove@nvidia.com \
    --cc=peterz@infradead.org \
    --cc=raindel@mellanox.com \
    --cc=riel@redhat.com \
    --cc=roland@purestorage.com \
    --cc=sgutti@nvidia.com \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.