[PATCH v12 09/29] HMM: add mm page table iterator helpers.

From: "Jérôme Glisse" <jglisse@redhat.com>
To: akpm@linux-foundation.org, <linux-kernel@vger.kernel.org>,
	linux-mm@kvack.org
Cc: "Linus Torvalds" <torvalds@linux-foundation.org>,
	joro@8bytes.org, "Mel Gorman" <mgorman@suse.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Andrea Arcangeli" <aarcange@redhat.com>,
	"Johannes Weiner" <jweiner@redhat.com>,
	"Larry Woodman" <lwoodman@redhat.com>,
	"Rik van Riel" <riel@redhat.com>,
	"Dave Airlie" <airlied@redhat.com>,
	"Brendan Conoboy" <blc@redhat.com>,
	"Joe Donohue" <jdonohue@redhat.com>,
	"Christophe Harle" <charle@nvidia.com>,
	"Duncan Poole" <dpoole@nvidia.com>,
	"Sherry Cheung" <SCheung@nvidia.com>,
	"Subhash Gutti" <sgutti@nvidia.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Mark Hairgrove" <mhairgrove@nvidia.com>,
	"Lucien Dunning" <ldunning@nvidia.com>,
	"Cameron Buschardt" <cabuschardt@nvidia.com>,
	"Arvind Gopalakrishnan" <arvindg@nvidia.com>,
	"Haggai Eran" <haggaie@mellanox.com>,
	"Shachar Raindel" <raindel@mellanox.com>,
	"Liran Liss" <liranl@mellanox.com>,
	"Roland Dreier" <roland@purestorage.com>,
	"Ben Sander" <ben.sander@amd.com>,
	"Greg Stoner" <Greg.Stoner@amd.com>,
	"John Bridgman" <John.Bridgman@amd.com>,
	"Michael Mantor" <Michael.Mantor@amd.com>,
	"Paul Blinzer" <Paul.Blinzer@amd.com>,
	"Leonid Shamis" <Leonid.Shamis@amd.com>,
	"Laurent Morichetti" <Laurent.Morichetti@amd.com>,
	"Alexander Deucher" <Alexander.Deucher@amd.com>,
	"Jérôme Glisse" <jglisse@redhat.com>
Subject: [PATCH v12 09/29] HMM: add mm page table iterator helpers.
Date: Tue,  8 Mar 2016 15:43:02 -0500	[thread overview]
Message-ID: <1457469802-11850-10-git-send-email-jglisse@redhat.com> (raw)
In-Reply-To: <1457469802-11850-1-git-send-email-jglisse@redhat.com>

Because inside the mmu_notifier callback we do not have access to the
vma, nor do we know which lock we are holding (the mmap semaphore or
the i_mmap_lock), we can not rely on the regular page table walk (nor
do we want to, as we have to be careful not to split huge pages).

So this patch introduces a helper to iterate over the CPU page table
content in an efficient way for the situation we are in, which is that
we know that none of the page table entries can vanish from under us
and thus it is safe to walk the page table.

The only added value of the iterator is that it keeps the page table
entry level mapped across calls, which fits well with the HMM mirror
page table update code.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
---
 mm/hmm.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index a9bdab5..74e429a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -406,6 +406,107 @@ static struct mmu_notifier_ops hmm_notifier_ops = {
 };
 
 
+/* Page table cursor for one mm; caches the mapped pte table between lookups. */
+struct mm_pt_iter {
+	struct mm_struct	*mm;	/* mm whose page table is walked */
+	pte_t			*ptep;	/* mapped pte table, NULL if none */
+	unsigned long		addr;	/* PMD-aligned base of ptep, or -1UL */
+};
+
+/* Bind @pt_iter to @mm with no pte table cached yet. */
+static void mm_pt_iter_init(struct mm_pt_iter *pt_iter, struct mm_struct *mm)
+{
+	/* NULL ptep and the -1UL address mark the iterator as holding nothing. */
+	*pt_iter = (struct mm_pt_iter){ .mm = mm, .ptep = NULL, .addr = -1UL };
+}
+
+/*
+ * Tear down @pt_iter: unmap the cached pte table, if any, and reset every
+ * field. ptep stays NULL until a lookup maps a regular pmd, and is set back
+ * to NULL whenever a stale mapping is dropped, so it must be checked before
+ * pte_unmap() to keep every unmap paired with a pte_offset_map().
+ */
+static void mm_pt_iter_fini(struct mm_pt_iter *pt_iter)
+{
+	if (pt_iter->ptep)
+		pte_unmap(pt_iter->ptep);
+	pt_iter->ptep = NULL;
+	pt_iter->addr = -1UL;
+	pt_iter->mm = NULL;
+}
+
+/* Report whether @addr lies in [pt_iter->addr, pt_iter->addr + PMD_SIZE). */
+static inline bool mm_pt_iter_in_range(struct mm_pt_iter *pt_iter,
+				       unsigned long addr)
+{
+	unsigned long start = pt_iter->addr;
+
+	return !(addr < start || addr >= start + PMD_SIZE);
+}
+
+/*
+ * mm_pt_iter_page() - return the struct page mapped at @addr, or NULL.
+ * @pt_iter: iterator set up by mm_pt_iter_init(); caches the mapped pte table
+ * @addr: virtual address to look up in pt_iter->mm
+ *
+ * What we are doing here is only valid if we hold either the mmap semaphore
+ * or the i_mmap_lock of the vma->address_space the address belongs to. Sadly
+ * because we can not easily get the vma struct we can not sanity test that
+ * either of those locks is taken. We have to rely on people using this code
+ * knowing what they do.
+ *
+ * Returns NULL when an upper level is empty or bad, when the pte is none,
+ * not present or special, or when it maps the zero page. For a transparent
+ * huge pmd the sub-page covering @addr is returned and nothing is cached.
+ */
+static struct page *mm_pt_iter_page(struct mm_pt_iter *pt_iter,
+				    unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+
+again:
+	/* Fast path: the cached pte table already covers addr. */
+	if (mm_pt_iter_in_range(pt_iter, addr) && likely(pt_iter->ptep)) {
+		pte_t pte = *(pt_iter->ptep + pte_index(addr));
+		unsigned long pfn;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return NULL;
+		if (unlikely(pte_special(pte)))
+			return NULL;
+
+		pfn = pte_pfn(pte);
+		if (is_zero_pfn(pfn))
+			return NULL;
+		return pfn_to_page(pfn);
+	}
+
+	/* The cached table, if any, is for another range: drop it. */
+	if (pt_iter->ptep) {
+		pte_unmap(pt_iter->ptep);
+		pt_iter->ptep = NULL;
+		pt_iter->addr = -1UL;
+	}
+
+	pgdp = pgd_offset(pt_iter->mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		return NULL;
+	pudp = pud_offset(pgdp, addr);
+	if (pud_none_or_clear_bad(pudp))
+		return NULL;
+	pmdp = pmd_offset(pudp, addr);
+	/*
+	 * Because we either have the mmap semaphore or the i_mmap_lock we know
+	 * that pmd can not vanish from under us, thus if pmd exists then it is
+	 * either a huge page or a valid pmd. It might also be in the splitting
+	 * transitory state.
+	 */
+	if (pmd_none(*pmdp))
+		return NULL;
+	if (pmd_trans_huge(*pmdp)) {
+		spinlock_t *ptl;
+
+		/* Recheck under the pmd lock so a concurrent split is seen. */
+		ptl = pmd_lock(pt_iter->mm, pmdp);
+		if (pmd_trans_huge(*pmdp)) {
+			struct page *page;
+
+			page = pmd_page(*pmdp) + pte_index(addr);
+			spin_unlock(ptl);
+			return page;
+		}
+		/* It was morphing from thp to regular, try again. */
+		spin_unlock(ptl);
+		goto again;
+	}
+	/*
+	 * pmd_bad() is only meaningful once the huge case is ruled out: a
+	 * huge pmd is not a pointer to a pte table, so on architectures such
+	 * as x86 pmd_bad() reports it as bad. Testing it earlier would make
+	 * the huge page branch above unreachable.
+	 */
+	if (unlikely(pmd_bad(*pmdp)))
+		return NULL;
+	/* Regular pmd and it can not morph. */
+	pt_iter->ptep = pte_offset_map(pmdp, addr & PMD_MASK);
+	pt_iter->addr = addr & PMD_MASK;
+	goto again;
+}
+
+
 /* hmm_mirror - per device mirroring functions.
  *
  * Each device that mirror a process has a uniq hmm_mirror struct. A process
-- 
2.4.3