From: Zi Yan <zi.yan@sent.com>
To: linux-mm@kvack.org
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
Roman Gushchin <guro@fb.com>, Rik van Riel <riel@surriel.com>,
Matthew Wilcox <willy@infradead.org>,
Shakeel Butt <shakeelb@google.com>,
Yang Shi <shy828301@gmail.com>, Jason Gunthorpe <jgg@nvidia.com>,
Mike Kravetz <mike.kravetz@oracle.com>,
Michal Hocko <mhocko@suse.com>,
David Hildenbrand <david@redhat.com>,
William Kucharski <william.kucharski@oracle.com>,
Andrea Arcangeli <aarcange@redhat.com>,
John Hubbard <jhubbard@nvidia.com>,
David Nellans <dnellans@nvidia.com>,
linux-kernel@vger.kernel.org
Subject: [RFC PATCH v2 01/30] mm/pagewalk: use READ_ONCE when reading the PUD entry unlocked
Date: Mon, 28 Sep 2020 13:53:59 -0400 [thread overview]
Message-ID: <20200928175428.4110504-2-zi.yan@sent.com> (raw)
In-Reply-To: <20200928175428.4110504-1-zi.yan@sent.com>
From: Jason Gunthorpe <jgg@nvidia.com>
The pagewalker runs while only holding the mmap_sem for read. The pud can
be set asynchronously, while also holding the mmap_sem for read
eg from:
handle_mm_fault()
__handle_mm_fault()
create_huge_pmd()
dev_dax_huge_fault()
__dev_dax_pud_fault()
vmf_insert_pfn_pud()
insert_pfn_pud()
pud_lock()
set_pud_at()
At least x86 sets the PUD using WRITE_ONCE(), so an unlocked read of
unstable data should be paired to use READ_ONCE().
For the pagewalker to work locklessly the PUD must work similarly to the
PMD: once the PUD entry becomes a pointer to a PMD, it must be stable, and
safe to pass to pmd_offset()
Passing the value from READ_ONCE into the callbacks prevents the callers
from seeing inconsistencies after they re-read, such as seeing pud_none().
If a callback does obtain the pud_lock then it should trigger ACTION_AGAIN
if a data race caused the original value to change.
Use the same pattern as gup_pmd_range() and pass in the address of the
local READ_ONCE stack variable to pmd_offset() to avoid reading it again.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
include/linux/pagewalk.h | 2 +-
mm/hmm.c | 16 +++++++---------
mm/mapping_dirty_helpers.c | 6 ++----
mm/pagewalk.c | 28 ++++++++++++++++------------
mm/ptdump.c | 3 +--
5 files changed, 27 insertions(+), 28 deletions(-)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..6caf28aadafb 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -39,7 +39,7 @@ struct mm_walk_ops {
unsigned long next, struct mm_walk *walk);
int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
unsigned long next, struct mm_walk *walk);
- int (*pud_entry)(pud_t *pud, unsigned long addr,
+ int (*pud_entry)(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk);
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..419e9e50fd51 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -402,28 +402,26 @@ static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}
-static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
- struct mm_walk *walk)
+static int hmm_vma_walk_pud(pud_t pud, pud_t *pudp, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start;
- pud_t pud;
int ret = 0;
spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
if (!ptl)
return 0;
+ if (memcmp(pudp, &pud, sizeof(pud)) != 0) {
+ walk->action = ACTION_AGAIN;
+ spin_unlock(ptl);
+ return 0;
+ }
/* Normally we don't want to split the huge page */
walk->action = ACTION_CONTINUE;
- pud = READ_ONCE(*pudp);
- if (pud_none(pud)) {
- spin_unlock(ptl);
- return hmm_vma_walk_hole(start, end, -1, walk);
- }
-
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2c7d03675903..9fc46ebef497 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -150,11 +150,9 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
* causes dirty info loss. The pagefault handler should do
* that if needed.
*/
-static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int wp_clean_pud_entry(pud_t pudval, pud_t *pudp, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pud_t pudval = READ_ONCE(*pud);
-
if (!pud_trans_unstable(&pudval))
return 0;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..15d1e423b4a3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
@@ -67,7 +67,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
int err = 0;
int depth = real_depth(3);
- pmd = pmd_offset(pud, addr);
+ pmd = pmd_offset(&pud, addr);
do {
again:
next = pmd_addr_end(addr, end);
@@ -119,17 +119,19 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- pud_t *pud;
+ pud_t *pudp;
+ pud_t pud;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
int depth = real_depth(2);
- pud = pud_offset(p4d, addr);
+ pudp = pud_offset(p4d, addr);
do {
again:
+ pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+ if (pud_none(pud) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -140,27 +142,29 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
walk->action = ACTION_SUBTREE;
if (ops->pud_entry)
- err = ops->pud_entry(pud, addr, next, walk);
+ err = ops->pud_entry(pud, pudp, addr, next, walk);
if (err)
break;
if (walk->action == ACTION_AGAIN)
goto again;
- if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+ if ((!walk->vma && (pud_leaf(pud) || !pud_present(pud))) ||
walk->action == ACTION_CONTINUE ||
!(ops->pmd_entry || ops->pte_entry))
continue;
- if (walk->vma)
- split_huge_pud(walk->vma, pud, addr);
- if (pud_none(*pud))
- goto again;
+ if (walk->vma) {
+ split_huge_pud(walk->vma, pudp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ goto again;
+ }
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
- } while (pud++, addr = next, addr != end);
+ } while (pudp++, addr = next, addr != end);
return err;
}
diff --git a/mm/ptdump.c b/mm/ptdump.c
index ba88ec43ff21..2055b940408e 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -65,11 +65,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return 0;
}
-static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+static int ptdump_pud_entry(pud_t val, pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pud_t val = READ_ONCE(*pud);
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
--
2.28.0
next prev parent reply other threads:[~2020-09-28 17:55 UTC|newest]
Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-09-28 17:53 [RFC PATCH v2 00/30] 1GB PUD THP support on x86_64 Zi Yan
2020-09-28 17:53 ` Zi Yan [this message]
2020-09-28 17:54 ` [RFC PATCH v2 02/30] mm: pagewalk: use READ_ONCE when reading the PMD entry unlocked Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 03/30] mm: thp: use single linked list for THP page table page deposit Zi Yan
2020-09-28 19:34 ` Matthew Wilcox
2020-09-28 20:34 ` Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 04/30] mm: add new helper functions to allocate one PMD page with 512 PTE pages Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 05/30] mm: thp: add page table deposit/withdraw functions for PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 06/30] mm: change thp_order and thp_nr as we will have not just PMD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 07/30] mm: thp: add anonymous PUD THP page fault support without enabling it Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 08/30] mm: thp: add PUD THP support for copy_huge_pud Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 09/30] mm: thp: add PUD THP support to zap_huge_pud Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 10/30] fs: proc: add PUD THP kpageflag Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 11/30] mm: thp: handling PUD THP reference bit Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 12/30] mm: rmap: add mappped/unmapped page order to anonymous page rmap functions Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 13/30] mm: rmap: add map_order to page_remove_anon_compound_rmap Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 14/30] mm: thp: add PUD THP split_huge_pud_page() function Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 15/30] mm: thp: add PUD THP to deferred split list when PUD mapping is gone Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 16/30] mm: debug: adapt dump_page to PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 17/30] mm: thp: PUD THP COW splits PUD page and falls back to PMD page Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 18/30] mm: thp: PUD THP follow_p*d_page() support Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 19/30] mm: stats: make smap stats understand PUD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 20/30] mm: page_vma_walk: teach it about PMD-mapped PUD THP Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 21/30] mm: thp: PUD THP support in try_to_unmap() Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 22/30] mm: thp: split PUD THPs at page reclaim Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 23/30] mm: support PUD THP pagemap support Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 24/30] mm: madvise: add page size options to MADV_HUGEPAGE and MADV_NOHUGEPAGE Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 25/30] mm: vma: add VM_HUGEPAGE_PUD to vm_flags at bit 37 Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 26/30] mm: thp: add a global knob to enable/disable PUD THPs Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 27/30] mm: thp: make PUD THP size public Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 28/30] hugetlb: cma: move cma reserve function to cma.c Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 29/30] mm: thp: use cma reservation for pud thp allocation Zi Yan
2020-09-28 17:54 ` [RFC PATCH v2 30/30] mm: thp: enable anonymous PUD THP at page fault path Zi Yan
2020-09-30 11:55 ` [RFC PATCH v2 00/30] 1GB PUD THP support on x86_64 Michal Hocko
2020-10-01 15:14 ` Zi Yan
2020-10-02 7:32 ` Michal Hocko
2020-10-02 7:50 ` David Hildenbrand
2020-10-02 8:10 ` Michal Hocko
2020-10-02 8:30 ` David Hildenbrand
2020-10-05 15:03 ` Zi Yan
2020-10-05 15:55 ` Matthew Wilcox
2020-10-05 17:04 ` Roman Gushchin
2020-10-05 19:12 ` Zi Yan
2020-10-05 19:37 ` Matthew Wilcox
2020-10-05 17:16 ` Roman Gushchin
2020-10-05 17:27 ` David Hildenbrand
2020-10-05 18:25 ` Roman Gushchin
2020-10-05 18:33 ` David Hildenbrand
2020-10-05 19:11 ` Roman Gushchin
2020-10-06 8:25 ` David Hildenbrand
2020-10-05 17:39 ` David Hildenbrand
2020-10-05 18:05 ` Zi Yan
2020-10-05 18:48 ` David Hildenbrand
2020-10-06 11:59 ` Michal Hocko
2020-10-05 15:34 ` Zi Yan
2020-10-05 17:30 ` David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200928175428.4110504-2-zi.yan@sent.com \
--to=zi.yan@sent.com \
--cc=aarcange@redhat.com \
--cc=david@redhat.com \
--cc=dnellans@nvidia.com \
--cc=guro@fb.com \
--cc=jgg@nvidia.com \
--cc=jhubbard@nvidia.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@suse.com \
--cc=mike.kravetz@oracle.com \
--cc=riel@surriel.com \
--cc=shakeelb@google.com \
--cc=shy828301@gmail.com \
--cc=william.kucharski@oracle.com \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).