From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, tglx@linutronix.de,
hannes@cmpxchg.org, mhocko@kernel.org, vdavydov.dev@gmail.com
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, songmuchun@bytedance.com,
Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH 6/7] mm: defer freeing PTE page table for a grace period
Date: Sun, 18 Jul 2021 12:30:32 +0800
Message-ID: <20210718043034.76431-7-zhengqi.arch@bytedance.com>
In-Reply-To: <20210718043034.76431-1-zhengqi.arch@bytedance.com>
Switch free_pte_table() to release the PTE page table page via
call_rcu(), so the actual freeing is deferred until an RCU grace
period has elapsed. Readers running under rcu_read_lock() can then
safely sample the pmd entry and try to grab a reference on the page
table, which means pte_try_get() and pte_alloc_get() no longer need
to take the pmd lock. This improves performance and simplifies the
code.
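In outline, the change to pte_try_get() looks like this (a condensed
before/after sketch with the failure path folded into a boolean; see
the diff below for the real code):

	/* Before: the pmd lock serialized us against free_pte_table(). */
	ptl = pmd_lock(mm, pmdp);
	retval = !pmd_leaf(*pmdp) && pmd_present(*pmdp) &&
		 pte_get_unless_zero(pmdp);
	spin_unlock(ptl);

	/*
	 * After: an RCU read-side section is enough. A racing
	 * free_pte_table() only queues the page with call_rcu(), so
	 * the PTE page cannot be freed and reused while we sit
	 * between rcu_read_lock() and rcu_read_unlock().
	 */
	rcu_read_lock();
	pmdval = READ_ONCE(*pmdp);
	retval = !pmd_leaf(pmdval) && pmd_present(pmdval) &&
		 pte_get_unless_zero(&pmdval);
	rcu_read_unlock();

The same pattern replaces the pmd lock in pte_alloc_get().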
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
fs/proc/task_mmu.c | 8 ++++----
include/linux/pte_ref.h | 28 +++++++++++++++-------------
mm/gup.c | 2 +-
mm/hmm.c | 2 +-
mm/khugepaged.c | 4 ++--
mm/ksm.c | 2 +-
mm/madvise.c | 6 +++---
mm/memcontrol.c | 4 ++--
mm/memory.c | 14 +++++++-------
mm/mempolicy.c | 2 +-
mm/migrate.c | 2 +-
mm/mincore.c | 2 +-
mm/mprotect.c | 2 +-
mm/page_vma_mapped.c | 2 +-
mm/pagewalk.c | 2 +-
mm/pte_ref.c | 10 +++++++++-
mm/swapfile.c | 2 +-
17 files changed, 52 insertions(+), 42 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b3cf4b8a91d6..f3c9c984bc29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -584,7 +584,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
goto out;
/*
* The mmap_lock held all the way back in m_start() is what
@@ -1148,7 +1148,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
}
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1482,7 +1482,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
return 0;
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- if (!pte_try_get(walk->mm, pmdp))
+ if (!pte_try_get(pmdp))
return 0;
/*
@@ -1824,7 +1824,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
#endif
- if (!pte_try_get(walk->mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 695fbe8b991b..f4d20faab684 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -74,16 +74,17 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
* i_mmap_lock or when parallel threads are excluded by other means
* which can make @pmdp entry stable.
*/
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
{
bool retval = true;
- spinlock_t *ptl;
+ pmd_t pmdval;
- ptl = pmd_lock(mm, pmdp);
- if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
- !pte_get_unless_zero(pmdp))
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmdval) || !pmd_present(pmdval) ||
+ !pte_get_unless_zero(&pmdval))
retval = false;
- spin_unlock(ptl);
+ rcu_read_unlock();
return retval;
}
@@ -129,21 +130,22 @@ static inline void pte_put_vmf(struct vm_fault *vmf)
static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
{
- if (!pte_try_get(mm, pmdp))
+ if (!pte_try_get(pmdp))
return __pte_alloc_try_get(mm, pmdp);
return 1;
}
static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
{
- spinlock_t *ptl;
+ pmd_t pmdval;
- ptl = pmd_lock(mm, pmdp);
- if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
- spin_unlock(ptl);
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pmdp);
+ if (pmd_none(pmdval) || !pte_get_unless_zero(&pmdval)) {
+ rcu_read_unlock();
return __pte_alloc_get(mm, pmdp);
}
- spin_unlock(ptl);
+ rcu_read_unlock();
return 0;
}
#else
@@ -173,7 +175,7 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
return true;
}
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
{
return true;
}
diff --git a/mm/gup.c b/mm/gup.c
index 3e2a153cb18e..a5be18e349cd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -503,7 +503,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index 29bb379510cc..d0e767c5fbb6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
- if (!pte_try_get(walk->mm, pmdp))
+ if (!pte_try_get(pmdp))
goto again;
ptep = pte_offset_map(pmdp, addr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e6c4d1b7a12a..c653edd75345 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,7 +1240,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out;
}
- if (!pte_try_get(mm, pmd)) {
+ if (!pte_try_get(pmd)) {
result = SCAN_PMD_NULL;
goto out;
}
@@ -1469,7 +1469,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!pmd)
goto drop_hpage;
- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
diff --git a/mm/ksm.c b/mm/ksm.c
index 2e106f58dad0..5671683890c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1133,7 +1133,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto out;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 4c4b35292212..0e849bbf268b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -193,7 +193,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;
for (index = start; index != end; index += PAGE_SIZE) {
@@ -396,7 +396,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
#endif
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -596,7 +596,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
goto next;
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
- pmd_trans_unstable(pmd)) || !pte_try_get(mm, pmd))
+ pmd_trans_unstable(pmd)) || !pte_try_get(pmd))
return 0;
nr_put++;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f19e5f2cd18..f8c1cabdd259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5835,7 +5835,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
}
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6058,7 +6058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
return 0;
retry:
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 242ed135bde4..c8ee0074c730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1143,7 +1143,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
if (pmd_none_or_clear_bad(src_pmd))
continue;
- if (!pte_try_get(src_mm, src_pmd))
+ if (!pte_try_get(src_pmd))
goto retry;
if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
addr, next)) {
@@ -1481,7 +1481,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
*/
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(tlb->mm, pmd))
+ !pte_try_get(pmd))
goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
@@ -2608,7 +2608,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
continue;
pmd_clear_bad(pmd);
}
- if (!create && !pte_try_get(mm, pmd))
+ if (!create && !pte_try_get(pmd))
goto retry;
err = apply_to_pte_range(mm, pmd, addr, next,
fn, data, create, mask);
@@ -4078,7 +4078,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
}
} else if (pmd_devmap_trans_unstable(vmf->pmd)) { /* See comment in handle_pte_fault() */
return 0;
- } else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+ } else if (!pte_try_get(vmf->pmd)) {
goto retry;
}
@@ -4319,7 +4319,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
goto out;
} else {
- if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+ if (!pte_try_get(vmf->pmd)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
@@ -4579,7 +4579,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
- if (!pte_try_get(vmf->vma->vm_mm, vmf->pmd))
+ if (!pte_try_get(vmf->pmd))
goto retry;
if (IS_ENABLED(CONFIG_FREE_USER_PTE))
@@ -5000,7 +5000,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
(address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
- if (!pte_try_get(mm, pmd))
+ if (!pte_try_get(pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cbb3640717ff..b19243d8fe56 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -520,7 +520,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
/* THP was split, fall through to pte walk */
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(walk->mm, pmd))
+ !pte_try_get(pmd))
return 0;
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a94e8558b2c..e1a2169ab9e9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2265,7 +2265,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (unlikely(pmd_bad(*pmdp)))
return migrate_vma_collect_skip(start, end, walk);
- if (!pte_try_get(mm, pmdp))
+ if (!pte_try_get(pmdp))
goto again;
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
arch_enter_lazy_mmu_mode();
diff --git a/mm/mincore.c b/mm/mincore.c
index e21e271a7657..76eb31aaeef9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -115,7 +115,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
- !pte_try_get(walk->mm, pmd)) {
+ !pte_try_get(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec);
goto out;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9cbd0848c5c5..8b387f8386c4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -319,7 +319,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
/* fall through, the trans huge pmd just split */
}
- if (!pte_try_get(vma->vm_mm, pmd))
+ if (!pte_try_get(pmd))
goto retry;
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
cp_flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb84fa5825c0..c49bbff7aa60 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -259,7 +259,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!pte_try_get(pvmw->vma->vm_mm, pvmw->pmd))
+ if (!pte_try_get(pvmw->pmd))
goto retry;
if (!map_pte(pvmw))
goto next_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4080a88d7852..c7439a2e85f7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -152,7 +152,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
} else {
if (!walk->no_vma) {
- if (!pte_try_get(walk->mm, pmd))
+ if (!pte_try_get(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
pte_put(walk->mm, pmd, addr);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 1b8d9828d513..7fd3d687a9cd 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -26,6 +26,14 @@ static inline void pte_free_debug(pmd_t pmd)
}
#endif
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+ struct page *page = container_of(rcu, struct page, rcu_head);
+
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
+}
+
void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
{
struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
@@ -39,7 +47,7 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
pte_free_debug(pmd);
flush_tlb_range(&vma, addr, addr + PMD_SIZE);
mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(pmd));
+ call_rcu(&pmd_pgtable(pmd)->rcu_head, pte_free_rcu);
}
static inline void __pte_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6153283be500..fe6f7c6d2849 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2024,7 +2024,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
- !pte_try_get(vma->vm_mm, pmd))
+ !pte_try_get(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, type,
frontswap, fs_pages_to_unuse);
--
2.11.0