From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, tglx@linutronix.de,
kirill.shutemov@linux.intel.com, mika.penttila@nextfour.com,
david@redhat.com, jgg@nvidia.com
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, songmuchun@bytedance.com,
zhouchengming@bytedance.com,
Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH v3 13/15] mm/pte_ref: free user PTE page table pages
Date: Wed, 10 Nov 2021 18:54:26 +0800 [thread overview]
Message-ID: <20211110105428.32458-14-zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <20211110105428.32458-1-zhengqi.arch@bytedance.com>
This commit introduces the CONFIG_FREE_USER_PTE option and
replaces the dummy pte_ref helpers with their actual
implementation.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/linux/mm.h | 2 ++
include/linux/pgtable.h | 3 +-
include/linux/pte_ref.h | 53 ++++++++++++++++++++++++----
mm/Kconfig | 4 +++
mm/debug_vm_pgtable.c | 2 ++
mm/memory.c | 15 ++++++++
mm/pte_ref.c | 91 +++++++++++++++++++++++++++++++++++++++++++++----
7 files changed, 156 insertions(+), 14 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 753a9435e0d0..18fbf9e0996a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -437,6 +437,7 @@ extern pgprot_t protection_map[16];
* @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
* @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
* @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ * @FAULT_FLAG_PTE_GET: A reference on the PTE page table page has been taken.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
@@ -468,6 +469,7 @@ enum fault_flag {
FAULT_FLAG_REMOTE = 1 << 7,
FAULT_FLAG_INSTRUCTION = 1 << 8,
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
+ FAULT_FLAG_PTE_GET = 1 << 10,
};
/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c8f045705c1e..6ac51d58f11a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -480,7 +480,6 @@ static inline pte_t ptep_get_lockless(pte_t *ptep)
}
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long address,
@@ -491,6 +490,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
return pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long address,
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index b6d8335bdc59..8a26eaba83ef 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -8,6 +8,7 @@
#define _LINUX_PTE_REF_H
#include <linux/pgtable.h>
+#include <linux/page-flags.h>
enum pte_tryget_type {
TRYGET_SUCCESSED,
@@ -16,12 +17,49 @@ enum pte_tryget_type {
TRYGET_FAILED_HUGE_PMD,
};
-bool pte_get_unless_zero(pmd_t *pmd);
-enum pte_tryget_type pte_try_get(pmd_t *pmd);
void pte_put_vmf(struct vm_fault *vmf);
+enum pte_tryget_type pte_try_get(pmd_t *pmd);
+bool pte_get_unless_zero(pmd_t *pmd);
+
+#ifdef CONFIG_FREE_USER_PTE
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr);
static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
{
+ pte->pmd = pmd;
+ atomic_set(&pte->pte_refcount, count);
+}
+
+static inline pmd_t *pte_to_pmd(pte_t *pte)
+{
+ return virt_to_page(pte)->pmd;
+}
+
+static inline void pte_update_pmd(pmd_t old_pmd, pmd_t *new_pmd)
+{
+ pmd_pgtable(old_pmd)->pmd = new_pmd;
+}
+
+static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
+{
+ pgtable_t pte = pmd_pgtable(*pmd);
+
+ VM_BUG_ON(!PageTable(pte));
+ atomic_add(nr, &pte->pte_refcount);
+}
+
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned int nr)
+{
+ pgtable_t pte = pmd_pgtable(*pmd);
+
+ VM_BUG_ON(!PageTable(pte));
+ if (atomic_sub_and_test(nr, &pte->pte_refcount))
+ free_user_pte_table(mm, pmd, addr & PMD_MASK);
+}
+#else
+static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
+{
}
static inline pmd_t *pte_to_pmd(pte_t *pte)
@@ -37,6 +75,12 @@ static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
{
}
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned int nr)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
/*
* pte_get - Increment refcount for the PTE page table.
* @pmd: a pointer to the pmd entry corresponding to the PTE page table.
@@ -66,11 +110,6 @@ static inline pte_t *pte_tryget_map_lock(struct mm_struct *mm, pmd_t *pmd,
return pte_offset_map_lock(mm, pmd, address, ptlp);
}
-static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned int nr)
-{
-}
-
/*
* pte_put - Decrement refcount for the PTE page table.
* @mm: the mm_struct of the target address space.
diff --git a/mm/Kconfig b/mm/Kconfig
index 5c5508fafcec..44549d287869 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -898,6 +898,10 @@ config IO_MAPPING
config SECRETMEM
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+config FREE_USER_PTE
+ def_bool y
+ depends on X86_64
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 52f006654664..757dd84ee254 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1049,8 +1049,10 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free page table entries */
if (args->start_ptep) {
pte_put(args->mm, args->start_pmdp, args->vaddr);
+#ifndef CONFIG_FREE_USER_PTE
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
+#endif
}
if (args->start_pmdp) {
diff --git a/mm/memory.c b/mm/memory.c
index e360ecd37a71..4d1ede78d1b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -219,6 +219,17 @@ static void check_sync_rss_stat(struct task_struct *task)
#endif /* SPLIT_RSS_COUNTING */
+#ifdef CONFIG_FREE_USER_PTE
+static inline void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr)
+{
+ /*
+ * We should never reach here: PTE page tables are freed as soon
+ * as their refcount drops to zero.
+ */
+ BUG();
+}
+#else
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -231,6 +242,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
pte_free_tlb(tlb, token, addr);
mm_dec_nr_ptes(tlb->mm);
}
+#endif /* CONFIG_FREE_USER_PTE */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
@@ -4631,6 +4643,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
goto retry;
vmf->orig_pte = *vmf->pte;
+ if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+ vmf->flags |= FAULT_FLAG_PTE_GET;
+
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index de109905bc8f..728e61cea25e 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -7,7 +7,10 @@
#include <linux/pte_ref.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#ifdef CONFIG_FREE_USER_PTE
/*
* pte_get_unless_zero - Increment refcount for the PTE page table
* unless it is zero.
@@ -15,7 +18,10 @@
*/
bool pte_get_unless_zero(pmd_t *pmd)
{
- return true;
+ pgtable_t pte = pmd_pgtable(*pmd);
+
+ VM_BUG_ON(!PageTable(pte));
+ return atomic_inc_not_zero(&pte->pte_refcount);
}
/*
@@ -32,12 +38,20 @@ bool pte_get_unless_zero(pmd_t *pmd)
*/
enum pte_tryget_type pte_try_get(pmd_t *pmd)
{
- if (unlikely(pmd_none(*pmd)))
- return TRYGET_FAILED_NONE;
- if (unlikely(is_huge_pmd(*pmd)))
- return TRYGET_FAILED_HUGE_PMD;
+ int retval = TRYGET_SUCCESSED;
+ pmd_t pmdval;
- return TRYGET_SUCCESSED;
+ rcu_read_lock();
+ pmdval = READ_ONCE(*pmd);
+ if (unlikely(pmd_none(pmdval)))
+ retval = TRYGET_FAILED_NONE;
+ else if (unlikely(is_huge_pmd(pmdval)))
+ retval = TRYGET_FAILED_HUGE_PMD;
+ else if (!pte_get_unless_zero(&pmdval))
+ retval = TRYGET_FAILED_ZERO;
+ rcu_read_unlock();
+
+ return retval;
}
/*
@@ -52,4 +66,69 @@ enum pte_tryget_type pte_try_get(pmd_t *pmd)
*/
void pte_put_vmf(struct vm_fault *vmf)
{
+ if (!(vmf->flags & FAULT_FLAG_PTE_GET))
+ return;
+ vmf->flags &= ~FAULT_FLAG_PTE_GET;
+
+ pte_put(vmf->vma->vm_mm, vmf->pmd, vmf->address);
+}
+#else
+bool pte_get_unless_zero(pmd_t *pmd)
+{
+ return true;
+}
+
+enum pte_tryget_type pte_try_get(pmd_t *pmd)
+{
+ if (unlikely(pmd_none(*pmd)))
+ return TRYGET_FAILED_NONE;
+
+ if (unlikely(is_huge_pmd(*pmd)))
+ return TRYGET_FAILED_HUGE_PMD;
+
+ return TRYGET_SUCCESSED;
+}
+
+void pte_put_vmf(struct vm_fault *vmf)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+ pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+ int i = 0;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+ struct page *page = container_of(rcu, struct page, rcu_head);
+
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
+}
+
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ spinlock_t *ptl;
+ pmd_t pmdval;
+
+ ptl = pmd_lock(mm, pmd);
+ pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
+ flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+ spin_unlock(ptl);
+
+ pte_free_debug(pmdval);
+ mm_dec_nr_ptes(mm);
+ call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
}
--
2.11.0
next prev parent reply other threads:[~2021-11-10 10:56 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-10 10:54 [PATCH v3 00/15] Free user PTE page table pages Qi Zheng
2021-11-10 10:54 ` [PATCH v3 01/15] mm: do code cleanups to filemap_map_pmd() Qi Zheng
2021-11-10 10:54 ` [PATCH v3 02/15] mm: introduce is_huge_pmd() helper Qi Zheng
2021-11-11 13:46 ` kernel test robot
2021-11-10 10:54 ` [PATCH v3 03/15] mm: move pte_offset_map_lock() to pgtable.h Qi Zheng
2021-11-10 10:54 ` [PATCH v3 04/15] mm: rework the parameter of lock_page_or_retry() Qi Zheng
2021-11-10 10:54 ` [PATCH v3 05/15] mm: add pmd_installed_type return for __pte_alloc() and other friends Qi Zheng
2021-11-10 10:54 ` [PATCH v3 06/15] mm: introduce refcount for user PTE page table page Qi Zheng
2021-11-10 10:54 ` [PATCH v3 07/15] mm/pte_ref: add support for user PTE page table page allocation Qi Zheng
2021-11-11 15:17 ` kernel test robot
2021-11-10 10:54 ` [PATCH v3 08/15] mm/pte_ref: initialize the refcount of the withdrawn PTE page table page Qi Zheng
2021-11-10 10:54 ` [PATCH v3 09/15] mm/pte_ref: add support for the map/unmap of user " Qi Zheng
2021-11-10 10:54 ` [PATCH v3 10/15] mm/pte_ref: add support for page fault path Qi Zheng
2021-11-10 10:54 ` [PATCH v3 11/15] mm/pte_ref: take a refcount before accessing the PTE page table page Qi Zheng
2021-11-10 10:54 ` [PATCH v3 12/15] mm/pte_ref: update the pmd entry in move_normal_pmd() Qi Zheng
2021-11-10 10:54 ` Qi Zheng [this message]
2021-11-10 10:54 ` [PATCH v3 14/15] Documentation: add document for pte_ref Qi Zheng
2021-11-10 14:39 ` Jonathan Corbet
2021-11-11 5:40 ` Qi Zheng
2021-11-10 10:54 ` [PATCH v3 15/15] mm/pte_ref: use mmu_gather to free PTE page table pages Qi Zheng
2021-11-10 12:56 ` [PATCH v3 00/15] Free user " Jason Gunthorpe
2021-11-10 13:25 ` David Hildenbrand
2021-11-10 13:59 ` Qi Zheng
2021-11-10 14:38 ` Jason Gunthorpe
2021-11-10 15:37 ` David Hildenbrand
2021-11-10 16:39 ` Jason Gunthorpe
2021-11-10 17:37 ` David Hildenbrand
2021-11-10 17:49 ` Jason Gunthorpe
2021-11-11 3:58 ` Qi Zheng
2021-11-11 9:22 ` David Hildenbrand
2021-11-11 11:08 ` Qi Zheng
2021-11-11 11:19 ` David Hildenbrand
2021-11-11 12:00 ` Qi Zheng
2021-11-11 12:20 ` David Hildenbrand
2021-11-11 12:32 ` Qi Zheng
2021-11-11 12:51 ` David Hildenbrand
2021-11-11 13:01 ` Qi Zheng
2021-11-10 16:49 ` Matthew Wilcox
2021-11-10 16:53 ` David Hildenbrand
2021-11-10 16:56 ` Jason Gunthorpe
2021-11-10 13:54 ` Qi Zheng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211110105428.32458-14-zhengqi.arch@bytedance.com \
--to=zhengqi.arch@bytedance.com \
--cc=akpm@linux-foundation.org \
--cc=david@redhat.com \
--cc=jgg@nvidia.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mika.penttila@nextfour.com \
--cc=songmuchun@bytedance.com \
--cc=tglx@linutronix.de \
--cc=zhouchengming@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).