linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, tglx@linutronix.de,
	kirill.shutemov@linux.intel.com, mika.penttila@nextfour.com,
	david@redhat.com, jgg@nvidia.com, tj@kernel.org,
	dennis@kernel.org, ming.lei@redhat.com
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, songmuchun@bytedance.com,
	zhouchengming@bytedance.com,
	Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [RFC PATCH 15/18] mm: use try_to_free_user_pte() in MADV_FREE case
Date: Fri, 29 Apr 2022 21:35:49 +0800	[thread overview]
Message-ID: <20220429133552.33768-16-zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <20220429133552.33768-1-zhengqi.arch@bytedance.com>

Unlike the MADV_DONTNEED case, MADV_FREE only marks the physical
page as lazyfree instead of unmapping it immediately, and the physical
page will not be unmapped until system memory becomes tight. So we
convert the percpu_ref of the related user PTE page table page to
atomic mode in madvise_free_pte_range(), and then check whether it is 0
in try_to_unmap_one(). If it is 0, we can safely reclaim the PTE page
table page at that point.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/rmap.h |  2 ++
 mm/madvise.c         |  7 ++++++-
 mm/page_vma_mapped.c | 46 ++++++++++++++++++++++++++++++++++++++++++--
 mm/rmap.c            |  9 +++++++++
 4 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 17230c458341..a3174d3bf118 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -204,6 +204,8 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
 #define PVMW_SYNC		(1 << 0)
 /* Look for migration entries rather than present PTEs */
 #define PVMW_MIGRATION		(1 << 1)
+/* Used for MADV_FREE page */
+#define PVMW_MADV_FREE		(1 << 2)
 
 struct page_vma_mapped_walk {
 	unsigned long pfn;
diff --git a/mm/madvise.c b/mm/madvise.c
index 8123397f14c8..bd4bcaad5a9f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -598,7 +598,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte, *pte, ptent;
 	struct page *page;
 	int nr_swap = 0;
+	bool have_lazyfree = false;
 	unsigned long next;
+	unsigned long start = addr;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd))
@@ -709,6 +711,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			tlb_remove_tlb_entry(tlb, pte, addr);
 		}
 		mark_page_lazyfree(page);
+		have_lazyfree = true;
 	}
 out:
 	if (nr_swap) {
@@ -718,8 +721,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 	}
 	arch_leave_lazy_mmu_mode();
-	if (orig_pte)
+	if (orig_pte) {
 		pte_unmap_unlock(orig_pte, ptl);
+		try_to_free_user_pte(mm, pmd, start, !have_lazyfree);
+	}
 	cond_resched();
 next:
 	return 0;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8ecf8fd7cf5e..00bc09f57f48 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -266,8 +266,30 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 next_pte:
 		do {
 			pvmw->address += PAGE_SIZE;
-			if (pvmw->address >= end)
-				return not_found(pvmw);
+			if (pvmw->address >= end) {
+				not_found(pvmw);
+
+				if (pvmw->flags & PVMW_MADV_FREE) {
+					pgtable_t pte;
+					pmd_t pmdval;
+
+					pvmw->flags &= ~PVMW_MADV_FREE;
+					rcu_read_lock();
+					pmdval = READ_ONCE(*pvmw->pmd);
+					if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+						rcu_read_unlock();
+						return false;
+					}
+					pte = pmd_pgtable(pmdval);
+					if (percpu_ref_is_zero(pte->pte_ref)) {
+						rcu_read_unlock();
+						free_user_pte(mm, pvmw->pmd, pvmw->address);
+					} else {
+						rcu_read_unlock();
+					}
+				}
+				return false;
+			}
 			/* Did we cross page table boundary? */
 			if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
 				if (pvmw->ptl) {
@@ -275,6 +297,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 					pvmw->ptl = NULL;
 				}
 				pte_unmap(pvmw->pte);
+				if (pvmw->flags & PVMW_MADV_FREE) {
+					pgtable_t pte;
+					pmd_t pmdval;
+
+					pvmw->flags &= ~PVMW_MADV_FREE;
+					rcu_read_lock();
+					pmdval = READ_ONCE(*pvmw->pmd);
+					if (pmd_none(pmdval) || pmd_leaf(pmdval)) {
+						rcu_read_unlock();
+						pvmw->pte = NULL;
+						goto restart;
+					}
+					pte = pmd_pgtable(pmdval);
+					if (percpu_ref_is_zero(pte->pte_ref)) {
+						rcu_read_unlock();
+						free_user_pte(mm, pvmw->pmd, pvmw->address);
+					} else {
+						rcu_read_unlock();
+					}
+				}
 				pvmw->pte = NULL;
 				goto restart;
 			}
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..f978d324d4f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1616,6 +1616,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					mmu_notifier_invalidate_range(mm,
 						address, address + PAGE_SIZE);
 					dec_mm_counter(mm, MM_ANONPAGES);
+					if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+						pvmw.flags |= PVMW_MADV_FREE;
 					goto discard;
 				}
 
@@ -1627,6 +1629,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				folio_set_swapbacked(folio);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
+				if (IS_ENABLED(CONFIG_FREE_USER_PTE) &&
+				    pte_tryget(mm, pvmw.pmd, address)) {
+					pgtable_t pte_page = pmd_pgtable(*pvmw.pmd);
+
+					percpu_ref_switch_to_percpu(pte_page->pte_ref);
+					__pte_put(pte_page);
+				}
 				break;
 			}
 
-- 
2.20.1


  parent reply	other threads:[~2022-04-29 13:38 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-29 13:35 [RFC PATCH 00/18] Try to free user PTE page table pages Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 01/18] x86/mm/encrypt: add the missing pte_unmap() call Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 02/18] percpu_ref: make ref stable after percpu_ref_switch_to_atomic_sync() returns Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 03/18] percpu_ref: make percpu_ref_switch_lock per percpu_ref Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 04/18] mm: convert to use ptep_clear() in pte_clear_not_present_full() Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 05/18] mm: split the related definitions of pte_offset_map_lock() into pgtable.h Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 06/18] mm: introduce CONFIG_FREE_USER_PTE Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 07/18] mm: add pte_to_page() helper Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 08/18] mm: introduce percpu_ref for user PTE page table page Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 09/18] pte_ref: add pte_tryget() and {__,}pte_put() helper Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 10/18] mm: add pte_tryget_map{_lock}() helper Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 11/18] mm: convert to use pte_tryget_map_lock() Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 12/18] mm: convert to use pte_tryget_map() Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 13/18] mm: add try_to_free_user_pte() helper Qi Zheng
2022-04-30 13:35   ` Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 14/18] mm: use try_to_free_user_pte() in MADV_DONTNEED case Qi Zheng
2022-04-29 13:35 ` Qi Zheng [this message]
2022-04-29 13:35 ` [RFC PATCH 16/18] pte_ref: add track_pte_{set, clear}() helper Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 17/18] x86/mm: add x86_64 support for pte_ref Qi Zheng
2022-04-29 13:35 ` [RFC PATCH 18/18] Documentation: add document " Qi Zheng
2022-04-30 13:19   ` Bagas Sanjaya
2022-04-30 13:32     ` Qi Zheng
2022-05-17  8:30 ` [RFC PATCH 00/18] Try to free user PTE page table pages Qi Zheng
2022-05-18 14:51   ` David Hildenbrand
2022-05-18 14:56     ` Matthew Wilcox
2022-05-19  4:03       ` Qi Zheng
2022-05-19  3:58     ` Qi Zheng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220429133552.33768-16-zhengqi.arch@bytedance.com \
    --to=zhengqi.arch@bytedance.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@redhat.com \
    --cc=dennis@kernel.org \
    --cc=jgg@nvidia.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mika.penttila@nextfour.com \
    --cc=ming.lei@redhat.com \
    --cc=songmuchun@bytedance.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=zhouchengming@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).