linux-kernel.vger.kernel.org archive mirror
From: Nadav Amit <nadav.amit@gmail.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Nadav Amit <namit@vmware.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Andy Lutomirski <luto@kernel.org>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Will Deacon <will@kernel.org>, Yu Zhao <yuzhao@google.com>,
	Nick Piggin <npiggin@gmail.com>,
	x86@kernel.org
Subject: [RFC 13/20] mm/tlb: introduce tlb_start_ptes() and tlb_end_ptes()
Date: Sat, 30 Jan 2021 16:11:25 -0800	[thread overview]
Message-ID: <20210131001132.3368247-14-namit@vmware.com> (raw)
In-Reply-To: <20210131001132.3368247-1-namit@vmware.com>

From: Nadav Amit <namit@vmware.com>

Introduce tlb_start_ptes() and tlb_end_ptes(), which are to be called
before and after PTEs are updated and their TLB flushes are deferred.
This will later be used for finer-granularity detection of deferred
TLB flushes.

In the meantime, move flush_tlb_batched_pending() into
tlb_start_ptes(). It was not called by wp_pte() and clean_record_pte()
in mapping_dirty_helpers, which might be a bug.

No additional functional change is intended.
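
A minimal sketch of the intended usage pattern, modeled on the changes
to madvise_free_pte_range() below (locking context and the loop body
are abbreviated; this is illustrative only, not a new interface):

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	tlb_start_ptes(tlb);	/* also runs flush_tlb_batched_pending() */
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		/* ... update *pte and record the deferred flush ... */
		tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
	}
	arch_leave_lazy_mmu_mode();
	tlb_end_ptes(tlb);
	pte_unmap_unlock(pte - 1, ptl);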

Signed-off-by: Nadav Amit <namit@vmware.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: x86@kernel.org
---
 fs/proc/task_mmu.c         |  2 ++
 include/asm-generic/tlb.h  | 18 ++++++++++++++++++
 mm/madvise.c               |  6 ++++--
 mm/mapping_dirty_helpers.c | 15 +++++++++++++--
 mm/memory.c                |  2 ++
 mm/mprotect.c              |  3 ++-
 6 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4cd048ffa0f6..d0cce961fa5c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1168,6 +1168,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	tlb_start_ptes(&cp->tlb);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
 
@@ -1190,6 +1191,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		tlb_flush_pte_range(&cp->tlb, addr, PAGE_SIZE);
 		ClearPageReferenced(page);
 	}
+	tlb_end_ptes(&cp->tlb);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	return 0;
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 041be2ef4426..10690763090a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -58,6 +58,11 @@
  *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
  *    there's large holes between the VMAs.
  *
+ *  - tlb_start_ptes() / tlb_end_ptes(); mark the start / end of PTE changes.
+ *
+ *    Does internal accounting to allow fine(r) granularity checks for
+ *    pte_accessible() on certain configurations.
+ *
  *  - tlb_remove_table()
  *
  *    tlb_remove_table() is the basic primitive to free page-table directories
@@ -373,6 +378,10 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		flush_tlb_range(tlb->vma, tlb->start, tlb->end);
 	}
 }
+#endif
+
+#if __is_defined(tlb_flush) ||						\
+	IS_ENABLED(CONFIG_ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING)
 
 static inline void
 tlb_update_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
@@ -523,6 +532,15 @@ static inline void mark_mm_tlb_gen_done(struct mm_struct *mm, u64 gen)
 
 #endif /* CONFIG_ARCH_HAS_TLB_GENERATIONS */
 
+#define tlb_start_ptes(tlb)						\
+	do {								\
+		struct mmu_gather *_tlb = (tlb);			\
+									\
+		flush_tlb_batched_pending(_tlb->mm);			\
+	} while (0)
+
+static inline void tlb_end_ptes(struct mmu_gather *tlb) { }
+
 /*
  * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
  * and set corresponding cleared_*.
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938fd3ad228..932c1c2eb9a3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -392,7 +392,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 #endif
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	flush_tlb_batched_pending(mm);
+	tlb_start_ptes(tlb);
 	arch_enter_lazy_mmu_mode();
 	for (; addr < end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
@@ -468,6 +468,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	}
 
 	arch_leave_lazy_mmu_mode();
+	tlb_end_ptes(tlb);
 	pte_unmap_unlock(orig_pte, ptl);
 	if (pageout)
 		reclaim_pages(&page_list);
@@ -588,7 +589,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-	flush_tlb_batched_pending(mm);
+	tlb_start_ptes(tlb);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
@@ -692,6 +693,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 	}
 	arch_leave_lazy_mmu_mode();
+	tlb_end_ptes(tlb);
 	pte_unmap_unlock(orig_pte, ptl);
 	cond_resched();
 next:
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2ce6cf431026..063419ade304 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -6,6 +6,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+#include "internal.h"
+
 /**
  * struct wp_walk - Private struct for pagetable walk callbacks
  * @range: Range for mmu notifiers
@@ -36,7 +38,10 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
 	pte_t ptent = *pte;
 
 	if (pte_write(ptent)) {
-		pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+		pte_t old_pte;
+
+		tlb_start_ptes(&wpwalk->tlb);
+		old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
 
 		ptent = pte_wrprotect(old_pte);
 		ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
@@ -44,6 +49,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
 
 		if (pte_may_need_flush(old_pte, ptent))
 			tlb_flush_pte_range(&wpwalk->tlb, addr, PAGE_SIZE);
+		tlb_end_ptes(&wpwalk->tlb);
 	}
 
 	return 0;
@@ -94,13 +100,18 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
 	if (pte_dirty(ptent)) {
 		pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
 			walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
-		pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+		pte_t old_pte;
+
+		tlb_start_ptes(&wpwalk->tlb);
+
+		old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
 
 		ptent = pte_mkclean(old_pte);
 		ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
 
 		wpwalk->total++;
 		tlb_flush_pte_range(&wpwalk->tlb, addr, PAGE_SIZE);
+		tlb_end_ptes(&wpwalk->tlb);
 
 		__set_bit(pgoff, cwalk->bitmap);
 		cwalk->start = min(cwalk->start, pgoff);
diff --git a/mm/memory.c b/mm/memory.c
index 9e8576a83147..929a93c50d9a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1221,6 +1221,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	tlb_start_ptes(tlb);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
@@ -1314,6 +1315,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
 
+	tlb_end_ptes(tlb);
 	/* Do the actual TLB flush before dropping ptl */
 	if (force_flush)
 		tlb_flush_mmu_tlbonly(tlb);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b7473d2c9a1f..1258bbe42ee1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -70,7 +70,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 	    atomic_read(&vma->vm_mm->mm_users) == 1)
 		target_node = numa_node_id();
 
-	flush_tlb_batched_pending(vma->vm_mm);
+	tlb_start_ptes(tlb);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
@@ -182,6 +182,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
+	tlb_end_ptes(tlb);
 	pte_unmap_unlock(pte - 1, ptl);
 
 	return pages;
-- 
2.25.1



Thread overview: 65+ messages
2021-01-31  0:11 [RFC 00/20] TLB batching consolidation and enhancements Nadav Amit
2021-01-31  0:11 ` [RFC 01/20] mm/tlb: fix fullmm semantics Nadav Amit
2021-01-31  1:02   ` Andy Lutomirski
2021-01-31  1:19     ` Nadav Amit
2021-01-31  2:57       ` Andy Lutomirski
2021-02-01  7:30         ` Nadav Amit
2021-02-01 11:36   ` Peter Zijlstra
2021-02-02  9:32     ` Nadav Amit
2021-02-02 11:00       ` Peter Zijlstra
2021-02-02 21:35         ` Nadav Amit
2021-02-03  9:44           ` Will Deacon
2021-02-04  3:20             ` Nadav Amit
2021-01-31  0:11 ` [RFC 02/20] mm/mprotect: use mmu_gather Nadav Amit
2021-01-31  0:11 ` [RFC 03/20] mm/mprotect: do not flush on permission promotion Nadav Amit
2021-01-31  1:07   ` Andy Lutomirski
2021-01-31  1:17     ` Nadav Amit
2021-01-31  2:59       ` Andy Lutomirski
     [not found]     ` <7a6de15a-a570-31f2-14d6-a8010296e694@citrix.com>
2021-02-01  5:58       ` Nadav Amit
2021-02-01 15:38         ` Andrew Cooper
2021-01-31  0:11 ` [RFC 04/20] mm/mapping_dirty_helpers: use mmu_gather Nadav Amit
2021-01-31  0:11 ` [RFC 05/20] mm/tlb: move BATCHED_UNMAP_TLB_FLUSH to tlb.h Nadav Amit
2021-01-31  0:11 ` [RFC 06/20] fs/task_mmu: use mmu_gather interface of clear-soft-dirty Nadav Amit
2021-01-31  0:11 ` [RFC 07/20] mm: move x86 tlb_gen to generic code Nadav Amit
2021-01-31 18:26   ` Andy Lutomirski
2021-01-31  0:11 ` [RFC 08/20] mm: store completed TLB generation Nadav Amit
2021-01-31 20:32   ` Andy Lutomirski
2021-02-01  7:28     ` Nadav Amit
2021-02-01 16:53       ` Andy Lutomirski
2021-02-01 11:52   ` Peter Zijlstra
2021-01-31  0:11 ` [RFC 09/20] mm: create pte/pmd_tlb_flush_pending() Nadav Amit
2021-01-31  0:11 ` [RFC 10/20] mm: add pte_to_page() Nadav Amit
2021-01-31  0:11 ` [RFC 11/20] mm/tlb: remove arch-specific tlb_start/end_vma() Nadav Amit
2021-02-01 12:09   ` Peter Zijlstra
2021-02-02  6:41     ` Nicholas Piggin
2021-02-02  7:20       ` Nadav Amit
2021-02-02  9:31         ` Peter Zijlstra
2021-02-02  9:54           ` Nadav Amit
2021-02-02 11:04             ` Peter Zijlstra
2021-01-31  0:11 ` [RFC 12/20] mm/tlb: save the VMA that is flushed during tlb_start_vma() Nadav Amit
2021-02-01 12:28   ` Peter Zijlstra
2021-01-31  0:11 ` Nadav Amit [this message]
     [not found]   ` <YBaBcc2jEGaxuxH0@fedora.tometzki.de>
2021-02-01  7:29     ` [RFC 13/20] mm/tlb: introduce tlb_start_ptes() and tlb_end_ptes() Nadav Amit
2021-02-01 13:19   ` Peter Zijlstra
2021-02-01 23:00     ` Nadav Amit
2021-01-31  0:11 ` [RFC 14/20] mm: move inc/dec_tlb_flush_pending() to mmu_gather.c Nadav Amit
2021-01-31  0:11 ` [RFC 15/20] mm: detect deferred TLB flushes in vma granularity Nadav Amit
2021-02-01 22:04   ` Nadav Amit
2021-02-02  0:14     ` Andy Lutomirski
2021-02-02 20:51       ` Nadav Amit
2021-02-04  4:35         ` Andy Lutomirski
2021-01-31  0:11 ` [RFC 16/20] mm/tlb: per-page table generation tracking Nadav Amit
2021-01-31  0:11 ` [RFC 17/20] mm/tlb: updated completed deferred TLB flush conditionally Nadav Amit
2021-01-31  0:11 ` [RFC 18/20] mm: make mm_cpumask() volatile Nadav Amit
2021-01-31  0:11 ` [RFC 19/20] lib/cpumask: introduce cpumask_atomic_or() Nadav Amit
2021-01-31  0:11 ` [RFC 20/20] mm/rmap: avoid potential races Nadav Amit
2021-08-23  8:05   ` Huang, Ying
2021-08-23 15:50     ` Nadav Amit
2021-08-24  0:36       ` Huang, Ying
2021-01-31  0:39 ` [RFC 00/20] TLB batching consolidation and enhancements Andy Lutomirski
2021-01-31  1:08   ` Nadav Amit
2021-01-31  3:30 ` Nicholas Piggin
2021-01-31  7:57   ` Nadav Amit
2021-01-31  8:14     ` Nadav Amit
2021-02-01 12:44     ` Peter Zijlstra
2021-02-02  7:14       ` Nicholas Piggin
