* [PATCH 0/4] x86: TLB invalidate fixes
@ 2018-08-22 15:30 Peter Zijlstra
  2018-08-22 15:30 ` [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:30 UTC (permalink / raw)
  To: torvalds
  Cc: peterz, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm

Hi,

Here are a number of patches that fix an x86 TLB invalidation issue reported by
Jann, along with some related things that came up while sorting this out.

Please consider.



* [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches
  2018-08-22 15:30 [PATCH 0/4] x86: TLB invalidate fixes Peter Zijlstra
@ 2018-08-22 15:30 ` Peter Zijlstra
  2018-08-22 21:37   ` Linus Torvalds
  2018-08-22 15:30 ` [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:30 UTC (permalink / raw)
  To: torvalds
  Cc: peterz, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm

Revert commits:

  95b0e6357d3e x86/mm/tlb: Always use lazy TLB mode
  64482aafe55f x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  ac0315896970 x86/mm/tlb: Make lazy TLB mode lazier
  61d0beb5796a x86/mm/tlb: Restructure switch_mm_irqs_off()
  2ff6ddf19c0e x86/mm/tlb: Leave lazy TLB mode at page table free time

This revert is in order to simplify the TLB invalidate fixes for x86. We'll try again later.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,6 +148,22 @@ static inline unsigned long build_cr3_no
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+	/*
+	 * If we have PCID, then switching to init_mm is reasonably
+	 * fast.  If we don't have PCID, then switching to init_mm is
+	 * quite slow, so we try to defer it in the hopes that we can
+	 * avoid it entirely.  The latter approach runs the risk of
+	 * receiving otherwise unnecessary IPIs.
+	 *
+	 * This choice is just a heuristic.  The tlb code can handle this
+	 * function returning true or false regardless of whether we have
+	 * PCID.
+	 */
+	return !static_cpu_has(X86_FEATURE_PCID);
+}
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -538,9 +554,4 @@ extern void arch_tlbbatch_flush(struct a
 	native_flush_tlb_others(mask, info)
 #endif
 
-extern void tlb_flush_remove_tables(struct mm_struct *mm);
-extern void tlb_flush_remove_tables_local(void *arg);
-
-#define HAVE_TLB_FLUSH_REMOVE_TABLES
-
 #endif /* _ASM_X86_TLBFLUSH_H */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
-	bool need_flush;
-	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct
 			   next->context.ctx_id);
 
 		/*
-		 * Even in lazy TLB mode, the CPU should stay set in the
-		 * mm_cpumask. The TLB shootdown code can figure out from
-		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/*
-		 * If the CPU is not in lazy TLB mode, we are just switching
-		 * from one thread in a process to another thread in the same
-		 * process. No TLB flush required.
-		 */
-		if (!was_lazy)
-			return;
-
-		/*
-		 * Read the tlb_gen to check whether a flush is needed.
-		 * If the TLB is up to date, just use it.
-		 * The barrier synchronizes with the tlb_gen increment in
-		 * the TLB shootdown code.
-		 */
-		smp_mb();
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
-				next_tlb_gen)
-			return;
-
-		/*
-		 * TLB contents went out of date while we were in lazy
-		 * mode. Fall through to the TLB switching code below.
-		 */
-		new_asid = prev_asid;
-		need_flush = true;
+		return;
 	} else {
+		u16 new_asid;
+		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-	}
 
-	if (need_flush) {
-		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-		load_new_mm_cr3(next->pgd, new_asid, true);
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			load_new_mm_cr3(next->pgd, new_asid, true);
+
+			/*
+			 * NB: This gets called via leave_mm() in the idle path
+			 * where RCU functions differently.  Tracing normally
+			 * uses RCU, so we need to use the _rcuidle variant.
+			 *
+			 * (There is no good reason for this.  The idle code should
+			 *  be rearranged to call this before rcu_idle_enter().)
+			 */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			load_new_mm_cr3(next->pgd, new_asid, false);
+
+			/* See above wrt _rcuidle. */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
 
 		/*
-		 * NB: This gets called via leave_mm() in the idle path
-		 * where RCU functions differently.  Tracing normally
-		 * uses RCU, so we need to use the _rcuidle variant.
-		 *
-		 * (There is no good reason for this.  The idle code should
-		 *  be rearranged to call this before rcu_idle_enter().)
+		 * Record last user mm's context id, so we can avoid
+		 * flushing branch buffer with IBPB if we switch back
+		 * to the same user.
 		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-	} else {
-		/* The new ASID is already up to date. */
-		load_new_mm_cr3(next->pgd, new_asid, false);
+		if (next != &init_mm)
+			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
-		/* See above wrt _rcuidle. */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
-	/*
-	 * Record last user mm's context id, so we can avoid
-	 * flushing branch buffer with IBPB if we switch back
-	 * to the same user.
-	 */
-	if (next != &init_mm)
-		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-	this_cpu_write(cpu_tlbstate.loaded_mm, next);
-	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	this_cpu_write(cpu_tlbstate.is_lazy, true);
+	if (tlb_defer_switch_to_init_mm()) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
 }
 
 /*
@@ -473,9 +461,6 @@ static void flush_tlb_func_common(const
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
-		 *
-		 * This should be rare, with native_flush_tlb_others skipping
-		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
-	cpumask_var_t lazymask;
-	unsigned int cpu;
-
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -608,6 +590,8 @@ void native_flush_tlb_others(const struc
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
+		unsigned int cpu;
+
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -615,29 +599,8 @@ void native_flush_tlb_others(const struc
 					       (void *)info, 1);
 		return;
 	}
-
-	/*
-	 * A temporary cpumask is used in order to skip sending IPIs
-	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
-	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
-	 */
-	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
-		smp_call_function_many(cpumask, flush_tlb_func_remote,
+	smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
-		return;
-	}
-
-	cpumask_copy(lazymask, cpumask);
-
-	for_each_cpu(cpu, lazymask) {
-		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_clear_cpu(cpu, lazymask);
-	}
-
-	smp_call_function_many(lazymask, flush_tlb_func_remote,
-			       (void *)info, 1);
-
-	free_cpumask_var(lazymask);
 }
 
 /*
@@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct
 	put_cpu();
 }
 
-void tlb_flush_remove_tables_local(void *arg)
-{
-	struct mm_struct *mm = arg;
-
-	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
-			this_cpu_read(cpu_tlbstate.is_lazy)) {
-		/*
-		 * We're in lazy mode.  We need to at least flush our
-		 * paging-structure cache to avoid speculatively reading
-		 * garbage into our TLB.  Since switching to init_mm is barely
-		 * slower than a minimal flush, just switch to init_mm.
-		 */
-		switch_mm_irqs_off(NULL, &init_mm, NULL);
-	}
-}
-
-static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
-				      struct cpumask *lazy_cpus)
-{
-	int cpu;
-
-	for_each_cpu(cpu, mm_cpumask(mm)) {
-		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_set_cpu(cpu, lazy_cpus);
-	}
-}
-
-void tlb_flush_remove_tables(struct mm_struct *mm)
-{
-	int cpu = get_cpu();
-	cpumask_var_t lazy_cpus;
-
-	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
-		put_cpu();
-		return;
-	}
-
-	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
-		/*
-		 * If the cpumask allocation fails, do a brute force flush
-		 * on all the CPUs that have this mm loaded.
-		 */
-		smp_call_function_many(mm_cpumask(mm),
-				tlb_flush_remove_tables_local, (void *)mm, 1);
-		put_cpu();
-		return;
-	}
-
-	/*
-	 * CPUs with !is_lazy either received a TLB flush IPI while the user
-	 * pages in this address range were unmapped, or have context switched
-	 * and reloaded %CR3 since then.
-	 *
-	 * Shootdown IPIs at page table freeing time only need to be sent to
-	 * CPUs that may have out of date TLB contents.
-	 */
-	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
-	smp_call_function_many(lazy_cpus,
-				tlb_flush_remove_tables_local, (void *)mm, 1);
-	free_cpumask_var(lazy_cpus);
-	put_cpu();
-}
 
 static void do_flush_tlb_all(void *info)
 {
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,14 +303,4 @@ static inline void tlb_remove_check_page
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
-/*
- * Used to flush the TLB when page tables are removed, when lazy
- * TLB mode may cause a CPU to retain intermediate translations
- * pointing to about-to-be-freed page table memory.
- */
-#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
-#define tlb_flush_remove_tables(mm) do {} while (0)
-#define tlb_flush_remove_tables_local(mm) do {} while (0)
-#endif
-
 #endif /* _ASM_GENERIC__TLB_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,20 +326,16 @@ bool __tlb_remove_page_size(struct mmu_g
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	struct mm_struct __maybe_unused *mm = arg;
-	/*
-	 * On most architectures this does nothing. Simply delivering the
-	 * interrupt is enough to prevent races with software page table
-	 * walking like that done in get_user_pages_fast.
-	 *
-	 * See the comment near struct mmu_table_batch.
-	 */
-	tlb_flush_remove_tables_local(mm);
+	/* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
+static void tlb_remove_table_one(void *table)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -348,7 +344,7 @@ static void tlb_remove_table_one(void *t
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 	__tlb_remove_table(table);
 }
 
@@ -369,8 +365,6 @@ void tlb_table_flush(struct mmu_gather *
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
-	tlb_flush_remove_tables(tlb->mm);
-
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -393,7 +387,7 @@ void tlb_remove_table(struct mmu_gather
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table, tlb);
+			tlb_remove_table_one(table);
 			return;
 		}
 		(*batch)->nr = 0;




* [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-22 15:30 [PATCH 0/4] x86: TLB invalidate fixes Peter Zijlstra
  2018-08-22 15:30 ` [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches Peter Zijlstra
@ 2018-08-22 15:30 ` Peter Zijlstra
  2018-08-23  3:31   ` Nicholas Piggin
  2018-08-23 13:40   ` Will Deacon
  2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
  2018-08-22 15:30 ` [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt Peter Zijlstra
  3 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:30 UTC (permalink / raw)
  To: torvalds
  Cc: peterz, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm, Nicholas Piggin,
	David Miller, Martin Schwidefsky, Michael Ellerman

Will noted that only checking mm_users is incorrect; we should also
check mm_count in order to cover CPUs that have a lazy reference to
this mm (and could do speculative TLB operations).

If removing this turns out to be a performance issue, we can
re-instate a more complete check, but then do it in tlb_table_flush(),
eliding the call_rcu_sched() there.
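
For illustration, a more complete check along those lines might look
like the sketch below; it is hypothetical and not part of this patch,
and the helper name is made up:

	static inline bool tlb_table_fast_free_ok(struct mmu_gather *tlb)
	{
		/*
		 * Only free immediately when neither another user task
		 * (mm_users) nor a lazy-TLB CPU (mm_count) can reference
		 * this mm, so no concurrent or speculative page-table
		 * walk is possible.
		 */
		return atomic_read(&tlb->mm->mm_users) < 2 &&
		       atomic_read(&tlb->mm->mm_count) < 2;
	}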

Cc: stable@kernel.org
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Fixes: 267239116987 ("mm, powerpc: move the RCU page-table freeing into generic code")
Reported-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 mm/memory.c |    9 ---------
 1 file changed, 9 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -375,15 +375,6 @@ void tlb_remove_table(struct mmu_gather
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
-	/*
-	 * When there's less then two users of this mm there cannot be a
-	 * concurrent page-table walk.
-	 */
-	if (atomic_read(&tlb->mm->mm_users) < 2) {
-		__tlb_remove_table(table);
-		return;
-	}
-
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {




* [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:30 [PATCH 0/4] x86: TLB invalidate fixes Peter Zijlstra
  2018-08-22 15:30 ` [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches Peter Zijlstra
  2018-08-22 15:30 ` [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition Peter Zijlstra
@ 2018-08-22 15:30 ` Peter Zijlstra
  2018-08-22 15:55   ` Peter Zijlstra
                     ` (2 more replies)
  2018-08-22 15:30 ` [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt Peter Zijlstra
  3 siblings, 3 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:30 UTC (permalink / raw)
  To: torvalds
  Cc: peterz, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm, Nicholas Piggin,
	David Miller, Martin Schwidefsky, Michael Ellerman

Jann reported that x86 was missing required TLB invalidates when he
hit the !*batch slow path in tlb_remove_table().

This is indeed the case: RCU_TABLE_FREE does not provide TLB (cache)
invalidates, because the PowerPC-hash code where it originated and the
Sparc-hash code where it was subsequently used did not need them. ARM,
which used it later, put an explicit TLB invalidate in its
__p*_free_tlb() functions, and PowerPC-radix followed that example.

But when we hooked up x86 we failed to consider this. Fix this by
(optionally) hooking tlb_remove_table() into the TLB invalidate code.

NOTE: s390 also needed something like this and might now be able to
      use the generic code again.
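
For clarity, the ordering this establishes in the batching-failure path
is sketched below; this is only a summary of the hunks that follow,
written out as a pseudo-code call chain:

	tlb_remove_table(tlb, table)
	    -> __get_free_page() fails (*batch == NULL)
	        -> tlb_table_invalidate(tlb)    /* flush hardware page-walk caches */
	        -> tlb_remove_table_one(table)  /* IPI-sync software walkers, then free */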

Cc: stable@kernel.org
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Fixes: 9e52fc2b50de ("x86/mm: Enable RCU based page table freeing (CONFIG_HAVE_RCU_TABLE_FREE=y)")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig     |    3 +++
 arch/x86/Kconfig |    1 +
 mm/memory.c      |   27 +++++++++++++++++++++++++--
 3 files changed, 29 insertions(+), 2 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -362,6 +362,9 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
 	bool
 
+config HAVE_RCU_TABLE_INVALIDATE
+	bool
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,6 +180,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE
+	select HAVE_RCU_TABLE_INVALIDATE	if HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,17 +238,22 @@ void arch_tlb_gather_mmu(struct mmu_gath
 	__tlb_reset_range(tlb);
 }
 
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static void __tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
 	if (!tlb->end)
 		return;
 
 	tlb_flush(tlb);
 	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
+	__tlb_reset_range(tlb);
+}
+
+static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+{
+	__tlb_flush_mmu_tlbonly(tlb);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb_table_flush(tlb);
 #endif
-	__tlb_reset_range(tlb);
 }
 
 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
@@ -330,6 +335,21 @@ bool __tlb_remove_page_size(struct mmu_g
  * See the comment near struct mmu_table_batch.
  */
 
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+	/*
+	 * Invalidate page-table caches used by hardware walkers. Then we still
+	 * need to RCU-sched wait while freeing the pages because software
+	 * walkers can still be in-flight.
+	 */
+	__tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
 static void tlb_remove_table_smp_sync(void *arg)
 {
 	/* Simply deliver the interrupt */
@@ -366,6 +386,7 @@ void tlb_table_flush(struct mmu_gather *
 	struct mmu_table_batch **batch = &tlb->batch;
 
 	if (*batch) {
+		tlb_table_invalidate(tlb);
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
 	}
@@ -378,11 +399,13 @@ void tlb_remove_table(struct mmu_gather
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
+			tlb_table_invalidate(tlb);
 			tlb_remove_table_one(table);
 			return;
 		}
 		(*batch)->nr = 0;
 	}
+
 	(*batch)->tables[(*batch)->nr++] = table;
 	if ((*batch)->nr == MAX_TABLE_BATCH)
 		tlb_table_flush(tlb);




* [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt
  2018-08-22 15:30 [PATCH 0/4] x86: TLB invalidate fixes Peter Zijlstra
                   ` (2 preceding siblings ...)
  2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
@ 2018-08-22 15:30 ` Peter Zijlstra
  2018-08-22 22:12   ` Eduardo Valentin
  3 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:30 UTC (permalink / raw)
  To: torvalds
  Cc: peterz, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm

If we don't use paravirt, don't play unnecessary and complicated games
to free page-tables.
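
The resulting dispatch is sketched below; this is just a summary of the
hunks that follow, not additional code:

	___pte_free_tlb() / ___pmd_free_tlb() / ___pud_free_tlb() / ___p4d_free_tlb()
	    -> paravirt_tlb_remove_table(tlb, page)
	        native or !CONFIG_PARAVIRT:              tlb_remove_page()   /* normal batched free */
	        Xen / Hyper-V / KVM (with PV TLB flush): tlb_remove_table()  /* RCU-deferred free */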

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/Kconfig                      |    2 +-
 arch/x86/hyperv/mmu.c                 |    2 ++
 arch/x86/include/asm/paravirt.h       |    5 +++++
 arch/x86/include/asm/paravirt_types.h |    3 +++
 arch/x86/include/asm/tlbflush.h       |    3 +++
 arch/x86/kernel/kvm.c                 |    5 ++++-
 arch/x86/kernel/paravirt.c            |    2 ++
 arch/x86/mm/pgtable.c                 |    8 ++++----
 arch/x86/xen/mmu_pv.c                 |    1 +
 9 files changed, 25 insertions(+), 6 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -179,7 +179,7 @@ config X86
 	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_RCU_TABLE_FREE
+	select HAVE_RCU_TABLE_FREE		if PARAVIRT
 	select HAVE_RCU_TABLE_INVALIDATE	if HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -9,6 +9,7 @@
 #include <asm/mshyperv.h>
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/hyperv.h>
@@ -231,4 +232,5 @@ void hyperv_setup_mmu_ops(void)
 
 	pr_info("Using hypercall for remote TLB flush\n");
 	pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
+	pv_mmu_ops.tlb_remove_table = tlb_remove_table;
 }
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -309,6 +309,11 @@ static inline void flush_tlb_others(cons
 	PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
 }
 
+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+	PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table);
+}
+
 static inline int paravirt_pgd_alloc(struct mm_struct *mm)
 {
 	return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -54,6 +54,7 @@ struct desc_struct;
 struct task_struct;
 struct cpumask;
 struct flush_tlb_info;
+struct mmu_gather;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -222,6 +223,8 @@ struct pv_mmu_ops {
 	void (*flush_tlb_others)(const struct cpumask *cpus,
 				 const struct flush_tlb_info *info);
 
+	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+
 	/* Hooks for allocating and freeing a pagetable top-level */
 	int  (*pgd_alloc)(struct mm_struct *mm);
 	void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -552,6 +552,9 @@ extern void arch_tlbbatch_flush(struct a
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, info)	\
 	native_flush_tlb_others(mask, info)
+
+#define paravirt_tlb_remove_table(tlb, page) \
+	tlb_remove_page(tlb, (void *)(page))
 #endif
 
 #endif /* _ASM_X86_TLBFLUSH_H */
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -45,6 +45,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/tlb.h>
 
 static int kvmapf = 1;
 
@@ -636,8 +637,10 @@ static void __init kvm_guest_init(void)
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
 	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
-	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+		pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,6 +41,7 @@
 #include <asm/tlbflush.h>
 #include <asm/timer.h>
 #include <asm/special_insns.h>
+#include <asm/tlb.h>
 
 /*
  * nop stub, which must not clobber anything *including the stack* to
@@ -409,6 +410,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_one_user = native_flush_tlb_one_user,
 	.flush_tlb_others = native_flush_tlb_others,
+	.tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
 
 	.pgd_alloc = __paravirt_pgd_alloc,
 	.pgd_free = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *
 {
 	pgtable_page_dtor(pte);
 	paravirt_release_pte(page_to_pfn(pte));
-	tlb_remove_table(tlb, pte);
+	paravirt_tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *
 	tlb->need_flush_all = 1;
 #endif
 	pgtable_pmd_page_dtor(page);
-	tlb_remove_table(tlb, page);
+	paravirt_tlb_remove_table(tlb, page);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_table(tlb, virt_to_page(pud));
+	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-	tlb_remove_table(tlb, virt_to_page(p4d));
+	paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2397,6 +2397,7 @@ static const struct pv_mmu_ops xen_mmu_o
 	.flush_tlb_kernel = xen_flush_tlb,
 	.flush_tlb_one_user = xen_flush_tlb_one_user,
 	.flush_tlb_others = xen_flush_tlb_others,
+	.tlb_remove_table = tlb_remove_table,
 
 	.pgd_alloc = xen_pgd_alloc,
 	.pgd_free = xen_pgd_free,




* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
@ 2018-08-22 15:55   ` Peter Zijlstra
  2018-08-23  3:45     ` Nicholas Piggin
  2018-08-23 23:31     ` Will Deacon
  2018-08-22 21:34   ` Linus Torvalds
  2018-08-23  8:46   ` Nicholas Piggin
  2 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-22 15:55 UTC (permalink / raw)
  To: torvalds
  Cc: luto, x86, bp, will.deacon, riel, jannh, ascannell, dave.hansen,
	linux-kernel, linux-mm, Nicholas Piggin, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> ARM
> which later used this put an explicit TLB invalidate in their
> __p*_free_tlb() functions, and PowerPC-radix followed that example.

> +/*
> + * If we want tlb_remove_table() to imply TLB invalidates.
> + */
> +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> +{
> +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> +	/*
> +	 * Invalidate page-table caches used by hardware walkers. Then we still
> +	 * need to RCU-sched wait while freeing the pages because software
> +	 * walkers can still be in-flight.
> +	 */
> +	__tlb_flush_mmu_tlbonly(tlb);
> +#endif
> +}


Nick, Will is already looking at using this to remove the synchronous
invalidation from __p*_free_tlb() for ARM; could you have a look to see
if PowerPC-radix could benefit from that too?

Basically, a patch like the one below would give your tlb_flush()
information on whether tables were removed or not.

---

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -96,12 +96,22 @@ struct mmu_gather {
 #endif
 	unsigned long		start;
 	unsigned long		end;
-	/* we are in the middle of an operation to clear
-	 * a full mm and can make some optimizations */
-	unsigned int		fullmm : 1,
-	/* we have performed an operation which
-	 * requires a complete flush of the tlb */
-				need_flush_all : 1;
+	/*
+	 * we are in the middle of an operation to clear
+	 * a full mm and can make some optimizations
+	 */
+	unsigned int		fullmm : 1;
+
+	/*
+	 * we have performed an operation which
+	 * requires a complete flush of the tlb
+	 */
+	unsigned int		need_flush_all : 1;
+
+	/*
+	 * we have removed page directories
+	 */
+	unsigned int		freed_tables : 1;
 
 	struct mmu_gather_batch *active;
 	struct mmu_gather_batch	local;
@@ -136,6 +146,7 @@ static inline void __tlb_reset_range(str
 		tlb->start = TASK_SIZE;
 		tlb->end = 0;
 	}
+	tlb->freed_tables = 0;
 }
 
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
@@ -269,6 +280,7 @@ static inline void tlb_remove_check_page
 #define pte_free_tlb(tlb, ptep, address)			\
 	do {							\
 		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pte_free_tlb(tlb, ptep, address);		\
 	} while (0)
 #endif
@@ -276,7 +288,8 @@ static inline void tlb_remove_check_page
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pmd_free_tlb(tlb, pmdp, address);		\
 	} while (0)
 #endif
@@ -286,6 +299,7 @@ static inline void tlb_remove_check_page
 #define pud_free_tlb(tlb, pudp, address)			\
 	do {							\
 		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__pud_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif
@@ -295,7 +309,8 @@ static inline void tlb_remove_check_page
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb->freed_tables = 1;			\
 		__p4d_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif
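
With a flag like that, an architecture's tlb_flush() could then do
something along the lines of the sketch below; arch_flush_tlb_range()
and arch_flush_walk_cache() are made-up names, purely for illustration:

	static inline void tlb_flush(struct mmu_gather *tlb)
	{
		/* Flush the TLB entries covered by this gather. */
		arch_flush_tlb_range(tlb->mm, tlb->start, tlb->end);

		/*
		 * Only flush the paging-structure (page-walk) caches when
		 * page directories were actually freed.
		 */
		if (tlb->freed_tables)
			arch_flush_walk_cache(tlb->mm, tlb->start, tlb->end);
	}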


* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
  2018-08-22 15:55   ` Peter Zijlstra
@ 2018-08-22 21:34   ` Linus Torvalds
  2018-08-23  8:46   ` Nicholas Piggin
  2 siblings, 0 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-22 21:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Will Deacon, Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, Nick Piggin, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 8:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -180,6 +180,7 @@ config X86
>         select HAVE_PERF_REGS
>         select HAVE_PERF_USER_STACK_DUMP
>         select HAVE_RCU_TABLE_FREE
> +       select HAVE_RCU_TABLE_INVALIDATE        if HAVE_RCU_TABLE_FREE

This is confusing. First you select HAVE_RCU_TABLE_FREE
unconditionally, and then you select HAVE_RCU_TABLE_INVALIDATE based
on that unconditional variable.

I can see why you do it, but that's because I see the next patch. On
its own it just looks like you have a drinking problem.
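
(For context, after patch 4 in this series the two selects end up
reading, roughly:

	select HAVE_RCU_TABLE_FREE		if PARAVIRT
	select HAVE_RCU_TABLE_INVALIDATE	if HAVE_RCU_TABLE_FREE

which is what the conditional form here is preparing for.)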

That said, I was waiting to see if this patch-set would get any
comments before applying it, but it's been mostly crickets... So I
think I'll just apply it and get this issue over and done with.

              Linus


* Re: [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches
  2018-08-22 15:30 ` [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches Peter Zijlstra
@ 2018-08-22 21:37   ` Linus Torvalds
  2018-08-22 22:11     ` Rik van Riel
  0 siblings, 1 reply; 96+ messages in thread
From: Linus Torvalds @ 2018-08-22 21:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Will Deacon, Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm

On Wed, Aug 22, 2018 at 8:46 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Revert [..] in order to simplify the TLB invalidate fixes for x86. We'll try again later.

Rik, I assume I should take your earlier "yeah, I can try later" as an
ack for this?

I'll wait a bit more in the hopes of getting reviews/acks, but I'm
basically chomping at the bit to just apply this series and have this
issue behind us.

             Linus


* Re: [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches
  2018-08-22 21:37   ` Linus Torvalds
@ 2018-08-22 22:11     ` Rik van Riel
  0 siblings, 0 replies; 96+ messages in thread
From: Rik van Riel @ 2018-08-22 22:11 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra
  Cc: Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Will Deacon, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm


On Wed, 2018-08-22 at 14:37 -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 8:46 AM Peter Zijlstra <peterz@infradead.org>
> wrote:
> > 
> > Revert [..] in order to simplify the TLB invalidate fixes for x86.
> > We'll try again later.
> 
> Rik, I assume I should take your earlier "yeah, I can try later" as
> an
> ack for this?

Yes, feel free to add my Acked-by: to all these
patches.

Patch 3/4 is not ideal, with the slow path in
tlb_remove_table sending two IPIs (one to every
CPU, one to the CPUs in the mm_cpumask), but
that is a slow path we should not be hitting
much anyway.

-- 
All Rights Reversed.



* Re: [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt
  2018-08-22 15:30 ` [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt Peter Zijlstra
@ 2018-08-22 22:12   ` Eduardo Valentin
  0 siblings, 0 replies; 96+ messages in thread
From: Eduardo Valentin @ 2018-08-22 22:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm

Hey Peter,

On Wed, Aug 22, 2018 at 05:30:16PM +0200, Peter Zijlstra wrote:
> If we don't use paravirt; don't play unnecessary and complicated games
> to free page-tables.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/Kconfig                      |    2 +-
>  arch/x86/hyperv/mmu.c                 |    2 ++
>  arch/x86/include/asm/paravirt.h       |    5 +++++
>  arch/x86/include/asm/paravirt_types.h |    3 +++
>  arch/x86/include/asm/tlbflush.h       |    3 +++
>  arch/x86/kernel/kvm.c                 |    5 ++++-
>  arch/x86/kernel/paravirt.c            |    2 ++
>  arch/x86/mm/pgtable.c                 |    8 ++++----
>  arch/x86/xen/mmu_pv.c                 |    1 +
>  9 files changed, 25 insertions(+), 6 deletions(-)
> 

<cut>

> --- a/arch/x86/xen/mmu_pv.c
> +++ b/arch/x86/xen/mmu_pv.c
> @@ -2397,6 +2397,7 @@ static const struct pv_mmu_ops xen_mmu_o
>  	.flush_tlb_kernel = xen_flush_tlb,
>  	.flush_tlb_one_user = xen_flush_tlb_one_user,
>  	.flush_tlb_others = xen_flush_tlb_others,
> +	.tlb_remove_table = tlb_remove_table,

I tried compiling the series, and I am getting the following compilation error:

  CC      arch/x86/xen/mmu_pv.o
arch/x86/xen/mmu_pv.c:2400:22: error: ‘tlb_remove_table’ undeclared here (not in a function); did you mean ‘bus_remove_file’?
  .tlb_remove_table = tlb_remove_table,
                      ^~~~~~~~~~~~~~~~
                      bus_remove_file
make[4]: *** [arch/x86/xen/mmu_pv.o] Error 1


which I solved by:
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index a21988ec30d3..3114b7c76970 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -67,6 +67,7 @@
 #include <asm/init.h>
 #include <asm/pat.h>
 #include <asm/smp.h>
+#include <asm/tlb.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>


>  
>  	.pgd_alloc = xen_pgd_alloc,
>  	.pgd_free = xen_pgd_free,
> 
> 
> 

-- 
All the best,
Eduardo Valentin


* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-22 15:30 ` [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition Peter Zijlstra
@ 2018-08-23  3:31   ` Nicholas Piggin
  2018-08-23  3:35     ` Linus Torvalds
  2018-08-23 13:40   ` Will Deacon
  1 sibling, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  3:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 17:30:14 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Will noted that only checking mm_users is incorrect; we should also
> check mm_count in order to cover CPUs that have a lazy reference to
> this mm (and could do speculative TLB operations).

Why is that incorrect?

This shortcut has nothing to do with no TLBs -- not sure about x86, but
other CPUs can certainly have remaining TLBs here, speculative
operations or not (even if they don't have an mm_count ref they can
have TLBs here).

So that leaves speculative operations. I don't see where the problem is
with those either -- this shortcut needs to ensure there are no other
*non speculative* operations. mm_users is correct for that.

If there is a speculation security problem here it should be carefully
documented otherwise it's going to be re-introduced...

I actually have a patch to extend this optimisation further that I'm
going to send out again today. It's nice to avoid the double handling
of the pages.

Thanks,
Nick

> 
> If removing this turns out to be a performance issue, we can
> re-instate a more complete check, but in tlb_table_flush() eliding the
> call_rcu_sched().
> 
> Cc: stable@kernel.org
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: David Miller <davem@davemloft.net>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Fixes: 267239116987 ("mm, powerpc: move the RCU page-table freeing into generic code")
> Reported-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  mm/memory.c |    9 ---------
>  1 file changed, 9 deletions(-)
> 
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -375,15 +375,6 @@ void tlb_remove_table(struct mmu_gather
>  {
>  	struct mmu_table_batch **batch = &tlb->batch;
>  
> -	/*
> -	 * When there's less then two users of this mm there cannot be a
> -	 * concurrent page-table walk.
> -	 */
> -	if (atomic_read(&tlb->mm->mm_users) < 2) {
> -		__tlb_remove_table(table);
> -		return;
> -	}
> -
>  	if (*batch == NULL) {
>  		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
>  		if (*batch == NULL) {
> 
> 



* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  3:31   ` Nicholas Piggin
@ 2018-08-23  3:35     ` Linus Torvalds
  2018-08-23  3:44       ` Linus Torvalds
  2018-08-23  4:16       ` Nicholas Piggin
  0 siblings, 2 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  3:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 8:31 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
>
> So that leaves speculative operations. I don't see where the problem is
> with those either -- this shortcut needs to ensure there are no other
> *non speculative* operations. mm_users is correct for that.

No. Because mm_users doesn't contain any lazy tlb users.

And yes, those lazy tlbs are all kernel threads, but they can still
speculatively load user addresses.
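
For reference, the lazy-TLB reference is taken roughly like this in
context_switch() (a paraphrased sketch, not an exact quote of the
scheduler code):

	if (!next->mm) {                        /* kernel thread */
		next->active_mm = prev->active_mm;
		mmgrab(prev->active_mm);        /* pins mm_count, not mm_users */
		enter_lazy_tlb(prev->active_mm, next);
	}

so a check on mm_users alone never sees such a lazy user, even though
its CPU can still cache translations for that mm.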

           Linus


* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  3:35     ` Linus Torvalds
@ 2018-08-23  3:44       ` Linus Torvalds
  2018-08-23  4:16       ` Nicholas Piggin
  1 sibling, 0 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  3:44 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 8:35 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> No. Because mm_users doesn't contain any lazy tlb users.

.. or, as it turns out, the use_mm() case either, which can do
gup_fast(). Oh well.

            Linus


* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:55   ` Peter Zijlstra
@ 2018-08-23  3:45     ` Nicholas Piggin
  2018-08-23  3:59       ` Linus Torvalds
  2018-08-23 23:31     ` Will Deacon
  1 sibling, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  3:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 17:55:27 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> > ARM
> > which later used this put an explicit TLB invalidate in their
> > __p*_free_tlb() functions, and PowerPC-radix followed that example.  
> 
> > +/*
> > + * If we want tlb_remove_table() to imply TLB invalidates.
> > + */
> > +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> > +{
> > +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> > +	/*
> > +	 * Invalidate page-table caches used by hardware walkers. Then we still
> > +	 * need to RCU-sched wait while freeing the pages because software
> > +	 * walkers can still be in-flight.
> > +	 */
> > +	__tlb_flush_mmu_tlbonly(tlb);
> > +#endif
> > +}  
> 
> 
> Nick, Will is already looking at using this to remove the synchronous
> invalidation from __p*_free_tlb() for ARM, could you have a look to see
> if PowerPC-radix could benefit from that too?

powerpc/radix has no such issue, it already does this tracking.

We were discussing this a couple of months ago, I wasn't aware of ARM's
issue but I suggested x86 could go the same way as powerpc. Would have
been good to get some feedback on some of the proposed approaches there.
Because it's not just pwc tracking but if you do this you don't need
those silly hacks in generic code to expand the TLB address range
either.

So powerpc has no fundamental problem with making this stuff generic.
If you need a fix for x86 and ARM for this merge window though, I would
suggest just copying what powerpc already has. Next time we can
consider consolidating them all into generic code.

Thanks,
Nick


> 
> Basically, using a patch like the below, would give your tlb_flush()
> information on if tables were removed or not.
> 
> ---
> 
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -96,12 +96,22 @@ struct mmu_gather {
>  #endif
>  	unsigned long		start;
>  	unsigned long		end;
> -	/* we are in the middle of an operation to clear
> -	 * a full mm and can make some optimizations */
> -	unsigned int		fullmm : 1,
> -	/* we have performed an operation which
> -	 * requires a complete flush of the tlb */
> -				need_flush_all : 1;
> +	/*
> +	 * we are in the middle of an operation to clear
> +	 * a full mm and can make some optimizations
> +	 */
> +	unsigned int		fullmm : 1;
> +
> +	/*
> +	 * we have performed an operation which
> +	 * requires a complete flush of the tlb
> +	 */
> +	unsigned int		need_flush_all : 1;
> +
> +	/*
> +	 * we have removed page directories
> +	 */
> +	unsigned int		freed_tables : 1;
>  
>  	struct mmu_gather_batch *active;
>  	struct mmu_gather_batch	local;
> @@ -136,6 +146,7 @@ static inline void __tlb_reset_range(str
>  		tlb->start = TASK_SIZE;
>  		tlb->end = 0;
>  	}
> +	tlb->freed_tables = 0;
>  }
>  
>  static inline void tlb_remove_page_size(struct mmu_gather *tlb,
> @@ -269,6 +280,7 @@ static inline void tlb_remove_check_page
>  #define pte_free_tlb(tlb, ptep, address)			\
>  	do {							\
>  		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pte_free_tlb(tlb, ptep, address);		\
>  	} while (0)
>  #endif
> @@ -276,7 +288,8 @@ static inline void tlb_remove_check_page
>  #ifndef pmd_free_tlb
>  #define pmd_free_tlb(tlb, pmdp, address)			\
>  	do {							\
> -		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
> +		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pmd_free_tlb(tlb, pmdp, address);		\
>  	} while (0)
>  #endif
> @@ -286,6 +299,7 @@ static inline void tlb_remove_check_page
>  #define pud_free_tlb(tlb, pudp, address)			\
>  	do {							\
>  		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__pud_free_tlb(tlb, pudp, address);		\
>  	} while (0)
>  #endif
> @@ -295,7 +309,8 @@ static inline void tlb_remove_check_page
>  #ifndef p4d_free_tlb
>  #define p4d_free_tlb(tlb, pudp, address)			\
>  	do {							\
> -		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
> +		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> +		tlb->freed_tables = 1;			\
>  		__p4d_free_tlb(tlb, pudp, address);		\
>  	} while (0)
>  #endif



* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  3:45     ` Nicholas Piggin
@ 2018-08-23  3:59       ` Linus Torvalds
  2018-08-23  4:33         ` Nicholas Piggin
  2018-08-23  4:54         ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  3:59 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> powerpc/radix has no such issue, it already does this tracking.

Yeah, I now realize that this was why you wanted to add that hacky
thing to the generic code, so that you can add the tlb_flush_pgtable()
call.

I thought it was because powerpc had some special flush instruction
for it, and the regular tlb flush didn't do it. But no. It was because
the regular code had lost the tlb flush _entirely_, because powerpc
didn't want it.

> We were discussing this a couple of months ago, I wasn't aware of ARM's
> issue but I suggested x86 could go the same way as powerpc.

The problem is that x86 _used_ to do this all correctly long long ago.

And then we switched over to the "generic" table flushing (which
harkens back to the powerpc code).

Which actually turned out to be not generic at all, and did not flush
the internal pages like x86 used to (back when x86 just used
tlb_remove_page for everything).

So as a result, x86 had unintentionally lost the TLB flush we used to
have, because tlb_remove_table() had lost the tlb flushing because of
a powerpc quirk.

You then added it back as a hacky per-architecture hook (apparently
having realized that you never did it at all), which didn't fix the
unintentional lack of flushing on x86.

So now we're going to do it right.  No more "oh, powerpc didn't need
to flush because the hash tables weren't in the tlb at all" thing in
the generic code that then others need to work around.

              Linus


* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  3:35     ` Linus Torvalds
  2018-08-23  3:44       ` Linus Torvalds
@ 2018-08-23  4:16       ` Nicholas Piggin
  2018-08-23  4:54         ` Linus Torvalds
  1 sibling, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  4:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 20:35:16 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 8:31 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> >
> > So that leaves speculative operations. I don't see where the problem is
> > with those either -- this shortcut needs to ensure there are no other
> > *non speculative* operations. mm_users is correct for that.  
> 
> No. Because mm_users doesn't contain any lazy tlb users.
> 
> And yes, those lazy tlbs are all kernel threads, but they can still
> speculatively load user addresses.

So?

If the arch does not shoot those all down after the user page tables
are removed then it's buggy regardless of this short cut.

The only real problem I could see would be if a page walk cache still
points to the freed table, then the table gets re-allocated and used
elsewhere, and meanwhile a speculative access tries to load an entry
from the page that is an invalid form of page table that might cause
a machine check or something. That would be (u)arch specific, but if
that's what we're concerned with here it's a different issue and needs
to be documented as such.

I'll have a look at powerpc and see if we can cope with it. If so, I'll
make it an arch specific opt-in short cut.

Thanks,
Nick


* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  3:59       ` Linus Torvalds
@ 2018-08-23  4:33         ` Nicholas Piggin
  2018-08-23  5:03           ` Linus Torvalds
  2018-08-23  4:54         ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  4:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 20:59:46 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > powerpc/radix has no such issue, it already does this tracking.  
> 
> Yeah, I now realize that this was why you wanted to add that hacky
> thing to the generic code, so that you can add the tlb_flush_pgtable()
> call.
> 
> I thought it was because powerpc had some special flush instruction
> for it, and the regular tlb flush didn't do it. But no.

powerpc/radix does have a special instruction for it, that is why I
posted the patch :)

> It was because
> the regular code had lost the tlb flush _entirely_, because powerpc
> didn't want it.

I think that was long before I started looking at the code.
powerpc/hash hardware has no idea about the page tables so yeah they
don't need it.

> 
> > We were discussing this a couple of months ago, I wasn't aware of ARM's
> > issue but I suggested x86 could go the same way as powerpc.  
> 
> The problem is that x86 _used_ to do this all correctly long long ago.
> 
> And then we switched over to the "generic" table flushing (which
> harkens back to the powerpc code).
> 
> Which actually turned out to be not generic at all, and did not flush
> the internal pages like x86 used to (back when x86 just used
> tlb_remove_page for everything).
> 
> So as a result, x86 had unintentionally lost the TLB flush we used to
> have, because tlb_remove_table() had lost the tlb flushing because of
> a powerpc quirk.
> 
> You then added it back as a hacky per-architecture hook (apparently
> having realized that you never did it at all), which didn't fix the

I think it was quite well understood and fixed here: a145abf12c9, but
again that was before I really started looking at it.

The hooks I added recently are for a different reason, and it's
actually the opposite problem -- to work around the hacky generic code
that x86 foisted on other archs.

> unintentional lack of flushing on x86.
> 
> So now we're going to do it right.  No more "oh, powerpc didn't need
> to flush because the hash tables weren't in the tlb at all" thing in
> the generic code that then others need to work around.

I don't really understand what issue you have with powerpc here.
powerpc hash has the page table flushing accessors which are just
no-ops, it's the generic code that fails to call them properly. Surely
there was no powerpc patch that removed those calls from generic code?

powerpc/radix yes it does some arch specific things to do its page walk
cache flushing, but it is a better design than the hacks x86 has in
generic code, surely. I thought you basically agreed and thought x86 /
generic code could move to that kind of model.

Thanks,
Nick


* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  3:59       ` Linus Torvalds
  2018-08-23  4:33         ` Nicholas Piggin
@ 2018-08-23  4:54         ` Benjamin Herrenschmidt
  2018-08-23  5:11           ` Linus Torvalds
  2018-08-24  8:35           ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
  1 sibling, 2 replies; 96+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-23  4:54 UTC (permalink / raw)
  To: Linus Torvalds, Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, 2018-08-22 at 20:59 -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 8:45 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> > 
> > powerpc/radix has no such issue, it already does this tracking.
> 
> Yeah, I now realize that this was why you wanted to add that hacky
> thing to the generic code, so that you can add the tlb_flush_pgtable()
> call.
> 
> I thought it was because powerpc had some special flush instruction
> for it, and the regular tlb flush didn't do it. But no. It was because
> the regular code had lost the tlb flush _entirely_, because powerpc
> didn't want it.

Heh :-) Well, back on hash we didn't need it (we do now with Radix) but I
wouldn't blame us for the generic code being broken ... the RCU table
freeing was in arch/powerpc at the time :-) I don't think it was us
making it generic :)

> > We were discussing this a couple of months ago, I wasn't aware of ARM's
> > issue but I suggested x86 could go the same way as powerpc.
> 
> The problem is that x86 _used_ to do this all correctly long long ago.
> 
> And then we switched over to the "generic" table flushing (which
> harkens back to the powerpc code).

Yes, we wrote the RCU stuff to solve the races with SW walking,
which is completely orthogonal to HW walking & TLB content. We didn't
do the move to generic code though ;-)

> Which actually turned out to be not generic at all, and did not flush
> the internal pages like x86 used to (back when x86 just used
> tlb_remove_page for everything).

Well, having RCU do the flushing is rather generic, it makes sense
whenever there's somebody doing a SW walk *and* you don't have IPIs to
synchronize your flushes (ie, anybody with HW TLB invalidation
broadcast basically, so ARM and us).

> So as a result, x86 had unintentionally lost the TLB flush we used to
> have, because tlb_remove_table() had lost the tlb flushing because of
> a powerpc quirk.

This is a somewhat odd way of putting the "blame" :-) But yeah ok...

> You then added it back as a hacky per-architecture hook (apparently
> having realized that you never did it at all), which didn't fix the
> unintentional lack of flushing on x86.
> 
> So now we're going to do it right.  No more "oh, powerpc didn't need
> to flush because the hash tables weren't in the tlb at all" thing in
> the generic code that then others need to work around.

So we do need a different flush instruction for the page tables vs. the
normal TLB pages. 

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  4:16       ` Nicholas Piggin
@ 2018-08-23  4:54         ` Linus Torvalds
  2018-08-23  5:15           ` Nicholas Piggin
  2018-08-24  8:42           ` Peter Zijlstra
  0 siblings, 2 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  4:54 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 9:16 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> On Wed, 22 Aug 2018 20:35:16 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >
> > And yes, those lazy tlbs are all kernel threads, but they can still
> > speculatively load user addresses.
>
> So?
>
> If the arch does not shoot those all down after the user page tables
> are removed then it's buggy regardless of this short cut.

Read the code.

That shortcut frees the pages *WITHOUT* doing that TLB flush. It just
does __tlb_remove_table(), which does *not* do that whole page
queueing so that we can batch flush and then free later.

So the fast-case is buggy, exactly because of the reason you state.

The logic used to be that if you were the only cpu that had that tlb,
and it was a mid-level table, it didn't need that synchronization at
all.

And that logic is simply wrong. Exactly because even if mm_users is 1,
there can be things looking up TLB entries on other CPU's. Either
because of a lazy mm and a hw walker with speculation, or because of
use_mm() and a software walker.

So the whole "free immediately" shortcut was bogus. You *always* need
to queue, then flush the tlb, and then free.

That said, that wasn't actually the bug that Jann found. He found the
bug a few lines lower down, where the "I can't allocate memory for
queueing" ended up *also* not flushing the TLB.

> The only real problem I could see would be if a page walk cache still
> points to the freed table, then the table gets re-allocated and used
> elsewhere, and meanwhile a speculative access tries to load an entry
> from the page that is an invalid form of page table that might cause
> a machine check or something. That would be (u)arch specific, but if
> that's what we're concerned with here it's a different issue and needs
> to be documented as such.

We've *seen* that case, we had exactly that when we were being
aggressive about trying to avoid flushes for the lazy mmu case
entirely, because "we can just flush when we activate the lazy mm
state instead".

The logic was actually solid from a TLB standpoint - who cares what garbage
TLB entries we might speculatively fill, when we're going to flush
them before they can possibly be used.

It turns out that that logic is solid, but hits another problem: at
least some AMD CPU's will cause machine checks when the TLB entries
are inconsistent with the machine setup. That can't happen with a
*good* page table, but when you speculatively walk already freed
tables, you can get any garbage.

I forget what the exact trigger was, but this was actually reported.
So you can't free page directory pages without flushing the tlb first
(to make sure that internal tlb node caches are flushed).

So the logic for freeing leaf pages and freeing middle nodes has to be
exactly the same: you make the modification to the page table to
remove the node/leaf, you queue up the memory for freeing, you flush
the tlb, and you then free the page.
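
In code, that ordering is roughly the following (an illustrative sketch
only; queue_for_freeing() is a made-up stand-in for the real batching
helpers, not an actual mm/ function):

        static void remove_table_page(struct mmu_gather *tlb, struct page *page)
        {
                /* 1) the caller has already unhooked the entry from the page table */

                /* 2) queue the page; never free it directly here */
                if (queue_for_freeing(tlb, page)) {
                        /* batch is now full: TLB invalidate first, then free the batch */
                        tlb_flush_mmu(tlb);
                }

                /*
                 * 3) the actual free only happens after the TLB invalidate in
                 *    tlb_flush_mmu() / tlb_finish_mmu(), so no hardware or
                 *    software walker can still reach the old page.
                 */
        }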

That's the ordering that tlb_remove_page() has always honored, but
that tlb_remove_table() didn't.

It honored it for the *normal* case, which is why it took so long to
notice that the TLB shootdown had been broken on x86 when it moved to
the "generic" code. The *normal* case does this all right, and batches
things up, and then when the batch fills up it does a
tlb_table_flush() which does the TLB flush and schedules the actual
freeing.

But there were two cases that *didn't* do that. The special "I'm the
only thread" fast case, and the "oops I ran out of memory, so now I'll
fake it, and just synchronize with page table walkers manually, and then do
that special direct remove without flushing the tlb".

NOTE! Jann triggered that one by
 (a) forcing the machine low on memory
 (b) force-poisoning the page tables immediately after free

I suspect it's really hard to trigger under normal loads, exactly
because the *normal* case gets it right. It's literally the "oops, I
can't batch up because I ran out of memory" case that gets it wrong.

(And the special single-thread + lazy or use_mm() case, but that's
going to be entirely impossible to trigger, because in practice it's
just a single thread, and you won't ever hit the magic timing needed
that frees the page in the single thread at exactly the same time that
some optimistic lazy mm on another cpu happens to speculatively load
that address).

So the "atomic_read(&mm_users)" case is likely entirely impossible to
trigger any sane way. But because Jann found the problem with the "ran
out of memory" case, we started looking at the more theoretical cases
that matched the same kind of "no tlb flush before free" pattern.

              Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  4:33         ` Nicholas Piggin
@ 2018-08-23  5:03           ` Linus Torvalds
  2018-08-23  5:58             ` Nicholas Piggin
  0 siblings, 1 reply; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  5:03 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 9:33 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> I think it was quite well understood and fixed here, a145abf12c9 but
> again that was before I really started looking at it.

You don't understand the problem.

All the x86 people thought WE ALREADY DID THAT.

Because we had done this all correctly over a decade ago!

Nobody realized that it had been screwed up by the powerpc code, and
the commit you point to was believed to be a new *powerpc* only issue,
because the semantics on powerpc has changed because of the radix
tree.

The semantics on x86 have never changed, they've always been the same.
So why would the x86 people react to powerpc doing something that x86
had already always done.

See?

Nobody cared one whit about commit a145abf12c9, because it just
handles a new powerpc-specific case.

> I don't really understand what the issue you have with powerpc here.
> powerpc hash has the page table flushing accessors which are just
> no-ops, it's the generic code that fails to call them properly. Surely
> there was no powerpc patch that removed those calls from generic code?

Yes there was.

Look where the generic code *came* from.

It's powerpc code.

See commit 267239116987 ("mm, powerpc: move the RCU page-table freeing
into generic code").

The powerpc code was made into the generic code, because the powerpc
code had to handle all those special RCU freeing things etc that
others didn't.

It's just that when x86 was then switched over to use the "generic"
code, people didn't realize that the generic code didn't do the TLB
invalidations for page tables, because they hadn't been needed on
powerpc.

So the powerpc code that was made generic, never really was. The new
"generic" code had a powerpc-specific quirk.

That then very subtly broke x86, without the x86 people ever
realizing. Because the old simple non-RCU x86 code had never had that
issue, it just treated the leaf pages and the directory pages exactly
the same.

See?

And THAT is why I talk about the powerpc code. Because what is
"generic" code in 4.18 (and several releases before) oisn't actually
generic.

And that's basically exactly the bug that the patches from PeterZ are
fixing. Making the "tlb_remove_table()" code always flush the tlb, the
way it should have when it was made generic.

              Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  4:54         ` Benjamin Herrenschmidt
@ 2018-08-23  5:11           ` Linus Torvalds
  2018-08-23  5:20             ` Linus Torvalds
                               ` (2 more replies)
  2018-08-24  8:35           ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
  1 sibling, 3 replies; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  5:11 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Nick Piggin, Peter Zijlstra, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
>
>
> So we do need a different flush instruction for the page tables vs. the
> normal TLB pages.

Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
invalidates all the internal tlb cache nodes.

So the "new world order" is exactly that patch that PeterZ sent you, that adds a

+       unsigned int            freed_tables : 1;

to the 'struct mmu_gather', and then makes all those
pte/pmd/pud/p4d_free_tlb() functions set that bit.

So I'm referring to the email PeterZ sent you in this thread that said:

  Nick, Will is already looking at using this to remove the synchronous
  invalidation from __p*_free_tlb() for ARM, could you have a look to see
  if PowerPC-radix could benefit from that too?

  Basically, using a patch like the below, would give your tlb_flush()
  information on if tables were removed or not.

then, in that model, you do *not* need to override these
pte/pmd/pud/p4d_free_tlb() macros at all (well, you *can* if you want
to, for doing games with the range modification, but let's say that
you don't need that right now).

So instead, when you get to the actual "tlb_flush(tlb)", you do
exactly that - flush the tlb. And the mmu_gather structure shows you
how much you need to flush. If you see that "freed_tables" is set,
then you know that you need to also do the special instruction to
flush the inner level caches. The range continues to show the page
range.
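
In other words, an arch tlb_flush() in that model ends up looking
something like this (hypothetical sketch only, assuming the freed_tables
bit above; flush_range() and flush_page_walk_cache() are placeholders
for whatever instructions the architecture actually has):

        static inline void tlb_flush(struct mmu_gather *tlb)
        {
                if (tlb->fullmm || tlb->need_flush_all) {
                        /* nuke the whole address space */
                        flush_tlb_mm(tlb->mm);
                        return;
                }

                /* the normal page-range invalidate */
                flush_range(tlb->mm, tlb->start, tlb->end);

                /* page tables were freed: also zap the page walk caches */
                if (tlb->freed_tables)
                        flush_page_walk_cache(tlb->mm);
        }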

           Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  4:54         ` Linus Torvalds
@ 2018-08-23  5:15           ` Nicholas Piggin
  2018-08-24  8:42           ` Peter Zijlstra
  1 sibling, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  5:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 21:54:48 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 9:16 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > On Wed, 22 Aug 2018 20:35:16 -0700
> > Linus Torvalds <torvalds@linux-foundation.org> wrote:  
> > >
> > > And yes, those lazy tlbs are all kernel threads, but they can still
> > > speculatively load user addresses.  
> >
> > So?
> >
> > If the arch does not shoot those all down after the user page tables
> > are removed then it's buggy regardless of this short cut.  
> 
> Read the code.
> 
> That shortcut frees the pages *WITHOUT* doing that TLB flush.

I've read the code and I understand that. Hence I'm asking because I
didn't think the changelog matched the change. But possibly it was
because I actually didn't read the changelog enough -- I guess it does
say the TLB operation is the problem. I may have got sidetracked by
the word speculative.

> It just
> does __tlb_remove_table(), which does *not* do that whole page
> queueing so that we can batch flush and then free later.
> 
> So the fast-case is buggy, exactly because of the reason you state.

Okay sure, thanks for confirming it for me. I would ask for the changelog
to be slightly expanded, but maybe it's just my reading
comprehension that needs improvement...

> > The only real problem I could see would be if a page walk cache still
> > points to the freed table, then the table gets re-allocated and used
> > elsewhere, and meanwhile a speculative access tries to load an entry
> > from the page that is an invalid form of page table that might cause
> > a machine check or something. That would be (u)arch specific, but if
> > that's what we're concerned with here it's a different issue and needs
> > to be documented as such.  
> 
> We've *seen* that case, we had exactly that when we were being
> aggressive about trying to avoid flushes for the lazy mmu case
> entirely, because "we can just flush when we activate the lazy mm
> state instead".
> 
> The logic was actually solid from a TLB standpoint - who cares what garbage
> TLB entries we might speculatively fill, when we're going to flush
> them before they can possibly be used.
> 
> It turns out that that logic is solid, but hits another problem: at
> least some AMD CPU's will cause machine checks when the TLB entries
> are inconsistent with the machine setup. That can't happen with a
> *good* page table, but when you speculatively walk already freed
> tables, you can get any garbage.

Yeah that does make sense.

> 
> I forget what the exact trigger was, but this was actually reported.
> So you can't free page directory pages without flushing the tlb first
> (to make sure that internal tlb node caches are flushed).
> 
> So the logic for freeing leaf pages and freeing middle nodes has to be
> exactly the same: you make the modification to the page table to
> remove the node/leaf, you queue up the memory for freeing, you flush
> the tlb, and you then free the page.
> 
> That's the ordering that tlb_remove_page() has always honored, but
> that tlb_remove_table() didn't.
> 
> It honored it for the *normal* case, which is why it took so long to
> notice that the TLB shootdown had been broken on x86 when it moved to
> the "generic" code. The *normal* case does this all right, and batches
> things up, and then when the batch fills up it does a
> tlb_table_flush() which does the TLB flush and schedules the actual
> freeing.
> 
> But there were two cases that *didn't* do that. The special "I'm the
> only thread" fast case, and the "oops I ran out of memory, so now I'll
> fake it, and just synchronize with page table walkers manually, and then do
> that special direct remove without flushing the tlb".
> 
> NOTE! Jann triggered that one by
>  (a) forcing the machine low on memory
>  (b) force-poisoning the page tables immediately after free
> 
> I suspect it's really hard to trigger under normal loads, exactly
> because the *normal* case gets it right. It's literally the "oops, I
> can't batch up because I ran out of memory" case that gets it wrong.
> 
> (And the special single-thread + lazy or use_mm() case, but that's
> going to be entirely impossible to trigger, because in practice it's
> just a single thread, and you won't ever hit the magic timing needed
> that frees the page in the single thread at exactly the same time that
> some optimistic lazy mm on another cpu happens to speculatively load
> that address).
> 
> So the "atomic_read(&mm_users)" case is likely entirely impossible to
> trigger any sane way. But because Jann found the problem with the "ran
> out of memory" case, we started looking at the more theoretical cases
> that matched the same kind of "no tlb flush before free" pattern.

Thanks for giving that background. In that case I'm happy with this fix.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:11           ` Linus Torvalds
@ 2018-08-23  5:20             ` Linus Torvalds
  2018-08-23  6:48               ` Martin Schwidefsky
  2018-08-23  5:21             ` Benjamin Herrenschmidt
  2018-08-23 13:39             ` Will Deacon
  2 siblings, 1 reply; 96+ messages in thread
From: Linus Torvalds @ 2018-08-23  5:20 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Nick Piggin, Peter Zijlstra, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 10:11 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

Note that this obviously works fine for a hashed table model too - you
just ignore the "freed_tables" bit entirely and continue to do
whatever you always did.

And we can ignore it on x86 too, because we just see the range, and we
invalidate the range, and that will always invalidate the mid-level
caching too.

So the new bit is literally for arm and powerpc-radix (and maybe
s390), but we want to make the actual VM interface truly generic and
not have one set of code with five different behaviors (which we
_currently_ kind of have with all the
HAVE_RCU_TABLE_FREE etc config options that modify how the code works).

It would be good to also cut down on the millions of functions that
each architecture can override, because Christ, it got very confusing
at times to follow just how the code flowed from generic code to
architecture-specific macros back to generic code and then
arch-specific inline helper functions.

It's a maze of underscores and "page" vs "table", and "flush" vs "remove" etc.

But that "it would be good to really make everybody to use as much of
the generic code as possible" and everybody have the same pattern,
that's a future thing. But the whole "let's just add that
"freed_tables" thing would be part of trying to get people to use the
same overall pattern, even if some architectures might not care about
that detail.

              Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:11           ` Linus Torvalds
  2018-08-23  5:20             ` Linus Torvalds
@ 2018-08-23  5:21             ` Benjamin Herrenschmidt
  2018-08-23  6:15               ` Nicholas Piggin
  2018-08-23 13:39             ` Will Deacon
  2 siblings, 1 reply; 96+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-23  5:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nick Piggin, Peter Zijlstra, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, 2018-08-22 at 22:11 -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
> > 
> > 
> > So we do need a different flush instruction for the page tables vs. the
> > normal TLB pages.
> 
> Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> invalidates all the internal tlb cache nodes.
> 
> So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> 
> +       unsigned int            freed_tables : 1;
> 

 .../...

> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

Yup. That looks like a generic version of the "need_flush_all" flag we
have, which is fine by us.

Just don't blame powerpc for all the historical crap :-)

Cheers,
Ben.

> 
>            Linus


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:03           ` Linus Torvalds
@ 2018-08-23  5:58             ` Nicholas Piggin
  0 siblings, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  5:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 22:03:40 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 9:33 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > I think it was quite well understood and fixed here, a145abf12c9 but
> > again that was before I really started looking at it.  
> 
> You don't understand the problem.

More fundamentally, I think I didn't understand this fix; I think
powerpc/radix actually does have a bug here. a145abf12c9 was really
just a replacement for x86's hack of expanding the TLB invalidation
range when freeing page tables to capture the page walk cache
(powerpc/radix needs a different instruction, so that didn't work for us).

But I hadn't really looked at this fix closely rather Peter's follow up
post about making powerpc page walk cache flushing design a generic
concept.

My point in this reply was more that my patches from the other month
weren't a blundering attempt to fix this bug without realising it; they
were purely about avoiding the x86 TLB range-expanding hack (which won't
be needed if generic users all move over).

> 
> All the x86 people thought WE ALREADY DID THAT.
> 
> Because we had done this all correctly over a decade ago!
> 
> Nobody realized that it had been screwed up by the powerpc code, and

The powerpc/hash code is not screwed up though AFAIKS. You can't
take arch specific code and slap a "generic" label on it, least of all
the crazy powerpc/hash code, you of all people would agree with that :)

> the commit you point to was believed to be a new *powerpc* only issue,
> because the semantics on powerpc has changed because of the radix
> tree.
> 
> The semantics on x86 have never changed, they've always been the same.
> So why would the x86 people react to powerpc doing something that x86
> had already always done.
> 
> See?
> 
> Nobody cared one whit about commit a145abf12c9, because it just
> handles a new powerpc-specific case.
> 
> > I don't really understand what the issue you have with powerpc here.
> > powerpc hash has the page table flushing accessors which are just
> > no-ops, it's the generic code that fails to call them properly. Surely
> > there was no powerpc patch that removed those calls from generic code?  
> 
> Yes there was.
> 
> Look where the generic code *came* from.
> 
> It's powerpc code.
> 
> See commit 267239116987 ("mm, powerpc: move the RCU page-table freeing
> into generic code").
> 
> The powerpc code was made into the generic code, because the powerpc
> code had to handle all those special RCU freeing things etc that
> others didn't.
> 
> It's just that when x86 was then switched over to use the "generic"
> code, people didn't realize that the generic code didn't do the TLB
> invalidations for page tables, because they hadn't been needed on
> powerpc.

Sure, there was a minor bug in the port. Not that it was a closely
guarded secret that powerpc didn't flush page table pages, but it's a
relatively subtle issue in complex code. That happens.

> 
> So the powerpc code that was made generic, never really was. The new
> "generic" code had a powerpc-specific quirk.
> 
> That then very subtly broke x86, without the x86 people ever
> realizing. Because the old simple non-RCU x86 code had never had that
> issue, it just treated the leaf pages and the directory pages exactly
> the same.
> 
> See?
> 
> And THAT is why I talk about the powerpc code. Because what is
> "generic" code in 4.18 (and several releases before) oisn't actually
> generic.
> 
> And that's basically exactly the bug that the patches from PeterZ are
> fixing. Making the "tlb_remove_table()" code always flush the tlb, the
> way it should have when it was made generic.

It just sounded like you were blaming correct powerpc/hash code for
this. It's just a minor bug in taking that code into generic, not really
a big deal, right? Or are you saying powerpc devs or code could be doing
something better to play nicer with the rest of the archs?

Honestly trying to improve things here, and encouraged by x86 and ARM
looking to move over to a saner page walk cache tracking design and
sharing more code with powerpc/radix. I would help with reviewing
things or writing code or porting powerpc bits if I can.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:21             ` Benjamin Herrenschmidt
@ 2018-08-23  6:15               ` Nicholas Piggin
  0 siblings, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  6:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linus Torvalds, Peter Zijlstra, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Thu, 23 Aug 2018 15:21:30 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2018-08-22 at 22:11 -0700, Linus Torvalds wrote:
> > On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:  
> > > 
> > > 
> > > So we do need a different flush instruction for the page tables vs. the
> > > normal TLB pages.  
> > 
> > Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> > invalidates all the internal tlb cache nodes.
> > 
> > So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> > 
> > +       unsigned int            freed_tables : 1;
> >   
> 
>  .../...
> 
> > So instead, when you get to the actual "tlb_flush(tlb)", you do
> > exactly that - flush the tlb. And the mmu_gather structure shows you
> > how much you need to flush. If you see that "freed_tables" is set,
> > then you know that you need to also do the special instruction to
> > flush the inner level caches. The range continues to show the page
> > range.  
> 
> Yup. That looks like a generic version of the "need_flush_all" flag we
> have, which is fine by us.
> 
> Just don't blame powerpc for all the historical crap :-)

And yes we very much want to remove the x86 hacks from generic code and
have them use the sane powerpc/radix page walk cache flushing model.
That would indeed allow us to stop overriding those macros and start
sharing more code with other archs. We can help write or review code to
make sure bugs don't creep in when moving it to generic implementation.
It will be much more relevant to us now because radix is very similar to
x86.

The hack is not the powerpc override macros though, let's be clear
about that. Even x86 will be helped out by removing that crap, because
it won't have to do a full TLB flush caused by a massive TLB range when it
frees zero to a small number of pages that happen to also free some page
tables.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:20             ` Linus Torvalds
@ 2018-08-23  6:48               ` Martin Schwidefsky
  0 siblings, 0 replies; 96+ messages in thread
From: Martin Schwidefsky @ 2018-08-23  6:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Benjamin Herrenschmidt, Nick Piggin, Peter Zijlstra,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Will Deacon, Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Michael Ellerman

On Wed, 22 Aug 2018 22:20:32 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Aug 22, 2018 at 10:11 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > So instead, when you get to the actual "tlb_flush(tlb)", you do
> > exactly that - flush the tlb. And the mmu_gather structure shows you
> > how much you need to flush. If you see that "freed_tables" is set,
> > then you know that you need to also do the special instruction to
> > flush the inner level caches. The range continues to show the page
> > range.  
> 
> Note that this obviously works fine for a hashed table model too - you
> just ignore the "freed_tables" bit entirely and continue to do
> whatever you always did.
> 
> And we can ignore it on x86 too, because we just see the range, and we
> invalidate the range, and that will always invalidate the mid-level
> caching too.
> 
> So the new bit is literally for arm and powerpc-radix (and maybe
> s390), but we want to make the actual VM interface truly generic and
> not have one set of code with five different behaviors (which we
> _currently_ kind of have with all the
> HAVE_RCU_TABLE_FREE etc config options that modify how the code works).
> 
> It would be good to also cut down on the millions of functions that
> each architecture can override, because Christ, it got very confusing
> at times to follow just how the code flowed from generic code to
> architecture-specific macros back to generic code and then
> arch-specific inline helper functions.
> 
> It's a maze of underscores and "page" vs "table", and "flush" vs "remove" etc.
> 
> But that "it would be good to really make everybody to use as much of
> the generic code as possible" and everybody have the same pattern,
> that's a future thing. But the whole "let's just add that
> "freed_tables" thing would be part of trying to get people to use the
> same overall pattern, even if some architectures might not care about
> that detail.

For s390 the new freed_tables bit looks to be a step in the right
direction. Right now there is the flush_mm bit in the mm->context
that I use to keep track of the need for a global flush of the mm.
The flush_mm bit is currently used for both lazy PTE flushing and
TLB flushes for freed page table pages.

I'll look into it once the generic patch is merged. The switch to
the generic mmu_gather is certainly not a change that can be done
in a hurry, we had too many subtle TLB flush issues in the past.

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
  2018-08-22 15:55   ` Peter Zijlstra
  2018-08-22 21:34   ` Linus Torvalds
@ 2018-08-23  8:46   ` Nicholas Piggin
  2 siblings, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-23  8:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, will.deacon, riel, jannh, ascannell,
	dave.hansen, linux-kernel, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, 22 Aug 2018 17:30:15 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Jann reported that x86 was missing required TLB invalidates when he
> hit the !*batch slow path in tlb_remove_table().
> 
> This is indeed the case; RCU_TABLE_FREE does not provide TLB (cache)
> invalidates, the PowerPC-hash where this code originated and the
> Sparc-hash where this was subsequently used did not need that. ARM
> which later used this put an explicit TLB invalidate in their
> __p*_free_tlb() functions, and PowerPC-radix followed that example.

So this is interesting, I _think_ a145abf12c did fix this bug for
powerpc, but then it seems to have been re-broken by a46cc7a90f
because that one defers the flush back to tlb_flush time. There
was quite a lot of churn getting the radix MMU off the ground at
that point though so I'm not 100% sure.

But AFAIKS powerpc today has this same breakage, and this patch
should fix it.

I have a couple of patches that touch the same code I'll send, you
might have some opinions on them.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  5:11           ` Linus Torvalds
  2018-08-23  5:20             ` Linus Torvalds
  2018-08-23  5:21             ` Benjamin Herrenschmidt
@ 2018-08-23 13:39             ` Will Deacon
  2018-08-24  8:47               ` Peter Zijlstra
  2 siblings, 1 reply; 96+ messages in thread
From: Will Deacon @ 2018-08-23 13:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Benjamin Herrenschmidt, Nick Piggin, Peter Zijlstra,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 10:11:41PM -0700, Linus Torvalds wrote:
> On Wed, Aug 22, 2018 at 9:54 PM Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
> >
> >
> > So we do need a different flush instruction for the page tables vs. the
> > normal TLB pages.
> 
> Right. ARM wants it too. x86 is odd in that a regular "invlpg" already
> invalidates all the internal tlb cache nodes.
> 
> So the "new world order" is exactly that patch that PeterZ sent you, that adds a
> 
> +       unsigned int            freed_tables : 1;
> 
> to the 'struct mmu_gather', and then makes all those
> pte/pmd/pud/p4d_free_tlb() functions set that bit.
> 
> So I'm referring to the email PeterZ sent you in this thread that said:
> 
>   Nick, Will is already looking at using this to remove the synchronous
>   invalidation from __p*_free_tlb() for ARM, could you have a look to see
>   if PowerPC-radix could benefit from that too?
> 
>   Basically, using a patch like the below, would give your tlb_flush()
>   information on if tables were removed or not.
> 
> then, in that model, you do *not* need to override these
> pte/pmd/pud/p4d_free_tlb() macros at all (well, you *can* if you want
> to, for doing games with the range modification, but let's say that
> you don't need that right now).
> 
> So instead, when you get to the actual "tlb_flush(tlb)", you do
> exactly that - flush the tlb. And the mmu_gather structure shows you
> how much you need to flush. If you see that "freed_tables" is set,
> then you know that you need to also do the special instruction to
> flush the inner level caches. The range continues to show the page
> range.

The only problem with this approach is that we've lost track of the granule
size by the point we get to the tlb_flush(), so we can't adjust the stride of
the TLB invalidations for huge mappings, which actually works nicely in the
synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
rather than iterating over it at a 4k granule).

One thing we could do is switch to synchronous mode if we detect a change in
granule (i.e. treat it like a batch failure).
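
Something like the below, presumably (sketch only, mirroring what the
powerpc tlb_remove_check_page_size_change() hunk visible in the patch
later in this thread already does; the function name here is made up):

        static inline void tlb_track_page_size(struct mmu_gather *tlb,
                                               unsigned int page_size)
        {
                if (!tlb->page_size) {
                        /* first page of this batch sets the granule */
                        tlb->page_size = page_size;
                } else if (tlb->page_size != page_size) {
                        /* granule changed: treat it like a batch failure */
                        if (!tlb->fullmm)
                                tlb_flush_mmu(tlb);
                        tlb->page_size = page_size;
                }
        }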

Will

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-22 15:30 ` [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition Peter Zijlstra
  2018-08-23  3:31   ` Nicholas Piggin
@ 2018-08-23 13:40   ` Will Deacon
  1 sibling, 0 replies; 96+ messages in thread
From: Will Deacon @ 2018-08-23 13:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, riel, jannh, ascannell, dave.hansen,
	linux-kernel, linux-mm, Nicholas Piggin, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 05:30:14PM +0200, Peter Zijlstra wrote:
> Will noted that only checking mm_users is incorrect; we should also
> check mm_count in order to cover CPUs that have a lazy reference to
> this mm (and could do speculative TLB operations).
> 
> If removing this turns out to be a performance issue, we can
> re-instate a more complete check, but in tlb_table_flush() eliding the
> call_rcu_sched().
> 
> Cc: stable@kernel.org
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: David Miller <davem@davemloft.net>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Fixes: 267239116987 ("mm, powerpc: move the RCU page-table freeing into generic code")
> Reported-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  mm/memory.c |    9 ---------
>  1 file changed, 9 deletions(-)

Acked-by: Will Deacon <will.deacon@arm.com>

Cheers,

Will

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-22 15:55   ` Peter Zijlstra
  2018-08-23  3:45     ` Nicholas Piggin
@ 2018-08-23 23:31     ` Will Deacon
  1 sibling, 0 replies; 96+ messages in thread
From: Will Deacon @ 2018-08-23 23:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: torvalds, luto, x86, bp, riel, jannh, ascannell, dave.hansen,
	linux-kernel, linux-mm, Nicholas Piggin, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 05:55:27PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 22, 2018 at 05:30:15PM +0200, Peter Zijlstra wrote:
> > ARM
> > which later used this put an explicit TLB invalidate in their
> > __p*_free_tlb() functions, and PowerPC-radix followed that example.
> 
> > +/*
> > + * If we want tlb_remove_table() to imply TLB invalidates.
> > + */
> > +static inline void tlb_table_invalidate(struct mmu_gather *tlb)
> > +{
> > +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
> > +	/*
> > +	 * Invalidate page-table caches used by hardware walkers. Then we still
> > +	 * need to RCU-sched wait while freeing the pages because software
> > +	 * walkers can still be in-flight.
> > +	 */
> > +	__tlb_flush_mmu_tlbonly(tlb);
> > +#endif
> > +}
> 
> 
> Nick, Will is already looking at using this to remove the synchronous
> invalidation from __p*_free_tlb() for ARM, could you have a look to see
> if PowerPC-radix could benefit from that too?
> 
> Basically, using a patch like the below, would give your tlb_flush()
> information on if tables were removed or not.

Just to say that I have something up and running for arm64 based on this.
I'll post it once it's seen a bit more testing (either tomorrow or Monday).

Will

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23  4:54         ` Benjamin Herrenschmidt
  2018-08-23  5:11           ` Linus Torvalds
@ 2018-08-24  8:35           ` Peter Zijlstra
  2018-08-24 13:13             ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24  8:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linus Torvalds, Nick Piggin, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Thu, Aug 23, 2018 at 02:54:20PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-22 at 20:59 -0700, Linus Torvalds wrote:

> > The problem is that x86 _used_ to do this all correctly long long ago.
> > 
> > And then we switched over to the "generic" table flushing (which
> > harkens back to the powerpc code).
> 
> Yes, we wrote it the RCU stuff to solve the races with SW walking,
> which is completely orthogonal with HW walking & TLB content. We didn't
> do the move to generic code though ;-)
> 
> > Which actually turned out to be not generic at all, and did not flush
> > the internal pages like x86 used to (back when x86 just used
> > tlb_remove_page for everything).
> 
> Well, having RCU do the flushing is rather generic, it makes sense
> whenever there's somebody doing a SW walk *and* you don't have IPIs to
> synchronize your flushes (ie, anybody with HW TLB invalidation
> broadcast basically, so ARM and us).

Right, so (many many years ago) I moved it over to generic code because
Sparc-hash wanted fast_gup and I figured having multiple copies of this
stuff wasn't ideal.

Then ARM came along and used it because it does the invalidate
broadcast.

And then when we switched x86 over last year or so; because paravirt; I
had long since forgotten all details and completely overlooked this.

Worse; somewhere along the line we tried to get s390 on this and they
ran into the exact problem being fixed now. That _should_ have been a
big clue, but somehow I never got around to thinking about it properly
and they went back to a private copy of all this.

So double fail on me I suppose :-/

Anyway, it's sorted now; although I'd like to write me a fairly big
comment in asm-generic/tlb.h about things, before I forget again.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition
  2018-08-23  4:54         ` Linus Torvalds
  2018-08-23  5:15           ` Nicholas Piggin
@ 2018-08-24  8:42           ` Peter Zijlstra
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24  8:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nick Piggin, Andrew Lutomirski, the arch/x86 maintainers,
	Borislav Petkov, Will Deacon, Rik van Riel, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Wed, Aug 22, 2018 at 09:54:48PM -0700, Linus Torvalds wrote:

> It honored it for the *normal* case, which is why it took so long to
> notice that the TLB shootdown had been broken on x86 when it moved to
> the "generic" code. The *normal* case does this all right, and batches
> things up, and then when the batch fills up it does a
> tlb_table_flush() which does the TLB flush and schedules the actual
> freeing.
> 
> But there were two cases that *didn't* do that. The special "I'm the
> only thread" fast case, and the "oops I ran out of memory, so now I'll
> fake it, and just synchronize with page table walkers manually, and then do
> that special direct remove without flushing the tlb".

The actual RCU batching case was also busted; there was no guarantee
that by the time we run the RCU callbacks the invalidate would've
happened. Exceedingly unlikely, but no guarantee.

So really, all 3 cases in tlb_remove_table() were busted in this
respect.
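
With the tlb_table_invalidate() helper quoted earlier in the thread, the
batching path then does the invalidate before handing the batch to RCU;
roughly (a sketch of the intended shape, not the exact committed hunk):

        void tlb_table_flush(struct mmu_gather *tlb)
        {
                struct mmu_table_batch **batch = &tlb->batch;

                if (*batch) {
                        /* TLB / page-walk-cache invalidate first ... */
                        tlb_table_invalidate(tlb);
                        /* ... then defer the actual freeing to RCU */
                        call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                        *batch = NULL;
                }
        }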

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-23 13:39             ` Will Deacon
@ 2018-08-24  8:47               ` Peter Zijlstra
  2018-08-24 11:32                 ` Peter Zijlstra
  2018-08-24 17:26                 ` Nadav Amit
  0 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24  8:47 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> The only problem with this approach is that we've lost track of the granule
> size by the point we get to the tlb_flush(), so we can't adjust the stride of
> the TLB invalidations for huge mappings, which actually works nicely in the
> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> rather than iterating over it at a 4k granule).
> 
> One thing we could do is switch to synchronous mode if we detect a change in
> granule (i.e. treat it like a batch failure).

We could use tlb_start_vma() to track that, I think. Shouldn't be too
hard.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24  8:47               ` Peter Zijlstra
@ 2018-08-24 11:32                 ` Peter Zijlstra
  2018-08-24 11:39                   ` Peter Zijlstra
  2018-08-24 17:26                 ` Nadav Amit
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24 11:32 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:
> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> > The only problem with this approach is that we've lost track of the granule
> > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > the TLB invalidations for huge mappings, which actually works nicely in the
> > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > rather than iterating over it at a 4k granule).
> > 
> > One thing we could do is switch to synchronous mode if we detect a change in
> > granule (i.e. treat it like a batch failure).
> 
> We could use tlb_start_vma() to track that, I think. Shouldn't be too
> hard.

Hurm.. look at commit:

  e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")

yuck yuck yuck. That needs fixing.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24 11:32                 ` Peter Zijlstra
@ 2018-08-24 11:39                   ` Peter Zijlstra
  2018-08-27  5:00                     ` Nicholas Piggin
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24 11:39 UTC (permalink / raw)
  To: Will Deacon
  Cc: Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> > > The only problem with this approach is that we've lost track of the granule
> > > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > > the TLB invalidations for huge mappings, which actually works nicely in the
> > > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > > rather than iterating over it at a 4k granule).
> > > 
> > > One thing we could do is switch to synchronous mode if we detect a change in
> > > granule (i.e. treat it like a batch failure).
> > 
> > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > hard.
> 
> Hurm.. look at commit:
> 
>   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")

Ah, good, it seems that already got cleaned up a lot. But it all moved
into the power code.. blergh.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24  8:35           ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
@ 2018-08-24 13:13             ` Peter Zijlstra
  2018-08-24 13:14               ` Peter Zijlstra
  2018-08-24 15:49               ` Will Deacon
  0 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24 13:13 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linus Torvalds, Nick Piggin, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 10:35:56AM +0200, Peter Zijlstra wrote:

> Anyway, it's sorted now; although I'd like to write me a fairly big
> comment in asm-generic/tlb.h about things, before I forget again.

How's something like so? There's a little page_size thingy in this;
mostly because I couldn't be arsed to split it for now.

Will has opinions on the page_size thing; I'll let him explain.

---

 arch/Kconfig                   |   3 +
 arch/arm/include/asm/tlb.h     |   3 +-
 arch/ia64/include/asm/tlb.h    |   3 +-
 arch/powerpc/Kconfig           |   1 +
 arch/powerpc/include/asm/tlb.h |  17 ------
 arch/s390/include/asm/tlb.h    |   4 +-
 arch/sh/include/asm/tlb.h      |   4 +-
 arch/um/include/asm/tlb.h      |   4 +-
 include/asm-generic/tlb.h      | 130 ++++++++++++++++++++++++++++++++++++-----
 mm/huge_memory.c               |   4 +-
 mm/hugetlb.c                   |   2 +-
 mm/madvise.c                   |   2 +-
 mm/memory.c                    |   9 ++-
 13 files changed, 137 insertions(+), 49 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6801123932a5..053c44703539 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -365,6 +365,9 @@ config HAVE_RCU_TABLE_FREE
 config HAVE_RCU_TABLE_INVALIDATE
 	bool
 
+config HAVE_MMU_GATHER_PAGE_SIZE
+	bool
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index f854148c8d7c..d644c3c7c6f3 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -286,8 +286,7 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr
 
 #define tlb_migrate_finish(mm)		do { } while (0)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
 }
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 516355a774bf..bf8985f5f876 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -282,8 +282,7 @@ do {							\
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
 }
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index db0b6eebbfa5..4db1072868f7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -217,6 +217,7 @@ config PPC
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
+	select HAVE_MMU_GATHER_PAGE_SIZE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f0e571b2dc7c..b29a67137acf 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -27,7 +27,6 @@
 #define tlb_start_vma(tlb, vma)	do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
 
@@ -46,22 +45,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }
 
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
-{
-	if (!tlb->page_size)
-		tlb->page_size = page_size;
-	else if (tlb->page_size != page_size) {
-		if (!tlb->fullmm)
-			tlb_flush_mmu(tlb);
-		/*
-		 * update the page size after flush for the new
-		 * mmu_gather.
-		 */
-		tlb->page_size = page_size;
-	}
-}
-
 #ifdef CONFIG_SMP
 static inline int mm_is_core_local(struct mm_struct *mm)
 {
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 457b7ba0fbb6..cf3d64313740 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -180,9 +180,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 77abe192fb43..af7c9d891cf8 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -127,9 +127,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 	return tlb_remove_page(tlb, page);
 }
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index dce6db147f24..6463f3ab1767 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -146,9 +146,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	tlb_remove_tlb_entry(tlb, ptep, address)
 
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
-						     unsigned int page_size)
+static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size)
 {
 }
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b3353e21f3b3..d3573ba10068 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -20,6 +20,108 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
+/*
+ * Generic MMU-gather implementation.
+ *
+ * The mmu_gather data structure is used by the mm code to implement the
+ * correct and efficient ordering of freeing pages and TLB invalidations.
+ *
+ * This correct ordering is:
+ *
+ *  1) unhook page
+ *  2) TLB invalidate page
+ *  3) free page
+ *
+ * That is, we must never free a page before we have ensured there are no live
+ * translations left to it. Otherwise it might be possible to observe (or
+ * worse, change) the page content after it has been reused.
+ *
+ * The mmu_gather API consists of:
+ *
+ *  - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather
+ *
+ *    Finish in particular will issue a (final) TLB invalidate and free
+ *    all (remaining) queued pages.
+ *
+ *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
+ *
+ *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
+ *    there are large holes between the VMAs.
+ *
+ *  - tlb_remove_page() / __tlb_remove_page()
+ *  - tlb_remove_page_size() / __tlb_remove_page_size()
+ *
+ *    __tlb_remove_page_size() is the basic primitive that queues a page for
+ *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
+ *    boolean indicating if the queue is (now) full and a call to
+ *    tlb_flush_mmu() is required.
+ *
+ *    tlb_remove_page() and tlb_remove_page_size() imply the call to
+ *    tlb_flush_mmu() when required and have no return value.
+ *
+ *  - tlb_change_page_size()
+ *
+ *    call before __tlb_remove_page*() to set the current page-size; implies a
+ *    possible tlb_flush_mmu() call.
+ *
+ *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() / tlb_flush_mmu_free()
+ *
+ *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
+ *                              related state, like the range)
+ *
+ *    tlb_flush_mmu_free() - frees the queued pages; make absolutely
+ *			     sure no additional tlb_remove_page()
+ *			     calls happen between _tlbonly() and this.
+ *
+ *    tlb_flush_mmu() - the above two calls.
+ *
+ *  - mmu_gather::fullmm
+ *
+ *    A flag set by tlb_gather_mmu() to indicate we're going to free
+ *    the entire mm; this allows a number of optimizations.
+ *
+ *    XXX list optimizations
+ *
+ *  - mmu_gather::need_flush_all
+ *
+ *    A flag that can be set by the arch code if it wants to force
+ *    flush the entire TLB irrespective of the range. For instance
+ *    x86-PAE needs this when changing top-level entries.
+ *
+ * And requires the architecture to provide and implement tlb_flush().
+ *
+ * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
+ * use of:
+ *
+ *  - mmu_gather::start / mmu_gather::end
+ *
+ *    which (when !need_flush_all; fullmm will have start = end = ~0UL) provides
+ *    the range that needs to be flushed to cover the pages to be freed.
+ *
+ * Additionally there are a few opt-in features:
+ *
+ *  HAVE_MMU_GATHER_PAGE_SIZE
+ *
+ *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
+ *  changes the size and provides mmu_gather::page_size to tlb_flush().
+ *
+ *  HAVE_RCU_TABLE_FREE
+ *
+ *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
+ *  for page directories (__p*_free_tlb()). This provides separate freeing of
+ *  the page-table pages themselves in a semi-RCU fashion (see comment below).
+ *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
+ *  and therefore doesn't naturally serialize with software page-table walkers.
+ *
+ *  HAVE_RCU_TABLE_INVALIDATE
+ *
+ *  This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing
+ *  the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your
+ *  architecture uses the Linux page-tables natively.
+ *
+ */
+#define HAVE_GENERIC_MMU_GATHER
+
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 /*
  * Semi RCU freeing of the page directories.
@@ -87,14 +189,17 @@ struct mmu_gather_batch {
  */
 #define MAX_GATHER_BATCH_COUNT	(10000UL/MAX_GATHER_BATCH)
 
-/* struct mmu_gather is an opaque type used by the mm code for passing around
+/*
+ * struct mmu_gather is an opaque type used by the mm code for passing around
  * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
+
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	struct mmu_table_batch	*batch;
 #endif
+
 	unsigned long		start;
 	unsigned long		end;
 	/* we are in the middle of an operation to clear
@@ -103,15 +208,17 @@ struct mmu_gather {
 	/* we have performed an operation which
 	 * requires a complete flush of the tlb */
 				need_flush_all : 1;
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+	unsigned int page_size;
+#endif
 
 	struct mmu_gather_batch *active;
 	struct mmu_gather_batch	local;
 	struct page		*__pages[MMU_GATHER_BUNDLE];
 	unsigned int		batch_count;
-	int page_size;
+
 };
 
-#define HAVE_GENERIC_MMU_GATHER
 
 void arch_tlb_gather_mmu(struct mmu_gather *tlb,
 	struct mm_struct *mm, unsigned long start, unsigned long end);
@@ -170,21 +277,18 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	return tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
 
-#ifndef tlb_remove_check_page_size_change
-#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
-static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+static inline void tlb_change_page_size(struct mmu_gather *tlb,
 						     unsigned int page_size)
 {
-	/*
-	 * We don't care about page size change, just update
-	 * mmu_gather page size here so that debug checks
-	 * doesn't throw false warning.
-	 */
-#ifdef CONFIG_DEBUG_VM
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+	if (tlb->page_size && tlb->page_size != page_size) {
+		if (!tlb->fullmm)
+			tlb_flush_mmu(tlb);
+	}
+
 	tlb->page_size = page_size;
 #endif
 }
-#endif
 
 /*
  * In the case of tlb vma handling, we can optimise these away in the
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08b544383d74..786758670ba1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1617,7 +1617,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct mm_struct *mm = tlb->mm;
 	bool ret = false;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1693,7 +1693,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	pmd_t orig_pmd;
 	spinlock_t *ptl;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..8af346b53a79 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3337,7 +3337,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * This is a hugetlb vma, all the pte entries should point
 	 * to huge page.
 	 */
-	tlb_remove_check_page_size_change(tlb, sz);
+	tlb_change_page_size(tlb, sz);
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	address = start;
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d3c922ea1a1..2a9073c652d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
diff --git a/mm/memory.c b/mm/memory.c
index 83aef222f11b..2818e00d1aae 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -233,7 +233,9 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
 #endif
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
 	tlb->page_size = 0;
+#endif
 
 	__tlb_reset_range(tlb);
 }
@@ -294,7 +296,10 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	struct mmu_gather_batch *batch;
 
 	VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
 	VM_WARN_ON(tlb->page_size != page_size);
+#endif
 
 	batch = tlb->active;
 	/*
@@ -602,7 +607,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 	 * We add page table cache pages with PAGE_SIZE,
 	 * (see pte_free_tlb()), flush the tlb if we need
 	 */
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1293,7 +1298,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	pte_t *pte;
 	swp_entry_t entry;
 
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

^ permalink raw reply related	[flat|nested] 96+ messages in thread
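
For readers following the API comment added above, here is a minimal sketch of the usage pattern it documents, assuming the post-series names (tlb_change_page_size() and friends) and the tlb_gather_mmu()/tlb_finish_mmu() signatures of this era; the caller and the elided page-table walk are hypothetical:

#include <linux/mm.h>
#include <asm/tlb.h>

/*
 * Hypothetical caller illustrating the documented ordering:
 * unhook pages, invalidate the TLB, then free the pages.
 */
static void example_unmap(struct vm_area_struct *vma,
			  unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm, start, end);
	tlb_start_vma(&tlb, vma);

	/*
	 * The page-table walk goes here: call tlb_change_page_size()
	 * for the granule being unmapped, clear the entries, and queue
	 * each page with tlb_remove_page(), which calls tlb_flush_mmu()
	 * whenever the batch fills up.
	 */

	tlb_end_vma(&tlb, vma);
	tlb_finish_mmu(&tlb, start, end);	/* final invalidate + free */
}

With HAVE_MMU_GATHER_PAGE_SIZE selected (as powerpc now does above), the tlb_change_page_size() call in such a walk forces a flush whenever the page size actually changes; without it the call compiles away to nothing.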

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24 13:13             ` Peter Zijlstra
@ 2018-08-24 13:14               ` Peter Zijlstra
  2018-08-24 15:49               ` Will Deacon
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24 13:14 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Linus Torvalds, Nick Piggin, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Will Deacon,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 03:13:32PM +0200, Peter Zijlstra wrote:
> + *  HAVE_RCU_TABLE_FREE
> + *
> + *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
> + *  for page directories (__p*_free_tlb()). This provides separate freeing of
> + *  the page-table pages themselves in a semi-RCU fashion (see comment below).
> + *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
> + *  and therefore doesn't naturally serialize with software page-table walkers.
> + *
> + *  HAVE_RCU_TABLE_INVALIDATE
> + *
> + *  This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing
> + *  the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your
> + *  architecture uses the Linux page-tables natively.

Writing that also made me think we maybe should've negated that option.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24 13:13             ` Peter Zijlstra
  2018-08-24 13:14               ` Peter Zijlstra
@ 2018-08-24 15:49               ` Will Deacon
  1 sibling, 0 replies; 96+ messages in thread
From: Will Deacon @ 2018-08-24 15:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Benjamin Herrenschmidt, Linus Torvalds, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

Hi Peter,

On Fri, Aug 24, 2018 at 03:13:32PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 24, 2018 at 10:35:56AM +0200, Peter Zijlstra wrote:
> 
> > Anyway, its sorted now; although I'd like to write me a fairly big
> > comment in asm-generic/tlb.h about things, before I forget again.
> 
> How's something like so? There's a little page_size thingy in this;
> mostly because I couldn't be arsed to split it for now.
> 
> Will has opinions on the page_size thing; I'll let him explain.

They're not especially strong opinions; it's just that I don't think the
page size is necessarily the right thing to track and I'd rather remove that
altogether.

In the patches I've hacked up (I'll post shortly as an RFC), I track the
levels of page-table instead so you can relate the mmu_gather explicitly
with the page-table structure, rather than have to infer it from the page
size. For example, if an architecture could put down huge mappings at the
pte level (e.g. using a contiguous hint in the pte like we have on arm64),
then actually you want to know about the level rather than the size. You can
also track the levels using only 4 bits in the gather structure.

Finally, both approaches have a funny corner case when a VMA contains a
mixture of granule sizes. With the "page size has changed so flush
synchronously" you can theoretically end up with a lot of flushes, where
you'd have been better off just invalidating the whole mm. If you track the
levels instead and postpone a flush using the smallest level you saw, then
you're likely to hit whatever threshold you have and nuke the mm.

Will

^ permalink raw reply	[flat|nested] 96+ messages in thread
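
A minimal sketch of the "track the levels using only 4 bits" idea, with made-up field names (Will's actual RFC may differ); the point is that tlb_flush() can derive its invalidation granule from the smallest level seen:

/*
 * Hypothetical illustration only: one bit per page-table level that
 * had entries cleared, instead of an explicit mmu_gather::page_size.
 */
struct mmu_gather_levels {
	unsigned int	cleared_ptes : 1;
	unsigned int	cleared_pmds : 1;
	unsigned int	cleared_puds : 1;
	unsigned int	cleared_p4ds : 1;
};

static inline unsigned int tlb_stride_shift(struct mmu_gather_levels *l)
{
	/* the smallest level cleared bounds the invalidation stride */
	if (l->cleared_ptes)
		return PAGE_SHIFT;
	if (l->cleared_pmds)
		return PMD_SHIFT;
	if (l->cleared_puds)
		return PUD_SHIFT;
	return P4D_SHIFT;
}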

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24  8:47               ` Peter Zijlstra
  2018-08-24 11:32                 ` Peter Zijlstra
@ 2018-08-24 17:26                 ` Nadav Amit
  2018-08-24 18:04                   ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-24 17:26 UTC (permalink / raw)
  To: Peter Zijlstra, Will Deacon
  Cc: Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
>> The only problem with this approach is that we've lost track of the granule
>> size by the point we get to the tlb_flush(), so we can't adjust the stride of
>> the TLB invalidations for huge mappings, which actually works nicely in the
>> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
>> rather than iterating over it at a 4k granule).
>> 
>> One thing we could do is switch to synchronous mode if we detect a change in
>> granule (i.e. treat it like a batch failure).
> 
> We could use tlb_start_vma() to track that, I think. Shouldn't be too
> hard.

Somewhat unrelated, but I use this opportunity that TLB got your attention
for something that bothers me for some time. clear_fixmap(), which is used
in various places (e.g., text_poke()), ends up in doing only a local TLB
flush (in __set_pte_vaddr()).

Is that sufficient?


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24 17:26                 ` Nadav Amit
@ 2018-08-24 18:04                   ` Peter Zijlstra
  2018-08-24 18:35                     ` TLB flushes on fixmap changes Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-24 18:04 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 10:26:50AM -0700, Nadav Amit wrote:
> at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
> >> The only problem with this approach is that we've lost track of the granule
> >> size by the point we get to the tlb_flush(), so we can't adjust the stride of
> >> the TLB invalidations for huge mappings, which actually works nicely in the
> >> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> >> rather than iterating over it at a 4k granule).
> >> 
> >> One thing we could do is switch to synchronous mode if we detect a change in
> >> granule (i.e. treat it like a batch failure).
> > 
> > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > hard.
> 
> Somewhat unrelated, but I use this opportunity that TLB got your attention
> for something that bothers me for some time. clear_fixmap(), which is used
> in various places (e.g., text_poke()), ends up in doing only a local TLB
> flush (in __set_pte_vaddr()).
> 
> Is that sufficient?

Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
during PTI, but I seem to have forgotten everything again.



^ permalink raw reply	[flat|nested] 96+ messages in thread

* TLB flushes on fixmap changes
  2018-08-24 18:04                   ` Peter Zijlstra
@ 2018-08-24 18:35                     ` Nadav Amit
  2018-08-24 19:31                       ` Linus Torvalds
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-24 18:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 11:04 AM, Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Aug 24, 2018 at 10:26:50AM -0700, Nadav Amit wrote:
>> at 1:47 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> 
>>> On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:
>>>> The only problem with this approach is that we've lost track of the granule
>>>> size by the point we get to the tlb_flush(), so we can't adjust the stride of
>>>> the TLB invalidations for huge mappings, which actually works nicely in the
>>>> synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
>>>> rather than iterating over it at a 4k granule).
>>>> 
>>>> One thing we could do is switch to synchronous mode if we detect a change in
>>>> granule (i.e. treat it like a batch failure).
>>> 
>>> We could use tlb_start_vma() to track that, I think. Shouldn't be too
>>> hard.
>> 
>> Somewhat unrelated, but I use this opportunity that TLB got your attention
>> for something that bothers me for some time. clear_fixmap(), which is used
>> in various places (e.g., text_poke()), ends up in doing only a local TLB
>> flush (in __set_pte_vaddr()).
>> 
>> Is that sufficient?
> 
> Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
> during PTI, but I seem to have forgotten everything again.

[ Changed the title. Sorry for hijacking the thread. ]

Since:

native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()

And pgd_offset_k() uses init_mm, they do not seem to be per-CPU.

In addition, the __flush_tlb_one_kernel() in text_poke() seems redundant
(since set_fixmap() should do it as well).

If you also think the current behavior is inappropriate, I can take a stab
at fixing it by adding a shootdown. But, if text_poke() is called when
interrupts are disabled, the fix would be annoying.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-24 18:35                     ` TLB flushes on fixmap changes Nadav Amit
@ 2018-08-24 19:31                       ` Linus Torvalds
  2018-08-24 20:24                         ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Linus Torvalds @ 2018-08-24 19:31 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 11:36 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> >
> > Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
> > during PTI, but I seem to have forgotten everything again.
>
> [ Changed the title. Sorry for hijacking the thread. ]
>
> Since:
>
> native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()

The fixmaps should be entirely fixed after bootup to constant
mappings, except for the KMAP ones, and they are indexed per-cpu.

That's what my mental model is, at least.

Can you actually find something that changes the fixmaps after boot
(again, ignoring kmap)?

Maybe worth adding some debugging to verify that?

                Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread
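
A sketch of the kind of debug check being asked for, assuming x86's native_set_fixmap() wrapper keeps its current shape; the 32-bit highmem KMAP slots would still need to be excluded before something like this could be enabled for real:

void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
		       pgprot_t flags)
{
	/* flag any fixmap slot that gets (re)written after boot */
	WARN_ONCE(system_state >= SYSTEM_RUNNING,
		  "fixmap slot %d changed after boot\n", idx);

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}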

* Re: TLB flushes on fixmap changes
  2018-08-24 19:31                       ` Linus Torvalds
@ 2018-08-24 20:24                         ` Nadav Amit
  2018-08-25  0:58                           ` Linus Torvalds
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-24 20:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 12:31 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, Aug 24, 2018 at 11:36 AM Nadav Amit <nadav.amit@gmail.com> wrote:
>>> Urgh.. weren't the fixmaps per cpu? Bah, I remember looking at this
>>> during PTI, but I seem to have forgotten everything again.
>> 
>> [ Changed the title. Sorry for hijacking the thread. ]
>> 
>> Since:
>> 
>> native_set_fixmap()->set_pte_vaddr()->pgd_offset_k()
> 
> The fixmaps should be entirely fixed after bootup to constant
> mappings, except for the KMAP ones, and they are indexed per-cpu.
> 
> That's what my mental model is, at least.
> 
> Can you actually find something that changes the fixmaps after boot
> (again, ignoring kmap)?

At least the alternatives mechanism appears to do so.
 
IIUC the following path is possible when adding a module:

	jump_label_add_module()
	->__jump_label_update()
	->arch_jump_label_transform()
	->__jump_label_transform()
	->text_poke_bp()
	->text_poke()
	->set_fixmap()
	
And a similar path can happen when static_key_enable/disable() is called.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-24 20:24                         ` Nadav Amit
@ 2018-08-25  0:58                           ` Linus Torvalds
  2018-08-25  2:29                             ` nadav.amit
  0 siblings, 1 reply; 96+ messages in thread
From: Linus Torvalds @ 2018-08-25  0:58 UTC (permalink / raw)
  To: Nadav Amit, Paolo Bonzini, Jiri Kosina, Masami Hiramatsu
  Cc: Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

Adding a few people to the cc.

On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com> wrote:
> >
> > Can you actually find something that changes the fixmaps after boot
> > (again, ignoring kmap)?
>
> At least the alternatives mechanism appears to do so.
>
> IIUC the following path is possible when adding a module:
>
>         jump_label_add_module()
>         ->__jump_label_update()
>         ->arch_jump_label_transform()
>         ->__jump_label_transform()
>         ->text_poke_bp()
>         ->text_poke()
>         ->set_fixmap()

Yeah, that looks a bit iffy.

But making the tlb flush global wouldn't help.  This is running on a
local core, and if there are other CPU's that can do this at the same
time, then they'd just fight about the same mapping.

Honestly, I think it's ok just because I *hope* this is all serialized
anyway (jump_label_lock? But what about other users of text_poke?).

But I'd be a lot happier about it if it either used an explicit lock
to make sure, or used per-cpu fixmap entries.

And the tlb flush is done *after* the address is used, which is bogus anyway.

> And a similar path can happen when static_key_enable/disable() is called.

Same comments.

How about replacing that

        local_irq_save(flags);
       ... do critical things here ...
        local_irq_restore(flags);

in text_poke() with

        static DEFINE_SPINLOCK(poke_lock);

        spin_lock_irqsave(&poke_lock, flags);
       ... do critical things here ...
        spin_unlock_irqrestore(&poke_lock, flags);

and moving the local_flush_tlb() to after the set_fixmaps, but before
the access through the virtual address.

But changing things to do a global tlb flush would just be wrong.

           Linus

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-25  0:58                           ` Linus Torvalds
@ 2018-08-25  2:29                             ` nadav.amit
  2018-08-25  4:23                               ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: nadav.amit @ 2018-08-25  2:29 UTC (permalink / raw)
  To: Linus Torvalds, Paolo Bonzini, Jiri Kosina, Masami Hiramatsu
  Cc: Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman



On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>Adding a few people to the cc.
>
>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
>wrote:
>> >
>> > Can you actually find something that changes the fixmaps after boot
>> > (again, ignoring kmap)?
>>
>> At least the alternatives mechanism appears to do so.
>>
>> IIUC the following path is possible when adding a module:
>>
>>         jump_label_add_module()
>>         ->__jump_label_update()
>>         ->arch_jump_label_transform()
>>         ->__jump_label_transform()
>>         ->text_poke_bp()
>>         ->text_poke()
>>         ->set_fixmap()
>
>Yeah, that looks a bit iffy.
>
>But making the tlb flush global wouldn't help.  This is running on a
>local core, and if there are other CPU's that can do this at the same
>time, then they'd just fight about the same mapping.
>
>Honestly, I think it's ok just because I *hope* this is all serialized
>anyway (jump_label_lock? But what about other users of text_poke?).

The users should hold text_mutex.

>
>But I'd be a lot happier about it if it either used an explicit lock
>to make sure, or used per-cpu fixmap entries.

My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.

>
>And the tlb flush is done *after* the address is used, which is bogus
>anyway.

It seems to me that it is intended to remove the mapping that might be a security issue. 

But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.

>
>> And a similar path can happen when static_key_enable/disable() is
>called.
>
>Same comments.
>
>How about replacing that
>
>        local_irq_save(flags);
>       ... do critical things here ...
>        local_irq_restore(flags);
>
>in text_poke() with
>
>        static DEFINE_SPINLOCK(poke_lock);
>
>        spin_lock_irqsave(&poke_lock, flags);
>       ... do critical things here ...
>        spin_unlock_irqrestore(&poke_lock, flags);
>
>and moving the local_flush_tlb() to after the set_fixmaps, but before
>the access through the virtual address.
>
>But changing things to do a global tlb flush would just be wrong.

As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).

My concern is merely that speculative page walks on other cores would cache stale entries.



-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-25  2:29                             ` nadav.amit
@ 2018-08-25  4:23                               ` Andy Lutomirski
  2018-08-26  2:23                                 ` Masami Hiramatsu
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-25  4:23 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Linus Torvalds, Paolo Bonzini, Jiri Kosina, Masami Hiramatsu,
	Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, Aug 24, 2018 at 7:29 PM,  <nadav.amit@gmail.com> wrote:
>
>
> On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>Adding a few people to the cc.
>>
>>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
>>wrote:
>>> >
>>> > Can you actually find something that changes the fixmaps after boot
>>> > (again, ignoring kmap)?
>>>
>>> At least the alternatives mechanism appears to do so.
>>>
>>> IIUC the following path is possible when adding a module:
>>>
>>>         jump_label_add_module()
>>>         ->__jump_label_update()
>>>         ->arch_jump_label_transform()
>>>         ->__jump_label_transform()
>>>         ->text_poke_bp()
>>>         ->text_poke()
>>>         ->set_fixmap()
>>
>>Yeah, that looks a bit iffy.
>>
>>But making the tlb flush global wouldn't help.  This is running on a
>>local core, and if there are other CPU's that can do this at the same
>>time, then they'd just fight about the same mapping.
>>
>>Honestly, I think it's ok just because I *hope* this is all serialized
>>anyway (jump_label_lock? But what about other users of text_poke?).
>
> The users should hold text_mutex.
>
>>
>>But I'd be a lot happier about it if it either used an explicit lock
>>to make sure, or used per-cpu fixmap entries.
>
> My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.
>
>>
>>And the tlb flush is done *after* the address is used, which is bogus
>>anyway.
>
> It seems to me that it is intended to remove the mapping that might be a security issue.
>
> But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.
>
>>
>>> And a similar path can happen when static_key_enable/disable() is
>>called.
>>
>>Same comments.
>>
>>How about replacing that
>>
>>        local_irq_save(flags);
>>       ... do critical things here ...
>>        local_irq_restore(flags);
>>
>>in text_poke() with
>>
>>        static DEFINE_SPINLOCK(poke_lock);
>>
>>        spin_lock_irqsave(&poke_lock, flags);
>>       ... do critical things here ...
>>        spin_unlock_irqrestore(&poke_lock, flags);
>>
>>and moving the local_flush_tlb() to after the set_fixmaps, but before
>>the access through the virtual address.
>>
>>But changing things to do a global tlb flush would just be wrong.
>
> As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).
>
> My concern is merely that speculative page walks on other cores would cache stale entries.
>
>

This is almost certainly a bug, or even two bugs.  Bug 1:  why on
Earth do we flush in __set_pte_vaddr()?  We should flush when
*clearing* or when modifying an existing fixmap entry.  Right now, if
we do text_poke() after boot, then the TLB entry will stick around and
will be a nice exploit target.

Bug 2: what you're describing.  It's racy.

Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-25  4:23                               ` Andy Lutomirski
@ 2018-08-26  2:23                                 ` Masami Hiramatsu
  2018-08-26  4:21                                   ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Masami Hiramatsu @ 2018-08-26  2:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nadav Amit, Linus Torvalds, Paolo Bonzini, Jiri Kosina,
	Masami Hiramatsu, Peter Zijlstra, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, 24 Aug 2018 21:23:26 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> On Fri, Aug 24, 2018 at 7:29 PM,  <nadav.amit@gmail.com> wrote:
> >
> >
> > On August 24, 2018 5:58:43 PM PDT, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >>Adding a few people to the cc.
> >>
> >>On Fri, Aug 24, 2018 at 1:24 PM Nadav Amit <nadav.amit@gmail.com>
> >>wrote:
> >>> >
> >>> > Can you actually find something that changes the fixmaps after boot
> >>> > (again, ignoring kmap)?
> >>>
> >>> At least the alternatives mechanism appears to do so.
> >>>
> >>> IIUC the following path is possible when adding a module:
> >>>
> >>>         jump_label_add_module()
> >>>         ->__jump_label_update()
> >>>         ->arch_jump_label_transform()
> >>>         ->__jump_label_transform()
> >>>         ->text_poke_bp()
> >>>         ->text_poke()
> >>>         ->set_fixmap()
> >>
> >>Yeah, that looks a bit iffy.
> >>
> >>But making the tlb flush global wouldn't help.  This is running on a
> >>local core, and if there are other CPU's that can do this at the same
> >>time, then they'd just fight about the same mapping.
> >>
> >>Honestly, I think it's ok just because I *hope* this is all serialized
> >>anyway (jump_label_lock? But what about other users of text_poke?).
> >
> > The users should hold text_mutex.
> >
> >>
> >>But I'd be a lot happier about it if it either used an explicit lock
> >>to make sure, or used per-cpu fixmap entries.
> >
> > My concern is that despite the lock, one core would do a speculative page walk and cache a translation that soon after would become stale.
> >
> >>
> >>And the tlb flush is done *after* the address is used, which is bogus
> >>anyway.
> >
> > It seems to me that it is intended to remove the mapping that might be a security issue.
> >
> > But anyhow, set_fixmap and clear_fixmap perform a local TLB flush, (in __set_pte_vaddr()) so locally things should be fine.
> >
> >>
> >>> And a similar path can happen when static_key_enable/disable() is
> >>called.
> >>
> >>Same comments.
> >>
> >>How about replacing that
> >>
> >>        local_irq_save(flags);
> >>       ... do critical things here ...
> >>        local_irq_restore(flags);
> >>
> >>in text_poke() with
> >>
> >>        static DEFINE_SPINLOCK(poke_lock);
> >>
> >>        spin_lock_irqsave(&poke_lock, flags);
> >>       ... do critical things here ...
> >>        spin_unlock_irqrestore(&poke_lock, flags);
> >>
> >>and moving the local_flush_tlb() to after the set_fixmaps, but before
> >>the access through the virtual address.
> >>
> >>But changing things to do a global tlb flush would just be wrong.
> >
> > As I noted, I think that locking and local flushes as they are right now are fine (besides the redundant flush).
> >
> > My concern is merely that speculative page walks on other cores would cache stale entries.
> >
> >
> 
> This is almost certainly a bug, or even two bugs.  Bug 1:  why on
> Earth do we flush in __set_pte_vaddr()?  We should flush when
> *clearing* or when modifying an existing fixmap entry.  Right now, if
> we do text_poke() after boot, then the TLB entry will stick around and
> will be a nice exploit target.
> 
> Bug 2: what you're describing.  It's racy.
> 
> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?

No, since kmap_atomic() is only for x86_32 and highmem support kernel.
In x86-64, it seems that returns just a page address. That is not
good for text_poke, since it needs to make a writable alias for RO
code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26  2:23                                 ` Masami Hiramatsu
@ 2018-08-26  4:21                                   ` Andy Lutomirski
  2018-08-26  4:43                                     ` Kees Cook
                                                       ` (2 more replies)
  0 siblings, 3 replies; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-26  4:21 UTC (permalink / raw)
  To: Masami Hiramatsu, Kees Cook
  Cc: Andy Lutomirski, Nadav Amit, Linus Torvalds, Paolo Bonzini,
	Jiri Kosina, Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

[-- Attachment #1: Type: text/plain, Size: 1140 bytes --]

On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> On Fri, 24 Aug 2018 21:23:26 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>
> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
> In x86-64, it seems that returns just a page address. That is not
> good for text_poke, since it needs to make a writable alias for RO
> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>

I just re-read text_poke().  It's, um, horrible.  Not only is the
implementation overcomplicated and probably buggy, but it's SLOOOOOW.
It's totally the wrong API -- poking one instruction at a time
basically can't be efficient on x86.  The API should either poke lots
of instructions at once or should be text_poke_begin(); ...;
text_poke_end();.

Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
too scary of an approach?  With the patch applied, text_poke() is a
fantastic exploit target.  On the other hand, even without the patch
applied, text_poke() is every bit as juicy.

--Andy

[-- Attachment #2: text_poke.patch --]
[-- Type: text/x-patch, Size: 1628 bytes --]

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 014f214da581..811c8735b129 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -690,40 +690,15 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
 void *text_poke(void *addr, const void *opcode, size_t len)
 {
 	unsigned long flags;
-	char *vaddr;
-	struct page *pages[2];
-	int i;
-
-	/*
-	 * While boot memory allocator is runnig we cannot use struct
-	 * pages as they are not yet initialized.
-	 */
-	BUG_ON(!after_bootmem);
+	unsigned long old_cr0;
 
-	if (!core_kernel_text((unsigned long)addr)) {
-		pages[0] = vmalloc_to_page(addr);
-		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
-	} else {
-		pages[0] = virt_to_page(addr);
-		WARN_ON(!PageReserved(pages[0]));
-		pages[1] = virt_to_page(addr + PAGE_SIZE);
-	}
-	BUG_ON(!pages[0]);
 	local_irq_save(flags);
-	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
-	if (pages[1])
-		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
-	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
-	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	clear_fixmap(FIX_TEXT_POKE0);
-	if (pages[1])
-		clear_fixmap(FIX_TEXT_POKE1);
-	local_flush_tlb();
-	sync_core();
-	/* Could also do a CLFLUSH here to speed up CPU recovery; but
-	   that causes hangs on some VIA CPUs. */
-	for (i = 0; i < len; i++)
-		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+	old_cr0 = read_cr0();
+	write_cr0(old_cr0 & ~X86_CR0_WP);
+
+	memcpy(addr, opcode, len);
+
+	write_cr0(old_cr0);	/* also serializes */
 	local_irq_restore(flags);
 	return addr;
 }

^ permalink raw reply related	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26  4:21                                   ` Andy Lutomirski
@ 2018-08-26  4:43                                     ` Kees Cook
  2018-08-26  5:53                                       ` Nadav Amit
  2018-08-26 14:20                                       ` Andy Lutomirski
  2018-08-26  9:09                                     ` Peter Zijlstra
  2018-08-26 22:48                                     ` Jann Horn
  2 siblings, 2 replies; 96+ messages in thread
From: Kees Cook @ 2018-08-26  4:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Nadav Amit, Linus Torvalds, Paolo Bonzini,
	Jiri Kosina, Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> On Fri, 24 Aug 2018 21:23:26 -0700
>> Andy Lutomirski <luto@kernel.org> wrote:
>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>
>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>> In x86-64, it seems that returns just a page address. That is not
>> good for text_poke, since it needs to make a writable alias for RO
>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>
>
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.
>
> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
> too scary of an approach?  With the patch applied, text_poke() is a
> fantastic exploit target.  On the other hand, even without the patch
> applied, text_poke() is every bit as juicy.

I tried to convince Ingo to use this method for doing "write rarely"
and he soundly rejected it. :) I've always liked this because AFAICT,
it's local to the CPU. I had proposed it in
https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d

With that, text_poke() mostly becomes:

rare_write_begin()
memcpy(addr, opcode, len);
rare_write_end()

As for juiciness, if an attacker already has execution control, they
can do more interesting things than text_poke(). But regardless, yes,
it's always made me uncomfortable. :)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26  4:43                                     ` Kees Cook
@ 2018-08-26  5:53                                       ` Nadav Amit
  2018-08-26 14:20                                       ` Andy Lutomirski
  1 sibling, 0 replies; 96+ messages in thread
From: Nadav Amit @ 2018-08-26  5:53 UTC (permalink / raw)
  To: Kees Cook, Andy Lutomirski
  Cc: Masami Hiramatsu, Linus Torvalds, Paolo Bonzini, Jiri Kosina,
	Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:

> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>> 
>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>> In x86-64, it seems that returns just a page address. That is not
>>> good for text_poke, since it needs to make a writable alias for RO
>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>> 
>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>> It's totally the wrong API -- poking one instruction at a time
>> basically can't be efficient on x86.  The API should either poke lots
>> of instructions at once or should be text_poke_begin(); ...;
>> text_poke_end();.
>> 
>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>> too scary of an approach?  With the patch applied, text_poke() is a
>> fantastic exploit target.  On the other hand, even without the patch
>> applied, text_poke() is every bit as juicy.
> 
> I tried to convince Ingo to use this method for doing "write rarely"
> and he soundly rejected it. :) I've always liked this because AFAICT,
> it's local to the CPU. I had proposed it in
> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> 
> With that, text_poke() mostly becomes:
> 
> rare_write_begin()
> memcpy(addr, opcode, len);
> rare_write_end()
> 
> As for juiciness, if an attacker already has execution control, they
> can do more interesting things than text_poke(). But regardless, yes,
> it's always made me uncomfortable. :)

I think that the key to hardening text_poke() against its use
as a gadget in a ROP/JOP attack is to add a check/assertion for the old
(expected) value, such as:

rare_write_begin()
if (*addr == prev_opcode)
	memcpy(addr, opcode, len);
rare_write_end()


^ permalink raw reply	[flat|nested] 96+ messages in thread
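
A slightly fuller sketch of that check, using the rare_write_begin()/rare_write_end() primitives from Kees's proposal above (hypothetical helper; the comparison covers the whole range being patched rather than a single value):

static int text_poke_checked(void *addr, const void *old_opcode,
			     const void *opcode, size_t len)
{
	int ret = -EBUSY;

	rare_write_begin();
	/* only patch a site that still holds the expected bytes */
	if (!memcmp(addr, old_opcode, len)) {
		memcpy(addr, opcode, len);
		ret = 0;
	}
	rare_write_end();

	return ret;
}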

* Re: TLB flushes on fixmap changes
  2018-08-26  4:21                                   ` Andy Lutomirski
  2018-08-26  4:43                                     ` Kees Cook
@ 2018-08-26  9:09                                     ` Peter Zijlstra
  2018-08-27  3:03                                       ` Masami Hiramatsu
  2018-08-26 22:48                                     ` Jann Horn
  2 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-26  9:09 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Kees Cook, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.

I don't think anybody ever cared about performance here. Only
correctness. That whole text_poke_bp() thing is entirely tricky.

FWIW, before text_poke_bp(), text_poke() would only be used from
stop_machine, so all the other CPUs would be stuck busy-waiting with
IRQs disabled. These days, yeah, that's lots more dodgy, but yes
text_mutex should be serializing all that.

And on that, I so hate comments like: "must be called under foo_mutex";
we have lockdep_assert_held() for that.

^ permalink raw reply	[flat|nested] 96+ messages in thread
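
For reference, the idiom Peter means looks like this in the callee (a sketch; it assumes text_mutex is the lock that callers of text_poke() must hold, as stated earlier in the thread):

	/* instead of a "must be called under text_mutex" comment: */
	lockdep_assert_held(&text_mutex);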

* Re: TLB flushes on fixmap changes
  2018-08-26  4:43                                     ` Kees Cook
  2018-08-26  5:53                                       ` Nadav Amit
@ 2018-08-26 14:20                                       ` Andy Lutomirski
  2018-08-26 16:47                                         ` Kees Cook
  1 sibling, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-26 14:20 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Masami Hiramatsu, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Peter Zijlstra, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman



> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
> 
>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>> 
>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>> In x86-64, it seems that returns just a page address. That is not
>>> good for text_poke, since it needs to make a writable alias for RO
>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>> 
>> 
>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>> It's totally the wrong API -- poking one instruction at a time
>> basically can't be efficient on x86.  The API should either poke lots
>> of instructions at once or should be text_poke_begin(); ...;
>> text_poke_end();.
>> 
>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>> too scary of an approach?  With the patch applied, text_poke() is a
>> fantastic exploit target.  On the other hand, even without the patch
>> applied, text_poke() is every bit as juicy.
> 
> I tried to convince Ingo to use this method for doing "write rarely"
> and he soundly rejected it. :) I've always liked this because AFAICT,
> it's local to the CPU. I had proposed it in
> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d

Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.

> 
> With that, text_poke() mostly becomes:
> 
> rare_write_begin()
> memcpy(addr, opcode, len);
> rare_write_end()
> 
> As for juiciness, if an attacker already has execution control, they
> can do more interesting things than text_poke(). But regardless, yes,
> it's always made me uncomfortable. :)
> 
> -Kees
> 
> -- 
> Kees Cook
> Pixel Security

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 14:20                                       ` Andy Lutomirski
@ 2018-08-26 16:47                                         ` Kees Cook
  2018-08-26 17:25                                           ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Kees Cook @ 2018-08-26 16:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Masami Hiramatsu, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Peter Zijlstra, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
>
>> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>
>>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>>>
>>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>>> In x86-64, it seems that returns just a page address. That is not
>>>> good for text_poke, since it needs to make a writable alias for RO
>>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>>>
>>>
>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>> It's totally the wrong API -- poking one instruction at a time
>>> basically can't be efficient on x86.  The API should either poke lots
>>> of instructions at once or should be text_poke_begin(); ...;
>>> text_poke_end();.
>>>
>>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>>> too scary of an approach?  With the patch applied, text_poke() is a
>>> fantastic exploit target.  On the other hand, even without the patch
>>> applied, text_poke() is every bit as juicy.
>>
>> I tried to convince Ingo to use this method for doing "write rarely"
>> and he soundly rejected it. :) I've always liked this because AFAICT,
>> it's local to the CPU. I had proposed it in
>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>
> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.

Sorry, it looks like it was tglx, not Ingo:

https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos

This thread is long, and one thing that I think went unanswered was
"why do we want this to be fast?" the answer is: for doing page table
updates. Page tables are becoming a bigger target for attacks now, and
it'd be nice if they could stay read-only unless they're getting
updated (with something like this).

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 16:47                                         ` Kees Cook
@ 2018-08-26 17:25                                           ` Andy Lutomirski
  2018-08-26 20:15                                             ` Thomas Gleixner
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-26 17:25 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Masami Hiramatsu, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Peter Zijlstra, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman



> On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> 
>> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> 
>> 
>>>> On Aug 25, 2018, at 9:43 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> 
>>>>> On Sat, Aug 25, 2018 at 9:21 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>>> On Fri, 24 Aug 2018 21:23:26 -0700
>>>>> Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
>>>>> 
>>>>> No, since kmap_atomic() is only for x86_32 and highmem support kernel.
>>>>> In x86-64, it seems that returns just a page address. That is not
>>>>> good for text_poke, since it needs to make a writable alias for RO
>>>>> code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
>>>>> 
>>>> 
>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>> It's totally the wrong API -- poking one instruction at a time
>>>> basically can't be efficient on x86.  The API should either poke lots
>>>> of instructions at once or should be text_poke_begin(); ...;
>>>> text_poke_end();.
>>>> 
>>>> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
>>>> too scary of an approach?  With the patch applied, text_poke() is a
>>>> fantastic exploit target.  On the other hand, even without the patch
>>>> applied, text_poke() is every bit as juicy.
>>> 
>>> I tried to convince Ingo to use this method for doing "write rarely"
>>> and he soundly rejected it. :) I've always liked this because AFAICT,
>>> it's local to the CPU. I had proposed it in
>>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>> 
>> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> 
> Sorry, it looks like it was tglx, not Ingo:
> 
> https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> 
> This thread is long, and one thing that I think went unanswered was
> "why do we want this to be fast?" the answer is: for doing page table
> updates. Page tables are becoming a bigger target for attacks now, and
> it's be nice if they could stay read-only unless they're getting
> updated (with something like this).
> 
> 

It kind of sounds like tglx would prefer the CR3 approach. And indeed my patch has a serious problem wrt the NMI code.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 17:25                                           ` Andy Lutomirski
@ 2018-08-26 20:15                                             ` Thomas Gleixner
  2018-08-26 22:03                                               ` Kees Cook
  0 siblings, 1 reply; 96+ messages in thread
From: Thomas Gleixner @ 2018-08-26 20:15 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kees Cook, Andy Lutomirski, Masami Hiramatsu, Nadav Amit,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Peter Zijlstra,
	Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Sun, 26 Aug 2018, Andy Lutomirski wrote:
> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>> I tried to convince Ingo to use this method for doing "write rarely"
> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
> >>> it's local to the CPU. I had proposed it in
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> >> 
> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> > 
> > Sorry, it looks like it was tglx, not Ingo:
> > 
> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> > 
> > This thread is long, and one thing that I think went unanswered was
> > "why do we want this to be fast?" the answer is: for doing page table
> > updates. Page tables are becoming a bigger target for attacks now, and
> > it's be nice if they could stay read-only unless they're getting
> > updated (with something like this).
> > 
> > 
> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
> patch has a serious problem wrt the NMI code.

That's exactly the problem I have with CR0. It leaves everything and some
more writeable for any code which can interrupt that section.

Performance wise CR0 is not that much better than CR3 except that it has
the costs nicely hidden.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 20:15                                             ` Thomas Gleixner
@ 2018-08-26 22:03                                               ` Kees Cook
  2018-08-26 22:15                                                 ` Matthew Wilcox
  2018-08-26 22:29                                                 ` Jann Horn
  0 siblings, 2 replies; 96+ messages in thread
From: Kees Cook @ 2018-08-26 22:03 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Andy Lutomirski, Andy Lutomirski, Masami Hiramatsu, Nadav Amit,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Peter Zijlstra,
	Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Sun, Aug 26, 2018 at 1:15 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Sun, 26 Aug 2018, Andy Lutomirski wrote:
>> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
>> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> >>> I tried to convince Ingo to use this method for doing "write rarely"
>> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
>> >>> it's local to the CPU. I had proposed it in
>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
>> >>
>> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
>> >
>> > Sorry, it looks like it was tglx, not Ingo:
>> >
>> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
>> >
>> > This thread is long, and one thing that I think went unanswered was
>> > "why do we want this to be fast?" the answer is: for doing page table
>> > updates. Page tables are becoming a bigger target for attacks now, and
>> > it's be nice if they could stay read-only unless they're getting
>> > updated (with something like this).
>> >
>> >
>> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
>> patch has a serious problem wrt the NMI code.
>
> That's exactly the problem I have with CR0. It leaves everything and some
> more writeable for any code which can interrupt that section.

I thought the point was that the implementation I suggested was
NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
like disabling interrupts is redundant to preempt_disable()? But I
don't understand how; it looks like the preempt stuff is advisory?)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 22:03                                               ` Kees Cook
@ 2018-08-26 22:15                                                 ` Matthew Wilcox
  2018-08-26 22:29                                                 ` Jann Horn
  1 sibling, 0 replies; 96+ messages in thread
From: Matthew Wilcox @ 2018-08-26 22:15 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Andy Lutomirski, Andy Lutomirski,
	Masami Hiramatsu, Nadav Amit, Linus Torvalds, Paolo Bonzini,
	Jiri Kosina, Peter Zijlstra, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sun, Aug 26, 2018 at 03:03:59PM -0700, Kees Cook wrote:
> I thought the point was that the implementation I suggested was
> NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
> like disabling interrupts is redundant to preempt_disable()? But I
> don't understand how; it looks like the preempt stuff is advisory?)

Other way round; disabling interrupts implicitly disables preemption.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26 22:03                                               ` Kees Cook
  2018-08-26 22:15                                                 ` Matthew Wilcox
@ 2018-08-26 22:29                                                 ` Jann Horn
  1 sibling, 0 replies; 96+ messages in thread
From: Jann Horn @ 2018-08-26 22:29 UTC (permalink / raw)
  To: Kees Cook, Andy Lutomirski
  Cc: Thomas Gleixner, Andy Lutomirski, mhiramat, Nadav Amit,
	Linus Torvalds, Paolo Bonzini, jkosina, Peter Zijlstra,
	Will Deacon, benh, npiggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Adin Scannell, Dave Hansen,
	kernel list, Linux-MM, David S. Miller, Martin Schwidefsky,
	Michael Ellerman

On Mon, Aug 27, 2018 at 12:11 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Sun, Aug 26, 2018 at 1:15 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Sun, 26 Aug 2018, Andy Lutomirski wrote:
> >> > On Aug 26, 2018, at 9:47 AM, Kees Cook <keescook@chromium.org> wrote:
> >> >> On Sun, Aug 26, 2018 at 7:20 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> >>> I tried to convince Ingo to use this method for doing "write rarely"
> >> >>> and he soundly rejected it. :) I've always liked this because AFAICT,
> >> >>> it's local to the CPU. I had proposed it in
> >> >>> https://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git/commit/?h=kspp/write-rarely&id=9ab0cb2618ebbc51f830ceaa06b7d2182fe1a52d
> >> >>
> >> >> Ingo, can you clarify why you hate it?  I personally would rather use CR3, but CR0 seems like a fine first step, at least for text_poke.
> >> >
> >> > Sorry, it looks like it was tglx, not Ingo:
> >> >
> >> > https://lkml.kernel.org/r/alpine.DEB.2.20.1704071048360.1716@nanos
> >> >
> >> > This thread is long, and one thing that I think went unanswered was
> >> > "why do we want this to be fast?" the answer is: for doing page table
> >> > updates. Page tables are becoming a bigger target for attacks now, and
> >> > it's be nice if they could stay read-only unless they're getting
> >> > updated (with something like this).
> >> >
> >> >
> >> It kind of sounds like tglx would prefer the CR3 approach. And indeed my
> >> patch has a serious problem wrt the NMI code.
> >
> > That's exactly the problem I have with CR0. It leaves everything and some
> > more writeable for any code which can interrupt that section.
>
> I thought the point was that the implementation I suggested was
> NMI-proof? (And in reading Documentation/preempt-locking.txt it sounds
> like disabling interrupts is redundant to preempt_disable()? But I
> don't understand how; it looks like the preempt stuff is advisory?)

Where are you dealing with NMIs? local_irq_disable() disables the
interrupt flag, but Non-Maskable Interrupts can still come in. As far
as I know, the only way to block those is to artificially generate an
NMI yourself (Xen does that sometimes). Otherwise, you have to twiddle
CR0.WP in the NMI handler entry/exit code.
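
A rough sketch of that kind of NMI-side twiddling, with hypothetical hook
names and NMI nesting ignored:

#include <linux/percpu-defs.h>
#include <asm/special_insns.h>
#include <asm/processor-flags.h>

/* Sketch only: force WP back on for the duration of the NMI and restore
 * whatever the interrupted context had on the way out. */
static DEFINE_PER_CPU(unsigned long, nmi_saved_cr0);

void nmi_cr0_wp_enter(void)		/* hypothetical, called on NMI entry */
{
	unsigned long cr0 = read_cr0();

	__this_cpu_write(nmi_saved_cr0, cr0);
	if (!(cr0 & X86_CR0_WP))
		write_cr0(cr0 | X86_CR0_WP);
}

void nmi_cr0_wp_exit(void)		/* hypothetical, called on NMI exit */
{
	write_cr0(__this_cpu_read(nmi_saved_cr0));
}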

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26  4:21                                   ` Andy Lutomirski
  2018-08-26  4:43                                     ` Kees Cook
  2018-08-26  9:09                                     ` Peter Zijlstra
@ 2018-08-26 22:48                                     ` Jann Horn
  2 siblings, 0 replies; 96+ messages in thread
From: Jann Horn @ 2018-08-26 22:48 UTC (permalink / raw)
  To: Andy Lutomirski, Kees Cook
  Cc: mhiramat, Nadav Amit, Linus Torvalds, Paolo Bonzini, jkosina,
	Peter Zijlstra, Will Deacon, benh, npiggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Adin Scannell, Dave Hansen, kernel list, Linux-MM,
	David S. Miller, Martin Schwidefsky, Michael Ellerman

On Sun, Aug 26, 2018 at 6:21 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Sat, Aug 25, 2018 at 7:23 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> > On Fri, 24 Aug 2018 21:23:26 -0700
> > Andy Lutomirski <luto@kernel.org> wrote:
> >> Couldn't text_poke() use kmap_atomic()?  Or, even better, just change CR3?
> >
> > No, since kmap_atomic() is only for x86_32 and highmem support kernel.
> > In x86-64, it seems that returns just a page address. That is not
> > good for text_poke, since it needs to make a writable alias for RO
> > code page. Hmm, maybe, can we mimic copy_oldmem_page(), it uses ioremap_cache?
> >
>
> I just re-read text_poke().  It's, um, horrible.  Not only is the
> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> It's totally the wrong API -- poking one instruction at a time
> basically can't be efficient on x86.  The API should either poke lots
> of instructions at once or should be text_poke_begin(); ...;
> text_poke_end();.
>
> Anyway, the attached patch seems to boot.  Linus, Kees, etc: is this
> too scary of an approach?  With the patch applied, text_poke() is a
> fantastic exploit target.  On the other hand, even without the patch
> applied, text_poke() is every bit as juicy.

Twiddling CR0.WP is incompatible with Xen PV, right? It can't let you
do it because you're not actually running in ring 0 (but in ring 1 or
3), so CR0.WP has no influence on what you can access; and it must not
let you bypass write protection because you have read-only access to
host page tables. I think this code has to be compatible with Xen PV,
right?

In theory Xen PV could support this by emulating X86 instructions, but
I don't see anything related to CR0.WP in their emulation code. From
xen/arch/x86/pv/emul-priv-op.c:

    case 0: /* Write CR0 */
        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
        {
            gdprintk(XENLOG_WARNING,
                     "Attempt to change unmodifiable CR0 flags\n");
            break;
        }
        do_fpu_taskswitch(!!(val & X86_CR0_TS));
        return X86EMUL_OKAY;

Having a special fallback path for "patch kernel code while running
under Xen PV" would be kinda ugly.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-26  9:09                                     ` Peter Zijlstra
@ 2018-08-27  3:03                                       ` Masami Hiramatsu
  2018-08-27  3:26                                         ` Nadav Amit
  2018-08-27  8:13                                         ` Peter Zijlstra
  0 siblings, 2 replies; 96+ messages in thread
From: Masami Hiramatsu @ 2018-08-27  3:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, Masami Hiramatsu, Kees Cook, Nadav Amit,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sun, 26 Aug 2018 11:09:58 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> > I just re-read text_poke().  It's, um, horrible.  Not only is the
> > implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> > It's totally the wrong API -- poking one instruction at a time
> > basically can't be efficient on x86.  The API should either poke lots
> > of instructions at once or should be text_poke_begin(); ...;
> > text_poke_end();.
> 
> I don't think anybody ever cared about performance here. Only
> correctness. That whole text_poke_bp() thing is entirely tricky.

Agreed. Self modification is a special event.

> FWIW, before text_poke_bp(), text_poke() would only be used from
> stop_machine, so all the other CPUs would be stuck busy-waiting with
> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> text_mutex should be serializing all that.

I'm still not sure that a speculative page-table walk can be done
over the mutex. Also, if the fixmap area is for aliasing
pages (which are always mapped to memory), what kind of
security issue can happen?

Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
changing CR3 sounds good to me. I think we don't even need per-cpu,
it can call a thread/function on a dedicated core (like the first
boot processor) and wait :) This may prevent leakage of pte change
to other cores.

> And on that, I so hate comments like: "must be called under foo_mutex",
> we have lockdep_assert_held() for that.

Indeed. I also think that text_poke() should not call BUG_ON, but
its caller should decide whether it is recoverable or not.

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  3:03                                       ` Masami Hiramatsu
@ 2018-08-27  3:26                                         ` Nadav Amit
  2018-08-27  8:05                                           ` Masami Hiramatsu
  2018-08-27  8:13                                         ` Peter Zijlstra
  1 sibling, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27  3:26 UTC (permalink / raw)
  To: Masami Hiramatsu, Peter Zijlstra
  Cc: Andy Lutomirski, Kees Cook, Linus Torvalds, Paolo Bonzini,
	Jiri Kosina, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Sun, 26 Aug 2018 11:09:58 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>> It's totally the wrong API -- poking one instruction at a time
>>> basically can't be efficient on x86.  The API should either poke lots
>>> of instructions at once or should be text_poke_begin(); ...;
>>> text_poke_end();.
>> 
>> I don't think anybody ever cared about performance here. Only
>> correctness. That whole text_poke_bp() thing is entirely tricky.
> 
> Agreed. Self modification is a special event.
> 
>> FWIW, before text_poke_bp(), text_poke() would only be used from
>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>> text_mutex should be serializing all that.
> 
> I'm still not sure that speculative page-table walk can be done
> over the mutex. Also, if the fixmap area is for aliasing
> pages (which always mapped to memory), what kind of
> security issue can happen?

The PTE is accessible from other cores, so just as we assume for L1TF that
every addressable memory might be cached in L1, we should assume a
PTE might be cached in the TLB when it is present.

Although the mapping is for an alias, there are a couple of issues here.
First, this alias mapping is writable, so it might allow an attacker to change
the kernel code (following another initial attack). Second, the alias mapping
is never explicitly flushed. We may assume that once the original mapping is
removed/changed, a full TLB flush would take place, but there is no
guarantee it actually takes place.

> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
> changing CR3 sounds good to me. I think we don't even need per-cpu,
> it can call a thread/function on a dedicated core (like the first
> boot processor) and wait :) This may prevent leakage of pte change
> to other cores.

I implemented a per-cpu fixmap, but I think that it makes more sense to take
peterz's approach and set an entry in the PGD level. A per-CPU fixmap either
requires pre-populating various levels in the page-table hierarchy, or
conditionally synchronizing whenever module memory is allocated, since they
can share the same PGD, PUD & PMD. While usually the synchronization is not
needed, the possibility that synchronization is needed complicates locking.

Anyhow, having fixed addresses for the fixmap can be used to circumvent
KASLR.

I don’t think a dedicated core is needed. Anyhow there is a lock
(text_mutex), so use_mm() can be used after acquiring the mutex.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-24 11:39                   ` Peter Zijlstra
@ 2018-08-27  5:00                     ` Nicholas Piggin
  2018-08-27  7:47                       ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-27  5:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Fri, 24 Aug 2018 13:39:53 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 24, 2018 at 10:47:17AM +0200, Peter Zijlstra wrote:  
> > > On Thu, Aug 23, 2018 at 02:39:59PM +0100, Will Deacon wrote:  
> > > > The only problem with this approach is that we've lost track of the granule
> > > > size by the point we get to the tlb_flush(), so we can't adjust the stride of
> > > > the TLB invalidations for huge mappings, which actually works nicely in the
> > > > synchronous case (e.g. we perform a single invalidation for a 2MB mapping,
> > > > rather than iterating over it at a 4k granule).
> > > > 
> > > > One thing we could do is switch to synchronous mode if we detect a change in
> > > > granule (i.e. treat it like a batch failure).  
> > > 
> > > We could use tlb_start_vma() to track that, I think. Shouldn't be too
> > > hard.  
> > 
> > Hurm.. look at commit:
> > 
> >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")  
> 
> Ah, good, it seems that already got cleaned up a lot. But it all moved
> into the power code.. blergh.

I lost track of what the problem is here?

For powerpc, tlb_start_vma is not the right API to use for this because
it wants to deal with different page sizes within a vma.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  5:00                     ` Nicholas Piggin
@ 2018-08-27  7:47                       ` Peter Zijlstra
  2018-08-27  8:04                         ` Nicholas Piggin
                                           ` (2 more replies)
  0 siblings, 3 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-27  7:47 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 03:00:08PM +1000, Nicholas Piggin wrote:
> On Fri, 24 Aug 2018 13:39:53 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:

> > > Hurm.. look at commit:
> > > 
> > >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")  
> > 
> > Ah, good, it seems that already got cleaned up a lot. But it all moved
> > into the power code.. blergh.
> 
> I lost track of what the problem is here?

Aside from the commit above being absolute crap (which did get fixed up,
luckily) I would really like to get rid of all arch specific mmu_gather.

We can have opt-in bits to the generic code, but the endless back and
forth between common and arch code is an utter pain in the arse.

And there's only like 4 architectures that still have a custom
mmu_gather:

  - sh
  - arm
  - ia64
  - s390

sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
arch/ia64' leaving us with s390.

After that everyone uses the common code and we can clean up.

> For powerpc, tlb_start_vma is not the right API to use for this because
> it wants to deal with different page sizes within a vma.

Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
still correct for huge pages, inefficient maybe, but so is flushing
every other page because of 'sparse' transparent-huge-pages.
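
For reference, the 4K-stride invalidation referred to here is roughly the
following (a sketch; correct even when the range is backed by huge pages,
just more INVLPGs than strictly needed):

#include <asm/tlbflush.h>
#include <asm/page.h>

/* Sketch: flush [start, end) one 4K page at a time on this CPU. */
static void flush_range_4k_stride(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE)
		__flush_tlb_one_user(addr);	/* one INVLPG per 4K step */
}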


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  7:47                       ` Peter Zijlstra
@ 2018-08-27  8:04                         ` Nicholas Piggin
  2018-08-27  8:09                           ` Benjamin Herrenschmidt
  2018-08-27 13:36                           ` Rik van Riel
  2018-08-27  8:57                         ` removig ia64, was: " Christoph Hellwig
  2018-08-27 11:00                         ` Peter Zijlstra
  2 siblings, 2 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-27  8:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 09:47:01 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 03:00:08PM +1000, Nicholas Piggin wrote:
> > On Fri, 24 Aug 2018 13:39:53 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:  
> > > On Fri, Aug 24, 2018 at 01:32:14PM +0200, Peter Zijlstra wrote:  
> 
> > > > Hurm.. look at commit:
> > > > 
> > > >   e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change")    
> > > 
> > > Ah, good, it seems that already got cleaned up a lot. But it all moved
> > > into the power code.. blergh.  
> > 
> > I lost track of what the problem is here?  
> 
> Aside from the commit above being absolute crap (which did get fixed up,
> luckily) I would really like to get rid of all arch specific mmu_gather.
> 
> We can have opt-in bits to the generic code, but the endless back and
> forth between common and arch code is an utter pain in the arse.
> 
> And there's only like 4 architectures that still have a custom
> mmu_gather:
> 
>   - sh
>   - arm
>   - ia64
>   - s390
> 
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

Well I don't see a big problem in having an arch_mmu_gather field
or small bits. powerpc would actually like that rather than trying
to add things it wants into generic code (and it wants more than
just a few flags bits, ideally).

> After that everyone uses the common code and we can clean up.
> 
> > For powerpc, tlb_start_vma is not the right API to use for this because
> > it wants to deal with different page sizes within a vma.  
> 
> Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> still correct for huge pages, inefficient maybe, but so is flushing
> every other page because 'sparse' transparant-huge-pages.

It could do that. It requires a tlbie that matches the page size,
so it means 3 sizes. I think possibly even that would be better
than current code, but we could do better if we had a few specific
fields in there.
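
As an illustration of the "few specific fields" idea, a hypothetical layout
(field names and the particular sizes are made up):

/* Sketch: per-page-size ranges an arch could carry in its gather state, so
 * tlb_flush() can issue one appropriately sized flush per size actually seen. */
struct arch_gather_ranges {
	unsigned long start_4k, end_4k;
	unsigned long start_2m, end_2m;		/* sizes are illustrative */
	unsigned long start_1g, end_1g;
	unsigned int  sizes_seen;		/* bitmask of the sizes above */
};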

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  3:26                                         ` Nadav Amit
@ 2018-08-27  8:05                                           ` Masami Hiramatsu
  2018-08-27 17:34                                             ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Masami Hiramatsu @ 2018-08-27  8:05 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Peter Zijlstra, Andy Lutomirski, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Sun, 26 Aug 2018 20:26:09 -0700
Nadav Amit <nadav.amit@gmail.com> wrote:

> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
> 
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> >> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
> >>> I just re-read text_poke().  It's, um, horrible.  Not only is the
> >>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
> >>> It's totally the wrong API -- poking one instruction at a time
> >>> basically can't be efficient on x86.  The API should either poke lots
> >>> of instructions at once or should be text_poke_begin(); ...;
> >>> text_poke_end();.
> >> 
> >> I don't think anybody ever cared about performance here. Only
> >> correctness. That whole text_poke_bp() thing is entirely tricky.
> > 
> > Agreed. Self modification is a special event.
> > 
> >> FWIW, before text_poke_bp(), text_poke() would only be used from
> >> stop_machine, so all the other CPUs would be stuck busy-waiting with
> >> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> >> text_mutex should be serializing all that.
> > 
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
> 
> The PTE is accessible from other cores, so just as we assume for L1TF that
> the every addressable memory might be cached in L1, we should assume and
> PTE might be cached in the TLB when it is present.

OK, so other cores can accidentally cache the PTE in the TLB (and there is
no way to shoot it down explicitly?)

> Although the mapping is for an alias, there are a couple of issues here.
> First, this alias mapping is writable, so it might an attacker to change the
> kernel code (following another initial attack).

Combined with some buffer overflow, correct? If the attacker can already
write kernel data directly, he is already in kernel mode.

> Second, the alias mapping is
> never explicitly flushed. We may assume that once the original mapping is
> removed/changed, a full TLB flush would take place, but there is no
> guarantee it actually takes place.

Hmm, would this mean a full TLB flush will not flush the alias mapping?
(Or that the full TLB flush just doesn't work?)

> > Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
> > changing CR3 sounds good to me. I think we don't even need per-cpu,
> > it can call a thread/function on a dedicated core (like the first
> > boot processor) and wait :) This may prevent leakage of pte change
> > to other cores.
> 
> I implemented per-cpu fixmap, but I think that it makes more sense to take
> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
> requires to pre-populate various levels in the page-table hierarchy, or
> conditionally synchronize whenever module memory is allocated, since they
> can share the same PGD, PUD & PMD. While usually the synchronization is not
> needed, the possibility that synchronization is needed complicates locking.
> 

Could you point me at which PeterZ approach you mean? I guess it would be to
make a clone of the PGD and use it for the local page mapping (as a new mm).
If so, yes, that sounds perfectly fine to me.

> Anyhow, having fixed addresses for the fixmap can be used to circumvent
> KASLR.

I think text_poke doesn't mind using a random address :)

> I don’t think a dedicated core is needed. Anyhow there is a lock
> (text_mutex), so use_mm() can be used after acquiring the mutex.

Hmm, use_mm() says:

/*
 * use_mm
 *      Makes the calling kernel thread take on the specified
 *      mm context.
 *      (Note: this routine is intended to be called only
 *      from a kernel thread context)
 */

So maybe we need a dedicated kernel thread to be safe?

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:04                         ` Nicholas Piggin
@ 2018-08-27  8:09                           ` Benjamin Herrenschmidt
  2018-08-27  8:20                             ` Peter Zijlstra
  2018-08-27  9:02                             ` Nicholas Piggin
  2018-08-27 13:36                           ` Rik van Riel
  1 sibling, 2 replies; 96+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-27  8:09 UTC (permalink / raw)
  To: Nicholas Piggin, Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> > Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> > ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> > do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> > still correct for huge pages, inefficient maybe, but so is flushing
> > every other page because 'sparse' transparant-huge-pages.
> 
> It could do that. It requires a tlbie that matches the page size,
> so it means 3 sizes. I think possibly even that would be better
> than current code, but we could do better if we had a few specific
> fields in there.

More tlbies? With the cost of the broadcasts on the fabric? I don't
think so... or I'm not understanding your point...

Sadly our architecture requires a precise match between the page size
specified in the tlbie instruction and the entry in the TLB or it won't
be flushed.

Ben.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  3:03                                       ` Masami Hiramatsu
  2018-08-27  3:26                                         ` Nadav Amit
@ 2018-08-27  8:13                                         ` Peter Zijlstra
  2018-08-27  9:39                                           ` Masami Hiramatsu
  2018-08-27  9:55                                           ` Jann Horn
  1 sibling, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-27  8:13 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Andy Lutomirski, Kees Cook, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> On Sun, 26 Aug 2018 11:09:58 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:

> > FWIW, before text_poke_bp(), text_poke() would only be used from
> > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > text_mutex should be serializing all that.
> 
> I'm still not sure that speculative page-table walk can be done
> over the mutex. Also, if the fixmap area is for aliasing
> pages (which always mapped to memory), what kind of
> security issue can happen?

So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
such that other CPUs get to continue with whatever they're doing).

While at that point, CPU-B gets an interrupt, and the CPU's
branch-trace-buffer for the IRET points to / near our fixmap. Then the
CPU could do a speculative TLB fill based on the BTB value, either
directly or indirectly (through speculation-driven fault-ahead) of
whatever is in the fixmap at the time.

Then CPU-A completes the text_poke and only does a local TLB invalidate
on CPU-A, leaving CPU-B with an active translation.

*FAIL*
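
For contrast, a sketch of the kind of invalidation that would close that
window, with poke_addr standing in for the fixmap alias address; the point is
a flush that reaches every CPU rather than only the local one:

#include <linux/mm.h>
#include <asm/tlbflush.h>

/* Sketch: after restoring the original fixmap PTE on CPU-A, invalidate the
 * alias on *all* CPUs so CPU-B cannot keep using a stale translation. */
static void poke_flush_alias(unsigned long poke_addr)
{
	flush_tlb_kernel_range(poke_addr, poke_addr + PAGE_SIZE);	/* IPIs every CPU */
}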



^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:09                           ` Benjamin Herrenschmidt
@ 2018-08-27  8:20                             ` Peter Zijlstra
  2018-08-27  8:54                               ` Nicholas Piggin
  2018-08-27  9:02                             ` Nicholas Piggin
  1 sibling, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-27  8:20 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Nicholas Piggin, Will Deacon, Linus Torvalds, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 06:09:50PM +1000, Benjamin Herrenschmidt wrote:

> Sadly our architecture requires a precise match between the page size
> specified in the tlbie instruction and the entry in the TLB or it won't
> be flushed.

Argh.. OK I see. That is rather unfortunate and does seem to require
something along the lines of tlb_remove_check_page_size_change().

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:20                             ` Peter Zijlstra
@ 2018-08-27  8:54                               ` Nicholas Piggin
  0 siblings, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-27  8:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Benjamin Herrenschmidt, Will Deacon, Linus Torvalds,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 10:20:45 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 06:09:50PM +1000, Benjamin Herrenschmidt wrote:
> 
> > Sadly our architecture requires a precise match between the page size
> > specified in the tlbie instruction and the entry in the TLB or it won't
> > be flushed.  
> 
> Argh.. OK I see. That is rather unfortunate and does seem to require
> something along the lines of tlb_remove_check_page_size_change().

Or we can do better with some more of our own data in mmu_gather,
but things that probably few or no other architectures want. I've
held off trying to put any crap in generic code because there's
other lower hanging fruit still, but I'd really rather just give
archs the ability to put their own data in there. I don't really
see a downside to it (divergence of course, but the existing
proliferation of code is much harder to follow than some data that
would be maintained and used purely by the arch, and beats having
to implement entirely your own mmu_gather).

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* removig ia64, was: Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  7:47                       ` Peter Zijlstra
  2018-08-27  8:04                         ` Nicholas Piggin
@ 2018-08-27  8:57                         ` Christoph Hellwig
  2018-08-27 11:28                           ` Peter Zijlstra
  2018-08-27 11:45                           ` Jason Duerstock
  2018-08-27 11:00                         ` Peter Zijlstra
  2 siblings, 2 replies; 96+ messages in thread
From: Christoph Hellwig @ 2018-08-27  8:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nicholas Piggin, Will Deacon, Linus Torvalds,
	Benjamin Herrenschmidt, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman,
	Tony Luck, Fenghua Yu, linux-ia64

On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

Is removing ia64 a serious plan?  It is the cause for a fair share of
oddities in DMA land, and I did not have much luck getting maintainer
replies lately, but I didn't know of a plan to get rid of it.

What is the state of people still using ia64 mainline kernels vs just
old distros in the still existing machines?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:09                           ` Benjamin Herrenschmidt
  2018-08-27  8:20                             ` Peter Zijlstra
@ 2018-08-27  9:02                             ` Nicholas Piggin
  2018-08-27 22:13                               ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-27  9:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Peter Zijlstra, Will Deacon, Linus Torvalds, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 18:09:50 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> > > Yes.. I see that. tlb_remove_check_page_size_change() really is a rather
> > > ugly thing, it can cause loads of TLB flushes. Do you really _have_ to
> > > do that? The way ARM and x86 work is that using INVLPG in a 4K stride is
> > > still correct for huge pages, inefficient maybe, but so is flushing
> > > every other page because 'sparse' transparant-huge-pages.  
> > 
> > It could do that. It requires a tlbie that matches the page size,
> > so it means 3 sizes. I think possibly even that would be better
> > than current code, but we could do better if we had a few specific
> > fields in there.  
> 
> More tlbies ? With the cost of the broadasts on the fabric ? I don't
> think so.. or I'm not understanding your point...

More tlbies are no good, but there will be some places where it works
out much better (and fewer tlbies). Worst possible case for current code
is a big unmap with lots of scattered page sizes. We _should_ get that
with just a single PID flush at the end, but what we will get today is
a bunch of PID and VA flushes.

I don't propose doing that though, I'd rather be explicit about
tracking start and end range of each page size. Still not "optimal"
but neither is existing single range for sparse mappings... anyway it
will need to be profiled, but my point is we don't really fit exactly
what x86/arm want.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  8:13                                         ` Peter Zijlstra
@ 2018-08-27  9:39                                           ` Masami Hiramatsu
  2018-08-27  9:55                                           ` Jann Horn
  1 sibling, 0 replies; 96+ messages in thread
From: Masami Hiramatsu @ 2018-08-27  9:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, Kees Cook, Nadav Amit, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 10:13:29 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > FWIW, before text_poke_bp(), text_poke() would only be used from
> > > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > > text_mutex should be serializing all that.
> > 
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
> 
> So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
> such that other CPUs get to continue with whatever they're doing).
> 
> While at that point, CPU-B gets an interrupt, and the CPU's
> branch-trace-buffer for the IRET points to / near our fixmap. Then the
> CPU could do a speculative TLB fill based on the BTB value, either
> directly or indirectly (through speculative driven fault-ahead) of
> whatever is in te fixmap at the time.

Hmm, but how "near" is it enough? Since text_poke just map a non
executable alias page in fixmap, it is hard to suppose that IRET points
there (except for attacker change the IRET address).

I see that Intel CPU sometimes speculatively read-ahead the page tables,
but in that case, I guess we just need to keep fixmap area away from
text area. (Of course, it is hard to estimate how far is enough :( )

Anyway, I agree to introduce new page-table (and kthread?) for fixmap.

> Then CPU-A completes the text_poke and only does a local TLB invalidate
> on CPU-A, leaving CPU-B with an active translation.
> 
> *FAIL*

Ah, I got it. So CPU-B can write to the fixmap'd pages unless it
shoots down its full TLB...

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  8:13                                         ` Peter Zijlstra
  2018-08-27  9:39                                           ` Masami Hiramatsu
@ 2018-08-27  9:55                                           ` Jann Horn
  1 sibling, 0 replies; 96+ messages in thread
From: Jann Horn @ 2018-08-27  9:55 UTC (permalink / raw)
  To: Peter Zijlstra, Andy Lutomirski, Dave Hansen
  Cc: mhiramat, Kees Cook, Nadav Amit, Linus Torvalds, Paolo Bonzini,
	jkosina, Will Deacon, benh, npiggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Adin Scannell, kernel list,
	Linux-MM, David S. Miller, Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 10:13 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Aug 27, 2018 at 12:03:05PM +0900, Masami Hiramatsu wrote:
> > On Sun, 26 Aug 2018 11:09:58 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > FWIW, before text_poke_bp(), text_poke() would only be used from
> > > stop_machine, so all the other CPUs would be stuck busy-waiting with
> > > IRQs disabled. These days, yeah, that's lots more dodgy, but yes
> > > text_mutex should be serializing all that.
> >
> > I'm still not sure that speculative page-table walk can be done
> > over the mutex. Also, if the fixmap area is for aliasing
> > pages (which always mapped to memory), what kind of
> > security issue can happen?
>
> So suppose CPU-A is doing the text_poke (let's say through text_poke_bp,
> such that other CPUs get to continue with whatever they're doing).
>
> While at that point, CPU-B gets an interrupt, and the CPU's
> branch-trace-buffer for the IRET points to / near our fixmap. Then the
> CPU could do a speculative TLB fill based on the BTB value, either
> directly or indirectly (through speculative driven fault-ahead) of
> whatever is in te fixmap at the time.

Worse: The way academics have been defeating KASLR for a while is
based on TLB fills for kernel addresses, triggered from userspace.
Quoting https://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf :

| Additionally, even if a permission error occurs, this still allows to
| launch address translations and, hence, generate valid TLB entries
| by accessing privileged kernel space memory from user mode.

This was actually part of the original motivation for KAISER/KPTI.
Quoting https://gruss.cc/files/kaiser.pdf :

| Modern operating system kernels employ address space layout
| randomization (ASLR) to prevent control-flow hijacking attacks and
| code-injection attacks. While kernel security relies fundamentally on
| preventing access to address information, recent attacks have shown that
| the hardware directly leaks this information.

I believe that PTI probably prevents this way of directly triggering
TLB fills for now (under the assumption that hyperthreads with equal
CR3 don't share TLB entries), but I would still assume that an
attacker can probably trigger TLB fills for arbitrary addresses
anytime. And at some point in the future, I believe people would
probably like to be able to disable PTI again?

> Then CPU-A completes the text_poke and only does a local TLB invalidate
> on CPU-A, leaving CPU-B with an active translation.
>
> *FAIL*

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  7:47                       ` Peter Zijlstra
  2018-08-27  8:04                         ` Nicholas Piggin
  2018-08-27  8:57                         ` removig ia64, was: " Christoph Hellwig
@ 2018-08-27 11:00                         ` Peter Zijlstra
  2018-08-30  0:13                           ` Vineet Gupta
  2 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-27 11:00 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman, jejb, vgupta

On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> And there's only like 4 architectures that still have a custom
> mmu_gather:
> 
>   - sh
>   - arm
>   - ia64
>   - s390
> 
> sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> arch/ia64' leaving us with s390.

The one obvious thing SH and ARM want is a sensible default for
tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )

The below makes tlb_start_vma() default to flush_cache_range(), which
should be right and sufficient. The only exceptions that I found were
(oddly):

  - m68k-mmu
  - sparc64
  - unicore

Those architectures appear to have a non-NOP flush_cache_range(), but
their current tlb_start_vma() does not call it.

Furthermore, I think tlb_flush() is broken on arc and parisc; in
particular they don't appear to have any TLB invalidate for the
shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.

Possibly shift_arg_pages() should be fixed instead.

Some archs (nds32,sparc32) avoid this by having an unconditional
flush_tlb_mm() in tlb_flush(), which seems somewhat suboptimal if you
have flush_tlb_range().  TLB_FLUSH_VMA() might be an option, however
hideous it is.

---

diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
index a9db5f62aaf3..7af2b373ebe7 100644
--- a/arch/arc/include/asm/tlb.h
+++ b/arch/arc/include/asm/tlb.h
@@ -23,15 +23,6 @@ do {						\
  *
  * Note, read http://lkml.org/lkml/2004/1/15/6
  */
-#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
-#define tlb_start_vma(tlb, vma)
-#else
-#define tlb_start_vma(tlb, vma)						\
-do {									\
-	if (!tlb->fullmm)						\
-		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
-} while(0)
-#endif
 
 #define tlb_end_vma(tlb, vma)						\
 do {									\
diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h
index b6823b9e94da..9d04b4649692 100644
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -5,16 +5,6 @@
 #include <asm/cpu-features.h>
 #include <asm/mipsregs.h>
 
-/*
- * MIPS doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
 /*
diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h
index b35ae5eae3ab..0bf7c9482381 100644
--- a/arch/nds32/include/asm/tlb.h
+++ b/arch/nds32/include/asm/tlb.h
@@ -4,12 +4,6 @@
 #ifndef __ASMNDS32_TLB_H
 #define __ASMNDS32_TLB_H
 
-#define tlb_start_vma(tlb,vma)						\
-	do {								\
-		if (!tlb->fullmm)					\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	} while (0)
-
 #define tlb_end_vma(tlb,vma)				\
 	do { 						\
 		if(!tlb->fullmm)			\
diff --git a/arch/nios2/include/asm/tlb.h b/arch/nios2/include/asm/tlb.h
index d3bc648e08b5..9b518c6d0f62 100644
--- a/arch/nios2/include/asm/tlb.h
+++ b/arch/nios2/include/asm/tlb.h
@@ -15,16 +15,6 @@
 
 extern void set_mmu_pid(unsigned long pid);
 
-/*
- * NiosII doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for the area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address)	do { } while (0)
 
diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h
index 0c881e74d8a6..b1984f9cd3af 100644
--- a/arch/parisc/include/asm/tlb.h
+++ b/arch/parisc/include/asm/tlb.h
@@ -7,11 +7,6 @@ do {	if ((tlb)->fullmm)		\
 		flush_tlb_mm((tlb)->mm);\
 } while (0)
 
-#define tlb_start_vma(tlb, vma) \
-do {	if (!(tlb)->fullmm)	\
-		flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-} while (0)
-
 #define tlb_end_vma(tlb, vma)	\
 do {	if (!(tlb)->fullmm)	\
 		flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
diff --git a/arch/sparc/include/asm/tlb_32.h b/arch/sparc/include/asm/tlb_32.h
index 343cea19e573..68d817273de8 100644
--- a/arch/sparc/include/asm/tlb_32.h
+++ b/arch/sparc/include/asm/tlb_32.h
@@ -2,11 +2,6 @@
 #ifndef _SPARC_TLB_H
 #define _SPARC_TLB_H
 
-#define tlb_start_vma(tlb, vma) \
-do {								\
-	flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
-} while (0)
-
 #define tlb_end_vma(tlb, vma) \
 do {								\
 	flush_tlb_range(vma, vma->vm_start, vma->vm_end);	\
diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h
index 0d766f9c1083..1a93e350382e 100644
--- a/arch/xtensa/include/asm/tlb.h
+++ b/arch/xtensa/include/asm/tlb.h
@@ -16,19 +16,10 @@
 
 #if (DCACHE_WAY_SIZE <= PAGE_SIZE)
 
-/* Note, read http://lkml.org/lkml/2004/1/15/6 */
-
-# define tlb_start_vma(tlb,vma)			do { } while (0)
 # define tlb_end_vma(tlb,vma)			do { } while (0)
 
 #else
 
-# define tlb_start_vma(tlb, vma)					      \
-	do {								      \
-		if (!tlb->fullmm)					      \
-			flush_cache_range(vma, vma->vm_start, vma->vm_end);   \
-	} while(0)
-
 # define tlb_end_vma(tlb, vma)						      \
 	do {								      \
 		if (!tlb->fullmm)					      \
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e811ef7b8350..1d037fd5bb7a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
  * the vmas are adjusted to only cover the region to be torn down.
  */
 #ifndef tlb_start_vma
-#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_start_vma(tlb, vma)						\
+do {									\
+	if (!tlb->fullmm)						\
+		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
+} while (0)
 #endif
 
-#define __tlb_end_vma(tlb, vma)					\
-	do {							\
-		if (!tlb->fullmm && tlb->end) {			\
-			tlb_flush(tlb);				\
-			__tlb_reset_range(tlb);			\
-		}						\
-	} while (0)
-
 #ifndef tlb_end_vma
-#define tlb_end_vma	__tlb_end_vma
+#define tlb_end_vma(tlb, vma)						\
+	do {								\
+		if (!tlb->fullmm && tlb->end) {				\
+			tlb_flush(tlb);					\
+			__tlb_reset_range(tlb);				\
+		}							\
+	} while (0)
 #endif
 
 #ifndef __tlb_remove_tlb_entry

^ permalink raw reply related	[flat|nested] 96+ messages in thread

* Re: removig ia64, was: Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:57                         ` removig ia64, was: " Christoph Hellwig
@ 2018-08-27 11:28                           ` Peter Zijlstra
  2018-08-27 11:45                           ` Jason Duerstock
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-27 11:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nicholas Piggin, Will Deacon, Linus Torvalds,
	Benjamin Herrenschmidt, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman,
	Tony Luck, Fenghua Yu, linux-ia64

On Mon, Aug 27, 2018 at 01:57:08AM -0700, Christoph Hellwig wrote:
> On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> > sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> > arch/ia64' leaving us with s390.
> 
> Is removing ia64 a serious plan?

I 'joked' about it a while ago on IRC, and aegl reacted that it might
not be entirely unreasonable.

> It is the cause for a fair share of
> oddities in dma lang, and I did not have much luck getting maintainer
> replies lately, but I didn't know of a plan to get rid of it.
> 
> What is the state of people still using ia64 mainline kernels vs just
> old distros in the still existing machines?

Both arjan and aegl said that the vast majority of people still running
ia64 machines run old distros.



^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: removig ia64, was: Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:57                         ` removig ia64, was: " Christoph Hellwig
  2018-08-27 11:28                           ` Peter Zijlstra
@ 2018-08-27 11:45                           ` Jason Duerstock
  1 sibling, 0 replies; 96+ messages in thread
From: Jason Duerstock @ 2018-08-27 11:45 UTC (permalink / raw)
  To: hch
  Cc: peterz, npiggin, will.deacon, torvalds, benh, luto, x86, bp,
	riel, jannh, ascannell, dave.hansen, linux-kernel, linux-mm,
	davem, schwidefsky, mpe, tony.luck, fenghua.yu, linux-ia64

I cannot speak to how widely it has been adopted, but the linux
(kernel) package for version 4.17.17 has been successfully built and
installed for ia64 under Debian ports.  There is clearly more work to
do to get ia64 rehabilitated, but there are over 10,000 packages
currently successfully built for ia64 under Debian ports[1].

Jason

[1] https://buildd.debian.org/status/architecture.php?a=ia64&suite=sid
On Mon, Aug 27, 2018 at 4:57 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Mon, Aug 27, 2018 at 09:47:01AM +0200, Peter Zijlstra wrote:
> > sh is trivial, arm seems doable, with a bit of luck we can do 'rm -rf
> > arch/ia64' leaving us with s390.
>
> Is removing ia64 a serious plan?  It is the cause for a fair share of
> oddities in dma land, and I did not have much luck getting maintainer
> replies lately, but I didn't know of a plan to get rid of it.
>
> What is the state of people still using ia64 mainline kernels vs just
> old distros in the still existing machines?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  8:04                         ` Nicholas Piggin
  2018-08-27  8:09                           ` Benjamin Herrenschmidt
@ 2018-08-27 13:36                           ` Rik van Riel
  2018-08-27 14:29                             ` Nicholas Piggin
  1 sibling, 1 reply; 96+ messages in thread
From: Rik van Riel @ 2018-08-27 13:36 UTC (permalink / raw)
  To: Nicholas Piggin, Peter Zijlstra
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

[-- Attachment #1: Type: text/plain, Size: 880 bytes --]

On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:

> It could do that. It requires a tlbie that matches the page size,
> so it means 3 sizes. I think possibly even that would be better
> than current code, but we could do better if we had a few specific
> fields in there.

Would it cause a noticeable overhead to keep track
of which page sizes were removed, and to simply flush
the whole TLB in the (unlikely?) event that multiple
page sizes were removed in the same munmap?

Once the unmap is so large that multiple page sizes
were covered, you may already be looking at so many
individual flush operations that a full flush might
be faster.

Is there a point on PPC where simply flushing the
whole TLB, and having other things be reloaded later,
is faster than flushing every individual page mapping
that got unmapped?

-- 
All Rights Reversed.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27 13:36                           ` Rik van Riel
@ 2018-08-27 14:29                             ` Nicholas Piggin
  0 siblings, 0 replies; 96+ messages in thread
From: Nicholas Piggin @ 2018-08-27 14:29 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Peter Zijlstra, Will Deacon, Linus Torvalds,
	Benjamin Herrenschmidt, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Jann Horn,
	Adin Scannell, Dave Hansen, Linux Kernel Mailing List, linux-mm,
	David Miller, Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 09:36:50 -0400
Rik van Riel <riel@surriel.com> wrote:

> On Mon, 2018-08-27 at 18:04 +1000, Nicholas Piggin wrote:
> 
> > It could do that. It requires a tlbie that matches the page size,
> > so it means 3 sizes. I think possibly even that would be better
> > than current code, but we could do better if we had a few specific
> > fields in there.  
> 
> Would it cause a noticeable overhead to keep track
> of which page sizes were removed, and to simply flush
> the whole TLB in the (unlikely?) event that multiple
> page sizes were removed in the same munmap?
> 
> Once the unmap is so large that multiple page sizes
> were covered, you may already be looking at so many
> individual flush operations that a full flush might
> be faster.

It will take some profiling and measuring. Unmapping a small number of
huge pages plus a small number of surrounding small pages may not be
uncommon if THP is working well. That could become a lot more expensive.

> 
> Is there a point on PPC where simply flushing the
> whole TLB, and having other things be reloaded later,
> is faster than flushing every individual page mapping
> that got unmapped?

There is. For local TLB flushes that point is well over 100 individual
invalidates though. We're generally better off flushing all page sizes
for that case.
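
Roughly, the heuristic described above could look like the sketch below
(sketch only: page_size_mask, nr_pending and the ranged-flush hook at the
end are made-up names, not existing mmu_gather members, and 100 is just
the rough local break-even mentioned above):

#include <linux/log2.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#define TLB_FLUSH_ALL_THRESHOLD 100     /* rough local-flush break-even */

/* Hypothetical: remember every distinct page size gathered so far. */
static inline void tlb_note_page_size(struct mmu_gather *tlb,
                                      unsigned long page_size)
{
        tlb->page_size_mask |= page_size;       /* sizes are powers of two */
        tlb->nr_pending++;
}

static inline void tlb_flush_sketch(struct mmu_gather *tlb)
{
        bool mixed_sizes;

        if (!tlb->nr_pending)
                return;

        /* More than one bit set means more than one page size was seen. */
        mixed_sizes = !is_power_of_2(tlb->page_size_mask);

        if (tlb->fullmm || mixed_sizes ||
            tlb->nr_pending > TLB_FLUSH_ALL_THRESHOLD)
                flush_tlb_mm(tlb->mm);          /* one big hammer */
        else
                /* Hypothetical arch hook: ranged flush for a single size. */
                arch_flush_tlb_range_sized(tlb->mm, tlb->start, tlb->end,
                                           tlb->page_size_mask);
}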

Thanks,
Nick

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27  8:05                                           ` Masami Hiramatsu
@ 2018-08-27 17:34                                             ` Nadav Amit
  2018-08-27 18:45                                               ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 17:34 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Peter Zijlstra, Andy Lutomirski, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Sun, 26 Aug 2018 20:26:09 -0700
> Nadav Amit <nadav.amit@gmail.com> wrote:
> 
>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> 
>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>> text_poke_end();.
>>>> 
>>>> I don't think anybody ever cared about performance here. Only
>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>> 
>>> Agreed. Self modification is a special event.
>>> 
>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>> text_mutex should be serializing all that.
>>> 
>>> I'm still not sure that speculative page-table walk can be done
>>> over the mutex. Also, if the fixmap area is for aliasing
>>> pages (which always mapped to memory), what kind of
>>> security issue can happen?
>> 
>> The PTE is accessible from other cores, so just as we assume for L1TF that
>> every addressable memory location might be cached in L1, we should assume a
>> PTE might be cached in the TLB when it is present.
> 
> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
> to shoot down explicitly?)

There is a way (although currently it does not). But it seems that the consensus
is that it is better to avoid having it mapped at all on remote cores.

>> Although the mapping is for an alias, there are a couple of issues here.
>> First, this alias mapping is writable, so it might allow an attacker to change the
>> kernel code (following another initial attack).
> 
> Combined with some buffer overflow, correct? If the attacker already can
> write a kernel data directly, he is in the kernel mode.

Right.

> 
>> Second, the alias mapping is
>> never explicitly flushed. We may assume that once the original mapping is
>> removed/changed, a full TLB flush would take place, but there is no
>> guarantee it actually takes place.
> 
> Hmm, would this mean a full TLB flush will not flush the alias mapping?
> (or, the full TLB flush just doesn't work?)

It will flush the alias mapping, but currently there is no such explicit
flush.

>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>> it can call a thread/function on a dedicated core (like the first
>>> boot processor) and wait :) This may prevent leakage of pte change
>>> to other cores.
>> 
>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>> requires to pre-populate various levels in the page-table hierarchy, or
>> conditionally synchronize whenever module memory is allocated, since they
>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>> needed, the possibility that synchronization is needed complicates locking.
> 
> Could you point which PeterZ approach you said? I guess it will be
> make a clone of PGD and use it for local page mapping (as new mm).
> If so, yes it sounds perfectly fine to me.

The thread is too long. What I think is best is having a mapping in the PGD
level. I’ll try to give it a shot, and see what I get.

>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>> KASLR.
> 
> I think text_poke doesn't mind using random address :)
> 
>> I don’t think a dedicated core is needed. Anyhow there is a lock
>> (text_mutex), so use_mm() can be used after acquiring the mutex.
> 
> Hmm, use_mm() said;
> 
> /*
> * use_mm
> *      Makes the calling kernel thread take on the specified
> *      mm context.
> *      (Note: this routine is intended to be called only
> *      from a kernel thread context)
> */
> 
> So maybe we need a dedicated kernel thread for safeness?

Yes, it says so. But I am not sure it cannot be changed, at least for this
specific use-case. Switching kernel threads just for patching seems to me as
an overkill.

Let me see if I can get something half-reasonable doing so...


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 17:34                                             ` Nadav Amit
@ 2018-08-27 18:45                                               ` Andy Lutomirski
  2018-08-27 18:54                                                 ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-27 18:45 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Masami Hiramatsu, Peter Zijlstra, Andy Lutomirski, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
>> On Sun, 26 Aug 2018 20:26:09 -0700
>> Nadav Amit <nadav.amit@gmail.com> wrote:
>>
>>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>
>>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>>>
>>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>>> text_poke_end();.
>>>>>
>>>>> I don't think anybody ever cared about performance here. Only
>>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>>>
>>>> Agreed. Self modification is a special event.
>>>>
>>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>>> text_mutex should be serializing all that.
>>>>
>>>> I'm still not sure that speculative page-table walk can be done
>>>> over the mutex. Also, if the fixmap area is for aliasing
>>>> pages (which always mapped to memory), what kind of
>>>> security issue can happen?
>>>
>>> The PTE is accessible from other cores, so just as we assume for L1TF that
>>> every addressable memory location might be cached in L1, we should assume a
>>> PTE might be cached in the TLB when it is present.
>>
>> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
>> to shoot down explicitly?)
>
> There is a way (although currently it does not). But it seems that the consensus
> is that it is better to avoid having it mapped at all on remote cores.
>
>>> Although the mapping is for an alias, there are a couple of issues here.
>>> First, this alias mapping is writable, so it might allow an attacker to change the
>>> kernel code (following another initial attack).
>>
>> Combined with some buffer overflow, correct? If the attacker already can
>> write a kernel data directly, he is in the kernel mode.
>
> Right.
>
>>
>>> Second, the alias mapping is
>>> never explicitly flushed. We may assume that once the original mapping is
>>> removed/changed, a full TLB flush would take place, but there is no
>>> guarantee it actually takes place.
>>
>> Hmm, would this mean a full TLB flush will not flush the alias mapping?
>> (or, the full TLB flush just doesn't work?)
>
> It will flush the alias mapping, but currently there is no such explicit
> flush.
>
>>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>>> it can call a thread/function on a dedicated core (like the first
>>>> boot processor) and wait :) This may prevent leakage of pte change
>>>> to other cores.
>>>
>>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>>> requires to pre-populate various levels in the page-table hierarchy, or
>>> conditionally synchronize whenever module memory is allocated, since they
>>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>>> needed, the possibility that synchronization is needed complicates locking.
>>
>> Could you point which PeterZ approach you said? I guess it will be
>> make a clone of PGD and use it for local page mapping (as new mm).
>> If so, yes it sounds perfectly fine to me.
>
> The thread is too long. What I think is best is having a mapping in the PGD
> level. I’ll try to give it a shot, and see what I get.
>
>>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>>> KASLR.
>>
>> I think text_poke doesn't mind using random address :)
>>
>>> I don’t think a dedicated core is needed. Anyhow there is a lock
>>> (text_mutex), so use_mm() can be used after acquiring the mutex.
>>
>> Hmm, use_mm() said;
>>
>> /*
>> * use_mm
>> *      Makes the calling kernel thread take on the specified
>> *      mm context.
>> *      (Note: this routine is intended to be called only
>> *      from a kernel thread context)
>> */
>>
>> So maybe we need a dedicated kernel thread for safeness?
>
> Yes, it says so. But I am not sure it cannot be changed, at least for this
> specific use-case. Switching kernel threads just for patching seems to me as
> an overkill.
>
> Let me see if I can get something half-reasonable doing so...
>

I don't understand at all how a kernel thread helps.  The useful bit
is to have a dedicated mm, which would involve setting up an mm_struct
and mapping the kernel and module text, EFI-style, in the user portion
of the mm.  But, to do the text_poke(), we'd just use the mm *without
calling use_mm*.

In other words, the following sequence should be (almost) just fine:

typedef struct {
  struct mm_struct *prev;
} temporary_mm_state_t;

temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
    temporary_mm_state_t state;

    lockdep_assert_irqs_disabled();
    state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
    switch_mm_irqs_off(NULL, mm, current);
    return state;
}

void unuse_temporary_mm(temporary_mm_state_t prev)
{
    lockdep_assert_irqs_disabled();
    switch_mm_irqs_off(NULL, prev.prev, current);
}
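
A caller would then look something like this (sketch only: patching_mm and
POKE_ADDR are assumed to already exist, with the target page aliased
writably at POKE_ADDR, and cross-page pokes are ignored):

static void poke_via_temporary_mm(void *addr, const void *opcode, size_t len)
{
    temporary_mm_state_t prev;
    unsigned long flags;

    /* use_temporary_mm() asserts IRQs are off. */
    local_irq_save(flags);
    prev = use_temporary_mm(patching_mm);

    /* The page is writable only through patching_mm's alias. */
    memcpy((void *)(POKE_ADDR + offset_in_page(addr)), opcode, len);

    unuse_temporary_mm(prev);
    local_irq_restore(flags);
}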

The only thing wrong with this that I can see is that it interacts
poorly with perf.  But perf is *already* busted in this regard.  The
following (whitespace damaged, sorry) should fix it:

commit b62bff5a8406d252de752cfe75068d0b73b9cdf0
Author: Andy Lutomirski <luto@kernel.org>
Date:   Mon Aug 27 11:41:55 2018 -0700

    x86/nmi: Fix some races in NMI uaccess

    In NMI context, we might be in the middle of context switching or in
    the middle of switch_mm_irqs_off().  In either case, CR3 might not
    match current->mm, which could cause copy_from_user_nmi() and
    friends to read the wrong memory.

    Fix it by adding a new nmi_uaccess_okay() helper and checking it in
    copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

    Signed-off-by: Andy Lutomirski <luto@kernel.org>

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs

     perf_callchain_store(entry, regs->ip);

-    if (!current->mm)
+    if (!nmi_uaccess_okay())
         return;

     if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 89a73bc31622..b23b2625793b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -230,6 +230,22 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);

+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+    struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+    struct mm_struct *current_mm = current->mm;
+
+    return current_mm && loaded_mm == current_mm &&
+        loaded_mm->pgd == __va(read_cr3_pa());
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..c5f758430be2 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -19,6 +19,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
     if (__range_not_ok(from, n, TASK_SIZE))
         return n;

+    if (!nmi_uaccess_okay())
+        return n;
+
     /*
      * Even though this function is typically called from NMI/IRQ context
      * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 457b281b9339..f4b41d5a93dd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -345,6 +345,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
          */
         trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
     } else {
+        /* Let NMI code know that CR3 may not match expectations. */
+        this_cpu_write(cpu_tlbstate.loaded_mm, NULL);
+
         /* The new ASID is already up to date. */
         load_new_mm_cr3(next->pgd, new_asid, false);

What do you all think?

^ permalink raw reply related	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 18:45                                               ` Andy Lutomirski
@ 2018-08-27 18:54                                                 ` Nadav Amit
  2018-08-27 18:58                                                   ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 18:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 11:45 AM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 1:05 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> 
>>> On Sun, 26 Aug 2018 20:26:09 -0700
>>> Nadav Amit <nadav.amit@gmail.com> wrote:
>>> 
>>>> at 8:03 PM, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>>>> 
>>>>> On Sun, 26 Aug 2018 11:09:58 +0200
>>>>> Peter Zijlstra <peterz@infradead.org> wrote:
>>>>> 
>>>>>> On Sat, Aug 25, 2018 at 09:21:22PM -0700, Andy Lutomirski wrote:
>>>>>>> I just re-read text_poke().  It's, um, horrible.  Not only is the
>>>>>>> implementation overcomplicated and probably buggy, but it's SLOOOOOW.
>>>>>>> It's totally the wrong API -- poking one instruction at a time
>>>>>>> basically can't be efficient on x86.  The API should either poke lots
>>>>>>> of instructions at once or should be text_poke_begin(); ...;
>>>>>>> text_poke_end();.
>>>>>> 
>>>>>> I don't think anybody ever cared about performance here. Only
>>>>>> correctness. That whole text_poke_bp() thing is entirely tricky.
>>>>> 
>>>>> Agreed. Self modification is a special event.
>>>>> 
>>>>>> FWIW, before text_poke_bp(), text_poke() would only be used from
>>>>>> stop_machine, so all the other CPUs would be stuck busy-waiting with
>>>>>> IRQs disabled. These days, yeah, that's lots more dodgy, but yes
>>>>>> text_mutex should be serializing all that.
>>>>> 
>>>>> I'm still not sure that speculative page-table walk can be done
>>>>> over the mutex. Also, if the fixmap area is for aliasing
>>>>> pages (which always mapped to memory), what kind of
>>>>> security issue can happen?
>>>> 
>>>> The PTE is accessible from other cores, so just as we assume for L1TF that
>>>> every addressable memory location might be cached in L1, we should assume a
>>>> PTE might be cached in the TLB when it is present.
>>> 
>>> Ok, so other cores can accidentally cache the PTE in TLB, (and no way
>>> to shoot down explicitly?)
>> 
>> There is a way (although currently it does not). But it seems that the consensus
>> is that it is better to avoid having it mapped at all on remote cores.
>> 
>>>> Although the mapping is for an alias, there are a couple of issues here.
>>>> First, this alias mapping is writable, so it might allow an attacker to change the
>>>> kernel code (following another initial attack).
>>> 
>>> Combined with some buffer overflow, correct? If the attacker already can
>>> write a kernel data directly, he is in the kernel mode.
>> 
>> Right.
>> 
>>>> Second, the alias mapping is
>>>> never explicitly flushed. We may assume that once the original mapping is
>>>> removed/changed, a full TLB flush would take place, but there is no
>>>> guarantee it actually takes place.
>>> 
>>> Hmm, would this mean a full TLB flush will not flush the alias mapping?
>>> (or, the full TLB flush just doesn't work?)
>> 
>> It will flush the alias mapping, but currently there is no such explicit
>> flush.
>> 
>>>>> Anyway, from the viewpoint of kprobes, either per-cpu fixmap or
>>>>> changing CR3 sounds good to me. I think we don't even need per-cpu,
>>>>> it can call a thread/function on a dedicated core (like the first
>>>>> boot processor) and wait :) This may prevent leakage of pte change
>>>>> to other cores.
>>>> 
>>>> I implemented per-cpu fixmap, but I think that it makes more sense to take
>>>> peterz approach and set an entry in the PGD level. Per-CPU fixmap either
>>>> requires to pre-populate various levels in the page-table hierarchy, or
>>>> conditionally synchronize whenever module memory is allocated, since they
>>>> can share the same PGD, PUD & PMD. While usually the synchronization is not
>>>> needed, the possibility that synchronization is needed complicates locking.
>>> 
>>> Could you point which PeterZ approach you said? I guess it will be
>>> make a clone of PGD and use it for local page mapping (as new mm).
>>> If so, yes it sounds perfectly fine to me.
>> 
>> The thread is too long. What I think is best is having a mapping in the PGD
>> level. I’ll try to give it a shot, and see what I get.
>> 
>>>> Anyhow, having fixed addresses for the fixmap can be used to circumvent
>>>> KASLR.
>>> 
>>> I think text_poke doesn't mind using random address :)
>>> 
>>>> I don’t think a dedicated core is needed. Anyhow there is a lock
>>>> (text_mutex), so use_mm() can be used after acquiring the mutex.
>>> 
>>> Hmm, use_mm() said;
>>> 
>>> /*
>>> * use_mm
>>> *      Makes the calling kernel thread take on the specified
>>> *      mm context.
>>> *      (Note: this routine is intended to be called only
>>> *      from a kernel thread context)
>>> */
>>> 
>>> So maybe we need a dedicated kernel thread for safeness?
>> 
>> Yes, it says so. But I am not sure it cannot be changed, at least for this
>> specific use-case. Switching kernel threads just for patching seems to me as
>> an overkill.
>> 
>> Let me see if I can get something half-reasonable doing so...
> 
> I don't understand at all how a kernel thread helps.  The useful bit
> is to have a dedicated mm, which would involve setting up an mm_struct
> and mapping the kernel and module text, EFI-style, in the user portion
> of the mm.  But, to do the text_poke(), we'd just use the mm *without
> calling use_mm*.
> 
> In other words, the following sequence should be (almost) just fine:
> 
> typedef struct {
>  struct mm_struct *prev;
> } temporary_mm_state_t;
> 
> temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
> {
>    temporary_mm_state_t state;
> 
>    lockdep_assert_irqs_disabled();
>    state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>    switch_mm_irqs_off(NULL, mm, current);
>    return state;
> }
> 
> void unuse_temporary_mm(temporary_mm_state_t prev)
> {
>    lockdep_assert_irqs_disabled();
>    switch_mm_irqs_off(NULL, prev.prev, current);
> }
> 
> The only thing wrong with this that I can see is that it interacts
> poorly with perf.  But perf is *already* busted in this regard.  The
> following (whitespace damaged, sorry) should fix it:
> 
> commit b62bff5a8406d252de752cfe75068d0b73b9cdf0
> Author: Andy Lutomirski <luto@kernel.org>
> Date:   Mon Aug 27 11:41:55 2018 -0700
> 
>    x86/nmi: Fix some races in NMI uaccess
> 
>    In NMI context, we might be in the middle of context switching or in
>    the middle of switch_mm_irqs_off().  In either case, CR3 might not
>    match current->mm, which could cause copy_from_user_nmi() and
>    friends to read the wrong memory.
> 
>    Fix it by adding a new nmi_uaccess_okay() helper and checking it in
>    copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
> 
>    Signed-off-by: Andy Lutomirski <luto@kernel.org>
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 5f4829f10129..dfb2f7c0d019 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
> 
>     perf_callchain_store(entry, regs->ip);
> 
> -    if (!current->mm)
> +    if (!nmi_uaccess_okay())
>         return;
> 
>     if (perf_callchain_user32(regs, entry))
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 89a73bc31622..b23b2625793b 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -230,6 +230,22 @@ struct tlb_state {
> };
> DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
> 
> +/*
> + * Blindly accessing user memory from NMI context can be dangerous
> + * if we're in the middle of switching the current user task or
> + * switching the loaded mm.  It can also be dangerous if we
> + * interrupted some kernel code that was temporarily using a
> + * different mm.
> + */
> +static inline bool nmi_uaccess_okay(void)
> +{
> +    struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
> +    struct mm_struct *current_mm = current->mm;
> +
> +    return current_mm && loaded_mm == current_mm &&
> +        loaded_mm->pgd == __va(read_cr3_pa());
> +}
> +
> /* Initialize cr4 shadow for this CPU. */
> static inline void cr4_init_shadow(void)
> {
> diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
> index c8c6ad0d58b8..c5f758430be2 100644
> --- a/arch/x86/lib/usercopy.c
> +++ b/arch/x86/lib/usercopy.c
> @@ -19,6 +19,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
>     if (__range_not_ok(from, n, TASK_SIZE))
>         return n;
> 
> +    if (!nmi_uaccess_okay())
> +        return n;
> +
>     /*
>      * Even though this function is typically called from NMI/IRQ context
>      * disable pagefaults so that its behaviour is consistent even when
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 457b281b9339..f4b41d5a93dd 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -345,6 +345,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>          */
>         trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
>     } else {
> +        /* Let NMI code know that CR3 may not match expectations. */
> +        this_cpu_write(cpu_tlbstate.loaded_mm, NULL);
> +
>         /* The new ASID is already up to date. */
>         load_new_mm_cr3(next->pgd, new_asid, false);
> 
> What do you all think?

I agree in general. But I think that current->mm would need to be loaded, as
otherwise I am afraid it would break switch_mm_irqs_off().


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 18:54                                                 ` Nadav Amit
@ 2018-08-27 18:58                                                   ` Andy Lutomirski
  2018-08-27 19:10                                                     ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-27 18:58 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andy Lutomirski, Masami Hiramatsu, Peter Zijlstra, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> What do you all think?
>
> I agree in general. But I think that current->mm would need to be loaded, as
> otherwise I am afraid it would break switch_mm_irqs_off().
>

What breaks?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 18:58                                                   ` Andy Lutomirski
@ 2018-08-27 19:10                                                     ` Nadav Amit
  2018-08-27 19:43                                                       ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 19:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> What do you all think?
>> 
>> I agree in general. But I think that current->mm would need to be loaded, as
>> otherwise I am afraid it would break switch_mm_irqs_off().
> 
> What breaks?

Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
matter.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 19:10                                                     ` Nadav Amit
@ 2018-08-27 19:43                                                       ` Nadav Amit
  2018-08-27 19:58                                                         ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 19:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:

> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> What do you all think?
>>> 
>>> I agree in general. But I think that current->mm would need to be loaded, as
>>> otherwise I am afraid it would break switch_mm_irqs_off().
>> 
>> What breaks?
> 
> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
> matter.

So here is what I got. It certainly needs some cleanup, but it boots.

Let me know how crappy you find it...


diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bbc796eb0a3b..336779650a41 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
 	return cr3;
 }
 
+typedef struct {
+	struct mm_struct *prev;
+} temporary_mm_state_t;
+
+static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
+{
+	temporary_mm_state_t state;
+
+	lockdep_assert_irqs_disabled();
+	state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	switch_mm_irqs_off(NULL, mm, current);
+	return state;
+}
+
+static inline void unuse_temporary_mm(temporary_mm_state_t prev)
+{
+	lockdep_assert_irqs_disabled();
+	switch_mm_irqs_off(NULL, prev.prev, current);
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5715647fc4fe..ef62af9a0ef7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
 	/* Default trampoline pgd value */
 	trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 }
+
+void __init patching_mm_init(void);
+#define patching_mm_init patching_mm_init
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 054765ab2da2..9f44262abde0 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
 #define LDT_PGD_ENTRY		(pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
 #define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 
+#define TEXT_POKE_PGD_ENTRY	-5UL
+#define TEXT_POKE_ADDR		(TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
+
 #define __VMALLOC_BASE_L4	0xffffc90000000000UL
 #define __VMALLOC_BASE_L5 	0xffa0000000000000UL
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 99fff853c944..840c72ec8c4f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
+struct mm_struct;
+void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
+
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 2ecd34e2d46c..cb364ea5b19d 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 
+extern struct mm_struct *patching_mm;
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a481763a3776..fd8a950b0d62 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
 #include <linux/stop_machine.h>
 #include <linux/slab.h>
 #include <linux/kdebug.h>
+#include <linux/mmu_context.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 		WARN_ON(!PageReserved(pages[0]));
 		pages[1] = virt_to_page(addr + PAGE_SIZE);
 	}
-	BUG_ON(!pages[0]);
+
 	local_irq_save(flags);
+	BUG_ON(!pages[0]);
+
+	/*
+	 * During initial boot, it is hard to initialize patching_mm due to
+	 * dependencies in boot order.
+	 */
+	if (patching_mm) {
+		pte_t pte;
+		temporary_mm_state_t prev;
+
+		prev = use_temporary_mm(patching_mm);
+		pte = mk_pte(pages[0], PAGE_KERNEL);
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
+		pte = mk_pte(pages[1], PAGE_KERNEL);
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
+
+		memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
+		       opcode, len);
+
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
+		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
+		local_flush_tlb();
+		sync_core();
+
+		unuse_temporary_mm(prev);
+		goto out;
+	}
+
 	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
 	if (pages[1])
 		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
@@ -715,6 +744,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
 	sync_core();
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
+out:
 	for (i = 0; i < len; i++)
 		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
 	local_irq_restore(flags);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..bd0d629e3831 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,7 @@
 #include <asm/init.h>
 #include <asm/uv/uv.h>
 #include <asm/setup.h>
+#include <asm/text-patching.h>
 
 #include "mm_internal.h"
 
@@ -285,14 +286,14 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__set_pte_vaddr(pud, vaddr, new_pte);
 }
 
-void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	p4d_t *p4d_page;
 
 	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
 
-	pgd = pgd_offset_k(vaddr);
+	pgd = pgd_offset(mm, vaddr);
 	if (pgd_none(*pgd)) {
 		printk(KERN_ERR
 			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
@@ -303,6 +304,11 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
 }
 
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+	set_mm_pte_vaddr(&init_mm, vaddr, pteval);
+}
+
 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 {
 	pgd_t *pgd;
@@ -1399,6 +1405,17 @@ unsigned long memory_block_size_bytes(void)
 	return memory_block_size_probed;
 }
 
+struct mm_struct *patching_mm;
+EXPORT_SYMBOL(patching_mm);
+
+void __init patching_mm_init(void)
+{
+	populate_extra_pte(TEXT_POKE_ADDR);
+	populate_extra_pte(TEXT_POKE_ADDR + PAGE_SIZE);
+
+	patching_mm = copy_init_mm();
+}
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..c95d2240c23a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1083,6 +1083,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 static inline void init_espfix_bsp(void) { }
 #endif
 
+#ifndef patching_mm_init
+static inline void patching_mm_init(void) { }
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #ifndef io_remap_pfn_range
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 108ede99e533..ac0a675678f5 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -74,6 +74,7 @@ extern void exit_itimers(struct signal_struct *);
 extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
+struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..9a313efc80a6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -724,6 +724,7 @@ asmlinkage __visible void __init start_kernel(void)
 	taskstats_init_early();
 	delayacct_init();
 
+	patching_mm_init();
 	check_bugs();
 
 	acpi_subsystem_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b27babc4c78..325d1a5ca903 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1249,9 +1249,9 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  * Allocate a new mm structure and copy contents from the
  * mm structure of the passed in task structure.
  */
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm)
 {
-	struct mm_struct *mm, *oldmm = current->mm;
+	struct mm_struct *mm;
 	int err;
 
 	mm = allocate_mm();
@@ -1317,7 +1317,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	}
 
 	retval = -ENOMEM;
-	mm = dup_mm(tsk);
+	mm = dup_mm(tsk, current->mm);
 	if (!mm)
 		goto fail_nomem;
 
@@ -2082,6 +2082,11 @@ struct task_struct *fork_idle(int cpu)
 	return task;
 }
 
+struct mm_struct *copy_init_mm(void)
+{
+	return dup_mm(NULL, &init_mm);
+}
+
 /*
  *  Ok, this is the main fork-routine.
  *
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 19:43                                                       ` Nadav Amit
@ 2018-08-27 19:58                                                         ` Andy Lutomirski
  2018-08-27 20:16                                                           ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-27 19:58 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andy Lutomirski, Masami Hiramatsu, Peter Zijlstra, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>
>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>
>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> What do you all think?
>>>>
>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>
>>> What breaks?
>>
>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>> matter.
>
> So here is what I got. It certainly needs some cleanup, but it boots.
>
> Let me know how crappy you find it...
>
>
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index bbc796eb0a3b..336779650a41 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>         return cr3;
>  }
>
> +typedef struct {
> +       struct mm_struct *prev;
> +} temporary_mm_state_t;
> +
> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
> +{
> +       temporary_mm_state_t state;
> +
> +       lockdep_assert_irqs_disabled();
> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> +       switch_mm_irqs_off(NULL, mm, current);
> +       return state;
> +}
> +
> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
> +{
> +       lockdep_assert_irqs_disabled();
> +       switch_mm_irqs_off(NULL, prev.prev, current);
> +}
> +
>  #endif /* _ASM_X86_MMU_CONTEXT_H */
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 5715647fc4fe..ef62af9a0ef7 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>         /* Default trampoline pgd value */
>         trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>  }
> +
> +void __init patching_mm_init(void);
> +#define patching_mm_init patching_mm_init
> +
>  # ifdef CONFIG_RANDOMIZE_MEMORY
>  void __meminit init_trampoline(void);
>  # else
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 054765ab2da2..9f44262abde0 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>  #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>  #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>
> +#define TEXT_POKE_PGD_ENTRY    -5UL
> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
> +
>  #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>  #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 99fff853c944..840c72ec8c4f 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  /* Install a pte for a particular vaddr in kernel space. */
>  void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>
> +struct mm_struct;
> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
> +
>  #ifdef CONFIG_X86_32
>  extern void native_pagetable_init(void);
>  #else
> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
> index 2ecd34e2d46c..cb364ea5b19d 100644
> --- a/arch/x86/include/asm/text-patching.h
> +++ b/arch/x86/include/asm/text-patching.h
> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>  extern int poke_int3_handler(struct pt_regs *regs);
>  extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>
> +extern struct mm_struct *patching_mm;
> +
>  #endif /* _ASM_X86_TEXT_PATCHING_H */
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index a481763a3776..fd8a950b0d62 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -11,6 +11,7 @@
>  #include <linux/stop_machine.h>
>  #include <linux/slab.h>
>  #include <linux/kdebug.h>
> +#include <linux/mmu_context.h>
>  #include <asm/text-patching.h>
>  #include <asm/alternative.h>
>  #include <asm/sections.h>
> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>                 WARN_ON(!PageReserved(pages[0]));
>                 pages[1] = virt_to_page(addr + PAGE_SIZE);
>         }
> -       BUG_ON(!pages[0]);
> +
>         local_irq_save(flags);
> +       BUG_ON(!pages[0]);
> +
> +       /*
> +        * During initial boot, it is hard to initialize patching_mm due to
> +        * dependencies in boot order.
> +        */
> +       if (patching_mm) {
> +               pte_t pte;
> +               temporary_mm_state_t prev;
> +
> +               prev = use_temporary_mm(patching_mm);
> +               pte = mk_pte(pages[0], PAGE_KERNEL);
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
> +               pte = mk_pte(pages[1], PAGE_KERNEL);
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
> +
> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
> +                      opcode, len);
> +
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
> +               local_flush_tlb();

Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
needed.  How about getting rid of all the weird TLB flushing stuff and
instead putting the mapping at vaddr - __START_KERNEL_map or whatever
it is?  You *might* need to flush_tlb_mm_range() on module unload, but
that's it.
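
Something like this, say (sketch only, names made up; whether module text
wants a different offset is glossed over):

/*
 * One stable alias per text address in the user half of patching_mm.
 * Because the alias for a given page never changes between pokes, stale
 * TLB entries can't point at the wrong page, so no per-poke flush.
 */
static void *poke_alias(void *addr)
{
        return (void *)((unsigned long)addr - __START_KERNEL_map);
}

static void poke_no_flush(void *addr, const void *opcode, size_t len)
{
        memcpy(poke_alias(addr), opcode, len);
}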

> +               sync_core();

I can't think of any case where sync_core() is needed.  The mm switch
serializes.

Also, is there any circumstance in which any of this is used before at
least jump table init?  All the early stuff is text_poke_early(),
right?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 19:58                                                         ` Andy Lutomirski
@ 2018-08-27 20:16                                                           ` Nadav Amit
  2018-08-27 21:55                                                             ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 20:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> 
>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> What do you all think?
>>>>> 
>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>> 
>>>> What breaks?
>>> 
>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>> matter.
>> 
>> So here is what I got. It certainly needs some cleanup, but it boots.
>> 
>> Let me know how crappy you find it...
>> 
>> 
>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>> index bbc796eb0a3b..336779650a41 100644
>> --- a/arch/x86/include/asm/mmu_context.h
>> +++ b/arch/x86/include/asm/mmu_context.h
>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>        return cr3;
>> }
>> 
>> +typedef struct {
>> +       struct mm_struct *prev;
>> +} temporary_mm_state_t;
>> +
>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>> +{
>> +       temporary_mm_state_t state;
>> +
>> +       lockdep_assert_irqs_disabled();
>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>> +       switch_mm_irqs_off(NULL, mm, current);
>> +       return state;
>> +}
>> +
>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>> +{
>> +       lockdep_assert_irqs_disabled();
>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>> +}
>> +
>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>> index 5715647fc4fe..ef62af9a0ef7 100644
>> --- a/arch/x86/include/asm/pgtable.h
>> +++ b/arch/x86/include/asm/pgtable.h
>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>        /* Default trampoline pgd value */
>>        trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>> }
>> +
>> +void __init patching_mm_init(void);
>> +#define patching_mm_init patching_mm_init
>> +
>> # ifdef CONFIG_RANDOMIZE_MEMORY
>> void __meminit init_trampoline(void);
>> # else
>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>> index 054765ab2da2..9f44262abde0 100644
>> --- a/arch/x86/include/asm/pgtable_64_types.h
>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>> 
>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>> +
>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>> 
>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>> index 99fff853c944..840c72ec8c4f 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>> /* Install a pte for a particular vaddr in kernel space. */
>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>> 
>> +struct mm_struct;
>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>> +
>> #ifdef CONFIG_X86_32
>> extern void native_pagetable_init(void);
>> #else
>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>> index 2ecd34e2d46c..cb364ea5b19d 100644
>> --- a/arch/x86/include/asm/text-patching.h
>> +++ b/arch/x86/include/asm/text-patching.h
>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>> extern int poke_int3_handler(struct pt_regs *regs);
>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>> 
>> +extern struct mm_struct *patching_mm;
>> +
>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>> index a481763a3776..fd8a950b0d62 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -11,6 +11,7 @@
>> #include <linux/stop_machine.h>
>> #include <linux/slab.h>
>> #include <linux/kdebug.h>
>> +#include <linux/mmu_context.h>
>> #include <asm/text-patching.h>
>> #include <asm/alternative.h>
>> #include <asm/sections.h>
>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>                WARN_ON(!PageReserved(pages[0]));
>>                pages[1] = virt_to_page(addr + PAGE_SIZE);
>>        }
>> -       BUG_ON(!pages[0]);
>> +
>>        local_irq_save(flags);
>> +       BUG_ON(!pages[0]);
>> +
>> +       /*
>> +        * During initial boot, it is hard to initialize patching_mm due to
>> +        * dependencies in boot order.
>> +        */
>> +       if (patching_mm) {
>> +               pte_t pte;
>> +               temporary_mm_state_t prev;
>> +
>> +               prev = use_temporary_mm(patching_mm);
>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>> +
>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>> +                      opcode, len);
>> +
>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>> +               local_flush_tlb();
> 
> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
> needed.  How about getting rid of all the weird TLB flushing stuff and
> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
> that's it.

I don’t see what’s wrong in SMP, since this entire piece of code should be
running under text_mutex.

I don’t quite understand your proposal. I really don’t want to have any
chance in which the page-tables for the poked address are not preallocated.

It is more complicated than needed, and there are redundant TLB flushes. The
reason I preferred to do it this way is to avoid using other functions that
take locks during the software page-walk, and to avoid duplicating existing
code. Yet, duplication might be the way to go.

>> +               sync_core();
> 
> I can't think of any case where sync_core() is needed.  The mm switch
> serializes.

Good point!

> 
> Also, is there any circumstance in which any of this is used before at
> least jump table init?  All the early stuff is text_poke_early(),
> right?

Not before jump_label_init(). However, I did not manage to get rid of the two
code paths in text_poke(), since text_poke() is used relatively early by
x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
to duplicate init_mm - still fails.
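
The overall shape I ended up with is roughly the following (a simplified
sketch: error checks and the second-page handling are omitted, and the
fixmap branch is just the existing code):

	local_irq_save(flags);
	if (patching_mm) {
		temporary_mm_state_t prev;

		/* Late boot: poke through the dedicated mm. */
		prev = use_temporary_mm(patching_mm);
		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR,
				 mk_pte(pages[0], PAGE_KERNEL));
		memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
		       opcode, len);
		set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
		local_flush_tlb();
		unuse_temporary_mm(prev);
	} else {
		/* Early boot (x86_late_time_init() etc.): the old fixmap path. */
		set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
		memcpy((char *)fix_to_virt(FIX_TEXT_POKE0) +
		       ((unsigned long)addr & ~PAGE_MASK), opcode, len);
		clear_fixmap(FIX_TEXT_POKE0);
	}
	local_irq_restore(flags);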


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 20:16                                                           ` Nadav Amit
@ 2018-08-27 21:55                                                             ` Nadav Amit
  2018-08-27 22:32                                                               ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 21:55 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:

> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> 
>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>> 
>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>> What do you all think?
>>>>>> 
>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>> 
>>>>> What breaks?
>>>> 
>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>> matter.
>>> 
>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>> 
>>> Let me know how crappy you find it...
>>> 
>>> 
>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>> index bbc796eb0a3b..336779650a41 100644
>>> --- a/arch/x86/include/asm/mmu_context.h
>>> +++ b/arch/x86/include/asm/mmu_context.h
>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>       return cr3;
>>> }
>>> 
>>> +typedef struct {
>>> +       struct mm_struct *prev;
>>> +} temporary_mm_state_t;
>>> +
>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>> +{
>>> +       temporary_mm_state_t state;
>>> +
>>> +       lockdep_assert_irqs_disabled();
>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>> +       switch_mm_irqs_off(NULL, mm, current);
>>> +       return state;
>>> +}
>>> +
>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>> +{
>>> +       lockdep_assert_irqs_disabled();
>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>> +}
>>> +
>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>> --- a/arch/x86/include/asm/pgtable.h
>>> +++ b/arch/x86/include/asm/pgtable.h
>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>       /* Default trampoline pgd value */
>>>       trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>> }
>>> +
>>> +void __init patching_mm_init(void);
>>> +#define patching_mm_init patching_mm_init
>>> +
>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>> void __meminit init_trampoline(void);
>>> # else
>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>> index 054765ab2da2..9f44262abde0 100644
>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>> 
>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>> +
>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>> 
>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>> index 99fff853c944..840c72ec8c4f 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>> /* Install a pte for a particular vaddr in kernel space. */
>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>> 
>>> +struct mm_struct;
>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>> +
>>> #ifdef CONFIG_X86_32
>>> extern void native_pagetable_init(void);
>>> #else
>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>> --- a/arch/x86/include/asm/text-patching.h
>>> +++ b/arch/x86/include/asm/text-patching.h
>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>> extern int poke_int3_handler(struct pt_regs *regs);
>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>> 
>>> +extern struct mm_struct *patching_mm;
>>> +
>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>> index a481763a3776..fd8a950b0d62 100644
>>> --- a/arch/x86/kernel/alternative.c
>>> +++ b/arch/x86/kernel/alternative.c
>>> @@ -11,6 +11,7 @@
>>> #include <linux/stop_machine.h>
>>> #include <linux/slab.h>
>>> #include <linux/kdebug.h>
>>> +#include <linux/mmu_context.h>
>>> #include <asm/text-patching.h>
>>> #include <asm/alternative.h>
>>> #include <asm/sections.h>
>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>               WARN_ON(!PageReserved(pages[0]));
>>>               pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>       }
>>> -       BUG_ON(!pages[0]);
>>> +
>>>       local_irq_save(flags);
>>> +       BUG_ON(!pages[0]);
>>> +
>>> +       /*
>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>> +        * dependencies in boot order.
>>> +        */
>>> +       if (patching_mm) {
>>> +               pte_t pte;
>>> +               temporary_mm_state_t prev;
>>> +
>>> +               prev = use_temporary_mm(patching_mm);
>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>> +
>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>> +                      opcode, len);
>>> +
>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>> +               local_flush_tlb();
>> 
>> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
>> needed.  How about getting rid of all the weird TLB flushing stuff and
>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>> that's it.
> 
> I don’t see what’s wrong in SMP, since this entire piece of code should be
> running under text_mutex.
> 
> I don’t quite understand your proposal. I really don’t want to have any
> chance in which the page-tables for the poked address is not preallocated.
> 
> It is more complicated than needed, and there are redundant TLB flushes. The
> reason I preferred to do it this way, is in order not to use other functions
> that take locks during the software page-walk and not to duplicate existing
> code. Yet, duplication might be the way to go.
> 
>>> +               sync_core();
>> 
>> I can't think of any case where sync_core() is needed.  The mm switch
>> serializes.
> 
> Good point!
> 
>> Also, is there any circumstance in which any of this is used before at
>> least jump table init?  All the early stuff is text_poke_early(),
>> right?
> 
> Not before jump_label_init. However, I did not manage to get rid of the two
> code-patches in text_poke(), since text_poke is used relatively early by
> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
> to duplicate init_mm - still fails.

Another correction: the populate_extra_pte() call is not needed.

Anyhow, if you want to do this whole thing differently, I obviously will not
object, but I think it will end up more complicated.

I think I finally understood your comment about "vaddr -
__START_KERNEL_map”. I did something like that before, and it is not
super-simple. You need not only to conditionally flush the TLB, but also
to synchronize the PUD/PMD on changes. Don’t forget that module memory
is used even when only BPF programs are loaded.
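
What I had tried looked roughly like this (reconstructed from memory, so
take it only as a sketch):

	/* Mirror the poked kernel address into the low half of patching_mm. */
	unsigned long poke_addr = (unsigned long)addr - __START_KERNEL_map;

	/*
	 * For module (and BPF JIT) addresses the PUD/PMD covering poke_addr
	 * is not necessarily preallocated in patching_mm, so it has to be
	 * synchronized on changes, and the TLB conditionally flushed.
	 */
	set_mm_pte_vaddr(patching_mm, poke_addr & PAGE_MASK,
			 mk_pte(pages[0], PAGE_KERNEL));
	memcpy((void *)poke_addr, opcode, len);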

Let me know if you want me to submit cleaner patches or you want to carry on
yourself.

Thanks,
Nadav

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27  9:02                             ` Nicholas Piggin
@ 2018-08-27 22:13                               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 96+ messages in thread
From: Benjamin Herrenschmidt @ 2018-08-27 22:13 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Peter Zijlstra, Will Deacon, Linus Torvalds, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

On Mon, 2018-08-27 at 19:02 +1000, Nicholas Piggin wrote:
> > More tlbies ? With the cost of the broadasts on the fabric ? I don't
> > think so.. or I'm not understanding your point...
> 
> More tlbies are no good, but there will be some places where it works
> out much better (and fewer tlbies). Worst possible case for current code
> is a big unmap with lots of scattered page sizes. We _should_ get that
> with just a single PID flush at the end, but what we will get today is
> a bunch of PID and VA flushes.
> 
> I don't propose doing that though, I'd rather be explicit about
> tracking start and end range of each page size. Still not "optimal"
> but neither is existing single range for sparse mappings... anyway it
> will need to be profiled, but my point is we don't really fit exactly
> what x86/arm want.

If we have an arch-specific part, we could just remember up to N
"large" pages there without actually flushing, and if that overflows,
upgrade to a full flush.
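
Something along these lines, maybe (completely untested, and all of the
names are made up):

#define MAX_DEFERRED_LARGE	8

struct arch_tlb_batch {
	unsigned long	large_vaddr[MAX_DEFERRED_LARGE];
	unsigned int	large_psize[MAX_DEFERRED_LARGE];
	unsigned int	nr_large;
	bool		need_full_flush;
};

static inline void arch_tlb_track_large(struct arch_tlb_batch *b,
					unsigned long addr,
					unsigned int psize)
{
	if (b->nr_large >= MAX_DEFERRED_LARGE) {
		/* Too many scattered large pages: do a full PID flush. */
		b->need_full_flush = true;
		return;
	}
	b->large_vaddr[b->nr_large] = addr;
	b->large_psize[b->nr_large] = psize;
	b->nr_large++;
}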

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 21:55                                                             ` Nadav Amit
@ 2018-08-27 22:32                                                               ` Andy Lutomirski
  2018-08-27 22:54                                                                 ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-27 22:32 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andy Lutomirski, Masami Hiramatsu, Peter Zijlstra, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>
>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>
>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>
>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>
>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> What do you all think?
>>>>>>>
>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>
>>>>>> What breaks?
>>>>>
>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>> matter.
>>>>
>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>
>>>> Let me know how crappy you find it...
>>>>
>>>>
>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>>> index bbc796eb0a3b..336779650a41 100644
>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>>       return cr3;
>>>> }
>>>>
>>>> +typedef struct {
>>>> +       struct mm_struct *prev;
>>>> +} temporary_mm_state_t;
>>>> +
>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>>> +{
>>>> +       temporary_mm_state_t state;
>>>> +
>>>> +       lockdep_assert_irqs_disabled();
>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>>> +       switch_mm_irqs_off(NULL, mm, current);
>>>> +       return state;
>>>> +}
>>>> +
>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>>> +{
>>>> +       lockdep_assert_irqs_disabled();
>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>>> +}
>>>> +
>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>>> --- a/arch/x86/include/asm/pgtable.h
>>>> +++ b/arch/x86/include/asm/pgtable.h
>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>>       /* Default trampoline pgd value */
>>>>       trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>>> }
>>>> +
>>>> +void __init patching_mm_init(void);
>>>> +#define patching_mm_init patching_mm_init
>>>> +
>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>>> void __meminit init_trampoline(void);
>>>> # else
>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>>> index 054765ab2da2..9f44262abde0 100644
>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>>>
>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>>> +
>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>>>
>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>>> index 99fff853c944..840c72ec8c4f 100644
>>>> --- a/arch/x86/include/asm/pgtable_types.h
>>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>> /* Install a pte for a particular vaddr in kernel space. */
>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>>>
>>>> +struct mm_struct;
>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>>> +
>>>> #ifdef CONFIG_X86_32
>>>> extern void native_pagetable_init(void);
>>>> #else
>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>>> --- a/arch/x86/include/asm/text-patching.h
>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>>>
>>>> +extern struct mm_struct *patching_mm;
>>>> +
>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>> index a481763a3776..fd8a950b0d62 100644
>>>> --- a/arch/x86/kernel/alternative.c
>>>> +++ b/arch/x86/kernel/alternative.c
>>>> @@ -11,6 +11,7 @@
>>>> #include <linux/stop_machine.h>
>>>> #include <linux/slab.h>
>>>> #include <linux/kdebug.h>
>>>> +#include <linux/mmu_context.h>
>>>> #include <asm/text-patching.h>
>>>> #include <asm/alternative.h>
>>>> #include <asm/sections.h>
>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>>               WARN_ON(!PageReserved(pages[0]));
>>>>               pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>>       }
>>>> -       BUG_ON(!pages[0]);
>>>> +
>>>>       local_irq_save(flags);
>>>> +       BUG_ON(!pages[0]);
>>>> +
>>>> +       /*
>>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>>> +        * dependencies in boot order.
>>>> +        */
>>>> +       if (patching_mm) {
>>>> +               pte_t pte;
>>>> +               temporary_mm_state_t prev;
>>>> +
>>>> +               prev = use_temporary_mm(patching_mm);
>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>>> +
>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>>> +                      opcode, len);
>>>> +
>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>>> +               local_flush_tlb();
>>>
>>> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>> that's it.
>>
>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>> running under text_mutex.
>>
>> I don’t quite understand your proposal. I really don’t want to have any
>> chance in which the page-tables for the poked address is not preallocated.
>>
>> It is more complicated than needed, and there are redundant TLB flushes. The
>> reason I preferred to do it this way, is in order not to use other functions
>> that take locks during the software page-walk and not to duplicate existing
>> code. Yet, duplication might be the way to go.
>>
>>>> +               sync_core();
>>>
>>> I can't think of any case where sync_core() is needed.  The mm switch
>>> serializes.
>>
>> Good point!
>>
>>> Also, is there any circumstance in which any of this is used before at
>>> least jump table init?  All the early stuff is text_poke_early(),
>>> right?
>>
>> Not before jump_label_init. However, I did not manage to get rid of the two
>> code-patches in text_poke(), since text_poke is used relatively early by
>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>> to duplicate init_mm - still fails.
>
> Another correction: the populate_extra_pte() is not needed.
>
> Anyhow, if you want to do this whole thing differently, I obviously will not
> object, but I think it will end up more complicated.
>
> I think I finally understood your comment about "vaddr -
> __START_KERNEL_map”. I did something like that before, and it is not
> super-simple. You need not only to conditionally flush the TLB, but also
> to synchronize the PUD/PMD on changes. Don’t forget that module memory
> is installed even when BPF programs are installed.
>
> Let me know if you want me to submit cleaner patches or you want to carry on
> yourself.
>

I think your approach is a good start and should be good enough (with
cleanups) as a fix for the bug.  But I think your code has the same
bug that we have now!  You're reusing the same address on multiple
CPUs without flushing.  You can easily fix it by forcing a flush
before loading the mm, which should be as simple as adding
flush_tlb_mm() before you load the mm.  (It won't actually flush
anything by itself, since the mm isn't loaded, but it will update the
bookkeeping so that switch_mm_irqs_off() flushes the mm.)
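
I.e., something like this (untested):

	temporary_mm_state_t prev;

	/*
	 * This does not flush anything by itself (patching_mm is not loaded
	 * on any CPU), but it updates the bookkeeping so that the switch
	 * below invalidates whatever this CPU may still cache for the
	 * poking address from an earlier text_poke() invocation.
	 */
	flush_tlb_mm(patching_mm);
	prev = use_temporary_mm(patching_mm);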

Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
do the vaddr - __START_KERNEL_map thing, then at least pick an address
in the low half of the address space such as 0 :)  Ideally you'd only
use this thing late enough that you could even use the normal
insert_pfn (or similar) API for it, but that doesn't really matter.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 22:32                                                               ` Andy Lutomirski
@ 2018-08-27 22:54                                                                 ` Nadav Amit
  2018-08-27 23:01                                                                   ` Andy Lutomirski
  0 siblings, 1 reply; 96+ messages in thread
From: Nadav Amit @ 2018-08-27 22:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Masami Hiramatsu, Peter Zijlstra, Kees Cook, Linus Torvalds,
	Paolo Bonzini, Jiri Kosina, Will Deacon, Benjamin Herrenschmidt,
	Nick Piggin, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>> 
>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> 
>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> 
>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>> What do you all think?
>>>>>>>> 
>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>> 
>>>>>>> What breaks?
>>>>>> 
>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>> matter.
>>>>> 
>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>> 
>>>>> Let me know how crappy you find it...
>>>>> 
>>>>> 
>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>>>> index bbc796eb0a3b..336779650a41 100644
>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>>>      return cr3;
>>>>> }
>>>>> 
>>>>> +typedef struct {
>>>>> +       struct mm_struct *prev;
>>>>> +} temporary_mm_state_t;
>>>>> +
>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>>>> +{
>>>>> +       temporary_mm_state_t state;
>>>>> +
>>>>> +       lockdep_assert_irqs_disabled();
>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>>>> +       switch_mm_irqs_off(NULL, mm, current);
>>>>> +       return state;
>>>>> +}
>>>>> +
>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>>>> +{
>>>>> +       lockdep_assert_irqs_disabled();
>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>>>> +}
>>>>> +
>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>>>> --- a/arch/x86/include/asm/pgtable.h
>>>>> +++ b/arch/x86/include/asm/pgtable.h
>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>>>      /* Default trampoline pgd value */
>>>>>      trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>>>> }
>>>>> +
>>>>> +void __init patching_mm_init(void);
>>>>> +#define patching_mm_init patching_mm_init
>>>>> +
>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>>>> void __meminit init_trampoline(void);
>>>>> # else
>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>>>> index 054765ab2da2..9f44262abde0 100644
>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>>>> 
>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>>>> +
>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>>>> 
>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>>>> index 99fff853c944..840c72ec8c4f 100644
>>>>> --- a/arch/x86/include/asm/pgtable_types.h
>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>>> /* Install a pte for a particular vaddr in kernel space. */
>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>>>> 
>>>>> +struct mm_struct;
>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>>>> +
>>>>> #ifdef CONFIG_X86_32
>>>>> extern void native_pagetable_init(void);
>>>>> #else
>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>>>> --- a/arch/x86/include/asm/text-patching.h
>>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>>>> 
>>>>> +extern struct mm_struct *patching_mm;
>>>>> +
>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>>> index a481763a3776..fd8a950b0d62 100644
>>>>> --- a/arch/x86/kernel/alternative.c
>>>>> +++ b/arch/x86/kernel/alternative.c
>>>>> @@ -11,6 +11,7 @@
>>>>> #include <linux/stop_machine.h>
>>>>> #include <linux/slab.h>
>>>>> #include <linux/kdebug.h>
>>>>> +#include <linux/mmu_context.h>
>>>>> #include <asm/text-patching.h>
>>>>> #include <asm/alternative.h>
>>>>> #include <asm/sections.h>
>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>>>              WARN_ON(!PageReserved(pages[0]));
>>>>>              pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>>>      }
>>>>> -       BUG_ON(!pages[0]);
>>>>> +
>>>>>      local_irq_save(flags);
>>>>> +       BUG_ON(!pages[0]);
>>>>> +
>>>>> +       /*
>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>>>> +        * dependencies in boot order.
>>>>> +        */
>>>>> +       if (patching_mm) {
>>>>> +               pte_t pte;
>>>>> +               temporary_mm_state_t prev;
>>>>> +
>>>>> +               prev = use_temporary_mm(patching_mm);
>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>>>> +
>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>>>> +                      opcode, len);
>>>>> +
>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>>>> +               local_flush_tlb();
>>>> 
>>>> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>> that's it.
>>> 
>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>> running under text_mutex.
>>> 
>>> I don’t quite understand your proposal. I really don’t want to have any
>>> chance in which the page-tables for the poked address is not preallocated.
>>> 
>>> It is more complicated than needed, and there are redundant TLB flushes. The
>>> reason I preferred to do it this way, is in order not to use other functions
>>> that take locks during the software page-walk and not to duplicate existing
>>> code. Yet, duplication might be the way to go.
>>> 
>>>>> +               sync_core();
>>>> 
>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>> serializes.
>>> 
>>> Good point!
>>> 
>>>> Also, is there any circumstance in which any of this is used before at
>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>> right?
>>> 
>>> Not before jump_label_init. However, I did not manage to get rid of the two
>>> code-patches in text_poke(), since text_poke is used relatively early by
>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>> to duplicate init_mm - still fails.
>> 
>> Another correction: the populate_extra_pte() is not needed.
>> 
>> Anyhow, if you want to do this whole thing differently, I obviously will not
>> object, but I think it will end up more complicated.
>> 
>> I think I finally understood your comment about "vaddr -
>> __START_KERNEL_map”. I did something like that before, and it is not
>> super-simple. You need not only to conditionally flush the TLB, but also
>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
>> is installed even when BPF programs are installed.
>> 
>> Let me know if you want me to submit cleaner patches or you want to carry on
>> yourself.
> 
> I think your approach is a good start and should be good enough (with
> cleanups) as a fix for the bug.  But I think your code has the same
> bug that we have now!  You're reusing the same address on multiple
> CPUs without flushing.  You can easily fix it by forcing a flush
> before loading the mm, which should be as simple as adding
> flush_tlb_mm() before you load the mm.  (It won't actually flush
> anything by itself, since the mm isn't loaded, but it will update the
> bookkeeping so that switch_mm_irqs_off() flushes the mm.)

What am I missing?

We have a lock (text_mutex) which prevents the use of the new page-table
hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
a local TLB flush before the lock is released and before IRQs are enabled.
So how can the PTE be cached on multiple CPUs?

Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
redundant TLB flushes, but these flushes are already there.
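
Spelled out, the sequence on the poking CPU is roughly (page offsets and
the second page omitted for brevity):

	/* Everything below runs with text_mutex held, on a single CPU: */
	local_irq_save(flags);
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);	/* + local flush */
	memcpy((void *)TEXT_POKE_ADDR, opcode, len);
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
	local_flush_tlb();		/* stale entry gone ...           */
	local_irq_restore(flags);	/* ... before IRQs come back      */
	/* ... and text_mutex is only dropped by the caller after all of this. */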

> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
> do the vaddr - __START_KERNEL_map thing, then at least pick an address
> in the low half of the address space such as 0 :)  Ideally you'd only
> use this thing late enough that you could even use the normal
> insert_pfn (or similar) API for it, but that doesn't really matter.

Perhaps the vaddr - __START_KERNEL_map approach actually makes sense. I was
misunderstanding it (again) before, thinking you wanted me to use vaddr (and
not the delta). I’ll give it a try.

Thanks,
Nadav


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 22:54                                                                 ` Nadav Amit
@ 2018-08-27 23:01                                                                   ` Andy Lutomirski
  2018-08-28  8:49                                                                     ` Masami Hiramatsu
  0 siblings, 1 reply; 96+ messages in thread
From: Andy Lutomirski @ 2018-08-27 23:01 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andy Lutomirski, Masami Hiramatsu, Peter Zijlstra, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>
>> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>
>>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>
>>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>
>>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>
>>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>> What do you all think?
>>>>>>>>>
>>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>>>
>>>>>>>> What breaks?
>>>>>>>
>>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>>> matter.
>>>>>>
>>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>>>
>>>>>> Let me know how crappy you find it...
>>>>>>
>>>>>>
>>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>>>>> index bbc796eb0a3b..336779650a41 100644
>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>>>>      return cr3;
>>>>>> }
>>>>>>
>>>>>> +typedef struct {
>>>>>> +       struct mm_struct *prev;
>>>>>> +} temporary_mm_state_t;
>>>>>> +
>>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>>>>> +{
>>>>>> +       temporary_mm_state_t state;
>>>>>> +
>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>>>>> +       switch_mm_irqs_off(NULL, mm, current);
>>>>>> +       return state;
>>>>>> +}
>>>>>> +
>>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>>>>> +{
>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>>>>> +}
>>>>>> +
>>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>>>>> --- a/arch/x86/include/asm/pgtable.h
>>>>>> +++ b/arch/x86/include/asm/pgtable.h
>>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>>>>      /* Default trampoline pgd value */
>>>>>>      trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>>>>> }
>>>>>> +
>>>>>> +void __init patching_mm_init(void);
>>>>>> +#define patching_mm_init patching_mm_init
>>>>>> +
>>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>>>>> void __meminit init_trampoline(void);
>>>>>> # else
>>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>>>>> index 054765ab2da2..9f44262abde0 100644
>>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>>>>>
>>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>>>>> +
>>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>>>>>
>>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>>>>> index 99fff853c944..840c72ec8c4f 100644
>>>>>> --- a/arch/x86/include/asm/pgtable_types.h
>>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>>>> /* Install a pte for a particular vaddr in kernel space. */
>>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>>>>>
>>>>>> +struct mm_struct;
>>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>>>>> +
>>>>>> #ifdef CONFIG_X86_32
>>>>>> extern void native_pagetable_init(void);
>>>>>> #else
>>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>>>>> --- a/arch/x86/include/asm/text-patching.h
>>>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>>>>>
>>>>>> +extern struct mm_struct *patching_mm;
>>>>>> +
>>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>>>> index a481763a3776..fd8a950b0d62 100644
>>>>>> --- a/arch/x86/kernel/alternative.c
>>>>>> +++ b/arch/x86/kernel/alternative.c
>>>>>> @@ -11,6 +11,7 @@
>>>>>> #include <linux/stop_machine.h>
>>>>>> #include <linux/slab.h>
>>>>>> #include <linux/kdebug.h>
>>>>>> +#include <linux/mmu_context.h>
>>>>>> #include <asm/text-patching.h>
>>>>>> #include <asm/alternative.h>
>>>>>> #include <asm/sections.h>
>>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>>>>              WARN_ON(!PageReserved(pages[0]));
>>>>>>              pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>>>>      }
>>>>>> -       BUG_ON(!pages[0]);
>>>>>> +
>>>>>>      local_irq_save(flags);
>>>>>> +       BUG_ON(!pages[0]);
>>>>>> +
>>>>>> +       /*
>>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>>>>> +        * dependencies in boot order.
>>>>>> +        */
>>>>>> +       if (patching_mm) {
>>>>>> +               pte_t pte;
>>>>>> +               temporary_mm_state_t prev;
>>>>>> +
>>>>>> +               prev = use_temporary_mm(patching_mm);
>>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>>>>> +
>>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>>>>> +                      opcode, len);
>>>>>> +
>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>>>>> +               local_flush_tlb();
>>>>>
>>>>> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
>>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>>> that's it.
>>>>
>>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>>> running under text_mutex.
>>>>
>>>> I don’t quite understand your proposal. I really don’t want to have any
>>>> chance in which the page-tables for the poked address is not preallocated.
>>>>
>>>> It is more complicated than needed, and there are redundant TLB flushes. The
>>>> reason I preferred to do it this way, is in order not to use other functions
>>>> that take locks during the software page-walk and not to duplicate existing
>>>> code. Yet, duplication might be the way to go.
>>>>
>>>>>> +               sync_core();
>>>>>
>>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>>> serializes.
>>>>
>>>> Good point!
>>>>
>>>>> Also, is there any circumstance in which any of this is used before at
>>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>>> right?
>>>>
>>>> Not before jump_label_init. However, I did not manage to get rid of the two
>>>> code-patches in text_poke(), since text_poke is used relatively early by
>>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>>> to duplicate init_mm - still fails.
>>>
>>> Another correction: the populate_extra_pte() is not needed.
>>>
>>> Anyhow, if you want to do this whole thing differently, I obviously will not
>>> object, but I think it will end up more complicated.
>>>
>>> I think I finally understood your comment about "vaddr -
>>> __START_KERNEL_map”. I did something like that before, and it is not
>>> super-simple. You need not only to conditionally flush the TLB, but also
>>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
>>> is installed even when BPF programs are installed.
>>>
>>> Let me know if you want me to submit cleaner patches or you want to carry on
>>> yourself.
>>
>> I think your approach is a good start and should be good enough (with
>> cleanups) as a fix for the bug.  But I think your code has the same
>> bug that we have now!  You're reusing the same address on multiple
>> CPUs without flushing.  You can easily fix it by forcing a flush
>> before loading the mm, which should be as simple as adding
>> flush_tlb_mm() before you load the mm.  (It won't actually flush
>> anything by itself, since the mm isn't loaded, but it will update the
>> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
>
> What am I missing?
>
> We have a lock (text_mutex) which prevents the use of the new page-table
> hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
> a local TLB flush before the lock is released and before IRQs are enabled.
> So how can the PTE be cached on multiple CPUs?
>
> Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
> redundant TLB flushes, but these flushes are already there.

I missed that set_pte_vaddr() contained a flush.  It's probably nicer
to use one of the APIs that doesn't imply a flush.  If you end up with
vm_insert_pfn() or similar, you can use copy_to_user() if needed :)
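
Something in that direction, very roughly (poking_vma and poking_addr are
placeholders that would be set up once, when patching_mm is created):

	unsigned long left;
	int ret;

	/* Install the PTE without an implied TLB flush ... */
	ret = vm_insert_pfn(poking_vma, poking_addr, page_to_pfn(pages[0]));

	/*
	 * ... and, since poking_addr sits in the user half of patching_mm,
	 * the write itself can go through the normal uaccess path.
	 */
	left = copy_to_user((void __user *)(poking_addr + offset_in_page(addr)),
			    opcode, len);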

>
>> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
>> do the vaddr - __START_KERNEL_map thing, then at least pick an address
>> in the low half of the address space such as 0 :)  Ideally you'd only
>> use this thing late enough that you could even use the normal
>> insert_pfn (or similar) API for it, but that doesn't really matter.
>
> Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
> misunderstanding it (again) before, thinking you want me to use vaddr (and
> not the delta). I’ll give it a try.

It does prevent easy preallocation of the intermediate tables.

And do we really allow text_poke() / text_poke_bp() on BPF programs?  Hmm.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-27 23:01                                                                   ` Andy Lutomirski
@ 2018-08-28  8:49                                                                     ` Masami Hiramatsu
  2018-08-28 17:33                                                                       ` Nadav Amit
  0 siblings, 1 reply; 96+ messages in thread
From: Masami Hiramatsu @ 2018-08-28  8:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Nadav Amit, Masami Hiramatsu, Peter Zijlstra, Kees Cook,
	Linus Torvalds, Paolo Bonzini, Jiri Kosina, Will Deacon,
	Benjamin Herrenschmidt, Nick Piggin, the arch/x86 maintainers,
	Borislav Petkov, Rik van Riel, Jann Horn, Adin Scannell,
	Dave Hansen, Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman

On Mon, 27 Aug 2018 16:01:32 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> > at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
> >
> >> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>
> >>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
> >>>>
> >>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>
> >>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
> >>>>>>>
> >>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
> >>>>>>>>>> What do you all think?
> >>>>>>>>>
> >>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
> >>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
> >>>>>>>>
> >>>>>>>> What breaks?
> >>>>>>>
> >>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
> >>>>>>> matter.
> >>>>>>
> >>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
> >>>>>>
> >>>>>> Let me know how crappy you find it...
> >>>>>>
> >>>>>>
> >>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> >>>>>> index bbc796eb0a3b..336779650a41 100644
> >>>>>> --- a/arch/x86/include/asm/mmu_context.h
> >>>>>> +++ b/arch/x86/include/asm/mmu_context.h
> >>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
> >>>>>>      return cr3;
> >>>>>> }
> >>>>>>
> >>>>>> +typedef struct {
> >>>>>> +       struct mm_struct *prev;
> >>>>>> +} temporary_mm_state_t;
> >>>>>> +
> >>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
> >>>>>> +{
> >>>>>> +       temporary_mm_state_t state;
> >>>>>> +
> >>>>>> +       lockdep_assert_irqs_disabled();
> >>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> >>>>>> +       switch_mm_irqs_off(NULL, mm, current);
> >>>>>> +       return state;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
> >>>>>> +{
> >>>>>> +       lockdep_assert_irqs_disabled();
> >>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
> >>>>>> +}
> >>>>>> +
> >>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
> >>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> >>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable.h
> >>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
> >>>>>>      /* Default trampoline pgd value */
> >>>>>>      trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
> >>>>>> }
> >>>>>> +
> >>>>>> +void __init patching_mm_init(void);
> >>>>>> +#define patching_mm_init patching_mm_init
> >>>>>> +
> >>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
> >>>>>> void __meminit init_trampoline(void);
> >>>>>> # else
> >>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> index 054765ab2da2..9f44262abde0 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
> >>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
> >>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
> >>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
> >>>>>>
> >>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
> >>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
> >>>>>> +
> >>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
> >>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
> >>>>>>
> >>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> >>>>>> index 99fff853c944..840c72ec8c4f 100644
> >>>>>> --- a/arch/x86/include/asm/pgtable_types.h
> >>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
> >>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
> >>>>>> /* Install a pte for a particular vaddr in kernel space. */
> >>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
> >>>>>>
> >>>>>> +struct mm_struct;
> >>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
> >>>>>> +
> >>>>>> #ifdef CONFIG_X86_32
> >>>>>> extern void native_pagetable_init(void);
> >>>>>> #else
> >>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
> >>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
> >>>>>> --- a/arch/x86/include/asm/text-patching.h
> >>>>>> +++ b/arch/x86/include/asm/text-patching.h
> >>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
> >>>>>> extern int poke_int3_handler(struct pt_regs *regs);
> >>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
> >>>>>>
> >>>>>> +extern struct mm_struct *patching_mm;
> >>>>>> +
> >>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
> >>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> >>>>>> index a481763a3776..fd8a950b0d62 100644
> >>>>>> --- a/arch/x86/kernel/alternative.c
> >>>>>> +++ b/arch/x86/kernel/alternative.c
> >>>>>> @@ -11,6 +11,7 @@
> >>>>>> #include <linux/stop_machine.h>
> >>>>>> #include <linux/slab.h>
> >>>>>> #include <linux/kdebug.h>
> >>>>>> +#include <linux/mmu_context.h>
> >>>>>> #include <asm/text-patching.h>
> >>>>>> #include <asm/alternative.h>
> >>>>>> #include <asm/sections.h>
> >>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
> >>>>>>              WARN_ON(!PageReserved(pages[0]));
> >>>>>>              pages[1] = virt_to_page(addr + PAGE_SIZE);
> >>>>>>      }
> >>>>>> -       BUG_ON(!pages[0]);
> >>>>>> +
> >>>>>>      local_irq_save(flags);
> >>>>>> +       BUG_ON(!pages[0]);
> >>>>>> +
> >>>>>> +       /*
> >>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
> >>>>>> +        * dependencies in boot order.
> >>>>>> +        */
> >>>>>> +       if (patching_mm) {
> >>>>>> +               pte_t pte;
> >>>>>> +               temporary_mm_state_t prev;
> >>>>>> +
> >>>>>> +               prev = use_temporary_mm(patching_mm);
> >>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
> >>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
> >>>>>> +
> >>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
> >>>>>> +                      opcode, len);
> >>>>>> +
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
> >>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
> >>>>>> +               local_flush_tlb();
> >>>>>
> >>>>> Hmm.  This is stuff busted on SMP, and it's IMO more complicated than
> >>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
> >>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
> >>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
> >>>>> that's it.
> >>>>
> >>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
> >>>> running under text_mutex.
> >>>>
> >>>> I don’t quite understand your proposal. I really don’t want to have any
> >>>> chance in which the page-tables for the poked address is not preallocated.
> >>>>
> >>>> It is more complicated than needed, and there are redundant TLB flushes. The
> >>>> reason I preferred to do it this way, is in order not to use other functions
> >>>> that take locks during the software page-walk and not to duplicate existing
> >>>> code. Yet, duplication might be the way to go.
> >>>>
> >>>>>> +               sync_core();
> >>>>>
> >>>>> I can't think of any case where sync_core() is needed.  The mm switch
> >>>>> serializes.
> >>>>
> >>>> Good point!
> >>>>
> >>>>> Also, is there any circumstance in which any of this is used before at
> >>>>> least jump table init?  All the early stuff is text_poke_early(),
> >>>>> right?
> >>>>
> >>>> Not before jump_label_init. However, I did not manage to get rid of the two
> >>>> code-patches in text_poke(), since text_poke is used relatively early by
> >>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
> >>>> to duplicate init_mm - still fails.
> >>>
> >>> Another correction: the populate_extra_pte() is not needed.
> >>>
> >>> Anyhow, if you want to do this whole thing differently, I obviously will not
> >>> object, but I think it will end up more complicated.
> >>>
> >>> I think I finally understood your comment about "vaddr -
> >>> __START_KERNEL_map”. I did something like that before, and it is not
> >>> super-simple. You need not only to conditionally flush the TLB, but also
> >>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
> >>> is installed even when BPF programs are installed.
> >>>
> >>> Let me know if you want me to submit cleaner patches or you want to carry on
> >>> yourself.
> >>
> >> I think your approach is a good start and should be good enough (with
> >> cleanups) as a fix for the bug.  But I think your code has the same
> >> bug that we have now!  You're reusing the same address on multiple
> >> CPUs without flushing.  You can easily fix it by forcing a flush
> >> before loading the mm, which should be as simple as adding
> >> flush_tlb_mm() before you load the mm.  (It won't actually flush
> >> anything by itself, since the mm isn't loaded, but it will update the
> >> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
> >
> > What am I missing?
> >
> > We have a lock (text_mutex) which prevents the use of the new page-table
> > hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
> > a local TLB flush before the lock is released and before IRQs are enabled.
> > So how can the PTE be cached on multiple CPUs?
> >
> > Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
> > redundant TLB flushes, but these flushes are already there.
> 
> I missed that set_pte_vaddr() contained a flush.  It's probably nicer
> to use one of the APIs that doesn't imply a flush.  If you end up with
> vm_insert_pfn() or similar, you can use copy_to_user() if
> needed :)
> 
> >
> >> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
> >> do the vaddr - __START_KERNEL_map thing, then at least pick an address
> >> in the low half of the address space such as 0 :)  Ideally you'd only
> >> use this thing late enough that you could even use the normal
> >> insert_pfn (or similar) API for it, but that doesn't really matter.
> >
> > Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
> > misunderstanding it (again) before, thinking you want me to use vaddr (and
> > not the delta). I’ll give it a try.
> 
> It does prevent easy preallocation of the intermediate tables.

Would we have to change the mapping address? If so, how about using a hash
of vaddr, which would allow us to map it with one PTE?

> And do we really allow text_poke() / text_poke_bp() on BPF programs?  Hmm.

I don't think so. Allowing text_poke() from a BPF program is a security hazard.
(or do you mean the BPF JIT?)

Thanks,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: TLB flushes on fixmap changes
  2018-08-28  8:49                                                                     ` Masami Hiramatsu
@ 2018-08-28 17:33                                                                       ` Nadav Amit
  0 siblings, 0 replies; 96+ messages in thread
From: Nadav Amit @ 2018-08-28 17:33 UTC (permalink / raw)
  To: Masami Hiramatsu, Andy Lutomirski
  Cc: Peter Zijlstra, Kees Cook, Linus Torvalds, Paolo Bonzini,
	Jiri Kosina, Will Deacon, Benjamin Herrenschmidt, Nick Piggin,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman

at 1:49 AM, Masami Hiramatsu <mhiramat@kernel.org> wrote:

> On Mon, 27 Aug 2018 16:01:32 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
> 
>> On Mon, Aug 27, 2018 at 3:54 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>> at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>>> On Mon, Aug 27, 2018 at 2:55 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> at 1:16 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>> 
>>>>>> at 12:58 PM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>> 
>>>>>>> On Mon, Aug 27, 2018 at 12:43 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> at 12:10 PM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>> 
>>>>>>>>> at 11:58 AM, Andy Lutomirski <luto@kernel.org> wrote:
>>>>>>>>> 
>>>>>>>>>> On Mon, Aug 27, 2018 at 11:54 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>>>> On Mon, Aug 27, 2018 at 10:34 AM, Nadav Amit <nadav.amit@gmail.com> wrote:
>>>>>>>>>>>> What do you all think?
>>>>>>>>>>> 
>>>>>>>>>>> I agree in general. But I think that current->mm would need to be loaded, as
>>>>>>>>>>> otherwise I am afraid it would break switch_mm_irqs_off().
>>>>>>>>>> 
>>>>>>>>>> What breaks?
>>>>>>>>> 
>>>>>>>>> Actually nothing. I just saw the IBPB stuff regarding tsk, but it should not
>>>>>>>>> matter.
>>>>>>>> 
>>>>>>>> So here is what I got. It certainly needs some cleanup, but it boots.
>>>>>>>> 
>>>>>>>> Let me know how crappy you find it...
>>>>>>>> 
>>>>>>>> 
>>>>>>>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>>>>>>>> index bbc796eb0a3b..336779650a41 100644
>>>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>>>> @@ -343,4 +343,24 @@ static inline unsigned long __get_current_cr3_fast(void)
>>>>>>>>     return cr3;
>>>>>>>> }
>>>>>>>> 
>>>>>>>> +typedef struct {
>>>>>>>> +       struct mm_struct *prev;
>>>>>>>> +} temporary_mm_state_t;
>>>>>>>> +
>>>>>>>> +static inline temporary_mm_state_t use_temporary_mm(struct mm_struct *mm)
>>>>>>>> +{
>>>>>>>> +       temporary_mm_state_t state;
>>>>>>>> +
>>>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>>>> +       state.prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>>>>>>>> +       switch_mm_irqs_off(NULL, mm, current);
>>>>>>>> +       return state;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static inline void unuse_temporary_mm(temporary_mm_state_t prev)
>>>>>>>> +{
>>>>>>>> +       lockdep_assert_irqs_disabled();
>>>>>>>> +       switch_mm_irqs_off(NULL, prev.prev, current);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> #endif /* _ASM_X86_MMU_CONTEXT_H */
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>>>>>>>> index 5715647fc4fe..ef62af9a0ef7 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable.h
>>>>>>>> @@ -976,6 +976,10 @@ static inline void __meminit init_trampoline_default(void)
>>>>>>>>     /* Default trampoline pgd value */
>>>>>>>>     trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
>>>>>>>> }
>>>>>>>> +
>>>>>>>> +void __init patching_mm_init(void);
>>>>>>>> +#define patching_mm_init patching_mm_init
>>>>>>>> +
>>>>>>>> # ifdef CONFIG_RANDOMIZE_MEMORY
>>>>>>>> void __meminit init_trampoline(void);
>>>>>>>> # else
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> index 054765ab2da2..9f44262abde0 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>>>>>>>> @@ -116,6 +116,9 @@ extern unsigned int ptrs_per_p4d;
>>>>>>>> #define LDT_PGD_ENTRY          (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
>>>>>>>> #define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
>>>>>>>> 
>>>>>>>> +#define TEXT_POKE_PGD_ENTRY    -5UL
>>>>>>>> +#define TEXT_POKE_ADDR         (TEXT_POKE_PGD_ENTRY << PGDIR_SHIFT)
>>>>>>>> +
>>>>>>>> #define __VMALLOC_BASE_L4      0xffffc90000000000UL
>>>>>>>> #define __VMALLOC_BASE_L5      0xffa0000000000000UL
>>>>>>>> 
>>>>>>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>>>>>>> index 99fff853c944..840c72ec8c4f 100644
>>>>>>>> --- a/arch/x86/include/asm/pgtable_types.h
>>>>>>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>>>>>>> @@ -505,6 +505,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>>>>>> /* Install a pte for a particular vaddr in kernel space. */
>>>>>>>> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>>>>>>>> 
>>>>>>>> +struct mm_struct;
>>>>>>>> +void set_mm_pte_vaddr(struct mm_struct *mm, unsigned long vaddr, pte_t pte);
>>>>>>>> +
>>>>>>>> #ifdef CONFIG_X86_32
>>>>>>>> extern void native_pagetable_init(void);
>>>>>>>> #else
>>>>>>>> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
>>>>>>>> index 2ecd34e2d46c..cb364ea5b19d 100644
>>>>>>>> --- a/arch/x86/include/asm/text-patching.h
>>>>>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>>>>>> @@ -38,4 +38,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>>>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>>>>>>>> 
>>>>>>>> +extern struct mm_struct *patching_mm;
>>>>>>>> +
>>>>>>>> #endif /* _ASM_X86_TEXT_PATCHING_H */
>>>>>>>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>>>>>>>> index a481763a3776..fd8a950b0d62 100644
>>>>>>>> --- a/arch/x86/kernel/alternative.c
>>>>>>>> +++ b/arch/x86/kernel/alternative.c
>>>>>>>> @@ -11,6 +11,7 @@
>>>>>>>> #include <linux/stop_machine.h>
>>>>>>>> #include <linux/slab.h>
>>>>>>>> #include <linux/kdebug.h>
>>>>>>>> +#include <linux/mmu_context.h>
>>>>>>>> #include <asm/text-patching.h>
>>>>>>>> #include <asm/alternative.h>
>>>>>>>> #include <asm/sections.h>
>>>>>>>> @@ -701,8 +702,36 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>>>>>>>>             WARN_ON(!PageReserved(pages[0]));
>>>>>>>>             pages[1] = virt_to_page(addr + PAGE_SIZE);
>>>>>>>>     }
>>>>>>>> -       BUG_ON(!pages[0]);
>>>>>>>> +
>>>>>>>>     local_irq_save(flags);
>>>>>>>> +       BUG_ON(!pages[0]);
>>>>>>>> +
>>>>>>>> +       /*
>>>>>>>> +        * During initial boot, it is hard to initialize patching_mm due to
>>>>>>>> +        * dependencies in boot order.
>>>>>>>> +        */
>>>>>>>> +       if (patching_mm) {
>>>>>>>> +               pte_t pte;
>>>>>>>> +               temporary_mm_state_t prev;
>>>>>>>> +
>>>>>>>> +               prev = use_temporary_mm(patching_mm);
>>>>>>>> +               pte = mk_pte(pages[0], PAGE_KERNEL);
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, pte);
>>>>>>>> +               pte = mk_pte(pages[1], PAGE_KERNEL);
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, pte);
>>>>>>>> +
>>>>>>>> +               memcpy((void *)(TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK)),
>>>>>>>> +                      opcode, len);
>>>>>>>> +
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
>>>>>>>> +               set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
>>>>>>>> +               local_flush_tlb();
>>>>>>> 
> >>>>>>> Hmm.  This stuff is busted on SMP, and it's IMO more complicated than
>>>>>>> needed.  How about getting rid of all the weird TLB flushing stuff and
>>>>>>> instead putting the mapping at vaddr - __START_KERNEL_map or whatever
>>>>>>> it is?  You *might* need to flush_tlb_mm_range() on module unload, but
>>>>>>> that's it.
>>>>>> 
>>>>>> I don’t see what’s wrong in SMP, since this entire piece of code should be
>>>>>> running under text_mutex.
>>>>>> 
>>>>>> I don’t quite understand your proposal. I really don’t want to have any
> >>>>>> chance in which the page-tables for the poked address are not preallocated.
>>>>>> 
>>>>>> It is more complicated than needed, and there are redundant TLB flushes. The
> >>>>>> reason I preferred to do it this way is to avoid using other functions
> >>>>>> that take locks during the software page-walk and to avoid duplicating existing
>>>>>> code. Yet, duplication might be the way to go.
>>>>>> 
>>>>>>>> +               sync_core();
>>>>>>> 
>>>>>>> I can't think of any case where sync_core() is needed.  The mm switch
>>>>>>> serializes.
>>>>>> 
>>>>>> Good point!
>>>>>> 
>>>>>>> Also, is there any circumstance in which any of this is used before at
>>>>>>> least jump table init?  All the early stuff is text_poke_early(),
>>>>>>> right?
>>>>>> 
>>>>>> Not before jump_label_init. However, I did not manage to get rid of the two
> >>>>>> code paths in text_poke(), since text_poke() is used relatively early by
>>>>>> x86_late_time_init(), and at this stage kmem_cache_alloc() - which is needed
>>>>>> to duplicate init_mm - still fails.
>>>>> 
>>>>> Another correction: the populate_extra_pte() is not needed.
>>>>> 
>>>>> Anyhow, if you want to do this whole thing differently, I obviously will not
>>>>> object, but I think it will end up more complicated.
>>>>> 
>>>>> I think I finally understood your comment about "vaddr -
>>>>> __START_KERNEL_map”. I did something like that before, and it is not
>>>>> super-simple. You need not only to conditionally flush the TLB, but also
> >>>>> to synchronize the PUD/PMD on changes. Don’t forget that module memory
> >>>>> is allocated even when BPF programs are installed.
>>>>> 
>>>>> Let me know if you want me to submit cleaner patches or you want to carry on
>>>>> yourself.
>>>> 
>>>> I think your approach is a good start and should be good enough (with
>>>> cleanups) as a fix for the bug.  But I think your code has the same
>>>> bug that we have now!  You're reusing the same address on multiple
>>>> CPUs without flushing.  You can easily fix it by forcing a flush
>>>> before loading the mm, which should be as simple as adding
>>>> flush_tlb_mm() before you load the mm.  (It won't actually flush
>>>> anything by itself, since the mm isn't loaded, but it will update the
>>>> bookkeeping so that switch_mm_irqs_off() flushes the mm.)
>>> 
>>> What am I missing?
>>> 
>>> We have a lock (text_mutex) which prevents the use of the new page-table
>>> hierarchy on multiple CPUs. In addition we have __set_pte_vaddr() which does
>>> a local TLB flush before the lock is released and before IRQs are enabled.
>>> So how can the PTE be cached on multiple CPUs?
>>> 
>>> Yes, __set_pte_vaddr() is ugly and flushes too much. I’ll try to remove some
>>> redundant TLB flushes, but these flushes are already there.
>> 
>> I missed that set_pte_vaddr() contained a flush.  It's probably nicer
>> to use one of the APIs that doesn't imply a flush.  If you end up with
>> vm_insert_pfn() or similar, you can use copy_to_user() if
>> needed :)
>> 
>>>> Also, please at least get rid of TEXT_POKE_ADDR.  If you don't want to
>>>> do the vaddr - __START_KERNEL_map thing, then at least pick an address
>>>> in the low half of the address space such as 0 :)  Ideally you'd only
>>>> use this thing late enough that you could even use the normal
>>>> insert_pfn (or similar) API for it, but that doesn't really matter.
>>> 
>>> Perhaps the vaddr - __START_KERNEL_map actually makes sense. I was
>>> misunderstanding it (again) before, thinking you want me to use vaddr (and
>>> not the delta). I’ll give it a try.
>> 
>> It does prevent easy preallocation of the intermediate tables.
> 
> Would we have to change the mapping address? If so, how about using a hash
> of vaddr, which would allow us to map it with one PTE?

We do need two PTEs, and I will reuse the same two PTEs in order not to
waste memory on page-tables. I will try to randomize their address to be on
the safe side.
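
For reference, a minimal sketch of the resulting poke sequence, combining the
draft patch quoted earlier in this message with the suggestion to force a
flush before loading the mm. The names (patching_mm, use_temporary_mm(),
set_mm_pte_vaddr(), TEXT_POKE_ADDR) are assumptions taken from that draft,
not an upstream API, and the address randomization mentioned above is
omitted:

	unsigned long flags, poke_addr;
	temporary_mm_state_t prev;

	lockdep_assert_held(&text_mutex);

	local_irq_save(flags);

	/*
	 * Update the flush bookkeeping for patching_mm before loading it,
	 * so switch_mm_irqs_off() drops any translation left behind by a
	 * previous poke on another CPU (it flushes nothing by itself while
	 * the mm is not loaded).
	 */
	flush_tlb_mm(patching_mm);
	prev = use_temporary_mm(patching_mm);

	/* Two PTEs, so a poke that crosses a page boundary still works. */
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR,
			 mk_pte(pages[0], PAGE_KERNEL));
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE,
			 mk_pte(pages[1], PAGE_KERNEL));

	poke_addr = TEXT_POKE_ADDR | ((unsigned long)addr & ~PAGE_MASK);
	memcpy((void *)poke_addr, opcode, len);

	/* Tear the temporary mapping down and zap it locally. */
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR, __pte(0));
	set_mm_pte_vaddr(patching_mm, TEXT_POKE_ADDR + PAGE_SIZE, __pte(0));
	local_flush_tlb();

	unuse_temporary_mm(prev);
	local_irq_restore(flags);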


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-27 11:00                         ` Peter Zijlstra
@ 2018-08-30  0:13                           ` Vineet Gupta
  2018-08-30 10:23                             ` Peter Zijlstra
  0 siblings, 1 reply; 96+ messages in thread
From: Vineet Gupta @ 2018-08-30  0:13 UTC (permalink / raw)
  To: Peter Zijlstra, Nicholas Piggin
  Cc: Will Deacon, Linus Torvalds, Benjamin Herrenschmidt,
	Andrew Lutomirski, the arch/x86 maintainers, Borislav Petkov,
	Rik van Riel, Jann Horn, Adin Scannell, Dave Hansen,
	Linux Kernel Mailing List, linux-mm, David Miller,
	Martin Schwidefsky, Michael Ellerman, jejb

On 08/27/2018 04:00 AM, Peter Zijlstra wrote:
>
> The one obvious thing SH and ARM want is a sensible default for
> tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )
>
> The below makes tlb_start_vma() default to flush_cache_range(), which
> should be right and sufficient. The only exceptions that I found were
> (oddly):
>
>   - m68k-mmu
>   - sparc64
>   - unicore
>
> Those architectures appear to have a non-NOP flush_cache_range(), but
> their current tlb_start_vma() does not call it.

So indeed we follow DaveM's insight from 2004 about tlb_{start,end}_vma(), and
those are no-ops for ARC in the general case. For the historic VIPT aliasing
dcache they are what they should be per the 2004 link above - I presume that is all
hunky dory with you?

> Furthermore, I think tlb_flush() is broken on arc and parisc; in
> particular they don't appear to have any TLB invalidate for the
> shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.

Care to explain this issue a bit more?
And that is independent of the current discussion.

> Possibly shift_arg_pages() should be fixed instead.
>
> Some archs (nds32,sparc32) avoid this by having an unconditional
> flush_tlb_mm() in tlb_flush(), which seems somewhat suboptimal if you
> have flush_tlb_range().  TLB_FLUSH_VMA() might be an option, however
> hideous it is.
>
> ---
>
> diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
> index a9db5f62aaf3..7af2b373ebe7 100644
> --- a/arch/arc/include/asm/tlb.h
> +++ b/arch/arc/include/asm/tlb.h
> @@ -23,15 +23,6 @@ do {						\
>   *
>   * Note, read http://lkml.org/lkml/2004/1/15/6
>   */
> -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
> -#define tlb_start_vma(tlb, vma)
> -#else
> -#define tlb_start_vma(tlb, vma)						\
> -do {									\
> -	if (!tlb->fullmm)						\
> -		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> -} while(0)
> -#endif
>  
>  #define tlb_end_vma(tlb, vma)						\
>  do {									\

[snip..]

> 				      \
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index e811ef7b8350..1d037fd5bb7a 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
>   * the vmas are adjusted to only cover the region to be torn down.
>   */
>  #ifndef tlb_start_vma
> -#define tlb_start_vma(tlb, vma) do { } while (0)
> +#define tlb_start_vma(tlb, vma)						\
> +do {									\
> +	if (!tlb->fullmm)						\
> +		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> +} while (0)
>  #endif

So for non-aliasing arches to be unaffected, this relies on flush_cache_range()
being a no-op?

> -#define __tlb_end_vma(tlb, vma)					\
> -	do {							\
> -		if (!tlb->fullmm && tlb->end) {			\
> -			tlb_flush(tlb);				\
> -			__tlb_reset_range(tlb);			\
> -		}						\
> -	} while (0)
> -
>  #ifndef tlb_end_vma
> -#define tlb_end_vma	__tlb_end_vma
> +#define tlb_end_vma(tlb, vma)						\
> +	do {								\
> +		if (!tlb->fullmm && tlb->end) {				\
> +			tlb_flush(tlb);					\
> +			__tlb_reset_range(tlb);				\
> +		}							\
> +	} while (0)
>  #endif
>  
>  #ifndef __tlb_remove_tlb_entry

And this one is for shift_arg_pages(), but it will also cause extraneous flushes for
other cases - which are not happening currently!

-Vineet


^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE
  2018-08-30  0:13                           ` Vineet Gupta
@ 2018-08-30 10:23                             ` Peter Zijlstra
  0 siblings, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2018-08-30 10:23 UTC (permalink / raw)
  To: Vineet Gupta
  Cc: Nicholas Piggin, Will Deacon, Linus Torvalds,
	Benjamin Herrenschmidt, Andrew Lutomirski,
	the arch/x86 maintainers, Borislav Petkov, Rik van Riel,
	Jann Horn, Adin Scannell, Dave Hansen, Linux Kernel Mailing List,
	linux-mm, David Miller, Martin Schwidefsky, Michael Ellerman,
	jejb

On Thu, Aug 30, 2018 at 12:13:50AM +0000, Vineet Gupta wrote:
> On 08/27/2018 04:00 AM, Peter Zijlstra wrote:
> >
> > The one obvious thing SH and ARM want is a sensible default for
> > tlb_start_vma(). (also: https://lkml.org/lkml/2004/1/15/6 )
> >
> > The below makes tlb_start_vma() default to flush_cache_range(), which
> > should be right and sufficient. The only exceptions that I found were
> > (oddly):
> >
> >   - m68k-mmu
> >   - sparc64
> >   - unicore
> >
> > Those architectures appear to have a non-NOP flush_cache_range(), but
> > their current tlb_start_vma() does not call it.
> 
> So indeed we follow DaveM's insight from 2004 about tlb_{start,end}_vma(), and
> those are no-ops for ARC in the general case. For the historic VIPT aliasing
> dcache they are what they should be per the 2004 link above - I presume that is all
> hunky dory with you?

Yep, I was just confused about those 3 architectures having
flush_cache_range() but not calling it from tlb_start_vma(). The regular
VIPT aliasing thing is all good. And not having them is also fine.

> > Furthermore, I think tlb_flush() is broken on arc and parisc; in
> > particular they don't appear to have any TLB invalidate for the
> > shift_arg_pages() case, where we do not call tlb_*_vma() and fullmm=0.
> 
> Care to explain this issue a bit more?
> And that is independent of the current discussion.
> 
> > Possibly shift_arg_pages() should be fixed instead.

So the problem is that shift_arg_pages() does not call tlb_start_vma() /
tlb_end_vma(). It also has fullmm=0. Therefore, on ARC, there will not
be a TLB invalidate at all when freeing the page-tables.

And while looking at this more, move_page_tables() also looks dodgy; I
think it always needs a TLB flush of the entire 'old' range.
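
As an illustration of the unconditional fallback mentioned earlier in the
thread (what nds32/sparc32 do), an architecture in this position could define
its tlb_flush() roughly as below; this is a sketch of that approach, not a
proposal for ARC specifically:

	/*
	 * Always flush the whole mm.  This also covers the shift_arg_pages()
	 * case (fullmm == 0 and no tlb_*_vma() calls), at the cost of
	 * precision where flush_tlb_range() would have been enough.
	 */
	#define tlb_flush(tlb)		flush_tlb_mm((tlb)->mm)

Doing better needs a range flush, and flush_tlb_range() wants a vma that the
gather does not have here, which is where the hideous TLB_FLUSH_VMA() idea
comes from.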

> > diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
> > index a9db5f62aaf3..7af2b373ebe7 100644
> > --- a/arch/arc/include/asm/tlb.h
> > +++ b/arch/arc/include/asm/tlb.h
> > @@ -23,15 +23,6 @@ do {						\
> >   *
> >   * Note, read http://lkml.org/lkml/2004/1/15/6
> >   */
> > -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING
> > -#define tlb_start_vma(tlb, vma)
> > -#else
> > -#define tlb_start_vma(tlb, vma)						\
> > -do {									\
> > -	if (!tlb->fullmm)						\
> > -		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> > -} while(0)
> > -#endif
> >  
> >  #define tlb_end_vma(tlb, vma)						\
> >  do {									\
> 
> [snip..]
> 
> > 				      \
> > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> > index e811ef7b8350..1d037fd5bb7a 100644
> > --- a/include/asm-generic/tlb.h
> > +++ b/include/asm-generic/tlb.h
> > @@ -181,19 +181,21 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
> >   * the vmas are adjusted to only cover the region to be torn down.
> >   */
> >  #ifndef tlb_start_vma
> > -#define tlb_start_vma(tlb, vma) do { } while (0)
> > +#define tlb_start_vma(tlb, vma)						\
> > +do {									\
> > +	if (!tlb->fullmm)						\
> > +		flush_cache_range(vma, vma->vm_start, vma->vm_end);	\
> > +} while (0)
> >  #endif
> 
> So for non-aliasing arches to be unaffected, this relies on flush_cache_range()
> being a no-op?

Yes; a cursory inspection shows this to be so, with the 'obvious' exception
of the above 3 architectures.
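
For the non-aliasing case the usual definition is an empty macro, roughly as
below (the exact asm-generic spelling is assumed, not quoted), so the new
tlb_start_vma() default costs nothing there:

	#define flush_cache_range(vma, start, end)	do { } while (0)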

> > -#define __tlb_end_vma(tlb, vma)					\
> > -	do {							\
> > -		if (!tlb->fullmm && tlb->end) {			\
> > -			tlb_flush(tlb);				\
> > -			__tlb_reset_range(tlb);			\
> > -		}						\
> > -	} while (0)
> > -
> >  #ifndef tlb_end_vma
> > -#define tlb_end_vma	__tlb_end_vma
> > +#define tlb_end_vma(tlb, vma)						\
> > +	do {								\
> > +		if (!tlb->fullmm && tlb->end) {				\
> > +			tlb_flush(tlb);					\
> > +			__tlb_reset_range(tlb);				\
> > +		}							\
> > +	} while (0)
> >  #endif
> >  
> >  #ifndef __tlb_remove_tlb_entry
> 
> And this one is for shift_arg_pages(), but it will also cause extraneous flushes for
> other cases - which are not happening currently!

No, shift_arg_pages() does not in fact call tlb_end_vma().

The reason for the flush in tlb_end_vma() is (as far as we can remember)
to 'better' deal with large gaps between adjacent VMAs.  Without the
flush here, the range would be extended to cover the (potential) dead
space between the VMAs, resulting in more expensive range flushes.
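
To make the range-widening point concrete: the gather tracks one [start, end)
window per flush, accumulated roughly as in the sketch below (the helper name
and signature are assumed from the asm-generic code of this era, not quoted).
Without the flush in tlb_end_vma(), unmaps in the next VMA keep growing the
same window, so the eventual flush also covers the gap between the VMAs.

	static inline void __tlb_adjust_range(struct mmu_gather *tlb,
					      unsigned long address,
					      unsigned int range_size)
	{
		tlb->start = min(tlb->start, address);
		tlb->end   = max(tlb->end, address + range_size);
	}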

^ permalink raw reply	[flat|nested] 96+ messages in thread

end of thread, other threads:[~2018-08-31  1:27 UTC | newest]

Thread overview: 96+ messages
2018-08-22 15:30 [PATCH 0/4] x86: TLB invalidate fixes Peter Zijlstra
2018-08-22 15:30 ` [PATCH 1/4] x86/mm/tlb: Revert the recent lazy TLB patches Peter Zijlstra
2018-08-22 21:37   ` Linus Torvalds
2018-08-22 22:11     ` Rik van Riel
2018-08-22 15:30 ` [PATCH 2/4] mm/tlb: Remove tlb_remove_table() non-concurrent condition Peter Zijlstra
2018-08-23  3:31   ` Nicholas Piggin
2018-08-23  3:35     ` Linus Torvalds
2018-08-23  3:44       ` Linus Torvalds
2018-08-23  4:16       ` Nicholas Piggin
2018-08-23  4:54         ` Linus Torvalds
2018-08-23  5:15           ` Nicholas Piggin
2018-08-24  8:42           ` Peter Zijlstra
2018-08-23 13:40   ` Will Deacon
2018-08-22 15:30 ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
2018-08-22 15:55   ` Peter Zijlstra
2018-08-23  3:45     ` Nicholas Piggin
2018-08-23  3:59       ` Linus Torvalds
2018-08-23  4:33         ` Nicholas Piggin
2018-08-23  5:03           ` Linus Torvalds
2018-08-23  5:58             ` Nicholas Piggin
2018-08-23  4:54         ` Benjamin Herrenschmidt
2018-08-23  5:11           ` Linus Torvalds
2018-08-23  5:20             ` Linus Torvalds
2018-08-23  6:48               ` Martin Schwidefsky
2018-08-23  5:21             ` Benjamin Herrenschmidt
2018-08-23  6:15               ` Nicholas Piggin
2018-08-23 13:39             ` Will Deacon
2018-08-24  8:47               ` Peter Zijlstra
2018-08-24 11:32                 ` Peter Zijlstra
2018-08-24 11:39                   ` Peter Zijlstra
2018-08-27  5:00                     ` Nicholas Piggin
2018-08-27  7:47                       ` Peter Zijlstra
2018-08-27  8:04                         ` Nicholas Piggin
2018-08-27  8:09                           ` Benjamin Herrenschmidt
2018-08-27  8:20                             ` Peter Zijlstra
2018-08-27  8:54                               ` Nicholas Piggin
2018-08-27  9:02                             ` Nicholas Piggin
2018-08-27 22:13                               ` Benjamin Herrenschmidt
2018-08-27 13:36                           ` Rik van Riel
2018-08-27 14:29                             ` Nicholas Piggin
2018-08-27  8:57                         ` removig ia64, was: " Christoph Hellwig
2018-08-27 11:28                           ` Peter Zijlstra
2018-08-27 11:45                           ` Jason Duerstock
2018-08-27 11:00                         ` Peter Zijlstra
2018-08-30  0:13                           ` Vineet Gupta
2018-08-30 10:23                             ` Peter Zijlstra
2018-08-24 17:26                 ` Nadav Amit
2018-08-24 18:04                   ` Peter Zijlstra
2018-08-24 18:35                     ` TLB flushes on fixmap changes Nadav Amit
2018-08-24 19:31                       ` Linus Torvalds
2018-08-24 20:24                         ` Nadav Amit
2018-08-25  0:58                           ` Linus Torvalds
2018-08-25  2:29                             ` nadav.amit
2018-08-25  4:23                               ` Andy Lutomirski
2018-08-26  2:23                                 ` Masami Hiramatsu
2018-08-26  4:21                                   ` Andy Lutomirski
2018-08-26  4:43                                     ` Kees Cook
2018-08-26  5:53                                       ` Nadav Amit
2018-08-26 14:20                                       ` Andy Lutomirski
2018-08-26 16:47                                         ` Kees Cook
2018-08-26 17:25                                           ` Andy Lutomirski
2018-08-26 20:15                                             ` Thomas Gleixner
2018-08-26 22:03                                               ` Kees Cook
2018-08-26 22:15                                                 ` Matthew Wilcox
2018-08-26 22:29                                                 ` Jann Horn
2018-08-26  9:09                                     ` Peter Zijlstra
2018-08-27  3:03                                       ` Masami Hiramatsu
2018-08-27  3:26                                         ` Nadav Amit
2018-08-27  8:05                                           ` Masami Hiramatsu
2018-08-27 17:34                                             ` Nadav Amit
2018-08-27 18:45                                               ` Andy Lutomirski
2018-08-27 18:54                                                 ` Nadav Amit
2018-08-27 18:58                                                   ` Andy Lutomirski
2018-08-27 19:10                                                     ` Nadav Amit
2018-08-27 19:43                                                       ` Nadav Amit
2018-08-27 19:58                                                         ` Andy Lutomirski
2018-08-27 20:16                                                           ` Nadav Amit
2018-08-27 21:55                                                             ` Nadav Amit
2018-08-27 22:32                                                               ` Andy Lutomirski
2018-08-27 22:54                                                                 ` Nadav Amit
2018-08-27 23:01                                                                   ` Andy Lutomirski
2018-08-28  8:49                                                                     ` Masami Hiramatsu
2018-08-28 17:33                                                                       ` Nadav Amit
2018-08-27  8:13                                         ` Peter Zijlstra
2018-08-27  9:39                                           ` Masami Hiramatsu
2018-08-27  9:55                                           ` Jann Horn
2018-08-26 22:48                                     ` Jann Horn
2018-08-24  8:35           ` [PATCH 3/4] mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE Peter Zijlstra
2018-08-24 13:13             ` Peter Zijlstra
2018-08-24 13:14               ` Peter Zijlstra
2018-08-24 15:49               ` Will Deacon
2018-08-23 23:31     ` Will Deacon
2018-08-22 21:34   ` Linus Torvalds
2018-08-23  8:46   ` Nicholas Piggin
2018-08-22 15:30 ` [PATCH 4/4] x86/mm: Only use tlb_remove_table() for paravirt Peter Zijlstra
2018-08-22 22:12   ` Eduardo Valentin
