From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, luto@kernel.org, dave.hansen@linux.intel.com,
	mingo@kernel.org, kernel-team@fb.com, tglx@linutronix.de,
	efault@gmx.de, songliubraving@fb.com,
	Rik van Riel <riel@surriel.com>
Subject: [PATCH 3/6] x86,tlb: make lazy TLB mode lazier
Date: Tue, 26 Jun 2018 13:31:23 -0400
Message-ID: <20180626173126.12296-4-riel@surriel.com>
In-Reply-To: <20180626173126.12296-1-riel@surriel.com>

Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen() is its own barrier; the context
switch code adds an explicit barrier between reading cpu_tlbstate.is_lazy
and next->context.tlb_gen.
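
A rough user-space model of that ordering, with C11 atomics standing in
for the kernel primitives (the names and the acquire fence below are
illustrative only; the real code lives in arch/x86/mm/tlb.c):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_ulong mm_tlb_gen;      /* models mm->context.tlb_gen         */
	static atomic_ulong cpu_cached_gen;  /* models cpu_tlbstate.ctxs[].tlb_gen */
	static atomic_bool  cpu_is_lazy;     /* models cpu_tlbstate.is_lazy        */

	/* Flush side: the atomic RMW is a full barrier, like inc_mm_tlb_gen(). */
	static void flush_side_advance(void)
	{
		atomic_fetch_add(&mm_tlb_gen, 1);
		/* ...then decide which CPUs still need an IPI... */
	}

	/* Switch side: read is_lazy first, then (after the barrier) tlb_gen. */
	static bool switch_side_needs_flush(void)
	{
		bool was_lazy = atomic_load_explicit(&cpu_is_lazy,
						     memory_order_relaxed);

		if (!was_lazy)
			return false;	/* non-lazy fast path, nothing to do */

		/* stands in for the smp_rmb() in switch_mm_irqs_off() */
		atomic_thread_fence(memory_order_acquire);

		return atomic_load_explicit(&mm_tlb_gen, memory_order_relaxed) !=
		       atomic_load_explicit(&cpu_cached_gen, memory_order_relaxed);
	}

	int main(void)
	{
		atomic_store(&cpu_is_lazy, true);  /* CPU went idle in lazy mode */
		flush_side_advance();              /* a flusher advanced tlb_gen */
		printf("flush needed: %d\n", switch_side_needs_flush());
		return 0;
	}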

Unlike the 2016 version of this patch, this version does not remove CPUs
with cpu_tlbstate.is_lazy set from mm_cpumask(mm), since doing so would
prevent the TLB flush IPIs sent at page table free time from reaching all
the CPUs that need them.
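
The effect on the IPI path can be shown with a toy model where CPUs are
bits in a 64-bit mask (illustrative only; the actual code below uses a
temporary cpumask_var_t copy of the cpumask for the same purpose):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define NCPUS 8

	static bool cpu_is_lazy[NCPUS];	/* models cpu_tlbstate.is_lazy */

	/*
	 * Only the temporary copy used for this round of IPIs drops the
	 * lazy CPUs; the mm_cpumask bits themselves stay set, so flushes
	 * at page table free time can still reach every CPU using the mm.
	 */
	static uint64_t cpus_to_ipi(uint64_t mm_cpumask_bits)
	{
		uint64_t ipi_mask = mm_cpumask_bits;

		for (int cpu = 0; cpu < NCPUS; cpu++)
			if (cpu_is_lazy[cpu])
				ipi_mask &= ~(1ULL << cpu);

		return ipi_mask;
	}

	int main(void)
	{
		cpu_is_lazy[2] = true;	/* CPU 2 is idle in lazy TLB mode */

		/* CPUs 0-3 use the mm; only 0, 1 and 3 get a flush IPI. */
		printf("IPI mask: %#llx\n",
		       (unsigned long long)cpus_to_ipi(0x0fULL));
		return 0;
	}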

Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/include/asm/uv/uv.h  |   6 +-
 arch/x86/mm/tlb.c             | 131 ++++++++++++++++++++++++++++--------------
 arch/x86/platform/uv/tlb_uv.c |   2 +-
 3 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index a80c0673798f..d801afb5fe90 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -17,7 +17,7 @@ extern int is_uv_hubless(void);
 extern void uv_cpu_init(void);
 extern void uv_nmi_init(void);
 extern void uv_system_init(void);
-extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+extern struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
 						 const struct flush_tlb_info *info);
 
 #else	/* X86_UV */
@@ -27,8 +27,8 @@ static inline int is_uv_system(void)	{ return 0; }
 static inline int is_uv_hubless(void)	{ return 0; }
 static inline void uv_cpu_init(void)	{ }
 static inline void uv_system_init(void)	{ }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask,
+static inline struct cpumask *
+uv_flush_tlb_others(struct cpumask *cpumask,
 		    const struct flush_tlb_info *info)
 { return cpumask; }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9a893673c56b..137a2c62c75b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -250,10 +254,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * Switching straight from one thread in a process to another
+		 * thread in the same process requires no TLB flush at all.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * The code below checks whether there was a TLB flush while
+		 * this CPU was in lazy TLB mode. The barrier ensures ordering
+		 * with the TLB invalidation code advancing .tlb_gen.
+		 */
+		smp_rmb();
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -290,48 +304,47 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 				real_prev != &init_mm);
 		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
-		/*
-		 * Start remote flushes and then read tlb_gen.
-		 */
+		/* Start remote flushes. */
 		cpumask_set_cpu(cpu, mm_cpumask(next));
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
+	/* Read the tlb_gen to check whether a flush is needed. */
+	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+	choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently.  Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this.  The idle code should
-			 *  be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
 
 		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently.  Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this.  The idle code should
+		 *  be rearranged to call this before rcu_idle_enter().)
 		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
 
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -454,6 +467,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +576,22 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_t *mask = (struct cpumask *)cpumask;
+	cpumask_var_t varmask;
+	bool can_lazy_flush = false;
+	unsigned int cpu;
+
+	/*
+	 * A temporary cpumask allows the kernel to skip sending IPIs
+	 * to CPUs in lazy TLB state, without also removing them from
+	 * mm_cpumask(mm).
+	 */
+	if (alloc_cpumask_var(&varmask, GFP_ATOMIC)) {
+		cpumask_copy(varmask, cpumask);
+		mask = varmask;
+		can_lazy_flush = true;
+	}
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,17 +615,30 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
-		cpumask = uv_flush_tlb_others(cpumask, info);
-		if (cpumask)
-			smp_call_function_many(cpumask, flush_tlb_func_remote,
+		mask = uv_flush_tlb_others(mask, info);
+		if (mask)
+			smp_call_function_many(mask, flush_tlb_func_remote,
 					       (void *)info, 1);
-		return;
+		goto out;
+	}
+
+	/*
+	 * Instead of sending IPIs to CPUs in lazy TLB mode, skip them
+	 * here; such a CPU will notice the advanced tlb_gen and flush
+	 * its TLB at the next context switch or at page table free time.
+	 */
+	if (can_lazy_flush) {
+		for_each_cpu(cpu, mask) {
+			if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+				cpumask_clear_cpu(cpu, mask);
+		}
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	smp_call_function_many(mask, flush_tlb_func_remote,
 			       (void *)info, 1);
+ out:
+	free_cpumask_var(varmask);
 }
 
 /*
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ca446da48fd2..84a4c6679da6 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1102,7 +1102,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
  * Returns pointer to cpumask if some remote flushing remains to be
  * done.  The returned pointer is valid till preemption is re-enabled.
  */
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
 					  const struct flush_tlb_info *info)
 {
 	unsigned int cpu = smp_processor_id();
-- 
2.14.4


Thread overview: 13+ messages
2018-06-26 17:31 [PATCH v2 0/7] x86,tlb,mm: make lazy TLB mode even lazier Rik van Riel
2018-06-26 17:31 ` [PATCH 1/6] mm: allocate mm_cpumask dynamically based on nr_cpu_ids Rik van Riel
2018-06-26 17:31 ` [PATCH 2/6] x86,tlb: leave lazy TLB mode at page table free time Rik van Riel
2018-06-27  6:03   ` kbuild test robot
2018-06-26 17:31 ` Rik van Riel [this message]
2018-06-27 18:10   ` [PATCH 3/6] x86,tlb: make lazy TLB mode lazier Andy Lutomirski
2018-06-27 18:17     ` Rik van Riel
2018-06-28 20:05     ` Rik van Riel
2018-06-26 17:31 ` [PATCH 4/6] x86,tlb: only send page table free TLB flush to lazy TLB CPUs Rik van Riel
2018-06-26 20:16   ` [RFC PATCH] x86,tlb: mm_fill_lazy_tlb_cpu_mask() can be static kbuild test robot
2018-06-26 20:16   ` [PATCH 4/6] x86,tlb: only send page table free TLB flush to lazy TLB CPUs kbuild test robot
2018-06-26 17:31 ` [PATCH 5/6] x86,mm: always use lazy TLB mode Rik van Riel
2018-06-26 17:31 ` [PATCH 6/6] x86,switch_mm: skip atomic operations for init_mm Rik van Riel
