* [PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path
@ 2020-07-10 14:04 Pingfan Liu
2020-07-30 11:40 ` Mark Rutland
0 siblings, 1 reply; 2+ messages in thread
From: Pingfan Liu @ 2020-07-10 14:04 UTC (permalink / raw)
To: linux-arm-kernel
Cc: Mark Rutland, Jean-Philippe Brucker, Vladimir Murzin,
Steve Capper, Catalin Marinas, Pingfan Liu, Will Deacon
On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:
cpu_ptr = &per_cpu(ptr, smp_processor_id());
Which potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. This can be written more optimally
as:
cpu_ptr = this_cpu_ptr(ptr);
Which only needs the offset from tpidr_el1, and does not need to
load from memory.
The following two test cases show a small performance improvement measured
on a 46-CPU Qualcomm machine running a 5.8.0-rc4 kernel.
Test 1: (about 0.3% improvement)
#cat b.sh
make clean && make all -j138
#perf stat --repeat 10 --null --sync sh b.sh
- before this patch
Performance counter stats for 'sh b.sh' (10 runs):
298.62 +- 1.86 seconds time elapsed ( +- 0.62% )
- after this patch
Performance counter stats for 'sh b.sh' (10 runs):
297.734 +- 0.954 seconds time elapsed ( +- 0.32% )
Test 2: (about 1.69% improvement)
'perf stat -r 10 perf bench sched messaging'
Then sum the total time of 'sched/messaging' manually.
- before this patch
total 0.707 sec for 10 times
- after this patch
total 0.695 sec for 10 times
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
To: linux-arm-kernel@lists.infradead.org
---
v2 -> v3: improve commit log with performance result
arch/arm64/include/asm/mmu_context.h | 6 ++----
arch/arm64/mm/context.c | 10 ++++++----
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index ab46187..808c3be 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
* take CPU migration into account.
*/
#define destroy_context(mm) do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
@@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
static inline void __switch_mm(struct mm_struct *next)
{
- unsigned int cpu = smp_processor_id();
-
/*
* init_mm.pgd does not contain any user mappings and it is always
* active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
return;
}
- check_and_switch_context(next, cpu);
+ check_and_switch_context(next);
}
static inline void
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index d702d60..a206655 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
return idx2asid(asid) | generation;
}
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
{
unsigned long flags;
+ unsigned int cpu;
u64 asid, old_active_asid;
if (system_supports_cnp())
@@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
* relaxed xchg in flush_context will treat us as reserved
* because atomic RmWs are totally ordered for a given location.
*/
- old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+ old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
if (old_active_asid && asid_gen_match(asid) &&
- atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+ atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
old_active_asid, asid))
goto switch_mm_fastpath;
@@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
atomic64_set(&mm->context.id, asid);
}
+ cpu = smp_processor_id();
if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
local_flush_tlb_all();
- atomic64_set(&per_cpu(active_asids, cpu), asid);
+ atomic64_set(this_cpu_ptr(&active_asids), asid);
raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
switch_mm_fastpath:
--
2.7.5
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path
2020-07-10 14:04 [PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path Pingfan Liu
@ 2020-07-30 11:40 ` Mark Rutland
0 siblings, 0 replies; 2+ messages in thread
From: Mark Rutland @ 2020-07-30 11:40 UTC (permalink / raw)
To: Pingfan Liu
Cc: Jean-Philippe Brucker, Vladimir Murzin, Steve Capper,
Catalin Marinas, Will Deacon, linux-arm-kernel
On Fri, Jul 10, 2020 at 10:04:12PM +0800, Pingfan Liu wrote:
> On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
> using the per-cpu offset stored in the tpidr_el1 system register. In
> some cases we generate a per-cpu address with a sequence like:
>
> cpu_ptr = &per_cpu(ptr, smp_processor_id());
>
> Which potentially incurs a cache miss for both `cpu_number` and the
> in-memory `__per_cpu_offset` array. This can be written more optimally
> as:
>
> cpu_ptr = this_cpu_ptr(ptr);
>
> Which only needs the offset from tpidr_el1, and does not need to
> load from memory.
>
> The following two test cases show a small performance improvement measured
> on a 46-CPU Qualcomm machine running a 5.8.0-rc4 kernel.
>
> Test 1: (about 0.3% improvement)
> #cat b.sh
> make clean && make all -j138
> #perf stat --repeat 10 --null --sync sh b.sh
>
> - before this patch
> Performance counter stats for 'sh b.sh' (10 runs):
>
> 298.62 +- 1.86 seconds time elapsed ( +- 0.62% )
>
> - after this patch
> Performance counter stats for 'sh b.sh' (10 runs):
>
> 297.734 +- 0.954 seconds time elapsed ( +- 0.32% )
>
> Test 2: (about 1.69% improvement)
> 'perf stat -r 10 perf bench sched messaging'
> Then sum the total time of 'sched/messaging' manually.
>
> - before this patch
> total 0.707 sec for 10 times
> - after this patch
> total 0.695 sec for 10 times
>
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Steve Capper <steve.capper@arm.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Vladimir Murzin <vladimir.murzin@arm.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> To: linux-arm-kernel@lists.infradead.org
The patch looks sound, so FWIW:
Acked-by: Mark Rutland <mark.rutland@arm.com>
... I'll leave it to Catalin and Will to decide whether to pick this up.
Mark.
> ---
> v2 -> v3: improve commit log with performance result
> arch/arm64/include/asm/mmu_context.h | 6 ++----
> arch/arm64/mm/context.c | 10 ++++++----
> 2 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
> index ab46187..808c3be 100644
> --- a/arch/arm64/include/asm/mmu_context.h
> +++ b/arch/arm64/include/asm/mmu_context.h
> @@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
> * take CPU migration into account.
> */
> #define destroy_context(mm) do { } while(0)
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
> +void check_and_switch_context(struct mm_struct *mm);
>
> #define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
>
> @@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>
> static inline void __switch_mm(struct mm_struct *next)
> {
> - unsigned int cpu = smp_processor_id();
> -
> /*
> * init_mm.pgd does not contain any user mappings and it is always
> * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
> @@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
> return;
> }
>
> - check_and_switch_context(next, cpu);
> + check_and_switch_context(next);
> }
>
> static inline void
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index d702d60..a206655 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
> return idx2asid(asid) | generation;
> }
>
> -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> +void check_and_switch_context(struct mm_struct *mm)
> {
> unsigned long flags;
> + unsigned int cpu;
> u64 asid, old_active_asid;
>
> if (system_supports_cnp())
> @@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> * relaxed xchg in flush_context will treat us as reserved
> * because atomic RmWs are totally ordered for a given location.
> */
> - old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
> + old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
> if (old_active_asid && asid_gen_match(asid) &&
> - atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
> + atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
> old_active_asid, asid))
> goto switch_mm_fastpath;
>
> @@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> atomic64_set(&mm->context.id, asid);
> }
>
> + cpu = smp_processor_id();
> if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
> local_flush_tlb_all();
>
> - atomic64_set(&per_cpu(active_asids, cpu), asid);
> + atomic64_set(this_cpu_ptr(&active_asids), asid);
> raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
>
> switch_mm_fastpath:
> --
> 2.7.5
>
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-07-30 11:42 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-10 14:04 [PATCHv3] arm64/mm: save memory access in check_and_switch_context() fast switch path Pingfan Liu
2020-07-30 11:40 ` Mark Rutland
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).