From: He Ying <heying24@huawei.com> To: <catalin.marinas@arm.com>, <will@kernel.org>, <mark.rutland@arm.com>, <marcan@marcan.st>, <maz@kernel.org>, <joey.gouly@arm.com>, <pcc@google.com> Cc: <linux-arm-kernel@lists.infradead.org>, <linux-kernel@vger.kernel.org>, <heying24@huawei.com> Subject: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code Date: Fri, 7 Jan 2022 03:55:36 -0500 [thread overview] Message-ID: <20220107085536.214501-1-heying24@huawei.com> (raw) Our product has been updating its kernel from 4.4 to 5.10 recently and found a performance issue. We do a business test called ARP test, which tests the latency for a ping-pong packets traffic with a certain payload. The result is as follows. - 4.4 kernel: avg = ~20s - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s I have just been learning arm64 pseudo-NMI code and have a question, why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI? I wonder if this brings some performance regression. First, I make this patch and then do the test again. Here's the result. - 5.10 kernel with this patch not applied: avg = ~40s - 5.10 kernel with this patch applied: avg = ~23s Amazing! Note that all kernels are built with CONFIG_ARM64_PSEUDO_NMI not set. It seems the pseudo-NMI feature actually brings some overhead to performance even if CONFIG_ARM64_PSEUDO_NMI is not set. Furthermore, I find the feature also brings some overhead to vmlinux size. I build 5.10 kernel with this patch applied or not while CONFIG_ARM64_PSEUDO_NMI is not set. - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes. - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes. That means arm64 pseudo-NMI feature may bring ~200KB overhead to vmlinux size. Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size and performance even if config is not set. To avoid it, add macro controls around the related code. 
Signed-off-by: He Ying <heying24@huawei.com> --- arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++-- arch/arm64/kernel/entry.S | 4 ++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index b57b9b1e4344..82f771b41cf5 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -26,6 +26,7 @@ */ static inline void arch_local_irq_enable(void) { +#ifdef CONFIG_ARM64_PSEUDO_NMI if (system_has_prio_mask_debugging()) { u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void) : "memory"); pmr_sync(); +#else + asm volatile( + "msr daifclr, #3 // arch_local_irq_enable" + : + : + : "memory"); +#endif } static inline void arch_local_irq_disable(void) { +#ifdef CONFIG_ARM64_PSEUDO_NMI if (system_has_prio_mask_debugging()) { u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); @@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void) : : "r" ((unsigned long) GIC_PRIO_IRQOFF) : "memory"); +#else + asm volatile( + "msr daifset, #3 // arch_local_irq_disable" + : + : + : "memory"); +#endif } /* @@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void) static inline unsigned long arch_local_save_flags(void) { unsigned long flags; - +#ifdef CONFIG_ARM64_PSEUDO_NMI asm volatile(ALTERNATIVE( "mrs %0, daif", __mrs_s("%0", SYS_ICC_PMR_EL1), @@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void) : "=&r" (flags) : : "memory"); - +#else + asm volatile( + "mrs %0, daif" + : "=r" (flags) + : + : "memory"); +#endif return flags; } static inline int arch_irqs_disabled_flags(unsigned long flags) { +#ifdef CONFIG_ARM64_PSEUDO_NMI int res; asm volatile(ALTERNATIVE( @@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags) : "memory"); return res; +#else + return flags & PSR_I_BIT; +#endif } static inline int arch_irqs_disabled(void) @@ -119,6 +145,7 @@ static 
inline unsigned long arch_local_irq_save(void) */ static inline void arch_local_irq_restore(unsigned long flags) { +#ifdef CONFIG_ARM64_PSEUDO_NMI asm volatile(ALTERNATIVE( "msr daif, %0", __msr_s(SYS_ICC_PMR_EL1, "%0"), @@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags) : "memory"); pmr_sync(); +#else + asm volatile( + "msr daif, %0" + : + : "r" (flags) + : "memory"); +#endif } #endif /* __ASM_IRQFLAGS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2f69ae43941d..ffc32d3d909a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -300,6 +300,7 @@ alternative_else_nop_endif str w21, [sp, #S_SYSCALLNO] .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI /* Save pmr */ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs_s x20, SYS_ICC_PMR_EL1 @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 alternative_else_nop_endif +#endif /* Re-enable tag checking (TCO set on exception entry) */ #ifdef CONFIG_ARM64_MTE @@ -330,6 +332,7 @@ alternative_else_nop_endif disable_daif .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI /* Restore pmr */ alternative_if ARM64_HAS_IRQ_PRIO_MASKING ldr x20, [sp, #S_PMR_SAVE] @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING dsb sy // Ensure priority change is seen by redistributor .L__skip_pmr_sync\@: alternative_else_nop_endif +#endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR -- 2.17.1
WARNING: multiple messages have this Message-ID (diff)
From: He Ying <heying24@huawei.com> To: <catalin.marinas@arm.com>, <will@kernel.org>, <mark.rutland@arm.com>, <marcan@marcan.st>, <maz@kernel.org>, <joey.gouly@arm.com>, <pcc@google.com> Cc: <linux-arm-kernel@lists.infradead.org>, <linux-kernel@vger.kernel.org>, <heying24@huawei.com> Subject: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code Date: Fri, 7 Jan 2022 03:55:36 -0500 [thread overview] Message-ID: <20220107085536.214501-1-heying24@huawei.com> (raw) Our product has been updating its kernel from 4.4 to 5.10 recently and found a performance issue. We do a business test called ARP test, which tests the latency for a ping-pong packets traffic with a certain payload. The result is as follows. - 4.4 kernel: avg = ~20s - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s I have just been learning arm64 pseudo-NMI code and have a question, why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI? I wonder if this brings some performance regression. First, I make this patch and then do the test again. Here's the result. - 5.10 kernel with this patch not applied: avg = ~40s - 5.10 kernel with this patch applied: avg = ~23s Amazing! Note that all kernels are built with CONFIG_ARM64_PSEUDO_NMI not set. It seems the pseudo-NMI feature actually brings some overhead to performance even if CONFIG_ARM64_PSEUDO_NMI is not set. Furthermore, I find the feature also brings some overhead to vmlinux size. I build 5.10 kernel with this patch applied or not while CONFIG_ARM64_PSEUDO_NMI is not set. - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes. - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes. That means arm64 pseudo-NMI feature may bring ~200KB overhead to vmlinux size. Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size and performance even if config is not set. To avoid it, add macro controls around the related code. 
Signed-off-by: He Ying <heying24@huawei.com> --- arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++-- arch/arm64/kernel/entry.S | 4 ++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index b57b9b1e4344..82f771b41cf5 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -26,6 +26,7 @@ */ static inline void arch_local_irq_enable(void) { +#ifdef CONFIG_ARM64_PSEUDO_NMI if (system_has_prio_mask_debugging()) { u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void) : "memory"); pmr_sync(); +#else + asm volatile( + "msr daifclr, #3 // arch_local_irq_enable" + : + : + : "memory"); +#endif } static inline void arch_local_irq_disable(void) { +#ifdef CONFIG_ARM64_PSEUDO_NMI if (system_has_prio_mask_debugging()) { u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1); @@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void) : : "r" ((unsigned long) GIC_PRIO_IRQOFF) : "memory"); +#else + asm volatile( + "msr daifset, #3 // arch_local_irq_disable" + : + : + : "memory"); +#endif } /* @@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void) static inline unsigned long arch_local_save_flags(void) { unsigned long flags; - +#ifdef CONFIG_ARM64_PSEUDO_NMI asm volatile(ALTERNATIVE( "mrs %0, daif", __mrs_s("%0", SYS_ICC_PMR_EL1), @@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void) : "=&r" (flags) : : "memory"); - +#else + asm volatile( + "mrs %0, daif" + : "=r" (flags) + : + : "memory"); +#endif return flags; } static inline int arch_irqs_disabled_flags(unsigned long flags) { +#ifdef CONFIG_ARM64_PSEUDO_NMI int res; asm volatile(ALTERNATIVE( @@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags) : "memory"); return res; +#else + return flags & PSR_I_BIT; +#endif } static inline int arch_irqs_disabled(void) @@ -119,6 +145,7 @@ static 
inline unsigned long arch_local_irq_save(void) */ static inline void arch_local_irq_restore(unsigned long flags) { +#ifdef CONFIG_ARM64_PSEUDO_NMI asm volatile(ALTERNATIVE( "msr daif, %0", __msr_s(SYS_ICC_PMR_EL1, "%0"), @@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags) : "memory"); pmr_sync(); +#else + asm volatile( + "msr daif, %0" + : + : "r" (flags) + : "memory"); +#endif } #endif /* __ASM_IRQFLAGS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2f69ae43941d..ffc32d3d909a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -300,6 +300,7 @@ alternative_else_nop_endif str w21, [sp, #S_SYSCALLNO] .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI /* Save pmr */ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs_s x20, SYS_ICC_PMR_EL1 @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 alternative_else_nop_endif +#endif /* Re-enable tag checking (TCO set on exception entry) */ #ifdef CONFIG_ARM64_MTE @@ -330,6 +332,7 @@ alternative_else_nop_endif disable_daif .endif +#ifdef CONFIG_ARM64_PSEUDO_NMI /* Restore pmr */ alternative_if ARM64_HAS_IRQ_PRIO_MASKING ldr x20, [sp, #S_PMR_SAVE] @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING dsb sy // Ensure priority change is seen by redistributor .L__skip_pmr_sync\@: alternative_else_nop_endif +#endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR -- 2.17.1 _______________________________________________ linux-arm-kernel mailing list linux-arm-kernel@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next reply other threads:[~2022-01-07 8:55 UTC|newest] Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-01-07 8:55 He Ying [this message] 2022-01-07 8:55 ` [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code He Ying 2022-01-07 13:19 ` Mark Rutland 2022-01-07 13:19 ` Mark Rutland 2022-01-10 3:00 ` He Ying 2022-01-10 3:00 ` He Ying 2022-01-10 11:26 ` Mark Rutland 2022-01-10 11:26 ` Mark Rutland 2022-01-11 8:52 ` He Ying 2022-01-11 8:52 ` He Ying 2022-01-11 11:05 ` Mark Rutland 2022-01-11 11:05 ` Mark Rutland 2022-01-08 12:51 ` Marc Zyngier 2022-01-08 12:51 ` Marc Zyngier 2022-01-10 3:20 ` He Ying 2022-01-10 3:20 ` He Ying 2022-01-12 3:24 ` [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set He Ying 2022-01-12 3:24 ` He Ying 2022-01-19 6:40 ` He Ying 2022-01-19 6:40 ` He Ying 2022-01-19 9:35 ` Mark Rutland 2022-01-19 9:35 ` Mark Rutland 2022-01-19 9:47 ` He Ying 2022-01-19 9:47 ` He Ying 2022-02-15 23:18 ` Will Deacon 2022-02-15 23:18 ` Will Deacon
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220107085536.214501-1-heying24@huawei.com \ --to=heying24@huawei.com \ --cc=catalin.marinas@arm.com \ --cc=joey.gouly@arm.com \ --cc=linux-arm-kernel@lists.infradead.org \ --cc=linux-kernel@vger.kernel.org \ --cc=marcan@marcan.st \ --cc=mark.rutland@arm.com \ --cc=maz@kernel.org \ --cc=pcc@google.com \ --cc=will@kernel.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.