All of lore.kernel.org
 help / color / mirror / Atom feed
From: He Ying <heying24@huawei.com>
To: <catalin.marinas@arm.com>, <will@kernel.org>,
	<mark.rutland@arm.com>, <marcan@marcan.st>, <maz@kernel.org>,
	<joey.gouly@arm.com>, <pcc@google.com>
Cc: <linux-arm-kernel@lists.infradead.org>,
	<linux-kernel@vger.kernel.org>, <heying24@huawei.com>
Subject: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
Date: Fri, 7 Jan 2022 03:55:36 -0500	[thread overview]
Message-ID: <20220107085536.214501-1-heying24@huawei.com> (raw)

Our product has been updating its kernel from 4.4 to 5.10 recently and
found a performance issue. We run a business test called ARP test, which
measures the latency of ping-pong packet traffic with a certain payload.
The results are as follows.

 - 4.4 kernel: avg = ~20s
 - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

I have just been learning the arm64 pseudo-NMI code and have a question:
why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
I wonder if this brings some performance regression.

First, I make this patch and then do the test again. Here's the result.

 - 5.10 kernel with this patch not applied: avg = ~40s
 - 5.10 kernel with this patch applied: avg = ~23s

Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
set. It seems the pseudo-NMI feature actually brings some overhead to
performance event if CONFIG_ARM64_PSEUDO_NMI is not set.

Furthermore, I find the feature also brings some overhead to vmlinux size.
I build 5.10 kernel with this patch applied or not while
CONFIG_ARM64_PSEUDO_NMI is not set.

 - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
 - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.

That means arm64 pseudo-NMI feature may bring ~200KB overhead to
vmlinux size.

In summary, the arm64 pseudo-NMI feature brings some overhead to vmlinux
size and performance even if the config is not set. To avoid this, wrap
all the related code with the CONFIG_ARM64_PSEUDO_NMI macro.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
 arch/arm64/kernel/entry.S         |  4 ++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index b57b9b1e4344..82f771b41cf5 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -26,6 +26,7 @@
  */
 static inline void arch_local_irq_enable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daifclr, #3		// arch_local_irq_enable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 static inline void arch_local_irq_disable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void)
 		:
 		: "r" ((unsigned long) GIC_PRIO_IRQOFF)
 		: "memory");
+#else
+	asm volatile(
+		"msr	daifset, #3		// arch_local_irq_disable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 /*
@@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void)
 static inline unsigned long arch_local_save_flags(void)
 {
 	unsigned long flags;
-
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"mrs	%0, daif",
 		__mrs_s("%0", SYS_ICC_PMR_EL1),
@@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void)
 		: "=&r" (flags)
 		:
 		: "memory");
-
+#else
+	asm volatile(
+		"mrs	%0, daif"
+		: "=r" (flags)
+		:
+		: "memory");
+#endif
 	return flags;
 }
 
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	int res;
 
 	asm volatile(ALTERNATIVE(
@@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags)
 		: "memory");
 
 	return res;
+#else
+	return flags & PSR_I_BIT;
+#endif
 }
 
 static inline int arch_irqs_disabled(void)
@@ -119,6 +145,7 @@ static inline unsigned long arch_local_irq_save(void)
  */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"msr	daif, %0",
 		__msr_s(SYS_ICC_PMR_EL1, "%0"),
@@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daif, %0"
+		:
+		: "r" (flags)
+		: "memory");
+#endif
 }
 
 #endif /* __ASM_IRQFLAGS_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


WARNING: multiple messages have this Message-ID (diff)
From: He Ying <heying24@huawei.com>
To: <catalin.marinas@arm.com>, <will@kernel.org>,
	<mark.rutland@arm.com>, <marcan@marcan.st>, <maz@kernel.org>,
	<joey.gouly@arm.com>, <pcc@google.com>
Cc: <linux-arm-kernel@lists.infradead.org>,
	<linux-kernel@vger.kernel.org>, <heying24@huawei.com>
Subject: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
Date: Fri, 7 Jan 2022 03:55:36 -0500	[thread overview]
Message-ID: <20220107085536.214501-1-heying24@huawei.com> (raw)

Our product has been updating its kernel from 4.4 to 5.10 recently and
found a performance issue. We run a business test called ARP test, which
measures the latency of ping-pong packet traffic with a certain payload.
The results are as follows.

 - 4.4 kernel: avg = ~20s
 - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

I have just been learning the arm64 pseudo-NMI code and have a question:
why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
I wonder if this brings some performance regression.

First, I make this patch and then do the test again. Here's the result.

 - 5.10 kernel with this patch not applied: avg = ~40s
 - 5.10 kernel with this patch applied: avg = ~23s

Amazing! Note that the kernels are all built with CONFIG_ARM64_PSEUDO_NMI
not set. It seems the pseudo-NMI feature actually brings some overhead to
performance even if CONFIG_ARM64_PSEUDO_NMI is not set.

Furthermore, I find the feature also brings some overhead to vmlinux size.
I build 5.10 kernel with this patch applied or not while
CONFIG_ARM64_PSEUDO_NMI is not set.

 - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
 - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.

That means arm64 pseudo-NMI feature may bring ~200KB overhead to
vmlinux size.

In summary, the arm64 pseudo-NMI feature brings some overhead to vmlinux
size and performance even if the config is not set. To avoid this, wrap
all the related code with the CONFIG_ARM64_PSEUDO_NMI macro.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
 arch/arm64/kernel/entry.S         |  4 ++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index b57b9b1e4344..82f771b41cf5 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -26,6 +26,7 @@
  */
 static inline void arch_local_irq_enable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daifclr, #3		// arch_local_irq_enable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 static inline void arch_local_irq_disable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void)
 		:
 		: "r" ((unsigned long) GIC_PRIO_IRQOFF)
 		: "memory");
+#else
+	asm volatile(
+		"msr	daifset, #3		// arch_local_irq_disable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 /*
@@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void)
 static inline unsigned long arch_local_save_flags(void)
 {
 	unsigned long flags;
-
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"mrs	%0, daif",
 		__mrs_s("%0", SYS_ICC_PMR_EL1),
@@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void)
 		: "=&r" (flags)
 		:
 		: "memory");
-
+#else
+	asm volatile(
+		"mrs	%0, daif"
+		: "=r" (flags)
+		:
+		: "memory");
+#endif
 	return flags;
 }
 
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	int res;
 
 	asm volatile(ALTERNATIVE(
@@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags)
 		: "memory");
 
 	return res;
+#else
+	return flags & PSR_I_BIT;
+#endif
 }
 
 static inline int arch_irqs_disabled(void)
@@ -119,6 +145,7 @@ static inline unsigned long arch_local_irq_save(void)
  */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"msr	daif, %0",
 		__msr_s(SYS_ICC_PMR_EL1, "%0"),
@@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daif, %0"
+		:
+		: "r" (flags)
+		: "memory");
+#endif
 }
 
 #endif /* __ASM_IRQFLAGS_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

             reply	other threads:[~2022-01-07  8:55 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-07  8:55 He Ying [this message]
2022-01-07  8:55 ` [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code He Ying
2022-01-07 13:19 ` Mark Rutland
2022-01-07 13:19   ` Mark Rutland
2022-01-10  3:00   ` He Ying
2022-01-10  3:00     ` He Ying
2022-01-10 11:26     ` Mark Rutland
2022-01-10 11:26       ` Mark Rutland
2022-01-11  8:52       ` He Ying
2022-01-11  8:52         ` He Ying
2022-01-11 11:05         ` Mark Rutland
2022-01-11 11:05           ` Mark Rutland
2022-01-08 12:51 ` Marc Zyngier
2022-01-08 12:51   ` Marc Zyngier
2022-01-10  3:20   ` He Ying
2022-01-10  3:20     ` He Ying
2022-01-12  3:24 ` [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set He Ying
2022-01-12  3:24   ` He Ying
2022-01-19  6:40   ` He Ying
2022-01-19  6:40     ` He Ying
2022-01-19  9:35     ` Mark Rutland
2022-01-19  9:35       ` Mark Rutland
2022-01-19  9:47       ` He Ying
2022-01-19  9:47         ` He Ying
2022-02-15 23:18   ` Will Deacon
2022-02-15 23:18     ` Will Deacon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220107085536.214501-1-heying24@huawei.com \
    --to=heying24@huawei.com \
    --cc=catalin.marinas@arm.com \
    --cc=joey.gouly@arm.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=marcan@marcan.st \
    --cc=mark.rutland@arm.com \
    --cc=maz@kernel.org \
    --cc=pcc@google.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.