All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-07  8:55 ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-07  8:55 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel, heying24

Our product has been updating its kernel from 4.4 to 5.10 recently and
found a performance issue. We do a business test called ARP test, which
tests the latency for a ping-pong packets traffic with a certain payload.
The result is as follows.

 - 4.4 kernel: avg = ~20s
 - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

I have been just learning arm64 pseudo-NMI code and have a question,
why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
I wonder if this brings some performance regression.

First, I make this patch and then do the test again. Here's the result.

 - 5.10 kernel with this patch not applied: avg = ~40s
 - 5.10 kernel with this patch applied: avg = ~23s

Amazing! Note that all kernels are built with CONFIG_ARM64_PSEUDO_NMI not
set. It seems the pseudo-NMI feature actually brings some overhead to
performance even if CONFIG_ARM64_PSEUDO_NMI is not set.

Furthermore, I find the feature also brings some overhead to vmlinux size.
I build 5.10 kernel with this patch applied or not while
CONFIG_ARM64_PSEUDO_NMI is not set.

 - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
 - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.

That means arm64 pseudo-NMI feature may bring ~200KB overhead to
vmlinux size.

Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
and performance even if config is not set. To avoid it, add macro control
all around the related code.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
 arch/arm64/kernel/entry.S         |  4 ++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index b57b9b1e4344..82f771b41cf5 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -26,6 +26,7 @@
  */
 static inline void arch_local_irq_enable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daifclr, #3		// arch_local_irq_enable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 static inline void arch_local_irq_disable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void)
 		:
 		: "r" ((unsigned long) GIC_PRIO_IRQOFF)
 		: "memory");
+#else
+	asm volatile(
+		"msr	daifset, #3		// arch_local_irq_disable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 /*
@@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void)
 static inline unsigned long arch_local_save_flags(void)
 {
 	unsigned long flags;
-
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"mrs	%0, daif",
 		__mrs_s("%0", SYS_ICC_PMR_EL1),
@@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void)
 		: "=&r" (flags)
 		:
 		: "memory");
-
+#else
+	asm volatile(
+		"mrs	%0, daif"
+		: "=r" (flags)
+		:
+		: "memory");
+#endif
 	return flags;
 }
 
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	int res;
 
 	asm volatile(ALTERNATIVE(
@@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags)
 		: "memory");
 
 	return res;
+#else
+	return flags & PSR_I_BIT;
+#endif
 }
 
 static inline int arch_irqs_disabled(void)
@@ -119,6 +145,7 @@ static inline unsigned long arch_local_irq_save(void)
  */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"msr	daif, %0",
 		__msr_s(SYS_ICC_PMR_EL1, "%0"),
@@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daif, %0"
+		:
+		: "r" (flags)
+		: "memory");
+#endif
 }
 
 #endif /* __ASM_IRQFLAGS_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-07  8:55 ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-07  8:55 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel, heying24

Our product has been updating its kernel from 4.4 to 5.10 recently and
found a performance issue. We do a business test called ARP test, which
tests the latency for a ping-pong packets traffic with a certain payload.
The result is as follows.

 - 4.4 kernel: avg = ~20s
 - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

I have been just learning arm64 pseudo-NMI code and have a question,
why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
I wonder if this brings some performance regression.

First, I make this patch and then do the test again. Here's the result.

 - 5.10 kernel with this patch not applied: avg = ~40s
 - 5.10 kernel with this patch applied: avg = ~23s

Amazing! Note that all kernels are built with CONFIG_ARM64_PSEUDO_NMI not
set. It seems the pseudo-NMI feature actually brings some overhead to
performance even if CONFIG_ARM64_PSEUDO_NMI is not set.

Furthermore, I find the feature also brings some overhead to vmlinux size.
I build 5.10 kernel with this patch applied or not while
CONFIG_ARM64_PSEUDO_NMI is not set.

 - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
 - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.

That means arm64 pseudo-NMI feature may bring ~200KB overhead to
vmlinux size.

Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
and performance even if config is not set. To avoid it, add macro control
all around the related code.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
 arch/arm64/kernel/entry.S         |  4 ++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index b57b9b1e4344..82f771b41cf5 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -26,6 +26,7 @@
  */
 static inline void arch_local_irq_enable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daifclr, #3		// arch_local_irq_enable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 static inline void arch_local_irq_disable(void)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	if (system_has_prio_mask_debugging()) {
 		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
 
@@ -58,6 +67,13 @@ static inline void arch_local_irq_disable(void)
 		:
 		: "r" ((unsigned long) GIC_PRIO_IRQOFF)
 		: "memory");
+#else
+	asm volatile(
+		"msr	daifset, #3		// arch_local_irq_disable"
+		:
+		:
+		: "memory");
+#endif
 }
 
 /*
@@ -66,7 +82,7 @@ static inline void arch_local_irq_disable(void)
 static inline unsigned long arch_local_save_flags(void)
 {
 	unsigned long flags;
-
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"mrs	%0, daif",
 		__mrs_s("%0", SYS_ICC_PMR_EL1),
@@ -74,12 +90,19 @@ static inline unsigned long arch_local_save_flags(void)
 		: "=&r" (flags)
 		:
 		: "memory");
-
+#else
+	asm volatile(
+		"mrs	%0, daif"
+		: "=r" (flags)
+		:
+		: "memory");
+#endif
 	return flags;
 }
 
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	int res;
 
 	asm volatile(ALTERNATIVE(
@@ -91,6 +114,9 @@ static inline int arch_irqs_disabled_flags(unsigned long flags)
 		: "memory");
 
 	return res;
+#else
+	return flags & PSR_I_BIT;
+#endif
 }
 
 static inline int arch_irqs_disabled(void)
@@ -119,6 +145,7 @@ static inline unsigned long arch_local_irq_save(void)
  */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	asm volatile(ALTERNATIVE(
 		"msr	daif, %0",
 		__msr_s(SYS_ICC_PMR_EL1, "%0"),
@@ -128,6 +155,13 @@ static inline void arch_local_irq_restore(unsigned long flags)
 		: "memory");
 
 	pmr_sync();
+#else
+	asm volatile(
+		"msr	daif, %0"
+		:
+		: "r" (flags)
+		: "memory");
+#endif
 }
 
 #endif /* __ASM_IRQFLAGS_H */
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-07  8:55 ` He Ying
@ 2022-01-07 13:19   ` Mark Rutland
  -1 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-07 13:19 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
> Our product has been updating its kernel from 4.4 to 5.10 recently and
> found a performance issue. We do a bussiness test called ARP test, which
> tests the latency for a ping-pong packets traffic with a certain payload.
> The result is as following.
> 
>  - 4.4 kernel: avg = ~20s
>  - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

Have you tested with a recent mainline kernel, e.g. v5.15?

Is this test publicly available, and can you say which hardware (e.g. which
CPU implementation) you're testing with?

> I have been just learning arm64 pseudo-NMI code and have a question,
> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?

The code in question is all patched via alternatives, and when
CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
overhead of the regular DAIF manipulation.

> I wonder if this brings some performance regression.
> 
> First, I make this patch and then do the test again. Here's the result.
> 
>  - 5.10 kernel with this patch not applied: avg = ~40s
>  - 5.10 kernel with this patch applied: avg = ~23s
> 
> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> set. It seems the pseudo-NMI feature actually brings some overhead to
> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.

I'm surprised the overhead is so significant; as above this is all patched in
and so the overhead when this is disabled is expected to be *extremely* small.

For example, when CONFIG_ARM64_PSEUDO_NMI is not selected, in arch_local_irq_enable():

* The portion under the system_has_prio_mask_debugging() test will be removed
  entirely by the compiler, as this internally checks
  IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).

* The assembly will be left as a write to DAIFClr. The only additional cost
  should be that of generating GIC_PRIO_IRQON into a register.

* The pmr_sync() will be removed entirely by the compiler as is defined
  conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.

I can't spot an obvious issue with that or the other cases. In the common case
those add no new instructions, and in the worst case they only add NOPs.

> Furthermore, I find the feature also brings some overhead to vmlinux size.
> I build 5.10 kernel with this patch applied or not while
> CONFIG_ARM64_PSEUDO_NMI is not set.
> 
>  - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>  - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> 
> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> vmlinux size.

I suspect that's just the (unused) alternatives, and we could improve that by
passing the config into the alternative blocks.

> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> and performance even if config is not set. To avoid it, add macro control
> all around the related code.
> 
> Signed-off-by: He Ying <heying24@huawei.com>
> ---
>  arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>  arch/arm64/kernel/entry.S         |  4 ++++
>  2 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
> index b57b9b1e4344..82f771b41cf5 100644
> --- a/arch/arm64/include/asm/irqflags.h
> +++ b/arch/arm64/include/asm/irqflags.h
> @@ -26,6 +26,7 @@
>   */
>  static inline void arch_local_irq_enable(void)
>  {
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	if (system_has_prio_mask_debugging()) {
>  		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>  
> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>  		: "memory");
>  
>  	pmr_sync();
> +#else
> +	asm volatile(
> +		"msr	daifclr, #3		// arch_local_irq_enable"
> +		:
> +		:
> +		: "memory");
> +#endif

I'm happy to rework this to improve matters, but I am very much not happy with
duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
copies of that is not acceptable.

Instead, can you please try changing the alternative to also take the config,
e.g. here have:

|       asm volatile(ALTERNATIVE(
|               "msr    daifclr, #3             // arch_local_irq_enable",
|               __msr_s(SYS_ICC_PMR_EL1, "%0"),
|               ARM64_HAS_IRQ_PRIO_MASKING,
|               CONFIG_ARM64_PSEUDO_NMI)
|               :   
|               : "r" ((unsigned long) GIC_PRIO_IRQON)
|               : "memory");

... and see if that makes a significant difference?

Likewise for the other cases.

>  #endif /* __ASM_IRQFLAGS_H */
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2f69ae43941d..ffc32d3d909a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>  	str	w21, [sp, #S_SYSCALLNO]
>  	.endif
>  
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	/* Save pmr */
>  alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	mrs_s	x20, SYS_ICC_PMR_EL1
> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>  	msr_s	SYS_ICC_PMR_EL1, x20
>  alternative_else_nop_endif
> +#endif
>  
>  	/* Re-enable tag checking (TCO set on exception entry) */
>  #ifdef CONFIG_ARM64_MTE
> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>  	disable_daif
>  	.endif
>  
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	/* Restore pmr */
>  alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	ldr	x20, [sp, #S_PMR_SAVE]
> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	dsb	sy				// Ensure priority change is seen by redistributor
>  .L__skip_pmr_sync\@:
>  alternative_else_nop_endif
> +#endif

For these two I think the ifdeffery is fine, but I'm surprised this has a
measureable impact as the alternatives should be initialized to NOPS (and never
modified).

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-07 13:19   ` Mark Rutland
  0 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-07 13:19 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
> Our product has been updating its kernel from 4.4 to 5.10 recently and
> found a performance issue. We do a bussiness test called ARP test, which
> tests the latency for a ping-pong packets traffic with a certain payload.
> The result is as following.
> 
>  - 4.4 kernel: avg = ~20s
>  - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s

Have you tested with a recent mainline kernel, e.g. v5.15?

Is this test publicly available, and can you say which hardware (e.g. which
CPU implementation) you're testing with?

> I have been just learning arm64 pseudo-NMI code and have a question,
> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?

The code in question is all patched via alternatives, and when
CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
overhead of the regular DAIF manipulation.

> I wonder if this brings some performance regression.
> 
> First, I make this patch and then do the test again. Here's the result.
> 
>  - 5.10 kernel with this patch not applied: avg = ~40s
>  - 5.10 kernel with this patch applied: avg = ~23s
> 
> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> set. It seems the pseudo-NMI feature actually brings some overhead to
> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.

I'm surprised the overhead is so significant; as above this is all patched in
and so the overhead when this is disabled is expected to be *extremely* small.

For example, when CONFIG_ARM64_PSEUDO_NMI is not selected, in arch_local_irq_enable():

* The portion under the system_has_prio_mask_debugging() test will be removed
  entirely by the compiler, as this internally checks
  IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).

* The assembly will be left as a write to DAIFClr. The only additional cost
  should be that of generating GIC_PRIO_IRQON into a register.

* The pmr_sync() will be removed entirely by the compiler as is defined
  conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.

I can't spot an obvious issue with that or the other cases. In the common case
those add no new instructions, and in the worst case they only add NOPs.

> Furthermore, I find the feature also brings some overhead to vmlinux size.
> I build 5.10 kernel with this patch applied or not while
> CONFIG_ARM64_PSEUDO_NMI is not set.
> 
>  - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>  - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> 
> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> vmlinux size.

I suspect that's just the (unused) alternatives, and we could improve that by
passing the config into the alternative blocks.

> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> and performance even if config is not set. To avoid it, add macro control
> all around the related code.
> 
> Signed-off-by: He Ying <heying24@huawei.com>
> ---
>  arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>  arch/arm64/kernel/entry.S         |  4 ++++
>  2 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
> index b57b9b1e4344..82f771b41cf5 100644
> --- a/arch/arm64/include/asm/irqflags.h
> +++ b/arch/arm64/include/asm/irqflags.h
> @@ -26,6 +26,7 @@
>   */
>  static inline void arch_local_irq_enable(void)
>  {
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	if (system_has_prio_mask_debugging()) {
>  		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>  
> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>  		: "memory");
>  
>  	pmr_sync();
> +#else
> +	asm volatile(
> +		"msr	daifclr, #3		// arch_local_irq_enable"
> +		:
> +		:
> +		: "memory");
> +#endif

I'm happy to rework this to improve matters, but I am very much not happy with
duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
copies of that is not acceptable.

Instead, can you please try changing the alternative to also take the config,
e.g. here have:

|       asm volatile(ALTERNATIVE(
|               "msr    daifclr, #3             // arch_local_irq_enable",
|               __msr_s(SYS_ICC_PMR_EL1, "%0"),
|               ARM64_HAS_IRQ_PRIO_MASKING,
|               CONFIG_ARM64_PSEUDO_NMI)
|               :   
|               : "r" ((unsigned long) GIC_PRIO_IRQON)
|               : "memory");

... and see if that makes a significant difference?

Likewise for the other cases.

>  #endif /* __ASM_IRQFLAGS_H */
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2f69ae43941d..ffc32d3d909a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>  	str	w21, [sp, #S_SYSCALLNO]
>  	.endif
>  
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	/* Save pmr */
>  alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	mrs_s	x20, SYS_ICC_PMR_EL1
> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>  	msr_s	SYS_ICC_PMR_EL1, x20
>  alternative_else_nop_endif
> +#endif
>  
>  	/* Re-enable tag checking (TCO set on exception entry) */
>  #ifdef CONFIG_ARM64_MTE
> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>  	disable_daif
>  	.endif
>  
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>  	/* Restore pmr */
>  alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	ldr	x20, [sp, #S_PMR_SAVE]
> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>  	dsb	sy				// Ensure priority change is seen by redistributor
>  .L__skip_pmr_sync\@:
>  alternative_else_nop_endif
> +#endif

For these two I think the ifdeffery is fine, but I'm surprised this has a
measureable impact as the alternatives should be initialized to NOPS (and never
modified).

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-07  8:55 ` He Ying
@ 2022-01-08 12:51   ` Marc Zyngier
  -1 siblings, 0 replies; 26+ messages in thread
From: Marc Zyngier @ 2022-01-08 12:51 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, mark.rutland, marcan, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Fri, 07 Jan 2022 08:55:36 +0000,
He Ying <heying24@huawei.com> wrote:
> 
> Our product has been updating its kernel from 4.4 to 5.10 recently and
> found a performance issue. We do a bussiness test called ARP test, which
> tests the latency for a ping-pong packets traffic with a certain payload.
> The result is as following.
> 
>  - 4.4 kernel: avg = ~20s
>  - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> 
> I have been just learning arm64 pseudo-NMI code and have a question,
> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> I wonder if this brings some performance regression.
> 
> First, I make this patch and then do the test again. Here's the result.
> 
>  - 5.10 kernel with this patch not applied: avg = ~40s
>  - 5.10 kernel with this patch applied: avg = ~23s
> 
> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> set. It seems the pseudo-NMI feature actually brings some overhead to
> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
> 
> Furthermore, I find the feature also brings some overhead to vmlinux size.
> I build 5.10 kernel with this patch applied or not while
> CONFIG_ARM64_PSEUDO_NMI is not set.
> 
>  - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>  - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> 
> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> vmlinux size.
> 
> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> and performance even if config is not set. To avoid it, add macro control
> all around the related code.

This obviously attracted my attention, and I took this patch for a
ride on 5.16-rc8 on a machine that doesn't support GICv3 NMIs to make
sure that any extra code would only result in pure overhead.

There was no measurable difference with this patch applied or not,
with CONFIG_ARM64_PSEUDO_NMI selected or not for the workloads I tried
(I/O heavy virtual machines, hackbench).

Mark already asked a number of questions (test case, implementation,
test on a modern kernel). Please provide as many detail as you
possibly can, because such a regression really isn't expected, and
doesn't show up on the systems I have at hand. Some profiling numbers
could also be interesting, in case this is a result of a particular
resource being thrashed (TLB, cache...).

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-08 12:51   ` Marc Zyngier
  0 siblings, 0 replies; 26+ messages in thread
From: Marc Zyngier @ 2022-01-08 12:51 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, mark.rutland, marcan, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Fri, 07 Jan 2022 08:55:36 +0000,
He Ying <heying24@huawei.com> wrote:
> 
> Our product has been updating its kernel from 4.4 to 5.10 recently and
> found a performance issue. We do a bussiness test called ARP test, which
> tests the latency for a ping-pong packets traffic with a certain payload.
> The result is as following.
> 
>  - 4.4 kernel: avg = ~20s
>  - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> 
> I have been just learning arm64 pseudo-NMI code and have a question,
> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> I wonder if this brings some performance regression.
> 
> First, I make this patch and then do the test again. Here's the result.
> 
>  - 5.10 kernel with this patch not applied: avg = ~40s
>  - 5.10 kernel with this patch applied: avg = ~23s
> 
> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> set. It seems the pseudo-NMI feature actually brings some overhead to
> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
> 
> Furthermore, I find the feature also brings some overhead to vmlinux size.
> I build 5.10 kernel with this patch applied or not while
> CONFIG_ARM64_PSEUDO_NMI is not set.
> 
>  - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>  - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> 
> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> vmlinux size.
> 
> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> and performance even if config is not set. To avoid it, add macro control
> all around the related code.

This obviously attracted my attention, and I took this patch for a
ride on 5.16-rc8 on a machine that doesn't support GICv3 NMIs to make
sure that any extra code would only result in pure overhead.

There was no measurable difference with this patch applied or not,
with CONFIG_ARM64_PSEUDO_NMI selected or not for the workloads I tried
(I/O heavy virtual machines, hackbench).

Mark already asked a number of questions (test case, implementation,
test on a modern kernel). Please provide as many detail as you
possibly can, because such a regression really isn't expected, and
doesn't show up on the systems I have at hand. Some profiling numbers
could also be interesting, in case this is a result of a particular
resource being thrashed (TLB, cache...).

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-07 13:19   ` Mark Rutland
@ 2022-01-10  3:00     ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-10  3:00 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Mark,

I'm just back from the weekend and sorry for the delayed reply.


在 2022/1/7 21:19, Mark Rutland 写道:
> On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>> found a performance issue. We do a bussiness test called ARP test, which
>> tests the latency for a ping-pong packets traffic with a certain payload.
>> The result is as following.
>>
>>   - 4.4 kernel: avg = ~20s
>>   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> Have you tested with a recent mainline kernel, e.g. v5.15?

Actually no, that's because this test is only available for the product 
environment and

we don't have an available 5.15 kernel for it yet.

>
> Is this test publicly available, and can you say which hardrware (e.g. which
> CPU implementation) you're testing with?

Actually no. The test is only available for our product environment now. 
We are testing

with hisilicon 1213 (4 ARM Cortex-A72 cores).

>
>> I have been just learning arm64 pseudo-NMI code and have a question,
>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> The code in question is all patched via alternatives, and when
> CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
> overhead of the regular DAIF manipulation.
I don't understand alternatives very well and I'd appreciate it if you 
can explain it a bit more.
>
>> I wonder if this brings some performance regression.
>>
>> First, I make this patch and then do the test again. Here's the result.
>>
>>   - 5.10 kernel with this patch not applied: avg = ~40s
>>   - 5.10 kernel with this patch applied: avg = ~23s
>>
>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>> set. It seems the pseudo-NMI feature actually brings some overhead to
>> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
> I'm surprised the overhead is so significant; as above this is all patched in
> and so the overhead when this is disabled is expected to be *extremely* small.
>
> For example, wjen CONFIG_ARM64_PSEUDO_NMI, in arch_local_irq_enable():
>
> * The portion under the system_has_prio_mask_debugging() test will be removed
>    entirely by the compiler, as this internally checks
>    IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
>
> * The assembly will be left as a write to DAIFClr. The only additional cost
>    should be that of generating GIC_PRIO_IRQON into a register.
>
> * The pmr_sync() will be removed entirely by the compiler as is defined
>    conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
>
> I can't spot an obvious issue with that or ther other cases. In the common case
> those add no new instructions, and in the worst case they only add NOPs.

Thanks for your detailed explanation! Actually I can't understand the 
result exactly.

I build two 5.10 kernel images with this patch applied or not and 
objdump them. Indeed,

the disassembles of 'arch_local_irq_restore' are the same. Do you have 
any ideas how

we can find the root cause why this patch improves the performance so much?


However, the test result is trustworthy because we do it many times and 
the result is

always repeatable.

>
>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>> I build 5.10 kernel with this patch applied or not while
>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>>   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>
>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>> vmlinux size.
> I suspect that's just the (unused) alternatives, and we could improve that by
> passing the config into the alternative blocks.

Do you mean the sections generated by the alternatives? I don't understand

alternatives very well and I'd appreciate it if you can explain it a bit 
more.

>
>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>> and performance even if config is not set. To avoid it, add macro control
>> all around the related code.
>>
>> Signed-off-by: He Ying <heying24@huawei.com>
>> ---
>>   arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>>   arch/arm64/kernel/entry.S         |  4 ++++
>>   2 files changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
>> index b57b9b1e4344..82f771b41cf5 100644
>> --- a/arch/arm64/include/asm/irqflags.h
>> +++ b/arch/arm64/include/asm/irqflags.h
>> @@ -26,6 +26,7 @@
>>    */
>>   static inline void arch_local_irq_enable(void)
>>   {
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	if (system_has_prio_mask_debugging()) {
>>   		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>>   
>> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>>   		: "memory");
>>   
>>   	pmr_sync();
>> +#else
>> +	asm volatile(
>> +		"msr	daifclr, #3		// arch_local_irq_enable"
>> +		:
>> +		:
>> +		: "memory");
>> +#endif
> I'm happy to rework this to improve matters, but I am very much not happy with
> duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
> copies of that is not acceptable.
I agree. Adding these ifdeffery is a bit ugly. Let's see if there are 
some better ways.
>
> Instead, can you please try changing the alternative to also take the config,
> e.g. here have:
>
> |       asm volatile(ALTERNATIVE(
> |               "msr    daifclr, #3             // arch_local_irq_enable",
> |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
> |               ARM64_HAS_IRQ_PRIO_MASKING,
> |               CONFIG_ARM64_PSEUDO_NMI)
> |               :
> |               : "r" ((unsigned long) GIC_PRIO_IRQON)
> |               : "memory");
>
> ... and see if that makes a significant difference?
>
> Likewise for the other casees.

OK, I'll try it. But I have some questions. Here's the comment of 
ALTERNATIVE:

/*
  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
  *
  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
  *      will be omitted, including oldinstr.
  */

If CONFIG_FOO is not selected, the whole block will be omitted including 
oldinstr.

But we still want the oldinstr in this situation. Do I misunderstand 
something?

>
>>   #endif /* __ASM_IRQFLAGS_H */
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 2f69ae43941d..ffc32d3d909a 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>>   	str	w21, [sp, #S_SYSCALLNO]
>>   	.endif
>>   
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	/* Save pmr */
>>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	mrs_s	x20, SYS_ICC_PMR_EL1
>> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>>   	msr_s	SYS_ICC_PMR_EL1, x20
>>   alternative_else_nop_endif
>> +#endif
>>   
>>   	/* Re-enable tag checking (TCO set on exception entry) */
>>   #ifdef CONFIG_ARM64_MTE
>> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>>   	disable_daif
>>   	.endif
>>   
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	/* Restore pmr */
>>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	ldr	x20, [sp, #S_PMR_SAVE]
>> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	dsb	sy				// Ensure priority change is seen by redistributor
>>   .L__skip_pmr_sync\@:
>>   alternative_else_nop_endif
>> +#endif
> For these two I think the ifdeffery is fine, but I'm surprised this has a
> measureable impact as the alternatives should be initialized to NOPS (and never
> modified).

Yes, these NOPs may bring some overhead. But I can't say how much these 
NOPs contribute to

the result of test.

>
> Thanks,
> Mark.
> .

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-10  3:00     ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-10  3:00 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Mark,

I'm just back from the weekend and sorry for the delayed reply.


在 2022/1/7 21:19, Mark Rutland 写道:
> On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>> found a performance issue. We do a bussiness test called ARP test, which
>> tests the latency for a ping-pong packets traffic with a certain payload.
>> The result is as following.
>>
>>   - 4.4 kernel: avg = ~20s
>>   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> Have you tested with a recent mainline kernel, e.g. v5.15?

Actually no, that's because this test is only available for the product 
environment and

we don't have an available 5.15 kernel for it yet.

>
> Is this test publicly available, and can you say which hardrware (e.g. which
> CPU implementation) you're testing with?

Actually no. The test is only available for our product environment now. 
We are testing

with hisilicon 1213 (4 ARM Cortex-A72 cores).

>
>> I have been just learning arm64 pseudo-NMI code and have a question,
>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> The code in question is all patched via alternatives, and when
> CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
> overhead of the regular DAIF manipulation.
I don't understand alternatives very well and I'd appreciate it if you 
can explain it a bit more.
>
>> I wonder if this brings some performance regression.
>>
>> First, I make this patch and then do the test again. Here's the result.
>>
>>   - 5.10 kernel with this patch not applied: avg = ~40s
>>   - 5.10 kernel with this patch applied: avg = ~23s
>>
>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>> set. It seems the pseudo-NMI feature actually brings some overhead to
>> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
> I'm surprised the overhead is so significant; as above this is all patched in
> and so the overhead when this is disabled is expected to be *extremely* small.
>
> For example, wjen CONFIG_ARM64_PSEUDO_NMI, in arch_local_irq_enable():
>
> * The portion under the system_has_prio_mask_debugging() test will be removed
>    entirely by the compiler, as this internally checks
>    IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
>
> * The assembly will be left as a write to DAIFClr. The only additional cost
>    should be that of generating GIC_PRIO_IRQON into a register.
>
> * The pmr_sync() will be removed entirely by the compiler as is defined
>    conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
>
> I can't spot an obvious issue with that or ther other cases. In the common case
> those add no new instructions, and in the worst case they only add NOPs.

Thanks for your detailed explanation! Actually I can't understand the 
result exactly.

I build two 5.10 kernel images with this patch applied or not and 
objdump them. Indeed,

the disassembles of 'arch_local_irq_restore' are the same. Do you have 
any ideas how

we can find the root cause why this patch improves the performance so much?


However, the test result is trustworthy because we do it many times and 
the result is

always repeatable.

>
>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>> I build 5.10 kernel with this patch applied or not while
>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>>   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>
>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>> vmlinux size.
> I suspect that's just the (unused) alternatives, and we could improve that by
> passing the config into the alternative blocks.

Do you mean the sections generated by the alternatives? I don't understand

alternatives very well and I'd appreciate it if you can explain it a bit 
more.

>
>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>> and performance even if config is not set. To avoid it, add macro control
>> all around the related code.
>>
>> Signed-off-by: He Ying <heying24@huawei.com>
>> ---
>>   arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>>   arch/arm64/kernel/entry.S         |  4 ++++
>>   2 files changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
>> index b57b9b1e4344..82f771b41cf5 100644
>> --- a/arch/arm64/include/asm/irqflags.h
>> +++ b/arch/arm64/include/asm/irqflags.h
>> @@ -26,6 +26,7 @@
>>    */
>>   static inline void arch_local_irq_enable(void)
>>   {
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	if (system_has_prio_mask_debugging()) {
>>   		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>>   
>> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>>   		: "memory");
>>   
>>   	pmr_sync();
>> +#else
>> +	asm volatile(
>> +		"msr	daifclr, #3		// arch_local_irq_enable"
>> +		:
>> +		:
>> +		: "memory");
>> +#endif
> I'm happy to rework this to improve matters, but I am very much not happy with
> duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
> copies of that is not acceptable.
I agree. Adding these ifdeffery is a bit ugly. Let's see if there are 
some better ways.
>
> Instead, can you please try changing the alternative to also take the config,
> e.g. here have:
>
> |       asm volatile(ALTERNATIVE(
> |               "msr    daifclr, #3             // arch_local_irq_enable",
> |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
> |               ARM64_HAS_IRQ_PRIO_MASKING,
> |               CONFIG_ARM64_PSEUDO_NMI)
> |               :
> |               : "r" ((unsigned long) GIC_PRIO_IRQON)
> |               : "memory");
>
> ... and see if that makes a significant difference?
>
> Likewise for the other casees.

OK, I'll try it. But I have some questions. Here's the comment of 
ALTERNATIVE:

/*
  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
  *
  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
  *      will be omitted, including oldinstr.
  */

If CONFIG_FOO is not selected, the whole block will be omitted including 
oldinstr.

But we still want the oldinstr in this situation. Do I misunderstand 
something?

>
>>   #endif /* __ASM_IRQFLAGS_H */
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 2f69ae43941d..ffc32d3d909a 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>>   	str	w21, [sp, #S_SYSCALLNO]
>>   	.endif
>>   
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	/* Save pmr */
>>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	mrs_s	x20, SYS_ICC_PMR_EL1
>> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>>   	msr_s	SYS_ICC_PMR_EL1, x20
>>   alternative_else_nop_endif
>> +#endif
>>   
>>   	/* Re-enable tag checking (TCO set on exception entry) */
>>   #ifdef CONFIG_ARM64_MTE
>> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>>   	disable_daif
>>   	.endif
>>   
>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>   	/* Restore pmr */
>>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	ldr	x20, [sp, #S_PMR_SAVE]
>> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>   	dsb	sy				// Ensure priority change is seen by redistributor
>>   .L__skip_pmr_sync\@:
>>   alternative_else_nop_endif
>> +#endif
> For these two I think the ifdeffery is fine, but I'm surprised this has a
> measureable impact as the alternatives should be initialized to NOPS (and never
> modified).

Yes, these NOPs may bring some overhead. But I can't say how much these 
NOPs contribute to

the result of test.

>
> Thanks,
> Mark.
> .

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-08 12:51   ` Marc Zyngier
@ 2022-01-10  3:20     ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-10  3:20 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: catalin.marinas, will, mark.rutland, marcan, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Marc,

I'm just back from the weekend and sorry for the delayed reply.


在 2022/1/8 20:51, Marc Zyngier 写道:
> On Fri, 07 Jan 2022 08:55:36 +0000,
> He Ying <heying24@huawei.com> wrote:
>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>> found a performance issue. We do a bussiness test called ARP test, which
>> tests the latency for a ping-pong packets traffic with a certain payload.
>> The result is as following.
>>
>>   - 4.4 kernel: avg = ~20s
>>   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
>>
>> I have been just learning arm64 pseudo-NMI code and have a question,
>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
>> I wonder if this brings some performance regression.
>>
>> First, I make this patch and then do the test again. Here's the result.
>>
>>   - 5.10 kernel with this patch not applied: avg = ~40s
>>   - 5.10 kernel with this patch applied: avg = ~23s
>>
>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>> set. It seems the pseudo-NMI feature actually brings some overhead to
>> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>> I build 5.10 kernel with this patch applied or not while
>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>>   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>
>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>> vmlinux size.
>>
>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>> and performance even if config is not set. To avoid it, add macro control
>> all around the related code.
> This obviously attracted my attention, and I took this patch for a
> ride on 5.16-rc8 on a machine that doesn't support GICv3 NMIs to make
> sure that any extra code would only result in pure overhead.
>
> There was no measurable difference with this patch applied or not,
> with CONFIG_ARM64_PSEUDO_NMI selected or not for the workloads I tried
> (I/O heavy virtual machines, hackbench).
Our test is some kind of network test.
>
> Mark already asked a number of questions (test case, implementation,
> test on a modern kernel). Please provide as many detail as you
> possibly can, because such a regression really isn't expected, and
> doesn't show up on the systems I have at hand. Some profiling numbers
> could also be interesting, in case this is a result of a particular
> resource being thrashed (TLB, cache...).

I replied to Mark a few moments ago and provided as many details as I can.

You mentioned TLB and cache could be thrashed. How can we check this?

By using perf tools?

>
> Thanks,
>
> 	M.
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-10  3:20     ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-10  3:20 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: catalin.marinas, will, mark.rutland, marcan, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Marc,

I'm just back from the weekend and sorry for the delayed reply.


在 2022/1/8 20:51, Marc Zyngier 写道:
> On Fri, 07 Jan 2022 08:55:36 +0000,
> He Ying <heying24@huawei.com> wrote:
>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>> found a performance issue. We do a bussiness test called ARP test, which
>> tests the latency for a ping-pong packets traffic with a certain payload.
>> The result is as following.
>>
>>   - 4.4 kernel: avg = ~20s
>>   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
>>
>> I have been just learning arm64 pseudo-NMI code and have a question,
>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
>> I wonder if this brings some performance regression.
>>
>> First, I make this patch and then do the test again. Here's the result.
>>
>>   - 5.10 kernel with this patch not applied: avg = ~40s
>>   - 5.10 kernel with this patch applied: avg = ~23s
>>
>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>> set. It seems the pseudo-NMI feature actually brings some overhead to
>> performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>> I build 5.10 kernel with this patch applied or not while
>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>
>>   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>
>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>> vmlinux size.
>>
>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>> and performance even if config is not set. To avoid it, add macro control
>> all around the related code.
> This obviously attracted my attention, and I took this patch for a
> ride on 5.16-rc8 on a machine that doesn't support GICv3 NMIs to make
> sure that any extra code would only result in pure overhead.
>
> There was no measurable difference with this patch applied or not,
> with CONFIG_ARM64_PSEUDO_NMI selected or not for the workloads I tried
> (I/O heavy virtual machines, hackbench).
Our test is some kind of network test.
>
> Mark already asked a number of questions (test case, implementation,
> test on a modern kernel). Please provide as many detail as you
> possibly can, because such a regression really isn't expected, and
> doesn't show up on the systems I have at hand. Some profiling numbers
> could also be interesting, in case this is a result of a particular
> resource being thrashed (TLB, cache...).

I replied to Mark a few moments ago and provided as many details as I can.

You mentioned TLB and cache could be thrashed. How can we check this?

By using perf tools?

>
> Thanks,
>
> 	M.
>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-10  3:00     ` He Ying
@ 2022-01-10 11:26       ` Mark Rutland
  -1 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-10 11:26 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
> Hi Mark,
> 
> I'm just back from the weekend and sorry for the delayed reply.
> 
> 
> 在 2022/1/7 21:19, Mark Rutland 写道:
> > On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
> > > Our product has been updating its kernel from 4.4 to 5.10 recently and
> > > found a performance issue. We do a bussiness test called ARP test, which
> > > tests the latency for a ping-pong packets traffic with a certain payload.
> > > The result is as following.
> > > 
> > >   - 4.4 kernel: avg = ~20s
> > >   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> > Have you tested with a recent mainline kernel, e.g. v5.15?
> 
Actually no, that's because this test is only available for the product
environment and
> 
> we don't have an available 5.15 kernel for it yet.

Ok; do you see anything comparable with any tests available to upstream
developers? e.g. hackbench or `perf bench sched` ?

> > Is this test publicly available, and can you say which hardrware (e.g. which
> > CPU implementation) you're testing with?
> 
> Actually no. The test is only available for our product environment now. We
> are testing
> 
> with hisilicon 1213 (4 ARM Cortex-A72 cores).

Thanks for the CPU info; there are a number of other Cortex-A72 platforms out
there, so it might be possible to reproduce the behaviour elsewhere.

> > > I have been just learning arm64 pseudo-NMI code and have a question,
> > > why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> > The code in question is all patched via alternatives, and when
> > CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
> > overhead of the regular DAIF manipulation.
I don't understand alternatives very well and I'd appreciate it if you can
explain it a bit more.

Code using alternatives is patched at runtime, so for:

	ALTERNATIVE(x, y, cap)

... the `x` instructions will be inline in the code, and the `y` instructions
will be placed in a separate linker section. If the `cap` capability is
detected, the `y` instructions will be copied over the `x` instructions.

So for:

| asm volatile(ALTERNATIVE(
|         "msr    daifclr, #3             // arch_local_irq_enable",
|         __msr_s(SYS_ICC_PMR_EL1, "%0"),
|         ARM64_HAS_IRQ_PRIO_MASKING)
|         :
|         : "r" ((unsigned long) GIC_PRIO_IRQON)
|         : "memory");

... where ARM64_HAS_IRQ_PRIO_MASKING is not detected, the inline instructions
will be:

| msr    daifclr, #3             // arch_local_irq_enable 

... and a separate linker section will contain:

| __msr_s(SYS_ICC_PMR_EL1, "%0"

... and some metadata will be recorded in a `struct alt_instr`, which will be
unused and will take some space, but should have no runtime overhead -- the
runtime cost should only be the `msr daifclr, #3` instruction.

> > > I wonder if this brings some performance regression.
> > > 
> > > First, I make this patch and then do the test again. Here's the result.
> > > 
> > >   - 5.10 kernel with this patch not applied: avg = ~40s
> > >   - 5.10 kernel with this patch applied: avg = ~23s
> > > 
> > > Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> > > set. It seems the pseudo-NMI feature actually brings some overhead to
> > > performance event if CONFIG_ARM64_PSEUDO_NMI is not set.
> > I'm surprised the overhead is so significant; as above this is all patched in
> > and so the overhead when this is disabled is expected to be *extremely* small.
> > 
> > For example, wjen CONFIG_ARM64_PSEUDO_NMI, in arch_local_irq_enable():
> > 
> > * The portion under the system_has_prio_mask_debugging() test will be removed
> >    entirely by the compiler, as this internally checks
> >    IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
> > 
> > * The assembly will be left as a write to DAIFClr. The only additional cost
> >    should be that of generating GIC_PRIO_IRQON into a register.
> > 
> > * The pmr_sync() will be removed entirely by the compiler as is defined
> >    conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
> > 
> > I can't spot an obvious issue with that or ther other cases. In the common case
> > those add no new instructions, and in the worst case they only add NOPs.
> 
Thanks for your detailed explanation! Actually I can't understand the
result exactly.
> 
> I build two 5.10 kernel images with this patch applied or not and objdump
> them. Indeed, the disassembles of 'arch_local_irq_restore' are the same. Do
> you have any ideas how we can find the root cause why this patch improves the
> performance so much?
> 
> However, the test result is trustworthy because we do it many times and the
> result is always repeatable.

Due to the large numbers, I suspect this must be due to a specific fast-path,
and it's possible that this is due to secondary factors (e.g. alignment of
code) rather than the pseudo-NMI code itself.

We need to narrow down *where* time is being spent. Since it appears that this
is related to the local IRQ state management, it isn't likely that we can
determine that reliably with the PMU. Given that, I think the first step is to
reproduce the result elsewhere, for which we will need some publicly available
test-case.

> > > Furthermore, I find the feature also brings some overhead to vmlinux size.
> > > I build 5.10 kernel with this patch applied or not while
> > > CONFIG_ARM64_PSEUDO_NMI is not set.
> > > 
> > >   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
> > >   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> > > 
> > > That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> > > vmlinux size.
> > I suspect that's just the (unused) alternatives, and we could improve that by
> > passing the config into the alternative blocks.
> 
> Do you mean the sections generated by the alternatives? I don't understand
> alternatives very well and I'd appreciate it if you can explain it a bit
> more.

Yes; I meant the sections generated to hold the alternatives code and the
alternatives metadata.


> > > Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> > > and performance even if config is not set. To avoid it, add macro control
> > > all around the related code.
> > > 
> > > Signed-off-by: He Ying <heying24@huawei.com>
> > > ---
> > >   arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
> > >   arch/arm64/kernel/entry.S         |  4 ++++
> > >   2 files changed, 40 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
> > > index b57b9b1e4344..82f771b41cf5 100644
> > > --- a/arch/arm64/include/asm/irqflags.h
> > > +++ b/arch/arm64/include/asm/irqflags.h
> > > @@ -26,6 +26,7 @@
> > >    */
> > >   static inline void arch_local_irq_enable(void)
> > >   {
> > > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> > >   	if (system_has_prio_mask_debugging()) {
> > >   		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
> > > @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
> > >   		: "memory");
> > >   	pmr_sync();
> > > +#else
> > > +	asm volatile(
> > > +		"msr	daifclr, #3		// arch_local_irq_enable"
> > > +		:
> > > +		:
> > > +		: "memory");
> > > +#endif
> > I'm happy to rework this to improve matters, but I am very much not happy with
> > duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
> > copies of that is not acceptable.
> I agree. Adding these ifdeffery is a bit ugly. Let's see if there are some
> better ways.
> > 
> > Instead, can you please try changing the alternative to also take the config,
> > e.g. here have:
> > 
> > |       asm volatile(ALTERNATIVE(
> > |               "msr    daifclr, #3             // arch_local_irq_enable",
> > |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
> > |               ARM64_HAS_IRQ_PRIO_MASKING,
> > |               CONFIG_ARM64_PSEUDO_NMI)
> > |               :
> > |               : "r" ((unsigned long) GIC_PRIO_IRQON)
> > |               : "memory");
> > 
> > ... and see if that makes a significant difference?
> > 
> > Likewise for the other casees.
> 
> OK, I'll try it. But I have some questions. Here's the comment of
> ALTERNATIVE:
> 
> /*
>  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
>  *
>  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
>  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
>  *      will be omitted, including oldinstr.
>  */
> 
> If CONFIG_FOO is not selected, the whole block will be omitted including
> oldinstr.
> 
> But we still want the oldinstr in this situation. Do I misunderstand
> something?

Sorry; you are right, and my suggestion was broken.

I'll need to have a think about this; we might be able to rework this to use a
static key instead, but IIRC last time I tried there were issues with that
approach. I have some (old) work-in-progress patches at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/daif-rework

I suspect I won't have time to renew that in the near future, but an approach
like that might be worthwhile.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-10 11:26       ` Mark Rutland
  0 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-10 11:26 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
> Hi Mark,
> 
> I'm just back from the weekend and sorry for the delayed reply.
> 
> 
> 在 2022/1/7 21:19, Mark Rutland 写道:
> > On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
> > > Our product has been updating its kernel from 4.4 to 5.10 recently and
> > > found a performance issue. We do a business test called ARP test, which
> > > tests the latency for a ping-pong packets traffic with a certain payload.
> > > The result is as following.
> > > 
> > >   - 4.4 kernel: avg = ~20s
> > >   - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
> > Have you tested with a recent mainline kernel, e.g. v5.15?
> 
> Actually no, that's because this test is only available for the product
> environment and
> 
> we don't have an available 5.15 kernel for it yet.

Ok; do you see anything comparable with any tests available to upstream
developers? e.g. hackbench or `perf bench sched` ?

> > Is this test publicly available, and can you say which hardware (e.g. which
> > CPU implementation) you're testing with?
> 
> Actually no. The test is only available for our product environment now. We
> are testing
> 
> with hisilicon 1213 (4 ARM Cortex-A72 cores).

Thanks for the CPU info; there are a number of other Cortex-A72 platforms out
there, so it might be possible to reproduce the behaviour elsewhere.

> > > I have been just learning arm64 pseudo-NMI code and have a question,
> > > why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
> > The code in question is all patched via alternatives, and when
> > CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
> > overhead of the regular DAIF manipulation.
> I don't understand alternatives very well and I'll appreciate it if you can
> explain it a bit more.

Code using alternatives is patched at runtime, so for:

	ALTERNATIVE(x, y, cap)

... the `x` instructions will be inline in the code, and the `y` instructions
will be placed in a separate linker section. If the `cap` capability is
detected, the `y` instructions will be copied over the `x` instructions.

So for:

| asm volatile(ALTERNATIVE(
|         "msr    daifclr, #3             // arch_local_irq_enable",
|         __msr_s(SYS_ICC_PMR_EL1, "%0"),
|         ARM64_HAS_IRQ_PRIO_MASKING)
|         :
|         : "r" ((unsigned long) GIC_PRIO_IRQON)
|         : "memory");

... where ARM64_HAS_IRQ_PRIO_MASKING is not detected, the inline instructions
will be:

| msr    daifclr, #3             // arch_local_irq_enable 

... and a separate linker section will contain:

| __msr_s(SYS_ICC_PMR_EL1, "%0")

... and some metadata will be recorded in a `struct alt_instr`, which will be
unused and will take some space, but should have no runtime overhead -- the
runtime cost should only be the `msr daifclr, #3` instruction.

> > > I wonder if this brings some performance regression.
> > > 
> > > First, I make this patch and then do the test again. Here's the result.
> > > 
> > >   - 5.10 kernel with this patch not applied: avg = ~40s
> > >   - 5.10 kernel with this patch applied: avg = ~23s
> > > 
> > > Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
> > > set. It seems the pseudo-NMI feature actually brings some overhead to
> > > performance even if CONFIG_ARM64_PSEUDO_NMI is not set.
> > I'm surprised the overhead is so significant; as above this is all patched in
> > and so the overhead when this is disabled is expected to be *extremely* small.
> > 
> > For example, when CONFIG_ARM64_PSEUDO_NMI is not set, in arch_local_irq_enable():
> > 
> > * The portion under the system_has_prio_mask_debugging() test will be removed
> >    entirely by the compiler, as this internally checks
> >    IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
> > 
> > * The assembly will be left as a write to DAIFClr. The only additional cost
> >    should be that of generating GIC_PRIO_IRQON into a register.
> > 
> > * The pmr_sync() will be removed entirely by the compiler as is defined
> >    conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
> > 
> > I can't spot an obvious issue with that or the other cases. In the common case
> > those add no new instructions, and in the worst case they only add NOPs.
> 
> Thanks for your detailed explanation! Actually I can't understand the
> result exactly.
> 
> I build two 5.10 kernel images with this patch applied or not and objdump
> them. Indeed, the disassembles of 'arch_local_irq_restore' are the same. Do
> you have any ideas how we can find the root cause why this patch improves the
> performance so much?
> 
> However, the test result is trustworthy because we do it many times and the
> result is always repeatable.

Due to the large numbers, I suspect this must be due to a specific fast-path,
and it's possible that this is due to secondary factors (e.g. alignment of
code) rather than the pseudo-NMI code itself.

We need to narrow down *where* time is being spent. Since it appears that this
is related to the local IRQ state management, it isn't likely that we can
determine that reliably with the PMU. Given that, I think the first step is to
reproduce the result elsewhere, for which we will need some publicly available
test-case.

> > > Furthermore, I find the feature also brings some overhead to vmlinux size.
> > > I build 5.10 kernel with this patch applied or not while
> > > CONFIG_ARM64_PSEUDO_NMI is not set.
> > > 
> > >   - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
> > >   - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
> > > 
> > > That means arm64 pseudo-NMI feature may bring ~200KB overhead to
> > > vmlinux size.
> > I suspect that's just the (unused) alternatives, and we could improve that by
> > passing the config into the alternative blocks.
> 
> Do you mean the sections generated by the alternatives? I don't understand
> alternatives very well and I'll appreciate it if you can explain it a bit
> more.

Yes; I meant the sections generated to hold the alternatives code and the
alternatives metadata.


> > > Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
> > > and performance even if config is not set. To avoid it, add macro control
> > > all around the related code.
> > > 
> > > Signed-off-by: He Ying <heying24@huawei.com>
> > > ---
> > >   arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
> > >   arch/arm64/kernel/entry.S         |  4 ++++
> > >   2 files changed, 40 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
> > > index b57b9b1e4344..82f771b41cf5 100644
> > > --- a/arch/arm64/include/asm/irqflags.h
> > > +++ b/arch/arm64/include/asm/irqflags.h
> > > @@ -26,6 +26,7 @@
> > >    */
> > >   static inline void arch_local_irq_enable(void)
> > >   {
> > > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> > >   	if (system_has_prio_mask_debugging()) {
> > >   		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
> > > @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
> > >   		: "memory");
> > >   	pmr_sync();
> > > +#else
> > > +	asm volatile(
> > > +		"msr	daifclr, #3		// arch_local_irq_enable"
> > > +		:
> > > +		:
> > > +		: "memory");
> > > +#endif
> > I'm happy to rework this to improve matters, but I am very much not happy with
> > duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
> > copies of that is not acceptable.
> I agree. Adding these ifdeffery is a bit ugly. Let's see if there are some
> better ways.
> > 
> > Instead, can you please try changing the alternative to also take the config,
> > e.g. here have:
> > 
> > |       asm volatile(ALTERNATIVE(
> > |               "msr    daifclr, #3             // arch_local_irq_enable",
> > |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
> > |               ARM64_HAS_IRQ_PRIO_MASKING,
> > |               CONFIG_ARM64_PSEUDO_NMI)
> > |               :
> > |               : "r" ((unsigned long) GIC_PRIO_IRQON)
> > |               : "memory");
> > 
> > ... and see if that makes a significant difference?
> > 
> > Likewise for the other cases.
> 
> OK, I'll try it. But I have some questions. Here's the comment of
> ALTERNATIVE:
> 
> /*
>  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
>  *
>  * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
>  * N.B. If CONFIG_FOO is specified, but not selected, the whole block
>  *      will be omitted, including oldinstr.
>  */
> 
> If CONFIG_FOO is not selected, the whole block will be omitted including
> oldinstr.
> 
> But we still want the oldinstr in this situation. Do I misunderstand
> something?

Sorry; you are right, and my suggestion was broken.

I'll need to have a think about this; we might be able to rework this to use a
static key instead, but IIRC last time I tried there were issues with that
approach. I have some (old) work-in-progress patches at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/daif-rework

I suspect I won't have time to renew that in the near future, but an approach
like that might be worthwhile.

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-10 11:26       ` Mark Rutland
@ 2022-01-11  8:52         ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-11  8:52 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Mark,

在 2022/1/10 19:26, Mark Rutland 写道:
> On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
>> Hi Mark,
>>
>> I'm just back from the weekend and sorry for the delayed reply.
>>
>>
>> 在 2022/1/7 21:19, Mark Rutland 写道:
>>> On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
>>>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>>>> found a performance issue. We do a business test called ARP test, which
>>>> tests the latency for a ping-pong packets traffic with a certain payload.
>>>> The result is as following.
>>>>
>>>>    - 4.4 kernel: avg = ~20s
>>>>    - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
>>> Have you tested with a recent mainline kernel, e.g. v5.15?
>> Actually no, that's because this test is only available for the product
>> environment and
>>
>> we don't have an available 5.15 kernel for it yet.
> Ok; do you see anything comparable with any tests available to upstream
> developers? e.g. hackbench or `perf bench sched` ?

I'm not sure. The test is some kind of network test. And I don't know which

bench test is comparable.

>
>>> Is this test publicly available, and can you say which hardware (e.g. which
>>> CPU implementation) you're testing with?
>> Actually no. The test is only available for our product environment now. We
>> are testing
>>
>> with hisilicon 1213 (4 ARM Cortex-A72 cores).
> Thanks for the CPU info; there are a number of other Cortex-A72 platforms out
> there, so it might be possible to reproduce the behaviour elsewhere.

I asked our colleagues how they did the test. Actually it's a private 
test and

difficult to be reproduced elsewhere. But I got another important 
information.

See below.

>
>>>> I have been just learning arm64 pseudo-NMI code and have a question,
>>>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
>>> The code in question is all patched via alternatives, and when
>>> CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
>>> overhead of the regular DAIF manipulation.
>> I don't understand alternatives very well and I'll appreciate it if you can
>> explain it a bit more.
> Code using alternatives is patched at runtime, so for:
>
> 	ALTERNATIVE(x, y, cap)
>
> ... the `x` instructions will be inline in the code, and the `y` instructions
> will be placed in a separate linker section. If the `cap` capability is
> detected, the `y` instructions will be copied over the `x` instructions.
>
> So for:
>
> | asm volatile(ALTERNATIVE(
> |         "msr    daifclr, #3             // arch_local_irq_enable",
> |         __msr_s(SYS_ICC_PMR_EL1, "%0"),
> |         ARM64_HAS_IRQ_PRIO_MASKING)
> |         :
> |         : "r" ((unsigned long) GIC_PRIO_IRQON)
> |         : "memory");
>
> ... where ARM64_HAS_IRQ_PRIO_MASKING is not detected, the inline instructions
> will be:
>
> | msr    daifclr, #3             // arch_local_irq_enable
>
> ... and a separate linker section will contain:
>
> | __msr_s(SYS_ICC_PMR_EL1, "%0")
>
> ... and some metadata will be recorded in a `struct alt_instr`, which will be
> unused and will take some space, but should have no runtime overhead -- the
> runtime cost should only be the `msr daifclr, #3` instruction.

Thank you very much for the detailed explanation. I think I understand 
it better.

I agree that these ALTERNATIVEs might only bring some additional space but no

runtime overhead.

>
>>>> I wonder if this brings some performance regression.
>>>>
>>>> First, I make this patch and then do the test again. Here's the result.
>>>>
>>>>    - 5.10 kernel with this patch not applied: avg = ~40s
>>>>    - 5.10 kernel with this patch applied: avg = ~23s
>>>>
>>>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>>>> set. It seems the pseudo-NMI feature actually brings some overhead to
>>>> performance even if CONFIG_ARM64_PSEUDO_NMI is not set.
>>> I'm surprised the overhead is so significant; as above this is all patched in
>>> and so the overhead when this is disabled is expected to be *extremely* small.
>>>
>>> For example, when CONFIG_ARM64_PSEUDO_NMI is not set, in arch_local_irq_enable():
>>>
>>> * The portion under the system_has_prio_mask_debugging() test will be removed
>>>     entirely by the compiler, as this internally checks
>>>     IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
>>>
>>> * The assembly will be left as a write to DAIFClr. The only additional cost
>>>     should be that of generating GIC_PRIO_IRQON into a register.
>>>
>>> * The pmr_sync() will be removed entirely by the compiler as is defined
>>>     conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
>>>
>>> I can't spot an obvious issue with that or the other cases. In the common case
>>> those add no new instructions, and in the worst case they only add NOPs.
>> Thanks for your detailed explanation! Actually I can't understand the
>> result exactly.
>>
>> I build two 5.10 kernel images with this patch applied or not and objdump
>> them. Indeed, the disassembles of 'arch_local_irq_restore' are the same. Do
>> you have any ideas how we can find the root cause why this patch improves the
>> performance so much?
>>
>> However, the test result is trustworthy because we do it many times and the
>> result is always repeatable.
> Due to the large numbers, I suspect this must be due to a specific fast-path,
> and it's possible that this is due to secondary factors (e.g. alignment of
> code) rather than the pseudo-NMI code itself.
>
> We need to narrow down *where* time is being spent. Since it appears that this
> is related to the local IRQ state management, it isn't likely that we can
> determine that reliably with the PMU. Given that, I think the first step is to
> reproduce the result elsewhere, for which we will need some publicly available
> test-case.

As said before, I asked our colleagues how they did the ARP test. In one 
word,

a very small performance regression may bring the big change to the test 
result.

I feel very sorry for missing this important information. So, this patch may

improve the performance a little and then lead to the big change to the

test result.

>
>>>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>>>> I build 5.10 kernel with this patch applied or not while
>>>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>>>
>>>>    - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>>>    - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>>>
>>>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>>>> vmlinux size.
>>> I suspect that's just the (unused) alternatives, and we could improve that by
>>> passing the config into the alternative blocks.
>> Do you mean the sections generated by the alternatives? I don't understand
>> alternatives very well and I'll appreciate it if you can explain it a bit
>> more.
> Yes; I meant the sections generated to hold the alternatives code and the
> alternatives metadata.
I got it. Thanks.
>
>
>>>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>>>> and performance even if config is not set. To avoid it, add macro control
>>>> all around the related code.
>>>>
>>>> Signed-off-by: He Ying <heying24@huawei.com>
>>>> ---
>>>>    arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>>>>    arch/arm64/kernel/entry.S         |  4 ++++
>>>>    2 files changed, 40 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
>>>> index b57b9b1e4344..82f771b41cf5 100644
>>>> --- a/arch/arm64/include/asm/irqflags.h
>>>> +++ b/arch/arm64/include/asm/irqflags.h
>>>> @@ -26,6 +26,7 @@
>>>>     */
>>>>    static inline void arch_local_irq_enable(void)
>>>>    {
>>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>>    	if (system_has_prio_mask_debugging()) {
>>>>    		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>>>> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>>>>    		: "memory");
>>>>    	pmr_sync();
>>>> +#else
>>>> +	asm volatile(
>>>> +		"msr	daifclr, #3		// arch_local_irq_enable"
>>>> +		:
>>>> +		:
>>>> +		: "memory");
>>>> +#endif
>>> I'm happy to rework this to improve matters, but I am very much not happy with
>>> duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
>>> copies of that is not acceptable.
>> I agree. Adding these ifdeffery is a bit ugly. Let's see if there are some
>> better ways.
>>> Instead, can you please try changing the alternative to also take the config,
>>> e.g. here have:
>>>
>>> |       asm volatile(ALTERNATIVE(
>>> |               "msr    daifclr, #3             // arch_local_irq_enable",
>>> |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
>>> |               ARM64_HAS_IRQ_PRIO_MASKING,
>>> |               CONFIG_ARM64_PSEUDO_NMI)
>>> |               :
>>> |               : "r" ((unsigned long) GIC_PRIO_IRQON)
>>> |               : "memory");
>>>
>>> ... and see if that makes a significant difference?
>>>
>>> Likewise for the other cases.
>> OK, I'll try it. But I have some questions. Here's the comment of
>> ALTERNATIVE:
>>
>> /*
>>   * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
>>   *
>>   * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
>>   * N.B. If CONFIG_FOO is specified, but not selected, the whole block
>>   *      will be omitted, including oldinstr.
>>   */
>>
>> If CONFIG_FOO is not selected, the whole block will be omitted including
>> oldinstr.
>>
>> But we still want the oldinstr in this situation. Do I misunderstand
>> something?
> Sorry; you are right, and my suggestion was broken.
>
> I'll need to have a think about this; we might be able to rework this to use a
> static key instead, but IIRC last time I tried there were issues with that
> approach. I have some (old) work-in-progress patches at:
>
>    https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/daif-rework
>
> I suspect I won't have time to renew that in the near future, but an approach
> like that might be worthwhile.

OK, I see. What do you think if I send a v2 patch that only adds 
ifdeffery to

arch/arm64/kernel/entry.S and leave the rework to ALTERNATIVE behind?

I think these modifications can save some NOPs definitely.

>
> Thanks,
> Mark.
> .

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-11  8:52         ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-11  8:52 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

Hi Mark,

在 2022/1/10 19:26, Mark Rutland 写道:
> On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
>> Hi Mark,
>>
>> I'm just back from the weekend and sorry for the delayed reply.
>>
>>
>> 在 2022/1/7 21:19, Mark Rutland 写道:
>>> On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
>>>> Our product has been updating its kernel from 4.4 to 5.10 recently and
>>>> found a performance issue. We do a business test called ARP test, which
>>>> tests the latency for a ping-pong packets traffic with a certain payload.
>>>> The result is as following.
>>>>
>>>>    - 4.4 kernel: avg = ~20s
>>>>    - 5.10 kernel (CONFIG_ARM64_PSEUDO_NMI is not set): avg = ~40s
>>> Have you tested with a recent mainline kernel, e.g. v5.15?
>> Actually no, that's because this test is only available for the product
>> environment and
>>
>> we don't have an available 5.15 kernel for it yet.
> Ok; do you see anything comparable with any tests available to upstream
> developers? e.g. hackbench or `perf bench sched` ?

I'm not sure. The test is some kind of network test. And I don't know which

bench test is comparable.

>
>>> Is this test publicly available, and can you say which hardware (e.g. which
>>> CPU implementation) you're testing with?
>> Actually no. The test is only available for our product environment now. We
>> are testing
>>
>> with hisilicon 1213 (4 ARM Cortex-A72 cores).
> Thanks for the CPU info; there are a number of other Cortex-A72 platforms out
> there, so it might be possible to reproduce the behaviour elsewhere.

I asked our colleagues how they did the test. Actually it's a private 
test and

difficult to be reproduced elsewhere. But I got another important 
information.

See below.

>
>>>> I have been just learning arm64 pseudo-NMI code and have a question,
>>>> why is the related code not wrapped by CONFIG_ARM64_PSEUDO_NMI?
>>> The code in question is all patched via alternatives, and when
>>> CONFIG_ARM64_PSEUDO_NMI is not selected, the code was expected to only have the
>>> overhead of the regular DAIF manipulation.
>> I don't understand alternatives very well and I'll appreciate it if you can
>> explain it a bit more.
> Code using alternatives is patched at runtime, so for:
>
> 	ALTERNATIVE(x, y, cap)
>
> ... the `x` instructions will be inline in the code, and the `y` instructions
> will be placed in a separate linker section. If the `cap` capability is
> detected, the `y` instructions will be copied over the `x` instructions.
>
> So for:
>
> | asm volatile(ALTERNATIVE(
> |         "msr    daifclr, #3             // arch_local_irq_enable",
> |         __msr_s(SYS_ICC_PMR_EL1, "%0"),
> |         ARM64_HAS_IRQ_PRIO_MASKING)
> |         :
> |         : "r" ((unsigned long) GIC_PRIO_IRQON)
> |         : "memory");
>
> ... where ARM64_HAS_IRQ_PRIO_MASKING is not detected, the inline instructions
> will be:
>
> | msr    daifclr, #3             // arch_local_irq_enable
>
> ... and a separate linker section will contain:
>
> | __msr_s(SYS_ICC_PMR_EL1, "%0")
>
> ... and some metadata will be recorded in a `struct alt_instr`, which will be
> unused and will take some space, but should have no runtime overhead -- the
> runtime cost should only be the `msr daifclr, #3` instruction.

Thank you very much for the detailed explanation. I think I understand 
it better.

I agree that these ALTERNATIVEs might only bring some additional space but no

runtime overhead.

>
>>>> I wonder if this brings some performance regression.
>>>>
>>>> First, I make this patch and then do the test again. Here's the result.
>>>>
>>>>    - 5.10 kernel with this patch not applied: avg = ~40s
>>>>    - 5.10 kernel with this patch applied: avg = ~23s
>>>>
>>>> Amazing! Note that all kernel is built with CONFIG_ARM64_PSEUDO_NMI not
>>>> set. It seems the pseudo-NMI feature actually brings some overhead to
>>>> performance even if CONFIG_ARM64_PSEUDO_NMI is not set.
>>> I'm surprised the overhead is so significant; as above this is all patched in
>>> and so the overhead when this is disabled is expected to be *extremely* small.
>>>
>>> For example, when CONFIG_ARM64_PSEUDO_NMI is not set, in arch_local_irq_enable():
>>>
>>> * The portion under the system_has_prio_mask_debugging() test will be removed
>>>     entirely by the compiler, as this internally checks
>>>     IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI).
>>>
>>> * The assembly will be left as a write to DAIFClr. The only additional cost
>>>     should be that of generating GIC_PRIO_IRQON into a register.
>>>
>>> * The pmr_sync() will be removed entirely by the compiler as is defined
>>>     conditionally dependent on CONFIG_ARM64_PSEUDO_NMI.
>>>
>>> I can't spot an obvious issue with that or the other cases. In the common case
>>> those add no new instructions, and in the worst case they only add NOPs.
>> Thanks for your detailed explanation! Actually I can't understand the
>> result exactly.
>>
>> I build two 5.10 kernel images with this patch applied or not and objdump
>> them. Indeed, the disassembles of 'arch_local_irq_restore' are the same. Do
>> you have any ideas how we can find the root cause why this patch improves the
>> performance so much?
>>
>> However, the test result is trustworthy because we do it many times and the
>> result is always repeatable.
> Due to the large numbers, I suspect this must be due to a specific fast-path,
> and it's possible that this is due to secondary factors (e.g. alignment of
> code) rather than the pseudo-NMI code itself.
>
> We need to narrow down *where* time is being spent. Since it appears that this
> is related to the local IRQ state management, it isn't likely that we can
> determine that reliably with the PMU. Given that, I think the first step is to
> reproduce the result elsewhere, for which we will need some publicly available
> test-case.

As said before, I asked our colleagues how they did the ARP test. In one 
word,

a very small performance regression may bring the big change to the test 
result.

I feel very sorry for missing this important information. So, this patch may

improve the performance a little and then lead to the big change to the

test result.

>
>>>> Furthermore, I find the feature also brings some overhead to vmlinux size.
>>>> I build 5.10 kernel with this patch applied or not while
>>>> CONFIG_ARM64_PSEUDO_NMI is not set.
>>>>
>>>>    - 5.10 kernel with this patch not applied: vmlinux size is 384060600 Bytes.
>>>>    - 5.10 kernel with this patch applied: vmlinux size is 383842936 Bytes.
>>>>
>>>> That means arm64 pseudo-NMI feature may bring ~200KB overhead to
>>>> vmlinux size.
>>> I suspect that's just the (unused) alternatives, and we could improve that by
>>> passing the config into the alternative blocks.
>> Do you mean the sections generated by the alternatives? I don't understand
>> alternatives very well and I'll appreciate it if you can explain it a bit
>> more.
> Yes; I meant the sections generated to hold the alternatives code and the
> alternatives metadata.
I got it. Thanks.
>
>
>>>> Above all, arm64 pseudo-NMI feature brings some overhead to vmlinux size
>>>> and performance even if config is not set. To avoid it, add macro control
>>>> all around the related code.
>>>>
>>>> Signed-off-by: He Ying <heying24@huawei.com>
>>>> ---
>>>>    arch/arm64/include/asm/irqflags.h | 38 +++++++++++++++++++++++++++++--
>>>>    arch/arm64/kernel/entry.S         |  4 ++++
>>>>    2 files changed, 40 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
>>>> index b57b9b1e4344..82f771b41cf5 100644
>>>> --- a/arch/arm64/include/asm/irqflags.h
>>>> +++ b/arch/arm64/include/asm/irqflags.h
>>>> @@ -26,6 +26,7 @@
>>>>     */
>>>>    static inline void arch_local_irq_enable(void)
>>>>    {
>>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>>    	if (system_has_prio_mask_debugging()) {
>>>>    		u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
>>>> @@ -41,10 +42,18 @@ static inline void arch_local_irq_enable(void)
>>>>    		: "memory");
>>>>    	pmr_sync();
>>>> +#else
>>>> +	asm volatile(
>>>> +		"msr	daifclr, #3		// arch_local_irq_enable"
>>>> +		:
>>>> +		:
>>>> +		: "memory");
>>>> +#endif
>>> I'm happy to rework this to improve matters, but I am very much not happy with
>>> duplicating the logic for the !PSEUDO_NMI case. Adding more ifdeffery and
>>> copies of that is not acceptable.
>> I agree. Adding these ifdeffery is a bit ugly. Let's see if there are some
>> better ways.
>>> Instead, can you please try changing the alternative to also take the config,
>>> e.g. here have:
>>>
>>> |       asm volatile(ALTERNATIVE(
>>> |               "msr    daifclr, #3             // arch_local_irq_enable",
>>> |               __msr_s(SYS_ICC_PMR_EL1, "%0"),
>>> |               ARM64_HAS_IRQ_PRIO_MASKING,
>>> |               CONFIG_ARM64_PSEUDO_NMI)
>>> |               :
>>> |               : "r" ((unsigned long) GIC_PRIO_IRQON)
>>> |               : "memory");
>>>
>>> ... and see if that makes a significant difference?
>>>
>>> Likewise for the other cases.
>> OK, I'll try it. But I have some questions. Here's the comment of
>> ALTERNATIVE:
>>
>> /*
>>   * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature));
>>   *
>>   * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO));
>>   * N.B. If CONFIG_FOO is specified, but not selected, the whole block
>>   *      will be omitted, including oldinstr.
>>   */
>>
>> If CONFIG_FOO is not selected, the whole block will be omitted including
>> oldinstr.
>>
>> But we still want the oldinstr in this situation. Do I misunderstand
>> something?
> Sorry; you are right, and my suggestion was broken.
>
> I'll need to have a think about this; we might be able to rework this to use a
> static key instead, but IIRC last time I tried there were issues with that
> approach. I have some (old) work-in-progress patches at:
>
>    https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/daif-rework
>
> I suspect I won't have time to renew that in the near future, but an approach
> like that might be worthwhile.

OK, I see. What do you think if I send a v2 patch that only adds 
ifdeffery to

arch/arm64/kernel/entry.S and leave the rework to ALTERNATIVE behind?

I think these modifications can save some NOPs definitely.

>
> Thanks,
> Mark.
> .

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
  2022-01-11  8:52         ` He Ying
@ 2022-01-11 11:05           ` Mark Rutland
  -1 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-11 11:05 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Tue, Jan 11, 2022 at 04:52:32PM +0800, He Ying wrote:
> 在 2022/1/10 19:26, Mark Rutland 写道:
> > On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
> > > 在 2022/1/7 21:19, Mark Rutland 写道:
> > > > On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
 
> > Due to the large numbers, I suspect this must be due to a specific fast-path,
> > and it's possible that this is due to secondary factors (e.g. alignment of
> > code) rather than the pseudo-NMI code itself.
> > 
> > We need to narrow down *where* time is being spent. Since it appears that this
> > is related to the local IRQ state management, it isn't likely that we can
> > determine that reliably with the PMU. Given that, I think the first step is to
> > reproduce the result elsewhere, for which we will need some publicly available
> > test-case.
> 
> As said before, I asked our colleagues how they did the ARP test. In one word,
> a very small performance regression may bring the big change to the test
> result.
> 
> I feel very sorry for missing this important information. So, this patch may
> improve the performance a little and then lead to the big change to the
> test result.

No problem; thanks for confirming.

[...]

> OK, I see. What do you think if I send a v2 patch that only adds ifdeffery
> to
> 
> arch/arm64/kernel/entry.S and leave the rework to ALTERNATIVE behind?

I think that would be reasonable given we do that for other bits in entry.S;
I'd be happy with such a patch.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code
@ 2022-01-11 11:05           ` Mark Rutland
  0 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-11 11:05 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Tue, Jan 11, 2022 at 04:52:32PM +0800, He Ying wrote:
> 在 2022/1/10 19:26, Mark Rutland 写道:
> > On Mon, Jan 10, 2022 at 11:00:43AM +0800, He Ying wrote:
> > > 在 2022/1/7 21:19, Mark Rutland 写道:
> > > > On Fri, Jan 07, 2022 at 03:55:36AM -0500, He Ying wrote:
 
> > Due to the large numbers, I suspect this must be due to a specific fast-path,
> > and it's possible that this is due to secondary factors (e.g. alignment of
> > code) rather than the pseudo-NMI code itself.
> > 
> > We need to narrow down *where* time is being spent. Since it appears that this
> > is related to the local IRQ state management, it isn't likely that we can
> > determine that reliably with the PMU. Given that, I think the first step is to
> > reproduce the result elsewhere, for which we will need some publicly available
> > test-case.
> 
> As said before, I asked our colleagues how they did the ARP test. In one word,
> a very small performance regression may bring the big change to the test
> result.
> 
> I feel very sorry for missing this important information. So, this patch may
> improve the performance a little and then lead to the big change to the
> test result.

No problem; thanks for confirming.

[...]

> OK, I see. What do you think if I send a v2 patch that only adds ifdeffery
> to
> 
> arch/arm64/kernel/entry.S and leave the rework to ALTERNATIVE behind?

I think that would be reasonable given we do that for other bits in entry.S;
I'd be happy with such a patch.

Thanks,
Mark.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
  2022-01-07  8:55 ` He Ying
@ 2022-01-12  3:24   ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-12  3:24 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel, heying24

Arm64 pseudo-NMI feature code brings some additional nops
when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
necessary. So add necessary ifdeffery to avoid it.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/kernel/entry.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
@ 2022-01-12  3:24   ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-12  3:24 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel, heying24

Arm64 pseudo-NMI feature code brings some additional nops
when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
necessary. So add necessary ifdeffery to avoid it.

Signed-off-by: He Ying <heying24@huawei.com>
---
 arch/arm64/kernel/entry.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..ffc32d3d909a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -300,6 +300,7 @@ alternative_else_nop_endif
 	str	w21, [sp, #S_SYSCALLNO]
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Save pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mrs_s	x20, SYS_ICC_PMR_EL1
@@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
 	msr_s	SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
+#endif
 
 	/* Re-enable tag checking (TCO set on exception entry) */
 #ifdef CONFIG_ARM64_MTE
@@ -330,6 +332,7 @@ alternative_else_nop_endif
 	disable_daif
 	.endif
 
+#ifdef CONFIG_ARM64_PSEUDO_NMI
 	/* Restore pmr */
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	ldr	x20, [sp, #S_PMR_SAVE]
@@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
 	dsb	sy				// Ensure priority change is seen by redistributor
 .L__skip_pmr_sync\@:
 alternative_else_nop_endif
+#endif
 
 	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
 
-- 
2.17.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
  2022-01-12  3:24   ` He Ying
@ 2022-01-19  6:40     ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-19  6:40 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel

Hi all,

Ping. Any comments?

在 2022/1/12 11:24, He Ying 写道:
> Arm64 pseudo-NMI feature code brings some additional nops
> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> necessary. So add necessary ifdeffery to avoid it.
>
> Signed-off-by: He Ying <heying24@huawei.com>
> ---
>   arch/arm64/kernel/entry.S | 4 ++++
>   1 file changed, 4 insertions(+)
>
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2f69ae43941d..ffc32d3d909a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>   	str	w21, [sp, #S_SYSCALLNO]
>   	.endif
>   
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>   	/* Save pmr */
>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	mrs_s	x20, SYS_ICC_PMR_EL1
> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>   	msr_s	SYS_ICC_PMR_EL1, x20
>   alternative_else_nop_endif
> +#endif
>   
>   	/* Re-enable tag checking (TCO set on exception entry) */
>   #ifdef CONFIG_ARM64_MTE
> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>   	disable_daif
>   	.endif
>   
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>   	/* Restore pmr */
>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	ldr	x20, [sp, #S_PMR_SAVE]
> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	dsb	sy				// Ensure priority change is seen by redistributor
>   .L__skip_pmr_sync\@:
>   alternative_else_nop_endif
> +#endif
>   
>   	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
>   

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
@ 2022-01-19  6:40     ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-19  6:40 UTC (permalink / raw)
  To: catalin.marinas, will, mark.rutland, marcan, maz, joey.gouly, pcc
  Cc: linux-arm-kernel, linux-kernel

Hi all,

Ping. Any comments?

在 2022/1/12 11:24, He Ying 写道:
> Arm64 pseudo-NMI feature code brings some additional nops
> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> necessary. So add necessary ifdeffery to avoid it.
>
> Signed-off-by: He Ying <heying24@huawei.com>
> ---
>   arch/arm64/kernel/entry.S | 4 ++++
>   1 file changed, 4 insertions(+)
>
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 2f69ae43941d..ffc32d3d909a 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>   	str	w21, [sp, #S_SYSCALLNO]
>   	.endif
>   
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>   	/* Save pmr */
>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	mrs_s	x20, SYS_ICC_PMR_EL1
> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>   	msr_s	SYS_ICC_PMR_EL1, x20
>   alternative_else_nop_endif
> +#endif
>   
>   	/* Re-enable tag checking (TCO set on exception entry) */
>   #ifdef CONFIG_ARM64_MTE
> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>   	disable_daif
>   	.endif
>   
> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>   	/* Restore pmr */
>   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	ldr	x20, [sp, #S_PMR_SAVE]
> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>   	dsb	sy				// Ensure priority change is seen by redistributor
>   .L__skip_pmr_sync\@:
>   alternative_else_nop_endif
> +#endif
>   
>   	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
>   

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
  2022-01-19  6:40     ` He Ying
@ 2022-01-19  9:35       ` Mark Rutland
  -1 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-19  9:35 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Wed, Jan 19, 2022 at 02:40:58PM +0800, He Ying wrote:
> Hi all,
> 
> Ping. Any comments?

The patch looks fine, but as it's the middle of the merge window people
are busy and unlikely to look at this for the next few days.

Generally it's a good idea to wait until rc1 or rc2, rebase atop that,
and post the updated patch. Stuff like this usually gets queued around
rc3/rc4 time.

> 在 2022/1/12 11:24, He Ying 写道:
> > Arm64 pseudo-NMI feature code brings some additional nops
> > when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> > necessary. So add necessary ifdeffery to avoid it.
> > 
> > Signed-off-by: He Ying <heying24@huawei.com>

FWIW:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Mark.

> > ---
> >   arch/arm64/kernel/entry.S | 4 ++++
> >   1 file changed, 4 insertions(+)
> > 
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 2f69ae43941d..ffc32d3d909a 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -300,6 +300,7 @@ alternative_else_nop_endif
> >   	str	w21, [sp, #S_SYSCALLNO]
> >   	.endif
> > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> >   	/* Save pmr */
> >   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	mrs_s	x20, SYS_ICC_PMR_EL1
> > @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
> >   	msr_s	SYS_ICC_PMR_EL1, x20
> >   alternative_else_nop_endif
> > +#endif
> >   	/* Re-enable tag checking (TCO set on exception entry) */
> >   #ifdef CONFIG_ARM64_MTE
> > @@ -330,6 +332,7 @@ alternative_else_nop_endif
> >   	disable_daif
> >   	.endif
> > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> >   	/* Restore pmr */
> >   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	ldr	x20, [sp, #S_PMR_SAVE]
> > @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	dsb	sy				// Ensure priority change is seen by redistributor
> >   .L__skip_pmr_sync\@:
> >   alternative_else_nop_endif
> > +#endif
> >   	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
@ 2022-01-19  9:35       ` Mark Rutland
  0 siblings, 0 replies; 26+ messages in thread
From: Mark Rutland @ 2022-01-19  9:35 UTC (permalink / raw)
  To: He Ying
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel

On Wed, Jan 19, 2022 at 02:40:58PM +0800, He Ying wrote:
> Hi all,
> 
> Ping. Any comments?

The patch looks fine, but as it's the middle of the merge window people
are busy and unlikely to look at this for the next few days.

Generally it's a good idea to wait until rc1 or rc2, rebase atop that,
and post the updated patch. Stuff like this usually gets queued around
rc3/rc4 time.

> 在 2022/1/12 11:24, He Ying 写道:
> > Arm64 pseudo-NMI feature code brings some additional nops
> > when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> > necessary. So add necessary ifdeffery to avoid it.
> > 
> > Signed-off-by: He Ying <heying24@huawei.com>

FWIW:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Mark.

> > ---
> >   arch/arm64/kernel/entry.S | 4 ++++
> >   1 file changed, 4 insertions(+)
> > 
> > diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> > index 2f69ae43941d..ffc32d3d909a 100644
> > --- a/arch/arm64/kernel/entry.S
> > +++ b/arch/arm64/kernel/entry.S
> > @@ -300,6 +300,7 @@ alternative_else_nop_endif
> >   	str	w21, [sp, #S_SYSCALLNO]
> >   	.endif
> > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> >   	/* Save pmr */
> >   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	mrs_s	x20, SYS_ICC_PMR_EL1
> > @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
> >   	msr_s	SYS_ICC_PMR_EL1, x20
> >   alternative_else_nop_endif
> > +#endif
> >   	/* Re-enable tag checking (TCO set on exception entry) */
> >   #ifdef CONFIG_ARM64_MTE
> > @@ -330,6 +332,7 @@ alternative_else_nop_endif
> >   	disable_daif
> >   	.endif
> > +#ifdef CONFIG_ARM64_PSEUDO_NMI
> >   	/* Restore pmr */
> >   alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	ldr	x20, [sp, #S_PMR_SAVE]
> > @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
> >   	dsb	sy				// Ensure priority change is seen by redistributor
> >   .L__skip_pmr_sync\@:
> >   alternative_else_nop_endif
> > +#endif
> >   	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
  2022-01-19  9:35       ` Mark Rutland
@ 2022-01-19  9:47         ` He Ying
  -1 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-19  9:47 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel


在 2022/1/19 17:35, Mark Rutland 写道:
> On Wed, Jan 19, 2022 at 02:40:58PM +0800, He Ying wrote:
>> Hi all,
>>
>> Ping. Any comments?
> The patch looks fine, but as it's the middle of the merge window people
> are busy and unlikely to look at this for the next few days.
>
> Generally it's a good idea to wait until rc1 or rc2, rebase atop that,
> and post the updated patch. Stuff like this usually gets queued around
> rc3/rc4 time.
OK. Thanks for your comment.
>
>> 在 2022/1/12 11:24, He Ying 写道:
>>> Arm64 pseudo-NMI feature code brings some additional nops
>>> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
>>> necessary. So add necessary ifdeffery to avoid it.
>>>
>>> Signed-off-by: He Ying <heying24@huawei.com>
> FWIW:
>
> Acked-by: Mark Rutland <mark.rutland@arm.com>
>
> Mark.
>
>>> ---
>>>    arch/arm64/kernel/entry.S | 4 ++++
>>>    1 file changed, 4 insertions(+)
>>>
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index 2f69ae43941d..ffc32d3d909a 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>>>    	str	w21, [sp, #S_SYSCALLNO]
>>>    	.endif
>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>    	/* Save pmr */
>>>    alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	mrs_s	x20, SYS_ICC_PMR_EL1
>>> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>>>    	msr_s	SYS_ICC_PMR_EL1, x20
>>>    alternative_else_nop_endif
>>> +#endif
>>>    	/* Re-enable tag checking (TCO set on exception entry) */
>>>    #ifdef CONFIG_ARM64_MTE
>>> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>>>    	disable_daif
>>>    	.endif
>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>    	/* Restore pmr */
>>>    alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	ldr	x20, [sp, #S_PMR_SAVE]
>>> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	dsb	sy				// Ensure priority change is seen by redistributor
>>>    .L__skip_pmr_sync\@:
>>>    alternative_else_nop_endif
>>> +#endif
>>>    	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
> .

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
@ 2022-01-19  9:47         ` He Ying
  0 siblings, 0 replies; 26+ messages in thread
From: He Ying @ 2022-01-19  9:47 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, marcan, maz, joey.gouly, pcc,
	linux-arm-kernel, linux-kernel


在 2022/1/19 17:35, Mark Rutland 写道:
> On Wed, Jan 19, 2022 at 02:40:58PM +0800, He Ying wrote:
>> Hi all,
>>
>> Ping. Any comments?
> The patch looks fine, but as it's the middle of the merge window people
> are busy and unlikely to look at this for the next few days.
>
> Generally it's a good idea to wait until rc1 or rc2, rebase atop that,
> and post the updated patch. Stuff like this usually gets queued around
> rc3/rc4 time.
OK. Thanks for your comment.
>
>> 在 2022/1/12 11:24, He Ying 写道:
>>> Arm64 pseudo-NMI feature code brings some additional nops
>>> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
>>> necessary. So add necessary ifdeffery to avoid it.
>>>
>>> Signed-off-by: He Ying <heying24@huawei.com>
> FWIW:
>
> Acked-by: Mark Rutland <mark.rutland@arm.com>
>
> Mark.
>
>>> ---
>>>    arch/arm64/kernel/entry.S | 4 ++++
>>>    1 file changed, 4 insertions(+)
>>>
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index 2f69ae43941d..ffc32d3d909a 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -300,6 +300,7 @@ alternative_else_nop_endif
>>>    	str	w21, [sp, #S_SYSCALLNO]
>>>    	.endif
>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>    	/* Save pmr */
>>>    alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	mrs_s	x20, SYS_ICC_PMR_EL1
>>> @@ -307,6 +308,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	mov	x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
>>>    	msr_s	SYS_ICC_PMR_EL1, x20
>>>    alternative_else_nop_endif
>>> +#endif
>>>    	/* Re-enable tag checking (TCO set on exception entry) */
>>>    #ifdef CONFIG_ARM64_MTE
>>> @@ -330,6 +332,7 @@ alternative_else_nop_endif
>>>    	disable_daif
>>>    	.endif
>>> +#ifdef CONFIG_ARM64_PSEUDO_NMI
>>>    	/* Restore pmr */
>>>    alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	ldr	x20, [sp, #S_PMR_SAVE]
>>> @@ -339,6 +342,7 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING
>>>    	dsb	sy				// Ensure priority change is seen by redistributor
>>>    .L__skip_pmr_sync\@:
>>>    alternative_else_nop_endif
>>> +#endif
>>>    	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
> .

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
  2022-01-12  3:24   ` He Ying
@ 2022-02-15 23:18     ` Will Deacon
  -1 siblings, 0 replies; 26+ messages in thread
From: Will Deacon @ 2022-02-15 23:18 UTC (permalink / raw)
  To: joey.gouly, maz, catalin.marinas, He Ying, pcc, mark.rutland, marcan
  Cc: kernel-team, Will Deacon, linux-arm-kernel, linux-kernel

On Tue, 11 Jan 2022 22:24:10 -0500, He Ying wrote:
> Arm64 pseudo-NMI feature code brings some additional nops
> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> necessary. So add necessary ifdeffery to avoid it.
> 
> 

Applied to arm64 (for-next/misc), thanks!

[1/1] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
      https://git.kernel.org/arm64/c/3352a5556f52

Cheers,
-- 
Will

https://fixes.arm64.dev
https://next.arm64.dev
https://will.arm64.dev

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
@ 2022-02-15 23:18     ` Will Deacon
  0 siblings, 0 replies; 26+ messages in thread
From: Will Deacon @ 2022-02-15 23:18 UTC (permalink / raw)
  To: joey.gouly, maz, catalin.marinas, He Ying, pcc, mark.rutland, marcan
  Cc: kernel-team, Will Deacon, linux-arm-kernel, linux-kernel

On Tue, 11 Jan 2022 22:24:10 -0500, He Ying wrote:
> Arm64 pseudo-NMI feature code brings some additional nops
> when CONFIG_ARM64_PSEUDO_NMI is not set, which is not
> necessary. So add necessary ifdeffery to avoid it.
> 
> 

Applied to arm64 (for-next/misc), thanks!

[1/1] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set
      https://git.kernel.org/arm64/c/3352a5556f52

Cheers,
-- 
Will

https://fixes.arm64.dev
https://next.arm64.dev
https://will.arm64.dev

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2022-02-15 23:21 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-07  8:55 [PATCH] arm64: Make CONFIG_ARM64_PSEUDO_NMI macro wrap all the pseudo-NMI code He Ying
2022-01-07  8:55 ` He Ying
2022-01-07 13:19 ` Mark Rutland
2022-01-07 13:19   ` Mark Rutland
2022-01-10  3:00   ` He Ying
2022-01-10  3:00     ` He Ying
2022-01-10 11:26     ` Mark Rutland
2022-01-10 11:26       ` Mark Rutland
2022-01-11  8:52       ` He Ying
2022-01-11  8:52         ` He Ying
2022-01-11 11:05         ` Mark Rutland
2022-01-11 11:05           ` Mark Rutland
2022-01-08 12:51 ` Marc Zyngier
2022-01-08 12:51   ` Marc Zyngier
2022-01-10  3:20   ` He Ying
2022-01-10  3:20     ` He Ying
2022-01-12  3:24 ` [PATCH] arm64: entry: Save some nops when CONFIG_ARM64_PSEUDO_NMI is not set He Ying
2022-01-12  3:24   ` He Ying
2022-01-19  6:40   ` He Ying
2022-01-19  6:40     ` He Ying
2022-01-19  9:35     ` Mark Rutland
2022-01-19  9:35       ` Mark Rutland
2022-01-19  9:47       ` He Ying
2022-01-19  9:47         ` He Ying
2022-02-15 23:18   ` Will Deacon
2022-02-15 23:18     ` Will Deacon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.