All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-08 23:00 ` Stephen Boyd
  0 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-08 23:00 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

If we're running on a v7 ARM CPU, detect if the CPU supports the
sdiv/udiv instructions and replace the signed and unsigned
division library functions with an sdiv/udiv instruction.

Running the perf messaging benchmark in pipe mode

 $ perf bench sched messaging -p

shows a modest improvement on my v7 CPU.

before:
(5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805

after:
(4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538

(5.805 - 5.538) / 5.805 = 4.6%

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---

Changes since v1:
 * Replace signed with unsigned in unsigned divide function
 * drop & in inline assembly
 * Use IS_ENABLED() instead of #ifdef
 * Pass DIV_V7 into lib1funcs.S instead of depending on ZIMAGE or CPU_V7

 arch/arm/kernel/setup.c  | 13 ++++++++++-
 arch/arm/lib/Makefile    |  6 +++++
 arch/arm/lib/div-v7.c    | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/lib1funcs.S | 16 +++++++++++++
 4 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm/lib/div-v7.c

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 0e1e2b3..f9e577a 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -30,6 +30,7 @@
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/sort.h>
+#include <linux/static_key.h>
 
 #include <asm/unified.h>
 #include <asm/cp15.h>
@@ -365,9 +366,11 @@ void __init early_print(const char *str, ...)
 	printk("%s", buf);
 }
 
+struct static_key cpu_has_idiv = STATIC_KEY_INIT_FALSE;
+
 static void __init cpuid_init_hwcaps(void)
 {
-	unsigned int divide_instrs, vmsa;
+	unsigned int divide_instrs, vmsa, idiv_mask;
 
 	if (cpu_architecture() < CPU_ARCH_ARMv7)
 		return;
@@ -381,6 +384,14 @@ static void __init cpuid_init_hwcaps(void)
 		elf_hwcap |= HWCAP_IDIVT;
 	}
 
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+		idiv_mask = HWCAP_IDIVT;
+	else
+		idiv_mask = HWCAP_IDIVA;
+
+	if (elf_hwcap & idiv_mask)
+		static_key_slow_inc(&cpu_has_idiv);
+
 	/* LPAE implies atomic ldrd/strd instructions */
 	vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
 	if (vmsa >= 5)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index bd454b0..38621729 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
 		   call_with_stack.o
 
+lib-$(CONFIG_CPU_V7) += div-v7.o
+CFLAGS_div-v7.o := -march=armv7-a
+ifeq ($(CONFIG_CPU_V7),y)
+  AFLAGS_lib1funcs.o := -DDIV_V7
+endif
+
 mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
 
 # the code in uaccess.S is not preemption safe and
diff --git a/arch/arm/lib/div-v7.c b/arch/arm/lib/div-v7.c
new file mode 100644
index 0000000..e20945a
--- /dev/null
+++ b/arch/arm/lib/div-v7.c
@@ -0,0 +1,58 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/static_key.h>
+
+extern int ___aeabi_idiv(int, int);
+extern unsigned ___aeabi_uidiv(int, int);
+
+extern struct static_key cpu_has_idiv;
+
+int __aeabi_idiv(int numerator, int denominator)
+{
+	if (static_key_false(&cpu_has_idiv)) {
+		int ret;
+
+		asm volatile (
+		".arch_extension idiv\n"
+		"sdiv %0, %1, %2"
+		: "=r" (ret)
+		: "r" (numerator), "r" (denominator));
+
+		return ret;
+	}
+
+	return ___aeabi_idiv(numerator, denominator);
+}
+
+int __divsi3(int numerator, int denominator)
+	__attribute__((alias("__aeabi_idiv")));
+
+unsigned __aeabi_uidiv(unsigned numerator, unsigned denominator)
+{
+	if (static_key_false(&cpu_has_idiv)) {
+		unsigned ret;
+
+		asm volatile (
+		".arch_extension idiv\n"
+		"udiv %0, %1, %2"
+		: "=r" (ret)
+		: "r" (numerator), "r" (denominator));
+
+		return ret;
+	}
+
+	return ___aeabi_uidiv(numerator, denominator);
+}
+
+unsigned __udivsi3(unsigned numerator, unsigned denominator)
+	__attribute__((alias("__aeabi_uidiv")));
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..82bbcc7 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,8 +205,12 @@ Boston, MA 02111-1307, USA.  */
 .endm
 
 
+#ifdef DIV_V7
+ENTRY(___aeabi_uidiv)
+#else
 ENTRY(__udivsi3)
 ENTRY(__aeabi_uidiv)
+#endif
 UNWIND(.fnstart)
 
 	subs	r2, r1, #1
@@ -232,8 +236,12 @@ UNWIND(.fnstart)
 	mov	pc, lr
 
 UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_uidiv)
+#else
 ENDPROC(__udivsi3)
 ENDPROC(__aeabi_uidiv)
+#endif
 
 ENTRY(__umodsi3)
 UNWIND(.fnstart)
@@ -253,8 +261,12 @@ UNWIND(.fnstart)
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
 
+#ifdef DIV_V7
+ENTRY(___aeabi_idiv)
+#else
 ENTRY(__divsi3)
 ENTRY(__aeabi_idiv)
+#endif
 UNWIND(.fnstart)
 
 	cmp	r1, #0
@@ -293,8 +305,12 @@ UNWIND(.fnstart)
 	mov	pc, lr
 
 UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_idiv)
+#else
 ENDPROC(__divsi3)
 ENDPROC(__aeabi_idiv)
+#endif
 
 ENTRY(__modsi3)
 UNWIND(.fnstart)
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-08 23:00 ` Stephen Boyd
  0 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-08 23:00 UTC (permalink / raw)
  To: linux-arm-kernel

If we're running on a v7 ARM CPU, detect if the CPU supports the
sdiv/udiv instructions and replace the signed and unsigned
division library functions with an sdiv/udiv instruction.

Running the perf messaging benchmark in pipe mode

 $ perf bench sched messaging -p

shows a modest improvement on my v7 CPU.

before:
(5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805

after:
(4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538

(5.805 - 5.538) / 5.805 = 4.6%

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---

Changes since v1:
 * Replace signed with unsigned in unsigned divide function
 * drop & in inline assembly
 * Use IS_ENABLED() instead of #ifdef
 * Pass DIV_V7 into lib1funcs.S instead of depending on ZIMAGE or CPU_V7

 arch/arm/kernel/setup.c  | 13 ++++++++++-
 arch/arm/lib/Makefile    |  6 +++++
 arch/arm/lib/div-v7.c    | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/lib1funcs.S | 16 +++++++++++++
 4 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm/lib/div-v7.c

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 0e1e2b3..f9e577a 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -30,6 +30,7 @@
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/sort.h>
+#include <linux/static_key.h>
 
 #include <asm/unified.h>
 #include <asm/cp15.h>
@@ -365,9 +366,11 @@ void __init early_print(const char *str, ...)
 	printk("%s", buf);
 }
 
+struct static_key cpu_has_idiv = STATIC_KEY_INIT_FALSE;
+
 static void __init cpuid_init_hwcaps(void)
 {
-	unsigned int divide_instrs, vmsa;
+	unsigned int divide_instrs, vmsa, idiv_mask;
 
 	if (cpu_architecture() < CPU_ARCH_ARMv7)
 		return;
@@ -381,6 +384,14 @@ static void __init cpuid_init_hwcaps(void)
 		elf_hwcap |= HWCAP_IDIVT;
 	}
 
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+		idiv_mask = HWCAP_IDIVT;
+	else
+		idiv_mask = HWCAP_IDIVA;
+
+	if (elf_hwcap & idiv_mask)
+		static_key_slow_inc(&cpu_has_idiv);
+
 	/* LPAE implies atomic ldrd/strd instructions */
 	vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
 	if (vmsa >= 5)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index bd454b0..38621729 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
 		   call_with_stack.o
 
+lib-$(CONFIG_CPU_V7) += div-v7.o
+CFLAGS_div-v7.o := -march=armv7-a
+ifeq ($(CONFIG_CPU_V7),y)
+  AFLAGS_lib1funcs.o := -DDIV_V7
+endif
+
 mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
 
 # the code in uaccess.S is not preemption safe and
diff --git a/arch/arm/lib/div-v7.c b/arch/arm/lib/div-v7.c
new file mode 100644
index 0000000..e20945a
--- /dev/null
+++ b/arch/arm/lib/div-v7.c
@@ -0,0 +1,58 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/static_key.h>
+
+extern int ___aeabi_idiv(int, int);
+extern unsigned ___aeabi_uidiv(int, int);
+
+extern struct static_key cpu_has_idiv;
+
+int __aeabi_idiv(int numerator, int denominator)
+{
+	if (static_key_false(&cpu_has_idiv)) {
+		int ret;
+
+		asm volatile (
+		".arch_extension idiv\n"
+		"sdiv %0, %1, %2"
+		: "=r" (ret)
+		: "r" (numerator), "r" (denominator));
+
+		return ret;
+	}
+
+	return ___aeabi_idiv(numerator, denominator);
+}
+
+int __divsi3(int numerator, int denominator)
+	__attribute__((alias("__aeabi_idiv")));
+
+unsigned __aeabi_uidiv(unsigned numerator, unsigned denominator)
+{
+	if (static_key_false(&cpu_has_idiv)) {
+		unsigned ret;
+
+		asm volatile (
+		".arch_extension idiv\n"
+		"udiv %0, %1, %2"
+		: "=r" (ret)
+		: "r" (numerator), "r" (denominator));
+
+		return ret;
+	}
+
+	return ___aeabi_uidiv(numerator, denominator);
+}
+
+unsigned __udivsi3(unsigned numerator, unsigned denominator)
+	__attribute__((alias("__aeabi_uidiv")));
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..82bbcc7 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,8 +205,12 @@ Boston, MA 02111-1307, USA.  */
 .endm
 
 
+#ifdef DIV_V7
+ENTRY(___aeabi_uidiv)
+#else
 ENTRY(__udivsi3)
 ENTRY(__aeabi_uidiv)
+#endif
 UNWIND(.fnstart)
 
 	subs	r2, r1, #1
@@ -232,8 +236,12 @@ UNWIND(.fnstart)
 	mov	pc, lr
 
 UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_uidiv)
+#else
 ENDPROC(__udivsi3)
 ENDPROC(__aeabi_uidiv)
+#endif
 
 ENTRY(__umodsi3)
 UNWIND(.fnstart)
@@ -253,8 +261,12 @@ UNWIND(.fnstart)
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
 
+#ifdef DIV_V7
+ENTRY(___aeabi_idiv)
+#else
 ENTRY(__divsi3)
 ENTRY(__aeabi_idiv)
+#endif
 UNWIND(.fnstart)
 
 	cmp	r1, #0
@@ -293,8 +305,12 @@ UNWIND(.fnstart)
 	mov	pc, lr
 
 UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_idiv)
+#else
 ENDPROC(__divsi3)
 ENDPROC(__aeabi_idiv)
+#endif
 
 ENTRY(__modsi3)
 UNWIND(.fnstart)
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-08 23:00 ` Stephen Boyd
@ 2013-11-09  6:46   ` Matt Sealey
  -1 siblings, 0 replies; 40+ messages in thread
From: Matt Sealey @ 2013-11-09  6:46 UTC (permalink / raw)
  To: Stephen Boyd
  Cc: linux-arm-kernel, linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
>
> Running the perf messaging benchmark in pipe mode
>
>  $ perf bench sched messaging -p
>
> shows a modest improvement on my v7 CPU.
>
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>
> (5.805 - 5.538) / 5.805 = 4.6%

Even with the change to the output constraint suggested by Mans, you
get absolutely identical benchmark results? There's a lot of variance
in any case..

BTW has there been any evaluation of the penalty for the extra
branching, or the performance hit for the ARMv7-without-division
cases?

Ta,
Matt Sealey <neko@bakuhatsu.net>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-09  6:46   ` Matt Sealey
  0 siblings, 0 replies; 40+ messages in thread
From: Matt Sealey @ 2013-11-09  6:46 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
>
> Running the perf messaging benchmark in pipe mode
>
>  $ perf bench sched messaging -p
>
> shows a modest improvement on my v7 CPU.
>
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>
> (5.805 - 5.538) / 5.805 = 4.6%

Even with the change to the output constraint suggested by Mans, you
get absolutely identical benchmark results? There's a lot of variance
in any case..

BTW has there been any evaluation of the penalty for the extra
branching, or the performance hit for the ARMv7-without-division
cases?

Ta,
Matt Sealey <neko@bakuhatsu.net>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-09  6:46   ` Matt Sealey
@ 2013-11-09 18:20     ` Måns Rullgård
  -1 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-09 18:20 UTC (permalink / raw)
  To: Matt Sealey
  Cc: Stephen Boyd, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Russell King - ARM Linux, Måns Rullgård, Rob Herring

Matt Sealey <neko@bakuhatsu.net> writes:

> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

The branches themselves probably have minimal overhead.  There will
however be code to preserve call-clobbered registers (and move the
values to/from r0/r1) that would not be needed if the div instructions
were done inline (obviously such a kernel could only run on hardware
with division support).

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-09 18:20     ` Måns Rullgård
  0 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-09 18:20 UTC (permalink / raw)
  To: linux-arm-kernel

Matt Sealey <neko@bakuhatsu.net> writes:

> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

The branches themselves probably have minimal overhead.  There will
however be code to preserve call-clobbered registers (and move the
values to/from r0/r1) that would not be needed if the div instructions
were done inline (obviously such a kernel could only run on hardware
with division support).

-- 
M?ns Rullg?rd
mans at mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-08 23:00 ` Stephen Boyd
@ 2013-11-10  5:03   ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-10  5:03 UTC (permalink / raw)
  To: Stephen Boyd
  Cc: linux-arm-kernel, linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

On Fri, 8 Nov 2013, Stephen Boyd wrote:

> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
> 
> Running the perf messaging benchmark in pipe mode
> 
>  $ perf bench sched messaging -p
> 
> shows a modest improvement on my v7 CPU.
> 
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
> 
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
> 
> (5.805 - 5.538) / 5.805 = 4.6%
> 
> Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>

Bah..... NAK.

We are doing runtime patching of the kernel for many many things
already.  So why not do the same here?

The obvious strategy is to simply overwrite the start of the existing 
__aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  

Similarly for the unsigned case.

That let you test the hardware capability only once during boot instead 
of everytime a divide operation is performed.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-10  5:03   ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-10  5:03 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 8 Nov 2013, Stephen Boyd wrote:

> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
> 
> Running the perf messaging benchmark in pipe mode
> 
>  $ perf bench sched messaging -p
> 
> shows a modest improvement on my v7 CPU.
> 
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
> 
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
> 
> (5.805 - 5.538) / 5.805 = 4.6%
> 
> Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>

Bah..... NAK.

We are doing runtime patching of the kernel for many many things
already.  So why not do the same here?

The obvious strategy is to simply overwrite the start of the existing 
__aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  

Similarly for the unsigned case.

That let you test the hardware capability only once during boot instead 
of everytime a divide operation is performed.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-08 23:00 ` Stephen Boyd
@ 2013-11-11  7:46   ` Uwe Kleine-König
  -1 siblings, 0 replies; 40+ messages in thread
From: Uwe Kleine-König @ 2013-11-11  7:46 UTC (permalink / raw)
  To: Stephen Boyd
  Cc: linux-arm-kernel, Måns Rullgård,
	Russell King - ARM Linux, linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD

Hello,

On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index bd454b0..38621729 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
>  		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
>  		   call_with_stack.o
>  
> +lib-$(CONFIG_CPU_V7) += div-v7.o
CPU_V7M could make use of that, too.
(If you follow Nico's advice to use runtime patching I cannot test it
for you on v7-M though as my machine has to use an XIP kernel.)

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-11  7:46   ` Uwe Kleine-König
  0 siblings, 0 replies; 40+ messages in thread
From: Uwe Kleine-König @ 2013-11-11  7:46 UTC (permalink / raw)
  To: linux-arm-kernel

Hello,

On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index bd454b0..38621729 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
>  		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
>  		   call_with_stack.o
>  
> +lib-$(CONFIG_CPU_V7) += div-v7.o
CPU_V7M could make use of that, too.
(If you follow Nico's advice to use runtime patching I cannot test it
for you on v7-M though as my machine has to use an XIP kernel.)

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-K?nig            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-09  6:46   ` Matt Sealey
@ 2013-11-12  1:23     ` Stephen Boyd
  -1 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  1:23 UTC (permalink / raw)
  To: Matt Sealey
  Cc: linux-arm-kernel, linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

On 11/08/13 22:46, Matt Sealey wrote:
> On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
>> If we're running on a v7 ARM CPU, detect if the CPU supports the
>> sdiv/udiv instructions and replace the signed and unsigned
>> division library functions with an sdiv/udiv instruction.
>>
>> Running the perf messaging benchmark in pipe mode
>>
>>  $ perf bench sched messaging -p
>>
>> shows a modest improvement on my v7 CPU.
>>
>> before:
>> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>>
>> after:
>> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>>
>> (5.805 - 5.538) / 5.805 = 4.6%
> Even with the change to the output constraint suggested by Mans, you
> get absolutely identical benchmark results? There's a lot of variance
> in any case..

Yeah sorry I didn't run the testcase again to see if numbers changed
because I assumed one less instruction would be in the noise. I agree
there is a lot of variance so if you have any better
benchmarks/testcases please let me know.

>
> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

I haven't done any. I'll factor that in for the next round.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12  1:23     ` Stephen Boyd
  0 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  1:23 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/08/13 22:46, Matt Sealey wrote:
> On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
>> If we're running on a v7 ARM CPU, detect if the CPU supports the
>> sdiv/udiv instructions and replace the signed and unsigned
>> division library functions with an sdiv/udiv instruction.
>>
>> Running the perf messaging benchmark in pipe mode
>>
>>  $ perf bench sched messaging -p
>>
>> shows a modest improvement on my v7 CPU.
>>
>> before:
>> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>>
>> after:
>> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>>
>> (5.805 - 5.538) / 5.805 = 4.6%
> Even with the change to the output constraint suggested by Mans, you
> get absolutely identical benchmark results? There's a lot of variance
> in any case..

Yeah sorry I didn't run the testcase again to see if numbers changed
because I assumed one less instruction would be in the noise. I agree
there is a lot of variance so if you have any better
benchmarks/testcases please let me know.

>
> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

I haven't done any. I'll factor that in for the next round.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-10  5:03   ` Nicolas Pitre
@ 2013-11-12  2:34     ` Stephen Boyd
  -1 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  2:34 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: linux-arm-kernel, linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

On 11/09/13 21:03, Nicolas Pitre wrote:
> Bah..... NAK. We are doing runtime patching of the kernel for many
> many things already. So why not do the same here?

static keys are a form of runtime patching, albeit not as extreme as
you're suggesting.

>
> The obvious strategy is to simply overwrite the start of the existing 
> __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
>
> Similarly for the unsigned case.

I was thinking the same thing when I wrote this, but I didn't know how
to tell the compiler to either inline this function or to let me inilne
an assembly stub with some section magic.

>
> That let you test the hardware capability only once during boot instead 
> of everytime a divide operation is performed.

The test for hardware capability really isn't done more than once during
boot. The assembly is like so at compile time

00000000 <__aeabi_idiv>:
   0:   nop     {0}
   4:   b       0 <___aeabi_idiv>
   8:   sdiv    r0, r0, r1
   c:   bx      lr

and after we test and find support for the instruction it will be
replaced with

00000000 <__aeabi_idiv>:
   0:   b       8
   4:   b       0 <___aeabi_idiv>
   8:   sdiv    r0, r0, r1
   c:   bx      lr

Unfortunately we still have to jump to this function. It would be great
if we could inline this function at the call site but as I already said
I don't know how to do that.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12  2:34     ` Stephen Boyd
  0 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  2:34 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/09/13 21:03, Nicolas Pitre wrote:
> Bah..... NAK. We are doing runtime patching of the kernel for many
> many things already. So why not do the same here?

static keys are a form of runtime patching, albeit not as extreme as
you're suggesting.

>
> The obvious strategy is to simply overwrite the start of the existing 
> __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
>
> Similarly for the unsigned case.

I was thinking the same thing when I wrote this, but I didn't know how
to tell the compiler to either inline this function or to let me inilne
an assembly stub with some section magic.

>
> That let you test the hardware capability only once during boot instead 
> of everytime a divide operation is performed.

The test for hardware capability really isn't done more than once during
boot. The assembly is like so at compile time

00000000 <__aeabi_idiv>:
   0:   nop     {0}
   4:   b       0 <___aeabi_idiv>
   8:   sdiv    r0, r0, r1
   c:   bx      lr

and after we test and find support for the instruction it will be
replaced with

00000000 <__aeabi_idiv>:
   0:   b       8
   4:   b       0 <___aeabi_idiv>
   8:   sdiv    r0, r0, r1
   c:   bx      lr

Unfortunately we still have to jump to this function. It would be great
if we could inline this function at the call site but as I already said
I don't know how to do that.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-11  7:46   ` Uwe Kleine-König
@ 2013-11-12  2:35     ` Stephen Boyd
  -1 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  2:35 UTC (permalink / raw)
  To: Uwe Kleine-König
  Cc: linux-arm-kernel, Måns Rullgård,
	Russell King - ARM Linux, linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD

On 11/10/13 23:46, Uwe Kleine-König wrote:
> Hello,
>
> On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
>> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
>> index bd454b0..38621729 100644
>> --- a/arch/arm/lib/Makefile
>> +++ b/arch/arm/lib/Makefile
>> @@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
>>  		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
>>  		   call_with_stack.o
>>  
>> +lib-$(CONFIG_CPU_V7) += div-v7.o
> CPU_V7M could make use of that, too.
> (If you follow Nico's advice to use runtime patching I cannot test it
> for you on v7-M though as my machine has to use an XIP kernel.)
>

It already is runtime patching so I suspect you won't be able to test it
anyway. I suppose we need another config like MIGHT_HAVE_IDIV or
something that both v7 and v7M select?

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12  2:35     ` Stephen Boyd
  0 siblings, 0 replies; 40+ messages in thread
From: Stephen Boyd @ 2013-11-12  2:35 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/10/13 23:46, Uwe Kleine-K?nig wrote:
> Hello,
>
> On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
>> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
>> index bd454b0..38621729 100644
>> --- a/arch/arm/lib/Makefile
>> +++ b/arch/arm/lib/Makefile
>> @@ -15,6 +15,12 @@ lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
>>  		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
>>  		   call_with_stack.o
>>  
>> +lib-$(CONFIG_CPU_V7) += div-v7.o
> CPU_V7M could make use of that, too.
> (If you follow Nico's advice to use runtime patching I cannot test it
> for you on v7-M though as my machine has to use an XIP kernel.)
>

It already is runtime patching so I suspect you won't be able to test it
anyway. I suppose we need another config like MIGHT_HAVE_IDIV or
something that both v7 and v7M select?

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12  2:34     ` Stephen Boyd
@ 2013-11-12 11:28       ` Måns Rullgård
  -1 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 11:28 UTC (permalink / raw)
  To: Stephen Boyd
  Cc: Nicolas Pitre, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Russell King - ARM Linux, Måns Rullgård, Rob Herring

Stephen Boyd <sboyd@codeaurora.org> writes:

> On 11/09/13 21:03, Nicolas Pitre wrote:
>> Bah..... NAK. We are doing runtime patching of the kernel for many
>> many things already. So why not do the same here?
>
> static keys are a form of runtime patching, albeit not as extreme as
> you're suggesting.
>
>>
>> The obvious strategy is to simply overwrite the start of the existing 
>> __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
>>
>> Similarly for the unsigned case.
>
> I was thinking the same thing when I wrote this, but I didn't know how
> to tell the compiler to either inline this function or to let me inilne
> an assembly stub with some section magic.
>
>>
>> That let you test the hardware capability only once during boot instead 
>> of everytime a divide operation is performed.
>
> The test for hardware capability really isn't done more than once during
> boot. The assembly is like so at compile time
>
> 00000000 <__aeabi_idiv>:
>    0:   nop     {0}
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
>
> and after we test and find support for the instruction it will be
> replaced with
>
> 00000000 <__aeabi_idiv>:
>    0:   b       8
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
>
> Unfortunately we still have to jump to this function. It would be great
> if we could inline this function at the call site but as I already said
> I don't know how to do that.

Ideally the bl instruction at the call site would be patched over with
sdiv/udiv when supported.  This would leave things exactly as they are
for hardware without div capability and incur only the call setup cost
(but no actual call) on div-capable hardware.  No, I don't know how to
achieve this.

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 11:28       ` Måns Rullgård
  0 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 11:28 UTC (permalink / raw)
  To: linux-arm-kernel

Stephen Boyd <sboyd@codeaurora.org> writes:

> On 11/09/13 21:03, Nicolas Pitre wrote:
>> Bah..... NAK. We are doing runtime patching of the kernel for many
>> many things already. So why not do the same here?
>
> static keys are a form of runtime patching, albeit not as extreme as
> you're suggesting.
>
>>
>> The obvious strategy is to simply overwrite the start of the existing 
>> __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
>>
>> Similarly for the unsigned case.
>
> I was thinking the same thing when I wrote this, but I didn't know how
> to tell the compiler to either inline this function or to let me inilne
> an assembly stub with some section magic.
>
>>
>> That let you test the hardware capability only once during boot instead 
>> of everytime a divide operation is performed.
>
> The test for hardware capability really isn't done more than once during
> boot. The assembly is like so at compile time
>
> 00000000 <__aeabi_idiv>:
>    0:   nop     {0}
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
>
> and after we test and find support for the instruction it will be
> replaced with
>
> 00000000 <__aeabi_idiv>:
>    0:   b       8
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
>
> Unfortunately we still have to jump to this function. It would be great
> if we could inline this function at the call site but as I already said
> I don't know how to do that.

Ideally the bl instruction at the call site would be patched over with
sdiv/udiv when supported.  This would leave things exactly as they are
for hardware without div capability and incur only the call setup cost
(but no actual call) on div-capable hardware.  No, I don't know how to
achieve this.

-- 
M?ns Rullg?rd
mans at mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12  2:34     ` Stephen Boyd
@ 2013-11-12 14:01       ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:01 UTC (permalink / raw)
  To: Stephen Boyd
  Cc: linux-arm-kernel, linux-kernel, Jean-Christophe PLAGNIOL-VILLARD,
	Christopher Covington, Russell King - ARM Linux,
	Måns Rullgård, Rob Herring

On Mon, 11 Nov 2013, Stephen Boyd wrote:

> On 11/09/13 21:03, Nicolas Pitre wrote:
> > Bah..... NAK. We are doing runtime patching of the kernel for many
> > many things already. So why not do the same here?
> 
> static keys are a form of runtime patching, albeit not as extreme as
> you're suggesting.
> 
> >
> > The obvious strategy is to simply overwrite the start of the existing 
> > __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
> >
> > Similarly for the unsigned case.
> 
> I was thinking the same thing when I wrote this, but I didn't know how
> to tell the compiler to either inline this function or to let me inilne
> an assembly stub with some section magic.
> 
> >
> > That let you test the hardware capability only once during boot instead 
> > of everytime a divide operation is performed.
> 
> The test for hardware capability really isn't done more than once during
> boot. The assembly is like so at compile time
> 
> 00000000 <__aeabi_idiv>:
>    0:   nop     {0}
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
> 
> and after we test and find support for the instruction it will be
> replaced with
> 
> 00000000 <__aeabi_idiv>:
>    0:   b       8
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
> 
> Unfortunately we still have to jump to this function. It would be great
> if we could inline this function at the call site but as I already said
> I don't know how to do that.

What about this patch which I think is currently your best option.  Note 
it would need to use the facilities from asm/opcodes.h to make it endian 
agnostic.

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6a1b8a81b1..379cffe4ab 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
 		elf_hwcap |= HWCAP_IDIVT;
 	}
 
+	/*
+	 * Patch our division routines with the corresponding opcode
+	 * if the hardware supports it.
+	 */
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
+		extern char __aeabi_uidiv, __aeabi_idiv;
+		u16 *uidiv = (u16 *)&__aeabi_uidiv;
+		u16 *idiv = (u16 *)&__aeabi_idiv;
+
+		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
+		uidiv[1] = 0xf0f1;
+		uidiv[2] = 0x4770;  /* bx lr */
+
+		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
+		idiv[1] = 0xf0f1;  
+		idiv[2] = 0x4770;  /* bx lr */
+	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+		extern char __aeabi_uidiv, __aeabi_idiv;
+		u32 *uidiv = (u32 *)&__aeabi_uidiv;
+		u32 *idiv = (u32 *)&__aeabi_idiv;
+
+		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
+		uidiv[1] = 0xe12fff1e;  /* bx lr */
+
+		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
+		idiv[1] = 0xe12fff1e;  /* bx lr */
+	}
+
 	/* LPAE implies atomic ldrd/strd instructions */
 	vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
 	if (vmsa >= 5)


Nicolas

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:01       ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 11 Nov 2013, Stephen Boyd wrote:

> On 11/09/13 21:03, Nicolas Pitre wrote:
> > Bah..... NAK. We are doing runtime patching of the kernel for many
> > many things already. So why not do the same here?
> 
> static keys are a form of runtime patching, albeit not as extreme as
> you're suggesting.
> 
> >
> > The obvious strategy is to simply overwrite the start of the existing 
> > __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.  
> >
> > Similarly for the unsigned case.
> 
> I was thinking the same thing when I wrote this, but I didn't know how
> to tell the compiler to either inline this function or to let me inilne
> an assembly stub with some section magic.
> 
> >
> > That let you test the hardware capability only once during boot instead 
> > of everytime a divide operation is performed.
> 
> The test for hardware capability really isn't done more than once during
> boot. The assembly is like so at compile time
> 
> 00000000 <__aeabi_idiv>:
>    0:   nop     {0}
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
> 
> and after we test and find support for the instruction it will be
> replaced with
> 
> 00000000 <__aeabi_idiv>:
>    0:   b       8
>    4:   b       0 <___aeabi_idiv>
>    8:   sdiv    r0, r0, r1
>    c:   bx      lr
> 
> Unfortunately we still have to jump to this function. It would be great
> if we could inline this function at the call site but as I already said
> I don't know how to do that.

What about this patch which I think is currently your best option.  Note 
it would need to use the facilities from asm/opcodes.h to make it endian 
agnostic.

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6a1b8a81b1..379cffe4ab 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
 		elf_hwcap |= HWCAP_IDIVT;
 	}
 
+	/*
+	 * Patch our division routines with the corresponding opcode
+	 * if the hardware supports it.
+	 */
+	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
+		extern char __aeabi_uidiv, __aeabi_idiv;
+		u16 *uidiv = (u16 *)&__aeabi_uidiv;
+		u16 *idiv = (u16 *)&__aeabi_idiv;
+
+		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
+		uidiv[1] = 0xf0f1;
+		uidiv[2] = 0x4770;  /* bx lr */
+
+		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
+		idiv[1] = 0xf0f1;  
+		idiv[2] = 0x4770;  /* bx lr */
+	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+		extern char __aeabi_uidiv, __aeabi_idiv;
+		u32 *uidiv = (u32 *)&__aeabi_uidiv;
+		u32 *idiv = (u32 *)&__aeabi_idiv;
+
+		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
+		uidiv[1] = 0xe12fff1e;  /* bx lr */
+
+		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
+		idiv[1] = 0xe12fff1e;  /* bx lr */
+	}
+
 	/* LPAE implies atomic ldrd/strd instructions */
 	vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
 	if (vmsa >= 5)


Nicolas

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:01       ` Nicolas Pitre
@ 2013-11-12 14:04         ` Russell King - ARM Linux
  -1 siblings, 0 replies; 40+ messages in thread
From: Russell King - ARM Linux @ 2013-11-12 14:04 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Stephen Boyd, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Måns Rullgård, Rob Herring

On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
> What about this patch which I think is currently your best option.  Note 
> it would need to use the facilities from asm/opcodes.h to make it endian 
> agnostic.
> 
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 6a1b8a81b1..379cffe4ab 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>  		elf_hwcap |= HWCAP_IDIVT;
>  	}
>  
> +	/*
> +	 * Patch our division routines with the corresponding opcode
> +	 * if the hardware supports it.
> +	 */
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;
> +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
> +		u16 *idiv = (u16 *)&__aeabi_idiv;
> +
> +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
> +		uidiv[1] = 0xf0f1;
> +		uidiv[2] = 0x4770;  /* bx lr */
> +
> +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
> +		idiv[1] = 0xf0f1;  
> +		idiv[2] = 0x4770;  /* bx lr */
> +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;
> +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
> +		u32 *idiv = (u32 *)&__aeabi_idiv;
> +
> +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
> +		uidiv[1] = 0xe12fff1e;  /* bx lr */
> +
> +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
> +		idiv[1] = 0xe12fff1e;  /* bx lr */
> +	}

What about endianness, and what if XIP is enabled?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:04         ` Russell King - ARM Linux
  0 siblings, 0 replies; 40+ messages in thread
From: Russell King - ARM Linux @ 2013-11-12 14:04 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
> What about this patch which I think is currently your best option.  Note 
> it would need to use the facilities from asm/opcodes.h to make it endian 
> agnostic.
> 
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 6a1b8a81b1..379cffe4ab 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>  		elf_hwcap |= HWCAP_IDIVT;
>  	}
>  
> +	/*
> +	 * Patch our division routines with the corresponding opcode
> +	 * if the hardware supports it.
> +	 */
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;
> +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
> +		u16 *idiv = (u16 *)&__aeabi_idiv;
> +
> +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
> +		uidiv[1] = 0xf0f1;
> +		uidiv[2] = 0x4770;  /* bx lr */
> +
> +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
> +		idiv[1] = 0xf0f1;  
> +		idiv[2] = 0x4770;  /* bx lr */
> +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;
> +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
> +		u32 *idiv = (u32 *)&__aeabi_idiv;
> +
> +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
> +		uidiv[1] = 0xe12fff1e;  /* bx lr */
> +
> +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
> +		idiv[1] = 0xe12fff1e;  /* bx lr */
> +	}

What about endianness, and what if XIP is enabled?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:04         ` Russell King - ARM Linux
@ 2013-11-12 14:16           ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:16 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Stephen Boyd, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Måns Rullgård, Rob Herring

On Tue, 12 Nov 2013, Russell King - ARM Linux wrote:

> On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
> > What about this patch which I think is currently your best option.  Note 
> > it would need to use the facilities from asm/opcodes.h to make it endian 
> > agnostic.
> > 
> > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> > index 6a1b8a81b1..379cffe4ab 100644
> > --- a/arch/arm/kernel/setup.c
> > +++ b/arch/arm/kernel/setup.c
> > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> >  		elf_hwcap |= HWCAP_IDIVT;
> >  	}
> >  
> > +	/*
> > +	 * Patch our division routines with the corresponding opcode
> > +	 * if the hardware supports it.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> > +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
> > +		u16 *idiv = (u16 *)&__aeabi_idiv;
> > +
> > +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
> > +		uidiv[1] = 0xf0f1;
> > +		uidiv[2] = 0x4770;  /* bx lr */
> > +
> > +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
> > +		idiv[1] = 0xf0f1;  
> > +		idiv[2] = 0x4770;  /* bx lr */
> > +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> > +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
> > +		u32 *idiv = (u32 *)&__aeabi_idiv;
> > +
> > +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
> > +		uidiv[1] = 0xe12fff1e;  /* bx lr */
> > +
> > +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
> > +		idiv[1] = 0xe12fff1e;  /* bx lr */
> > +	}
> 
> What about endianness, and what if XIP is enabled?

Just as I said above the diff: this needs refined.

Obviously XIP can't use this and doesn't need it either as a XIP kernel 
should be optimized for the very platform it will run onto i.e. gcc 
should already emit those div opcodes inline if appropriate.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:16           ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 12 Nov 2013, Russell King - ARM Linux wrote:

> On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
> > What about this patch which I think is currently your best option.  Note 
> > it would need to use the facilities from asm/opcodes.h to make it endian 
> > agnostic.
> > 
> > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> > index 6a1b8a81b1..379cffe4ab 100644
> > --- a/arch/arm/kernel/setup.c
> > +++ b/arch/arm/kernel/setup.c
> > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> >  		elf_hwcap |= HWCAP_IDIVT;
> >  	}
> >  
> > +	/*
> > +	 * Patch our division routines with the corresponding opcode
> > +	 * if the hardware supports it.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> > +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
> > +		u16 *idiv = (u16 *)&__aeabi_idiv;
> > +
> > +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
> > +		uidiv[1] = 0xf0f1;
> > +		uidiv[2] = 0x4770;  /* bx lr */
> > +
> > +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
> > +		idiv[1] = 0xf0f1;  
> > +		idiv[2] = 0x4770;  /* bx lr */
> > +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> > +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
> > +		u32 *idiv = (u32 *)&__aeabi_idiv;
> > +
> > +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
> > +		uidiv[1] = 0xe12fff1e;  /* bx lr */
> > +
> > +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
> > +		idiv[1] = 0xe12fff1e;  /* bx lr */
> > +	}
> 
> What about endianness, and what if XIP is enabled?

Just as I said above the diff: this needs refined.

Obviously XIP can't use this and doesn't need it either as a XIP kernel 
should be optimized for the very platform it will run onto i.e. gcc 
should already emit those div opcodes inline if appropriate.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:04         ` Russell King - ARM Linux
@ 2013-11-12 14:17           ` Ben Dooks
  -1 siblings, 0 replies; 40+ messages in thread
From: Ben Dooks @ 2013-11-12 14:17 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Nicolas Pitre, Måns Rullgård, Stephen Boyd,
	linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD, linux-arm-kernel

On 12/11/13 14:04, Russell King - ARM Linux wrote:
> On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
>> What about this patch which I think is currently your best option.  Note
>> it would need to use the facilities from asm/opcodes.h to make it endian
>> agnostic.
>>
>> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
>> index 6a1b8a81b1..379cffe4ab 100644
>> --- a/arch/arm/kernel/setup.c
>> +++ b/arch/arm/kernel/setup.c
>> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>>   		elf_hwcap |= HWCAP_IDIVT;
>>   	}
>>
>> +	/*
>> +	 * Patch our division routines with the corresponding opcode
>> +	 * if the hardware supports it.
>> +	 */
>> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
>> +		extern char __aeabi_uidiv, __aeabi_idiv;
>> +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
>> +		u16 *idiv = (u16 *)&__aeabi_idiv;
>> +
>> +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
>> +		uidiv[1] = 0xf0f1;
>> +		uidiv[2] = 0x4770;  /* bx lr */
>> +
>> +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
>> +		idiv[1] = 0xf0f1;
>> +		idiv[2] = 0x4770;  /* bx lr */
>> +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
>> +		extern char __aeabi_uidiv, __aeabi_idiv;
>> +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
>> +		u32 *idiv = (u32 *)&__aeabi_idiv;
>> +
>> +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
>> +		uidiv[1] = 0xe12fff1e;  /* bx lr */
>> +
>> +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
>> +		idiv[1] = 0xe12fff1e;  /* bx lr */
>> +	}
>
> What about endianness, and what if XIP is enabled?

I was also going to add a note about endian-ness.

Given these are single instructoins for ARM, is it possible we could
make a table of all the callers and fix them up when we initialise
as we do for the SMP/UP case and for page-offset?


-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:17           ` Ben Dooks
  0 siblings, 0 replies; 40+ messages in thread
From: Ben Dooks @ 2013-11-12 14:17 UTC (permalink / raw)
  To: linux-arm-kernel

On 12/11/13 14:04, Russell King - ARM Linux wrote:
> On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote:
>> What about this patch which I think is currently your best option.  Note
>> it would need to use the facilities from asm/opcodes.h to make it endian
>> agnostic.
>>
>> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
>> index 6a1b8a81b1..379cffe4ab 100644
>> --- a/arch/arm/kernel/setup.c
>> +++ b/arch/arm/kernel/setup.c
>> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>>   		elf_hwcap |= HWCAP_IDIVT;
>>   	}
>>
>> +	/*
>> +	 * Patch our division routines with the corresponding opcode
>> +	 * if the hardware supports it.
>> +	 */
>> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
>> +		extern char __aeabi_uidiv, __aeabi_idiv;
>> +		u16 *uidiv = (u16 *)&__aeabi_uidiv;
>> +		u16 *idiv = (u16 *)&__aeabi_idiv;
>> +
>> +		uidiv[0] = 0xfbb0;  /* udiv r0, r0, r1 */
>> +		uidiv[1] = 0xf0f1;
>> +		uidiv[2] = 0x4770;  /* bx lr */
>> +
>> +		idiv[0] = 0xfb90;  /* sdiv r0, r0, r1 */
>> +		idiv[1] = 0xf0f1;
>> +		idiv[2] = 0x4770;  /* bx lr */
>> +	} else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
>> +		extern char __aeabi_uidiv, __aeabi_idiv;
>> +		u32 *uidiv = (u32 *)&__aeabi_uidiv;
>> +		u32 *idiv = (u32 *)&__aeabi_idiv;
>> +
>> +		uidiv[0] = 0xe730f110;  /* udiv r0, r0, r1 */
>> +		uidiv[1] = 0xe12fff1e;  /* bx lr */
>> +
>> +		idiv[0] = 0xe710f110;  /* sdiv r0, r0, r1 */
>> +		idiv[1] = 0xe12fff1e;  /* bx lr */
>> +	}
>
> What about endianness, and what if XIP is enabled?

I was also going to add a note about endian-ness.

Given these are single instructoins for ARM, is it possible we could
make a table of all the callers and fix them up when we initialise
as we do for the SMP/UP case and for page-offset?


-- 
Ben Dooks				http://www.codethink.co.uk/
Senior Engineer				Codethink - Providing Genius

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:01       ` Nicolas Pitre
@ 2013-11-12 14:22         ` Måns Rullgård
  -1 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 14:22 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Stephen Boyd, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Russell King - ARM Linux, Måns Rullgård, Rob Herring

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> What about this patch which I think is currently your best option.  Note 
> it would need to use the facilities from asm/opcodes.h to make it endian 
> agnostic.
>
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 6a1b8a81b1..379cffe4ab 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>  		elf_hwcap |= HWCAP_IDIVT;
>  	}
>
> +	/*
> +	 * Patch our division routines with the corresponding opcode
> +	 * if the hardware supports it.
> +	 */
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;

It would be safer to declare these as arrays of unspecified size.
Otherwise the compiler might do evil things with what to it looks like
out of bounds indexing.

There should also be some cache maintenance after this patching, or is
that already happening for some other reason?

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:22         ` Måns Rullgård
  0 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 14:22 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> What about this patch which I think is currently your best option.  Note 
> it would need to use the facilities from asm/opcodes.h to make it endian 
> agnostic.
>
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 6a1b8a81b1..379cffe4ab 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
>  		elf_hwcap |= HWCAP_IDIVT;
>  	}
>
> +	/*
> +	 * Patch our division routines with the corresponding opcode
> +	 * if the hardware supports it.
> +	 */
> +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> +		extern char __aeabi_uidiv, __aeabi_idiv;

It would be safer to declare these as arrays of unspecified size.
Otherwise the compiler might do evil things with what to it looks like
out of bounds indexing.

There should also be some cache maintenance after this patching, or is
that already happening for some other reason?

-- 
M?ns Rullg?rd
mans at mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:17           ` Ben Dooks
@ 2013-11-12 14:32             ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:32 UTC (permalink / raw)
  To: Ben Dooks
  Cc: Russell King - ARM Linux, Måns Rullgård, Stephen Boyd,
	linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD, linux-arm-kernel

On Tue, 12 Nov 2013, Ben Dooks wrote:

> Given these are single instructoins for ARM, is it possible we could
> make a table of all the callers and fix them up when we initialise
> as we do for the SMP/UP case and for page-offset?

Not really.  Calls to those functions are generated by the compiler 
implicitly when a divisor operand is used and therefore we cannot 
annotate those calls.  We'd have to use special accessors everywhere to 
replace the standard division operand (like we do for 64 by 32 bit 
divisions) but I doubt that people would accept that.

You cannot just scan the binary for the appropriate branch opcode either 
as you may turn up false positives in literal pools.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:32             ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 12 Nov 2013, Ben Dooks wrote:

> Given these are single instructoins for ARM, is it possible we could
> make a table of all the callers and fix them up when we initialise
> as we do for the SMP/UP case and for page-offset?

Not really.  Calls to those functions are generated by the compiler 
implicitly when a divisor operand is used and therefore we cannot 
annotate those calls.  We'd have to use special accessors everywhere to 
replace the standard division operand (like we do for 64 by 32 bit 
divisions) but I doubt that people would accept that.

You cannot just scan the binary for the appropriate branch opcode either 
as you may turn up false positives in literal pools.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:22         ` Måns Rullgård
@ 2013-11-12 14:36           ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:36 UTC (permalink / raw)
  To: Måns Rullgård
  Cc: Stephen Boyd, linux-arm-kernel, linux-kernel,
	Jean-Christophe PLAGNIOL-VILLARD, Christopher Covington,
	Russell King - ARM Linux, Rob Herring

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1224 bytes --]

On Tue, 12 Nov 2013, Måns Rullgård wrote:

> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
> 
> > What about this patch which I think is currently your best option.  Note 
> > it would need to use the facilities from asm/opcodes.h to make it endian 
> > agnostic.
> >
> > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> > index 6a1b8a81b1..379cffe4ab 100644
> > --- a/arch/arm/kernel/setup.c
> > +++ b/arch/arm/kernel/setup.c
> > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> >  		elf_hwcap |= HWCAP_IDIVT;
> >  	}
> >
> > +	/*
> > +	 * Patch our division routines with the corresponding opcode
> > +	 * if the hardware supports it.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> 
> It would be safer to declare these as arrays of unspecified size.
> Otherwise the compiler might do evil things with what to it looks like
> out of bounds indexing.

Right.

> 
> There should also be some cache maintenance after this patching, or is
> that already happening for some other reason?

This is so early during boot that the MMU isn't even fully initialized 
yet.  The cache will be flushed.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:36           ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:36 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 12 Nov 2013, M?ns Rullg?rd wrote:

> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
> 
> > What about this patch which I think is currently your best option.  Note 
> > it would need to use the facilities from asm/opcodes.h to make it endian 
> > agnostic.
> >
> > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> > index 6a1b8a81b1..379cffe4ab 100644
> > --- a/arch/arm/kernel/setup.c
> > +++ b/arch/arm/kernel/setup.c
> > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> >  		elf_hwcap |= HWCAP_IDIVT;
> >  	}
> >
> > +	/*
> > +	 * Patch our division routines with the corresponding opcode
> > +	 * if the hardware supports it.
> > +	 */
> > +	if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> > +		extern char __aeabi_uidiv, __aeabi_idiv;
> 
> It would be safer to declare these as arrays of unspecified size.
> Otherwise the compiler might do evil things with what to it looks like
> out of bounds indexing.

Right.

> 
> There should also be some cache maintenance after this patching, or is
> that already happening for some other reason?

This is so early during boot that the MMU isn't even fully initialized 
yet.  The cache will be flushed.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:32             ` Nicolas Pitre
@ 2013-11-12 14:40               ` Måns Rullgård
  -1 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 14:40 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Ben Dooks, Russell King - ARM Linux, Måns Rullgård,
	Stephen Boyd, linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD, linux-arm-kernel

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> On Tue, 12 Nov 2013, Ben Dooks wrote:
>
>> Given these are single instructoins for ARM, is it possible we could
>> make a table of all the callers and fix them up when we initialise
>> as we do for the SMP/UP case and for page-offset?
>
> Not really.  Calls to those functions are generated by the compiler 
> implicitly when a divisor operand is used and therefore we cannot 
> annotate those calls.  We'd have to use special accessors everywhere to 
> replace the standard division operand (like we do for 64 by 32 bit 
> divisions) but I doubt that people would accept that.

It might be possible to extract this information from relocation tables.

-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:40               ` Måns Rullgård
  0 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 14:40 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> On Tue, 12 Nov 2013, Ben Dooks wrote:
>
>> Given these are single instructoins for ARM, is it possible we could
>> make a table of all the callers and fix them up when we initialise
>> as we do for the SMP/UP case and for page-offset?
>
> Not really.  Calls to those functions are generated by the compiler 
> implicitly when a divisor operand is used and therefore we cannot 
> annotate those calls.  We'd have to use special accessors everywhere to 
> replace the standard division operand (like we do for 64 by 32 bit 
> divisions) but I doubt that people would accept that.

It might be possible to extract this information from relocation tables.

-- 
M?ns Rullg?rd
mans at mansr.com

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:40               ` Måns Rullgård
@ 2013-11-12 14:55                 ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:55 UTC (permalink / raw)
  To: Måns Rullgård
  Cc: Ben Dooks, Russell King - ARM Linux, Stephen Boyd, linux-kernel,
	Christopher Covington, Jean-Christophe PLAGNIOL-VILLARD,
	linux-arm-kernel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1093 bytes --]

On Tue, 12 Nov 2013, Måns Rullgård wrote:

> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
> 
> > On Tue, 12 Nov 2013, Ben Dooks wrote:
> >
> >> Given these are single instructoins for ARM, is it possible we could
> >> make a table of all the callers and fix them up when we initialise
> >> as we do for the SMP/UP case and for page-offset?
> >
> > Not really.  Calls to those functions are generated by the compiler 
> > implicitly when a divisor operand is used and therefore we cannot 
> > annotate those calls.  We'd have to use special accessors everywhere to 
> > replace the standard division operand (like we do for 64 by 32 bit 
> > divisions) but I doubt that people would accept that.
> 
> It might be possible to extract this information from relocation tables.

True, but only for individual .o files.  Once the linker puts them 
together the information is lost, and trying to infer what the linker 
has done is insane.

Filtering the compiler output to annotate idiv calls before it is 
assembled would probably be a better solution.

Is it worth it?  I'm not sure.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 14:55                 ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 14:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 12 Nov 2013, M?ns Rullg?rd wrote:

> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
> 
> > On Tue, 12 Nov 2013, Ben Dooks wrote:
> >
> >> Given these are single instructoins for ARM, is it possible we could
> >> make a table of all the callers and fix them up when we initialise
> >> as we do for the SMP/UP case and for page-offset?
> >
> > Not really.  Calls to those functions are generated by the compiler 
> > implicitly when a divisor operand is used and therefore we cannot 
> > annotate those calls.  We'd have to use special accessors everywhere to 
> > replace the standard division operand (like we do for 64 by 32 bit 
> > divisions) but I doubt that people would accept that.
> 
> It might be possible to extract this information from relocation tables.

True, but only for individual .o files.  Once the linker puts them 
together the information is lost, and trying to infer what the linker 
has done is insane.

Filtering the compiler output to annotate idiv calls before it is 
assembled would probably be a better solution.

Is it worth it?  I'm not sure.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:55                 ` Nicolas Pitre
@ 2013-11-12 15:20                   ` Nicolas Pitre
  -1 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 15:20 UTC (permalink / raw)
  To: Måns Rullgård
  Cc: Ben Dooks, Russell King - ARM Linux, Stephen Boyd, linux-kernel,
	Christopher Covington, Jean-Christophe PLAGNIOL-VILLARD,
	linux-arm-kernel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 732 bytes --]

On Tue, 12 Nov 2013, Nicolas Pitre wrote:

> On Tue, 12 Nov 2013, Måns Rullgård wrote:
> 
> > It might be possible to extract this information from relocation tables.
> 
> True, but only for individual .o files.  Once the linker puts them 
> together the information is lost, and trying to infer what the linker 
> has done is insane.
> 
> Filtering the compiler output to annotate idiv calls before it is 
> assembled would probably be a better solution.

Another solution is to patch the call site from within __aeabi_idiv at 
run time using lr.  That wouldn't work in the presence of tail call 
optimization though.

Again this might not be worth it and patching __aeabi_idiv instead might 
be a good enough compromize.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 15:20                   ` Nicolas Pitre
  0 siblings, 0 replies; 40+ messages in thread
From: Nicolas Pitre @ 2013-11-12 15:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 12 Nov 2013, Nicolas Pitre wrote:

> On Tue, 12 Nov 2013, M?ns Rullg?rd wrote:
> 
> > It might be possible to extract this information from relocation tables.
> 
> True, but only for individual .o files.  Once the linker puts them 
> together the information is lost, and trying to infer what the linker 
> has done is insane.
> 
> Filtering the compiler output to annotate idiv calls before it is 
> assembled would probably be a better solution.

Another solution is to patch the call site from within __aeabi_idiv at 
run time using lr.  That wouldn't work in the presence of tail call 
optimization though.

Again this might not be worth it and patching __aeabi_idiv instead might 
be a good enough compromize.


Nicolas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
  2013-11-12 14:55                 ` Nicolas Pitre
@ 2013-11-12 18:03                   ` Måns Rullgård
  -1 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 18:03 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Måns Rullgård, Ben Dooks, Russell King - ARM Linux,
	Stephen Boyd, linux-kernel, Christopher Covington,
	Jean-Christophe PLAGNIOL-VILLARD, linux-arm-kernel

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> On Tue, 12 Nov 2013, Måns Rullgård wrote:
>
>> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
>> 
>> > On Tue, 12 Nov 2013, Ben Dooks wrote:
>> >
>> >> Given these are single instructoins for ARM, is it possible we could
>> >> make a table of all the callers and fix them up when we initialise
>> >> as we do for the SMP/UP case and for page-offset?
>> >
>> > Not really.  Calls to those functions are generated by the compiler 
>> > implicitly when a divisor operand is used and therefore we cannot 
>> > annotate those calls.  We'd have to use special accessors everywhere to 
>> > replace the standard division operand (like we do for 64 by 32 bit 
>> > divisions) but I doubt that people would accept that.
>> 
>> It might be possible to extract this information from relocation tables.
>
> True, but only for individual .o files.  Once the linker puts them 
> together the information is lost, and trying to infer what the linker 
> has done is insane.
>
> Filtering the compiler output to annotate idiv calls before it is 
> assembled would probably be a better solution.

OK, here's an extremely ugly hootenanny of a patch.  It seems to work on
an A7 Cubieboard2.  I would never suggest actually doing this, but maybe
it can be useful for comparing performance against the more palatable
solutions.

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 7397db6..cf1cd30 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -113,7 +113,7 @@ endif
 endif
 
 # Need -Uarm for gcc < 3.x
-KBUILD_CFLAGS	+=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
+KBUILD_CFLAGS	+=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm -include asm/divhack.h
 KBUILD_AFLAGS	+=$(CFLAGS_ABI) $(AFLAGS_THUMB2) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float
 
 CHECKFLAGS	+= -D__arm__
diff --git a/arch/arm/include/asm/divhack.h b/arch/arm/include/asm/divhack.h
new file mode 100644
index 0000000..c750b78
--- /dev/null
+++ b/arch/arm/include/asm/divhack.h
@@ -0,0 +1,23 @@
+__asm__ (".macro dobl tgt                                       \n"
+         "    .ifc \\tgt, __aeabi_idiv                          \n"
+         "        .L.sdiv.\\@:                                  \n"
+         "        .pushsection .sdiv_tab.init, \"a\", %progbits \n"
+         "        .word    .L.sdiv.\\@                          \n"
+         "        .popsection                                   \n"
+         "    .endif                                            \n"
+         "    .ifc \\tgt, __aeabi_uidiv                         \n"
+         "        .L.udiv.\\@:                                  \n"
+         "        .pushsection .udiv_tab.init, \"a\", %progbits \n"
+         "        .word    .L.udiv.\\@                          \n"
+         "        .popsection                                   \n"
+         "    .endif                                            \n"
+         "    bl \\tgt                                          \n"
+         ".endm                                                 \n"
+         ".macro defbl                                          \n"
+         "    .macro bl tgt                                     \n"
+         "        .purgem bl                                    \n"
+         "        dobl \\tgt                                    \n"
+         "        defbl                                         \n"
+         "    .endm                                             \n"
+         ".endm                                                 \n"
+         "defbl                                                 \n");
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 067815c1..b3a3fe1 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -375,6 +375,18 @@ static void __init cpuid_init_hwcaps(void)
 	case 1:
 		elf_hwcap |= HWCAP_IDIVT;
 	}
+
+	if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+		extern u32 __sdiv_tab_start, __sdiv_tab_end;
+		extern u32 __udiv_tab_start, __udiv_tab_end;
+		u32 *div;
+
+		for (div = &__sdiv_tab_start; div < &__sdiv_tab_end; div++)
+			*(u32 *)*div = 0xe710f110;
+
+		for (div = &__udiv_tab_start; div < &__udiv_tab_end; div++)
+			*(u32 *)*div = 0xe730f110;
+	}
 }
 
 static void __init feat_v6_fixup(void)
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 43a31fb..3d5c103 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -176,6 +176,8 @@ SECTIONS
 		CON_INITCALL
 		SECURITY_INITCALL
 		INIT_RAM_FS
+		__sdiv_tab_start = .; *(.sdiv_tab.init); __sdiv_tab_end = .;
+		__udiv_tab_start = .; *(.udiv_tab.init); __udiv_tab_end = .;
 	}
 #ifndef CONFIG_XIP_KERNEL
 	.exit.data : {


-- 
Måns Rullgård
mans@mansr.com

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions
@ 2013-11-12 18:03                   ` Måns Rullgård
  0 siblings, 0 replies; 40+ messages in thread
From: Måns Rullgård @ 2013-11-12 18:03 UTC (permalink / raw)
  To: linux-arm-kernel

Nicolas Pitre <nicolas.pitre@linaro.org> writes:

> On Tue, 12 Nov 2013, M?ns Rullg?rd wrote:
>
>> Nicolas Pitre <nicolas.pitre@linaro.org> writes:
>> 
>> > On Tue, 12 Nov 2013, Ben Dooks wrote:
>> >
>> >> Given these are single instructoins for ARM, is it possible we could
>> >> make a table of all the callers and fix them up when we initialise
>> >> as we do for the SMP/UP case and for page-offset?
>> >
>> > Not really.  Calls to those functions are generated by the compiler 
>> > implicitly when a divisor operand is used and therefore we cannot 
>> > annotate those calls.  We'd have to use special accessors everywhere to 
>> > replace the standard division operand (like we do for 64 by 32 bit 
>> > divisions) but I doubt that people would accept that.
>> 
>> It might be possible to extract this information from relocation tables.
>
> True, but only for individual .o files.  Once the linker puts them 
> together the information is lost, and trying to infer what the linker 
> has done is insane.
>
> Filtering the compiler output to annotate idiv calls before it is 
> assembled would probably be a better solution.

OK, here's an extremely ugly hootenanny of a patch.  It seems to work on
an A7 Cubieboard2.  I would never suggest actually doing this, but maybe
it can be useful for comparing performance against the more palatable
solutions.

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 7397db6..cf1cd30 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -113,7 +113,7 @@ endif
 endif
 
 # Need -Uarm for gcc < 3.x
-KBUILD_CFLAGS	+=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
+KBUILD_CFLAGS	+=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm -include asm/divhack.h
 KBUILD_AFLAGS	+=$(CFLAGS_ABI) $(AFLAGS_THUMB2) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float
 
 CHECKFLAGS	+= -D__arm__
diff --git a/arch/arm/include/asm/divhack.h b/arch/arm/include/asm/divhack.h
new file mode 100644
index 0000000..c750b78
--- /dev/null
+++ b/arch/arm/include/asm/divhack.h
@@ -0,0 +1,23 @@
+__asm__ (".macro dobl tgt                                       \n"
+         "    .ifc \\tgt, __aeabi_idiv                          \n"
+         "        .L.sdiv.\\@:                                  \n"
+         "        .pushsection .sdiv_tab.init, \"a\", %progbits \n"
+         "        .word    .L.sdiv.\\@                          \n"
+         "        .popsection                                   \n"
+         "    .endif                                            \n"
+         "    .ifc \\tgt, __aeabi_uidiv                         \n"
+         "        .L.udiv.\\@:                                  \n"
+         "        .pushsection .udiv_tab.init, \"a\", %progbits \n"
+         "        .word    .L.udiv.\\@                          \n"
+         "        .popsection                                   \n"
+         "    .endif                                            \n"
+         "    bl \\tgt                                          \n"
+         ".endm                                                 \n"
+         ".macro defbl                                          \n"
+         "    .macro bl tgt                                     \n"
+         "        .purgem bl                                    \n"
+         "        dobl \\tgt                                    \n"
+         "        defbl                                         \n"
+         "    .endm                                             \n"
+         ".endm                                                 \n"
+         "defbl                                                 \n");
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 067815c1..b3a3fe1 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -375,6 +375,18 @@ static void __init cpuid_init_hwcaps(void)
 	case 1:
 		elf_hwcap |= HWCAP_IDIVT;
 	}
+
+	if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+		extern u32 __sdiv_tab_start, __sdiv_tab_end;
+		extern u32 __udiv_tab_start, __udiv_tab_end;
+		u32 *div;
+
+		for (div = &__sdiv_tab_start; div < &__sdiv_tab_end; div++)
+			*(u32 *)*div = 0xe710f110;
+
+		for (div = &__udiv_tab_start; div < &__udiv_tab_end; div++)
+			*(u32 *)*div = 0xe730f110;
+	}
 }
 
 static void __init feat_v6_fixup(void)
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 43a31fb..3d5c103 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -176,6 +176,8 @@ SECTIONS
 		CON_INITCALL
 		SECURITY_INITCALL
 		INIT_RAM_FS
+		__sdiv_tab_start = .; *(.sdiv_tab.init); __sdiv_tab_end = .;
+		__udiv_tab_start = .; *(.udiv_tab.init); __udiv_tab_end = .;
 	}
 #ifndef CONFIG_XIP_KERNEL
 	.exit.data : {


-- 
M?ns Rullg?rd
mans@mansr.com

^ permalink raw reply related	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2013-11-12 18:04 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-08 23:00 [PATCH v2] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions Stephen Boyd
2013-11-08 23:00 ` Stephen Boyd
2013-11-09  6:46 ` Matt Sealey
2013-11-09  6:46   ` Matt Sealey
2013-11-09 18:20   ` Måns Rullgård
2013-11-09 18:20     ` Måns Rullgård
2013-11-12  1:23   ` Stephen Boyd
2013-11-12  1:23     ` Stephen Boyd
2013-11-10  5:03 ` Nicolas Pitre
2013-11-10  5:03   ` Nicolas Pitre
2013-11-12  2:34   ` Stephen Boyd
2013-11-12  2:34     ` Stephen Boyd
2013-11-12 11:28     ` Måns Rullgård
2013-11-12 11:28       ` Måns Rullgård
2013-11-12 14:01     ` Nicolas Pitre
2013-11-12 14:01       ` Nicolas Pitre
2013-11-12 14:04       ` Russell King - ARM Linux
2013-11-12 14:04         ` Russell King - ARM Linux
2013-11-12 14:16         ` Nicolas Pitre
2013-11-12 14:16           ` Nicolas Pitre
2013-11-12 14:17         ` Ben Dooks
2013-11-12 14:17           ` Ben Dooks
2013-11-12 14:32           ` Nicolas Pitre
2013-11-12 14:32             ` Nicolas Pitre
2013-11-12 14:40             ` Måns Rullgård
2013-11-12 14:40               ` Måns Rullgård
2013-11-12 14:55               ` Nicolas Pitre
2013-11-12 14:55                 ` Nicolas Pitre
2013-11-12 15:20                 ` Nicolas Pitre
2013-11-12 15:20                   ` Nicolas Pitre
2013-11-12 18:03                 ` Måns Rullgård
2013-11-12 18:03                   ` Måns Rullgård
2013-11-12 14:22       ` Måns Rullgård
2013-11-12 14:22         ` Måns Rullgård
2013-11-12 14:36         ` Nicolas Pitre
2013-11-12 14:36           ` Nicolas Pitre
2013-11-11  7:46 ` Uwe Kleine-König
2013-11-11  7:46   ` Uwe Kleine-König
2013-11-12  2:35   ` Stephen Boyd
2013-11-12  2:35     ` Stephen Boyd

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.