* [PATCH 0/3] arm64: lib: improve copy performance
From: Yang Yingliang @ 2021-03-23  7:34 UTC (permalink / raw)
  To: linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun, yangyingliang

This patchset reduces the number of instructions executed in
copy_template.S to improve memory copy performance when the size
is greater than or equal to 64 bytes.

Yang Yingliang (3):
  arm64: lib: introduce ldp2/stp2 macro
  arm64: lib: improve copy performance when size is ge 128 bytes
  arm64: lib: improve copy performance when size is less than 128 and ge
    64 bytes

 arch/arm64/include/asm/asm-uaccess.h | 16 +++++++++
 arch/arm64/lib/copy_from_user.S      |  8 +++++
 arch/arm64/lib/copy_in_user.S        |  8 +++++
 arch/arm64/lib/copy_template.S       | 54 +++++++++++++++-------------
 arch/arm64/lib/copy_to_user.S        |  8 +++++
 arch/arm64/lib/memcpy.S              |  8 +++++
 6 files changed, 78 insertions(+), 24 deletions(-)

-- 
2.25.1


* [PATCH 1/3] arm64: lib: introduce ldp2/stp2 macro
From: Yang Yingliang @ 2021-03-23  7:34 UTC (permalink / raw)
  To: linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun, yangyingliang

Introduce ldp2/stp2 macros to load/store at explicit offsets
without incrementing src/dst.

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/include/asm/asm-uaccess.h | 16 ++++++++++++++++
 arch/arm64/lib/copy_from_user.S      |  8 ++++++++
 arch/arm64/lib/copy_in_user.S        |  8 ++++++++
 arch/arm64/lib/copy_to_user.S        |  8 ++++++++
 arch/arm64/lib/memcpy.S              |  8 ++++++++
 5 files changed, 48 insertions(+)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index ccedf548dac9..129c08621df1 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -72,6 +72,14 @@ alternative_else_nop_endif
 		_asm_extable	8889b,\l;
 	.endm
 
+	.macro user_ldp2 l, reg1, reg2, addr, post_inc1, post_inc2
+8888:		ldtr	\reg1, [\addr, \post_inc1];
+8889:		ldtr	\reg2, [\addr, \post_inc2];
+
+		_asm_extable	8888b,\l;
+		_asm_extable	8889b,\l;
+	.endm
+
 	.macro user_stp l, reg1, reg2, addr, post_inc
 8888:		sttr	\reg1, [\addr];
 8889:		sttr	\reg2, [\addr, #8];
@@ -81,6 +89,14 @@ alternative_else_nop_endif
 		_asm_extable	8889b,\l;
 	.endm
 
+	.macro user_stp2 l, reg1, reg2, addr, post_inc1, post_inc2
+8888:		sttr	\reg1, [\addr, \post_inc1];
+8889:		sttr	\reg2, [\addr, \post_inc2];
+
+		_asm_extable	8888b,\l;
+		_asm_extable	8889b,\l;
+	.endm
+
 	.macro user_ldst l, inst, reg, addr, post_inc
 8888:		\inst		\reg, [\addr];
 		add		\addr, \addr, \post_inc;
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 95cd62d67371..37308bcb338e 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -48,10 +48,18 @@
 	user_ldp 9998f, \reg1, \reg2, \ptr, \val
 	.endm
 
+	.macro ldp2 reg1, reg2, ptr, val1, val2
+	user_ldp2 9998f, \reg1, \reg2, \ptr, \val1, \val2
+	.endm
+
 	.macro stp1 reg1, reg2, ptr, val
 	stp \reg1, \reg2, [\ptr], \val
 	.endm
 
+	.macro stp2 reg1, reg2, ptr, val1, val2
+	stp \reg1, \reg2, [\ptr, \val1]
+	.endm
+
 end	.req	x5
 SYM_FUNC_START(__arch_copy_from_user)
 	add	end, x0, x2
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 1f61cd0df062..5654f7098102 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -49,10 +49,18 @@
 	user_ldp 9998f, \reg1, \reg2, \ptr, \val
 	.endm
 
+	.macro ldp2 reg1, reg2, ptr, val1, val2
+	user_ldp2 9998f, \reg1, \reg2, \ptr, \val1, \val2
+	.endm
+
 	.macro stp1 reg1, reg2, ptr, val
 	user_stp 9998f, \reg1, \reg2, \ptr, \val
 	.endm
 
+	.macro stp2 reg1, reg2, ptr, val1, val2
+	user_stp2 9998f, \reg1, \reg2, \ptr, \val1, \val2
+	.endm
+
 end	.req	x5
 
 SYM_FUNC_START(__arch_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 043da90f5dd7..a1f95169ce04 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -47,10 +47,18 @@
 	ldp \reg1, \reg2, [\ptr], \val
 	.endm
 
+	.macro ldp2 reg1, reg2, ptr, val1, val2
+	ldp \reg1, \reg2, [\ptr, \val1]
+	.endm
+
 	.macro stp1 reg1, reg2, ptr, val
 	user_stp 9998f, \reg1, \reg2, \ptr, \val
 	.endm
 
+	.macro stp2 reg1, reg2, ptr, val1, val2
+	user_stp2 9998f, \reg1, \reg2, \ptr, \val1, \val2
+	.endm
+
 end	.req	x5
 SYM_FUNC_START(__arch_copy_to_user)
 	add	end, x0, x2
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index dc8d2a216a6e..9e0bfefd2673 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -52,10 +52,18 @@
 	ldp \reg1, \reg2, [\ptr], \val
 	.endm
 
+	.macro ldp2 reg1, reg2, ptr, val1, val2
+	ldp \reg1, \reg2, [\ptr, \val1]
+	.endm
+
 	.macro stp1 reg1, reg2, ptr, val
 	stp \reg1, \reg2, [\ptr], \val
 	.endm
 
+	.macro stp2 reg1, reg2, ptr, val1, val2
+	stp \reg1, \reg2, [\ptr, \val1]
+	.endm
+
 SYM_FUNC_START_ALIAS(__memcpy)
 SYM_FUNC_START_WEAK_PI(memcpy)
 #include "copy_template.S"
-- 
2.25.1


* [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Yang Yingliang @ 2021-03-23  7:34 UTC (permalink / raw)
  To: linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun, yangyingliang

When copying more than 128 bytes, src/dst is incremented after
each ldp/stp instruction, which costs extra time. To improve
this, only increment src/dst after each 64-byte load or store.

Cost of copying 4096 bytes on Kunpeng920 (ms):
Without this patch:
memcpy: 143.85 copy_from_user: 172.69 copy_to_user: 199.23

With this patch:
memcpy: 107.12 copy_from_user: 157.50 copy_to_user: 198.85

This is about a 25% improvement in memcpy().
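
In C terms the change looks roughly like the sketch below (an
illustration only, not the actual code; copy16() is a hypothetical
stand-in for one ldp/stp pair):

	/* copy16(d, s): hypothetical helper copying one 16-byte chunk. */

	/* Before: src/dst post-incremented after every 16-byte chunk,
	 * so each access waits on the previous pointer update. */
	for (; count >= 64; count -= 64) {
		copy16(dst, src); src += 16; dst += 16;
		copy16(dst, src); src += 16; dst += 16;
		copy16(dst, src); src += 16; dst += 16;
		copy16(dst, src); src += 16; dst += 16;
	}

	/* After: fixed offsets, one pointer update per 64-byte block. */
	for (; count >= 64; count -= 64) {
		copy16(dst +  0, src +  0);
		copy16(dst + 16, src + 16);
		copy16(dst + 32, src + 32);
		copy16(dst + 48, src + 48);
		src += 64;
		dst += 64;
	}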

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..c3cd6f84c9c0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -152,29 +152,33 @@ D_h	.req	x14
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
 	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
+	ldp2	A_l, A_h, src, #0,  #8
+	ldp2	B_l, B_h, src, #16, #24
+	ldp2	C_l, C_h, src, #32, #40
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
 1:
 	/*
 	* interlace the load of next 64 bytes data block with store of the last
 	* loaded 64 bytes data.
 	*/
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
+	stp2	A_l, A_h, dst, #0,  #8
+	ldp2	A_l, A_h, src, #0,  #8
+	stp2	B_l, B_h, dst, #16, #24
+	ldp2	B_l, B_h, src, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	ldp2	C_l, C_h, src, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
+	add	dst, dst, #64
 	subs	count, count, #64
 	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
+	stp2	A_l, A_h, dst, #0,  #8
+	stp2	B_l, B_h, dst, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
-- 
2.25.1


* [PATCH 3/3] arm64: lib: improve copy performance when size is less than 128 and ge 64 bytes
From: Yang Yingliang @ 2021-03-23  7:34 UTC (permalink / raw)
  To: linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun, yangyingliang

When copying at least 64 but fewer than 128 bytes, increment
src/dst only after the whole 64-byte block has been loaded and
stored, to improve performance.

Cost of copying 127 bytes on Kunpeng920 (ms):
Without this patch:
memcpy: 14.62 copy_from_user: 14.23 copy_to_user: 14.42

With this patch:
memcpy: 13.85 copy_from_user: 13.26 copy_to_user: 13.84

This is about a 5.27% improvement in memcpy().

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/lib/copy_template.S | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index c3cd6f84c9c0..a9cbd47473f0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -132,14 +132,16 @@ D_h	.req	x14
 	* Less than 128 bytes to copy, so handle 64 here and then jump
 	* to the tail.
 	*/
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	D_l, D_h, src, #16
-	stp1	D_l, D_h, dst, #16
+	ldp2	A_l, A_h, src, #0,  #8
+	stp2	A_l, A_h, dst, #0,  #8
+	ldp2	B_l, B_h, src, #16, #24
+	ldp2	C_l, C_h, src, #32, #40
+	stp2	B_l, B_h, dst, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	ldp2	D_l, D_h, src, #48, #56
+	stp2	D_l, D_h, dst, #48, #56
+	add	src, src, #64
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
-- 
2.25.1


* Re: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Robin Murphy @ 2021-03-23 12:08 UTC (permalink / raw)
  To: Yang Yingliang, linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun

On 2021-03-23 07:34, Yang Yingliang wrote:
> When copying more than 128 bytes, src/dst is incremented after
> each ldp/stp instruction, which costs extra time. To improve
> this, only increment src/dst after each 64-byte load or store.

This breaks the required behaviour for copy_*_user(), since the fault 
handler expects the base address to be up-to-date at all times. Say 
you're copying 128 bytes and fault on the 4th store, it should return 80 
bytes not copied; the code below would return 128 bytes not copied, even 
though 48 bytes have actually been written to the destination.
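
To make the requirement concrete, here is a rough sketch of the
caller-side contract (not code from this patch; ubuf/kbuf are just
placeholder buffers):

	/* ubuf, kbuf: placeholder user/kernel buffers for illustration.
	 * copy_to_user() returns the number of bytes it did NOT copy. */
	unsigned long left = copy_to_user(ubuf, kbuf, 128);
	unsigned long done = 128 - left;

	/*
	 * If the fault hits the 4th 16-byte store, 'left' must be 80 so
	 * that 'done' == 48 matches what actually reached userspace;
	 * returning 128 would wrongly claim nothing was written.
	 */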

We've had a couple of tries at updating this code (because the whole 
template is frankly a bit terrible, and a long way from the 
well-optimised code it was derived from), but getting the fault-handling 
behaviour right without making the handler itself ludicrously complex 
has proven tricky. And then it got bumped down the priority list while 
the uaccess behaviour in general was in flux - now that the dust has 
largely settled on that I should probably try to find time to pick this 
up again...

Robin.

> Cost of copying 4096 bytes on Kunpeng920 (ms):
> Without this patch:
> memcpy: 143.85 copy_from_user: 172.69 copy_to_user: 199.23
> 
> With this patch:
> memcpy: 107.12 copy_from_user: 157.50 copy_to_user: 198.85
> 
> This is about a 25% improvement in memcpy().
> 
> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
> ---
>   arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
>   1 file changed, 20 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> index 488df234c49a..c3cd6f84c9c0 100644
> --- a/arch/arm64/lib/copy_template.S
> +++ b/arch/arm64/lib/copy_template.S
> @@ -152,29 +152,33 @@ D_h	.req	x14
>   	.p2align	L1_CACHE_SHIFT
>   .Lcpy_body_large:
>   	/* pre-get 64 bytes data. */
> -	ldp1	A_l, A_h, src, #16
> -	ldp1	B_l, B_h, src, #16
> -	ldp1	C_l, C_h, src, #16
> -	ldp1	D_l, D_h, src, #16
> +	ldp2	A_l, A_h, src, #0,  #8
> +	ldp2	B_l, B_h, src, #16, #24
> +	ldp2	C_l, C_h, src, #32, #40
> +	ldp2	D_l, D_h, src, #48, #56
> +	add	src, src, #64
>   1:
>   	/*
>   	* interlace the load of next 64 bytes data block with store of the last
>   	* loaded 64 bytes data.
>   	*/
> -	stp1	A_l, A_h, dst, #16
> -	ldp1	A_l, A_h, src, #16
> -	stp1	B_l, B_h, dst, #16
> -	ldp1	B_l, B_h, src, #16
> -	stp1	C_l, C_h, dst, #16
> -	ldp1	C_l, C_h, src, #16
> -	stp1	D_l, D_h, dst, #16
> -	ldp1	D_l, D_h, src, #16
> +	stp2	A_l, A_h, dst, #0,  #8
> +	ldp2	A_l, A_h, src, #0,  #8
> +	stp2	B_l, B_h, dst, #16, #24
> +	ldp2	B_l, B_h, src, #16, #24
> +	stp2	C_l, C_h, dst, #32, #40
> +	ldp2	C_l, C_h, src, #32, #40
> +	stp2	D_l, D_h, dst, #48, #56
> +	ldp2	D_l, D_h, src, #48, #56
> +	add	src, src, #64
> +	add	dst, dst, #64
>   	subs	count, count, #64
>   	b.ge	1b
> -	stp1	A_l, A_h, dst, #16
> -	stp1	B_l, B_h, dst, #16
> -	stp1	C_l, C_h, dst, #16
> -	stp1	D_l, D_h, dst, #16
> +	stp2	A_l, A_h, dst, #0,  #8
> +	stp2	B_l, B_h, dst, #16, #24
> +	stp2	C_l, C_h, dst, #32, #40
> +	stp2	D_l, D_h, dst, #48, #56
> +	add	dst, dst, #64
>   
>   	tst	count, #0x3f
>   	b.ne	.Ltail63
> 

* Re: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Will Deacon @ 2021-03-23 13:32 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Yang Yingliang, linux-arm-kernel, linux-kernel, catalin.marinas,
	guohanjun

On Tue, Mar 23, 2021 at 12:08:56PM +0000, Robin Murphy wrote:
> On 2021-03-23 07:34, Yang Yingliang wrote:
> > When copying more than 128 bytes, src/dst is incremented after
> > each ldp/stp instruction, which costs extra time. To improve
> > this, only increment src/dst after each 64-byte load or store.
> 
> This breaks the required behaviour for copy_*_user(), since the fault
> handler expects the base address to be up-to-date at all times. Say you're
> copying 128 bytes and fault on the 4th store, it should return 80 bytes not
> copied; the code below would return 128 bytes not copied, even though 48
> bytes have actually been written to the destination.
> 
> We've had a couple of tries at updating this code (because the whole
> template is frankly a bit terrible, and a long way from the well-optimised
> code it was derived from), but getting the fault-handling behaviour right
> without making the handler itself ludicrously complex has proven tricky. And
> then it got bumped down the priority list while the uaccess behaviour in
> general was in flux - now that the dust has largely settled on that I should
> probably try to find time to pick this up again...

I think the v5 from Oli was pretty close, but it didn't get any review:

https://lore.kernel.org/r/20200914151800.2270-1-oli.swede@arm.com

he also included tests:

https://lore.kernel.org/r/20200916104636.19172-1-oli.swede@arm.com

It would be great if you or somebody else has time to revive those!

Will

* Re: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Robin Murphy @ 2021-03-23 14:28 UTC (permalink / raw)
  To: Will Deacon
  Cc: Yang Yingliang, linux-arm-kernel, linux-kernel, catalin.marinas,
	guohanjun

On 2021-03-23 13:32, Will Deacon wrote:
> On Tue, Mar 23, 2021 at 12:08:56PM +0000, Robin Murphy wrote:
>> On 2021-03-23 07:34, Yang Yingliang wrote:
>>> When copying more than 128 bytes, src/dst is incremented after
>>> each ldp/stp instruction, which costs extra time. To improve
>>> this, only increment src/dst after each 64-byte load or store.
>>
>> This breaks the required behaviour for copy_*_user(), since the fault
>> handler expects the base address to be up-to-date at all times. Say you're
>> copying 128 bytes and fault on the 4th store, it should return 80 bytes not
>> copied; the code below would return 128 bytes not copied, even though 48
>> bytes have actually been written to the destination.
>>
>> We've had a couple of tries at updating this code (because the whole
>> template is frankly a bit terrible, and a long way from the well-optimised
>> code it was derived from), but getting the fault-handling behaviour right
>> without making the handler itself ludicrously complex has proven tricky. And
>> then it got bumped down the priority list while the uaccess behaviour in
>> general was in flux - now that the dust has largely settled on that I should
>> probably try to find time to pick this up again...
> 
> I think the v5 from Oli was pretty close, but it didn't get any review:
> 
> https://lore.kernel.org/r/20200914151800.2270-1-oli.swede@arm.com
> 
> he also included tests:
> 
> https://lore.kernel.org/r/20200916104636.19172-1-oli.swede@arm.com
> 
> It would be great if you or somebody else has time to revive those!

Yeah, we still have a ticket open for it. Since the uaccess overhaul has 
pretty much killed off any remaining value in the template idea, I might 
have a quick go at spinning a series to just update memcpy() and the 
other library routines to their shiny new versions, then come back and 
work on some dedicated usercopy routines built around LDTR/STTR (and the 
desired fault behaviour) as a follow-up.

(I was also holding out hope for copy_in_user() to disappear if we wait 
long enough, but apparently not yet...)

Robin.

* Re: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Catalin Marinas @ 2021-03-23 15:03 UTC (permalink / raw)
  To: Will Deacon
  Cc: Robin Murphy, Yang Yingliang, linux-arm-kernel, linux-kernel, guohanjun

On Tue, Mar 23, 2021 at 01:32:18PM +0000, Will Deacon wrote:
> On Tue, Mar 23, 2021 at 12:08:56PM +0000, Robin Murphy wrote:
> > On 2021-03-23 07:34, Yang Yingliang wrote:
> > > When copying more than 128 bytes, src/dst is incremented after
> > > each ldp/stp instruction, which costs extra time. To improve
> > > this, only increment src/dst after each 64-byte load or store.
> > 
> > This breaks the required behaviour for copy_*_user(), since the fault
> > handler expects the base address to be up-to-date at all times. Say you're
> > copying 128 bytes and fault on the 4th store, it should return 80 bytes not
> > copied; the code below would return 128 bytes not copied, even though 48
> > bytes have actually been written to the destination.
> > 
> > We've had a couple of tries at updating this code (because the whole
> > template is frankly a bit terrible, and a long way from the well-optimised
> > code it was derived from), but getting the fault-handling behaviour right
> > without making the handler itself ludicrously complex has proven tricky. And
> > then it got bumped down the priority list while the uaccess behaviour in
> > general was in flux - now that the dust has largely settled on that I should
> > probably try to find time to pick this up again...
> 
> I think the v5 from Oli was pretty close, but it didn't get any review:
> 
> https://lore.kernel.org/r/20200914151800.2270-1-oli.swede@arm.com

These are still unread in my inbox as I was planning to look at them
again. However, I think we discussed a few options on how to proceed but
no concrete plans:

1. Merge Oli's patches as they are, with some potential complexity
   issues as fixing the user copy accuracy was non-trivial. I think the
   latest version uses a two-stage approach: when taking a fault, it
   falls back to byte-by-byte with the expectation that it faults
   again and we can then report the correct fault address.

2. Only use Cortex Strings for in-kernel memcpy() while the uaccess
   routines are some simple loops that align the uaccess part only
   (unlike Cortex Strings, which usually aligns the source).

3. Similar to 2 but with Cortex Strings imported automatically with some
   script to make it easier to keep the routines up to date.

If having non-optimal (but good enough) uaccess routines is acceptable,
I'd go for (2) with a plan to move to (3) at the next Cortex Strings
update.

I also need to look again at option (1) to see how complex it is but
given the time one spends on importing a new Cortex Strings library, I
don't think (1) scales well in the long term. We could, however, go for
(1) now and look at (3) with the next update.

-- 
Catalin

* RE: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: David Laight @ 2021-03-24 16:38 UTC (permalink / raw)
  To: 'Robin Murphy', Yang Yingliang, linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun

From: Robin Murphy
> Sent: 23 March 2021 12:09
> 
> On 2021-03-23 07:34, Yang Yingliang wrote:
> > When copying more than 128 bytes, src/dst is incremented after
> > each ldp/stp instruction, which costs extra time. To improve
> > this, only increment src/dst after each 64-byte load or store.
> 
> This breaks the required behaviour for copy_*_user(), since the fault
> handler expects the base address to be up-to-date at all times. Say
> you're copying 128 bytes and fault on the 4th store, it should return 80
> bytes not copied; the code below would return 128 bytes not copied, even
> though 48 bytes have actually been written to the destination.

Are there any non-superscalar arm64 CPUs (that anyone cares about)?

If the cpu can execute multiple instructions in one clock
then it is usually possible to get the loop control (almost) free.

You might need to unroll once to interleave read/write
but any more may be pointless.
So something like:
	a = *src++;
	do {
		b = *src++;
		*dst++ = a;
		a = *src++;
		*dst++ = b;
	} while (src != lim);
	*dst++ = a;	/* 'a' holds the last element loaded in the loop */

    David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

* Re: [PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes
From: Robin Murphy @ 2021-03-24 19:36 UTC (permalink / raw)
  To: David Laight, Yang Yingliang, linux-arm-kernel, linux-kernel
  Cc: catalin.marinas, will, guohanjun

On 2021-03-24 16:38, David Laight wrote:
> From: Robin Murphy
>> Sent: 23 March 2021 12:09
>>
>> On 2021-03-23 07:34, Yang Yingliang wrote:
>>> When copying more than 128 bytes, src/dst is incremented after
>>> each ldp/stp instruction, which costs extra time. To improve
>>> this, only increment src/dst after each 64-byte load or store.
>>
>> This breaks the required behaviour for copy_*_user(), since the fault
>> handler expects the base address to be up-to-date at all times. Say
>> you're copying 128 bytes and fault on the 4th store, it should return 80
>> bytes not copied; the code below would return 128 bytes not copied, even
>> though 48 bytes have actually been written to the destination.
> 
> Are there any non-superscalar arm64 CPUs (that anyone cares about)?
> 
> If the cpu can execute multiple instructions in one clock
> then it is usually possible to get the loop control (almost) free.
> 
> You might need to unroll once to interleave read/write
> but any more may be pointless.

Nah, the whole point is that using post-increment addressing is crap in 
the first place because it introduces register dependencies between each 
access that could be avoided entirely if we could use offset addressing 
(and especially crap when we don't even *have* a post-index addressing 
mode for the unprivileged load/store instructions used in copy_*_user() 
and have to simulate it with extra instructions that throw off the code 
alignment).
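
Roughly, in C-ish terms (load16() is a hypothetical stand-in for a
single ldp, purely for illustration):

	/* Post-index form: each address depends on the previous base
	 * register update, serialising address generation. */
	a = load16(src); src += 16;
	b = load16(src); src += 16;	/* waits for the first src += 16 */

	/* Offset form: the addresses are independent, so the loads can
	 * issue in parallel and the base is updated once. */
	a = load16(src);
	b = load16(src + 16);
	src += 32;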

We already have code that's tuned to work well across our 
microarchitectures[1], the issue is that butchering it to satisfy the 
additional requirements of copy_*_user() with a common template has 
hobbled regular memcpy() performance. I intend to have a crack at fixing 
that properly tomorrow ;)

Robin.

[1] https://github.com/ARM-software/optimized-routines

> So something like:
> 	a = *src++;
> 	do {
> 		b = *src++;
> 		*dst++ = a;
> 		a = *src++;
> 		*dst++ = b;
> 	} while (src != lim);
> 	*dst++ = a;	/* 'a' holds the last element loaded in the loop */
> 
>      David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
> 
