linuxppc-dev.lists.ozlabs.org archive mirror
* [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
@ 2016-03-01  5:55 Cyril Bur
  2016-03-01  5:55 ` [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX Cyril Bur
  2016-03-09 23:01 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Michael Neuling
  0 siblings, 2 replies; 8+ messages in thread
From: Cyril Bur @ 2016-03-01  5:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mikey, anton

Currently the assembly to save and restore Altivec registers boils down to
a load immediate of the offset of the specific Altivec register in memory
followed by the load/store which repeats in sequence for each Altivec
register.

This patch attempts to do better by loading up four registers with
immediates so that the loads and stores can be batched up and better
pipelined by the processor.

This patch results in four load/stores in sequence and one add between
groups of four. Also, by using a pair of base registers it means that the
result of the add is not needed by the following instruction.
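
For illustration, SAVE_32VRS(r4,r5,r6,r7,r8,r3) now expands to roughly the
following for the first two groups (the later groups repeat the pattern and
the final subi hands the original base back to the caller):

	addi	r8,r3,64	/* second base, 64 bytes ahead of r3 */
	li	r4,0; li r5,16; li r6,32; li r7,48
	stvx	v0,r3,r4	/* VRs 0-3 store relative to r3 */
	stvx	v1,r3,r5
	stvx	v2,r3,r6
	stvx	v3,r3,r7
	addi	r3,r3,128	/* bump r3 while ... */
	stvx	v4,r8,r4	/* ... VRs 4-7 store relative to r8 */
	stvx	v5,r8,r5
	stvx	v6,r8,r6
	stvx	v7,r8,r7
	addi	r8,r8,128
	...
	subi	r3,r3,384	/* restore the caller's base */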

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
These patches need to be applied on top of my rework of FPU/VMX/VSX
switching: https://patchwork.ozlabs.org/patch/589703/

I left in some of my comments indicating if functions are called from C or
not. Looking at them now, they might be a bit much, let me know what you
think.

Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.


 arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
 arch/powerpc/kernel/tm.S           |  6 ++--
 arch/powerpc/kernel/vector.S       | 20 +++++++++---
 3 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 499d9f8..5ba69ed 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define REST_16FPRS(n, base)	REST_8FPRS(n, base); REST_8FPRS(n+8, base)
 #define REST_32FPRS(n, base)	REST_16FPRS(n, base); REST_16FPRS(n+16, base)
 
-#define SAVE_VR(n,b,base)	li b,16*(n);  stvx n,base,b
-#define SAVE_2VRS(n,b,base)	SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
-#define SAVE_4VRS(n,b,base)	SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
-#define SAVE_8VRS(n,b,base)	SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
-#define SAVE_16VRS(n,b,base)	SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
-#define SAVE_32VRS(n,b,base)	SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
-#define REST_VR(n,b,base)	li b,16*(n); lvx n,base,b
-#define REST_2VRS(n,b,base)	REST_VR(n,b,base); REST_VR(n+1,b,base)
-#define REST_4VRS(n,b,base)	REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
-#define REST_8VRS(n,b,base)	REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
-#define REST_16VRS(n,b,base)	REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
-#define REST_32VRS(n,b,base)	REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)
+#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
+	stvx n,base,off0; \
+	stvx n+1,base,off1; \
+	stvx n+2,base,off2; \
+	stvx n+3,base,off3
+
+/* Restores the base for the caller */
+#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
+	__SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
+
+#define __REST_4VRS(n,off0,off1,off2,off3,base) \
+	lvx n,base,off0; \
+	lvx n+1,base,off1; \
+	lvx n+2,base,off2; \
+	lvx n+3,base,off3
+
+/* Restores the base for the caller */
+#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
+	__REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
 
 #ifdef __BIG_ENDIAN__
 #define STXVD2X_ROT(n,b,base)		STXVD2X(n,b,base)
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index bf8f34a..81e1305 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
  * they will abort back to the checkpointed state we save out here.
  *
  * Call with IRQs off, stacks get all out of sync for some periods in here!
+ *
+ * Is called from C
  */
 _GLOBAL(tm_reclaim)
 	mfcr	r6
@@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
 	beq	dont_backup_vec
 
 	addi	r7, r3, THREAD_TRANSACT_VRSTATE
-	SAVE_32VRS(0, r6, r7)	/* r6 scratch, r7 transact vr state */
+	SAVE_32VRS(r6,r8,r9,r10,r11,r7)	/* r6,r8,r9,r10,r11 scratch, r7 transact vr state */
 	mfvscr	v0
 	li	r6, VRSTATE_VSCR
 	stvx	v0, r7, r6
@@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
 	li	r5, VRSTATE_VSCR
 	lvx	v0, r8, r5
 	mtvscr	v0
-	REST_32VRS(0, r5, r8)			/* r5 scratch, r8 ptr */
+	REST_32VRS(r5,r6,r9,r10,r11,r8)			/* r5,r6,r9,r10,r11 scratch, r8 ptr */
 dont_restore_vec:
 	ld	r5, THREAD_VRSAVE(r3)
 	mtspr	SPRN_VRSAVE, r5
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 1c2e7a3..8d587fb 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -13,6 +13,8 @@
  * This is similar to load_up_altivec but for the transactional version of the
  * vector regs.  It doesn't mess with the task MSR or valid flags.
  * Furthermore, VEC laziness is not supported with TM currently.
+ *
+ * Is called from C
  */
 _GLOBAL(do_load_up_transact_altivec)
 	mfmsr	r6
@@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
 	lvx	v0,r10,r3
 	mtvscr	v0
 	addi	r10,r3,THREAD_TRANSACT_VRSTATE
-	REST_32VRS(0,r4,r10)
+	REST_32VRS(r4,r5,r6,r7,r8,r10)
 
 	blr
 #endif
@@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
 /*
  * Load state from memory into VMX registers including VSCR.
  * Assumes the caller has enabled VMX in the MSR.
+ *
+ * Is called from C
  */
 _GLOBAL(load_vr_state)
 	li	r4,VRSTATE_VSCR
 	lvx	v0,r4,r3
 	mtvscr	v0
-	REST_32VRS(0,r4,r3)
+	REST_32VRS(r4,r5,r6,r7,r8,r3)
 	blr
 
 /*
  * Store VMX state into memory, including VSCR.
  * Assumes the caller has enabled VMX in the MSR.
+ *
+ * NOT called from C
  */
 _GLOBAL(store_vr_state)
-	SAVE_32VRS(0, r4, r3)
+	SAVE_32VRS(r4,r5,r6,r7,r8,r3)
 	mfvscr	v0
 	li	r4, VRSTATE_VSCR
 	stvx	v0, r4, r3
@@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
  *
  * Note that on 32-bit this can only use registers that will be
  * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
+ *
+ * NOT called from C
  */
 _GLOBAL(load_up_altivec)
 	mfmsr	r5			/* grab the current MSR */
@@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
 	stw	r4,THREAD_USED_VR(r5)
 	lvx	v0,r10,r6
 	mtvscr	v0
-	REST_32VRS(0,r4,r6)
+	REST_32VRS(r3,r4,r5,r10,r11,r6)
 	/* restore registers and return */
 	blr
 
 /*
  * save_altivec(tsk)
  * Save the vector registers to its thread_struct
+ *
+ * Is called from C
  */
 _GLOBAL(save_altivec)
 	addi	r3,r3,THREAD		/* want THREAD of task */
@@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
 	PPC_LCMPI	0,r7,0
 	bne	2f
 	addi	r7,r3,THREAD_VRSTATE
-2:	SAVE_32VRS(0,r4,r7)
+2:	SAVE_32VRS(r4,r5,r6,r8,r9,r7)
 	mfvscr	v0
 	li	r4,VRSTATE_VSCR
 	stvx	v0,r4,r7
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX
  2016-03-01  5:55 [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Cyril Bur
@ 2016-03-01  5:55 ` Cyril Bur
  2016-03-10  0:09   ` Michael Neuling
  2016-03-09 23:01 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Michael Neuling
  1 sibling, 1 reply; 8+ messages in thread
From: Cyril Bur @ 2016-03-01  5:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mikey, anton

Currently the assembly to save and restore VSX registers boils down to a
load immediate of the offset of the specific VSX register in memory
followed by the load/store which repeats in sequence for each VSX register.

This patch attempts to do better by loading up four registers with
immediates so that the loads and stores can be batched up and better
pipelined by the processor.

This patch results in four load/stores in sequence and one add between
groups of four. Also, by using a pair of base registers it means that the
result of the add is not needed by the following instruction.

Due to the overlapping layout of FPU registers and VSX registers on POWER
chips, this patch also benefits FPU loads and stores when VSX is compiled
in and the CPU is VSX capable.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
 arch/powerpc/include/asm/ppc_asm.h | 65 ++++++++++++++++++++++++++++++--------
 arch/powerpc/kernel/fpu.S          | 43 ++++++++++++++++---------
 arch/powerpc/kernel/tm.S           | 46 ++++++++++++++-------------
 3 files changed, 104 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 5ba69ed..dd0df12 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -173,19 +173,58 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define LXVD2X_ROT(n,b,base)		LXVD2X(n,b,base);	\
 					XXSWAPD(n,n)
 #endif
-/* Save the lower 32 VSRs in the thread VSR region */
-#define SAVE_VSR(n,b,base)	li b,16*(n);  STXVD2X_ROT(n,R##base,R##b)
-#define SAVE_2VSRS(n,b,base)	SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base)
-#define SAVE_4VSRS(n,b,base)	SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base)
-#define SAVE_8VSRS(n,b,base)	SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base)
-#define SAVE_16VSRS(n,b,base)	SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base)
-#define SAVE_32VSRS(n,b,base)	SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base)
-#define REST_VSR(n,b,base)	li b,16*(n); LXVD2X_ROT(n,R##base,R##b)
-#define REST_2VSRS(n,b,base)	REST_VSR(n,b,base); REST_VSR(n+1,b,base)
-#define REST_4VSRS(n,b,base)	REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base)
-#define REST_8VSRS(n,b,base)	REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base)
-#define REST_16VSRS(n,b,base)	REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base)
-#define REST_32VSRS(n,b,base)	REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base)
+
+#define __SAVE_4VSRS(n,off0,off1,off2,off3,base) \
+	STXVD2X_ROT(n,R##base,R##off0); \
+	STXVD2X_ROT(n+1,R##base,R##off1); \
+	STXVD2X_ROT(n+2,R##base,R##off2); \
+	STXVD2X_ROT(n+3,R##base,R##off3)
+
+/* Restores the base for the caller */
+#define SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__SAVE_4VSRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(24,reg0,reg1,reg2,reg3,base); \
+	__SAVE_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
+
+#define __REST_4VSRS(n,off0,off1,off2,off3,base) \
+	LXVD2X_ROT(n,R##base,R##off0); \
+	LXVD2X_ROT(n+1,R##base,R##off1); \
+	LXVD2X_ROT(n+2,R##base,R##off2); \
+	LXVD2X_ROT(n+3,R##base,R##off3)
+
+/* Restores the base for the caller */
+#define REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__REST_4VSRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(24,reg0,reg1,reg2,reg3,base); \
+	__REST_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
 
 /*
  * b = base register for addressing, o = base offset from register of 1st EVR
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 15da2b5..dc57ff1 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -26,29 +26,32 @@
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_VSX
-#define __REST_32FPVSRS(n,c,base)					\
+#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	\
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	REST_32FPRS(n,base);						\
+	REST_32FPRS(0,base);						\
 	b	3f;							\
-2:	REST_32VSRS(n,c,base);						\
+2:	REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
 
-#define __SAVE_32FPVSRS(n,c,base)					\
+#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	SAVE_32FPRS(n,base);						\
+	SAVE_32FPRS(0,base);						\
 	b	3f;							\
-2:	SAVE_32VSRS(n,c,base);						\
+2:	SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
 #else
-#define __REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
-#define __SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
+#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	REST_32FPRS(0, base)
+#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	SAVE_32FPRS(0, base)
 #endif
-#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
-#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+#define REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__REST_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
+
+#define SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__SAVE_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 /* void do_load_up_transact_fpu(struct thread_struct *thread)
@@ -56,6 +59,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
  * This is similar to load_up_fpu but for the transactional version of the FP
  * register set.  It doesn't mess with the task MSR or valid flags.
  * Furthermore, we don't do lazy FP with TM currently.
+ *
+ * Is called from C
  */
 _GLOBAL(do_load_up_transact_fpu)
 	mfmsr	r6
@@ -71,7 +76,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	addi	r7,r3,THREAD_TRANSACT_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r7)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R7)
+	REST_32FPVSRS(R4,R5,R6,R8,R9,R7)
 
 	blr
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
@@ -79,19 +84,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 /*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.
+ *
+ * Is called from C
  */
 _GLOBAL(load_fp_state)
 	lfd	fr0,FPSTATE_FPSCR(r3)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R3)
+	REST_32FPVSRS(R4,R5,R6,R7,R8,R3)
 	blr
 
 /*
  * Store FP state into memory, including FPSCR
  * Assumes the caller has enabled FP in the MSR.
+ *
+ * NOT called from C
  */
 _GLOBAL(store_fp_state)
-	SAVE_32FPVSRS(0, R4, R3)
+	SAVE_32FPVSRS(R4,R5,R6,R7,R8,R3)
 	mffs	fr0
 	stfd	fr0,FPSTATE_FPSCR(r3)
 	blr
@@ -104,6 +113,8 @@ _GLOBAL(store_fp_state)
  * enable the FPU for the current task and return to the task.
  * Note that on 32-bit this can only use registers that will be
  * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
+ *
+ * NOT called from C
  */
 _GLOBAL(load_up_fpu)
 	mfmsr	r5
@@ -137,7 +148,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	addi	r10,r5,THREAD_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R10)
+	REST_32FPVSRS(R3,R4,R5,R6,R11,R10)
 	/* restore registers and return */
 	/* we haven't used ctr or xer or lr */
 	blr
@@ -146,6 +157,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
  * save_fpu(tsk)
  * Save the floating-point registers in its thread_struct.
  * Enables the FPU for use in the kernel on return.
+ *
+ * Is called from C
  */
 _GLOBAL(save_fpu)
 	addi	r3,r3,THREAD	        /* want THREAD of task */
@@ -154,7 +167,7 @@ _GLOBAL(save_fpu)
 	PPC_LCMPI	0,r6,0
 	bne	2f
 	addi	r6,r3,THREAD_FPSTATE
-2:	SAVE_32FPVSRS(0, R4, R6)
+2:	SAVE_32FPVSRS(R4,R5,R7,R8,R9,R6)
 	mffs	fr0
 	stfd	fr0,FPSTATE_FPSCR(r6)
 	blr
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 81e1305..61900b8 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -14,30 +14,32 @@
 
 #ifdef CONFIG_VSX
 /* See fpu.S, this is borrowed from there */
-#define __SAVE_32FPRS_VSRS(n,c,base)		\
-BEGIN_FTR_SECTION				\
-	b	2f;				\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
-	SAVE_32FPRS(n,base);			\
-	b	3f;				\
-2:	SAVE_32VSRS(n,c,base);			\
+#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	SAVE_32FPRS(0,base);						\
+	b	3f;							\
+2:	SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
-#define __REST_32FPRS_VSRS(n,c,base)		\
-BEGIN_FTR_SECTION				\
-	b	2f;				\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
-	REST_32FPRS(n,base);			\
-	b	3f;				\
-2:	REST_32VSRS(n,c,base);			\
+
+#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	\
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	REST_32FPRS(0,base);						\
+	b	3f;							\
+2:	REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
+
 #else
-#define __SAVE_32FPRS_VSRS(n,c,base)	SAVE_32FPRS(n, base)
-#define __REST_32FPRS_VSRS(n,c,base)	REST_32FPRS(n, base)
+#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	SAVE_32FPRS(0, base)
+#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	REST_32FPRS(0, base)
 #endif
-#define SAVE_32FPRS_VSRS(n,c,base) \
-	__SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
-#define REST_32FPRS_VSRS(n,c,base) \
-	__REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+#define SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__SAVE_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
+#define REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__REST_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
 
 /* Stack frame offsets for local variables. */
 #define TM_FRAME_L0	TM_FRAME_SIZE-16
@@ -165,7 +167,7 @@ dont_backup_vec:
 	beq	dont_backup_fp
 
 	addi	r7, r3, THREAD_TRANSACT_FPSTATE
-	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 transact fp state */
+	SAVE_32FPRS_VSRS(R6,R8,R9,R10,R11,R7) /* r6,r8,r9,r10,r11 scratch, r7 transact fp state */
 
 	mffs    fr0
 	stfd    fr0,FPSTATE_FPSCR(r7)
@@ -375,7 +377,7 @@ dont_restore_vec:
 	addi	r8, r3, THREAD_FPSTATE
 	lfd	fr0, FPSTATE_FPSCR(r8)
 	MTFSF_L(fr0)
-	REST_32FPRS_VSRS(0, R4, R8)
+	REST_32FPRS_VSRS(R4,R5,R6,R7,R9,R8)
 
 dont_restore_fp:
 	mtmsr	r6				/* FP/Vec off again! */
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
  2016-03-01  5:55 [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Cyril Bur
  2016-03-01  5:55 ` [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX Cyril Bur
@ 2016-03-09 23:01 ` Michael Neuling
  2016-03-10  0:56   ` Cyril Bur
  2016-03-10  5:37   ` Cyril Bur
  1 sibling, 2 replies; 8+ messages in thread
From: Michael Neuling @ 2016-03-09 23:01 UTC (permalink / raw)
  To: Cyril Bur, linuxppc-dev; +Cc: anton

On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:

> Currently the assembly to save and restore Altivec registers boils down to
> a load immediate of the offset of the specific Altivec register in memory
> followed by the load/store which repeats in sequence for each Altivec
> register.
>
> This patch attempts to do better by loading up four registers with
> immediates so that the loads and stores can be batched up and better
> pipelined by the processor.
>
> This patch results in four load/stores in sequence and one add between
> groups of four. Also, by using a pair of base registers it means that the
> result of the add is not needed by the following instruction.

What's the performance improvement?

> Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> ---
> These patches need to be applied on top of my rework of FPU/VMX/VSX
> switching: https://patchwork.ozlabs.org/patch/589703/
>
> I left in some of my comments indicating if functions are called from C or
> not. Looking at them now, they might be a bit much, let me know what you
> think.

I think that's ok, although they are likely to get stale quickly.

>
> Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
>
>
>  arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
>  arch/powerpc/kernel/tm.S           |  6 ++--
>  arch/powerpc/kernel/vector.S       | 20 +++++++++---
>  3 files changed, 70 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> index 499d9f8..5ba69ed 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  #define REST_16FPRS(n, base)  	  REST_8FPRS(n, base); REST_8FPRS(n+8, base)
>  #define REST_32FPRS(n, base)  	  REST_16FPRS(n, base); REST_16FPRS(n+16, base)
>
> -#define SAVE_VR(n,b,base)  	  li b,16*(n);  stvx n,base,b
> -#define SAVE_2VRS(n,b,base)  	  SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> -#define SAVE_4VRS(n,b,base)  	  SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
> -#define SAVE_8VRS(n,b,base)  	  SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
> -#define SAVE_16VRS(n,b,base)  	  SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
> -#define SAVE_32VRS(n,b,base)  	  SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
> -#define REST_VR(n,b,base)  	  li b,16*(n); lvx n,base,b
> -#define REST_2VRS(n,b,base)  	  REST_VR(n,b,base); REST_VR(n+1,b,base)
> -#define REST_4VRS(n,b,base)  	  REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
> -#define REST_8VRS(n,b,base)  	  REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
> -#define REST_16VRS(n,b,base)  	  REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
> -#define REST_32VRS(n,b,base)  	  REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)

Can you use consistent names between off and reg in the below?

> +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> +  	  stvx n,base,off0; \
> +  	  stvx n+1,base,off1; \
> +  	  stvx n+2,base,off2; \
> +  	  stvx n+3,base,off3
> +
> +/* Restores the base for the caller */

Can you make this:
/* Base: non-volatile, reg[0-4]: volatile */

> +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> +  	  addi reg4,base,64; \
> +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> +  	  __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> +  	  __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> +  	  subi base,base,384

You can swap these last two lines which will make base reuse quicker
later.  Although that might not be needed.
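i.e. something like this (untested sketch of the reordering):

	__SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
	subi base,base,384; \
	__SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4)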

> +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> +  	  lvx n,base,off0; \
> +  	  lvx n+1,base,off1; \
> +  	  lvx n+2,base,off2; \
> +  	  lvx n+3,base,off3
> +
> +/* Restores the base for the caller */
> +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> +  	  addi reg4,base,64; \
> +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> +  	  __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> +  	  __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> +  	  subi base,base,384
>
>  #ifdef __BIG_ENDIAN__
>  #define STXVD2X_ROT(n,b,base)  	  	STXVD2X(n,b,base)
> diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> index bf8f34a..81e1305 100644
> --- a/arch/powerpc/kernel/tm.S
> +++ b/arch/powerpc/kernel/tm.S
> @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
>   * they will abort back to the checkpointed state we save out here.
>   *
>   * Call with IRQs off, stacks get all out of sync for some periods in here!
> + *
> + * Is called from C
>   */
>  _GLOBAL(tm_reclaim)
>    	  mfcr  	  r6
> @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
>    	  beq  	  dont_backup_vec
>
>    	  addi  	  r7, r3, THREAD_TRANSACT_VRSTATE
> -  	  SAVE_32VRS(0, r6, r7)  	  /* r6 scratch, r7 transact vr state */
> +  	  SAVE_32VRS(r6,r8,r9,r10,r11,r7)  	  /* r6,r8,r9,r10,r11 scratch, r7 transact vr state */

Line wrapping here.

>    	  mfvscr  	  v0
>    	  li  	  r6, VRSTATE_VSCR
>    	  stvx  	  v0, r7, r6
> @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
>    	  li  	  r5, VRSTATE_VSCR
>    	  lvx  	  v0, r8, r5
>    	  mtvscr  	  v0
> -  	  REST_32VRS(0, r5, r8)  	  	  	  /* r5 scratch, r8 ptr */
> +  	  REST_32VRS(r5,r6,r9,r10,r11,r8)  	  	  	  /* r5,r6,r9,r10,r11 scratch, r8 ptr */

wrapping here too


>  dont_restore_vec:
>    	  ld  	  r5, THREAD_VRSAVE(r3)
>    	  mtspr  	  SPRN_VRSAVE, r5
> diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> index 1c2e7a3..8d587fb 100644
> --- a/arch/powerpc/kernel/vector.S
> +++ b/arch/powerpc/kernel/vector.S
> @@ -13,6 +13,8 @@
>   * This is similar to load_up_altivec but for the transactional version of the
>   * vector regs.  It doesn't mess with the task MSR or valid flags.
>   * Furthermore, VEC laziness is not supported with TM currently.
> + *
> + * Is called from C
>   */
>  _GLOBAL(do_load_up_transact_altivec)
>    	  mfmsr  	  r6
> @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
>    	  lvx  	  v0,r10,r3
>    	  mtvscr  	  v0
>    	  addi  	  r10,r3,THREAD_TRANSACT_VRSTATE
> -  	  REST_32VRS(0,r4,r10)
> +  	  REST_32VRS(r4,r5,r6,r7,r8,r10)
>
>    	  blr
>  #endif
> @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
>  /*
>   * Load state from memory into VMX registers including VSCR.
>   * Assumes the caller has enabled VMX in the MSR.
> + *
> + * Is called from C
>   */
>  _GLOBAL(load_vr_state)
>    	  li  	  r4,VRSTATE_VSCR
>    	  lvx  	  v0,r4,r3
>    	  mtvscr  	  v0
> -  	  REST_32VRS(0,r4,r3)
> +  	  REST_32VRS(r4,r5,r6,r7,r8,r3)
>    	  blr
>
>  /*
>   * Store VMX state into memory, including VSCR.
>   * Assumes the caller has enabled VMX in the MSR.
> + *
> + * NOT called from C
>   */
>  _GLOBAL(store_vr_state)
> -  	  SAVE_32VRS(0, r4, r3)
> +  	  SAVE_32VRS(r4,r5,r6,r7,r8,r3)
>    	  mfvscr  	  v0
>    	  li  	  r4, VRSTATE_VSCR
>    	  stvx  	  v0, r4, r3
> @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
>   *
>   * Note that on 32-bit this can only use registers that will be
>   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> + *
> + * NOT called from C
>   */
>  _GLOBAL(load_up_altivec)
>    	  mfmsr  	  r5  	  	  	  /* grab the current MSR */
> @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
>    	  stw  	  r4,THREAD_USED_VR(r5)
>    	  lvx  	  v0,r10,r6
>    	  mtvscr  	  v0
> -  	  REST_32VRS(0,r4,r6)
> +  	  REST_32VRS(r3,r4,r5,r10,r11,r6)
>    	  /* restore registers and return */
>    	  blr
>
>  /*
>   * save_altivec(tsk)
>   * Save the vector registers to its thread_struct
> + *
> + * Is called from C
>   */
>  _GLOBAL(save_altivec)
>    	  addi  	  r3,r3,THREAD  	  	/* want THREAD of task */
> @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
>    	  PPC_LCMPI  	  0,r7,0
>    	  bne  	  2f
>    	  addi  	  r7,r3,THREAD_VRSTATE
> -2:  	  SAVE_32VRS(0,r4,r7)
> +2:  	  SAVE_32VRS(r4,r5,r6,r8,r9,r7)
>    	  mfvscr  	  v0
>    	  li  	  r4,VRSTATE_VSCR
>    	  stvx  	  v0,r4,r7

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX
  2016-03-01  5:55 ` [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX Cyril Bur
@ 2016-03-10  0:09   ` Michael Neuling
  2016-03-10  1:02     ` Cyril Bur
  0 siblings, 1 reply; 8+ messages in thread
From: Michael Neuling @ 2016-03-10  0:09 UTC (permalink / raw)
  To: Cyril Bur, linuxppc-dev; +Cc: anton

On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:

> Currently the assembly to save and restore VSX registers boils down to a
> load immediate of the offset of the specific VSX register in memory
> followed by the load/store which repeats in sequence for each VSX register.
>
> This patch attempts to do better by loading up four registers with
> immediates so that the loads and stores can be batched up and better
> pipelined by the processor.
>
> This patch results in four load/stores in sequence and one add between
> groups of four. Also, by using a pair of base registers it means that the
> result of the add is not needed by the following instruction.
>
> Due to the overlapping layout of FPU registers and VSX registers on POWER
> chips, this patch also benefits FPU loads and stores when VSX is compiled
> in and the CPU is VSX capable.

The extra use of registers scares the hell out of me. That being said
I've eyeballed each use and they look ok to me though.

What is the performance gain?  I think there's a trade off here
between code complexity/fragility versus what it gains us.  If it's a
tiny improvement, this may not be worth it.

> Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> ---
>  arch/powerpc/include/asm/ppc_asm.h | 65 ++++++++++++++++++++++++++++++--------
>  arch/powerpc/kernel/fpu.S          | 43 ++++++++++++++++---------
>  arch/powerpc/kernel/tm.S           | 46 ++++++++++++++-------------
>  3 files changed, 104 insertions(+), 50 deletions(-)
>=20
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> index 5ba69ed..dd0df12 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -173,19 +173,58 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  #define LXVD2X_ROT(n,b,base)  	  	LXVD2X(n,b,base);  	  \
>    	  	  	  	  	  XXSWAPD(n,n)
>  #endif
> -/* Save the lower 32 VSRs in the thread VSR region */
> -#define SAVE_VSR(n,b,base)  	  li b,16*(n);  STXVD2X_ROT(n,R##base,R##b)
> -#define SAVE_2VSRS(n,b,base)  	  SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base)
> -#define SAVE_4VSRS(n,b,base)  	  SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base)
> -#define SAVE_8VSRS(n,b,base)  	  SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base)
> -#define SAVE_16VSRS(n,b,base)  	  SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base)
> -#define SAVE_32VSRS(n,b,base)  	  SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base)
> -#define REST_VSR(n,b,base)  	  li b,16*(n); LXVD2X_ROT(n,R##base,R##b)
> -#define REST_2VSRS(n,b,base)  	  REST_VSR(n,b,base); REST_VSR(n+1,b,base)
> -#define REST_4VSRS(n,b,base)  	  REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base)
> -#define REST_8VSRS(n,b,base)  	  REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base)
> -#define REST_16VSRS(n,b,base)  	  REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base)
> -#define REST_32VSRS(n,b,base)  	  REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base)
> +
> +#define __SAVE_4VSRS(n,off0,off1,off2,off3,base) \
> +  	  STXVD2X_ROT(n,R##base,R##off0); \
> +  	  STXVD2X_ROT(n+1,R##base,R##off1); \
> +  	  STXVD2X_ROT(n+2,R##base,R##off2); \
> +  	  STXVD2X_ROT(n+3,R##base,R##off3)

same comment about off vs reg


> +/* Restores the base for the caller */
> +#define SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +  	  addi reg4,base,64; \
> +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> +  	  __SAVE_4VSRS(0,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VSRS(8,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VSRS(16,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __SAVE_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __SAVE_4VSRS(24,reg0,reg1,reg2,reg3,base); \
> +  	  __SAVE_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
> +  	  subi base,base,384
> +
> +#define __REST_4VSRS(n,off0,off1,off2,off3,base) \
> +  	  LXVD2X_ROT(n,R##base,R##off0); \
> +  	  LXVD2X_ROT(n+1,R##base,R##off1); \
> +  	  LXVD2X_ROT(n+2,R##base,R##off2); \
> +  	  LXVD2X_ROT(n+3,R##base,R##off3)
> +
> +/* Restores the base for the caller */
> +#define REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +  	  addi reg4,base,64; \
> +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> +  	  __REST_4VSRS(0,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VSRS(8,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VSRS(16,reg0,reg1,reg2,reg3,base); \
> +  	  addi base,base,128; \
> +  	  __REST_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
> +  	  addi reg4,reg4,128; \
> +  	  __REST_4VSRS(24,reg0,reg1,reg2,reg3,base); \
> +  	  __REST_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
> +  	  subi base,base,384
>
>  /*
>   * b = base register for addressing, o = base offset from register of 1st EVR
> diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
> index 15da2b5..dc57ff1 100644
> --- a/arch/powerpc/kernel/fpu.S
> +++ b/arch/powerpc/kernel/fpu.S
> @@ -26,29 +26,32 @@
>  #include <asm/ptrace.h>
>
>  #ifdef CONFIG_VSX
> -#define __REST_32FPVSRS(n,c,base)  	  	  	  	  	  \
> +#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  \
>  BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
>    	  b  	  2f;  	  	  	  	  	  	  	  \
>  END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> -  	  REST_32FPRS(n,base);  	  	  	  	  	  	\
> +  	  REST_32FPRS(0,base);  	  	  	  	  	  	\
>    	  b  	  3f;  	  	  	  	  	  	  	  \
> -2:  	  REST_32VSRS(n,c,base);  	  	  	  	  	  	\
> +2:  	  REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
>  3:
>
> -#define __SAVE_32FPVSRS(n,c,base)  	  	  	  	  	  \
> +#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
>  BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
>    	  b  	  2f;  	  	  	  	  	  	  	  \
>  END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> -  	  SAVE_32FPRS(n,base);  	  	  	  	  	  	\
> +  	  SAVE_32FPRS(0,base);  	  	  	  	  	  	\
>    	  b  	  3f;  	  	  	  	  	  	  	  \
> -2:  	  SAVE_32VSRS(n,c,base);  	  	  	  	  	  	\
> +2:  	  SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
>  3:
>  #else
> -#define __REST_32FPVSRS(n,b,base)  	  REST_32FPRS(n, base)
> -#define __SAVE_32FPVSRS(n,b,base)  	  SAVE_32FPRS(n, base)
> +#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  REST_32FPRS(0, base)
> +#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  SAVE_32FPRS(0, base)
>  #endif
> -#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
> -#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
> +#define REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +__REST_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> +
> +#define SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +__SAVE_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  /* void do_load_up_transact_fpu(struct thread_struct *thread)
> @@ -56,6 +59,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
>   * This is similar to load_up_fpu but for the transactional version of the FP
>   * register set.  It doesn't mess with the task MSR or valid flags.
>   * Furthermore, we don't do lazy FP with TM currently.
> + *
> + * Is called from C
>   */
>  _GLOBAL(do_load_up_transact_fpu)
>    	  mfmsr  	  r6
> @@ -71,7 +76,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>    	  addi  	  r7,r3,THREAD_TRANSACT_FPSTATE
>    	  lfd  	  fr0,FPSTATE_FPSCR(r7)
>    	  MTFSF_L(fr0)
> -  	  REST_32FPVSRS(0, R4, R7)
> +  	  REST_32FPVSRS(R4,R5,R6,R8,R9,R7)
>
>    	  blr
>  #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
> @@ -79,19 +84,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>  /*
>   * Load state from memory into FP registers including FPSCR.
>   * Assumes the caller has enabled FP in the MSR.
> + *
> + * Is called from C
>   */
>  _GLOBAL(load_fp_state)
>    	  lfd  	  fr0,FPSTATE_FPSCR(r3)
>    	  MTFSF_L(fr0)
> -  	  REST_32FPVSRS(0, R4, R3)
> +  	  REST_32FPVSRS(R4,R5,R6,R7,R8,R3)
>    	  blr
>
>  /*
>   * Store FP state into memory, including FPSCR
>   * Assumes the caller has enabled FP in the MSR.
> + *
> + * NOT called from C
>   */
>  _GLOBAL(store_fp_state)
> -  	  SAVE_32FPVSRS(0, R4, R3)
> +  	  SAVE_32FPVSRS(R4,R5,R6,R7,R8,R3)
>    	  mffs  	  fr0
>    	  stfd  	  fr0,FPSTATE_FPSCR(r3)
>    	  blr
> @@ -104,6 +113,8 @@ _GLOBAL(store_fp_state)
>   * enable the FPU for the current task and return to the task.
>   * Note that on 32-bit this can only use registers that will be
>   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> + *
> + * NOT called from C
>   */
>  _GLOBAL(load_up_fpu)
>    	  mfmsr  	  r5
> @@ -137,7 +148,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>    	  addi  	  r10,r5,THREAD_FPSTATE
>    	  lfd  	  fr0,FPSTATE_FPSCR(r10)
>    	  MTFSF_L(fr0)
> -  	  REST_32FPVSRS(0, R4, R10)
> +  	  REST_32FPVSRS(R3,R4,R5,R6,R11,R10)
>    	  /* restore registers and return */
>    	  /* we haven't used ctr or xer or lr */
>    	  blr
> @@ -146,6 +157,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>   * save_fpu(tsk)
>   * Save the floating-point registers in its thread_struct.
>   * Enables the FPU for use in the kernel on return.
> + *
> + * Is called from C
>   */
>  _GLOBAL(save_fpu)
>    	  addi  	  r3,r3,THREAD  	          /* want THREAD of task */
> @@ -154,7 +167,7 @@ _GLOBAL(save_fpu)
>    	  PPC_LCMPI  	  0,r6,0
>    	  bne  	  2f
>    	  addi  	  r6,r3,THREAD_FPSTATE
> -2:  	  SAVE_32FPVSRS(0, R4, R6)
> +2:  	  SAVE_32FPVSRS(R4,R5,R7,R8,R9,R6)
>    	  mffs  	  fr0
>    	  stfd  	  fr0,FPSTATE_FPSCR(r6)
>    	  blr
> diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> index 81e1305..61900b8 100644
> --- a/arch/powerpc/kernel/tm.S
> +++ b/arch/powerpc/kernel/tm.S
> @@ -14,30 +14,32 @@
>
>  #ifdef CONFIG_VSX
>  /* See fpu.S, this is borrowed from there */
> -#define __SAVE_32FPRS_VSRS(n,c,base)  	  	\
> -BEGIN_FTR_SECTION  	  	  	  	\
> -  	  b  	  2f;  	  	  	  	\
> -END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	\
> -  	  SAVE_32FPRS(n,base);  	  	  	  \
> -  	  b  	  3f;  	  	  	  	\
> -2:  	  SAVE_32VSRS(n,c,base);  	  	  	  \
> +#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> +  	  b  	  2f;  	  	  	  	  	  	  	  \
> +END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> +  	  SAVE_32FPRS(0,base);  	  	  	  	  	  	\
> +  	  b  	  3f;  	  	  	  	  	  	  	  \
> +2:  	  SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
>  3:
> -#define __REST_32FPRS_VSRS(n,c,base)  	  	\
> -BEGIN_FTR_SECTION  	  	  	  	\
> -  	  b  	  2f;  	  	  	  	\
> -END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	\
> -  	  REST_32FPRS(n,base);  	  	  	  \
> -  	  b  	  3f;  	  	  	  	\
> -2:  	  REST_32VSRS(n,c,base);  	  	  	  \
> +
> +#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  \
> +BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> +  	  b  	  2f;  	  	  	  	  	  	  	  \
> +END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> +  	  REST_32FPRS(0,base);  	  	  	  	  	  	\
> +  	  b  	  3f;  	  	  	  	  	  	  	  \
> +2:  	  REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
>  3:
> +
>  #else
> -#define __SAVE_32FPRS_VSRS(n,c,base)  	  SAVE_32FPRS(n, base)
> -#define __REST_32FPRS_VSRS(n,c,base)  	  REST_32FPRS(n, base)
> +#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  SAVE_32FPRS(0, base)
> +#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  REST_32FPRS(0, base)
>  #endif
> -#define SAVE_32FPRS_VSRS(n,c,base) \
> -  	  __SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
> -#define REST_32FPRS_VSRS(n,c,base) \
> -  	  __REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
> +#define SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +__SAVE_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> +#define REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> +__REST_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
>
>  /* Stack frame offsets for local variables. */
>  #define TM_FRAME_L0  	  TM_FRAME_SIZE-16
> @@ -165,7 +167,7 @@ dont_backup_vec:
>    	  beq  	  dont_backup_fp
> =20
>    	  addi  	  r7, r3, THREAD_TRANSACT_FPSTATE
> -  	  SAVE_32FPRS_VSRS(0, R6, R7)  	  /* r6 scratch, r7 transact fp state */
> +  	  SAVE_32FPRS_VSRS(R6,R8,R9,R10,R11,R7) /* r6,r8,r9,r10,r11 scratch, r7 transact fp state */
>
>    	  mffs    fr0
>    	  stfd    fr0,FPSTATE_FPSCR(r7)
> @@ -375,7 +377,7 @@ dont_restore_vec:
>    	  addi  	  r8, r3, THREAD_FPSTATE
>    	  lfd  	  fr0, FPSTATE_FPSCR(r8)
>    	  MTFSF_L(fr0)
> -  	  REST_32FPRS_VSRS(0, R4, R8)
> +  	  REST_32FPRS_VSRS(R4,R5,R6,R7,R9,R8)
>
>  dont_restore_fp:
>    	  mtmsr  	  r6  	  	  	  	/* FP/Vec off again! */

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
  2016-03-09 23:01 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Michael Neuling
@ 2016-03-10  0:56   ` Cyril Bur
  2016-03-10  5:37   ` Cyril Bur
  1 sibling, 0 replies; 8+ messages in thread
From: Cyril Bur @ 2016-03-10  0:56 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, anton

On Thu, 10 Mar 2016 10:01:07 +1100
Michael Neuling <mikey@neuling.org> wrote:

> On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> 
> > Currently the assembly to save and restore Altivec registers boils down to
> > a load immediate of the offset of the specific Altivec register in memory
> > followed by the load/store which repeats in sequence for each Altivec
> > register.
> > 
> > This patch attempts to do better by loading up four registers with
> > immediates so that the loads and stores can be batched up and better
> > pipelined by the processor.
> > 
> > This patch results in four load/stores in sequence and one add between
> > groups of four. Also, by using a pair of base registers it means that the
> > result of the add is not needed by the following instruction.  
> 

Hi Mikey,

Thanks for the review.

> What's the performance improvement?

Well the CPU instruction traces should look much nicer - this is something I'll
look at getting with this series applied. Currently the CPU isn't thrilled
about the order we're telling it to do things.

Good looking CPU execution traces are nice but we live in the real world.
Before I wrote this patch I thought I'd microbenchmark it in userspace;
unfortunately I failed to get a reliably observable performance gain.

I'll run it through my battery of context switch performance scripts I used for
my other series (which have been developed since the userspace microbenchmark)
to see what happens, but I have a hunch any gains here will be overshadowed by
something slower.
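
To give an idea of the kind of userspace microbenchmark I mean (a rough,
hypothetical sketch only - it needs another runnable task pinned to the same
CPU so the yields actually context switch):

/* gcc -O2 -maltivec vmx-switch.c -o vmx-switch */
#include <altivec.h>
#include <sched.h>
#include <stdio.h>
#include <time.h>

#define ITERS	1000000L

int main(void)
{
	vector unsigned int v = {1, 2, 3, 4};
	struct timespec a, b;
	long i, ns;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < ITERS; i++) {
		v = vec_add(v, v);	/* keep the vector unit live */
		sched_yield();		/* let the scheduler switch us out */
	}
	clock_gettime(CLOCK_MONOTONIC, &b);

	ns = (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec);
	/* use v in the output so the loop isn't optimised away */
	printf("%ld ns/iter, v[0]=%u\n", ns / ITERS, vec_extract(v, 0));
	return 0;
}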

> 
> > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > ---
> > These patches need to be applied on top of my rework of FPU/VMX/VSX
> > switching: https://patchwork.ozlabs.org/patch/589703/
> > 
> > I left in some of my comments indicating if functions are called from C or
> > not. Looking at them now, they might be a bit much, let me know what you
> > think.  
> 
> I think that's ok, although they are likely to get stale quickly.
> 
> > 
> > Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
> > 
> > 
> >  arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
> >  arch/powerpc/kernel/tm.S           |  6 ++--
> >  arch/powerpc/kernel/vector.S       | 20 +++++++++---
> >  3 files changed, 70 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> > index 499d9f8..5ba69ed 100644
> > --- a/arch/powerpc/include/asm/ppc_asm.h
> > +++ b/arch/powerpc/include/asm/ppc_asm.h
> > @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #define REST_16FPRS(n, base)  	  REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> >  #define REST_32FPRS(n, base)  	  REST_16FPRS(n, base); REST_16FPRS(n+16, base)
> >  
> > -#define SAVE_VR(n,b,base)  	  li b,16*(n);  stvx n,base,b
> > -#define SAVE_2VRS(n,b,base)  	  SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> > -#define SAVE_4VRS(n,b,base)  	  SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
> > -#define SAVE_8VRS(n,b,base)  	  SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
> > -#define SAVE_16VRS(n,b,base)  	  SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
> > -#define SAVE_32VRS(n,b,base)  	  SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
> > -#define REST_VR(n,b,base)  	  li b,16*(n); lvx n,base,b
> > -#define REST_2VRS(n,b,base)  	  REST_VR(n,b,base); REST_VR(n+1,b,base)
> > -#define REST_4VRS(n,b,base)  	  REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
> > -#define REST_8VRS(n,b,base)  	  REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
> > -#define REST_16VRS(n,b,base)  	  REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
> > -#define REST_32VRS(n,b,base)  	  REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)  
> 
> Can you use consistent names between off and reg in the below?
> 

Sure.

> > +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> > +  	  stvx n,base,off0; \
> > +  	  stvx n+1,base,off1; \
> > +  	  stvx n+2,base,off2; \
> > +  	  stvx n+3,base,off3
> > +
> > +/* Restores the base for the caller */  
> 
> Can you make this:
> /* Base: non-volatile, reg[0-4]: volatile */
> 

Ah yes

> > +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384  
> 
> You can swap these last two lines which will make base reuse quicker
> later.  Although that might not be needed.
> 

Can't hurt! I'm not sure the base gets reused that fast in most paths but hey,
we're in the business of optimising here.

> > +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> > +  	  lvx n,base,off0; \
> > +  	  lvx n+1,base,off1; \
> > +  	  lvx n+2,base,off2; \
> > +  	  lvx n+3,base,off3
> > +
> > +/* Restores the base for the caller */
> > +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384
> >  
> >  #ifdef __BIG_ENDIAN__
> >  #define STXVD2X_ROT(n,b,base)  	  	STXVD2X(n,b,base)
> > diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> > index bf8f34a..81e1305 100644
> > --- a/arch/powerpc/kernel/tm.S
> > +++ b/arch/powerpc/kernel/tm.S
> > @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
> >   * they will abort back to the checkpointed state we save out here.
> >   *
> >   * Call with IRQs off, stacks get all out of sync for some periods in here!
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(tm_reclaim)
> >    	  mfcr  	  r6
> > @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
> >    	  beq  	  dont_backup_vec
> >  
> >    	  addi  	  r7, r3, THREAD_TRANSACT_VRSTATE
> > -  	  SAVE_32VRS(0, r6, r7)  	  /* r6 scratch, r7 transact vr state */
> > +  	  SAVE_32VRS(r6,r8,r9,r10,r11,r7)  	  /* r6,r8,r9,r10,r11 scratch, r7 transact vr state */  
> 
> Line wrapping here.
> 

Thanks

> >    	  mfvscr  	  v0
> >    	  li  	  r6, VRSTATE_VSCR
> >    	  stvx  	  v0, r7, r6
> > @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
> >    	  li  	  r5, VRSTATE_VSCR
> >    	  lvx  	  v0, r8, r5
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0, r5, r8)  	  	  	  /* r5 scratch, r8 ptr */
> > +  	  REST_32VRS(r5,r6,r9,r10,r11,r8)  	  	  	  /* r5,r6,r9,r10,r11 scratch, r8 ptr */  
> 
> wrapping here too
> 

Thanks

> 
> >  dont_restore_vec:
> >    	  ld  	  r5, THREAD_VRSAVE(r3)
> >    	  mtspr  	  SPRN_VRSAVE, r5
> > diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> > index 1c2e7a3..8d587fb 100644
> > --- a/arch/powerpc/kernel/vector.S
> > +++ b/arch/powerpc/kernel/vector.S
> > @@ -13,6 +13,8 @@
> >   * This is similar to load_up_altivec but for the transactional version of the
> >   * vector regs.  It doesn't mess with the task MSR or valid flags.
> >   * Furthermore, VEC laziness is not supported with TM currently.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(do_load_up_transact_altivec)
> >    	  mfmsr  	  r6
> > @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
> >    	  lvx  	  v0,r10,r3
> >    	  mtvscr  	  v0
> >    	  addi  	  r10,r3,THREAD_TRANSACT_VRSTATE
> > -  	  REST_32VRS(0,r4,r10)
> > +  	  REST_32VRS(r4,r5,r6,r7,r8,r10)
> >  
> >    	  blr
> >  #endif
> > @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
> >  /*
> >   * Load state from memory into VMX registers including VSCR.
> >   * Assumes the caller has enabled VMX in the MSR.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(load_vr_state)
> >    	  li  	  r4,VRSTATE_VSCR
> >    	  lvx  	  v0,r4,r3
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0,r4,r3)
> > +  	  REST_32VRS(r4,r5,r6,r7,r8,r3)
> >    	  blr
> >  
> >  /*
> >   * Store VMX state into memory, including VSCR.
> >   * Assumes the caller has enabled VMX in the MSR.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(store_vr_state)
> > -  	  SAVE_32VRS(0, r4, r3)
> > +  	  SAVE_32VRS(r4,r5,r6,r7,r8,r3)
> >    	  mfvscr  	  v0
> >    	  li  	  r4, VRSTATE_VSCR
> >    	  stvx  	  v0, r4, r3
> > @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
> >   *
> >   * Note that on 32-bit this can only use registers that will be
> >   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(load_up_altivec)
> >    	  mfmsr  	  r5  	  	  	  /* grab the current MSR */
> > @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
> >    	  stw  	  r4,THREAD_USED_VR(r5)
> >    	  lvx  	  v0,r10,r6
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0,r4,r6)
> > +  	  REST_32VRS(r3,r4,r5,r10,r11,r6)
> >    	  /* restore registers and return */
> >    	  blr
> >  
> >  /*
> >   * save_altivec(tsk)
> >   * Save the vector registers to its thread_struct
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(save_altivec)
> >    	  addi  	  r3,r3,THREAD  	  	/* want THREAD of task */
> > @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
> >    	  PPC_LCMPI  	  0,r7,0
> >    	  bne  	  2f
> >    	  addi  	  r7,r3,THREAD_VRSTATE
> > -2:  	  SAVE_32VRS(0,r4,r7)
> > +2:  	  SAVE_32VRS(r4,r5,r6,r8,r9,r7)
> >    	  mfvscr  	  v0
> >    	  li  	  r4,VRSTATE_VSCR
> >    	  stvx  	  v0,r4,r7  

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX
  2016-03-10  0:09   ` Michael Neuling
@ 2016-03-10  1:02     ` Cyril Bur
  0 siblings, 0 replies; 8+ messages in thread
From: Cyril Bur @ 2016-03-10  1:02 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, anton

On Thu, 10 Mar 2016 11:09:32 +1100
Michael Neuling <mikey@neuling.org> wrote:

> On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> 
> > Currently the assembly to save and restore VSX registers boils down to a
> > load immediate of the offset of the specific VSX register in memory
> > followed by the load/store which repeats in sequence for each VSX register.
> > 
> > This patch attempts to do better by loading up four registers with
> > immediates so that the loads and stores can be batched up and better
> > pipelined by the processor.
> > 
> > This patch results in four load/stores in sequence and one add between
> > groups of four. Also, by using a pair of base registers it means that the
> > result of the add is not needed by the following instruction.
> > 
> > Due to the overlapping layout of FPU registers and VSX registers on POWER
> > chips, this patch also benefits FPU loads and stores when VSX is compiled
> > in and the CPU is VSX capable.  
> 
> The extra use of registers scares the hell out of me. That being said
> I've eyeballed each use and they look ok to me though.
> 
> What is the performance gain?  I think there's a trade off here
> between code complexity/fragility versus what it gains us.  If it's a
> tiny improvement, this may not be worth it.

My suspicion is that gains will be overshadowed by something even slower. We
can always shelve this patch and dust it off when we've got nothing else to
speed up...

> 
> > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > ---
> >  arch/powerpc/include/asm/ppc_asm.h | 65 ++++++++++++++++++++++++++++++--------
> >  arch/powerpc/kernel/fpu.S          | 43 ++++++++++++++++---------
> >  arch/powerpc/kernel/tm.S           | 46 ++++++++++++++-------------
> >  3 files changed, 104 insertions(+), 50 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> > index 5ba69ed..dd0df12 100644
> > --- a/arch/powerpc/include/asm/ppc_asm.h
> > +++ b/arch/powerpc/include/asm/ppc_asm.h
> > @@ -173,19 +173,58 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #define LXVD2X_ROT(n,b,base)  	  	LXVD2X(n,b,base);  	  \
> >    	  	  	  	  	  XXSWAPD(n,n)
> >  #endif
> > -/* Save the lower 32 VSRs in the thread VSR region */
> > -#define SAVE_VSR(n,b,base)  	  li b,16*(n);  STXVD2X_ROT(n,R##base,R##b)
> > -#define SAVE_2VSRS(n,b,base)  	  SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base)
> > -#define SAVE_4VSRS(n,b,base)  	  SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base)
> > -#define SAVE_8VSRS(n,b,base)  	  SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base)
> > -#define SAVE_16VSRS(n,b,base)  	  SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base)
> > -#define SAVE_32VSRS(n,b,base)  	  SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base)
> > -#define REST_VSR(n,b,base)  	  li b,16*(n); LXVD2X_ROT(n,R##base,R##b)
> > -#define REST_2VSRS(n,b,base)  	  REST_VSR(n,b,base); REST_VSR(n+1,b,base)
> > -#define REST_4VSRS(n,b,base)  	  REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base)
> > -#define REST_8VSRS(n,b,base)  	  REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base)
> > -#define REST_16VSRS(n,b,base)  	  REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base)
> > -#define REST_32VSRS(n,b,base)  	  REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base)
> > +
> > +#define __SAVE_4VSRS(n,off0,off1,off2,off3,base) \
> > +  	  STXVD2X_ROT(n,R##base,R##off0); \
> > +  	  STXVD2X_ROT(n+1,R##base,R##off1); \
> > +  	  STXVD2X_ROT(n+2,R##base,R##off2); \
> > +  	  STXVD2X_ROT(n+3,R##base,R##off3)  
> 
> same comment about off vs reg 
> 
> 
> > +/* Restores the base for the caller */
> > +#define SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __SAVE_4VSRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VSRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VSRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VSRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __SAVE_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384
> > +
> > +#define __REST_4VSRS(n,off0,off1,off2,off3,base) \
> > +  	  LXVD2X_ROT(n,R##base,R##off0); \
> > +  	  LXVD2X_ROT(n+1,R##base,R##off1); \
> > +  	  LXVD2X_ROT(n+2,R##base,R##off2); \
> > +  	  LXVD2X_ROT(n+3,R##base,R##off3)
> > +
> > +/* Restores the base for the caller */
> > +#define REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __REST_4VSRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VSRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VSRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VSRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __REST_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384
> >  
> >  /*
> >   * b = base register for addressing, o = base offset from register of 1st EVR
> > diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
> > index 15da2b5..dc57ff1 100644
> > --- a/arch/powerpc/kernel/fpu.S
> > +++ b/arch/powerpc/kernel/fpu.S
> > @@ -26,29 +26,32 @@
> >  #include <asm/ptrace.h>
> >  
> >  #ifdef CONFIG_VSX
> > -#define __REST_32FPVSRS(n,c,base)  	  	  	  	  	  \
> > +#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  \
> >  BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> >    	  b  	  2f;  	  	  	  	  	  	  	  \
> >  END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> > -  	  REST_32FPRS(n,base);  	  	  	  	  	  	  \
> > +  	  REST_32FPRS(0,base);  	  	  	  	  	  	  \
> >    	  b  	  3f;  	  	  	  	  	  	  	  \
> > -2:  	  REST_32VSRS(n,c,base);  	  	  	  	  	  	  \
> > +2:  	  REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
> >  3:
> >  
> > -#define __SAVE_32FPVSRS(n,c,base)  	  	  	  	  	  \
> > +#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
> >  BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> >    	  b  	  2f;  	  	  	  	  	  	  	  \
> >  END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> > -  	  SAVE_32FPRS(n,base);  	  	  	  	  	  	  \
> > +  	  SAVE_32FPRS(0,base);  	  	  	  	  	  	  \
> >    	  b  	  3f;  	  	  	  	  	  	  	  \
> > -2:  	  SAVE_32VSRS(n,c,base);  	  	  	  	  	  	  \
> > +2:  	  SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
> >  3:
> >  #else
> > -#define __REST_32FPVSRS(n,b,base)  	  REST_32FPRS(n, base)
> > -#define __SAVE_32FPVSRS(n,b,base)  	  SAVE_32FPRS(n, base)
> > +#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  REST_32FPRS(0, base)
> > +#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)  	  SAVE_32FPRS(0, base)
> >  #endif
> > -#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
> > -#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
> > +#define REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +__REST_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> > +
> > +#define SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +__SAVE_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> >  
> >  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> >  /* void do_load_up_transact_fpu(struct thread_struct *thread)
> > @@ -56,6 +59,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> >   * This is similar to load_up_fpu but for the transactional version of the FP
> >   * register set.  It doesn't mess with the task MSR or valid flags.
> >   * Furthermore, we don't do lazy FP with TM currently.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(do_load_up_transact_fpu)
> >    	  mfmsr  	  r6
> > @@ -71,7 +76,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
> >    	  addi  	  r7,r3,THREAD_TRANSACT_FPSTATE
> >    	  lfd  	  fr0,FPSTATE_FPSCR(r7)
> >    	  MTFSF_L(fr0)
> > -  	  REST_32FPVSRS(0, R4, R7)
> > +  	  REST_32FPVSRS(R4,R5,R6,R8,R9,R7)
> >  
> >    	  blr
> >  #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
> > @@ -79,19 +84,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
> >  /*
> >   * Load state from memory into FP registers including FPSCR.
> >   * Assumes the caller has enabled FP in the MSR.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(load_fp_state)
> >    	  lfd  	  fr0,FPSTATE_FPSCR(r3)
> >    	  MTFSF_L(fr0)
> > -  	  REST_32FPVSRS(0, R4, R3)
> > +  	  REST_32FPVSRS(R4,R5,R6,R7,R8,R3)
> >    	  blr
> >  
> >  /*
> >   * Store FP state into memory, including FPSCR
> >   * Assumes the caller has enabled FP in the MSR.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(store_fp_state)
> > -  	  SAVE_32FPVSRS(0, R4, R3)
> > +  	  SAVE_32FPVSRS(R4,R5,R6,R7,R8,R3)
> >    	  mffs  	  fr0
> >    	  stfd  	  fr0,FPSTATE_FPSCR(r3)
> >    	  blr
> > @@ -104,6 +113,8 @@ _GLOBAL(store_fp_state)
> >   * enable the FPU for the current task and return to the task.
> >   * Note that on 32-bit this can only use registers that will be
> >   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(load_up_fpu)
> >    	  mfmsr  	  r5
> > @@ -137,7 +148,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
> >    	  addi  	  r10,r5,THREAD_FPSTATE
> >    	  lfd  	  fr0,FPSTATE_FPSCR(r10)
> >    	  MTFSF_L(fr0)
> > -  	  REST_32FPVSRS(0, R4, R10)
> > +  	  REST_32FPVSRS(R3,R4,R5,R6,R11,R10)
> >    	  /* restore registers and return */
> >    	  /* we haven't used ctr or xer or lr */
> >    	  blr
> > @@ -146,6 +157,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
> >   * save_fpu(tsk)
> >   * Save the floating-point registers in its thread_struct.
> >   * Enables the FPU for use in the kernel on return.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(save_fpu)
> >    	  addi  	  r3,r3,THREAD  	          /* want THREAD of task */
> > @@ -154,7 +167,7 @@ _GLOBAL(save_fpu)
> >    	  PPC_LCMPI  	  0,r6,0
> >    	  bne  	  2f
> >    	  addi  	  r6,r3,THREAD_FPSTATE
> > -2:  	  SAVE_32FPVSRS(0, R4, R6)
> > +2:  	  SAVE_32FPVSRS(R4,R5,R7,R8,R9,R6)
> >    	  mffs  	  fr0
> >    	  stfd  	  fr0,FPSTATE_FPSCR(r6)
> >    	  blr
> > diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> > index 81e1305..61900b8 100644
> > --- a/arch/powerpc/kernel/tm.S
> > +++ b/arch/powerpc/kernel/tm.S
> > @@ -14,30 +14,32 @@
> >  
> >  #ifdef CONFIG_VSX
> >  /* See fpu.S, this is borrowed from there */
> > -#define __SAVE_32FPRS_VSRS(n,c,base)  	  	  \
> > -BEGIN_FTR_SECTION  	  	  	  	  \
> > -  	  b  	  2f;  	  	  	  	  \
> > -END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  \
> > -  	  SAVE_32FPRS(n,base);  	  	  	  \
> > -  	  b  	  3f;  	  	  	  	  \
> > -2:  	  SAVE_32VSRS(n,c,base);  	  	  	  \
> > +#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> > +  	  b  	  2f;  	  	  	  	  	  	  	  \
> > +END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> > +  	  SAVE_32FPRS(0,base);  	  	  	  	  	  	  \
> > +  	  b  	  3f;  	  	  	  	  	  	  	  \
> > +2:  	  SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
> >  3:
> > -#define __REST_32FPRS_VSRS(n,c,base)  	  	  \
> > -BEGIN_FTR_SECTION  	  	  	  	  \
> > -  	  b  	  2f;  	  	  	  	  \
> > -END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  \
> > -  	  REST_32FPRS(n,base);  	  	  	  \
> > -  	  b  	  3f;  	  	  	  	  \
> > -2:  	  REST_32VSRS(n,c,base);  	  	  	  \
> > +
> > +#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  \
> > +BEGIN_FTR_SECTION  	  	  	  	  	  	  	  \
> > +  	  b  	  2f;  	  	  	  	  	  	  	  \
> > +END_FTR_SECTION_IFSET(CPU_FTR_VSX);  	  	  	  	  	  \
> > +  	  REST_32FPRS(0,base);  	  	  	  	  	  	  \
> > +  	  b  	  3f;  	  	  	  	  	  	  	  \
> > +2:  	  REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
> >  3:
> > +
> >  #else
> > -#define __SAVE_32FPRS_VSRS(n,c,base)  	  SAVE_32FPRS(n, base)
> > -#define __REST_32FPRS_VSRS(n,c,base)  	  REST_32FPRS(n, base)
> > +#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  SAVE_32FPRS(0, base)
> > +#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)  	  REST_32FPRS(0, base)
> >  #endif
> > -#define SAVE_32FPRS_VSRS(n,c,base) \
> > -  	  __SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
> > -#define REST_32FPRS_VSRS(n,c,base) \
> > -  	  __REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
> > +#define SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +__SAVE_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> > +#define REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +__REST_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
> >  
> >  /* Stack frame offsets for local variables. */
> >  #define TM_FRAME_L0  	  TM_FRAME_SIZE-16
> > @@ -165,7 +167,7 @@ dont_backup_vec:
> >    	  beq  	  dont_backup_fp
> >  
> >    	  addi  	  r7, r3, THREAD_TRANSACT_FPSTATE
> > -  	  SAVE_32FPRS_VSRS(0, R6, R7)  	  /* r6 scratch, r7 transact fp state */
> > +  	  SAVE_32FPRS_VSRS(R6,R8,R9,R10,R11,R7) /* r6,r8,r9,r10,r11 scratch, r7 transact fp state */
> >  
> >    	  mffs    fr0
> >    	  stfd    fr0,FPSTATE_FPSCR(r7)
> > @@ -375,7 +377,7 @@ dont_restore_vec:
> >    	  addi  	  r8, r3, THREAD_FPSTATE
> >    	  lfd  	  fr0, FPSTATE_FPSCR(r8)
> >    	  MTFSF_L(fr0)
> > -  	  REST_32FPRS_VSRS(0, R4, R8)
> > +  	  REST_32FPRS_VSRS(R4,R5,R6,R7,R9,R8)
> >  
> >  dont_restore_fp:
> >    	  mtmsr  	  r6  	  	  	  	  /* FP/Vec off again! */


* Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
  2016-03-09 23:01 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Michael Neuling
  2016-03-10  0:56   ` Cyril Bur
@ 2016-03-10  5:37   ` Cyril Bur
  2016-03-11  4:25     ` Cyril Bur
  1 sibling, 1 reply; 8+ messages in thread
From: Cyril Bur @ 2016-03-10  5:37 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, anton

On Thu, 10 Mar 2016 10:01:07 +1100
Michael Neuling <mikey@neuling.org> wrote:

> On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> 
> > Currently the assembly to save and restore Altivec registers boils down to
> > a load immediate of the offset of the specific Altivec register in memory
> > followed by the load/store which repeats in sequence for each Altivec
> > register.
> > 
> > This patch attempts to do better by loading up four registers with
> > immediates so that the loads and stores can be batched up and better
> > pipelined by the processor.
> > 
> > This patch results in four load/stores in sequence and one add between
> > groups of four. Also, by using a pair of base registers it means that the
> > result of the add is not needed by the following instruction.  
> 
> What is the performance improvement?

So I have some numbers. This is the same context switch benchmark that was used
for my other series, a modified version of:
http://www.ozlabs.org/~anton/junkcode/context_switch2.c

We pingpong across a pipe and touch FP/VMX/VSX or some combination of them.
The numbers are pingpongs per second; the tests run for 52 seconds, with the
first two seconds of results ignored to allow the test to stabilise.
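
For reference, the core of the benchmark is roughly the sketch below. This is
a cut-down illustration rather than the actual context_switch2.c code: the
touch_fpu() helper and the pipe names are made up here, and the real benchmark
also handles CPU pinning and per-second sampling, which are omitted.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>

static void touch_fpu(void)
{
	volatile double d = 1.0;	/* force a live FP context */
	d *= 2.0;
}

int main(void)
{
	int ab[2], ba[2];	/* parent->child and child->parent pipes */
	char c = 0;
	unsigned long iters = 0;
	time_t end;

	if (pipe(ab) || pipe(ba))
		exit(1);

	if (fork() == 0) {	/* child: touch the facility, echo the byte back */
		while (read(ab[0], &c, 1) == 1) {
			touch_fpu();
			write(ba[1], &c, 1);
		}
		exit(0);
	}

	end = time(NULL) + 50;	/* measurement window after warm-up */
	while (time(NULL) < end) {
		touch_fpu();
		write(ab[1], &c, 1);	/* one byte each way per pingpong */
		read(ba[0], &c, 1);
		iters++;
	}
	printf("%lu pingpongs/sec\n", iters / 50);
	return 0;
}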

Run 1
Touched Facility  Average  Stddev   Speedup   %Speedup  Speedup/Stddev
None              1845984  8261.28  15017.32  100.8201  1.817793
FPU               1639296  3966.94  54770.04  103.4565  13.80660
FPU + VEC         1555836  3708.59  34533.72  102.2700  9.311813
FPU + VSX         1523202  78984.7  -362.64   99.97619  -0.00459
VEC               1631529  23665.5  -11818.0  99.28085  -0.49937
VEC + VSX         1543007  24614.0  32330.52  102.1401  1.313500
VSX               1554007  28723.0  40296.56  102.6621  1.402932
FPU + VEC + VSX   1546072  17201.1  41687.28  102.7710  2.423519

Run 2
Touched Facility  Average  Stddev   Speedup  %Speedup  Speedup/Stddev
None              1837869  30263.4  -7780.6  99.57843  -0.25709
FPU               1587883  70260.6  -23927   98.51546  -0.34055
FPU + VEC         1552703  13563.6  37243.4  102.4575  2.745831
FPU + VSX         1558519  13706.7  32365.9  102.1207  2.361308
VEC               1651599  1388.83  13474.3  100.8225  9.701918
VEC + VSX         1552319  1752.77  42443.2  102.8110  24.21487
VSX               1559806  7891.66  55542.5  103.6923  7.038124
FPU + VEC + VSX   1549330  22148.1  29010.8  101.9082  1.309849


I can't help but notice these are noisy. These were run on KVM on a fairly
busy box. I wonder if the numbers smooth out on an otherwise idle machine. It
doesn't look super consistent across two runs.

> 
> > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > ---
> > These patches need to be applied on top of my rework of FPU/VMX/VSX
> > switching: https://patchwork.ozlabs.org/patch/589703/
> > 
> > I left in some of my comments indicating if functions are called from C or
> > not. Looking at them now, they might be a bit much, let me know what you
> > think.  
> 
> I think that's ok, although they are likely to get stale quickly.
> 
> > 
> > Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
> > 
> > 
> >  arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
> >  arch/powerpc/kernel/tm.S           |  6 ++--
> >  arch/powerpc/kernel/vector.S       | 20 +++++++++---
> >  3 files changed, 70 insertions(+), 19 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> > index 499d9f8..5ba69ed 100644
> > --- a/arch/powerpc/include/asm/ppc_asm.h
> > +++ b/arch/powerpc/include/asm/ppc_asm.h
> > @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #define REST_16FPRS(n, base)  	  REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> >  #define REST_32FPRS(n, base)  	  REST_16FPRS(n, base); REST_16FPRS(n+16, base)
> >  
> > -#define SAVE_VR(n,b,base)  	  li b,16*(n);  stvx n,base,b
> > -#define SAVE_2VRS(n,b,base)  	  SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> > -#define SAVE_4VRS(n,b,base)  	  SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
> > -#define SAVE_8VRS(n,b,base)  	  SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
> > -#define SAVE_16VRS(n,b,base)  	  SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
> > -#define SAVE_32VRS(n,b,base)  	  SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
> > -#define REST_VR(n,b,base)  	  li b,16*(n); lvx n,base,b
> > -#define REST_2VRS(n,b,base)  	  REST_VR(n,b,base); REST_VR(n+1,b,base)
> > -#define REST_4VRS(n,b,base)  	  REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
> > -#define REST_8VRS(n,b,base)  	  REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
> > -#define REST_16VRS(n,b,base)  	  REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
> > -#define REST_32VRS(n,b,base)  	  REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)  
> 
> Can you use consistent names between off and reg in the below?
> 
> > +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> > +  	  stvx n,base,off0; \
> > +  	  stvx n+1,base,off1; \
> > +  	  stvx n+2,base,off2; \
> > +  	  stvx n+3,base,off3
> > +
> > +/* Restores the base for the caller */  
> 
> Can you make this:
> /* Base: non-volatile, reg[0-4]: volatile */
> 
> > +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384  
> 
> You can swap these last two lines which will make base reuse quicker
> later.  Although that might not be needed.
> 
> > +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> > +  	  lvx n,base,off0; \
> > +  	  lvx n+1,base,off1; \
> > +  	  lvx n+2,base,off2; \
> > +  	  lvx n+3,base,off3
> > +
> > +/* Restores the base for the caller */
> > +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > +  	  addi reg4,base,64; \
> > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > +  	  __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > +  	  addi base,base,128; \
> > +  	  __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > +  	  addi reg4,reg4,128; \
> > +  	  __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > +  	  __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > +  	  subi base,base,384
> >  
> >  #ifdef __BIG_ENDIAN__
> >  #define STXVD2X_ROT(n,b,base)  	  	  STXVD2X(n,b,base)
> > diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> > index bf8f34a..81e1305 100644
> > --- a/arch/powerpc/kernel/tm.S
> > +++ b/arch/powerpc/kernel/tm.S
> > @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
> >   * they will abort back to the checkpointed state we save out here.
> >   *
> >   * Call with IRQs off, stacks get all out of sync for some periods in here!
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(tm_reclaim)
> >    	  mfcr  	  r6
> > @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
> >    	  beq  	  dont_backup_vec
> >  
> >    	  addi  	  r7, r3, THREAD_TRANSACT_VRSTATE
> > -  	  SAVE_32VRS(0, r6, r7)  	  /* r6 scratch, r7 transact vr state */
> > +  	  SAVE_32VRS(r6,r8,r9,r10,r11,r7)  	  /* r6,r8,r9,r10,r11 scratch, r7 transact vr state */  
> 
> Line wrapping here.
> 
> >    	  mfvscr  	  v0
> >    	  li  	  r6, VRSTATE_VSCR
> >    	  stvx  	  v0, r7, r6
> > @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
> >    	  li  	  r5, VRSTATE_VSCR
> >    	  lvx  	  v0, r8, r5
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0, r5, r8)  	  	  	  /* r5 scratch, r8 ptr */
> > +  	  REST_32VRS(r5,r6,r9,r10,r11,r8)  	  	  	  /* r5,r6,r9,r10,r11 scratch, r8 ptr */  
> 
> wrapping here too
> 
> 
> >  dont_restore_vec:
> >    	  ld  	  r5, THREAD_VRSAVE(r3)
> >    	  mtspr  	  SPRN_VRSAVE, r5
> > diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> > index 1c2e7a3..8d587fb 100644
> > --- a/arch/powerpc/kernel/vector.S
> > +++ b/arch/powerpc/kernel/vector.S
> > @@ -13,6 +13,8 @@
> >   * This is similar to load_up_altivec but for the transactional version of the
> >   * vector regs.  It doesn't mess with the task MSR or valid flags.
> >   * Furthermore, VEC laziness is not supported with TM currently.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(do_load_up_transact_altivec)
> >    	  mfmsr  	  r6
> > @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
> >    	  lvx  	  v0,r10,r3
> >    	  mtvscr  	  v0
> >    	  addi  	  r10,r3,THREAD_TRANSACT_VRSTATE
> > -  	  REST_32VRS(0,r4,r10)
> > +  	  REST_32VRS(r4,r5,r6,r7,r8,r10)
> >  
> >    	  blr
> >  #endif
> > @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
> >  /*
> >   * Load state from memory into VMX registers including VSCR.
> >   * Assumes the caller has enabled VMX in the MSR.
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(load_vr_state)
> >    	  li  	  r4,VRSTATE_VSCR
> >    	  lvx  	  v0,r4,r3
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0,r4,r3)
> > +  	  REST_32VRS(r4,r5,r6,r7,r8,r3)
> >    	  blr
> >  
> >  /*
> >   * Store VMX state into memory, including VSCR.
> >   * Assumes the caller has enabled VMX in the MSR.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(store_vr_state)
> > -  	  SAVE_32VRS(0, r4, r3)
> > +  	  SAVE_32VRS(r4,r5,r6,r7,r8,r3)
> >    	  mfvscr  	  v0
> >    	  li  	  r4, VRSTATE_VSCR
> >    	  stvx  	  v0, r4, r3
> > @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
> >   *
> >   * Note that on 32-bit this can only use registers that will be
> >   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> > + *
> > + * NOT called from C
> >   */
> >  _GLOBAL(load_up_altivec)
> >    	  mfmsr  	  r5  	  	  	  /* grab the current MSR */
> > @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
> >    	  stw  	  r4,THREAD_USED_VR(r5)
> >    	  lvx  	  v0,r10,r6
> >    	  mtvscr  	  v0
> > -  	  REST_32VRS(0,r4,r6)
> > +  	  REST_32VRS(r3,r4,r5,r10,r11,r6)
> >    	  /* restore registers and return */
> >    	  blr
> >  
> >  /*
> >   * save_altivec(tsk)
> >   * Save the vector registers to its thread_struct
> > + *
> > + * Is called from C
> >   */
> >  _GLOBAL(save_altivec)
> >    	  addi  	  r3,r3,THREAD  	  	  /* want THREAD of task */
> > @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
> >    	  PPC_LCMPI  	  0,r7,0
> >    	  bne  	  2f
> >    	  addi  	  r7,r3,THREAD_VRSTATE
> > -2:  	  SAVE_32VRS(0,r4,r7)
> > +2:  	  SAVE_32VRS(r4,r5,r6,r8,r9,r7)
> >    	  mfvscr  	  v0
> >    	  li  	  r4,VRSTATE_VSCR
> >    	  stvx  	  v0,r4,r7  


* Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
  2016-03-10  5:37   ` Cyril Bur
@ 2016-03-11  4:25     ` Cyril Bur
  0 siblings, 0 replies; 8+ messages in thread
From: Cyril Bur @ 2016-03-11  4:25 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, anton

On Thu, 10 Mar 2016 16:37:47 +1100
Cyril Bur <cyrilbur@gmail.com> wrote:

> On Thu, 10 Mar 2016 10:01:07 +1100
> Michael Neuling <mikey@neuling.org> wrote:
> 
> > On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> >   
> > > Currently the assembly to save and restore Altivec registers boils down to
> > > a load immediate of the offset of the specific Altivec register in memory
> > > followed by the load/store which repeats in sequence for each Altivec
> > > register.
> > > 
> > > This patch attempts to do better by loading up four registers with
> > > immediates so that the loads and stores can be batched up and better
> > > pipelined by the processor.
> > > 
> > > This patch results in four load/stores in sequence and one add between
> > > groups of four. Also, by using a pair of base registers it means that the
> > > result of the add is not needed by the following instruction.    
> > 
> > What is the performance improvement?
> 
> So I have some numbers. This is the same context switch benchmark that was used
> for my other series, a modified version of:
> http://www.ozlabs.org/~anton/junkcode/context_switch2.c
> 
> We pingpong across a pipe and touch FP/VMX/VSX or some combination of them.
> The numbers are pingpongs per second; the tests run for 52 seconds, with the
> first two seconds of results ignored to allow the test to stabilise.
> 
> Run 1
> Touched Facility  Average  Stddev   Speedup   %Speedup  Speedup/Stddev
> None              1845984  8261.28  15017.32  100.8201  1.817793
> FPU               1639296  3966.94  54770.04  103.4565  13.80660
> FPU + VEC         1555836  3708.59  34533.72  102.2700  9.311813
> FPU + VSX         1523202  78984.7  -362.64   99.97619  -0.00459
> VEC               1631529  23665.5  -11818.0  99.28085  -0.49937
> VEC + VSX         1543007  24614.0  32330.52  102.1401  1.313500
> VSX               1554007  28723.0  40296.56  102.6621  1.402932
> FPU + VEC + VSX   1546072  17201.1  41687.28  102.7710  2.423519
> 
> Run 2
> Touched Facility  Average  Stddev   Speedup  %Speedup  Speedup/Stddev
> None              1837869  30263.4  -7780.6  99.57843  -0.25709
> FPU               1587883  70260.6  -23927   98.51546  -0.34055
> FPU + VEC         1552703  13563.6  37243.4  102.4575  2.745831
> FPU + VSX         1558519  13706.7  32365.9  102.1207  2.361308
> VEC               1651599  1388.83  13474.3  100.8225  9.701918
> VEC + VSX         1552319  1752.77  42443.2  102.8110  24.21487
> VSX               1559806  7891.66  55542.5  103.6923  7.038124
> FPU + VEC + VSX   1549330  22148.1  29010.8  101.9082  1.309849
> 
> 
> I can't help but notice these are noisy. These were run on KVM on a fairly
> busy box. I wonder if the numbers smooth out on an otherwise idle machine. It
> doesn't look super consistent across two runs.

Did four runs on an OpenPower system, booted into a buildroot ramfs with
basically nothing running except the ssh daemon and busybox.

Run 1
Touched         Average         Stddev          % of Average    Speedup         %Speedup        Speedup/Stddev
None		1772303.48	1029.674394	0.05809808564	-2514.2 	99.85834038	-2.441742764
FPU		1564827.36	623.4840148	0.03984362945	21815.16	101.4138035	34.98912479
FPU + VEC 	1485572.76	865.0766997	0.05823186336	34317.64	102.3646869	39.6700547
FPU + VSX	1485859.72	606.5295067	0.04082010559	47010.68	103.267242	77.50765541
VEC		1571237.88	927.2565997	0.0590143995	8459.76 	100.5413283	9.123429268
VEC + VSX	1480972.92	652.0581181	0.04402903722	47769.32	103.3330449	73.25929802
VSX		1468496.12	140955.9898	9.598662731	10461.72	100.7175222	0.0742197619
All		1480732.36	777.801684	0.05252817491	50908.56408	103.5604782	65.45185634

Run 2
Touched         Average         Stddev          % of Average     Speedup        %Speedup        Speedup/Stddev
None		1773836.72	868.7485496	0.0489756774	-2374.68	99.86630645	-2.733449168
FPU		1564397.8	688.6031113	0.04401713626	21650.48	101.4033717	31.44115913
FPU + VEC	1484855.48	905.2225623	0.06096368128	33020.52	102.274399	36.47779162
FPU + VSX	1486762.64	933.4386234	0.06278329831	48576.48	103.3776212	52.04035786
VEC		1571551.44	785.1688538	0.04996138426	7980.44 	100.5103983	10.16397933
VEC + VSX	1480887.72	574.0000725	0.03876053969	49535.16	103.4607239	86.29817725
VSX		1489902.32	546.0581355	0.03665059972	35301.64	102.4268956	64.64813489
All		1480020.88	733.4377717	0.04955590705	54243.28	103.8044699	73.95757636

Run 3
Touched         Average         Stddev          % of Average    Speedup         %Speedup        Speedup/Stddev
None		1766262.8	614.1256934	0.03476978021	-5233.36	99.70457966	-8.521643136
FPU		1565494.04	627.6075064	0.04009006041	22975.6 	101.4894862	36.60823009
FPU + VEC	1484286.4	569.3538049	0.03835875643	34148.76	102.3548634	59.97810098
FPU + VSX	1486067.76	847.4846952	0.0570286711	45883.84	103.1859709	54.14120191
VEC		1570977.16	860.8764485	0.05479878832	11356.4 	100.7281514	13.1916723
VEC + VSX	1480634.88	912.5305617	0.06163103234	48774.88	103.4064001	53.45013312
VSX		1488969.72	697.1188532	0.04681887374	29984.4 	102.0551543	43.01189082
All		1479573.16	681.8953286	0.04608730052	47822.32	103.3401286	70.13146739

Run 4
Touched         Average         Stddev          % of Average    Speedup         %Speedup        Speedup/Stddev
None		1773258.28	1230.022665	0.06936511613	-252.88 	99.98574128	-0.2055897075
FPU		1564657.04	665.6801021	0.04254479321	20985.88	101.3594787	31.52547287
FPU + VEC	1485552 	542.697721	0.03653172161	34167.56	102.3541358	62.95873132
FPU + VSX	1487035.84	659.5565412	0.04435377571	44355.44	103.074516	67.25039815
VEC		1570597.04	726.2013259	0.0462372784	10325.4 	100.6617694	14.21837118
VEC + VSX	1480685.84	874.2851391	0.05904595799	49180.12	103.4355518	56.25180825
VSX		1489545.44	718.5865991	0.0482420059	34501.6 	102.3711725	48.01314141
All		1480782.08	702.6882445	0.04745385928	49976.52	103.4928939	71.12189565

Compared to the two runs done under KVM, these are much more consistent. I
can't explain the Run 1 VSX result; there are a few possibilities. Given how
much more consistent all the other results are, I'm putting that down to
running on bare metal.

Adding to the good news, the lack of speedup in the None case is expected:
that path never exercises the optimisation.
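
For anyone decoding the derived columns: the sketch below shows the arithmetic
I believe is behind them, not the actual post-processing script. The baseline
(unpatched) average is back-computed from the Run 2 "FPU + VSX" row purely to
show that the numbers line up.

#include <stdio.h>

int main(void)
{
	double baseline_avg = 1438186.16;	/* unpatched kernel, same test (inferred) */
	double patched_avg  = 1486762.64;	/* Run 2, FPU + VSX average */
	double patched_sd   = 933.4386234;	/* stddev of the patched samples */

	/* "% of Average": noise as a percentage of the mean */
	double pct_of_avg  = patched_sd / patched_avg * 100.0;
	/* absolute and relative gain over the unpatched kernel */
	double speedup     = patched_avg - baseline_avg;
	double pct_speedup = patched_avg / baseline_avg * 100.0;
	/* gain expressed in units of run-to-run noise */
	double sd_ratio    = speedup / patched_sd;

	printf("%%ofAvg %.4f Speedup %.2f %%Speedup %.4f Speedup/Stddev %.4f\n",
	       pct_of_avg, speedup, pct_speedup, sd_ratio);
	return 0;
}

A Speedup/Stddev in the tens, as in most rows above, means the gain is well
outside the measurement noise for that configuration.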


> 
> >   
> > > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > > ---
> > > These patches need to be applied on top of my rework of FPU/VMX/VSX
> > > switching: https://patchwork.ozlabs.org/patch/589703/
> > > 
> > > I left in some of my comments indicating if functions are called from C or
> > > not. Looking at them now, they might be a bit much, let me know what you
> > > think.    
> > 
> > I think that's ok, although they are likely to get stale quickly.
> >   
> > > 
> > > Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
> > > 
> > > 
> > >  arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
> > >  arch/powerpc/kernel/tm.S           |  6 ++--
> > >  arch/powerpc/kernel/vector.S       | 20 +++++++++---
> > >  3 files changed, 70 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> > > index 499d9f8..5ba69ed 100644
> > > --- a/arch/powerpc/include/asm/ppc_asm.h
> > > +++ b/arch/powerpc/include/asm/ppc_asm.h
> > > @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> > >  #define REST_16FPRS(n, base)  	  REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> > >  #define REST_32FPRS(n, base)  	  REST_16FPRS(n, base); REST_16FPRS(n+16, base)
> > >  
> > > -#define SAVE_VR(n,b,base)  	  li b,16*(n);  stvx n,base,b
> > > -#define SAVE_2VRS(n,b,base)  	  SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> > > -#define SAVE_4VRS(n,b,base)  	  SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
> > > -#define SAVE_8VRS(n,b,base)  	  SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
> > > -#define SAVE_16VRS(n,b,base)  	  SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
> > > -#define SAVE_32VRS(n,b,base)  	  SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
> > > -#define REST_VR(n,b,base)  	  li b,16*(n); lvx n,base,b
> > > -#define REST_2VRS(n,b,base)  	  REST_VR(n,b,base); REST_VR(n+1,b,base)
> > > -#define REST_4VRS(n,b,base)  	  REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
> > > -#define REST_8VRS(n,b,base)  	  REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
> > > -#define REST_16VRS(n,b,base)  	  REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
> > > -#define REST_32VRS(n,b,base)  	  REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)    
> > 
> > Can you use consistent names between off and reg in the below?
> >   
> > > +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> > > +  	  stvx n,base,off0; \
> > > +  	  stvx n+1,base,off1; \
> > > +  	  stvx n+2,base,off2; \
> > > +  	  stvx n+3,base,off3
> > > +
> > > +/* Restores the base for the caller */    
> > 
> > Can you make this:
> > /* Base: non-volatile, reg[0-4]: volatile */
> >   
> > > +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > > +  	  addi reg4,base,64; \
> > > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > > +  	  __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > > +  	  __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  subi base,base,384    
> > 
> > You can swap these last two lines which will make base reuse quicker
> > later.  Although that might not be needed.
> >   
> > > +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> > > +  	  lvx n,base,off0; \
> > > +  	  lvx n+1,base,off1; \
> > > +  	  lvx n+2,base,off2; \
> > > +  	  lvx n+3,base,off3
> > > +
> > > +/* Restores the base for the caller */
> > > +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> > > +  	  addi reg4,base,64; \
> > > +  	  li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> > > +  	  __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> > > +  	  addi base,base,128; \
> > > +  	  __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  addi reg4,reg4,128; \
> > > +  	  __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> > > +  	  __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> > > +  	  subi base,base,384
> > >  
> > >  #ifdef __BIG_ENDIAN__
> > >  #define STXVD2X_ROT(n,b,base)  	  	  STXVD2X(n,b,base)
> > > diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> > > index bf8f34a..81e1305 100644
> > > --- a/arch/powerpc/kernel/tm.S
> > > +++ b/arch/powerpc/kernel/tm.S
> > > @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
> > >   * they will abort back to the checkpointed state we save out here.
> > >   *
> > >   * Call with IRQs off, stacks get all out of sync for some periods in here!
> > > + *
> > > + * Is called from C
> > >   */
> > >  _GLOBAL(tm_reclaim)
> > >    	  mfcr  	  r6
> > > @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
> > >    	  beq  	  dont_backup_vec
> > >  
> > >    	  addi  	  r7, r3, THREAD_TRANSACT_VRSTATE
> > > -  	  SAVE_32VRS(0, r6, r7)  	  /* r6 scratch, r7 transact vr state */
> > > +  	  SAVE_32VRS(r6,r8,r9,r10,r11,r7)  	  /* r6,r8,r9,r10,r11 scratch, r7 transact vr state */    
> > 
> > Line wrapping here.
> >   
> > >    	  mfvscr  	  v0
> > >    	  li  	  r6, VRSTATE_VSCR
> > >    	  stvx  	  v0, r7, r6
> > > @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
> > >    	  li  	  r5, VRSTATE_VSCR
> > >    	  lvx  	  v0, r8, r5
> > >    	  mtvscr  	  v0
> > > -  	  REST_32VRS(0, r5, r8)  	  	  	  /* r5 scratch, r8 ptr */
> > > +  	  REST_32VRS(r5,r6,r9,r10,r11,r8)  	  	  	  /* r5,r6,r9,r10,r11 scratch, r8 ptr */    
> > 
> > wrapping here too
> > 
> >   
> > >  dont_restore_vec:
> > >    	  ld  	  r5, THREAD_VRSAVE(r3)
> > >    	  mtspr  	  SPRN_VRSAVE, r5
> > > diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> > > index 1c2e7a3..8d587fb 100644
> > > --- a/arch/powerpc/kernel/vector.S
> > > +++ b/arch/powerpc/kernel/vector.S
> > > @@ -13,6 +13,8 @@
> > >   * This is similar to load_up_altivec but for the transactional version of the
> > >   * vector regs.  It doesn't mess with the task MSR or valid flags.
> > >   * Furthermore, VEC laziness is not supported with TM currently.
> > > + *
> > > + * Is called from C
> > >   */
> > >  _GLOBAL(do_load_up_transact_altivec)
> > >    	  mfmsr  	  r6
> > > @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
> > >    	  lvx  	  v0,r10,r3
> > >    	  mtvscr  	  v0
> > >    	  addi  	  r10,r3,THREAD_TRANSACT_VRSTATE
> > > -  	  REST_32VRS(0,r4,r10)
> > > +  	  REST_32VRS(r4,r5,r6,r7,r8,r10)
> > >  
> > >    	  blr
> > >  #endif
> > > @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
> > >  /*
> > >   * Load state from memory into VMX registers including VSCR.
> > >   * Assumes the caller has enabled VMX in the MSR.
> > > + *
> > > + * Is called from C
> > >   */
> > >  _GLOBAL(load_vr_state)
> > >    	  li  	  r4,VRSTATE_VSCR
> > >    	  lvx  	  v0,r4,r3
> > >    	  mtvscr  	  v0
> > > -  	  REST_32VRS(0,r4,r3)
> > > +  	  REST_32VRS(r4,r5,r6,r7,r8,r3)
> > >    	  blr
> > >  
> > >  /*
> > >   * Store VMX state into memory, including VSCR.
> > >   * Assumes the caller has enabled VMX in the MSR.
> > > + *
> > > + * NOT called from C
> > >   */
> > >  _GLOBAL(store_vr_state)
> > > -  	  SAVE_32VRS(0, r4, r3)
> > > +  	  SAVE_32VRS(r4,r5,r6,r7,r8,r3)
> > >    	  mfvscr  	  v0
> > >    	  li  	  r4, VRSTATE_VSCR
> > >    	  stvx  	  v0, r4, r3
> > > @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
> > >   *
> > >   * Note that on 32-bit this can only use registers that will be
> > >   * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> > > + *
> > > + * NOT called from C
> > >   */
> > >  _GLOBAL(load_up_altivec)
> > >    	  mfmsr  	  r5  	  	  	  /* grab the current MSR */
> > > @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
> > >    	  stw  	  r4,THREAD_USED_VR(r5)
> > >    	  lvx  	  v0,r10,r6
> > >    	  mtvscr  	  v0
> > > -  	  REST_32VRS(0,r4,r6)
> > > +  	  REST_32VRS(r3,r4,r5,r10,r11,r6)
> > >    	  /* restore registers and return */
> > >    	  blr
> > >  
> > >  /*
> > >   * save_altivec(tsk)
> > >   * Save the vector registers to its thread_struct
> > > + *
> > > + * Is called from C
> > >   */
> > >  _GLOBAL(save_altivec)
> > >    	  addi  	  r3,r3,THREAD  	  	  /* want THREAD of task */
> > > @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
> > >    	  PPC_LCMPI  	  0,r7,0
> > >    	  bne  	  2f
> > >    	  addi  	  r7,r3,THREAD_VRSTATE
> > > -2:  	  SAVE_32VRS(0,r4,r7)
> > > +2:  	  SAVE_32VRS(r4,r5,r6,r8,r9,r7)
> > >    	  mfvscr  	  v0
> > >    	  li  	  r4,VRSTATE_VSCR
> > >    	  stvx  	  v0,r4,r7    
> 


Thread overview: 8+ messages
2016-03-01  5:55 [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Cyril Bur
2016-03-01  5:55 ` [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX Cyril Bur
2016-03-10  0:09   ` Michael Neuling
2016-03-10  1:02     ` Cyril Bur
2016-03-09 23:01 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Michael Neuling
2016-03-10  0:56   ` Cyril Bur
2016-03-10  5:37   ` Cyril Bur
2016-03-11  4:25     ` Cyril Bur
