All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-13 22:20 ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-13 22:20 UTC (permalink / raw)
  To: kvmarm, linux-arm-kernel; +Cc: Marc Zyngier, Catalin Marinas, Will Deacon, kvm

Currently VFP/SIMD registers are always saved and restored
on Guest entry and exit.

This patch only saves and restores VFP/SIMD registers on
Guest access. To do this cptr_el2 VFP/SIMD trap is set
on Guest entry and later checked on exit. This follows
the ARMv7 VFPv3 implementation. Running an informal test
there are high number of exits that don't access VFP/SIMD
registers.

Tested on FVP Model, executed threads on host and
Guest accessing VFP/SIMD registers resulting in consistent
results.


Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm64/include/asm/kvm_arm.h |  5 +++-
 arch/arm64/kvm/hyp.S             | 57
+++++++++++++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h
b/arch/arm64/include/asm/kvm_arm.h
index ac6fafb..7605e09 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -171,10 +171,13 @@
 #define HSTR_EL2_TTEE	(1 << 16)
 #define HSTR_EL2_T(x)	(1 << x)

+/* Hyp Coproccessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC	(1 << 31)
 #define CPTR_EL2_TTA	(1 << 20)
-#define CPTR_EL2_TFP	(1 << 10)
+#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)

 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA		(1 << 11)
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd01..b3044b4 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -673,6 +673,24 @@
 	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm

+/*
+ * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
+ */
+.macro skip_fpsimd_state tmp, target
+	mrs	\tmp, cptr_el2
+	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
+/*
+ * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
+ * Also disable all cptr traps on return to host.
+ */
+.macro skip_fpsimd_state_reset tmp, target
+	mrs	\tmp, cptr_el2
+	msr	cptr_el2, xzr
+	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
 	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
 	// is set, we do a full save/restore cycle and disable trapping.
@@ -763,6 +781,7 @@
 	ldr     x2, [x0, #VCPU_HCR_EL2]
 	msr     hcr_el2, x2
 	mov	x2, #CPTR_EL2_TTA
+	orr	x2, x2, #CPTR_EL2_TFP
 	msr	cptr_el2, x2

 	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
@@ -785,7 +804,6 @@
 .macro deactivate_traps
 	mov	x2, #HCR_RW
 	msr	hcr_el2, x2
-	msr	cptr_el2, xzr
 	msr	hstr_el2, xzr

 	mrs	x2, mdcr_el2
@@ -911,6 +929,30 @@ __save_fpsimd:
 __restore_fpsimd:
 	restore_fpsimd
 	ret
+/*
+ * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
access
+ * bit to disable trapping.
+ */
+switch_to_guest_vfp:
+	ldr	x2, =(CPTR_EL2_TTA)
+	msr	cptr_el2, x2
+
+	mrs	x0, tpidr_el2
+
+	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
+	kern_hyp_va x2
+
+	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_save x3, 1
+
+	add	x2, x0, #VCPU_CONTEXT
+	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_restore x3, 1
+
+	pop	x2, x3
+	pop	x0, x1
+
+	eret

 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
 	kern_hyp_va x2

 	save_host_regs
-	bl __save_fpsimd
 	bl __save_sysregs

 	compute_debug_state 1f
@@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
 	add	x2, x0, #VCPU_CONTEXT

 	bl __restore_sysregs
-	bl __restore_fpsimd

 	skip_debug_state x3, 1f
 	bl	__restore_debug
@@ -967,7 +1007,9 @@ __kvm_vcpu_return:
 	add	x2, x0, #VCPU_CONTEXT

 	save_guest_regs
+	skip_fpsimd_state x3, 1f
 	bl __save_fpsimd
+1:
 	bl __save_sysregs

 	skip_debug_state x3, 1f
@@ -986,8 +1028,11 @@ __kvm_vcpu_return:
 	kern_hyp_va x2

 	bl __restore_sysregs
-	bl __restore_fpsimd

+	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
+	skip_fpsimd_state_reset x3, 1f
+	bl __restore_fpsimd
+1:
 	skip_debug_state x3, 1f
 	// Clear the dirty flag for the next run, as all the state has
 	// already been saved. Note that we nuke the whole 64bit word.
@@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
 	mrs	x1, esr_el2
 	lsr	x2, x1, #ESR_ELx_EC_SHIFT

+	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
+	cmp	x2, #ESR_ELx_EC_FP_ASIMD
+	b.eq	switch_to_guest_vfp
+
 	cmp	x2, #ESR_ELx_EC_HVC64
 	b.ne	el1_trap

-- 
1.9.1

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-13 22:20 ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-13 22:20 UTC (permalink / raw)
  To: linux-arm-kernel

Currently VFP/SIMD registers are always saved and restored
on Guest entry and exit.

This patch only saves and restores VFP/SIMD registers on
Guest access. To do this cptr_el2 VFP/SIMD trap is set
on Guest entry and later checked on exit. This follows
the ARMv7 VFPv3 implementation. Running an informal test
there are high number of exits that don't access VFP/SIMD
registers.

Tested on FVP Model, executed threads on host and
Guest accessing VFP/SIMD registers resulting in consistent
results.


Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm64/include/asm/kvm_arm.h |  5 +++-
 arch/arm64/kvm/hyp.S             | 57
+++++++++++++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h
b/arch/arm64/include/asm/kvm_arm.h
index ac6fafb..7605e09 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -171,10 +171,13 @@
 #define HSTR_EL2_TTEE	(1 << 16)
 #define HSTR_EL2_T(x)	(1 << x)

+/* Hyp Coproccessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC	(1 << 31)
 #define CPTR_EL2_TTA	(1 << 20)
-#define CPTR_EL2_TFP	(1 << 10)
+#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)

 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA		(1 << 11)
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd01..b3044b4 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -673,6 +673,24 @@
 	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm

+/*
+ * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
+ */
+.macro skip_fpsimd_state tmp, target
+	mrs	\tmp, cptr_el2
+	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
+/*
+ * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
+ * Also disable all cptr traps on return to host.
+ */
+.macro skip_fpsimd_state_reset tmp, target
+	mrs	\tmp, cptr_el2
+	msr	cptr_el2, xzr
+	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
 	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
 	// is set, we do a full save/restore cycle and disable trapping.
@@ -763,6 +781,7 @@
 	ldr     x2, [x0, #VCPU_HCR_EL2]
 	msr     hcr_el2, x2
 	mov	x2, #CPTR_EL2_TTA
+	orr	x2, x2, #CPTR_EL2_TFP
 	msr	cptr_el2, x2

 	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
@@ -785,7 +804,6 @@
 .macro deactivate_traps
 	mov	x2, #HCR_RW
 	msr	hcr_el2, x2
-	msr	cptr_el2, xzr
 	msr	hstr_el2, xzr

 	mrs	x2, mdcr_el2
@@ -911,6 +929,30 @@ __save_fpsimd:
 __restore_fpsimd:
 	restore_fpsimd
 	ret
+/*
+ * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
access
+ * bit to disable trapping.
+ */
+switch_to_guest_vfp:
+	ldr	x2, =(CPTR_EL2_TTA)
+	msr	cptr_el2, x2
+
+	mrs	x0, tpidr_el2
+
+	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
+	kern_hyp_va x2
+
+	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_save x3, 1
+
+	add	x2, x0, #VCPU_CONTEXT
+	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+	fpsimd_restore x3, 1
+
+	pop	x2, x3
+	pop	x0, x1
+
+	eret

 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
 	kern_hyp_va x2

 	save_host_regs
-	bl __save_fpsimd
 	bl __save_sysregs

 	compute_debug_state 1f
@@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
 	add	x2, x0, #VCPU_CONTEXT

 	bl __restore_sysregs
-	bl __restore_fpsimd

 	skip_debug_state x3, 1f
 	bl	__restore_debug
@@ -967,7 +1007,9 @@ __kvm_vcpu_return:
 	add	x2, x0, #VCPU_CONTEXT

 	save_guest_regs
+	skip_fpsimd_state x3, 1f
 	bl __save_fpsimd
+1:
 	bl __save_sysregs

 	skip_debug_state x3, 1f
@@ -986,8 +1028,11 @@ __kvm_vcpu_return:
 	kern_hyp_va x2

 	bl __restore_sysregs
-	bl __restore_fpsimd

+	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
+	skip_fpsimd_state_reset x3, 1f
+	bl __restore_fpsimd
+1:
 	skip_debug_state x3, 1f
 	// Clear the dirty flag for the next run, as all the state has
 	// already been saved. Note that we nuke the whole 64bit word.
@@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
 	mrs	x1, esr_el2
 	lsr	x2, x1, #ESR_ELx_EC_SHIFT

+	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
+	cmp	x2, #ESR_ELx_EC_FP_ASIMD
+	b.eq	switch_to_guest_vfp
+
 	cmp	x2, #ESR_ELx_EC_HVC64
 	b.ne	el1_trap

-- 
1.9.1

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-13 22:20 ` Mario Smarduch
@ 2015-06-15 10:00   ` Marc Zyngier
  -1 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 10:00 UTC (permalink / raw)
  To: Mario Smarduch, kvmarm, linux-arm-kernel
  Cc: christoffer.dall, Catalin Marinas, Will Deacon, kvm

Hi Mario,

I was working on a more ambitious patch series, but we probably ought to
start small, and this looks fairly sensible to me.

A few minor comments below.

On 13/06/15 23:20, Mario Smarduch wrote:
> Currently VFP/SIMD registers are always saved and restored
> on Guest entry and exit.
> 
> This patch only saves and restores VFP/SIMD registers on
> Guest access. To do this cptr_el2 VFP/SIMD trap is set
> on Guest entry and later checked on exit. This follows
> the ARMv7 VFPv3 implementation. Running an informal test
> there are high number of exits that don't access VFP/SIMD
> registers.

It would be good to add some numbers here. How often do we exit without
having touched the FPSIMD regs? For which workload?

> Tested on FVP Model, executed threads on host and
> Guest accessing VFP/SIMD registers resulting in consistent
> results.
> 
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm64/include/asm/kvm_arm.h |  5 +++-
>  arch/arm64/kvm/hyp.S             | 57
> +++++++++++++++++++++++++++++++++++++---
>  2 files changed, 57 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_arm.h
> b/arch/arm64/include/asm/kvm_arm.h
> index ac6fafb..7605e09 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -171,10 +171,13 @@
>  #define HSTR_EL2_TTEE	(1 << 16)
>  #define HSTR_EL2_T(x)	(1 << x)
> 
> +/* Hyp Coproccessor Trap Register Shifts */
> +#define CPTR_EL2_TFP_SHIFT 10
> +
>  /* Hyp Coprocessor Trap Register */
>  #define CPTR_EL2_TCPAC	(1 << 31)
>  #define CPTR_EL2_TTA	(1 << 20)
> -#define CPTR_EL2_TFP	(1 << 10)
> +#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
> 
>  /* Hyp Debug Configuration Register bits */
>  #define MDCR_EL2_TDRA		(1 << 11)
> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> index 5befd01..b3044b4 100644
> --- a/arch/arm64/kvm/hyp.S
> +++ b/arch/arm64/kvm/hyp.S
> @@ -673,6 +673,24 @@
>  	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
>  .endm
> 
> +/*
> + * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
> + */
> +.macro skip_fpsimd_state tmp, target
> +	mrs	\tmp, cptr_el2
> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
> +.endm
> +
> +/*
> + * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
> + * Also disable all cptr traps on return to host.
> + */
> +.macro skip_fpsimd_state_reset tmp, target
> +	mrs	\tmp, cptr_el2
> +	msr	cptr_el2, xzr
> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
> +.endm
> +
>  .macro compute_debug_state target
>  	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
>  	// is set, we do a full save/restore cycle and disable trapping.
> @@ -763,6 +781,7 @@
>  	ldr     x2, [x0, #VCPU_HCR_EL2]
>  	msr     hcr_el2, x2
>  	mov	x2, #CPTR_EL2_TTA
> +	orr	x2, x2, #CPTR_EL2_TFP
>  	msr	cptr_el2, x2
> 
>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
> @@ -785,7 +804,6 @@
>  .macro deactivate_traps
>  	mov	x2, #HCR_RW
>  	msr	hcr_el2, x2
> -	msr	cptr_el2, xzr
>  	msr	hstr_el2, xzr
> 
>  	mrs	x2, mdcr_el2
> @@ -911,6 +929,30 @@ __save_fpsimd:
>  __restore_fpsimd:
>  	restore_fpsimd
>  	ret
> +/*
> + * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
> access
> + * bit to disable trapping.
> + */
> +switch_to_guest_vfp:

Consider renaming this to "switch_to_guest_fpsimd" for consistency.

> +	ldr	x2, =(CPTR_EL2_TTA)
> +	msr	cptr_el2, x2

I'd feel more comfortable if you read cptr_el2, cleared the TFP bit, and
wrote the result back. Loading an arbitrary value is likely to cause
bugs in the long run.

> +
> +	mrs	x0, tpidr_el2
> +
> +	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
> +	kern_hyp_va x2
> +
> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_save x3, 1
> +
> +	add	x2, x0, #VCPU_CONTEXT
> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_restore x3, 1

That's quite a lot of code expansion (fpsimd_{save,restore} are macros).

How about saving x4 and lr on the stack, and doing a a bl in each case?
That would be consistent with the rest of the code (well, what's left of
it).

> +
> +	pop	x2, x3
> +	pop	x0, x1
> +
> +	eret
> 
>  /*
>   * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
> @@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
>  	kern_hyp_va x2
> 
>  	save_host_regs
> -	bl __save_fpsimd
>  	bl __save_sysregs
> 
>  	compute_debug_state 1f
> @@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
>  	add	x2, x0, #VCPU_CONTEXT
> 
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
> 
>  	skip_debug_state x3, 1f
>  	bl	__restore_debug
> @@ -967,7 +1007,9 @@ __kvm_vcpu_return:
>  	add	x2, x0, #VCPU_CONTEXT
> 
>  	save_guest_regs
> +	skip_fpsimd_state x3, 1f
>  	bl __save_fpsimd
> +1:
>  	bl __save_sysregs
> 
>  	skip_debug_state x3, 1f
> @@ -986,8 +1028,11 @@ __kvm_vcpu_return:
>  	kern_hyp_va x2
> 
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
> 
> +	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
> +	skip_fpsimd_state_reset x3, 1f
> +	bl __restore_fpsimd
> +1:

Neither the macro name (skip_fpsimd_state_reset) nor the comment make it
clear that it is also re-enabling access to the trace registers.

I'd rather see:

	skip_fpsimd_state x3, 1f
	bl __restore_fpsimd
1:
	/* Clear FPSIMD and Trace trapping */
	msr	cptr_el2, xzr

>  	skip_debug_state x3, 1f
>  	// Clear the dirty flag for the next run, as all the state has
>  	// already been saved. Note that we nuke the whole 64bit word.
> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>  	mrs	x1, esr_el2
>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
> 
> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
> +	b.eq	switch_to_guest_vfp
> +

I'd prefer you moved that hunk to el1_trap, where we handle all the
traps coming from the guest.

>  	cmp	x2, #ESR_ELx_EC_HVC64
>  	b.ne	el1_trap
> 

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 10:00   ` Marc Zyngier
  0 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 10:00 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Mario,

I was working on a more ambitious patch series, but we probably ought to
start small, and this looks fairly sensible to me.

A few minor comments below.

On 13/06/15 23:20, Mario Smarduch wrote:
> Currently VFP/SIMD registers are always saved and restored
> on Guest entry and exit.
> 
> This patch only saves and restores VFP/SIMD registers on
> Guest access. To do this cptr_el2 VFP/SIMD trap is set
> on Guest entry and later checked on exit. This follows
> the ARMv7 VFPv3 implementation. Running an informal test
> there are high number of exits that don't access VFP/SIMD
> registers.

It would be good to add some numbers here. How often do we exit without
having touched the FPSIMD regs? For which workload?

> Tested on FVP Model, executed threads on host and
> Guest accessing VFP/SIMD registers resulting in consistent
> results.
> 
> 
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
>  arch/arm64/include/asm/kvm_arm.h |  5 +++-
>  arch/arm64/kvm/hyp.S             | 57
> +++++++++++++++++++++++++++++++++++++---
>  2 files changed, 57 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_arm.h
> b/arch/arm64/include/asm/kvm_arm.h
> index ac6fafb..7605e09 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -171,10 +171,13 @@
>  #define HSTR_EL2_TTEE	(1 << 16)
>  #define HSTR_EL2_T(x)	(1 << x)
> 
> +/* Hyp Coproccessor Trap Register Shifts */
> +#define CPTR_EL2_TFP_SHIFT 10
> +
>  /* Hyp Coprocessor Trap Register */
>  #define CPTR_EL2_TCPAC	(1 << 31)
>  #define CPTR_EL2_TTA	(1 << 20)
> -#define CPTR_EL2_TFP	(1 << 10)
> +#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
> 
>  /* Hyp Debug Configuration Register bits */
>  #define MDCR_EL2_TDRA		(1 << 11)
> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> index 5befd01..b3044b4 100644
> --- a/arch/arm64/kvm/hyp.S
> +++ b/arch/arm64/kvm/hyp.S
> @@ -673,6 +673,24 @@
>  	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
>  .endm
> 
> +/*
> + * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
> + */
> +.macro skip_fpsimd_state tmp, target
> +	mrs	\tmp, cptr_el2
> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
> +.endm
> +
> +/*
> + * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
> + * Also disable all cptr traps on return to host.
> + */
> +.macro skip_fpsimd_state_reset tmp, target
> +	mrs	\tmp, cptr_el2
> +	msr	cptr_el2, xzr
> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
> +.endm
> +
>  .macro compute_debug_state target
>  	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
>  	// is set, we do a full save/restore cycle and disable trapping.
> @@ -763,6 +781,7 @@
>  	ldr     x2, [x0, #VCPU_HCR_EL2]
>  	msr     hcr_el2, x2
>  	mov	x2, #CPTR_EL2_TTA
> +	orr	x2, x2, #CPTR_EL2_TFP
>  	msr	cptr_el2, x2
> 
>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
> @@ -785,7 +804,6 @@
>  .macro deactivate_traps
>  	mov	x2, #HCR_RW
>  	msr	hcr_el2, x2
> -	msr	cptr_el2, xzr
>  	msr	hstr_el2, xzr
> 
>  	mrs	x2, mdcr_el2
> @@ -911,6 +929,30 @@ __save_fpsimd:
>  __restore_fpsimd:
>  	restore_fpsimd
>  	ret
> +/*
> + * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
> access
> + * bit to disable trapping.
> + */
> +switch_to_guest_vfp:

Consider renaming this to "switch_to_guest_fpsimd" for consistency.

> +	ldr	x2, =(CPTR_EL2_TTA)
> +	msr	cptr_el2, x2

I'd feel more comfortable if you read cptr_el2, cleared the TFP bit, and
wrote the result back. Loading an arbitrary value is likely to cause
bugs in the long run.

> +
> +	mrs	x0, tpidr_el2
> +
> +	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
> +	kern_hyp_va x2
> +
> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_save x3, 1
> +
> +	add	x2, x0, #VCPU_CONTEXT
> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> +	fpsimd_restore x3, 1

That's quite a lot of code expansion (fpsimd_{save,restore} are macros).

How about saving x4 and lr on the stack, and doing a a bl in each case?
That would be consistent with the rest of the code (well, what's left of
it).

> +
> +	pop	x2, x3
> +	pop	x0, x1
> +
> +	eret
> 
>  /*
>   * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
> @@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
>  	kern_hyp_va x2
> 
>  	save_host_regs
> -	bl __save_fpsimd
>  	bl __save_sysregs
> 
>  	compute_debug_state 1f
> @@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
>  	add	x2, x0, #VCPU_CONTEXT
> 
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
> 
>  	skip_debug_state x3, 1f
>  	bl	__restore_debug
> @@ -967,7 +1007,9 @@ __kvm_vcpu_return:
>  	add	x2, x0, #VCPU_CONTEXT
> 
>  	save_guest_regs
> +	skip_fpsimd_state x3, 1f
>  	bl __save_fpsimd
> +1:
>  	bl __save_sysregs
> 
>  	skip_debug_state x3, 1f
> @@ -986,8 +1028,11 @@ __kvm_vcpu_return:
>  	kern_hyp_va x2
> 
>  	bl __restore_sysregs
> -	bl __restore_fpsimd
> 
> +	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
> +	skip_fpsimd_state_reset x3, 1f
> +	bl __restore_fpsimd
> +1:

Neither the macro name (skip_fpsimd_state_reset) nor the comment make it
clear that it is also re-enabling access to the trace registers.

I'd rather see:

	skip_fpsimd_state x3, 1f
	bl __restore_fpsimd
1:
	/* Clear FPSIMD and Trace trapping */
	msr	cptr_el2, xzr

>  	skip_debug_state x3, 1f
>  	// Clear the dirty flag for the next run, as all the state has
>  	// already been saved. Note that we nuke the whole 64bit word.
> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>  	mrs	x1, esr_el2
>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
> 
> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
> +	b.eq	switch_to_guest_vfp
> +

I'd prefer you moved that hunk to el1_trap, where we handle all the
traps coming from the guest.

>  	cmp	x2, #ESR_ELx_EC_HVC64
>  	b.ne	el1_trap
> 

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 10:00   ` Marc Zyngier
@ 2015-06-15 18:04     ` Mario Smarduch
  -1 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 18:04 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvmarm, linux-arm-kernel, christoffer.dall, Catalin Marinas,
	Will Deacon, kvm

On 06/15/2015 03:00 AM, Marc Zyngier wrote:
> Hi Mario,
> 
> I was working on a more ambitious patch series, 
> but we probably ought to
> start small, and this looks fairly sensible to me.

Hi Marc,
   thanks for reviewing, I was thinking to post this
first and next iteration on guest access switch
back to host registers only upon  return to user space or
vCPU context switch. This should save more cycles for
various exits.

Were you thinking along the same lines or something
altogether different?

> 
> A few minor comments below.
> 
> On 13/06/15 23:20, Mario Smarduch wrote:
>> Currently VFP/SIMD registers are always saved and restored
>> on Guest entry and exit.
>>
>> This patch only saves and restores VFP/SIMD registers on
>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>> on Guest entry and later checked on exit. This follows
>> the ARMv7 VFPv3 implementation. Running an informal test
>> there are high number of exits that don't access VFP/SIMD
>> registers.
> 
> It would be good to add some numbers here. How often do we exit without
> having touched the FPSIMD regs? For which workload?

Lmbench is what I typically use, with ssh server, i.e., cause page
faults and interrupts - usually registers are not touched.
I'll run the tests again and define usually.

Any other loads you had in mind?

> 
>> Tested on FVP Model, executed threads on host and
>> Guest accessing VFP/SIMD registers resulting in consistent
>> results.
>>
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm64/include/asm/kvm_arm.h |  5 +++-
>>  arch/arm64/kvm/hyp.S             | 57
>> +++++++++++++++++++++++++++++++++++++---
>>  2 files changed, 57 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_arm.h
>> b/arch/arm64/include/asm/kvm_arm.h
>> index ac6fafb..7605e09 100644
>> --- a/arch/arm64/include/asm/kvm_arm.h
>> +++ b/arch/arm64/include/asm/kvm_arm.h
>> @@ -171,10 +171,13 @@
>>  #define HSTR_EL2_TTEE	(1 << 16)
>>  #define HSTR_EL2_T(x)	(1 << x)
>>
>> +/* Hyp Coproccessor Trap Register Shifts */
>> +#define CPTR_EL2_TFP_SHIFT 10
>> +
>>  /* Hyp Coprocessor Trap Register */
>>  #define CPTR_EL2_TCPAC	(1 << 31)
>>  #define CPTR_EL2_TTA	(1 << 20)
>> -#define CPTR_EL2_TFP	(1 << 10)
>> +#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
>>
>>  /* Hyp Debug Configuration Register bits */
>>  #define MDCR_EL2_TDRA		(1 << 11)
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index 5befd01..b3044b4 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -673,6 +673,24 @@
>>  	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
>>  .endm
>>
>> +/*
>> + * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
>> + */
>> +.macro skip_fpsimd_state tmp, target
>> +	mrs	\tmp, cptr_el2
>> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
>> +.endm
>> +
>> +/*
>> + * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
>> + * Also disable all cptr traps on return to host.
>> + */
>> +.macro skip_fpsimd_state_reset tmp, target
>> +	mrs	\tmp, cptr_el2
>> +	msr	cptr_el2, xzr
>> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
>> +.endm
>> +
>>  .macro compute_debug_state target
>>  	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
>>  	// is set, we do a full save/restore cycle and disable trapping.
>> @@ -763,6 +781,7 @@
>>  	ldr     x2, [x0, #VCPU_HCR_EL2]
>>  	msr     hcr_el2, x2
>>  	mov	x2, #CPTR_EL2_TTA
>> +	orr	x2, x2, #CPTR_EL2_TFP
>>  	msr	cptr_el2, x2
>>
>>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
>> @@ -785,7 +804,6 @@
>>  .macro deactivate_traps
>>  	mov	x2, #HCR_RW
>>  	msr	hcr_el2, x2
>> -	msr	cptr_el2, xzr
>>  	msr	hstr_el2, xzr
>>
>>  	mrs	x2, mdcr_el2
>> @@ -911,6 +929,30 @@ __save_fpsimd:
>>  __restore_fpsimd:
>>  	restore_fpsimd
>>  	ret
>> +/*
>> + * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
>> access
>> + * bit to disable trapping.
>> + */
>> +switch_to_guest_vfp:
> 
> Consider renaming this to "switch_to_guest_fpsimd" for consistency.
> 
>> +	ldr	x2, =(CPTR_EL2_TTA)
>> +	msr	cptr_el2, x2
> 
> I'd feel more comfortable if you read cptr_el2, cleared the TFP bit, and
> wrote the result back. Loading an arbitrary value is likely to cause
> bugs in the long run.
Yep will do.
> 
>> +
>> +	mrs	x0, tpidr_el2
>> +
>> +	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
>> +	kern_hyp_va x2
>> +
>> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>> +	fpsimd_save x3, 1
>> +
>> +	add	x2, x0, #VCPU_CONTEXT
>> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>> +	fpsimd_restore x3, 1
> 
> That's quite a lot of code expansion (fpsimd_{save,restore} are macros).
> 
> How about saving x4 and lr on the stack, and doing a a bl in each case?
> That would be consistent with the rest of the code (well, what's left of
> it).

Yeah, will change needed help/feedback on this.
> 
>> +
>> +	pop	x2, x3
>> +	pop	x0, x1
>> +
>> +	eret
>>
>>  /*
>>   * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>> @@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
>>  	kern_hyp_va x2
>>
>>  	save_host_regs
>> -	bl __save_fpsimd
>>  	bl __save_sysregs
>>
>>  	compute_debug_state 1f
>> @@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
>>  	add	x2, x0, #VCPU_CONTEXT
>>
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>
>>  	skip_debug_state x3, 1f
>>  	bl	__restore_debug
>> @@ -967,7 +1007,9 @@ __kvm_vcpu_return:
>>  	add	x2, x0, #VCPU_CONTEXT
>>
>>  	save_guest_regs
>> +	skip_fpsimd_state x3, 1f
>>  	bl __save_fpsimd
>> +1:
>>  	bl __save_sysregs
>>
>>  	skip_debug_state x3, 1f
>> @@ -986,8 +1028,11 @@ __kvm_vcpu_return:
>>  	kern_hyp_va x2
>>
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>
>> +	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
>> +	skip_fpsimd_state_reset x3, 1f
>> +	bl __restore_fpsimd
>> +1:
> 
> Neither the macro name (skip_fpsimd_state_reset) nor the comment make it
> clear that it is also re-enabling access to the trace registers.
> 
> I'd rather see:
> 
> 	skip_fpsimd_state x3, 1f
> 	bl __restore_fpsimd
> 1:
> 	/* Clear FPSIMD and Trace trapping */
> 	msr	cptr_el2, xzr

Will do.

> 
>>  	skip_debug_state x3, 1f
>>  	// Clear the dirty flag for the next run, as all the state has
>>  	// already been saved. Note that we nuke the whole 64bit word.
>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>  	mrs	x1, esr_el2
>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>
>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>> +	b.eq	switch_to_guest_vfp
>> +
> 
> I'd prefer you moved that hunk to el1_trap, where we handle all the
> traps coming from the guest.

I'm thinking would it make sense to update the armv7 side as
well. When reading both exit handlers the flow mirrors
each other.

Thanks,
   Mario
> 
>>  	cmp	x2, #ESR_ELx_EC_HVC64
>>  	b.ne	el1_trap
>>
> 
> Thanks,
> 
> 	M.
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 18:04     ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 18:04 UTC (permalink / raw)
  To: linux-arm-kernel

On 06/15/2015 03:00 AM, Marc Zyngier wrote:
> Hi Mario,
> 
> I was working on a more ambitious patch series, 
> but we probably ought to
> start small, and this looks fairly sensible to me.

Hi Marc,
   thanks for reviewing, I was thinking to post this
first and next iteration on guest access switch
back to host registers only upon  return to user space or
vCPU context switch. This should save more cycles for
various exits.

Were you thinking along the same lines or something
altogether different?

> 
> A few minor comments below.
> 
> On 13/06/15 23:20, Mario Smarduch wrote:
>> Currently VFP/SIMD registers are always saved and restored
>> on Guest entry and exit.
>>
>> This patch only saves and restores VFP/SIMD registers on
>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>> on Guest entry and later checked on exit. This follows
>> the ARMv7 VFPv3 implementation. Running an informal test
>> there are high number of exits that don't access VFP/SIMD
>> registers.
> 
> It would be good to add some numbers here. How often do we exit without
> having touched the FPSIMD regs? For which workload?

Lmbench is what I typically use, with ssh server, i.e., cause page
faults and interrupts - usually registers are not touched.
I'll run the tests again and define usually.

Any other loads you had in mind?

> 
>> Tested on FVP Model, executed threads on host and
>> Guest accessing VFP/SIMD registers resulting in consistent
>> results.
>>
>>
>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
>> ---
>>  arch/arm64/include/asm/kvm_arm.h |  5 +++-
>>  arch/arm64/kvm/hyp.S             | 57
>> +++++++++++++++++++++++++++++++++++++---
>>  2 files changed, 57 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_arm.h
>> b/arch/arm64/include/asm/kvm_arm.h
>> index ac6fafb..7605e09 100644
>> --- a/arch/arm64/include/asm/kvm_arm.h
>> +++ b/arch/arm64/include/asm/kvm_arm.h
>> @@ -171,10 +171,13 @@
>>  #define HSTR_EL2_TTEE	(1 << 16)
>>  #define HSTR_EL2_T(x)	(1 << x)
>>
>> +/* Hyp Coproccessor Trap Register Shifts */
>> +#define CPTR_EL2_TFP_SHIFT 10
>> +
>>  /* Hyp Coprocessor Trap Register */
>>  #define CPTR_EL2_TCPAC	(1 << 31)
>>  #define CPTR_EL2_TTA	(1 << 20)
>> -#define CPTR_EL2_TFP	(1 << 10)
>> +#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
>>
>>  /* Hyp Debug Configuration Register bits */
>>  #define MDCR_EL2_TDRA		(1 << 11)
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index 5befd01..b3044b4 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -673,6 +673,24 @@
>>  	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
>>  .endm
>>
>> +/*
>> + * Check cptr VFP/SIMD accessed bit, if set VFP/SIMD not accessed by guest.
>> + */
>> +.macro skip_fpsimd_state tmp, target
>> +	mrs	\tmp, cptr_el2
>> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
>> +.endm
>> +
>> +/*
>> + * Check cptr VFP/SIMD accessed bit if set, VFP/SIMD not accessed by guest.
>> + * Also disable all cptr traps on return to host.
>> + */
>> +.macro skip_fpsimd_state_reset tmp, target
>> +	mrs	\tmp, cptr_el2
>> +	msr	cptr_el2, xzr
>> +	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
>> +.endm
>> +
>>  .macro compute_debug_state target
>>  	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
>>  	// is set, we do a full save/restore cycle and disable trapping.
>> @@ -763,6 +781,7 @@
>>  	ldr     x2, [x0, #VCPU_HCR_EL2]
>>  	msr     hcr_el2, x2
>>  	mov	x2, #CPTR_EL2_TTA
>> +	orr	x2, x2, #CPTR_EL2_TFP
>>  	msr	cptr_el2, x2
>>
>>  	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
>> @@ -785,7 +804,6 @@
>>  .macro deactivate_traps
>>  	mov	x2, #HCR_RW
>>  	msr	hcr_el2, x2
>> -	msr	cptr_el2, xzr
>>  	msr	hstr_el2, xzr
>>
>>  	mrs	x2, mdcr_el2
>> @@ -911,6 +929,30 @@ __save_fpsimd:
>>  __restore_fpsimd:
>>  	restore_fpsimd
>>  	ret
>> +/*
>> + * On Guest VFP/SIMD access, switch Guest/Host registers and reset cptr
>> access
>> + * bit to disable trapping.
>> + */
>> +switch_to_guest_vfp:
> 
> Consider renaming this to "switch_to_guest_fpsimd" for consistency.
> 
>> +	ldr	x2, =(CPTR_EL2_TTA)
>> +	msr	cptr_el2, x2
> 
> I'd feel more comfortable if you read cptr_el2, cleared the TFP bit, and
> wrote the result back. Loading an arbitrary value is likely to cause
> bugs in the long run.
Yep will do.
> 
>> +
>> +	mrs	x0, tpidr_el2
>> +
>> +	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
>> +	kern_hyp_va x2
>> +
>> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>> +	fpsimd_save x3, 1
>> +
>> +	add	x2, x0, #VCPU_CONTEXT
>> +	add	x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>> +	fpsimd_restore x3, 1
> 
> That's quite a lot of code expansion (fpsimd_{save,restore} are macros).
> 
> How about saving x4 and lr on the stack, and doing a a bl in each case?
> That would be consistent with the rest of the code (well, what's left of
> it).

Yeah, will change needed help/feedback on this.
> 
>> +
>> +	pop	x2, x3
>> +	pop	x0, x1
>> +
>> +	eret
>>
>>  /*
>>   * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
>> @@ -932,7 +974,6 @@ ENTRY(__kvm_vcpu_run)
>>  	kern_hyp_va x2
>>
>>  	save_host_regs
>> -	bl __save_fpsimd
>>  	bl __save_sysregs
>>
>>  	compute_debug_state 1f
>> @@ -948,7 +989,6 @@ ENTRY(__kvm_vcpu_run)
>>  	add	x2, x0, #VCPU_CONTEXT
>>
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>
>>  	skip_debug_state x3, 1f
>>  	bl	__restore_debug
>> @@ -967,7 +1007,9 @@ __kvm_vcpu_return:
>>  	add	x2, x0, #VCPU_CONTEXT
>>
>>  	save_guest_regs
>> +	skip_fpsimd_state x3, 1f
>>  	bl __save_fpsimd
>> +1:
>>  	bl __save_sysregs
>>
>>  	skip_debug_state x3, 1f
>> @@ -986,8 +1028,11 @@ __kvm_vcpu_return:
>>  	kern_hyp_va x2
>>
>>  	bl __restore_sysregs
>> -	bl __restore_fpsimd
>>
>> +	/* Disable cptr VFP/SIMD access bit, skip restore if VFP not accessed */
>> +	skip_fpsimd_state_reset x3, 1f
>> +	bl __restore_fpsimd
>> +1:
> 
> Neither the macro name (skip_fpsimd_state_reset) nor the comment make it
> clear that it is also re-enabling access to the trace registers.
> 
> I'd rather see:
> 
> 	skip_fpsimd_state x3, 1f
> 	bl __restore_fpsimd
> 1:
> 	/* Clear FPSIMD and Trace trapping */
> 	msr	cptr_el2, xzr

Will do.

> 
>>  	skip_debug_state x3, 1f
>>  	// Clear the dirty flag for the next run, as all the state has
>>  	// already been saved. Note that we nuke the whole 64bit word.
>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>  	mrs	x1, esr_el2
>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>
>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>> +	b.eq	switch_to_guest_vfp
>> +
> 
> I'd prefer you moved that hunk to el1_trap, where we handle all the
> traps coming from the guest.

I'm thinking would it make sense to update the armv7 side as
well. When reading both exit handlers the flow mirrors
each other.

Thanks,
   Mario
> 
>>  	cmp	x2, #ESR_ELx_EC_HVC64
>>  	b.ne	el1_trap
>>
> 
> Thanks,
> 
> 	M.
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 18:04     ` Mario Smarduch
@ 2015-06-15 18:20       ` Marc Zyngier
  -1 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 18:20 UTC (permalink / raw)
  To: Mario Smarduch
  Cc: kvmarm, linux-arm-kernel, christoffer.dall, Catalin Marinas,
	Will Deacon, kvm

On 15/06/15 19:04, Mario Smarduch wrote:
> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>> Hi Mario,
>>
>> I was working on a more ambitious patch series, 
>> but we probably ought to
>> start small, and this looks fairly sensible to me.
> 
> Hi Marc,
>    thanks for reviewing, I was thinking to post this
> first and next iteration on guest access switch
> back to host registers only upon  return to user space or
> vCPU context switch. This should save more cycles for
> various exits.
> 
> Were you thinking along the same lines or something
> altogether different?

That's mostly what I had in mind. Basically staying away from touching
the FP registers until vcpu_put(). I had it mostly working, but
experienced some interesting corruption cases, specially when using
32bit guests.

> 
>>
>> A few minor comments below.
>>
>> On 13/06/15 23:20, Mario Smarduch wrote:
>>> Currently VFP/SIMD registers are always saved and restored
>>> on Guest entry and exit.
>>>
>>> This patch only saves and restores VFP/SIMD registers on
>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>> on Guest entry and later checked on exit. This follows
>>> the ARMv7 VFPv3 implementation. Running an informal test
>>> there are high number of exits that don't access VFP/SIMD
>>> registers.
>>
>> It would be good to add some numbers here. How often do we exit without
>> having touched the FPSIMD regs? For which workload?
> 
> Lmbench is what I typically use, with ssh server, i.e., cause page
> faults and interrupts - usually registers are not touched.
> I'll run the tests again and define usually.
> 
> Any other loads you had in mind?

Not really (apart from running hackbench, of course...;-). I'd just like
to see the numbers in the commit message, so that we can document the
improvement (and maybe track regressions).

[...]

>>
>>>  	skip_debug_state x3, 1f
>>>  	// Clear the dirty flag for the next run, as all the state has
>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>  	mrs	x1, esr_el2
>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>
>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>> +	b.eq	switch_to_guest_vfp
>>> +
>>
>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>> traps coming from the guest.
> 
> I'm thinking would it make sense to update the armv7 side as
> well. When reading both exit handlers the flow mirrors
> each other.

The 32bit code is starting to show its age, and could probably do with a
refactor. If you have some cycles to spare, that'd be quite interesting.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 18:20       ` Marc Zyngier
  0 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 18:20 UTC (permalink / raw)
  To: linux-arm-kernel

On 15/06/15 19:04, Mario Smarduch wrote:
> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>> Hi Mario,
>>
>> I was working on a more ambitious patch series, 
>> but we probably ought to
>> start small, and this looks fairly sensible to me.
> 
> Hi Marc,
>    thanks for reviewing, I was thinking to post this
> first and next iteration on guest access switch
> back to host registers only upon  return to user space or
> vCPU context switch. This should save more cycles for
> various exits.
> 
> Were you thinking along the same lines or something
> altogether different?

That's mostly what I had in mind. Basically staying away from touching
the FP registers until vcpu_put(). I had it mostly working, but
experienced some interesting corruption cases, specially when using
32bit guests.

> 
>>
>> A few minor comments below.
>>
>> On 13/06/15 23:20, Mario Smarduch wrote:
>>> Currently VFP/SIMD registers are always saved and restored
>>> on Guest entry and exit.
>>>
>>> This patch only saves and restores VFP/SIMD registers on
>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>> on Guest entry and later checked on exit. This follows
>>> the ARMv7 VFPv3 implementation. Running an informal test
>>> there are high number of exits that don't access VFP/SIMD
>>> registers.
>>
>> It would be good to add some numbers here. How often do we exit without
>> having touched the FPSIMD regs? For which workload?
> 
> Lmbench is what I typically use, with ssh server, i.e., cause page
> faults and interrupts - usually registers are not touched.
> I'll run the tests again and define usually.
> 
> Any other loads you had in mind?

Not really (apart from running hackbench, of course...;-). I'd just like
to see the numbers in the commit message, so that we can document the
improvement (and maybe track regressions).

[...]

>>
>>>  	skip_debug_state x3, 1f
>>>  	// Clear the dirty flag for the next run, as all the state has
>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>  	mrs	x1, esr_el2
>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>
>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>> +	b.eq	switch_to_guest_vfp
>>> +
>>
>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>> traps coming from the guest.
> 
> I'm thinking would it make sense to update the armv7 side as
> well. When reading both exit handlers the flow mirrors
> each other.

The 32bit code is starting to show its age, and could probably do with a
refactor. If you have some cycles to spare, that'd be quite interesting.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 18:20       ` Marc Zyngier
@ 2015-06-15 18:44         ` Mario Smarduch
  -1 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 18:44 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvmarm, linux-arm-kernel, christoffer.dall, Catalin Marinas,
	Will Deacon, kvm

On 06/15/2015 11:20 AM, Marc Zyngier wrote:
> On 15/06/15 19:04, Mario Smarduch wrote:
>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>> Hi Mario,
>>>
>>> I was working on a more ambitious patch series, 
>>> but we probably ought to
>>> start small, and this looks fairly sensible to me.
>>
>> Hi Marc,
>>    thanks for reviewing, I was thinking to post this
>> first and next iteration on guest access switch
>> back to host registers only upon  return to user space or
>> vCPU context switch. This should save more cycles for
>> various exits.
>>
>> Were you thinking along the same lines or something
>> altogether different?
> 
> That's mostly what I had in mind. Basically staying away from touching
> the FP registers until vcpu_put(). I had it mostly working, but
> experienced some interesting corruption cases, specially when using
> 32bit guests.
> 
>>
>>>
>>> A few minor comments below.
>>>
>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>> Currently VFP/SIMD registers are always saved and restored
>>>> on Guest entry and exit.
>>>>
>>>> This patch only saves and restores VFP/SIMD registers on
>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>> on Guest entry and later checked on exit. This follows
>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>> there are high number of exits that don't access VFP/SIMD
>>>> registers.
>>>
>>> It would be good to add some numbers here. How often do we exit without
>>> having touched the FPSIMD regs? For which workload?
>>
>> Lmbench is what I typically use, with ssh server, i.e., cause page
>> faults and interrupts - usually registers are not touched.
>> I'll run the tests again and define usually.
>>
>> Any other loads you had in mind?
> 
> Not really (apart from running hackbench, of course...;-). I'd just like
> to see the numbers in the commit message, so that we can document the
> improvement (and maybe track regressions).

Ok I understand.

> 
> [...]
> 
>>>
>>>>  	skip_debug_state x3, 1f
>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>  	mrs	x1, esr_el2
>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>
>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>> +	b.eq	switch_to_guest_vfp
>>>> +
>>>
>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>> traps coming from the guest.
>>
>> I'm thinking would it make sense to update the armv7 side as
>> well. When reading both exit handlers the flow mirrors
>> each other.
> 
> The 32bit code is starting to show its age, and could probably do with a
> refactor. If you have some cycles to spare, that'd be quite interesting.

Yep, will do, ARMv7 is still very relevant.

> 
> Thanks,
> 
> 	M.
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 18:44         ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 18:44 UTC (permalink / raw)
  To: linux-arm-kernel

On 06/15/2015 11:20 AM, Marc Zyngier wrote:
> On 15/06/15 19:04, Mario Smarduch wrote:
>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>> Hi Mario,
>>>
>>> I was working on a more ambitious patch series, 
>>> but we probably ought to
>>> start small, and this looks fairly sensible to me.
>>
>> Hi Marc,
>>    thanks for reviewing, I was thinking to post this
>> first and next iteration on guest access switch
>> back to host registers only upon  return to user space or
>> vCPU context switch. This should save more cycles for
>> various exits.
>>
>> Were you thinking along the same lines or something
>> altogether different?
> 
> That's mostly what I had in mind. Basically staying away from touching
> the FP registers until vcpu_put(). I had it mostly working, but
> experienced some interesting corruption cases, specially when using
> 32bit guests.
> 
>>
>>>
>>> A few minor comments below.
>>>
>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>> Currently VFP/SIMD registers are always saved and restored
>>>> on Guest entry and exit.
>>>>
>>>> This patch only saves and restores VFP/SIMD registers on
>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>> on Guest entry and later checked on exit. This follows
>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>> there are high number of exits that don't access VFP/SIMD
>>>> registers.
>>>
>>> It would be good to add some numbers here. How often do we exit without
>>> having touched the FPSIMD regs? For which workload?
>>
>> Lmbench is what I typically use, with ssh server, i.e., cause page
>> faults and interrupts - usually registers are not touched.
>> I'll run the tests again and define usually.
>>
>> Any other loads you had in mind?
> 
> Not really (apart from running hackbench, of course...;-). I'd just like
> to see the numbers in the commit message, so that we can document the
> improvement (and maybe track regressions).

Ok I understand.

> 
> [...]
> 
>>>
>>>>  	skip_debug_state x3, 1f
>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>  	mrs	x1, esr_el2
>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>
>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>> +	b.eq	switch_to_guest_vfp
>>>> +
>>>
>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>> traps coming from the guest.
>>
>> I'm thinking would it make sense to update the armv7 side as
>> well. When reading both exit handlers the flow mirrors
>> each other.
> 
> The 32bit code is starting to show its age, and could probably do with a
> refactor. If you have some cycles to spare, that'd be quite interesting.

Yep, will do, ARMv7 is still very relevant.

> 
> Thanks,
> 
> 	M.
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 18:44         ` Mario Smarduch
@ 2015-06-15 18:51           ` Marc Zyngier
  -1 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 18:51 UTC (permalink / raw)
  To: Mario Smarduch
  Cc: kvmarm, linux-arm-kernel, christoffer.dall, Catalin Marinas,
	Will Deacon, kvm

On 15/06/15 19:44, Mario Smarduch wrote:
> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>> On 15/06/15 19:04, Mario Smarduch wrote:
>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>> Hi Mario,
>>>>
>>>> I was working on a more ambitious patch series, 
>>>> but we probably ought to
>>>> start small, and this looks fairly sensible to me.
>>>
>>> Hi Marc,
>>>    thanks for reviewing, I was thinking to post this
>>> first and next iteration on guest access switch
>>> back to host registers only upon  return to user space or
>>> vCPU context switch. This should save more cycles for
>>> various exits.
>>>
>>> Were you thinking along the same lines or something
>>> altogether different?
>>
>> That's mostly what I had in mind. Basically staying away from touching
>> the FP registers until vcpu_put(). I had it mostly working, but
>> experienced some interesting corruption cases, specially when using
>> 32bit guests.
>>
>>>
>>>>
>>>> A few minor comments below.
>>>>
>>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>>> Currently VFP/SIMD registers are always saved and restored
>>>>> on Guest entry and exit.
>>>>>
>>>>> This patch only saves and restores VFP/SIMD registers on
>>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>>> on Guest entry and later checked on exit. This follows
>>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>>> there are high number of exits that don't access VFP/SIMD
>>>>> registers.
>>>>
>>>> It would be good to add some numbers here. How often do we exit without
>>>> having touched the FPSIMD regs? For which workload?
>>>
>>> Lmbench is what I typically use, with ssh server, i.e., cause page
>>> faults and interrupts - usually registers are not touched.
>>> I'll run the tests again and define usually.
>>>
>>> Any other loads you had in mind?
>>
>> Not really (apart from running hackbench, of course...;-). I'd just like
>> to see the numbers in the commit message, so that we can document the
>> improvement (and maybe track regressions).
> 
> Ok I understand.
> 
>>
>> [...]
>>
>>>>
>>>>>  	skip_debug_state x3, 1f
>>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>>  	mrs	x1, esr_el2
>>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>>
>>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>>> +	b.eq	switch_to_guest_vfp
>>>>> +
>>>>
>>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>>> traps coming from the guest.
>>>
>>> I'm thinking would it make sense to update the armv7 side as
>>> well. When reading both exit handlers the flow mirrors
>>> each other.
>>
>> The 32bit code is starting to show its age, and could probably do with a
>> refactor. If you have some cycles to spare, that'd be quite interesting.
> 
> Yep, will do, ARMv7 is still very relevant.

You bet it is. My home router is a v7 VM...

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 18:51           ` Marc Zyngier
  0 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-15 18:51 UTC (permalink / raw)
  To: linux-arm-kernel

On 15/06/15 19:44, Mario Smarduch wrote:
> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>> On 15/06/15 19:04, Mario Smarduch wrote:
>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>> Hi Mario,
>>>>
>>>> I was working on a more ambitious patch series, 
>>>> but we probably ought to
>>>> start small, and this looks fairly sensible to me.
>>>
>>> Hi Marc,
>>>    thanks for reviewing, I was thinking to post this
>>> first and next iteration on guest access switch
>>> back to host registers only upon  return to user space or
>>> vCPU context switch. This should save more cycles for
>>> various exits.
>>>
>>> Were you thinking along the same lines or something
>>> altogether different?
>>
>> That's mostly what I had in mind. Basically staying away from touching
>> the FP registers until vcpu_put(). I had it mostly working, but
>> experienced some interesting corruption cases, specially when using
>> 32bit guests.
>>
>>>
>>>>
>>>> A few minor comments below.
>>>>
>>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>>> Currently VFP/SIMD registers are always saved and restored
>>>>> on Guest entry and exit.
>>>>>
>>>>> This patch only saves and restores VFP/SIMD registers on
>>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>>> on Guest entry and later checked on exit. This follows
>>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>>> there are high number of exits that don't access VFP/SIMD
>>>>> registers.
>>>>
>>>> It would be good to add some numbers here. How often do we exit without
>>>> having touched the FPSIMD regs? For which workload?
>>>
>>> Lmbench is what I typically use, with ssh server, i.e., cause page
>>> faults and interrupts - usually registers are not touched.
>>> I'll run the tests again and define usually.
>>>
>>> Any other loads you had in mind?
>>
>> Not really (apart from running hackbench, of course...;-). I'd just like
>> to see the numbers in the commit message, so that we can document the
>> improvement (and maybe track regressions).
> 
> Ok I understand.
> 
>>
>> [...]
>>
>>>>
>>>>>  	skip_debug_state x3, 1f
>>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>>  	mrs	x1, esr_el2
>>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>>
>>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>>> +	b.eq	switch_to_guest_vfp
>>>>> +
>>>>
>>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>>> traps coming from the guest.
>>>
>>> I'm thinking would it make sense to update the armv7 side as
>>> well. When reading both exit handlers the flow mirrors
>>> each other.
>>
>> The 32bit code is starting to show its age, and could probably do with a
>> refactor. If you have some cycles to spare, that'd be quite interesting.
> 
> Yep, will do, ARMv7 is still very relevant.

You bet it is. My home router is a v7 VM...

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 18:51           ` Marc Zyngier
@ 2015-06-15 19:08             ` Mario Smarduch
  -1 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 19:08 UTC (permalink / raw)
  To: Marc Zyngier; +Cc: kvm, Catalin Marinas, Will Deacon, linux-arm-kernel, kvmarm

On 06/15/2015 11:51 AM, Marc Zyngier wrote:
> On 15/06/15 19:44, Mario Smarduch wrote:
>> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>>> On 15/06/15 19:04, Mario Smarduch wrote:
>>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>>> Hi Mario,
>>>>>
[...]
>>>
>>> The 32bit code is starting to show its age, and could probably do with a
>>> refactor. If you have some cycles to spare, that'd be quite interesting.
>>
>> Yep, will do, ARMv7 is still very relevant.
> 
> You bet it is. My home router is a v7 VM...
> 
> 	M.
> 
Definitely nice use case, another one RPi2 appears like
everyone is hacking on it :)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-15 19:08             ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-15 19:08 UTC (permalink / raw)
  To: linux-arm-kernel

On 06/15/2015 11:51 AM, Marc Zyngier wrote:
> On 15/06/15 19:44, Mario Smarduch wrote:
>> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>>> On 15/06/15 19:04, Mario Smarduch wrote:
>>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>>> Hi Mario,
>>>>>
[...]
>>>
>>> The 32bit code is starting to show its age, and could probably do with a
>>> refactor. If you have some cycles to spare, that'd be quite interesting.
>>
>> Yep, will do, ARMv7 is still very relevant.
> 
> You bet it is. My home router is a v7 VM...
> 
> 	M.
> 
Definitely nice use case, another one RPi2 appears like
everyone is hacking on it :)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-15 18:20       ` Marc Zyngier
@ 2015-06-16  3:04         ` Mario Smarduch
  -1 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-16  3:04 UTC (permalink / raw)
  To: Marc Zyngier; +Cc: kvm, Catalin Marinas, Will Deacon, linux-arm-kernel, kvmarm

On 06/15/2015 11:20 AM, Marc Zyngier wrote:
> On 15/06/15 19:04, Mario Smarduch wrote:
>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>> Hi Mario,
>>>
[ ... ]
>>>
>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>> Currently VFP/SIMD registers are always saved and restored
>>>> on Guest entry and exit.
>>>>
>>>> This patch only saves and restores VFP/SIMD registers on
>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>> on Guest entry and later checked on exit. This follows
>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>> there are high number of exits that don't access VFP/SIMD
>>>> registers.
>>>
>>> It would be good to add some numbers here. How often do we exit without
>>> having touched the FPSIMD regs? For which workload?
>>
>> Lmbench is what I typically use, with ssh server, i.e., cause page
>> faults and interrupts - usually registers are not touched.
>> I'll run the tests again and define usually.
>>
>> Any other loads you had in mind?
> 
> Not really (apart from running hackbench, of course...;-). I'd just like
> to see the numbers in the commit message, so that we can document the
> improvement (and maybe track regressions).

Hi Marc,
   some ballpark numbers.

   hackbench about 30% of the time optimized path is taken
(for 10*40 test).

Lmbench3 upwards of 50% for context switching, memory bw,
pipe, proc creation, sys call. There are lot more tests
but I limited to these tests. In addition other processes
are running in background NTP, SSH, ... doing their own
thing.

I added a tmp counter to kvm_vcpu_arch to count vfpsimd
events.

- Mario
> 
> [...]
> 
>>>
>>>>  	skip_debug_state x3, 1f
>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>  	mrs	x1, esr_el2
>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>
>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>> +	b.eq	switch_to_guest_vfp
>>>> +
>>>
>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>> traps coming from the guest.
>>
>> I'm thinking would it make sense to update the armv7 side as
>> well. When reading both exit handlers the flow mirrors
>> each other.
> 
> The 32bit code is starting to show its age, and could probably do with a
> refactor. If you have some cycles to spare, that'd be quite interesting.
> 
> Thanks,
> 
> 	M.
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-16  3:04         ` Mario Smarduch
  0 siblings, 0 replies; 18+ messages in thread
From: Mario Smarduch @ 2015-06-16  3:04 UTC (permalink / raw)
  To: linux-arm-kernel

On 06/15/2015 11:20 AM, Marc Zyngier wrote:
> On 15/06/15 19:04, Mario Smarduch wrote:
>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>> Hi Mario,
>>>
[ ... ]
>>>
>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>> Currently VFP/SIMD registers are always saved and restored
>>>> on Guest entry and exit.
>>>>
>>>> This patch only saves and restores VFP/SIMD registers on
>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>> on Guest entry and later checked on exit. This follows
>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>> there are high number of exits that don't access VFP/SIMD
>>>> registers.
>>>
>>> It would be good to add some numbers here. How often do we exit without
>>> having touched the FPSIMD regs? For which workload?
>>
>> Lmbench is what I typically use, with ssh server, i.e., cause page
>> faults and interrupts - usually registers are not touched.
>> I'll run the tests again and define usually.
>>
>> Any other loads you had in mind?
> 
> Not really (apart from running hackbench, of course...;-). I'd just like
> to see the numbers in the commit message, so that we can document the
> improvement (and maybe track regressions).

Hi Marc,
   some ballpark numbers.

   hackbench about 30% of the time optimized path is taken
(for 10*40 test).

Lmbench3 upwards of 50% for context switching, memory bw,
pipe, proc creation, sys call. There are lot more tests
but I limited to these tests. In addition other processes
are running in background NTP, SSH, ... doing their own
thing.

I added a tmp counter to kvm_vcpu_arch to count vfpsimd
events.

- Mario
> 
> [...]
> 
>>>
>>>>  	skip_debug_state x3, 1f
>>>>  	// Clear the dirty flag for the next run, as all the state has
>>>>  	// already been saved. Note that we nuke the whole 64bit word.
>>>> @@ -1166,6 +1211,10 @@ el1_sync:					// Guest trapped into EL2
>>>>  	mrs	x1, esr_el2
>>>>  	lsr	x2, x1, #ESR_ELx_EC_SHIFT
>>>>
>>>> +	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
>>>> +	cmp	x2, #ESR_ELx_EC_FP_ASIMD
>>>> +	b.eq	switch_to_guest_vfp
>>>> +
>>>
>>> I'd prefer you moved that hunk to el1_trap, where we handle all the
>>> traps coming from the guest.
>>
>> I'm thinking would it make sense to update the armv7 side as
>> well. When reading both exit handlers the flow mirrors
>> each other.
> 
> The 32bit code is starting to show its age, and could probably do with a
> refactor. If you have some cycles to spare, that'd be quite interesting.
> 
> Thanks,
> 
> 	M.
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
  2015-06-16  3:04         ` Mario Smarduch
@ 2015-06-16  8:30           ` Marc Zyngier
  -1 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-16  8:30 UTC (permalink / raw)
  To: Mario Smarduch
  Cc: kvmarm, linux-arm-kernel, christoffer.dall, Catalin Marinas,
	Will Deacon, kvm

On 16/06/15 04:04, Mario Smarduch wrote:
> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>> On 15/06/15 19:04, Mario Smarduch wrote:
>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>> Hi Mario,
>>>>
> [ ... ]
>>>>
>>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>>> Currently VFP/SIMD registers are always saved and restored
>>>>> on Guest entry and exit.
>>>>>
>>>>> This patch only saves and restores VFP/SIMD registers on
>>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>>> on Guest entry and later checked on exit. This follows
>>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>>> there are high number of exits that don't access VFP/SIMD
>>>>> registers.
>>>>
>>>> It would be good to add some numbers here. How often do we exit without
>>>> having touched the FPSIMD regs? For which workload?
>>>
>>> Lmbench is what I typically use, with ssh server, i.e., cause page
>>> faults and interrupts - usually registers are not touched.
>>> I'll run the tests again and define usually.
>>>
>>> Any other loads you had in mind?
>>
>> Not really (apart from running hackbench, of course...;-). I'd just like
>> to see the numbers in the commit message, so that we can document the
>> improvement (and maybe track regressions).
> 
> Hi Marc,
>    some ballpark numbers.
> 
>    hackbench about 30% of the time optimized path is taken
> (for 10*40 test).
> 
> Lmbench3 upwards of 50% for context switching, memory bw,
> pipe, proc creation, sys call. There are lot more tests
> but I limited to these tests. In addition other processes
> are running in background NTP, SSH, ... doing their own
> thing.
> 
> I added a tmp counter to kvm_vcpu_arch to count vfpsimd
> events.

That looks good. Please include these numbers in the commit message for
v2 of that patch.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore
@ 2015-06-16  8:30           ` Marc Zyngier
  0 siblings, 0 replies; 18+ messages in thread
From: Marc Zyngier @ 2015-06-16  8:30 UTC (permalink / raw)
  To: linux-arm-kernel

On 16/06/15 04:04, Mario Smarduch wrote:
> On 06/15/2015 11:20 AM, Marc Zyngier wrote:
>> On 15/06/15 19:04, Mario Smarduch wrote:
>>> On 06/15/2015 03:00 AM, Marc Zyngier wrote:
>>>> Hi Mario,
>>>>
> [ ... ]
>>>>
>>>> On 13/06/15 23:20, Mario Smarduch wrote:
>>>>> Currently VFP/SIMD registers are always saved and restored
>>>>> on Guest entry and exit.
>>>>>
>>>>> This patch only saves and restores VFP/SIMD registers on
>>>>> Guest access. To do this cptr_el2 VFP/SIMD trap is set
>>>>> on Guest entry and later checked on exit. This follows
>>>>> the ARMv7 VFPv3 implementation. Running an informal test
>>>>> there are high number of exits that don't access VFP/SIMD
>>>>> registers.
>>>>
>>>> It would be good to add some numbers here. How often do we exit without
>>>> having touched the FPSIMD regs? For which workload?
>>>
>>> Lmbench is what I typically use, with ssh server, i.e., cause page
>>> faults and interrupts - usually registers are not touched.
>>> I'll run the tests again and define usually.
>>>
>>> Any other loads you had in mind?
>>
>> Not really (apart from running hackbench, of course...;-). I'd just like
>> to see the numbers in the commit message, so that we can document the
>> improvement (and maybe track regressions).
> 
> Hi Marc,
>    some ballpark numbers.
> 
>    hackbench about 30% of the time optimized path is taken
> (for 10*40 test).
> 
> Lmbench3 upwards of 50% for context switching, memory bw,
> pipe, proc creation, sys call. There are lot more tests
> but I limited to these tests. In addition other processes
> are running in background NTP, SSH, ... doing their own
> thing.
> 
> I added a tmp counter to kvm_vcpu_arch to count vfpsimd
> events.

That looks good. Please include these numbers in the commit message for
v2 of that patch.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-06-16  8:31 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-13 22:20 [PATCH] arm64: KVM: Optimize arm64 guest exit VFP/SIMD register save/restore Mario Smarduch
2015-06-13 22:20 ` Mario Smarduch
2015-06-15 10:00 ` Marc Zyngier
2015-06-15 10:00   ` Marc Zyngier
2015-06-15 18:04   ` Mario Smarduch
2015-06-15 18:04     ` Mario Smarduch
2015-06-15 18:20     ` Marc Zyngier
2015-06-15 18:20       ` Marc Zyngier
2015-06-15 18:44       ` Mario Smarduch
2015-06-15 18:44         ` Mario Smarduch
2015-06-15 18:51         ` Marc Zyngier
2015-06-15 18:51           ` Marc Zyngier
2015-06-15 19:08           ` Mario Smarduch
2015-06-15 19:08             ` Mario Smarduch
2015-06-16  3:04       ` Mario Smarduch
2015-06-16  3:04         ` Mario Smarduch
2015-06-16  8:30         ` Marc Zyngier
2015-06-16  8:30           ` Marc Zyngier

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.