* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
@ 2013-09-10  9:51 Marc Zyngier
  2013-09-11 12:21 ` Anup Patel
  2013-09-11 18:06 ` Peter Maydell
  0 siblings, 2 replies; 23+ messages in thread
From: Marc Zyngier @ 2013-09-10  9:51 UTC (permalink / raw)
  To: linux-arm-kernel

Anup Patel recently brought up the fact that when we run a guest
with cache disabled, we don't flush the cache to the Point of
Coherency, hence possibly missing bits of data that have been
written in the cache, but have not reached memory.

There are several approaches to this issue:
- Using the DC bit when caches are off: this breaks guests assuming
  caches off while doing DMA operations. Bootloaders, for example.
- Fetch the memory attributes on translation fault, and flush the
  cache while handling the fault. This relies on using the PAR_EL1
  register to obtain the Stage-1 memory attributes.

While this second solution is clearly not ideal (it duplicates what
the HW already does to generate HPFAR_EL2), it is more correct than
not doing anything.

Cc: Anup Patel <anup@brainfault.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
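For reference (not part of the patch itself): on a successful AT S1E1R/W,
PAR_EL1[63:56] returns the Stage-1 memory attributes in MAIR_ELx format,
which is what the new check in kvm_mmu.h relies on. A minimal sketch of
that decoding, with a made-up helper name:

	/*
	 * Illustrative only: decode the MAIR-format attribute byte
	 * returned in PAR_EL1[63:56] after a successful AT S1E1R/S1E1W,
	 * mirroring the check added below.
	 */
	static inline bool s1_attr_is_normal_noncacheable(u64 par)
	{
		u8 attr = par >> 56;

		/* Non-Device (outer field != 0) and Inner Non-cacheable */
		return (attr & 0xf0) && (attr & 0x0f) == 0x4;
	}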

 arch/arm/include/asm/kvm_mmu.h    |  4 ++--
 arch/arm/kvm/mmu.c                |  2 +-
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
 arch/arm64/kernel/asm-offsets.c   |  1 +
 arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
 6 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 472ac70..faffdf0 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	/*
 	 * If we are going to insert an instruction page and the icache is
@@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
 	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
 	 */
 	if (icache_is_pipt()) {
-		unsigned long hva = gfn_to_hva(kvm, gfn);
+		unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
 		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
 	} else if (!icache_is_vivt_asid_tagged()) {
 		/* any kind of VIPT cache */
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 0988d9e..ffb3cc4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 
 	new_pte = pfn_pte(pfn, PAGE_S2);
-	coherent_icache_guest_page(vcpu->kvm, gfn);
+	coherent_icache_guest_page(vcpu, gfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 331a852..2530f92 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
 struct kvm_vcpu_fault_info {
 	u32 esr_el2;		/* Hyp Syndrom Register */
 	u64 far_el2;		/* Hyp Fault Address Register */
+	u64 par_el2;		/* Result of FAR_EL2 translation */
 	u64 hpfar_el2;		/* Hyp IPA Fault Address Register */
 };
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index efe609c..84baddd 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -118,11 +118,16 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
 
 struct kvm;
 
-static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
+static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	if (!icache_is_aliasing()) {		/* PIPT */
-		unsigned long hva = gfn_to_hva(kvm, gfn);
+		unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
+		u8 attr;
 		flush_icache_range(hva, hva + PAGE_SIZE);
+		attr = vcpu->arch.fault.par_el2 >> 56;
+		/* Check for non-device, non-cacheable access */
+		if ((attr & 0xf0) && (attr & 0x0f) == 4)
+			__flush_dcache_area((void *)hva, PAGE_SIZE);
 	} else if (!icache_is_aivivt()) {	/* non ASID-tagged VIVT */
 		/* any kind of VIPT cache */
 		__flush_icache_all();
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1f3a39d..e67dcac 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -117,6 +117,7 @@ int main(void)
   DEFINE(CPU_SYSREGS,		offsetof(struct kvm_cpu_context, sys_regs));
   DEFINE(VCPU_ESR_EL2,		offsetof(struct kvm_vcpu, arch.fault.esr_el2));
   DEFINE(VCPU_FAR_EL2,		offsetof(struct kvm_vcpu, arch.fault.far_el2));
+  DEFINE(VCPU_PAR_EL2,		offsetof(struct kvm_vcpu, arch.fault.par_el2));
   DEFINE(VCPU_HPFAR_EL2,	offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
   DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 20a58fe..417636b 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -671,24 +671,16 @@ el1_trap:
 	ccmp	x2, x0, #4, ne
 	b.ne	1f		// Not an abort we care about
 
-	/* This is an abort. Check for permission fault */
-	and	x2, x1, #ESR_EL2_FSC_TYPE
-	cmp	x2, #FSC_PERM
-	b.ne	1f		// Not a permission fault
-
-	/*
-	 * Check for Stage-1 page table walk, which is guaranteed
-	 * to give a valid HPFAR_EL2.
-	 */
-	tbnz	x1, #7, 1f	// S1PTW is set
-
 	/* Preserve PAR_EL1 */
 	mrs	x3, par_el1
 	push	x3, xzr
 
 	/*
-	 * Permission fault, HPFAR_EL2 is invalid.
-	 * Resolve the IPA the hard way using the guest VA.
+	 * Two cases:
+	 * - S2 translation fault, we need the memory attributes.
+	 * - S2 permission fault, HPFAR_EL2 is invalid.
+	 *
+	 * In both cases, resolve the IPA the hard way using the guest VA.
 	 * Stage-1 translation already validated the memory access rights.
 	 * As such, we can use the EL1 translation regime, and don't have
 	 * to distinguish between EL0 and EL1 access.
@@ -702,15 +694,19 @@ el1_trap:
 	pop	x0, xzr			// Restore PAR_EL1 from the stack
 	msr	par_el1, x0
 	tbnz	x3, #0, 3f		// Bail out if we failed the translation
+
+	mrs	x0, tpidr_el2
+	str	x3, [x0, #VCPU_PAR_EL2]
 	ubfx	x3, x3, #12, #36	// Extract IPA
 	lsl	x3, x3, #4		// and present it like HPFAR
+
 	b	2f
 
-1:	mrs	x3, hpfar_el2
+1:	mrs	x0, tpidr_el2
 	mrs	x2, far_el2
+	mrs	x3, hpfar_el2
 
-2:	mrs	x0, tpidr_el2
-	str	x1, [x0, #VCPU_ESR_EL2]
+2:	str	x1, [x0, #VCPU_ESR_EL2]
 	str	x2, [x0, #VCPU_FAR_EL2]
 	str	x3, [x0, #VCPU_HPFAR_EL2]
 
-- 
1.8.2.3


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-10  9:51 [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault Marc Zyngier
@ 2013-09-11 12:21 ` Anup Patel
  2013-09-11 12:35   ` Marc Zyngier
  2013-09-11 18:06 ` Peter Maydell
  1 sibling, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-09-11 12:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Sep 10, 2013 at 3:21 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> Anup Patel recently brought up the fact that when we run a guest
> with cache disabled, we don't flush the cache to the Point of
> Coherency, hence possibly missing bits of data that have been
> written in the cache, but have not reached memory.
>
> There are several approaches to this issue:
> - Using the DC bit when caches are off: this breaks guests assuming
>   caches off while doing DMA operations. Bootloaders, for example.
> - Fetch the memory attributes on translation fault, and flush the
>   cache while handling the fault. This relies on using the PAR_EL1
>   register to obtain the Stage-1 memory attributes.
>
> While this second solution is clearly not ideal (it duplicates what
> the HW already does to generate HPFAR_EL2), it is more correct than
> not doing anything.
>
> Cc: Anup Patel <anup@brainfault.org>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>
>  arch/arm/include/asm/kvm_mmu.h    |  4 ++--
>  arch/arm/kvm/mmu.c                |  2 +-
>  arch/arm64/include/asm/kvm_host.h |  1 +
>  arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
>  arch/arm64/kernel/asm-offsets.c   |  1 +
>  arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
>  6 files changed, 24 insertions(+), 21 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 472ac70..faffdf0 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>
>  struct kvm;
>
> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>  {
>         /*
>          * If we are going to insert an instruction page and the icache is
> @@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
>          */
>         if (icache_is_pipt()) {
> -               unsigned long hva = gfn_to_hva(kvm, gfn);
> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
>                 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
>         } else if (!icache_is_vivt_asid_tagged()) {
>                 /* any kind of VIPT cache */
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 0988d9e..ffb3cc4 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>                 return -EFAULT;
>
>         new_pte = pfn_pte(pfn, PAGE_S2);
> -       coherent_icache_guest_page(vcpu->kvm, gfn);
> +       coherent_icache_guest_page(vcpu, gfn);
>
>         spin_lock(&vcpu->kvm->mmu_lock);
>         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 331a852..2530f92 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
>  struct kvm_vcpu_fault_info {
>         u32 esr_el2;            /* Hyp Syndrom Register */
>         u64 far_el2;            /* Hyp Fault Address Register */
> +       u64 par_el2;            /* Result of FAR_EL2 translation */

Please rename this to par_el1, because we are only saving the result of
the Stage-1 translation.

>         u64 hpfar_el2;          /* Hyp IPA Fault Address Register */
>  };
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index efe609c..84baddd 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -118,11 +118,16 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>
>  struct kvm;
>
> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>  {
>         if (!icache_is_aliasing()) {            /* PIPT */
> -               unsigned long hva = gfn_to_hva(kvm, gfn);
> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
> +               u8 attr;
>                 flush_icache_range(hva, hva + PAGE_SIZE);
> +               attr = vcpu->arch.fault.par_el2 >> 56;
> +               /* Check for non-device, non-cacheable access */
> +               if ((attr & 0xf0) && (attr & 0x0f) == 4)
> +                       __flush_dcache_area((void *)hva, PAGE_SIZE);

The condition is not complete, because when the MMU is disabled the
memory attribute is assumed to be 0x0 (i.e. Device-nGnRnE memory).

The condition check that works on X-Gene and Foundation v8 Model is:

+               /* Check for device access OR
+                * non-device, non-cacheable access
+                */
+               if (!(attr & 0xf0) ||
+                   ((attr & 0xf0) && (attr & 0x0f) == 4))

Also, we should avoid integer constants here for better code readability.
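
Something along these lines, for instance (just a sketch; none of these
macro names exist in the kernel headers, they are placeholders for
whatever we end up defining):

	#define PAR_ATTR_SHIFT		56
	#define MAIR_ATTR_OUTER_MASK	0xf0	/* zero outer field => Device */
	#define MAIR_ATTR_INNER_MASK	0x0f
	#define MAIR_ATTR_INNER_NC	0x04	/* Inner Non-cacheable */

	static inline bool s1_attr_needs_flush(u64 par)
	{
		u8 attr = par >> PAR_ATTR_SHIFT;

		/* Device access (MMU off => Device-nGnRnE, attr == 0) */
		if (!(attr & MAIR_ATTR_OUTER_MASK))
			return true;

		/* Normal memory, Inner Non-cacheable */
		return (attr & MAIR_ATTR_INNER_MASK) == MAIR_ATTR_INNER_NC;
	}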

>         } else if (!icache_is_aivivt()) {       /* non ASID-tagged VIVT */
>                 /* any kind of VIPT cache */
>                 __flush_icache_all();
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 1f3a39d..e67dcac 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -117,6 +117,7 @@ int main(void)
>    DEFINE(CPU_SYSREGS,          offsetof(struct kvm_cpu_context, sys_regs));
>    DEFINE(VCPU_ESR_EL2,         offsetof(struct kvm_vcpu, arch.fault.esr_el2));
>    DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
> +  DEFINE(VCPU_PAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.par_el2));
>    DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
>    DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
>    DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> index 20a58fe..417636b 100644
> --- a/arch/arm64/kvm/hyp.S
> +++ b/arch/arm64/kvm/hyp.S
> @@ -671,24 +671,16 @@ el1_trap:
>         ccmp    x2, x0, #4, ne
>         b.ne    1f              // Not an abort we care about
>
> -       /* This is an abort. Check for permission fault */
> -       and     x2, x1, #ESR_EL2_FSC_TYPE
> -       cmp     x2, #FSC_PERM
> -       b.ne    1f              // Not a permission fault
> -
> -       /*
> -        * Check for Stage-1 page table walk, which is guaranteed
> -        * to give a valid HPFAR_EL2.
> -        */
> -       tbnz    x1, #7, 1f      // S1PTW is set
> -
>         /* Preserve PAR_EL1 */
>         mrs     x3, par_el1
>         push    x3, xzr
>
>         /*
> -        * Permission fault, HPFAR_EL2 is invalid.
> -        * Resolve the IPA the hard way using the guest VA.
> +        * Two cases:
> +        * - S2 translation fault, we need the memory attributes.
> +        * - S2 permission fault, HPFAR_EL2 is invalid.
> +        *
> +        * In both cases, resolve the IPA the hard way using the guest VA.
>          * Stage-1 translation already validated the memory access rights.
>          * As such, we can use the EL1 translation regime, and don't have
>          * to distinguish between EL0 and EL1 access.
> @@ -702,15 +694,19 @@ el1_trap:
>         pop     x0, xzr                 // Restore PAR_EL1 from the stack
>         msr     par_el1, x0
>         tbnz    x3, #0, 3f              // Bail out if we failed the translation
> +
> +       mrs     x0, tpidr_el2
> +       str     x3, [x0, #VCPU_PAR_EL2]
>         ubfx    x3, x3, #12, #36        // Extract IPA
>         lsl     x3, x3, #4              // and present it like HPFAR
> +
>         b       2f
>
> -1:     mrs     x3, hpfar_el2
> +1:     mrs     x0, tpidr_el2
>         mrs     x2, far_el2
> +       mrs     x3, hpfar_el2
>
> -2:     mrs     x0, tpidr_el2
> -       str     x1, [x0, #VCPU_ESR_EL2]
> +2:     str     x1, [x0, #VCPU_ESR_EL2]
>         str     x2, [x0, #VCPU_FAR_EL2]
>         str     x3, [x0, #VCPU_HPFAR_EL2]
>
> --
> 1.8.2.3
>
>

In general, the patch had formatting issues, so I had to apply the
changes manually.

This patch works on X-Gene and the Foundation v8 Model with the
above-mentioned modified condition check.

Tested-by: Anup Patel <anup@brainfault.org>

Thanks,
Anup


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-11 12:21 ` Anup Patel
@ 2013-09-11 12:35   ` Marc Zyngier
  2013-09-11 19:38     ` Christoffer Dall
  0 siblings, 1 reply; 23+ messages in thread
From: Marc Zyngier @ 2013-09-11 12:35 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/09/13 13:21, Anup Patel wrote:
> On Tue, Sep 10, 2013 at 3:21 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> Anup Patel recently brought up the fact that when we run a guest
>> with cache disabled, we don't flush the cache to the Point of
>> Coherency, hence possibly missing bits of data that have been
>> written in the cache, but have not reached memory.
>>
>> There are several approaches to this issue:
>> - Using the DC bit when caches are off: this breaks guests assuming
>>   caches off while doing DMA operations. Bootloaders, for example.
>> - Fetch the memory attributes on translation fault, and flush the
>>   cache while handling the fault. This relies on using the PAR_EL1
>>   register to obtain the Stage-1 memory attributes.
>>
>> While this second solution is clearly not ideal (it duplicates what
>> the HW already does to generate HPFAR_EL2), it is more correct than
>> not doing anything.
>>
>> Cc: Anup Patel <anup@brainfault.org>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>
>>  arch/arm/include/asm/kvm_mmu.h    |  4 ++--
>>  arch/arm/kvm/mmu.c                |  2 +-
>>  arch/arm64/include/asm/kvm_host.h |  1 +
>>  arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
>>  arch/arm64/kernel/asm-offsets.c   |  1 +
>>  arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
>>  6 files changed, 24 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 472ac70..faffdf0 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>>
>>  struct kvm;
>>
>> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>>  {
>>         /*
>>          * If we are going to insert an instruction page and the icache is
>> @@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
>>          */
>>         if (icache_is_pipt()) {
>> -               unsigned long hva = gfn_to_hva(kvm, gfn);
>> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
>>                 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
>>         } else if (!icache_is_vivt_asid_tagged()) {
>>                 /* any kind of VIPT cache */
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 0988d9e..ffb3cc4 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>                 return -EFAULT;
>>
>>         new_pte = pfn_pte(pfn, PAGE_S2);
>> -       coherent_icache_guest_page(vcpu->kvm, gfn);
>> +       coherent_icache_guest_page(vcpu, gfn);
>>
>>         spin_lock(&vcpu->kvm->mmu_lock);
>>         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 331a852..2530f92 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
>>  struct kvm_vcpu_fault_info {
>>         u32 esr_el2;            /* Hyp Syndrom Register */
>>         u64 far_el2;            /* Hyp Fault Address Register */
>> +       u64 par_el2;            /* Result of FAR_EL2 translation */
> 
> Please rename this to par_el1 because we are saving result of Stage1
> translation only.

That'd be very confusing, as we deal with PAR_EL1 separately, as part of
the guest context.

If anything, I may rename it to "what_I_d_love_HPFAR_to_return" instead.

>>         u64 hpfar_el2;          /* Hyp IPA Fault Address Register */
>>  };
>>
>> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
>> index efe609c..84baddd 100644
>> --- a/arch/arm64/include/asm/kvm_mmu.h
>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>> @@ -118,11 +118,16 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>>
>>  struct kvm;
>>
>> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>>  {
>>         if (!icache_is_aliasing()) {            /* PIPT */
>> -               unsigned long hva = gfn_to_hva(kvm, gfn);
>> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
>> +               u8 attr;
>>                 flush_icache_range(hva, hva + PAGE_SIZE);
>> +               attr = vcpu->arch.fault.par_el2 >> 56;
>> +               /* Check for non-device, non-cacheable access */
>> +               if ((attr & 0xf0) && (attr & 0x0f) == 4)
>> +                       __flush_dcache_area((void *)hva, PAGE_SIZE);
> 
> The condition is not complete because when MMU is disabled the memory
> attribute is assumed to be 0x0 (i.e. Device-nGnRnE memory)

That's a good point.

> The condition check that works on X-Gene and Foundation v8 Model is:
> 
> +               /* Check for device access OR
> +                * non-device, non-cacheable access
> +                */
> +               if (!(attr & 0xf0) ||
> +                   ((attr & 0xf0) && (attr & 0x0f) == 4))
> 
> Also, we should avoid integer constants here for better code readability.
> 
>>         } else if (!icache_is_aivivt()) {       /* non ASID-tagged VIVT */
>>                 /* any kind of VIPT cache */
>>                 __flush_icache_all();
>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>> index 1f3a39d..e67dcac 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -117,6 +117,7 @@ int main(void)
>>    DEFINE(CPU_SYSREGS,          offsetof(struct kvm_cpu_context, sys_regs));
>>    DEFINE(VCPU_ESR_EL2,         offsetof(struct kvm_vcpu, arch.fault.esr_el2));
>>    DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
>> +  DEFINE(VCPU_PAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.par_el2));
>>    DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
>>    DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
>>    DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index 20a58fe..417636b 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -671,24 +671,16 @@ el1_trap:
>>         ccmp    x2, x0, #4, ne
>>         b.ne    1f              // Not an abort we care about
>>
>> -       /* This is an abort. Check for permission fault */
>> -       and     x2, x1, #ESR_EL2_FSC_TYPE
>> -       cmp     x2, #FSC_PERM
>> -       b.ne    1f              // Not a permission fault
>> -
>> -       /*
>> -        * Check for Stage-1 page table walk, which is guaranteed
>> -        * to give a valid HPFAR_EL2.
>> -        */
>> -       tbnz    x1, #7, 1f      // S1PTW is set
>> -
>>         /* Preserve PAR_EL1 */
>>         mrs     x3, par_el1
>>         push    x3, xzr
>>
>>         /*
>> -        * Permission fault, HPFAR_EL2 is invalid.
>> -        * Resolve the IPA the hard way using the guest VA.
>> +        * Two cases:
>> +        * - S2 translation fault, we need the memory attributes.
>> +        * - S2 permission fault, HPFAR_EL2 is invalid.
>> +        *
>> +        * In both cases, resolve the IPA the hard way using the guest VA.
>>          * Stage-1 translation already validated the memory access rights.
>>          * As such, we can use the EL1 translation regime, and don't have
>>          * to distinguish between EL0 and EL1 access.
>> @@ -702,15 +694,19 @@ el1_trap:
>>         pop     x0, xzr                 // Restore PAR_EL1 from the stack
>>         msr     par_el1, x0
>>         tbnz    x3, #0, 3f              // Bail out if we failed the translation
>> +
>> +       mrs     x0, tpidr_el2
>> +       str     x3, [x0, #VCPU_PAR_EL2]
>>         ubfx    x3, x3, #12, #36        // Extract IPA
>>         lsl     x3, x3, #4              // and present it like HPFAR
>> +
>>         b       2f
>>
>> -1:     mrs     x3, hpfar_el2
>> +1:     mrs     x0, tpidr_el2
>>         mrs     x2, far_el2
>> +       mrs     x3, hpfar_el2
>>
>> -2:     mrs     x0, tpidr_el2
>> -       str     x1, [x0, #VCPU_ESR_EL2]
>> +2:     str     x1, [x0, #VCPU_ESR_EL2]
>>         str     x2, [x0, #VCPU_FAR_EL2]
>>         str     x3, [x0, #VCPU_HPFAR_EL2]
>>
>> --
>> 1.8.2.3
>>
>>
> 
> In general, the patch had formatting issues so had to apply the changes
> manually.

maz@e102391-lin:~/Work/arm-platforms$ git checkout v3.11
HEAD is now at 6e46645... Linux 3.11
maz@e102391-lin:~/Work/arm-platforms$ git am
tmp/0001-arm64-KVM-honor-cacheability-attributes-on-S2-page-f.patch
Applying: arm64: KVM: honor cacheability attributes on S2 page fault
maz@e102391-lin:~/Work/arm-platforms$

Works fine here.

> This patch works on X-Gene and Foundation v8 Model with above mentioned
> modified condition check.
> 
> Tested-by: Anup Patel <anup@brainfault.org>

Thanks.

	M.
-- 
Jazz is not dead. It just smells funny...


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-10  9:51 [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault Marc Zyngier
  2013-09-11 12:21 ` Anup Patel
@ 2013-09-11 18:06 ` Peter Maydell
  2013-09-11 19:25   ` Christoffer Dall
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Maydell @ 2013-09-11 18:06 UTC (permalink / raw)
  To: linux-arm-kernel

On 10 September 2013 10:51, Marc Zyngier <marc.zyngier@arm.com> wrote:
> Anup Patel recently brought up the fact that when we run a guest
> with cache disabled, we don't flush the cache to the Point of
> Coherency, hence possibly missing bits of data that have been
> written in the cache, but have not reached memory.
>
> There are several approaches to this issue:
> - Using the DC bit when caches are off: this breaks guests assuming
>   caches off while doing DMA operations. Bootloaders, for example.
> - Fetch the memory attributes on translation fault, and flush the
>   cache while handling the fault. This relies on using the PAR_EL1
>   register to obtain the Stage-1 memory attributes.
>
> While this second solution is clearly not ideal (it duplicates what
> the HW already does to generate HPFAR_EL2), it is more correct than
> not doing anything.

Are you confident that extending the current very limited
set of circumstances where we re-do the address translation
operation doesn't introduce any new races if the guest
page tables change between taking the fault and redoing
the translation? I remember this being a pain to think through
last time around :-)

thanks
-- PMM


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-11 18:06 ` Peter Maydell
@ 2013-09-11 19:25   ` Christoffer Dall
  0 siblings, 0 replies; 23+ messages in thread
From: Christoffer Dall @ 2013-09-11 19:25 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Sep 11, 2013 at 07:06:06PM +0100, Peter Maydell wrote:
> On 10 September 2013 10:51, Marc Zyngier <marc.zyngier@arm.com> wrote:
> > Anup Patel recently brought up the fact that when we run a guest
> > with cache disabled, we don't flush the cache to the Point of
> > Coherency, hence possibly missing bits of data that have been
> > written in the cache, but have not reached memory.
> >
> > There are several approaches to this issue:
> > - Using the DC bit when caches are off: this breaks guests assuming
> >   caches off while doing DMA operations. Bootloaders, for example.
> > - Fetch the memory attributes on translation fault, and flush the
> >   cache while handling the fault. This relies on using the PAR_EL1
> >   register to obtain the Stage-1 memory attributes.
> >
> > While this second solution is clearly not ideal (it duplicates what
> > the HW already does to generate HPFAR_EL2), it is more correct than
> > not doing anything.
> 
> Are you confident that extending the current very limited
> set of circumstances where we re-do the address translation
> operation doesn't introduce any new races if the guest
> page tables change between taking the fault and redoing
> the translation? I remember this being a pain to think through
> last time around :-)
> 
I'm worried about the performance hit for any translation fault
here.  I know that ideally locality should reduce the number of times we
do this, but it seems we are introducing logic on a very general path to
ensure correct behavior for something very specific.

I lost track: would we avoid all of this if we just require user space
to flush the dcache after loading code into memory?

-Christoffer


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-11 12:35   ` Marc Zyngier
@ 2013-09-11 19:38     ` Christoffer Dall
  2013-10-10  4:51       ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Christoffer Dall @ 2013-09-11 19:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Sep 11, 2013 at 01:35:56PM +0100, Marc Zyngier wrote:
> On 11/09/13 13:21, Anup Patel wrote:
> > On Tue, Sep 10, 2013 at 3:21 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> >> Anup Patel recently brought up the fact that when we run a guest
> >> with cache disabled, we don't flush the cache to the Point of
> >> Coherency, hence possibly missing bits of data that have been
> >> written in the cache, but have not reached memory.
> >>
> >> There are several approaches to this issue:
> >> - Using the DC bit when caches are off: this breaks guests assuming
> >>   caches off while doing DMA operations. Bootloaders, for example.
> >> - Fetch the memory attributes on translation fault, and flush the
> >>   cache while handling the fault. This relies on using the PAR_EL1
> >>   register to obtain the Stage-1 memory attributes.
> >>
> >> While this second solution is clearly not ideal (it duplicates what
> >> the HW already does to generate HPFAR_EL2), it is more correct than
> >> not doing anything.
> >>
> >> Cc: Anup Patel <anup@brainfault.org>
> >> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> >> ---
> >>
> >>  arch/arm/include/asm/kvm_mmu.h    |  4 ++--
> >>  arch/arm/kvm/mmu.c                |  2 +-
> >>  arch/arm64/include/asm/kvm_host.h |  1 +
> >>  arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
> >>  arch/arm64/kernel/asm-offsets.c   |  1 +
> >>  arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
> >>  6 files changed, 24 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> >> index 472ac70..faffdf0 100644
> >> --- a/arch/arm/include/asm/kvm_mmu.h
> >> +++ b/arch/arm/include/asm/kvm_mmu.h
> >> @@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
> >>
> >>  struct kvm;
> >>
> >> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> >> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
> >>  {
> >>         /*
> >>          * If we are going to insert an instruction page and the icache is
> >> @@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> >>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
> >>          */
> >>         if (icache_is_pipt()) {
> >> -               unsigned long hva = gfn_to_hva(kvm, gfn);
> >> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
> >>                 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
> >>         } else if (!icache_is_vivt_asid_tagged()) {
> >>                 /* any kind of VIPT cache */
> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >> index 0988d9e..ffb3cc4 100644
> >> --- a/arch/arm/kvm/mmu.c
> >> +++ b/arch/arm/kvm/mmu.c
> >> @@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>                 return -EFAULT;
> >>
> >>         new_pte = pfn_pte(pfn, PAGE_S2);
> >> -       coherent_icache_guest_page(vcpu->kvm, gfn);
> >> +       coherent_icache_guest_page(vcpu, gfn);
> >>
> >>         spin_lock(&vcpu->kvm->mmu_lock);
> >>         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> >> index 331a852..2530f92 100644
> >> --- a/arch/arm64/include/asm/kvm_host.h
> >> +++ b/arch/arm64/include/asm/kvm_host.h
> >> @@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
> >>  struct kvm_vcpu_fault_info {
> >>         u32 esr_el2;            /* Hyp Syndrom Register */
> >>         u64 far_el2;            /* Hyp Fault Address Register */
> >> +       u64 par_el2;            /* Result of FAR_EL2 translation */
> > 
> > Please rename this to par_el1 because we are saving result of Stage1
> > translation only.
> 
> That'd be very confusing, as we deal with PAR_EL1 separately, as part of
> the guest context.
> 
> If anything, I may rename it to "what_I_d_love_HPFAR_to_return" instead.
> 

while extremely clear, it may be a bit verbose ;)

I honestly think that 'making up' a register name is a bit confusing as
well.  I can live with it, but perhaps fault_ipa, or s1_fault_attr or
something like that would work.  Eh.

-Christoffer


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-09-11 19:38     ` Christoffer Dall
@ 2013-10-10  4:51       ` Anup Patel
  2013-10-10  8:39         ` Marc Zyngier
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-10  4:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Sep 12, 2013 at 1:08 AM, Christoffer Dall
<christoffer.dall@linaro.org> wrote:
> On Wed, Sep 11, 2013 at 01:35:56PM +0100, Marc Zyngier wrote:
>> On 11/09/13 13:21, Anup Patel wrote:
>> > On Tue, Sep 10, 2013 at 3:21 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> >> Anup Patel recently brought up the fact that when we run a guest
>> >> with cache disabled, we don't flush the cache to the Point of
>> >> Coherency, hence possibly missing bits of data that have been
>> >> written in the cache, but have not reached memory.
>> >>
>> >> There are several approaches to this issue:
>> >> - Using the DC bit when caches are off: this breaks guests assuming
>> >>   caches off while doing DMA operations. Bootloaders, for example.
>> >> - Fetch the memory attributes on translation fault, and flush the
>> >>   cache while handling the fault. This relies on using the PAR_EL1
>> >>   register to obtain the Stage-1 memory attributes.
>> >>
>> >> While this second solution is clearly not ideal (it duplicates what
>> >> the HW already does to generate HPFAR_EL2), it is more correct than
>> >> not doing anything.
>> >>
>> >> Cc: Anup Patel <anup@brainfault.org>
>> >> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> >> ---
>> >>
>> >>  arch/arm/include/asm/kvm_mmu.h    |  4 ++--
>> >>  arch/arm/kvm/mmu.c                |  2 +-
>> >>  arch/arm64/include/asm/kvm_host.h |  1 +
>> >>  arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
>> >>  arch/arm64/kernel/asm-offsets.c   |  1 +
>> >>  arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
>> >>  6 files changed, 24 insertions(+), 21 deletions(-)
>> >>
>> >> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> >> index 472ac70..faffdf0 100644
>> >> --- a/arch/arm/include/asm/kvm_mmu.h
>> >> +++ b/arch/arm/include/asm/kvm_mmu.h
>> >> @@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>> >>
>> >>  struct kvm;
>> >>
>> >> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>> >> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>> >>  {
>> >>         /*
>> >>          * If we are going to insert an instruction page and the icache is
>> >> @@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>> >>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
>> >>          */
>> >>         if (icache_is_pipt()) {
>> >> -               unsigned long hva = gfn_to_hva(kvm, gfn);
>> >> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
>> >>                 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
>> >>         } else if (!icache_is_vivt_asid_tagged()) {
>> >>                 /* any kind of VIPT cache */
>> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> >> index 0988d9e..ffb3cc4 100644
>> >> --- a/arch/arm/kvm/mmu.c
>> >> +++ b/arch/arm/kvm/mmu.c
>> >> @@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>> >>                 return -EFAULT;
>> >>
>> >>         new_pte = pfn_pte(pfn, PAGE_S2);
>> >> -       coherent_icache_guest_page(vcpu->kvm, gfn);
>> >> +       coherent_icache_guest_page(vcpu, gfn);
>> >>
>> >>         spin_lock(&vcpu->kvm->mmu_lock);
>> >>         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
>> >> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> >> index 331a852..2530f92 100644
>> >> --- a/arch/arm64/include/asm/kvm_host.h
>> >> +++ b/arch/arm64/include/asm/kvm_host.h
>> >> @@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
>> >>  struct kvm_vcpu_fault_info {
>> >>         u32 esr_el2;            /* Hyp Syndrom Register */
>> >>         u64 far_el2;            /* Hyp Fault Address Register */
>> >> +       u64 par_el2;            /* Result of FAR_EL2 translation */
>> >
>> > Please rename this to par_el1 because we are saving result of Stage1
>> > translation only.
>>
>> That'd be very confusing, as we deal with PAR_EL1 separately, as part of
>> the guest context.
>>
>> If anything, I may rename it to "what_I_d_love_HPFAR_to_return" instead.
>>
>
> while extremely clear, it may be a bit verbose ;)
>
> I honestly think that 'making up' a register name is a bit confusing as
> well.  I can live with it, but perhaps fault_ipa, or s1_fault_attr or
> something like that would work.  Eh.
>
> -Christoffer

Hi Marc,

Are you planning to go ahead with this approach?

We really need this patch for X-Gene L3 cache.

Regards,
Anup


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-10  4:51       ` Anup Patel
@ 2013-10-10  8:39         ` Marc Zyngier
  2013-10-10 11:24           ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Marc Zyngier @ 2013-10-10  8:39 UTC (permalink / raw)
  To: linux-arm-kernel

On 10/10/13 05:51, Anup Patel wrote:
> On Thu, Sep 12, 2013 at 1:08 AM, Christoffer Dall
> <christoffer.dall@linaro.org> wrote:
>> On Wed, Sep 11, 2013 at 01:35:56PM +0100, Marc Zyngier wrote:
>>> On 11/09/13 13:21, Anup Patel wrote:
>>>> On Tue, Sep 10, 2013 at 3:21 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>>>>> Anup Patel recently brought up the fact that when we run a guest
>>>>> with cache disabled, we don't flush the cache to the Point of
>>>>> Coherency, hence possibly missing bits of data that have been
>>>>> written in the cache, but have not reached memory.
>>>>>
>>>>> There are several approaches to this issue:
>>>>> - Using the DC bit when caches are off: this breaks guests assuming
>>>>>   caches off while doing DMA operations. Bootloaders, for example.
>>>>> - Fetch the memory attributes on translation fault, and flush the
>>>>>   cache while handling the fault. This relies on using the PAR_EL1
>>>>>   register to obtain the Stage-1 memory attributes.
>>>>>
>>>>> While this second solution is clearly not ideal (it duplicates what
>>>>> the HW already does to generate HPFAR_EL2), it is more correct than
>>>>> not doing anything.
>>>>>
>>>>> Cc: Anup Patel <anup@brainfault.org>
>>>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>>>> ---
>>>>>
>>>>>  arch/arm/include/asm/kvm_mmu.h    |  4 ++--
>>>>>  arch/arm/kvm/mmu.c                |  2 +-
>>>>>  arch/arm64/include/asm/kvm_host.h |  1 +
>>>>>  arch/arm64/include/asm/kvm_mmu.h  |  9 +++++++--
>>>>>  arch/arm64/kernel/asm-offsets.c   |  1 +
>>>>>  arch/arm64/kvm/hyp.S              | 28 ++++++++++++----------------
>>>>>  6 files changed, 24 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>>>>> index 472ac70..faffdf0 100644
>>>>> --- a/arch/arm/include/asm/kvm_mmu.h
>>>>> +++ b/arch/arm/include/asm/kvm_mmu.h
>>>>> @@ -105,7 +105,7 @@ static inline void kvm_set_s2pte_writable(pte_t *pte)
>>>>>
>>>>>  struct kvm;
>>>>>
>>>>> -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>>>>> +static inline void coherent_icache_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
>>>>>  {
>>>>>         /*
>>>>>          * If we are going to insert an instruction page and the icache is
>>>>> @@ -120,7 +120,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
>>>>>          * need any kind of flushing (DDI 0406C.b - Page B3-1392).
>>>>>          */
>>>>>         if (icache_is_pipt()) {
>>>>> -               unsigned long hva = gfn_to_hva(kvm, gfn);
>>>>> +               unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);
>>>>>                 __cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
>>>>>         } else if (!icache_is_vivt_asid_tagged()) {
>>>>>                 /* any kind of VIPT cache */
>>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>>> index 0988d9e..ffb3cc4 100644
>>>>> --- a/arch/arm/kvm/mmu.c
>>>>> +++ b/arch/arm/kvm/mmu.c
>>>>> @@ -547,7 +547,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>                 return -EFAULT;
>>>>>
>>>>>         new_pte = pfn_pte(pfn, PAGE_S2);
>>>>> -       coherent_icache_guest_page(vcpu->kvm, gfn);
>>>>> +       coherent_icache_guest_page(vcpu, gfn);
>>>>>
>>>>>         spin_lock(&vcpu->kvm->mmu_lock);
>>>>>         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
>>>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>>>> index 331a852..2530f92 100644
>>>>> --- a/arch/arm64/include/asm/kvm_host.h
>>>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>>>> @@ -79,6 +79,7 @@ struct kvm_mmu_memory_cache {
>>>>>  struct kvm_vcpu_fault_info {
>>>>>         u32 esr_el2;            /* Hyp Syndrom Register */
>>>>>         u64 far_el2;            /* Hyp Fault Address Register */
>>>>> +       u64 par_el2;            /* Result of FAR_EL2 translation */
>>>>
>>>> Please rename this to par_el1 because we are saving result of Stage1
>>>> translation only.
>>>
>>> That'd be very confusing, as we deal with PAR_EL1 separately, as part of
>>> the guest context.
>>>
>>> If anything, I may rename it to "what_I_d_love_HPFAR_to_return" instead.
>>>
>>
>> while extremely clear, it may be a bit verbose ;)
>>
>> I honestly think that 'making up' a register name is a bit confusing as
>> well.  I can live with it, but perhaps fault_ipa, or s1_fault_attr or
>> something like that would work.  Eh.
>>
>> -Christoffer
> 
> Hi Marc,
> 
> Are you planning to go ahead with this approach ?

[adding Catalin, as we heavily discussed this recently]

Not as such, as it doesn't solve the full issue. It merely papers over
the whole "my cache is off" problem. More specifically, any kind of
speculative access from another CPU while caches are off in the guest
completely nukes the benefit of this patch.

Also, turning the caches off is another source of problems, as
speculation also screws up set/way invalidation.

> We really need this patch for X-Gene L3 cache.

So far, I can see two possibilities:
- either we mandate caches to be always on (DC bit, and you're not
allowed to turn the caches off).
- Or we mandate that caches are invalidated (by VA) for each write that
is performed with caches off.

In both cases, it sucks, as it requires changes in the guest.
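
To make the second option concrete, the guest would need to do something
like the following after every write performed with caches off (a sketch
only; it assumes a 64-byte cache line instead of reading CTR_EL0):

	/*
	 * Sketch of what option 2 asks of the guest: clean+invalidate
	 * every written range by VA to the PoC while the caches are off.
	 * Assumes 64-byte lines for brevity; real code would derive the
	 * line size from CTR_EL0.
	 */
	static inline void clean_inval_to_poc(unsigned long start,
					      unsigned long size)
	{
		unsigned long addr = start & ~63UL;
		unsigned long end = start + size;

		for (; addr < end; addr += 64)
			asm volatile("dc civac, %0" : : "r" (addr) : "memory");
		asm volatile("dsb sy" : : : "memory");
	}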

	M.
-- 
Jazz is not dead. It just smells funny...


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-10  8:39         ` Marc Zyngier
@ 2013-10-10 11:24           ` Catalin Marinas
  2013-10-10 16:09             ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-10 11:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
> On 10/10/13 05:51, Anup Patel wrote:
> > Are you planning to go ahead with this approach ?
> 
> [adding Catalin, as we heavily discussed this recently]
> 
> Not as such, as it doesn't solve the full issue. It merely papers over
> the whole "my cache is off" problem. More specifically, any kind of
> speculative access from another CPU while caches are off in the guest
> completely nukes the benefit of this patch.
> 
> Also, turning the caches off is another source of problems, as
> speculation also screws up set/way invalidation.

Indeed. The set/way operations trapping and broadcasting (or deferring)
to other CPUs in software just happens to work, but there is no
guarantee; sooner or later we'll hit a problem. I'm even tempted to
remove the flush_dcache_all() calls on the booting path for the arm64
kernel, since we already require that whatever runs before Linux should
clean & invalidate the caches.

Basically, with KVM a VCPU even if running with caches/MMU disabled can
still get speculative allocation into the cache. The reason for this is
the other cacheable memory aliases created by the host kernel and
qemu/kvmtool. I can't tell whether Xen has this issue but it may be
easier in Xen to avoid memory aliases.

> > We really need this patch for X-Gene L3 cache.
> 
> So far, I can see two possibilities:
> - either we mandate caches to be always on (DC bit, and you're not
> allowed to turn the caches off).

That's my preferred approach. For hotplug, idle, the guest would use an
HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
we may not catch all cases (kexec probably).
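
In KVM terms that roughly means always leaving HCR_EL2.DC set for the
guest, along the lines of the sketch below (the helper name is made up;
HCR_DC, HCR_GUEST_FLAGS and the vcpu hcr_el2 field are the existing
pieces):

	/*
	 * Rough sketch of the "DC bit always on" idea; vcpu_reset_hcr()
	 * is a hypothetical helper, not existing code.
	 */
	static void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
	{
		vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
		/* Default-cacheable Stage-1 while the guest MMU is off */
		vcpu->arch.hcr_el2 |= HCR_DC;
	}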

> - Or we mandate that caches are invalidated (by VA) for each write that
> is performed with caches off.

For some things like run-time code patching, on ARMv8 we need to do at
least I-cache maintenance since the CPU can allocate into the I-cache
(even if there are no aliases).
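
For example, any code patching done with the MMU off still needs
something equivalent to this (illustrative wrapper, not existing code;
flush_icache_range() is the real kernel interface):

	/* Illustrative only: patch one instruction, then do I-cache maintenance */
	static void patch_one_insn(u32 *addr, u32 insn)
	{
		*addr = insn;
		/* cleans the D-cache to PoU and invalidates the I-cache */
		flush_icache_range((unsigned long)addr,
				   (unsigned long)addr + sizeof(insn));
	}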

-- 
Catalin


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-10 11:24           ` Catalin Marinas
@ 2013-10-10 16:09             ` Anup Patel
  2013-10-11 12:38               ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-10 16:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
>> On 10/10/13 05:51, Anup Patel wrote:
>> > Are you planning to go ahead with this approach ?
>>
>> [adding Catalin, as we heavily discussed this recently]
>>
>> Not as such, as it doesn't solve the full issue. It merely papers over
>> the whole "my cache is off" problem. More specifically, any kind of
>> speculative access from another CPU while caches are off in the guest
>> completely nukes the benefit of this patch.
>>
>> Also, turning the caches off is another source of problems, as
>> speculation also screws up set/way invalidation.
>
> Indeed. The set/way operations trapping and broadcasting (or deferring)
> to other CPUs in software just happens to work but there is no
> guarantee, sooner or later we'll hit a problem. I'm even tempted to
> remove flush_dcache_all() calls on the booting path for the arm64
> kernel, we already require that whatever runs before Linux should
> clean&invalidate the caches.
>
> Basically, with KVM a VCPU even if running with caches/MMU disabled can
> still get speculative allocation into the cache. The reason for this is
> the other cacheable memory aliases created by the host kernel and
> qemu/kvmtool. I can't tell whether Xen has this issue but it may be
> easier in Xen to avoid memory aliases.
>
>> > We really need this patch for X-Gene L3 cache.
>>
>> So far, I can see two possibilities:
>> - either we mandate caches to be always on (DC bit, and you're not
>> allowed to turn the caches off).
>
> That's my preferred approach. For hotplug, idle, the guest would use an
> HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
> we may not catch all cases (kexec probably).
>
>> - Or we mandate that caches are invalidated (by VA) for each write that
>> is performed with caches off.
>
> For some things like run-time code patching, on ARMv8 we need to do at
> least I-cache maintenance since the CPU can allocate into the I-cache
> (even if there are no aliases).

It seems all approaches considered so far have a corner case in one
way or another.

Coming back to where we started, the actual problem was that when the
Guest starts booting it sees wrong contents, because it runs with the
MMU disabled and the correct contents are still in the external L3
cache of X-Gene.

How about reconsidering the approach of flushing Guest RAM (all of it,
or a portion of it) to PoC by VA once before the first run of a VCPU?
OR
We could also have a KVM API with which user space can flush portions
of Guest RAM before running the VCPU. (I think this was a suggestion
from Marc Z initially.)
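
A very rough sketch of what the kernel side of such an API could look
like (everything named here is hypothetical; the struct and ioctl do not
exist, and only gfn_to_hva() and __flush_dcache_area() are real calls):

	/* Hypothetical interface, for illustration only */
	struct kvm_flush_guest_ram {
		__u64 gpa;	/* start of guest physical range */
		__u64 size;	/* length in bytes */
	};

	static int flush_guest_ram_range(struct kvm *kvm, u64 gpa, u64 size)
	{
		u64 end = gpa + size;

		for (; gpa < end; gpa += PAGE_SIZE) {
			unsigned long hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);

			if (kvm_is_error_hva(hva))
				return -EFAULT;
			/* clean+invalidate the host alias of this page to PoC */
			__flush_dcache_area((void *)hva, PAGE_SIZE);
		}
		return 0;
	}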

--Anup

>
> --
> Catalin


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-10 16:09             ` Anup Patel
@ 2013-10-11 12:38               ` Catalin Marinas
  2013-10-11 14:27                 ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-11 12:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
> >> On 10/10/13 05:51, Anup Patel wrote:
> >> > Are you planning to go ahead with this approach ?
> >>
> >> [adding Catalin, as we heavily discussed this recently]
> >>
> >> Not as such, as it doesn't solve the full issue. It merely papers over
> >> the whole "my cache is off" problem. More specifically, any kind of
> >> speculative access from another CPU while caches are off in the guest
> >> completely nukes the benefit of this patch.
> >>
> >> Also, turning the caches off is another source of problems, as
> >> speculation also screws up set/way invalidation.
> >
> > Indeed. The set/way operations trapping and broadcasting (or deferring)
> > to other CPUs in software just happens to work but there is no
> > guarantee, sooner or later we'll hit a problem. I'm even tempted to
> > remove flush_dcache_all() calls on the booting path for the arm64
> > kernel, we already require that whatever runs before Linux should
> > clean&invalidate the caches.
> >
> > Basically, with KVM a VCPU even if running with caches/MMU disabled can
> > still get speculative allocation into the cache. The reason for this is
> > the other cacheable memory aliases created by the host kernel and
> > qemu/kvmtool. I can't tell whether Xen has this issue but it may be
> > easier in Xen to avoid memory aliases.
> >
> >> > We really need this patch for X-Gene L3 cache.
> >>
> >> So far, I can see two possibilities:
> >> - either we mandate caches to be always on (DC bit, and you're not
> >> allowed to turn the caches off).
> >
> > That's my preferred approach. For hotplug, idle, the guest would use an
> > HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
> > we may not catch all cases (kexec probably).
> >
> >> - Or we mandate that caches are invalidated (by VA) for each write that
> >> is performed with caches off.
> >
> > For some things like run-time code patching, on ARMv8 we need to do at
> > least I-cache maintenance since the CPU can allocate into the I-cache
> > (even if there are no aliases).
> 
> It seems all approaches considered so far have a corner case in one
> way or another.

Yes, we try to settle on the one with the fewest corner cases.

> Coming back to where we started, the actual problem was that when the
> Guest starts booting it sees wrong contents, because it runs with the
> MMU disabled and the correct contents are still in the external L3
> cache of X-Gene.

That's one of the problems and I think the easiest to solve. Note that
contents could still be in the L1/L2 (inner) cache since whole cache
flushing by set/way isn't guaranteed in an MP context.

> How about reconsidering the approach of flushing Guest RAM (entire or
> portion of it) to PoC by VA once before the first run of a VCPU ?

Flushing the entire guest RAM is not possible by set/way
(architecturally) and not efficient by VA (though some benchmark would
be good). Marc's patch defers this flushing when a page is faulted in
(at stage 2) and I think it covers the initial boot.

> OR
> We can also have KVM API using which user space can flush portions
> of Guest RAM before running the VCPU. (I think this was a suggestion
> from Marc Z initially)

This may not be enough. It indeed flushes the kernel image that gets
loaded but the kernel would write other pages (bss, page tables etc.)
with MMU disabled and those addresses may contain dirty cache lines that
have not been covered by the initial kvmtool flush. So you basically
need all guest non-cacheable accesses to be flushed.

The other problems are the cacheable aliases that I mentioned, so even
though the guest does non-cacheable accesses with the MMU off, the
hardware can still allocate into the cache via the other mappings. In
this case the guest needs to invalidate the areas of memory that it
wrote with caches off (or just use the DC bit to force memory accesses
with MMU off to be cacheable).

-- 
Catalin


* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 12:38               ` Catalin Marinas
@ 2013-10-11 14:27                 ` Anup Patel
  2013-10-11 14:37                   ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-11 14:27 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>> > On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
>> >> On 10/10/13 05:51, Anup Patel wrote:
>> >> > Are you planning to go ahead with this approach ?
>> >>
>> >> [adding Catalin, as we heavily discussed this recently]
>> >>
>> >> Not as such, as it doesn't solve the full issue. It merely papers over
>> >> the whole "my cache is off" problem. More specifically, any kind of
>> >> speculative access from another CPU while caches are off in the guest
>> >> completely nukes the benefit of this patch.
>> >>
>> >> Also, turning the caches off is another source of problems, as
>> >> speculation also screws up set/way invalidation.
>> >
>> > Indeed. The set/way operations trapping and broadcasting (or deferring)
>> > to other CPUs in software just happens to work but there is no
>> > guarantee, sooner or later we'll hit a problem. I'm even tempted to
>> > remove flush_dcache_all() calls on the booting path for the arm64
>> > kernel, we already require that whatever runs before Linux should
>> > clean&invalidate the caches.
>> >
>> > Basically, with KVM a VCPU even if running with caches/MMU disabled can
>> > still get speculative allocation into the cache. The reason for this is
>> > the other cacheable memory aliases created by the host kernel and
>> > qemu/kvmtool. I can't tell whether Xen has this issue but it may be
>> > easier in Xen to avoid memory aliases.
>> >
>> >> > We really need this patch for X-Gene L3 cache.
>> >>
>> >> So far, I can see two possibilities:
>> >> - either we mandate caches to be always on (DC bit, and you're not
>> >> allowed to turn the caches off).
>> >
>> > That's my preferred approach. For hotplug, idle, the guest would use an
>> > HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
>> > we may not catch all cases (kexec probably).
>> >
>> >> - Or we mandate that caches are invalidated (by VA) for each write that
>> >> is performed with caches off.
>> >
>> > For some things like run-time code patching, on ARMv8 we need to do at
>> > least I-cache maintenance since the CPU can allocate into the I-cache
>> > (even if there are no aliases).
>>
>> It seems all approaches considered so far have a corner case in one
>> way or another.
>
> Yes, we try to settle on the one with least corner cases.

Ok.

>
>> Coming back to where we started, the actual problem was that when
>> Guest starts booting it sees wrong contents because it is runs with
>> MMU disable and correct contents are still in external L3 cache of X-Gene.
>
> That's one of the problems and I think the easiest to solve. Note that
> contents could still be in the L1/L2 (inner) cache since whole cache
> flushing by set/way isn't guaranteed in an MP context.
>
>> How about reconsidering the approach of flushing Guest RAM (entire or
>> portion of it) to PoC by VA once before the first run of a VCPU ?
>
> Flushing the entire guest RAM is not possible by set/way
> (architecturally) and not efficient by VA (though some benchmark would
> be good). Marc's patch defers this flushing when a page is faulted in
> (at stage 2) and I think it covers the initial boot.
>
>> OR
>> We can also have KVM API using which user space can flush portions
>> of Guest RAM before running the VCPU. (I think this was a suggestion
>> from Marc Z initially)
>
> This may not be enough. It indeed flushes the kernel image that gets
> loaded but the kernel would write other pages (bss, page tables etc.)
> with MMU disabled and those addresses may contain dirty cache lines that
> have not been covered by the initial kvmtool flush. So you basically
> need all guest non-cacheable accesses to be flushed.
>
> The other problems are the cacheable aliases that I mentioned, so even
> though the guest does non-cacheable accesses with the MMU off, the
> hardware can still allocate into the cache via the other mappings. In
> this case the guest needs to invalidate the areas of memory that it
> wrote with caches off (or just use the DC bit to force memory accesses
> with MMU off to be cacheable).

Having looked at all the approaches, I would vote for the approach taken
by this patch.

>
> --
> Catalin

--Anup

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 14:27                 ` Anup Patel
@ 2013-10-11 14:37                   ` Catalin Marinas
  2013-10-11 14:50                     ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-11 14:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
> >> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
> >> <catalin.marinas@arm.com> wrote:
> >> > On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
> >> >> On 10/10/13 05:51, Anup Patel wrote:
> >> >> > Are you planning to go ahead with this approach ?
> >> >>
> >> >> [adding Catalin, as we heavily discussed this recently]
> >> >>
> >> >> Not as such, as it doesn't solve the full issue. It merely papers over
> >> >> the whole "my cache is off" problem. More specifically, any kind of
> >> >> speculative access from another CPU while caches are off in the guest
> >> >> completely nukes the benefit of this patch.
> >> >>
> >> >> Also, turning the the caches off is another source of problems, as
> >> >> speculation also screws up set/way invalidation.
> >> >
> >> > Indeed. The set/way operations trapping and broadcasting (or deferring)
> >> > to other CPUs in software just happens to work but there is no
> >> > guarantee, sooner or later we'll hit a problem. I'm even tempted to
> >> > remove flush_dcache_all() calls on the booting path for the arm64
> >> > kernel, we already require that whatever runs before Linux should
> >> > clean&invalidate the caches.
> >> >
> >> > Basically, with KVM a VCPU even if running with caches/MMU disabled can
> >> > still get speculative allocation into the cache. The reason for this is
> >> > the other cacheable memory aliases created by the host kernel and
> >> > qemu/kvmtool. I can't tell whether Xen has this issue but it may be
> >> > easier in Xen to avoid memory aliases.
> >> >
> >> >> > We really need this patch for X-Gene L3 cache.
> >> >>
> >> >> So far, I can see two possibilities:
> >> >> - either we mandate caches to be always on (DC bit, and you're not
> >> >> allowed to turn the caches off).
> >> >
> >> > That's my preferred approach. For hotplug, idle, the guest would use an
> >> > HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
> >> > we may not catch all cases (kexec probably).
> >> >
> >> >> - Or we mandate that caches are invalidated (by VA) for each write that
> >> >> is performed with caches off.
> >> >
> >> > For some things like run-time code patching, on ARMv8 we need to do at
> >> > least I-cache maintenance since the CPU can allocate into the I-cache
> >> > (even if there are no aliases).
> >>
> >> It seems all approaches considered so far have a corner case in
> >> one-way or another.
> >
> > Yes, we try to settle on the one with least corner cases.
> 
> Ok.
> 
> >
> >> Coming back to where we started, the actual problem was that when
> >> Guest starts booting it sees wrong contents because it is runs with
> >> MMU disable and correct contents are still in external L3 cache of X-Gene.
> >
> > That's one of the problems and I think the easiest to solve. Note that
> > contents could still be in the L1/L2 (inner) cache since whole cache
> > flushing by set/way isn't guaranteed in an MP context.
> >
> >> How about reconsidering the approach of flushing Guest RAM (entire or
> >> portion of it) to PoC by VA once before the first run of a VCPU ?
> >
> > Flushing the entire guest RAM is not possible by set/way
> > (architecturally) and not efficient by VA (though some benchmark would
> > be good). Marc's patch defers this flushing when a page is faulted in
> > (at stage 2) and I think it covers the initial boot.
> >
> >> OR
> >> We can also have KVM API using which user space can flush portions
> >> of Guest RAM before running the VCPU. (I think this was a suggestion
> >> from Marc Z initially)
> >
> > This may not be enough. It indeed flushes the kernel image that gets
> > loaded but the kernel would write other pages (bss, page tables etc.)
> > with MMU disabled and those addresses may contain dirty cache lines that
> > have not been covered by the initial kvmtool flush. So you basically
> > need all guest non-cacheable accesses to be flushed.
> >
> > The other problems are the cacheable aliases that I mentioned, so even
> > though the guest does non-cacheable accesses with the MMU off, the
> > hardware can still allocate into the cache via the other mappings. In
> > this case the guest needs to invalidate the areas of memory that it
> > wrote with caches off (or just use the DC bit to force memory accesses
> > with MMU off to be cacheable).
> 
> Having looked at all the approaches, I would vote for the approach taken
> by this patch.

But this patch alone doesn't solve the other issues. OTOH, the DC bit
would solve your initial problem and a few others.

-- 
Catalin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 14:37                   ` Catalin Marinas
@ 2013-10-11 14:50                     ` Anup Patel
  2013-10-11 14:59                       ` Marc Zyngier
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-11 14:50 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>> > On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>> >> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
>> >> <catalin.marinas@arm.com> wrote:
>> >> > On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
>> >> >> On 10/10/13 05:51, Anup Patel wrote:
>> >> >> > Are you planning to go ahead with this approach ?
>> >> >>
>> >> >> [adding Catalin, as we heavily discussed this recently]
>> >> >>
>> >> >> Not as such, as it doesn't solve the full issue. It merely papers over
>> >> >> the whole "my cache is off" problem. More specifically, any kind of
>> >> >> speculative access from another CPU while caches are off in the guest
>> >> >> completely nukes the benefit of this patch.
>> >> >>
>> >> >> Also, turning the the caches off is another source of problems, as
>> >> >> speculation also screws up set/way invalidation.
>> >> >
>> >> > Indeed. The set/way operations trapping and broadcasting (or deferring)
>> >> > to other CPUs in software just happens to work but there is no
>> >> > guarantee, sooner or later we'll hit a problem. I'm even tempted to
>> >> > remove flush_dcache_all() calls on the booting path for the arm64
>> >> > kernel, we already require that whatever runs before Linux should
>> >> > clean&invalidate the caches.
>> >> >
>> >> > Basically, with KVM a VCPU even if running with caches/MMU disabled can
>> >> > still get speculative allocation into the cache. The reason for this is
>> >> > the other cacheable memory aliases created by the host kernel and
>> >> > qemu/kvmtool. I can't tell whether Xen has this issue but it may be
>> >> > easier in Xen to avoid memory aliases.
>> >> >
>> >> >> > We really need this patch for X-Gene L3 cache.
>> >> >>
>> >> >> So far, I can see two possibilities:
>> >> >> - either we mandate caches to be always on (DC bit, and you're not
>> >> >> allowed to turn the caches off).
>> >> >
>> >> > That's my preferred approach. For hotplug, idle, the guest would use an
>> >> > HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
>> >> > we may not catch all cases (kexec probably).
>> >> >
>> >> >> - Or we mandate that caches are invalidated (by VA) for each write that
>> >> >> is performed with caches off.
>> >> >
>> >> > For some things like run-time code patching, on ARMv8 we need to do at
>> >> > least I-cache maintenance since the CPU can allocate into the I-cache
>> >> > (even if there are no aliases).
>> >>
>> >> It seems all approaches considered so far have a corner case in
>> >> one-way or another.
>> >
>> > Yes, we try to settle on the one with least corner cases.
>>
>> Ok.
>>
>> >
>> >> Coming back to where we started, the actual problem was that when
>> >> Guest starts booting it sees wrong contents because it is runs with
>> >> MMU disable and correct contents are still in external L3 cache of X-Gene.
>> >
>> > That's one of the problems and I think the easiest to solve. Note that
>> > contents could still be in the L1/L2 (inner) cache since whole cache
>> > flushing by set/way isn't guaranteed in an MP context.
>> >
>> >> How about reconsidering the approach of flushing Guest RAM (entire or
>> >> portion of it) to PoC by VA once before the first run of a VCPU ?
>> >
>> > Flushing the entire guest RAM is not possible by set/way
>> > (architecturally) and not efficient by VA (though some benchmark would
>> > be good). Marc's patch defers this flushing when a page is faulted in
>> > (at stage 2) and I think it covers the initial boot.
>> >
>> >> OR
>> >> We can also have KVM API using which user space can flush portions
>> >> of Guest RAM before running the VCPU. (I think this was a suggestion
>> >> from Marc Z initially)
>> >
>> > This may not be enough. It indeed flushes the kernel image that gets
>> > loaded but the kernel would write other pages (bss, page tables etc.)
>> > with MMU disabled and those addresses may contain dirty cache lines that
>> > have not been covered by the initial kvmtool flush. So you basically
>> > need all guest non-cacheable accesses to be flushed.
>> >
>> > The other problems are the cacheable aliases that I mentioned, so even
>> > though the guest does non-cacheable accesses with the MMU off, the
>> > hardware can still allocate into the cache via the other mappings. In
>> > this case the guest needs to invalidate the areas of memory that it
>> > wrote with caches off (or just use the DC bit to force memory accesses
>> > with MMU off to be cacheable).
>>
>> Having looked at all the approaches, I would vote for the approach taken
>> by this patch.
>
> But this patch alone doesn't solve the other issues. OTOH, the DC bit
> would solve your initial problem and a few others.

The DC bit might solve the initial problem, but it can be problematic
because setting the DC bit means the Guest has caching ON even when the
Guest MMU is disabled. This is even more problematic if the Guest runs a
bootloader (u-boot, grub, UEFI, ..) which does pass-through access to a
DMA-capable device: in that case we would have to change the bootloader
and put explicit flushes in it for running inside a Guest.
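
To illustrate what I mean by explicit flushes: before handing a buffer
to a DMA-capable device, the bootloader would need to do something along
the lines of the sketch below (a minimal sketch, assuming a fixed
64-byte cache line instead of reading it from CTR_EL0; the helper name
is made up):

/*
 * Hypothetical helper: clean+invalidate a buffer to the Point of
 * Coherency so that a non-coherent DMA master sees the latest data.
 */
static void flush_dma_buffer_to_poc(void *buf, unsigned long size)
{
	unsigned long line = 64;	/* assumed; real code reads CTR_EL0 */
	unsigned long addr = (unsigned long)buf & ~(line - 1);
	unsigned long end  = (unsigned long)buf + size;

	for (; addr < end; addr += line)
		asm volatile("dc civac, %0" : : "r" (addr) : "memory");
	asm volatile("dsb sy" : : : "memory");	/* complete before the DMA starts */
}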

Also, having pass-through devices via VFIO (particularly PCIe devices)
is a very common use case these days in the x86 world.

>
> --
> Catalin

--Anup

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 14:50                     ` Anup Patel
@ 2013-10-11 14:59                       ` Marc Zyngier
  2013-10-11 15:32                         ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Marc Zyngier @ 2013-10-11 14:59 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/10/13 15:50, Anup Patel wrote:
> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
> <catalin.marinas@arm.com> wrote:
>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
>>> <catalin.marinas@arm.com> wrote:
>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>>>>> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
>>>>> <catalin.marinas@arm.com> wrote:
>>>>>> On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
>>>>>>> On 10/10/13 05:51, Anup Patel wrote:
>>>>>>>> Are you planning to go ahead with this approach ?
>>>>>>>
>>>>>>> [adding Catalin, as we heavily discussed this recently]
>>>>>>>
>>>>>>> Not as such, as it doesn't solve the full issue. It merely papers over
>>>>>>> the whole "my cache is off" problem. More specifically, any kind of
>>>>>>> speculative access from another CPU while caches are off in the guest
>>>>>>> completely nukes the benefit of this patch.
>>>>>>>
>>>>>>> Also, turning the the caches off is another source of problems, as
>>>>>>> speculation also screws up set/way invalidation.
>>>>>>
>>>>>> Indeed. The set/way operations trapping and broadcasting (or deferring)
>>>>>> to other CPUs in software just happens to work but there is no
>>>>>> guarantee, sooner or later we'll hit a problem. I'm even tempted to
>>>>>> remove flush_dcache_all() calls on the booting path for the arm64
>>>>>> kernel, we already require that whatever runs before Linux should
>>>>>> clean&invalidate the caches.
>>>>>>
>>>>>> Basically, with KVM a VCPU even if running with caches/MMU disabled can
>>>>>> still get speculative allocation into the cache. The reason for this is
>>>>>> the other cacheable memory aliases created by the host kernel and
>>>>>> qemu/kvmtool. I can't tell whether Xen has this issue but it may be
>>>>>> easier in Xen to avoid memory aliases.
>>>>>>
>>>>>>>> We really need this patch for X-Gene L3 cache.
>>>>>>>
>>>>>>> So far, I can see two possibilities:
>>>>>>> - either we mandate caches to be always on (DC bit, and you're not
>>>>>>> allowed to turn the caches off).
>>>>>>
>>>>>> That's my preferred approach. For hotplug, idle, the guest would use an
>>>>>> HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
>>>>>> we may not catch all cases (kexec probably).
>>>>>>
>>>>>>> - Or we mandate that caches are invalidated (by VA) for each write that
>>>>>>> is performed with caches off.
>>>>>>
>>>>>> For some things like run-time code patching, on ARMv8 we need to do at
>>>>>> least I-cache maintenance since the CPU can allocate into the I-cache
>>>>>> (even if there are no aliases).
>>>>>
>>>>> It seems all approaches considered so far have a corner case in
>>>>> one-way or another.
>>>>
>>>> Yes, we try to settle on the one with least corner cases.
>>>
>>> Ok.
>>>
>>>>
>>>>> Coming back to where we started, the actual problem was that when
>>>>> Guest starts booting it sees wrong contents because it is runs with
>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
>>>>
>>>> That's one of the problems and I think the easiest to solve. Note that
>>>> contents could still be in the L1/L2 (inner) cache since whole cache
>>>> flushing by set/way isn't guaranteed in an MP context.
>>>>
>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
>>>>
>>>> Flushing the entire guest RAM is not possible by set/way
>>>> (architecturally) and not efficient by VA (though some benchmark would
>>>> be good). Marc's patch defers this flushing when a page is faulted in
>>>> (at stage 2) and I think it covers the initial boot.
>>>>
>>>>> OR
>>>>> We can also have KVM API using which user space can flush portions
>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
>>>>> from Marc Z initially)
>>>>
>>>> This may not be enough. It indeed flushes the kernel image that gets
>>>> loaded but the kernel would write other pages (bss, page tables etc.)
>>>> with MMU disabled and those addresses may contain dirty cache lines that
>>>> have not been covered by the initial kvmtool flush. So you basically
>>>> need all guest non-cacheable accesses to be flushed.
>>>>
>>>> The other problems are the cacheable aliases that I mentioned, so even
>>>> though the guest does non-cacheable accesses with the MMU off, the
>>>> hardware can still allocate into the cache via the other mappings. In
>>>> this case the guest needs to invalidate the areas of memory that it
>>>> wrote with caches off (or just use the DC bit to force memory accesses
>>>> with MMU off to be cacheable).
>>>
>>> Having looked at all the approaches, I would vote for the approach taken
>>> by this patch.
>>
>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
>> would solve your initial problem and a few others.
> 
> DC bit might solve the initial problem but it can be problematic because
> setting DC bit would mean Guest would have Caching ON even when Guest
> MMU is disabled. This will be more problematic if Guest is running a
> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
> DMA-capable device and we will have to change the bootloader in this
> case and put explicit flushes in bootloader for running inside Guest.

Well, as Catalin mentioned, we'll have to do some cache maintenance in
the guest in any case.

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 14:59                       ` Marc Zyngier
@ 2013-10-11 15:32                         ` Anup Patel
  2013-10-11 15:44                           ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-11 15:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 11, 2013 at 8:29 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> On 11/10/13 15:50, Anup Patel wrote:
>> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
>>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
>>>> <catalin.marinas@arm.com> wrote:
>>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>>>>>> On Thu, Oct 10, 2013 at 4:54 PM, Catalin Marinas
>>>>>> <catalin.marinas@arm.com> wrote:
>>>>>>> On Thu, Oct 10, 2013 at 09:39:55AM +0100, Marc Zyngier wrote:
>>>>>>>> On 10/10/13 05:51, Anup Patel wrote:
>>>>>>>>> Are you planning to go ahead with this approach ?
>>>>>>>>
>>>>>>>> [adding Catalin, as we heavily discussed this recently]
>>>>>>>>
>>>>>>>> Not as such, as it doesn't solve the full issue. It merely papers over
>>>>>>>> the whole "my cache is off" problem. More specifically, any kind of
>>>>>>>> speculative access from another CPU while caches are off in the guest
>>>>>>>> completely nukes the benefit of this patch.
>>>>>>>>
>>>>>>>> Also, turning the the caches off is another source of problems, as
>>>>>>>> speculation also screws up set/way invalidation.
>>>>>>>
>>>>>>> Indeed. The set/way operations trapping and broadcasting (or deferring)
>>>>>>> to other CPUs in software just happens to work but there is no
>>>>>>> guarantee, sooner or later we'll hit a problem. I'm even tempted to
>>>>>>> remove flush_dcache_all() calls on the booting path for the arm64
>>>>>>> kernel, we already require that whatever runs before Linux should
>>>>>>> clean&invalidate the caches.
>>>>>>>
>>>>>>> Basically, with KVM a VCPU even if running with caches/MMU disabled can
>>>>>>> still get speculative allocation into the cache. The reason for this is
>>>>>>> the other cacheable memory aliases created by the host kernel and
>>>>>>> qemu/kvmtool. I can't tell whether Xen has this issue but it may be
>>>>>>> easier in Xen to avoid memory aliases.
>>>>>>>
>>>>>>>>> We really need this patch for X-Gene L3 cache.
>>>>>>>>
>>>>>>>> So far, I can see two possibilities:
>>>>>>>> - either we mandate caches to be always on (DC bit, and you're not
>>>>>>>> allowed to turn the caches off).
>>>>>>>
>>>>>>> That's my preferred approach. For hotplug, idle, the guest would use an
>>>>>>> HVC call (PSCI) and the host takes care of re-enabling the DC bit. But
>>>>>>> we may not catch all cases (kexec probably).
>>>>>>>
>>>>>>>> - Or we mandate that caches are invalidated (by VA) for each write that
>>>>>>>> is performed with caches off.
>>>>>>>
>>>>>>> For some things like run-time code patching, on ARMv8 we need to do at
>>>>>>> least I-cache maintenance since the CPU can allocate into the I-cache
>>>>>>> (even if there are no aliases).
>>>>>>
>>>>>> It seems all approaches considered so far have a corner case in
>>>>>> one-way or another.
>>>>>
>>>>> Yes, we try to settle on the one with least corner cases.
>>>>
>>>> Ok.
>>>>
>>>>>
>>>>>> Coming back to where we started, the actual problem was that when
>>>>>> Guest starts booting it sees wrong contents because it is runs with
>>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
>>>>>
>>>>> That's one of the problems and I think the easiest to solve. Note that
>>>>> contents could still be in the L1/L2 (inner) cache since whole cache
>>>>> flushing by set/way isn't guaranteed in an MP context.
>>>>>
>>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
>>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
>>>>>
>>>>> Flushing the entire guest RAM is not possible by set/way
>>>>> (architecturally) and not efficient by VA (though some benchmark would
>>>>> be good). Marc's patch defers this flushing when a page is faulted in
>>>>> (at stage 2) and I think it covers the initial boot.
>>>>>
>>>>>> OR
>>>>>> We can also have KVM API using which user space can flush portions
>>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
>>>>>> from Marc Z initially)
>>>>>
>>>>> This may not be enough. It indeed flushes the kernel image that gets
>>>>> loaded but the kernel would write other pages (bss, page tables etc.)
>>>>> with MMU disabled and those addresses may contain dirty cache lines that
>>>>> have not been covered by the initial kvmtool flush. So you basically
>>>>> need all guest non-cacheable accesses to be flushed.
>>>>>
>>>>> The other problems are the cacheable aliases that I mentioned, so even
>>>>> though the guest does non-cacheable accesses with the MMU off, the
>>>>> hardware can still allocate into the cache via the other mappings. In
>>>>> this case the guest needs to invalidate the areas of memory that it
>>>>> wrote with caches off (or just use the DC bit to force memory accesses
>>>>> with MMU off to be cacheable).
>>>>
>>>> Having looked at all the approaches, I would vote for the approach taken
>>>> by this patch.
>>>
>>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
>>> would solve your initial problem and a few others.
>>
>> DC bit might solve the initial problem but it can be problematic because
>> setting DC bit would mean Guest would have Caching ON even when Guest
>> MMU is disabled. This will be more problematic if Guest is running a
>> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
>> DMA-capable device and we will have to change the bootloader in this
>> case and put explicit flushes in bootloader for running inside Guest.
>
> Well, as Catalin mentioned, we'll have to do some cache maintenance in
> the guest in any case.

This would also mean that we will have to change the Guest bootloader
for running as a Guest under KVM ARM64.

In the x86 world, everything that can run natively also runs as a Guest
OS, even if the Guest has pass-through devices.

>
>         M.
> --
> Jazz is not dead. It just smells funny...
>

--Anup

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 15:32                         ` Anup Patel
@ 2013-10-11 15:44                           ` Catalin Marinas
  2013-10-12 18:24                             ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-11 15:44 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 11, 2013 at 04:32:48PM +0100, Anup Patel wrote:
> On Fri, Oct 11, 2013 at 8:29 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> > On 11/10/13 15:50, Anup Patel wrote:
> >> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
> >> <catalin.marinas@arm.com> wrote:
> >>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
> >>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
> >>>> <catalin.marinas@arm.com> wrote:
> >>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
> >>>>>> Coming back to where we started, the actual problem was that when
> >>>>>> Guest starts booting it sees wrong contents because it is runs with
> >>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
> >>>>>
> >>>>> That's one of the problems and I think the easiest to solve. Note that
> >>>>> contents could still be in the L1/L2 (inner) cache since whole cache
> >>>>> flushing by set/way isn't guaranteed in an MP context.
> >>>>>
> >>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
> >>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
> >>>>>
> >>>>> Flushing the entire guest RAM is not possible by set/way
> >>>>> (architecturally) and not efficient by VA (though some benchmark would
> >>>>> be good). Marc's patch defers this flushing when a page is faulted in
> >>>>> (at stage 2) and I think it covers the initial boot.
> >>>>>
> >>>>>> OR
> >>>>>> We can also have KVM API using which user space can flush portions
> >>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
> >>>>>> from Marc Z initially)
> >>>>>
> >>>>> This may not be enough. It indeed flushes the kernel image that gets
> >>>>> loaded but the kernel would write other pages (bss, page tables etc.)
> >>>>> with MMU disabled and those addresses may contain dirty cache lines that
> >>>>> have not been covered by the initial kvmtool flush. So you basically
> >>>>> need all guest non-cacheable accesses to be flushed.
> >>>>>
> >>>>> The other problems are the cacheable aliases that I mentioned, so even
> >>>>> though the guest does non-cacheable accesses with the MMU off, the
> >>>>> hardware can still allocate into the cache via the other mappings. In
> >>>>> this case the guest needs to invalidate the areas of memory that it
> >>>>> wrote with caches off (or just use the DC bit to force memory accesses
> >>>>> with MMU off to be cacheable).
> >>>>
> >>>> Having looked at all the approaches, I would vote for the approach taken
> >>>> by this patch.
> >>>
> >>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
> >>> would solve your initial problem and a few others.
> >>
> >> DC bit might solve the initial problem but it can be problematic because
> >> setting DC bit would mean Guest would have Caching ON even when Guest
> >> MMU is disabled. This will be more problematic if Guest is running a
> >> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
> >> DMA-capable device and we will have to change the bootloader in this
> >> case and put explicit flushes in bootloader for running inside Guest.
> >
> > Well, as Catalin mentioned, we'll have to do some cache maintenance in
> > the guest in any case.
> 
> This would also mean that we will have to change Guest bootloader for
> running as Guest under KVM ARM64.

Yes.

> In x86 world, everything that can run natively also runs as Guest OS
> even if the Guest has pass-through devices.

I guess on x86 the I/O is also coherent, in which case we could use the
DC bit.

-- 
Catalin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-11 15:44                           ` Catalin Marinas
@ 2013-10-12 18:24                             ` Anup Patel
  2013-10-15 14:38                               ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-12 18:24 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Catalin,

On Fri, Oct 11, 2013 at 9:14 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Fri, Oct 11, 2013 at 04:32:48PM +0100, Anup Patel wrote:
>> On Fri, Oct 11, 2013 at 8:29 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> > On 11/10/13 15:50, Anup Patel wrote:
>> >> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
>> >> <catalin.marinas@arm.com> wrote:
>> >>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
>> >>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
>> >>>> <catalin.marinas@arm.com> wrote:
>> >>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>> >>>>>> Coming back to where we started, the actual problem was that when
>> >>>>>> Guest starts booting it sees wrong contents because it is runs with
>> >>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
>> >>>>>
>> >>>>> That's one of the problems and I think the easiest to solve. Note that
>> >>>>> contents could still be in the L1/L2 (inner) cache since whole cache
>> >>>>> flushing by set/way isn't guaranteed in an MP context.
>> >>>>>
>> >>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
>> >>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
>> >>>>>
>> >>>>> Flushing the entire guest RAM is not possible by set/way
>> >>>>> (architecturally) and not efficient by VA (though some benchmark would
>> >>>>> be good). Marc's patch defers this flushing when a page is faulted in
>> >>>>> (at stage 2) and I think it covers the initial boot.
>> >>>>>
>> >>>>>> OR
>> >>>>>> We can also have KVM API using which user space can flush portions
>> >>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
>> >>>>>> from Marc Z initially)
>> >>>>>
>> >>>>> This may not be enough. It indeed flushes the kernel image that gets
>> >>>>> loaded but the kernel would write other pages (bss, page tables etc.)
>> >>>>> with MMU disabled and those addresses may contain dirty cache lines that
>> >>>>> have not been covered by the initial kvmtool flush. So you basically
>> >>>>> need all guest non-cacheable accesses to be flushed.
>> >>>>>
>> >>>>> The other problems are the cacheable aliases that I mentioned, so even
>> >>>>> though the guest does non-cacheable accesses with the MMU off, the
>> >>>>> hardware can still allocate into the cache via the other mappings. In
>> >>>>> this case the guest needs to invalidate the areas of memory that it
>> >>>>> wrote with caches off (or just use the DC bit to force memory accesses
>> >>>>> with MMU off to be cacheable).
>> >>>>
>> >>>> Having looked at all the approaches, I would vote for the approach taken
>> >>>> by this patch.
>> >>>
>> >>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
>> >>> would solve your initial problem and a few others.
>> >>
>> >> DC bit might solve the initial problem but it can be problematic because
>> >> setting DC bit would mean Guest would have Caching ON even when Guest
>> >> MMU is disabled. This will be more problematic if Guest is running a
>> >> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
>> >> DMA-capable device and we will have to change the bootloader in this
>> >> case and put explicit flushes in bootloader for running inside Guest.
>> >
>> > Well, as Catalin mentioned, we'll have to do some cache maintenance in
>> > the guest in any case.
>>
>> This would also mean that we will have to change Guest bootloader for
>> running as Guest under KVM ARM64.
>
> Yes.
>
>> In x86 world, everything that can run natively also runs as Guest OS
>> even if the Guest has pass-through devices.
>
> I guess on x86 the I/O is also coherent, in which case we could use the
> DC bit.

We need to go ahead with some approach, so if you are inclined towards
the DC bit approach then let us go with that.

For the DC bit approach, I only request that we document somewhere the
limitation (or corner case) of a Guest bootloader accessing a
DMA-capable device.

>
> --
> Catalin

--
Anup

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-12 18:24                             ` Anup Patel
@ 2013-10-15 14:38                               ` Catalin Marinas
  2013-10-17  4:19                                 ` Anup Patel
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-15 14:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, Oct 12, 2013 at 07:24:17PM +0100, Anup Patel wrote:
> On Fri, Oct 11, 2013 at 9:14 PM, Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > On Fri, Oct 11, 2013 at 04:32:48PM +0100, Anup Patel wrote:
> >> On Fri, Oct 11, 2013 at 8:29 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
> >> > On 11/10/13 15:50, Anup Patel wrote:
> >> >> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
> >> >> <catalin.marinas@arm.com> wrote:
> >> >>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
> >> >>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
> >> >>>> <catalin.marinas@arm.com> wrote:
> >> >>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
> >> >>>>>> Coming back to where we started, the actual problem was that when
> >> >>>>>> Guest starts booting it sees wrong contents because it is runs with
> >> >>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
> >> >>>>>
> >> >>>>> That's one of the problems and I think the easiest to solve. Note that
> >> >>>>> contents could still be in the L1/L2 (inner) cache since whole cache
> >> >>>>> flushing by set/way isn't guaranteed in an MP context.
> >> >>>>>
> >> >>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
> >> >>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
> >> >>>>>
> >> >>>>> Flushing the entire guest RAM is not possible by set/way
> >> >>>>> (architecturally) and not efficient by VA (though some benchmark would
> >> >>>>> be good). Marc's patch defers this flushing when a page is faulted in
> >> >>>>> (at stage 2) and I think it covers the initial boot.
> >> >>>>>
> >> >>>>>> OR
> >> >>>>>> We can also have KVM API using which user space can flush portions
> >> >>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
> >> >>>>>> from Marc Z initially)
> >> >>>>>
> >> >>>>> This may not be enough. It indeed flushes the kernel image that gets
> >> >>>>> loaded but the kernel would write other pages (bss, page tables etc.)
> >> >>>>> with MMU disabled and those addresses may contain dirty cache lines that
> >> >>>>> have not been covered by the initial kvmtool flush. So you basically
> >> >>>>> need all guest non-cacheable accesses to be flushed.
> >> >>>>>
> >> >>>>> The other problems are the cacheable aliases that I mentioned, so even
> >> >>>>> though the guest does non-cacheable accesses with the MMU off, the
> >> >>>>> hardware can still allocate into the cache via the other mappings. In
> >> >>>>> this case the guest needs to invalidate the areas of memory that it
> >> >>>>> wrote with caches off (or just use the DC bit to force memory accesses
> >> >>>>> with MMU off to be cacheable).
> >> >>>>
> >> >>>> Having looked at all the approaches, I would vote for the approach taken
> >> >>>> by this patch.
> >> >>>
> >> >>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
> >> >>> would solve your initial problem and a few others.
> >> >>
> >> >> DC bit might solve the initial problem but it can be problematic because
> >> >> setting DC bit would mean Guest would have Caching ON even when Guest
> >> >> MMU is disabled. This will be more problematic if Guest is running a
> >> >> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
> >> >> DMA-capable device and we will have to change the bootloader in this
> >> >> case and put explicit flushes in bootloader for running inside Guest.
> >> >
> >> > Well, as Catalin mentioned, we'll have to do some cache maintenance in
> >> > the guest in any case.
> >>
> >> This would also mean that we will have to change Guest bootloader for
> >> running as Guest under KVM ARM64.
> >
> > Yes.
> >
> >> In x86 world, everything that can run natively also runs as Guest OS
> >> even if the Guest has pass-through devices.
> >
> > I guess on x86 the I/O is also coherent, in which case we could use the
> > DC bit.
> 
> We need to go ahead with some approach hence, if you are inclined
> towards DC bit approach then let us go with that approach.
> 
> For the DC bit approach, I request to document somewhere the limitation
> (or corner case) about Guest bootloader accessing DMA-capable device.

I think we could avoid the DC bit but still use some part of that patch.
Another problem with DC is run-time code patching while the guest MMU is
disabled, where the guest may not do any D-cache maintenance.

So, the proposal:

1. Clean+invalidate D-cache for pages mapped into the stage 2 for the
   first time (if the access is non-cacheable). Covered by this patch.
2. Track guest's use of the MMU registers (SCTLR etc.) and detect when
   the stage 1 is enabled. When stage 1 is enabled, clean+invalidate the
   D-cache again for the all pages already mapped in stage 2 (in case we
   had speculative loads).

The above allows the guest OS to run the boot code with the MMU disabled
and then enable the MMU. If the guest needs to disable the MMU or caches
after boot, we either ask the guest for a Hyp call or we extend point 2
above to detect the disabling (though that's not very efficient). Guest
power management via PSCI already implies Hyp calls; this is more for
kexec, where you have a soft reset.
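
For point 2, the shape of the trap handling could be roughly as in the
sketch below. This is only an illustration: trapping the register write
would rely on HCR_EL2.TVM, and vcpu_read_sctlr(), vcpu_write_sctlr(),
stage2_flush_vm() and vcpu_clear_hcr_tvm() are made-up names, not
existing KVM symbols.

/*
 * Sketch only: called when a trapped guest write to SCTLR_EL1 is
 * emulated (trapping enabled via HCR_EL2.TVM). All helpers below are
 * hypothetical.
 */
static void handle_sctlr_el1_write(struct kvm_vcpu *vcpu, u64 new_sctlr)
{
	bool was_on = vcpu_read_sctlr(vcpu) & 1;	/* SCTLR_EL1.M is bit 0 */
	bool now_on = new_sctlr & 1;

	vcpu_write_sctlr(vcpu, new_sctlr);

	if (!was_on && now_on) {
		/*
		 * Stage 1 has just been enabled: clean+invalidate every
		 * page already mapped at stage 2, in case speculative
		 * cacheable loads happened via the host alias while the
		 * guest MMU was off.
		 */
		stage2_flush_vm(vcpu->kvm);

		/* The trap is no longer needed for this VM. */
		vcpu_clear_hcr_tvm(vcpu);
	}
}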

This only needs to be done for the primary CPU (or until the first CPU
has enabled the MMU). Once a CPU has enabled its MMU, the others will
have to cope with speculative loads into the cache anyway (if secondary
VCPUs are started by a PSCI HVC call, we can probably ignore the
trapping of MMU register accesses anyway).

Note that we don't cover the I-cache. On ARMv8 you can get speculative
loads into the I-cache even if it is disabled, so it needs to be
invalidated explicitly before the MMU or the I-cache is enabled.

Comments?

-- 
Catalin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-15 14:38                               ` Catalin Marinas
@ 2013-10-17  4:19                                 ` Anup Patel
  2013-10-17 11:16                                   ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Anup Patel @ 2013-10-17  4:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 15, 2013 at 8:08 PM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Sat, Oct 12, 2013 at 07:24:17PM +0100, Anup Patel wrote:
>> On Fri, Oct 11, 2013 at 9:14 PM, Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>> > On Fri, Oct 11, 2013 at 04:32:48PM +0100, Anup Patel wrote:
>> >> On Fri, Oct 11, 2013 at 8:29 PM, Marc Zyngier <marc.zyngier@arm.com> wrote:
>> >> > On 11/10/13 15:50, Anup Patel wrote:
>> >> >> On Fri, Oct 11, 2013 at 8:07 PM, Catalin Marinas
>> >> >> <catalin.marinas@arm.com> wrote:
>> >> >>> On Fri, Oct 11, 2013 at 03:27:16PM +0100, Anup Patel wrote:
>> >> >>>> On Fri, Oct 11, 2013 at 6:08 PM, Catalin Marinas
>> >> >>>> <catalin.marinas@arm.com> wrote:
>> >> >>>>> On Thu, Oct 10, 2013 at 05:09:03PM +0100, Anup Patel wrote:
>> >> >>>>>> Coming back to where we started, the actual problem was that when
>> >> >>>>>> Guest starts booting it sees wrong contents because it is runs with
>> >> >>>>>> MMU disable and correct contents are still in external L3 cache of X-Gene.
>> >> >>>>>
>> >> >>>>> That's one of the problems and I think the easiest to solve. Note that
>> >> >>>>> contents could still be in the L1/L2 (inner) cache since whole cache
>> >> >>>>> flushing by set/way isn't guaranteed in an MP context.
>> >> >>>>>
>> >> >>>>>> How about reconsidering the approach of flushing Guest RAM (entire or
>> >> >>>>>> portion of it) to PoC by VA once before the first run of a VCPU ?
>> >> >>>>>
>> >> >>>>> Flushing the entire guest RAM is not possible by set/way
>> >> >>>>> (architecturally) and not efficient by VA (though some benchmark would
>> >> >>>>> be good). Marc's patch defers this flushing when a page is faulted in
>> >> >>>>> (at stage 2) and I think it covers the initial boot.
>> >> >>>>>
>> >> >>>>>> OR
>> >> >>>>>> We can also have KVM API using which user space can flush portions
>> >> >>>>>> of Guest RAM before running the VCPU. (I think this was a suggestion
>> >> >>>>>> from Marc Z initially)
>> >> >>>>>
>> >> >>>>> This may not be enough. It indeed flushes the kernel image that gets
>> >> >>>>> loaded but the kernel would write other pages (bss, page tables etc.)
>> >> >>>>> with MMU disabled and those addresses may contain dirty cache lines that
>> >> >>>>> have not been covered by the initial kvmtool flush. So you basically
>> >> >>>>> need all guest non-cacheable accesses to be flushed.
>> >> >>>>>
>> >> >>>>> The other problems are the cacheable aliases that I mentioned, so even
>> >> >>>>> though the guest does non-cacheable accesses with the MMU off, the
>> >> >>>>> hardware can still allocate into the cache via the other mappings. In
>> >> >>>>> this case the guest needs to invalidate the areas of memory that it
>> >> >>>>> wrote with caches off (or just use the DC bit to force memory accesses
>> >> >>>>> with MMU off to be cacheable).
>> >> >>>>
>> >> >>>> Having looked at all the approaches, I would vote for the approach taken
>> >> >>>> by this patch.
>> >> >>>
>> >> >>> But this patch alone doesn't solve the other issues. OTOH, the DC bit
>> >> >>> would solve your initial problem and a few others.
>> >> >>
>> >> >> DC bit might solve the initial problem but it can be problematic because
>> >> >> setting DC bit would mean Guest would have Caching ON even when Guest
>> >> >> MMU is disabled. This will be more problematic if Guest is running a
>> >> >> bootloader (uboot, grub, UEFI, ..) which does pass-through access to a
>> >> >> DMA-capable device and we will have to change the bootloader in this
>> >> >> case and put explicit flushes in bootloader for running inside Guest.
>> >> >
>> >> > Well, as Catalin mentioned, we'll have to do some cache maintenance in
>> >> > the guest in any case.
>> >>
>> >> This would also mean that we will have to change Guest bootloader for
>> >> running as Guest under KVM ARM64.
>> >
>> > Yes.
>> >
>> >> In x86 world, everything that can run natively also runs as Guest OS
>> >> even if the Guest has pass-through devices.
>> >
>> > I guess on x86 the I/O is also coherent, in which case we could use the
>> > DC bit.
>>
>> We need to go ahead with some approach hence, if you are inclined
>> towards DC bit approach then let us go with that approach.
>>
>> For the DC bit approach, I request to document somewhere the limitation
>> (or corner case) about Guest bootloader accessing DMA-capable device.
>
> I think we could avoid the DC bit but still use some part of that patch.
> Another problem with DC is run-time code patching where the guest MMU is
> disabled, the guest may not do D-cache maintenance.
>
> So, the proposal:
>
> 1. Clean+invalidate D-cache for pages mapped into the stage 2 for the
>    first time (if the access is non-cacheable). Covered by this patch.
> 2. Track guest's use of the MMU registers (SCTLR etc.) and detect when
>    the stage 1 is enabled. When stage 1 is enabled, clean+invalidate the
>    D-cache again for the all pages already mapped in stage 2 (in case we
>    had speculative loads).

I agree on both point 1 & point 2.

Point 2 is for avoiding speculative cache loads from the Host-side
mappings of the Guest RAM. Right?

>
> The above allow the guest OS to run the boot code with MMU disabled and
> then enable the MMU. If the guest needs to disable the MMU or caches
> after boot, we either ask the guest for a Hyp call or we extend point 2
> above to detect disabling (though that's not very efficient). Guest
> power management via PSCI already implies Hyp calls, it's more for kexec
> where you have a soft reset.

Yes, the Guest disabling the MMU after boot could be problematic.

The Hyp call (or PSCI call) approach can certainly be efficient, but we
need to change the Guest OS for this. On the other hand, extending
point 2 (though inefficient) could save us the pain of changing the
Guest OS.

Can we come up with a way of avoiding the Hyp call (or PSCI call) here?

>
> This only needs to be done for the primary CPU (or until the first CPU
> enabled the MMU). Once a CPU enabled its MMU, the others will have
> to cope with speculative loads into the cache anyway (if secondary VCPU
> are started by a PSCI HVC call, we can probably ignore the trapping of
> MMU register access anyway).

Also, this would be a nice way of reducing the clean+invalidate D-cache
operations on non-cacheable accesses for an SMP Guest (i.e. an
enhancement to this patch).

>
> Note that we don't cover the I-cache. On ARMv8 you can get speculative
> loads into the I-cache even if it is disabled, so it needs to be
> invalidated explicitly before the MMU or the I-cache is enabled.

I think it should be the responsibility of the Guest OS to invalidate
the I-cache before enabling the MMU or the I-cache. Right?

>
> Comments?

Sorry for the slight delay in replying.

>
> --
> Catalin

--
Anup

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-17  4:19                                 ` Anup Patel
@ 2013-10-17 11:16                                   ` Catalin Marinas
  2013-10-19 14:45                                     ` Christoffer Dall
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2013-10-17 11:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 17, 2013 at 05:19:01AM +0100, Anup Patel wrote:
> On Tue, Oct 15, 2013 at 8:08 PM, Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > So, the proposal:
> >
> > 1. Clean+invalidate D-cache for pages mapped into the stage 2 for the
> >    first time (if the access is non-cacheable). Covered by this patch.
> > 2. Track guest's use of the MMU registers (SCTLR etc.) and detect when
> >    the stage 1 is enabled. When stage 1 is enabled, clean+invalidate the
> >    D-cache again for the all pages already mapped in stage 2 (in case we
> >    had speculative loads).
> 
> I agree on both point1 & poin2.
> 
> The point2 is for avoiding speculative cache loads for Host-side mappings
> of the Guest RAM. Right?

Yes.

> > The above allow the guest OS to run the boot code with MMU disabled and
> > then enable the MMU. If the guest needs to disable the MMU or caches
> > after boot, we either ask the guest for a Hyp call or we extend point 2
> > above to detect disabling (though that's not very efficient). Guest
> > power management via PSCI already implies Hyp calls, it's more for kexec
> > where you have a soft reset.
> 
> Yes, Guest disabling MMU after boot could be problematic.
> 
> The Hyp call (or PSCI call) approach can certainly be efficient but we need
> to change Guest OS for this. On the other hand, extending point2 (though
> inefficient) could save us the pain of changing Guest OS.
> 
> Can we come-up with a way of avoiding Hyp call (or PSCI call) here ?

It could probably be done more efficiently by decoding the MMU register
access fault at the EL2 level and emulating it there to avoid a switch
to host Linux. But that's not a trivial task and I can't say what the
performance impact would be.

We still have an issue here since the guest normally disables the caches
and flushes them by set/way (usually on the last standing CPU, the
others being parked via PSCI). Such a guest set/way operation isn't safe
while other physical CPUs are up, even if you trap it in Hyp (unless you
do it via other complications like stop_machine(), but even that may not
be entirely race-free and it opens the way for DoS attacks). The safest
option here would be to do the cache maintenance by VA for the whole
guest address space (probably fine if you do it in a preemptible way).
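
For the by-VA option, the loop itself could be as simple as the sketch
below (illustrative only; the caller would derive the host VA ranges
from the memslots, and the assumption here is that __flush_dcache_area()
is used as the arm64 clean+invalidate-to-PoC routine):

#include <linux/sched.h>	/* cond_resched() */
#include <asm/cacheflush.h>	/* __flush_dcache_area() */
#include <asm/page.h>		/* PAGE_SIZE */

/*
 * Sketch only: clean+invalidate to PoC, by VA, one page at a time,
 * giving the scheduler a chance to preempt us between pages.
 */
static void flush_guest_range_by_va(unsigned long hva, unsigned long size)
{
	unsigned long end = hva + size;

	for (; hva < end; hva += PAGE_SIZE) {
		__flush_dcache_area((void *)hva, PAGE_SIZE);
		cond_resched();
	}
}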

> > This only needs to be done for the primary CPU (or until the first CPU
> > enabled the MMU). Once a CPU enabled its MMU, the others will have
> > to cope with speculative loads into the cache anyway (if secondary VCPU
> > are started by a PSCI HVC call, we can probably ignore the trapping of
> > MMU register access anyway).
> 
> Also, this would be a nice way of reducing Clean-invalidate D-cache upon
> non-cacheable accesses for SMP Guest (i.e. An enhancement to this patch).

I think this should be fine, just do the clean&invalidate when the MMU
is off on all the VCPUs. Once one of them enabled the MMU, fall back to
the faster implementation. The same for the trapping above (unless we
later want to deal with MMU being turned off).

> > Note that we don't cover the I-cache. On ARMv8 you can get speculative
> > loads into the I-cache even if it is disabled, so it needs to be
> > invalidated explicitly before the MMU or the I-cache is enabled.
> 
> I think it should be responsibility of Guest OS to invalidate I-cache before
> enabling MMU or I-cache enable. Right?

Yes.

-- 
Catalin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-17 11:16                                   ` Catalin Marinas
@ 2013-10-19 14:45                                     ` Christoffer Dall
  2013-10-20  9:06                                       ` Catalin Marinas
  0 siblings, 1 reply; 23+ messages in thread
From: Christoffer Dall @ 2013-10-19 14:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 17, 2013 at 12:16:02PM +0100, Catalin Marinas wrote:
> On Thu, Oct 17, 2013 at 05:19:01AM +0100, Anup Patel wrote:
> > On Tue, Oct 15, 2013 at 8:08 PM, Catalin Marinas
> > <catalin.marinas@arm.com> wrote:
> > > So, the proposal:
> > >
> > > 1. Clean+invalidate D-cache for pages mapped into the stage 2 for the
> > >    first time (if the access is non-cacheable). Covered by this patch.
> > > 2. Track guest's use of the MMU registers (SCTLR etc.) and detect when
> > >    the stage 1 is enabled. When stage 1 is enabled, clean+invalidate the
> > >    D-cache again for the all pages already mapped in stage 2 (in case we
> > >    had speculative loads).
> > 
> > I agree on both point1 & poin2.
> > 
> > The point2 is for avoiding speculative cache loads for Host-side mappings
> > of the Guest RAM. Right?
> 
> Yes.
> 

I'm having a hard time imagining a scenario where (2) is needed. Can you
give me a concrete example of a situation that we're addressing here?

> > > The above allow the guest OS to run the boot code with MMU disabled and
> > > then enable the MMU. If the guest needs to disable the MMU or caches
> > > after boot, we either ask the guest for a Hyp call or we extend point 2
> > > above to detect disabling (though that's not very efficient). Guest
> > > power management via PSCI already implies Hyp calls, it's more for kexec
> > > where you have a soft reset.

Why would we need to do anything if the guest disables the MMU?  Isn't it
completely the responsibility of the guest to flush whatever it needs to
physical memory before doing so?

> > 
> > Yes, Guest disabling MMU after boot could be problematic.
> > 
> > The Hyp call (or PSCI call) approach can certainly be efficient but we need
> > to change Guest OS for this. On the other hand, extending point2 (though
> > inefficient) could save us the pain of changing Guest OS.
> > 
> > Can we come-up with a way of avoiding Hyp call (or PSCI call) here ?
> 
> It could probably be done more efficiently by decoding the MMU register
> access fault at the EL2 level and emulating it there to avoid a switch
> to host Linux. But that's not a trivial task and I can't tell about the
> performance impact.

That would mean trapping to Hyp mode on every context switch, for
example, probably multiple times, and that sounds horrible.  Yes, this
should be isolated to EL2, but even then it will add overhead.

We could measure this, but it sounds like something that would hurt the
system significantly in both performance and complexity just to solve an
extremely rare situation.

A Hyp call sounds equally icky and quite different from PSCI imho, since
PSCI is used on native systems and supported by a "standard", so we're
not doing paravirtualization there.

> 
> We still have an issue here since normally the guest disables the caches
> and flushes them by set/way (usually on the last standing CPU, the
> others being parked via PSCI). Such guest set/way operation isn't safe
> when physical CPUs are up, even if you trap it in Hyp (unless you do it
> via other complications like stop_machine() but even that may not be
> entirely race-free and it opens the way for DoS attacks). The safest
> here would be to do the cache maintenance by VA for all the guest
> address space (probably fine if you do it in a preemptible way).
> 

This should be handled properly already (see access_dcsw in
arch/arm/kvm/coproc.c) or am I missing something?

With respect to DoS, this would eventually be managed by the host
scheduler, which would detect that these processes are spending too much
time on the CPU (doing cache flushes or other things) and reschedule
another process, so I don't quite see the DoS happening.

Sounds like I'm missing some subtle aspect of the architecture here, but
let's see if we can't make KVM resilient enough.

> > > This only needs to be done for the primary CPU (or until the first CPU
> > > enabled the MMU). Once a CPU enabled its MMU, the others will have
> > > to cope with speculative loads into the cache anyway (if secondary VCPU
> > > are started by a PSCI HVC call, we can probably ignore the trapping of
> > > MMU register access anyway).
> > 
> > Also, this would be a nice way of reducing Clean-invalidate D-cache upon
> > non-cacheable accesses for SMP Guest (i.e. An enhancement to this patch).
> 
> I think this should be fine, just do the clean&invalidate when the MMU
> is off on all the VCPUs. Once one of them enabled the MMU, fall back to
> the faster implementation. The same for the trapping above (unless we
> later want to deal with MMU being turned off).
> 
> > > Note that we don't cover the I-cache. On ARMv8 you can get speculative
> > > loads into the I-cache even if it is disabled, so it needs to be
> > > invalidated explicitly before the MMU or the I-cache is enabled.
> > 
> > I think it should be responsibility of Guest OS to invalidate I-cache before
> > enabling MMU or I-cache enable. Right?
> 
> Yes.
> 
> -- 
> Catalin

-- 
Christoffer

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault
  2013-10-19 14:45                                     ` Christoffer Dall
@ 2013-10-20  9:06                                       ` Catalin Marinas
  0 siblings, 0 replies; 23+ messages in thread
From: Catalin Marinas @ 2013-10-20  9:06 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 Oct 2013, at 15:45, Christoffer Dall <christoffer.dall@linaro.org> wrote:
> On Thu, Oct 17, 2013 at 12:16:02PM +0100, Catalin Marinas wrote:
>> On Thu, Oct 17, 2013 at 05:19:01AM +0100, Anup Patel wrote:
>>> On Tue, Oct 15, 2013 at 8:08 PM, Catalin Marinas
>>> <catalin.marinas@arm.com> wrote:
>>>> So, the proposal:
>>>> 
>>>> 1. Clean+invalidate D-cache for pages mapped into the stage 2 for the
>>>>   first time (if the access is non-cacheable). Covered by this patch.
>>>> 2. Track guest's use of the MMU registers (SCTLR etc.) and detect when
>>>>   the stage 1 is enabled. When stage 1 is enabled, clean+invalidate the
>>>>   D-cache again for the all pages already mapped in stage 2 (in case we
>>>>   had speculative loads).
>>> 
>>> I agree on both point 1 & point 2.
>>> 
>>> Point 2 is for avoiding speculative cache loads for Host-side mappings
>>> of the Guest RAM. Right?
>> 
>> Yes.
> 
> I'm having a hard time imagining a scenario where (2) is needed, can you
> give me a concrete example of a situation that we're addressing here?

As Anup said, to avoid speculative cache loads from the host-side
mappings of the guest RAM.  Host Linux has a cacheable mapping of the
RAM while the guest assumes it is non-cacheable.  Concrete example:

a) Guest starts populating the page table in head.S
b) Corresponding stage 2 page is faulted in, the caches are cleaned and
   invalidated (as per point 1 above)
c) Mid-way through, it is preempted and a switch to host occurs
d) CPU loads the cache speculatively for the page mapped at point (b)
   (since the host has a cacheable mapping of that page already)
e) The guest continues the page table population via non-cacheable
   accesses (at this point, RAM and cache for this page differ)
f) Guest enables the MMU with cacheable page table walks
g) The MMU (stage 1) sees stale data in the cache for the page table
   (because of (d))

Also note that even if you don't get a preemption as per point (c),
another physical CPU can do the speculative cache load and, because of
the snooping, the situation at point (f) is the same.

On a host OS, point (d) does not happen when the MMU is off on all the
CPUs.  For a guest, however, having the MMU off on all the VCPUs is not
enough, because the host still has its own cacheable mapping of the same
pages.
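
To make point 2 a bit more concrete, here is roughly the shape I have in
mind for the trapping side.  This is just a sketch with made-up helper
names (vcpu_read/write_sctlr(), stage2_flush_guest()), not tested and not
existing KVM code; it only assumes that HCR_EL2.TVM is set so that guest
writes to the VM control registers trap to the host:

#include <linux/kvm_host.h>

/* Sketch only: hypothetical helpers, not the actual KVM implementation. */
static bool vcpu_has_mmu_and_caches_on(struct kvm_vcpu *vcpu)
{
	u64 sctlr = vcpu_read_sctlr(vcpu);	/* hypothetical accessor */

	/* SCTLR_EL1.M (bit 0) and SCTLR_EL1.C (bit 2) */
	return (sctlr & (1 << 0)) && (sctlr & (1 << 2));
}

static void handle_sctlr_write(struct kvm_vcpu *vcpu, u64 new_val)
{
	bool was_on = vcpu_has_mmu_and_caches_on(vcpu);

	vcpu_write_sctlr(vcpu, new_val);	/* hypothetical accessor */

	/*
	 * Transition from "MMU/caches off" to "on": clean+invalidate
	 * everything already mapped at stage 2, in case speculative
	 * loads (point d above) polluted the cache while the guest was
	 * running uncached.
	 */
	if (!was_on && vcpu_has_mmu_and_caches_on(vcpu))
		stage2_flush_guest(vcpu->kvm);	/* hypothetical by-VA walk */
}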

>>>> The above allow the guest OS to run the boot code with MMU disabled and
>>>> then enable the MMU. If the guest needs to disable the MMU or caches
>>>> after boot, we either ask the guest for a Hyp call or we extend point 2
>>>> above to detect disabling (though that's not very efficient). Guest
>>>> power management via PSCI already implies Hyp calls, it's more for kexec
>>>> where you have a soft reset.
> 
> Why would we need to do anything if the guest disables the MMU?  Isn't it
> completely the responsibility of the guest to flush whatever it needs in
> physical memory before doing so?

It's not necessarily about what the guest does with the caches but
whether its assumptions after disabling the MMU are still valid.  As per
point 2 above, we need to track again whether the guest simply assumes
MMU and caches are off.

Regarding what the guest does with the caches, it most likely performs a
full cache flush by set/way which is not guaranteed to work unless the
caches are disabled on all the physical CPUs (no snooping).

>>> Yes, Guest disabling MMU after boot could be problematic.
>>> 
>>> The Hyp call (or PSCI call) approach can certainly be efficient but we need
>>> to change the Guest OS for this. On the other hand, extending point 2 (though
>>> inefficient) could save us the pain of changing Guest OS.
>>> 
>>> Can we come up with a way of avoiding a Hyp call (or PSCI call) here?
>> 
>> It could probably be done more efficiently by decoding the MMU register
>> access fault at the EL2 level and emulating it there to avoid a switch
>> to host Linux. But that's not a trivial task and I can't tell about the
>> performance impact.
> 
> That would be trapping to Hyp mode on every context switch for example,
> multiple times probably, and that sounds horrible.  Yes, this should be
> isolated to EL2, but even then this will add overhead.

There is an overhead and I'm not proposing that we do this now.  But as
soon as you have some bootloader or UEFI running before Linux, you
already have some more MMU enable/disable events.

> We could measure this though, but it sounds like something that will
> hurt the system significantly overall in both performance and complexity
> to solve an extremely rare situation.

It may not be so rare if the window from MMU disabling to re-enabling it
is extended.  But as long as you run Linux as the first thing in a guest
and you ignore kexec, it should be OK.

> A Hyp call sounds equally icky and quite different from PSCI imho, since
> PSCI is used on native systems and supported by a "standard", so we're not
> doing paravirtualization there.

It is different from PSCI indeed, so we either change the guest
assumptions about caches or we add some form of detection into EL2.  I
agree, it looks like paravirtualisation.

Another option could be to start trapping the MMU reg accesses only if
you detect a full cache flush by set/way where all the VCPUs either have
the MMU or caches disabled.  That's a typical MMU off scenario in OS
(kexec) and boot loaders.
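
Very roughly, and again with made-up helpers (all_vcpus_mmu_or_caches_off()
and the per-vcpu hcr_el2 copy are assumptions, not existing code), the idea
would be:

#include <linux/kvm_host.h>
#include <asm/kvm_arm.h>	/* HCR_TVM */

/*
 * Set/way cache ops already trap to the host, so use them as the hint
 * that the guest is about to turn its MMU off (kexec, soft reset, boot
 * loader hand-over).  Only then start trapping the VM control registers
 * via HCR_EL2.TVM, so the common "MMU on" case pays nothing.
 */
static void handle_set_way_op(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu *v;
	int i;

	if (!all_vcpus_mmu_or_caches_off(vcpu->kvm))	/* hypothetical */
		return;

	/* Looks like an OS-level "MMU off" sequence: catch the re-enable. */
	kvm_for_each_vcpu(i, v, vcpu->kvm)
		v->arch.hcr_el2 |= HCR_TVM;	/* assumed per-vcpu HCR copy */
}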

>> We still have an issue here since normally the guest disables the caches
>> and flushes them by set/way (usually on the last standing CPU, the
>> others being parked via PSCI). Such guest set/way operation isn't safe
>> when physical CPUs are up, even if you trap it in Hyp (unless you do it
>> via other complications like stop_machine() but even that may not be
>> entirely race-free and it opens the way for DoS attacks). The safest
>> here would be to do the cache maintenance by VA for all the guest
>> address space (probably fine if you do it in a preemptible way).
> 
> This should be handled properly already (see access_dcsw in
> arch/arm/kvm/coproc.c) or am I missing something?

DC CISW and friends only work when the physical caches are off on all
CPUs.  Otherwise you either get cache lines migrating between levels or
across to other CPUs.  On arm64 I plan to remove such calls and probably
only keep them for kexec when only one CPU is active.

From the ARMv8 ARM (same as on ARMv7), page D4-1690:

  The cache maintenance instructions by set/way can clean or invalidate,
  or both, the entirety of one or more levels of cache attached to a
  processing element.  However, unless all processing elements attached
  to the caches regard all memory locations as Non-cacheable, it is not
  possible to prevent locations being allocated into the cache during
  such a sequence of the cache maintenance instructions.

  ----- Note -----
  In multi-processing environments, the cache maintenance instructions
  that operate by set/way are not broadcast within the shareability
  domains, and so allocations can occur from other, unmaintained,
  locations, in caches in other locations.  For this reason, the use of
  cache maintenance instructions that operate by set/way for the
  maintenance of large buffers of memory is not recommended in the
  architectural sequence.  The expected usage of the cache maintenance
  instructions that operate by set/way is associated with the cache
  maintenance instructions associated with the powerdown and powerup of
  caches, if this is required by the implementation.
  ----------------

None of the above requirements are met in a virtual environment.
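
So for a guest the only safe equivalent is maintenance by VA over the
memory actually mapped into the VM, something along these lines (sketch
only; it glosses over pinning the pages and simply assumes the host's
cacheable alias of each memslot page is addressable here):

#include <linux/kvm_host.h>
#include <linux/sched.h>
#include <asm/cacheflush.h>

/*
 * Sketch: clean+invalidate all guest RAM by VA instead of by set/way,
 * walking the memslots and using the host's mapping of each page.
 * cond_resched() keeps it preemptible so it can run lazily from a trap
 * handler.  Error handling and page pinning are omitted.
 */
static void flush_guest_ram_by_va(struct kvm *kvm)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot;
	unsigned long i;

	kvm_for_each_memslot(memslot, slots) {
		for (i = 0; i < memslot->npages; i++) {
			void *va = (void *)(memslot->userspace_addr +
					    i * PAGE_SIZE);

			/* clean+invalidate to PoC */
			__flush_dcache_area(va, PAGE_SIZE);
			cond_resched();
		}
	}
}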

Catalin

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2013-10-20  9:06 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-10  9:51 [RFC PATCH] arm64: KVM: honor cacheability attributes on S2 page fault Marc Zyngier
2013-09-11 12:21 ` Anup Patel
2013-09-11 12:35   ` Marc Zyngier
2013-09-11 19:38     ` Christoffer Dall
2013-10-10  4:51       ` Anup Patel
2013-10-10  8:39         ` Marc Zyngier
2013-10-10 11:24           ` Catalin Marinas
2013-10-10 16:09             ` Anup Patel
2013-10-11 12:38               ` Catalin Marinas
2013-10-11 14:27                 ` Anup Patel
2013-10-11 14:37                   ` Catalin Marinas
2013-10-11 14:50                     ` Anup Patel
2013-10-11 14:59                       ` Marc Zyngier
2013-10-11 15:32                         ` Anup Patel
2013-10-11 15:44                           ` Catalin Marinas
2013-10-12 18:24                             ` Anup Patel
2013-10-15 14:38                               ` Catalin Marinas
2013-10-17  4:19                                 ` Anup Patel
2013-10-17 11:16                                   ` Catalin Marinas
2013-10-19 14:45                                     ` Christoffer Dall
2013-10-20  9:06                                       ` Catalin Marinas
2013-09-11 18:06 ` Peter Maydell
2013-09-11 19:25   ` Christoffer Dall
