All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
@ 2022-03-09  1:23 Fabiano Rosas
  2022-03-09  1:24 ` [RFC PATCH 2/2] linux-headers: Add KVM_CAP_PPC_GTSE Fabiano Rosas
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Fabiano Rosas @ 2022-03-09  1:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: aneesh.kumar, danielhb413, npiggin, qemu-ppc, clg, david

QEMU reports MMU support to the guest via the ibm,architecture-vec-5
property of the /chosen node. Byte number 26 specifies Radix Table
Expansions, currently only GTSE (Guest Translation Shootdown
Enable). This feature determines whether the tlbie instruction (and
others) are HV privileged.

Up until now, we always reported GTSE=1 to guests. Even after the
support for GTSE=0 was added. As part of that support, a kernel
command line radix_hcall_invalidate=on was introduced that overrides
the GTSE value received via CAS. So a guest can run with GTSE=0 and
use the H_RPT_INVALIDATE hcall instead of tlbie.

In this scenario, having GTSE always set to 1 by QEMU leads to a crash
when running nested KVM guests because KVM does not allow a nested
hypervisor to set GTSE support for its nested guests. So a nested
guest always uses the same value for LPCR_GTSE as its HV. Since the
nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
into a crash when:

L1 LPCR_GTSE=0
L2 LPCR_GTSE=0
L2 CAS GTSE=1

The nested guest will run 'tlbie' and crash because the HW looks at
LPCR_GTSE, which is clear.

Having GTSE disabled in the L1 and enabled in the L2 is not an option
because the whole purpose of GTSE is to disallow access to tlbie and
we cannot allow L1 to spawn L2s that can access features that L1
itself cannot.

We also cannot have the guest check the LPCR bit, because LPCR is
HV-privileged.

So this patch goes through the most intuitive route which is to have
QEMU ask KVM about GTSE support and advertise the correct value to the
guest. A new KVM_CAP_PPC_GTSE capability is being added.

TCG continues to always enable GTSE.

Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
 hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
 target/ppc/kvm.c     |  8 ++++++++
 target/ppc/kvm_ppc.h |  6 ++++++
 3 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 4cc204f90d..3e95a1831f 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
         23, 0x00, /* XICS / XIVE mode */
         24, 0x00, /* Hash/Radix, filled in below. */
         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
-        26, 0x40, /* Radix options: GTSE == yes. */
+        26, 0x00, /* Radix options, filled in below. */
     };
 
     if (spapr->irq->xics && spapr->irq->xive) {
@@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
         } else {
             val[3] = 0x00; /* Hash */
         }
+
+        if (kvmppc_has_cap_gtse()) {
+            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
+        }
     } else {
         /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
         val[3] = 0xC0;
+        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
     }
+
     _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
                      val, sizeof(val)));
 }
@@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
     /* Init numa_assoc_array */
     spapr_numa_associativity_init(spapr, machine);
 
-    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
-        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
+    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                               spapr->max_compat_pvr)) {
-        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
-        /* KVM and TCG always allow GTSE with radix... */
-        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+
+        /* TCG always supports Radix w/ GTSE */
+        if (!kvm_enabled()) {
+            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
+            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+        } else {
+            if (kvmppc_has_cap_mmu_radix()) {
+                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
+            }
+
+            /*
+             * Only disable Guest Translation Shootdown if KVM
+             * supports the H_RPT_INVALIDATE hypercall, otherwise we'd
+             * leave the guest with no way to make TLB invalidations.
+             */
+            if (kvmppc_has_cap_rpt_invalidate()) {
+                if (kvmppc_has_cap_gtse()) {
+                    spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+                }
+            } else {
+                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+            }
+        }
     }
-    /* ... but not with hash (currently). */
 
     if (kvm_enabled()) {
         /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index dc93b99189..91582c4b15 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
 static int cap_large_decr;
 static int cap_fwnmi;
 static int cap_rpt_invalidate;
+static int cap_gtse;
 
 static uint32_t debug_inst_opcode;
 
@@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
     }
 
     cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
+    cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE);
+
     kvm_ppc_register_host_cpu_type();
 
     return 0;
@@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
     return cap_mmu_hash_v3;
 }
 
+bool kvmppc_has_cap_gtse(void)
+{
+    return cap_gtse;
+}
+
 static bool kvmppc_power8_host(void)
 {
     bool ret = false;
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index ee9325bf9a..7d6980edb7 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
 bool kvmppc_has_cap_htm(void);
 bool kvmppc_has_cap_mmu_radix(void);
 bool kvmppc_has_cap_mmu_hash_v3(void);
+bool kvmppc_has_cap_gtse(void);
 bool kvmppc_has_cap_xive(void);
 int kvmppc_get_cap_safe_cache(void);
 int kvmppc_get_cap_safe_bounds_check(void);
@@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
     return false;
 }
 
+static inline bool kvmppc_has_cap_gtse(void)
+{
+    return false;
+}
+
 static inline bool kvmppc_has_cap_xive(void)
 {
     return false;
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 2/2] linux-headers: Add KVM_CAP_PPC_GTSE
  2022-03-09  1:23 [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Fabiano Rosas
@ 2022-03-09  1:24 ` Fabiano Rosas
  2022-03-09 19:09 ` [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Daniel Henrique Barboza
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Fabiano Rosas @ 2022-03-09  1:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: aneesh.kumar, danielhb413, npiggin, qemu-ppc, clg, david

Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
This is here just to facilitate review/testing of the feature.
---
 linux-headers/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 00af3bc333..15f19fd958 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
 #define KVM_CAP_VM_GPA_BITS 207
 #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_PPC_GTSE 211
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-09  1:23 [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Fabiano Rosas
  2022-03-09  1:24 ` [RFC PATCH 2/2] linux-headers: Add KVM_CAP_PPC_GTSE Fabiano Rosas
@ 2022-03-09 19:09 ` Daniel Henrique Barboza
  2022-03-11 14:45   ` Fabiano Rosas
  2022-03-10 19:51 ` Fabiano Rosas
  2022-03-12  9:17 ` David Gibson
  3 siblings, 1 reply; 11+ messages in thread
From: Daniel Henrique Barboza @ 2022-03-09 19:09 UTC (permalink / raw)
  To: Fabiano Rosas, qemu-devel; +Cc: aneesh.kumar, qemu-ppc, clg, npiggin, david



On 3/8/22 22:23, Fabiano Rosas wrote:
> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
> property of the /chosen node. Byte number 26 specifies Radix Table
> Expansions, currently only GTSE (Guest Translation Shootdown
> Enable). This feature determines whether the tlbie instruction (and
> others) are HV privileged.
> 
> Up until now, we always reported GTSE=1 to guests. Even after the
> support for GTSE=0 was added. As part of that support, a kernel
> command line radix_hcall_invalidate=on was introduced that overrides
> the GTSE value received via CAS. So a guest can run with GTSE=0 and
> use the H_RPT_INVALIDATE hcall instead of tlbie.
> 
> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
> when running nested KVM guests because KVM does not allow a nested
> hypervisor to set GTSE support for its nested guests. So a nested
> guest always uses the same value for LPCR_GTSE as its HV. Since the
> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
> into a crash when:
> 
> L1 LPCR_GTSE=0
> L2 LPCR_GTSE=0
> L2 CAS GTSE=1
> 
> The nested guest will run 'tlbie' and crash because the HW looks at
> LPCR_GTSE, which is clear.
> 
> Having GTSE disabled in the L1 and enabled in the L2 is not an option
> because the whole purpose of GTSE is to disallow access to tlbie and
> we cannot allow L1 to spawn L2s that can access features that L1
> itself cannot.
> 
> We also cannot have the guest check the LPCR bit, because LPCR is
> HV-privileged.
> 
> So this patch goes through the most intuitive route which is to have
> QEMU ask KVM about GTSE support and advertise the correct value to the
> guest. A new KVM_CAP_PPC_GTSE capability is being added.
> 
> TCG continues to always enable GTSE.
> 
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---


I'm not sure if I fully understood the situation, so let me recap. Once upon a time,
QEMU advertised GTSE=1 and the host would never advertise other value, and everyone
was happy.

The host started to support GTSE=0, but QEMU kept advertising GTSE=1 regardless, and no
KVM GTSE cap was added to reflect the host support. I'll then assume that:


- all guests would break if running in a GTSE=0 host prior to rpt_invalidate support (which
is necessary to allow the guest to run in GTSE=0)

- apparently no one ever tried to run a KVM guest in a GTSE=0 host, so no bugs were opened

After commit 82123b756a1a2f1 (target/ppc: Support for H_RPT_INVALIDATE hcall) we added
cap-rpt-invalidate. I didn't follow the discussions of this cap but it seems like, as with
almost every other cap we have, there would be a migration problem for a guest that was in
a rpt_invalidate aware host to migrate to another where this wouldn't be true, and the cap
solves that.

What I'm not following is why, even after having cap-rpt-invalidate, we are still "lying"
about the GTSE=1 regardless of what the host supports. We could've added the GTSE KVM cap
at the same time rpt_invalidate was introduced, and guests that want to ignore this setting
can use the cap to bypass it.


In the end this patch is a needed fix IMHO. My confusion is why we're doing this just now.


The patch itself LGTM.


Thanks,


Daniel


>   hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
>   target/ppc/kvm.c     |  8 ++++++++
>   target/ppc/kvm_ppc.h |  6 ++++++
>   3 files changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 4cc204f90d..3e95a1831f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>           23, 0x00, /* XICS / XIVE mode */
>           24, 0x00, /* Hash/Radix, filled in below. */
>           25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
> -        26, 0x40, /* Radix options: GTSE == yes. */
> +        26, 0x00, /* Radix options, filled in below. */
>       };
>   
>       if (spapr->irq->xics && spapr->irq->xive) {
> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>           } else {
>               val[3] = 0x00; /* Hash */
>           }
> +
> +        if (kvmppc_has_cap_gtse()) {
> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
> +        }
>       } else {
>           /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
>           val[3] = 0xC0;
> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>       }
> +
>       _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
>                        val, sizeof(val)));
>   }
> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
>       /* Init numa_assoc_array */
>       spapr_numa_associativity_init(spapr, machine);
>   
> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>                                 spapr->max_compat_pvr)) {
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> -        /* KVM and TCG always allow GTSE with radix... */
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +
> +        /* TCG always supports Radix w/ GTSE */
> +        if (!kvm_enabled()) {
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +        } else {
> +            if (kvmppc_has_cap_mmu_radix()) {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            }
> +
> +            /*
> +             * Only disable Guest Translation Shootdown if KVM
> +             * supports the H_RPT_INVALIDATE hypercall, otherwise we'd
> +             * leave the guest with no way to make TLB invalidations.
> +             */
> +            if (kvmppc_has_cap_rpt_invalidate()) {
> +                if (kvmppc_has_cap_gtse()) {
> +                    spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +                }
> +            } else {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +            }
> +        }
>       }
> -    /* ... but not with hash (currently). */
>   
>       if (kvm_enabled()) {
>           /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..91582c4b15 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>   static int cap_large_decr;
>   static int cap_fwnmi;
>   static int cap_rpt_invalidate;
> +static int cap_gtse;
>   
>   static uint32_t debug_inst_opcode;
>   
> @@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>       }
>   
>       cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
> +    cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE);
> +
>       kvm_ppc_register_host_cpu_type();
>   
>       return 0;
> @@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>       return cap_mmu_hash_v3;
>   }
>   
> +bool kvmppc_has_cap_gtse(void)
> +{
> +    return cap_gtse;
> +}
> +
>   static bool kvmppc_power8_host(void)
>   {
>       bool ret = false;
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..7d6980edb7 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>   bool kvmppc_has_cap_htm(void);
>   bool kvmppc_has_cap_mmu_radix(void);
>   bool kvmppc_has_cap_mmu_hash_v3(void);
> +bool kvmppc_has_cap_gtse(void);
>   bool kvmppc_has_cap_xive(void);
>   int kvmppc_get_cap_safe_cache(void);
>   int kvmppc_get_cap_safe_bounds_check(void);
> @@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>       return false;
>   }
>   
> +static inline bool kvmppc_has_cap_gtse(void)
> +{
> +    return false;
> +}
> +
>   static inline bool kvmppc_has_cap_xive(void)
>   {
>       return false;


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-09  1:23 [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Fabiano Rosas
  2022-03-09  1:24 ` [RFC PATCH 2/2] linux-headers: Add KVM_CAP_PPC_GTSE Fabiano Rosas
  2022-03-09 19:09 ` [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Daniel Henrique Barboza
@ 2022-03-10 19:51 ` Fabiano Rosas
  2022-03-11  0:30   ` Daniel Henrique Barboza
  2022-03-12  9:17 ` David Gibson
  3 siblings, 1 reply; 11+ messages in thread
From: Fabiano Rosas @ 2022-03-10 19:51 UTC (permalink / raw)
  To: qemu-devel; +Cc: aneesh.kumar, danielhb413, npiggin, qemu-ppc, clg, david

Fabiano Rosas <farosas@linux.ibm.com> writes:

> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
> property of the /chosen node. Byte number 26 specifies Radix Table
> Expansions, currently only GTSE (Guest Translation Shootdown
> Enable). This feature determines whether the tlbie instruction (and
> others) are HV privileged.
>
> Up until now, we always reported GTSE=1 to guests. Even after the
> support for GTSE=0 was added. As part of that support, a kernel
> command line radix_hcall_invalidate=on was introduced that overrides
> the GTSE value received via CAS. So a guest can run with GTSE=0 and
> use the H_RPT_INVALIDATE hcall instead of tlbie.
>
> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
> when running nested KVM guests because KVM does not allow a nested
> hypervisor to set GTSE support for its nested guests. So a nested
> guest always uses the same value for LPCR_GTSE as its HV. Since the
> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
> into a crash when:
>
> L1 LPCR_GTSE=0
> L2 LPCR_GTSE=0
> L2 CAS GTSE=1
>
> The nested guest will run 'tlbie' and crash because the HW looks at
> LPCR_GTSE, which is clear.
>
> Having GTSE disabled in the L1 and enabled in the L2 is not an option
> because the whole purpose of GTSE is to disallow access to tlbie and
> we cannot allow L1 to spawn L2s that can access features that L1
> itself cannot.
>
> We also cannot have the guest check the LPCR bit, because LPCR is
> HV-privileged.
>
> So this patch goes through the most intuitive route which is to have
> QEMU ask KVM about GTSE support and advertise the correct value to the
> guest. A new KVM_CAP_PPC_GTSE capability is being added.
>
> TCG continues to always enable GTSE.
>
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
>  hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
>  target/ppc/kvm.c     |  8 ++++++++
>  target/ppc/kvm_ppc.h |  6 ++++++
>  3 files changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 4cc204f90d..3e95a1831f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>          23, 0x00, /* XICS / XIVE mode */
>          24, 0x00, /* Hash/Radix, filled in below. */
>          25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
> -        26, 0x40, /* Radix options: GTSE == yes. */
> +        26, 0x00, /* Radix options, filled in below. */
>      };
>  
>      if (spapr->irq->xics && spapr->irq->xive) {
> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>          } else {
>              val[3] = 0x00; /* Hash */
>          }
> +
> +        if (kvmppc_has_cap_gtse()) {
> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
> +        }

This needs the same treatment as below to support kernels that don't
know about the cap. Also, look at that semicolon! =D

>      } else {
>          /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
>          val[3] = 0xC0;
> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>      }
> +
>      _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
>                       val, sizeof(val)));
>  }
> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
>      /* Init numa_assoc_array */
>      spapr_numa_associativity_init(spapr, machine);
>  
> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>                                spapr->max_compat_pvr)) {
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> -        /* KVM and TCG always allow GTSE with radix... */
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +
> +        /* TCG always supports Radix w/ GTSE */
> +        if (!kvm_enabled()) {
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +        } else {
> +            if (kvmppc_has_cap_mmu_radix()) {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            }
> +
> +            /*
> +             * Only disable Guest Translation Shootdown if KVM
> +             * supports the H_RPT_INVALIDATE hypercall, otherwise we'd
> +             * leave the guest with no way to make TLB invalidations.
> +             */
> +            if (kvmppc_has_cap_rpt_invalidate()) {
> +                if (kvmppc_has_cap_gtse()) {
> +                    spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +                }
> +            } else {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +            }
> +        }
>      }
> -    /* ... but not with hash (currently). */
>  
>      if (kvm_enabled()) {
>          /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..91582c4b15 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>  static int cap_large_decr;
>  static int cap_fwnmi;
>  static int cap_rpt_invalidate;
> +static int cap_gtse;
>  
>  static uint32_t debug_inst_opcode;
>  
> @@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      }
>  
>      cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
> +    cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE);
> +
>      kvm_ppc_register_host_cpu_type();
>  
>      return 0;
> @@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>      return cap_mmu_hash_v3;
>  }
>  
> +bool kvmppc_has_cap_gtse(void)
> +{
> +    return cap_gtse;
> +}
> +
>  static bool kvmppc_power8_host(void)
>  {
>      bool ret = false;
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..7d6980edb7 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>  bool kvmppc_has_cap_htm(void);
>  bool kvmppc_has_cap_mmu_radix(void);
>  bool kvmppc_has_cap_mmu_hash_v3(void);
> +bool kvmppc_has_cap_gtse(void);
>  bool kvmppc_has_cap_xive(void);
>  int kvmppc_get_cap_safe_cache(void);
>  int kvmppc_get_cap_safe_bounds_check(void);
> @@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>      return false;
>  }
>  
> +static inline bool kvmppc_has_cap_gtse(void)
> +{
> +    return false;
> +}
> +
>  static inline bool kvmppc_has_cap_xive(void)
>  {
>      return false;


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-10 19:51 ` Fabiano Rosas
@ 2022-03-11  0:30   ` Daniel Henrique Barboza
  0 siblings, 0 replies; 11+ messages in thread
From: Daniel Henrique Barboza @ 2022-03-11  0:30 UTC (permalink / raw)
  To: Fabiano Rosas, qemu-devel; +Cc: aneesh.kumar, qemu-ppc, clg, npiggin, david



On 3/10/22 16:51, Fabiano Rosas wrote:
> Fabiano Rosas <farosas@linux.ibm.com> writes:
> 
>> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
>> property of the /chosen node. Byte number 26 specifies Radix Table
>> Expansions, currently only GTSE (Guest Translation Shootdown
>> Enable). This feature determines whether the tlbie instruction (and
>> others) are HV privileged.
>>
>> Up until now, we always reported GTSE=1 to guests. Even after the
>> support for GTSE=0 was added. As part of that support, a kernel
>> command line radix_hcall_invalidate=on was introduced that overrides
>> the GTSE value received via CAS. So a guest can run with GTSE=0 and
>> use the H_RPT_INVALIDATE hcall instead of tlbie.
>>
>> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
>> when running nested KVM guests because KVM does not allow a nested
>> hypervisor to set GTSE support for its nested guests. So a nested
>> guest always uses the same value for LPCR_GTSE as its HV. Since the
>> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
>> into a crash when:
>>
>> L1 LPCR_GTSE=0
>> L2 LPCR_GTSE=0
>> L2 CAS GTSE=1
>>
>> The nested guest will run 'tlbie' and crash because the HW looks at
>> LPCR_GTSE, which is clear.
>>
>> Having GTSE disabled in the L1 and enabled in the L2 is not an option
>> because the whole purpose of GTSE is to disallow access to tlbie and
>> we cannot allow L1 to spawn L2s that can access features that L1
>> itself cannot.
>>
>> We also cannot have the guest check the LPCR bit, because LPCR is
>> HV-privileged.
>>
>> So this patch goes through the most intuitive route which is to have
>> QEMU ask KVM about GTSE support and advertise the correct value to the
>> guest. A new KVM_CAP_PPC_GTSE capability is being added.
>>
>> TCG continues to always enable GTSE.
>>
>> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
>> ---
>>   hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
>>   target/ppc/kvm.c     |  8 ++++++++
>>   target/ppc/kvm_ppc.h |  6 ++++++
>>   3 files changed, 45 insertions(+), 7 deletions(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 4cc204f90d..3e95a1831f 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>>           23, 0x00, /* XICS / XIVE mode */
>>           24, 0x00, /* Hash/Radix, filled in below. */
>>           25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
>> -        26, 0x40, /* Radix options: GTSE == yes. */
>> +        26, 0x00, /* Radix options, filled in below. */
>>       };
>>   
>>       if (spapr->irq->xics && spapr->irq->xive) {
>> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>>           } else {
>>               val[3] = 0x00; /* Hash */
>>           }
>> +
>> +        if (kvmppc_has_cap_gtse()) {
>> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>> +        }
> 
> This needs the same treatment as below to support kernels that don't
> know about the cap. Also, look at that semicolon! =D


hahaha. There's another one down ...

> 
>>       } else {
>>           /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
>>           val[3] = 0xC0;
>> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;

^ here



Daniel

>>       }
>> +
>>       _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
>>                        val, sizeof(val)));
>>   }
>> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
>>       /* Init numa_assoc_array */
>>       spapr_numa_associativity_init(spapr, machine);
>>   
>> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
>> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>>                                 spapr->max_compat_pvr)) {
>> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
>> -        /* KVM and TCG always allow GTSE with radix... */
>> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>> +
>> +        /* TCG always supports Radix w/ GTSE */
>> +        if (!kvm_enabled()) {
>> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
>> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>> +        } else {
>> +            if (kvmppc_has_cap_mmu_radix()) {
>> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
>> +            }
>> +
>> +            /*
>> +             * Only disable Guest Translation Shootdown if KVM
>> +             * supports the H_RPT_INVALIDATE hypercall, otherwise we'd
>> +             * leave the guest with no way to make TLB invalidations.
>> +             */
>> +            if (kvmppc_has_cap_rpt_invalidate()) {
>> +                if (kvmppc_has_cap_gtse()) {
>> +                    spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>> +                }
>> +            } else {
>> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>> +            }
>> +        }
>>       }
>> -    /* ... but not with hash (currently). */
>>   
>>       if (kvm_enabled()) {
>>           /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
>> index dc93b99189..91582c4b15 100644
>> --- a/target/ppc/kvm.c
>> +++ b/target/ppc/kvm.c
>> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>>   static int cap_large_decr;
>>   static int cap_fwnmi;
>>   static int cap_rpt_invalidate;
>> +static int cap_gtse;
>>   
>>   static uint32_t debug_inst_opcode;
>>   
>> @@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>>       }
>>   
>>       cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
>> +    cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE);
>> +
>>       kvm_ppc_register_host_cpu_type();
>>   
>>       return 0;
>> @@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>>       return cap_mmu_hash_v3;
>>   }
>>   
>> +bool kvmppc_has_cap_gtse(void)
>> +{
>> +    return cap_gtse;
>> +}
>> +
>>   static bool kvmppc_power8_host(void)
>>   {
>>       bool ret = false;
>> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
>> index ee9325bf9a..7d6980edb7 100644
>> --- a/target/ppc/kvm_ppc.h
>> +++ b/target/ppc/kvm_ppc.h
>> @@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>>   bool kvmppc_has_cap_htm(void);
>>   bool kvmppc_has_cap_mmu_radix(void);
>>   bool kvmppc_has_cap_mmu_hash_v3(void);
>> +bool kvmppc_has_cap_gtse(void);
>>   bool kvmppc_has_cap_xive(void);
>>   int kvmppc_get_cap_safe_cache(void);
>>   int kvmppc_get_cap_safe_bounds_check(void);
>> @@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>>       return false;
>>   }
>>   
>> +static inline bool kvmppc_has_cap_gtse(void)
>> +{
>> +    return false;
>> +}
>> +
>>   static inline bool kvmppc_has_cap_xive(void)
>>   {
>>       return false;


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-09 19:09 ` [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Daniel Henrique Barboza
@ 2022-03-11 14:45   ` Fabiano Rosas
  0 siblings, 0 replies; 11+ messages in thread
From: Fabiano Rosas @ 2022-03-11 14:45 UTC (permalink / raw)
  To: Daniel Henrique Barboza, qemu-devel
  Cc: aneesh.kumar, qemu-ppc, clg, npiggin, david

Daniel Henrique Barboza <danielhb413@gmail.com> writes:

> On 3/8/22 22:23, Fabiano Rosas wrote:
>> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
>> property of the /chosen node. Byte number 26 specifies Radix Table
>> Expansions, currently only GTSE (Guest Translation Shootdown
>> Enable). This feature determines whether the tlbie instruction (and
>> others) are HV privileged.
>> 
>> Up until now, we always reported GTSE=1 to guests. Even after the
>> support for GTSE=0 was added. As part of that support, a kernel
>> command line radix_hcall_invalidate=on was introduced that overrides
>> the GTSE value received via CAS. So a guest can run with GTSE=0 and
>> use the H_RPT_INVALIDATE hcall instead of tlbie.
>> 
>> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
>> when running nested KVM guests because KVM does not allow a nested
>> hypervisor to set GTSE support for its nested guests. So a nested
>> guest always uses the same value for LPCR_GTSE as its HV. Since the
>> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
>> into a crash when:
>> 
>> L1 LPCR_GTSE=0
>> L2 LPCR_GTSE=0
>> L2 CAS GTSE=1
>> 
>> The nested guest will run 'tlbie' and crash because the HW looks at
>> LPCR_GTSE, which is clear.
>> 
>> Having GTSE disabled in the L1 and enabled in the L2 is not an option
>> because the whole purpose of GTSE is to disallow access to tlbie and
>> we cannot allow L1 to spawn L2s that can access features that L1
>> itself cannot.
>> 
>> We also cannot have the guest check the LPCR bit, because LPCR is
>> HV-privileged.
>> 
>> So this patch goes through the most intuitive route which is to have
>> QEMU ask KVM about GTSE support and advertise the correct value to the
>> guest. A new KVM_CAP_PPC_GTSE capability is being added.
>> 
>> TCG continues to always enable GTSE.
>> 
>> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
>> ---
>
>
> I'm not sure if I fully understood the situation, so let me recap. Once upon a time,
> QEMU advertised GTSE=1 and the host would never advertise other value, and everyone
> was happy.
>
> The host started to support GTSE=0, but QEMU kept advertising GTSE=1 regardless, and no
> KVM GTSE cap was added to reflect the host support. I'll then assume that:
>
>
> - all guests would break if running in a GTSE=0 host prior to rpt_invalidate support (which
> is necessary to allow the guest to run in GTSE=0)
>
> - apparently no one ever tried to run a KVM guest in a GTSE=0 host, so no bugs were opened

There's a slight misconception in the above statements which is the
separation of QEMU vs. the host. GTSE is advertised via CAS, so the
guest on one side and the HV on the other. QEMU is not merely
advertising what the host GTSE value is, QEMU *is* the host.

Now, of course we could have done this in a way that QEMU asked the
kernel what GTSE value to use, but since we always thought of GTSE as
required for Radix, that was would have been useless. No HV ever
reported GTSE=0 via CAS, either PowerVM or QEMU/KVM, so having the value
hardcoded in QEMU and in the kernel was never an issue.

> After commit 82123b756a1a2f1 (target/ppc: Support for H_RPT_INVALIDATE hcall) we added
> cap-rpt-invalidate. I didn't follow the discussions of this cap but it seems like, as with
> almost every other cap we have, there would be a migration problem for a guest that was in
> a rpt_invalidate aware host to migrate to another where this wouldn't be true, and the cap
> solves that.

Yes, cap-rpt-invalidate works just as we would expect. When I mentioned
to you in private about migration I meant the kernel-side change:

https://git.kernel.org/torvalds/c/bf6b7661f41615

What that change does is add a kernel cmdline option to allow the kernel
to disable GTSE even when running along with an HV that allows GTSE.

> What I'm not following is why, even after having cap-rpt-invalidate, we are still "lying"
> about the GTSE=1 regardless of what the host supports. We could've added the GTSE KVM cap
> at the same time rpt_invalidate was introduced, and guests that want to ignore this setting
> can use the cap to bypass it.

We're still reporting GTSE=1 because that is a design decision from
Linux/KVM. The work to support GTSE=0 was just adding the support, not
deciding whether we should disable GTSE. So QEMU/kernel kept hardcoding
the value without issue.

> In the end this patch is a needed fix IMHO. My confusion is why we're doing this just now.

The bug only surfaces when we run an L1 guest that decided to disable
GTSE via kernel cmdline and a nested guest on top of it. The QEMU inside
the L1 continues to force GTSE=1 as always. That is why the capability
now seem so compelling when previously it might have not.

> The patch itself LGTM.

Unfortunately, this patch as it is cannot work. We always ran with
GTSE=1 so any kernel that does not know about CAP_GTSE will report
GTSE=0 and break any guest that is older than the initial H_RPT
enablement. And the trick of checking for cap-rpt-invalidate first does
not always work because there's a window between when that cap was added
and now.

So what I am going to do is to change the kernel side to always report
values different than 0 so that QEMU can use the 0 value to
unequivocally tell older kernels apart from ones that disable the
feature. That way we will continue to send GTSE=1 via CAS when KVM is
too old.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-09  1:23 [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Fabiano Rosas
                   ` (2 preceding siblings ...)
  2022-03-10 19:51 ` Fabiano Rosas
@ 2022-03-12  9:17 ` David Gibson
  2022-03-14 22:10   ` Fabiano Rosas
  3 siblings, 1 reply; 11+ messages in thread
From: David Gibson @ 2022-03-12  9:17 UTC (permalink / raw)
  To: Fabiano Rosas
  Cc: aneesh.kumar, danielhb413, qemu-devel, npiggin, qemu-ppc, clg

[-- Attachment #1: Type: text/plain, Size: 7395 bytes --]

On Tue, Mar 08, 2022 at 10:23:59PM -0300, Fabiano Rosas wrote:
> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
> property of the /chosen node. Byte number 26 specifies Radix Table
> Expansions, currently only GTSE (Guest Translation Shootdown
> Enable). This feature determines whether the tlbie instruction (and
> others) are HV privileged.
> 
> Up until now, we always reported GTSE=1 to guests. Even after the
> support for GTSE=0 was added. As part of that support, a kernel
> command line radix_hcall_invalidate=on was introduced that overrides
> the GTSE value received via CAS. So a guest can run with GTSE=0 and
> use the H_RPT_INVALIDATE hcall instead of tlbie.
> 
> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
> when running nested KVM guests because KVM does not allow a nested
> hypervisor to set GTSE support for its nested guests. So a nested
> guest always uses the same value for LPCR_GTSE as its HV. Since the
> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
> into a crash when:
> 
> L1 LPCR_GTSE=0
> L2 LPCR_GTSE=0
> L2 CAS GTSE=1
> 
> The nested guest will run 'tlbie' and crash because the HW looks at
> LPCR_GTSE, which is clear.
> 
> Having GTSE disabled in the L1 and enabled in the L2 is not an option
> because the whole purpose of GTSE is to disallow access to tlbie and
> we cannot allow L1 to spawn L2s that can access features that L1
> itself cannot.
> 
> We also cannot have the guest check the LPCR bit, because LPCR is
> HV-privileged.
> 
> So this patch goes through the most intuitive route which is to have
> QEMU ask KVM about GTSE support and advertise the correct value to the
> guest. A new KVM_CAP_PPC_GTSE capability is being added.
> 
> TCG continues to always enable GTSE.
> 
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
>  hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
>  target/ppc/kvm.c     |  8 ++++++++
>  target/ppc/kvm_ppc.h |  6 ++++++
>  3 files changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 4cc204f90d..3e95a1831f 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>          23, 0x00, /* XICS / XIVE mode */
>          24, 0x00, /* Hash/Radix, filled in below. */
>          25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
> -        26, 0x40, /* Radix options: GTSE == yes. */
> +        26, 0x00, /* Radix options, filled in below. */
>      };
>  
>      if (spapr->irq->xics && spapr->irq->xive) {
> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>          } else {
>              val[3] = 0x00; /* Hash */
>          }
> +
> +        if (kvmppc_has_cap_gtse()) {
> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
> +        }
>      } else {
>          /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
>          val[3] = 0xC0;
> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>      }
> +
>      _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
>                       val, sizeof(val)));
>  }
> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
>      /* Init numa_assoc_array */
>      spapr_numa_associativity_init(spapr, machine);
>  
> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>                                spapr->max_compat_pvr)) {
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> -        /* KVM and TCG always allow GTSE with radix... */
> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +
> +        /* TCG always supports Radix w/ GTSE */
> +        if (!kvm_enabled()) {
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);

Yeah, this is no good.  It's never ok to change guest visible
behaviour depending on host properties (in this case whether it's KVM
or not).  It messes up the invariants we need for migration, which
require that the guest visible state depend only on the user
configuration.

The usual way to handle this is with a new capability, you can then
change the default with the next machine version.

> +        } else {
> +            if (kvmppc_has_cap_mmu_radix()) {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> +            }
> +
> +            /*
> +             * Only disable Guest Translation Shootdown if KVM
> +             * supports the H_RPT_INVALIDATE hypercall, otherwise we'd
> +             * leave the guest with no way to make TLB invalidations.
> +             */
> +            if (kvmppc_has_cap_rpt_invalidate()) {
> +                if (kvmppc_has_cap_gtse()) {
> +                    spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +                }
> +            } else {
> +                spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> +            }
> +        }
>      }
> -    /* ... but not with hash (currently). */
>  
>      if (kvm_enabled()) {
>          /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index dc93b99189..91582c4b15 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>  static int cap_large_decr;
>  static int cap_fwnmi;
>  static int cap_rpt_invalidate;
> +static int cap_gtse;
>  
>  static uint32_t debug_inst_opcode;
>  
> @@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>      }
>  
>      cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
> +    cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE);
> +
>      kvm_ppc_register_host_cpu_type();
>  
>      return 0;
> @@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void)
>      return cap_mmu_hash_v3;
>  }
>  
> +bool kvmppc_has_cap_gtse(void)
> +{
> +    return cap_gtse;
> +}
> +
>  static bool kvmppc_power8_host(void)
>  {
>      bool ret = false;
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index ee9325bf9a..7d6980edb7 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void);
>  bool kvmppc_has_cap_htm(void);
>  bool kvmppc_has_cap_mmu_radix(void);
>  bool kvmppc_has_cap_mmu_hash_v3(void);
> +bool kvmppc_has_cap_gtse(void);
>  bool kvmppc_has_cap_xive(void);
>  int kvmppc_get_cap_safe_cache(void);
>  int kvmppc_get_cap_safe_bounds_check(void);
> @@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void)
>      return false;
>  }
>  
> +static inline bool kvmppc_has_cap_gtse(void)
> +{
> +    return false;
> +}
> +
>  static inline bool kvmppc_has_cap_xive(void)
>  {
>      return false;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-12  9:17 ` David Gibson
@ 2022-03-14 22:10   ` Fabiano Rosas
  2022-03-31  4:29     ` David Gibson
  0 siblings, 1 reply; 11+ messages in thread
From: Fabiano Rosas @ 2022-03-14 22:10 UTC (permalink / raw)
  To: David Gibson
  Cc: aneesh.kumar, danielhb413, qemu-devel, npiggin, qemu-ppc, clg

David Gibson <david@gibson.dropbear.id.au> writes:

> On Tue, Mar 08, 2022 at 10:23:59PM -0300, Fabiano Rosas wrote:
>> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
>> property of the /chosen node. Byte number 26 specifies Radix Table
>> Expansions, currently only GTSE (Guest Translation Shootdown
>> Enable). This feature determines whether the tlbie instruction (and
>> others) are HV privileged.
>> 
>> Up until now, we always reported GTSE=1 to guests. Even after the
>> support for GTSE=0 was added. As part of that support, a kernel
>> command line radix_hcall_invalidate=on was introduced that overrides
>> the GTSE value received via CAS. So a guest can run with GTSE=0 and
>> use the H_RPT_INVALIDATE hcall instead of tlbie.
>> 
>> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
>> when running nested KVM guests because KVM does not allow a nested
>> hypervisor to set GTSE support for its nested guests. So a nested
>> guest always uses the same value for LPCR_GTSE as its HV. Since the
>> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
>> into a crash when:
>> 
>> L1 LPCR_GTSE=0
>> L2 LPCR_GTSE=0
>> L2 CAS GTSE=1
>> 
>> The nested guest will run 'tlbie' and crash because the HW looks at
>> LPCR_GTSE, which is clear.
>> 
>> Having GTSE disabled in the L1 and enabled in the L2 is not an option
>> because the whole purpose of GTSE is to disallow access to tlbie and
>> we cannot allow L1 to spawn L2s that can access features that L1
>> itself cannot.
>> 
>> We also cannot have the guest check the LPCR bit, because LPCR is
>> HV-privileged.
>> 
>> So this patch goes through the most intuitive route which is to have
>> QEMU ask KVM about GTSE support and advertise the correct value to the
>> guest. A new KVM_CAP_PPC_GTSE capability is being added.
>> 
>> TCG continues to always enable GTSE.
>> 
>> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
>> ---
>>  hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
>>  target/ppc/kvm.c     |  8 ++++++++
>>  target/ppc/kvm_ppc.h |  6 ++++++
>>  3 files changed, 45 insertions(+), 7 deletions(-)
>> 
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 4cc204f90d..3e95a1831f 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>>          23, 0x00, /* XICS / XIVE mode */
>>          24, 0x00, /* Hash/Radix, filled in below. */
>>          25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
>> -        26, 0x40, /* Radix options: GTSE == yes. */
>> +        26, 0x00, /* Radix options, filled in below. */
>>      };
>>  
>>      if (spapr->irq->xics && spapr->irq->xive) {
>> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
>>          } else {
>>              val[3] = 0x00; /* Hash */
>>          }
>> +
>> +        if (kvmppc_has_cap_gtse()) {
>> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>> +        }
>>      } else {
>>          /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
>>          val[3] = 0xC0;
>> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
>>      }
>> +
>>      _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
>>                       val, sizeof(val)));
>>  }
>> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
>>      /* Init numa_assoc_array */
>>      spapr_numa_associativity_init(spapr, machine);
>>  
>> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
>> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
>>                                spapr->max_compat_pvr)) {
>> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
>> -        /* KVM and TCG always allow GTSE with radix... */
>> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>> +
>> +        /* TCG always supports Radix w/ GTSE */
>> +        if (!kvm_enabled()) {
>> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
>> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
>
> Yeah, this is no good.  It's never ok to change guest visible
> behaviour depending on host properties (in this case whether it's KVM
> or not).  It messes up the invariants we need for migration, which
> require that the guest visible state depend only on the user
> configuration.
>
> The usual way to handle this is with a new capability, you can then
> change the default with the next machine version.

This particular problem is tricky. TCG cannot disable GTSE because it
does not support H_RPT_INVALIDATE. And older kernels that don't know
about the feature require GTSE.

KVM can afford to disable GTSE because we have a compatibility mechanism
(although a bit crooked): We can invert the logic for the KVM_CAP so
that the presence of KVM_CAP_PPC_GTSE_DISABLE would mean QEMU is allowed
to disable GTSE. Then:
  - older KVM + new QEMU would keep GTSE enabled;

  - older L1 guests are not affected because the host would report
    GTSE=1 with the KVM capability. By the time we decide to disable
    GTSE for L1 guests hopefully all older kernels will be out of use;

  - older nested guests:
    - if L1 runs with GTSE=1, are not affected;

    - if L1 disabled GTSE via kernel cmdline, are already broken (this
      bug). But they would go from crashing to being aborted* by QEMU
      (the guest asks for HPT in the lack of GTSE; nested KVM is radix
      only);

      * there are other broken cases which are fixed completely.

To satisfy TCG we could keep a spapr capability as ON and usually the
guest would pass cap-gtse=off when running with KVM. However this
doesn't work because this crash happens precisely because the nested
guest doesn't know that it needs to use cap-rpt-invalidate=on. Another
cap wouldn't help.

So I think the only way to have a spapr capability for this is if TCG
always defaults to ON and KVM always defaults to OFF. But then we would
be changing guest visible behaviour depending on host properties.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-14 22:10   ` Fabiano Rosas
@ 2022-03-31  4:29     ` David Gibson
  2022-04-01  7:01       ` Aneesh Kumar K.V
  0 siblings, 1 reply; 11+ messages in thread
From: David Gibson @ 2022-03-31  4:29 UTC (permalink / raw)
  To: Fabiano Rosas
  Cc: aneesh.kumar, danielhb413, qemu-devel, npiggin, qemu-ppc, clg

[-- Attachment #1: Type: text/plain, Size: 7714 bytes --]

On Mon, Mar 14, 2022 at 07:10:10PM -0300, Fabiano Rosas wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Tue, Mar 08, 2022 at 10:23:59PM -0300, Fabiano Rosas wrote:
> >> QEMU reports MMU support to the guest via the ibm,architecture-vec-5
> >> property of the /chosen node. Byte number 26 specifies Radix Table
> >> Expansions, currently only GTSE (Guest Translation Shootdown
> >> Enable). This feature determines whether the tlbie instruction (and
> >> others) are HV privileged.
> >> 
> >> Up until now, we always reported GTSE=1 to guests. Even after the
> >> support for GTSE=0 was added. As part of that support, a kernel
> >> command line radix_hcall_invalidate=on was introduced that overrides
> >> the GTSE value received via CAS. So a guest can run with GTSE=0 and
> >> use the H_RPT_INVALIDATE hcall instead of tlbie.
> >> 
> >> In this scenario, having GTSE always set to 1 by QEMU leads to a crash
> >> when running nested KVM guests because KVM does not allow a nested
> >> hypervisor to set GTSE support for its nested guests. So a nested
> >> guest always uses the same value for LPCR_GTSE as its HV. Since the
> >> nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run
> >> into a crash when:
> >> 
> >> L1 LPCR_GTSE=0
> >> L2 LPCR_GTSE=0
> >> L2 CAS GTSE=1
> >> 
> >> The nested guest will run 'tlbie' and crash because the HW looks at
> >> LPCR_GTSE, which is clear.
> >> 
> >> Having GTSE disabled in the L1 and enabled in the L2 is not an option
> >> because the whole purpose of GTSE is to disallow access to tlbie and
> >> we cannot allow L1 to spawn L2s that can access features that L1
> >> itself cannot.
> >> 
> >> We also cannot have the guest check the LPCR bit, because LPCR is
> >> HV-privileged.
> >> 
> >> So this patch goes through the most intuitive route which is to have
> >> QEMU ask KVM about GTSE support and advertise the correct value to the
> >> guest. A new KVM_CAP_PPC_GTSE capability is being added.
> >> 
> >> TCG continues to always enable GTSE.
> >> 
> >> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c       | 38 +++++++++++++++++++++++++++++++-------
> >>  target/ppc/kvm.c     |  8 ++++++++
> >>  target/ppc/kvm_ppc.h |  6 ++++++
> >>  3 files changed, 45 insertions(+), 7 deletions(-)
> >> 
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 4cc204f90d..3e95a1831f 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -971,7 +971,7 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
> >>          23, 0x00, /* XICS / XIVE mode */
> >>          24, 0x00, /* Hash/Radix, filled in below. */
> >>          25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
> >> -        26, 0x40, /* Radix options: GTSE == yes. */
> >> +        26, 0x00, /* Radix options, filled in below. */
> >>      };
> >>  
> >>      if (spapr->irq->xics && spapr->irq->xive) {
> >> @@ -1000,10 +1000,16 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
> >>          } else {
> >>              val[3] = 0x00; /* Hash */
> >>          }
> >> +
> >> +        if (kvmppc_has_cap_gtse()) {
> >> +            val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
> >> +        }
> >>      } else {
> >>          /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
> >>          val[3] = 0xC0;
> >> +        val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */;
> >>      }
> >> +
> >>      _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
> >>                       val, sizeof(val)));
> >>  }
> >> @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine)
> >>      /* Init numa_assoc_array */
> >>      spapr_numa_associativity_init(spapr, machine);
> >>  
> >> -    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
> >> -        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
> >> +    if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
> >>                                spapr->max_compat_pvr)) {
> >> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> >> -        /* KVM and TCG always allow GTSE with radix... */
> >> -        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> >> +
> >> +        /* TCG always supports Radix w/ GTSE */
> >> +        if (!kvm_enabled()) {
> >> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
> >> +            spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
> >
> > Yeah, this is no good.  It's never ok to change guest visible
> > behaviour depending on host properties (in this case whether it's KVM
> > or not).  It messes up the invariants we need for migration, which
> > require that the guest visible state depend only on the user
> > configuration.
> >
> > The usual way to handle this is with a new capability, you can then
> > change the default with the next machine version.
> 
> This particular problem is tricky. TCG cannot disable GTSE because it
> does not support H_RPT_INVALIDATE.

We could implement H_RPT_INVALIDATE for TCG though.

> And older kernels that don't know
> about the feature require GTSE.
> 
> KVM can afford to disable GTSE because we have a compatibility mechanism
> (although a bit crooked): We can invert the logic for the KVM_CAP so
> that the presence of KVM_CAP_PPC_GTSE_DISABLE would mean QEMU is allowed
> to disable GTSE. Then:
>   - older KVM + new QEMU would keep GTSE enabled;
> 
>   - older L1 guests are not affected because the host would report
>     GTSE=1 with the KVM capability. By the time we decide to disable
>     GTSE for L1 guests hopefully all older kernels will be out of use;
> 
>   - older nested guests:
>     - if L1 runs with GTSE=1, are not affected;
> 
>     - if L1 disabled GTSE via kernel cmdline, are already broken (this
>       bug). But they would go from crashing to being aborted* by QEMU
>       (the guest asks for HPT in the lack of GTSE; nested KVM is radix
>       only);
> 
>       * there are other broken cases which are fixed completely.
> 
> To satisfy TCG we could keep a spapr capability as ON and usually the
> guest would pass cap-gtse=off when running with KVM. However this
> doesn't work because this crash happens precisely because the nested
> guest doesn't know that it needs to use cap-rpt-invalidate=on. Another
> cap wouldn't help.
> 
> So I think the only way to have a spapr capability for this is if TCG
> always defaults to ON and KVM always defaults to OFF. But then we would
> be changing guest visible behaviour depending on host properties.

Ok, I'd forgotten we already have cap-rpt-invalidate.  It still
defaults to OFF for now, which might help us.

What's clear is that we should never disable GTSE if
cap-rpt-invalidate is off - qemu should enforce that before even
starting the guest if at all possible.

What's less clear to me is if we want to enable GTSE by default or
not, in the cases where we're able to choose.  Would always disabling
GTSE when cap-rpt-invalidate=on be ok?  Or do we want to be able to
control GTSE separately.  In that case we might need a second cap, but
it would need inverted sense, so e.g. cap-disable-gtse.

I believe a guest that is expecting GTSE==0 should work if
LPCR[GTSE]==1, just not optimally (as long as H_RPT_INVALIDATE is
still available, of course).  Is that right?

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-03-31  4:29     ` David Gibson
@ 2022-04-01  7:01       ` Aneesh Kumar K.V
  2022-04-01 15:50         ` Fabiano Rosas
  0 siblings, 1 reply; 11+ messages in thread
From: Aneesh Kumar K.V @ 2022-04-01  7:01 UTC (permalink / raw)
  To: David Gibson, Fabiano Rosas
  Cc: danielhb413, qemu-ppc, qemu-devel, npiggin, clg

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Mar 14, 2022 at 07:10:10PM -0300, Fabiano Rosas wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> > On Tue, Mar 08, 2022 at 10:23:59PM -0300, Fabiano Rosas wrote:
>>

...

>> To satisfy TCG we could keep a spapr capability as ON and usually the
>> guest would pass cap-gtse=off when running with KVM. However this
>> doesn't work because this crash happens precisely because the nested
>> guest doesn't know that it needs to use cap-rpt-invalidate=on. Another
>> cap wouldn't help.
>> 
>> So I think the only way to have a spapr capability for this is if TCG
>> always defaults to ON and KVM always defaults to OFF. But then we would
>> be changing guest visible behaviour depending on host properties.
>
> Ok, I'd forgotten we already have cap-rpt-invalidate.  It still
> defaults to OFF for now, which might help us.
>
> What's clear is that we should never disable GTSE if
> cap-rpt-invalidate is off - qemu should enforce that before even
> starting the guest if at all possible.
>
> What's less clear to me is if we want to enable GTSE by default or
> not, in the cases where we're able to choose.  Would always disabling
> GTSE when cap-rpt-invalidate=on be ok?  Or do we want to be able to
> control GTSE separately.  In that case we might need a second cap, but
> it would need inverted sense, so e.g. cap-disable-gtse.


GTSE and cap-rpt-invalidate can be looked at as independent such that we
can do GTSE=1 or GTSE=0 with cap-rpt-invalidate=on. But GTSE=0 with
cap-rpt-invalidate=off is not allowed/possible. GTSE value is what is
negotiated via CAS so we should let the hypervisor inform the guest whether it
can do GTSE 0 or 1. The challenge IIUC is Qemu always assumed GTSE=1
which is not true in the case of nested virt where L1 guest that is booted
with GTSE=0.

with cap-disable-gtse how would one interpret that? Whether hypervisor
have the capability to disable gtse? 

>
> I believe a guest that is expecting GTSE==0 should work if
> LPCR[GTSE]==1, just not optimally (as long as H_RPT_INVALIDATE is
> still available, of course).  Is that right?

That is correct.

-aneesh


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5
  2022-04-01  7:01       ` Aneesh Kumar K.V
@ 2022-04-01 15:50         ` Fabiano Rosas
  0 siblings, 0 replies; 11+ messages in thread
From: Fabiano Rosas @ 2022-04-01 15:50 UTC (permalink / raw)
  To: Aneesh Kumar K.V, David Gibson
  Cc: danielhb413, qemu-ppc, qemu-devel, npiggin, clg

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> David Gibson <david@gibson.dropbear.id.au> writes:
>
>> On Mon, Mar 14, 2022 at 07:10:10PM -0300, Fabiano Rosas wrote:
>>> David Gibson <david@gibson.dropbear.id.au> writes:
>>> 
>>> > On Tue, Mar 08, 2022 at 10:23:59PM -0300, Fabiano Rosas wrote:
>>>
>
> ...
>
>>> To satisfy TCG we could keep a spapr capability as ON and usually the
>>> guest would pass cap-gtse=off when running with KVM. However this
>>> doesn't work because this crash happens precisely because the nested
>>> guest doesn't know that it needs to use cap-rpt-invalidate=on. Another
>>> cap wouldn't help.
>>> 
>>> So I think the only way to have a spapr capability for this is if TCG
>>> always defaults to ON and KVM always defaults to OFF. But then we would
>>> be changing guest visible behaviour depending on host properties.
>>
>> Ok, I'd forgotten we already have cap-rpt-invalidate.  It still
>> defaults to OFF for now, which might help us.
>>
>> What's clear is that we should never disable GTSE if
>> cap-rpt-invalidate is off - qemu should enforce that before even
>> starting the guest if at all possible.
>>
>> What's less clear to me is if we want to enable GTSE by default or
>> not, in the cases where we're able to choose.  Would always disabling
>> GTSE when cap-rpt-invalidate=on be ok?  Or do we want to be able to
>> control GTSE separately.  In that case we might need a second cap, but
>> it would need inverted sense, so e.g. cap-disable-gtse.
>
>
> GTSE and cap-rpt-invalidate can be looked at as independent such that we
> can do GTSE=1 or GTSE=0 with cap-rpt-invalidate=on. But GTSE=0 with
> cap-rpt-invalidate=off is not allowed/possible. GTSE value is what is
> negotiated via CAS so we should let the hypervisor inform the guest whether it
> can do GTSE 0 or 1. The challenge IIUC is Qemu always assumed GTSE=1
> which is not true in the case of nested virt where L1 guest that is booted
> with GTSE=0.
>
> with cap-disable-gtse how would one interpret that? Whether hypervisor
> have the capability to disable gtse?

The spapr capability would mean "disable GTSE if KVM allows
it". Although I'd prefer using cap-gtse=<on/off> because it gives us
more flexibility if we ever want to change the default value.

On the KVM side I am testing a KVM_CAP_PPC_GTSE_DISABLE with the
semantics of "whether QEMU is allowed to disable GTSE". It reports the
inverse of MMU_FTR_GTSE. So if L1 runs with GTSE=0, then the capability
returns 1 and therefore QEMU can disable GTSE. If the capability is not
present, then QEMU is not allowed to disable GTSE.

With David's idea of disallowing cap-rpt-invalidate=off,cap-gtse=off we
can simply deny the nested guest command line if it doesn't include
cap-rpt-invalidate=on when KVM L1 reports KVM_CAP_PPC_GTSE_DISABLE. That
way cap-gtse can default to ON to keep TCG working.

On a first look, I think the above works. I'm still running some tests
with different QEMU/kernel versions.




^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-04-01 15:53 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-09  1:23 [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Fabiano Rosas
2022-03-09  1:24 ` [RFC PATCH 2/2] linux-headers: Add KVM_CAP_PPC_GTSE Fabiano Rosas
2022-03-09 19:09 ` [RFC PATCH 1/2] spapr: Report correct GTSE support via ov5 Daniel Henrique Barboza
2022-03-11 14:45   ` Fabiano Rosas
2022-03-10 19:51 ` Fabiano Rosas
2022-03-11  0:30   ` Daniel Henrique Barboza
2022-03-12  9:17 ` David Gibson
2022-03-14 22:10   ` Fabiano Rosas
2022-03-31  4:29     ` David Gibson
2022-04-01  7:01       ` Aneesh Kumar K.V
2022-04-01 15:50         ` Fabiano Rosas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.