kvm: x86: rewrite kvm_spec_ctrl_valid_bits
diff mbox series

Message ID 20200702174455.282252-1-mlevitsk@redhat.com
State New
Headers show
Series
  • kvm: x86: rewrite kvm_spec_ctrl_valid_bits
Related show

Commit Message

Maxim Levitsky July 2, 2020, 5:44 p.m. UTC
There are a few cases where this function was creating a bogus #GP condition,
for example the case where an AMD host supports STIBP but doesn't support SSBD.

Follow the rules for AMD and Intel strictly instead.

AMD #GP rules for IA32_SPEC_CTRL can be found here:
https://bugzilla.kernel.org/show_bug.cgi?id=199889

Fixes: 6441fa6178f5 ("KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL")

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 arch/x86/kvm/x86.c | 57 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 15 deletions(-)

Comments

Sean Christopherson July 2, 2020, 6:16 p.m. UTC | #1
On Thu, Jul 02, 2020 at 08:44:55PM +0300, Maxim Levitsky wrote:
> There are few cases when this function was creating a bogus #GP condition,
> for example case when and AMD host supports STIBP but doesn't support SSBD.
> 
> Follow the rules for AMD and Intel strictly instead.

Can you elaborate on the conditions that are problematic, e.g. what does
the guest expect to exist that KVM isn't providing?

> AMD #GP rules for IA32_SPEC_CTRL can be found here:
> https://bugzilla.kernel.org/show_bug.cgi?id=199889
> 
> Fixes: 6441fa6178f5 ("KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL")
> 
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  arch/x86/kvm/x86.c | 57 ++++++++++++++++++++++++++++++++++------------
>  1 file changed, 42 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 00c88c2f34e4..a6bed4670b7f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10670,27 +10670,54 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
>  
> -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
> +
> +static u64 kvm_spec_ctrl_valid_bits_host(void)
> +{
> +	uint64_t bits = 0;
> +
> +	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
> +		bits |= SPEC_CTRL_IBRS;
> +	if (boot_cpu_has(X86_FEATURE_INTEL_STIBP))
> +		bits |= SPEC_CTRL_STIBP;
> +	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
> +		bits |= SPEC_CTRL_SSBD;
> +
> +	if (boot_cpu_has(X86_FEATURE_AMD_IBRS) || boot_cpu_has(X86_FEATURE_AMD_STIBP))
> +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
> +
> +	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;
> +
> +	return bits;
> +}

Rather than compute the mask every time, it can be computed once on module
load and stashed in a global.  Note, there's a RFC series[*] to support
reprobing bugs at runtime, but that has bigger issues with existing KVM
functionality to be addressed, i.e. it's not our problem, yet :-).

[*] https://lkml.kernel.org/r/1593703107-8852-1-git-send-email-mihai.carabas@oracle.com

> +
> +static u64 kvm_spec_ctrl_valid_bits_guest(struct kvm_vcpu *vcpu)
>  {
> -	uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
> +	uint64_t bits = 0;
>  
> -	/* The STIBP bit doesn't fault even if it's not advertised */
> -	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
> -	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
> -		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
> -	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
> -	    !boot_cpu_has(X86_FEATURE_AMD_IBRS))
> -		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> +		bits |= SPEC_CTRL_IBRS;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_INTEL_STIBP))
> +		bits |= SPEC_CTRL_STIBP;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD))
> +		bits |= SPEC_CTRL_SSBD;
>  
> -	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
> -	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
> -		bits &= ~SPEC_CTRL_SSBD;
> -	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
> -	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
> -		bits &= ~SPEC_CTRL_SSBD;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
> +			guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP))

Bad indentation.

> +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
> +	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
> +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;

Would it be feasible to split into two patches?  The first (tagged Fixes:)
to make the functional changes without inverting the logic or splitting, and
then do the cleanup?  It's really hard to review this patch because I can't
easily tease out what's different in terms of functionality.

>  	return bits;
>  }
> +
> +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
> +{
> +	return kvm_spec_ctrl_valid_bits_host() &
> +	       kvm_spec_ctrl_valid_bits_guest(vcpu);
> +}
> +
> +
>  EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
>  
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> -- 
> 2.25.4
>
Maxim Levitsky July 5, 2020, 9:40 a.m. UTC | #2
On Thu, 2020-07-02 at 11:16 -0700, Sean Christopherson wrote:
> On Thu, Jul 02, 2020 at 08:44:55PM +0300, Maxim Levitsky wrote:
> > There are few cases when this function was creating a bogus #GP condition,
> > for example case when and AMD host supports STIBP but doesn't support SSBD.
> > 
> > Follow the rules for AMD and Intel strictly instead.
> 
> Can you elaborate on the conditions that are problematic, e.g. what does
> the guest expect to exist that KVM isn't providing?

Hi Sean Christopherson!
Sorry that I haven't explained the issue here.
I explained it in bugzilla I opened in details and forgot to explain it
in the commit message.
https://bugzilla.redhat.com/show_bug.cgi?id=1853447
 
 
The issue is that on my cpu (3970X), it does not support IBRS,
but it does support STIBP, and thus guest gets the STIBP cpuid bits enabled
(both the amd one and KVM also enables the intel's cpuid bit for this feature).
 
Then, when guest actually starts to use STIBP, it gets #GP because both of these conditions
potentially don't allow STIBP bit to be set when IBRS is not supported:
 
	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
	    !boot_cpu_has(X86_FEATURE_AMD_IBRS))
		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
 
Most likely it fails on the second condition, since X86_FEATURE_SPEC_CTRL
is enabled in the guest because the host does support IBPB, which X86_FEATURE_SPEC_CTRL also covers.
 
But the host doesn't support X86_FEATURE_SPEC_CTRL and it doesn't support X86_FEATURE_AMD_IBRS
thus second condition clears the SPEC_CTRL_STIBP wrongly.
 
Now in addition to that, I long ago had known that win10 guests on my machine started to
crash when qemu added ability to pass through X86_FEATURE_AMD_IBRS.
I haven't paid much attention to that other than bisecting this and adding '-amd-stibp' to my cpu flags.
 
I did notice that I am not the only one to have that issue, for example
https://www.reddit.com/r/VFIO/comments/gf53o8/upgrading_to_qemu_5_broke_my_setup_windows_bsods/
https://forum.level1techs.com/t/amd-fix-for-host-passthrough-on-qemu-5-0-0-kernel-5-6/157710
 
Now after I debugged this issue in Linux, it occurred to me that this might be the same issue as in Windows,
and indeed it is. The only difference is that Windows doesn't start to play with STIBP when Intel
specific cpuid bit is set on AMD machine (which KVM sets for long time) but starts to enable it when AMD specific
bit is set, that is X86_FEATURE_AMD_IBRS, the bit that qemu recently started to set and it gets the same #GP and crashes.
 
From findings on my machine, if we cross-reference this with the above posts, I can assume that many Ryzens have this configuration 
of no support for IBRS but support STIBP.
In fact I don't see the kernel use IBRS much (it seem only used around firmware calls or so), so it makes sense
that AMD chose to not enable it.
 
For the fix itself,
I can fix this by only changing the above condition, but then I read the AMD whitepaper on
this and they mention that bits in IA32_SPEC_CTRL don't #GP even if not supported,
and to implement this correctly would be too complicated with current logic,
thus I rewrote the logic to be as simple as possible and as close to the official docs as possible
as well.
 

> 
> > AMD #GP rules for IA32_SPEC_CTRL can be found here:
> > https://bugzilla.kernel.org/show_bug.cgi?id=199889
> > 
> > Fixes: 6441fa6178f5 ("KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL")
> > 
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  arch/x86/kvm/x86.c | 57 ++++++++++++++++++++++++++++++++++------------
> >  1 file changed, 42 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 00c88c2f34e4..a6bed4670b7f 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -10670,27 +10670,54 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
> >  
> > -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
> > +
> > +static u64 kvm_spec_ctrl_valid_bits_host(void)
> > +{
> > +	uint64_t bits = 0;
> > +
> > +	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
> > +		bits |= SPEC_CTRL_IBRS;
> > +	if (boot_cpu_has(X86_FEATURE_INTEL_STIBP))
> > +		bits |= SPEC_CTRL_STIBP;
> > +	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
> > +		bits |= SPEC_CTRL_SSBD;
> > +
> > +	if (boot_cpu_has(X86_FEATURE_AMD_IBRS) || boot_cpu_has(X86_FEATURE_AMD_STIBP))
> > +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
> > +
> > +	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;
> > +
> > +	return bits;
> > +}
> 
> Rather than compute the mask every time, it can be computed once on module
> load and stashed in a global.  Note, there's a RFC series[*] to support
> reprobing bugs at runtime, but that has bigger issues with existing KVM
> functionality to be addressed, i.e. it's not our problem, yet :-).
> 
> [*] https://lkml.kernel.org/r/1593703107-8852-1-git-send-email-mihai.carabas@oracle.com

Thanks for the pointer!
 
Note though that the above code only runs once, since after a single successful (non #GP) set
of it to non-zero value, it is cleared in MSR bitmap for both reads and writes on
both VMX and SVM.
This is done because of performance reasons which in this case are more important than absolute correctness.
Thus to some extent the guest checks in the above are pointless.
 
If you ask
me, I would just remove the kvm_spec_ctrl_valid_bits, and pass this msr to guest
right away and not on first access.
 
I talked with Paolo about this and his opinion, if I understand correctly, is that the
above is
a best effort correctness wise since at least we emulate the bits correctly on first access.

> 
> > +
> > +static u64 kvm_spec_ctrl_valid_bits_guest(struct kvm_vcpu *vcpu)
> >  {
> > -	uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
> > +	uint64_t bits = 0;
> >  
> > -	/* The STIBP bit doesn't fault even if it's not advertised */
> > -	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
> > -	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
> > -		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
> > -	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
> > -	    !boot_cpu_has(X86_FEATURE_AMD_IBRS))
> > -		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
> > +	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
> > +		bits |= SPEC_CTRL_IBRS;
> > +	if (guest_cpuid_has(vcpu, X86_FEATURE_INTEL_STIBP))
> > +		bits |= SPEC_CTRL_STIBP;
> > +	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD))
> > +		bits |= SPEC_CTRL_SSBD;
> >  
> > -	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
> > -	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
> > -		bits &= ~SPEC_CTRL_SSBD;
> > -	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
> > -	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
> > -		bits &= ~SPEC_CTRL_SSBD;
> > +	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
> > +			guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP))
> 
> Bad indentation.
True.

> 
> > +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
> > +	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
> > +		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;
> 
> Would it be feasible to split into two patches?  The first (tagged Fixes:)
> to make the functional changes without inverting the logic or splitting, and
> then do the cleanup?  It's really hard to review this patch because I can't
> easily tease out what's different in terms of functionality.

The new logic follows (hopefully) Intel's spec and AMD spec.
I will try to split it though.



> 
> >  	return bits;
> >  }
> > +
> > +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
> > +{
> > +	return kvm_spec_ctrl_valid_bits_host() &
> > +	       kvm_spec_ctrl_valid_bits_guest(vcpu);
> > +}
> > +
> > +
> >  EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
> >  
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> > -- 
> > 2.25.4
> > 

Thanks for the review,
	Best regards,
		Maxim Levitsky
Sean Christopherson July 7, 2020, 6:11 a.m. UTC | #3
On Sun, Jul 05, 2020 at 12:40:25PM +0300, Maxim Levitsky wrote:
> > Rather than compute the mask every time, it can be computed once on module
> > load and stashed in a global.  Note, there's a RFC series[*] to support
> > reprobing bugs at runtime, but that has bigger issues with existing KVM
> > functionality to be addressed, i.e. it's not our problem, yet :-).
> > 
> > [*] https://lkml.kernel.org/r/1593703107-8852-1-git-send-email-mihai.carabas@oracle.com
> 
> Thanks for the pointer!
>  
> Note though that the above code only runs once, since after a single
> successful (non #GP) set of it to non-zero value, it is cleared in MSR bitmap
> for both reads and writes on both VMX and SVM.

For me the performance is secondary to documenting the fact that the host
valid bits are fixed for a given instance of the kernel.  There's enough
going on in kvm_spec_ctrl_valid_bits_host() that it's not super easy to
see that it's a "constant" value.

> This is done because of performance reasons which in this case are more
> important than absolute correctness.  Thus to some extent the guest checks in
> the above are pointless.
>  
> If you ask me, I would just remove the kvm_spec_ctrl_valid_bits, and pass
> this msr to guest right away and not on first access.

That would unnecessarily penalize guests that don't utilize the MSR as KVM
would need to do a RDMSR on every VM-Exit to grab the guest's value.

One oddity with this whole thing is that by passing through the MSR, KVM is
allowing the guest to write bits it doesn't know about, which is definitely
not normal.  It also means the guest could write bits that the host VMM
can't.

Somewhat crazy idea inbound... rather than calculating the valid bits in
software, what if we throw the value at the CPU and see if it fails?  At
least that way the host and guest are subject to the same rules.  E.g.

--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2062,11 +2062,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                        return 1;

-               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
-                       return 1;
-
+               ret = 0;
                vmx->spec_ctrl = data;
-               if (!data)
+
+               local_irq_disable();
+               if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &data))
+                       ret = 1;
+               else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl))
+                       ret = 1;
+               else
+                       wrmsrl(MSR_IA32_SPEC_CTRL, data))
+               local_irq_enable();
+
+               if (ret || !vmx->spec_ctrl)
                        break;

                /*
Paolo Bonzini July 7, 2020, 8:04 a.m. UTC | #4
On 07/07/20 08:11, Sean Christopherson wrote:
> One oddity with this whole thing is that by passing through the MSR, KVM is
> allowing the guest to write bits it doesn't know about, which is definitely
> not normal.  It also means the guest could write bits that the host VMM
> can't.

That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
check is to ensure that host-initiated writes are valid; this way, you
don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
Checking the guest CPUID bit is not even necessary.

Paolo

> Somehwat crazy idea inbound... rather than calculating the valid bits in
> software, what if we throw the value at the CPU and see if it fails?  At
> least that way the host and guest are subject to the same rules.  E.g.
> 
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2062,11 +2062,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>                         return 1;
> 
> -               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
> -                       return 1;
> -
> +               ret = 0;
>                 vmx->spec_ctrl = data;
> -               if (!data)
> +
> +               local_irq_disable();
> +               if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &data))
> +                       ret = 1;
> +               else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl))
> +                       ret = 1;
> +               else
> +                       wrmsrl(MSR_IA32_SPEC_CTRL, data))
> +               local_irq_enable();
> +
> +               if (ret || !vmx->spec_ctrl)
>                         break;
> 
>                 /*
>
Sean Christopherson July 7, 2020, 8:14 a.m. UTC | #5
Aren't you supposed to be on vacation? :-)

On Tue, Jul 07, 2020 at 10:04:22AM +0200, Paolo Bonzini wrote:
> On 07/07/20 08:11, Sean Christopherson wrote:
> > One oddity with this whole thing is that by passing through the MSR, KVM is
> > allowing the guest to write bits it doesn't know about, which is definitely
> > not normal.  It also means the guest could write bits that the host VMM
> > can't.
> 
> That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
> check is to ensure that host-initiated writes are valid; this way, you
> don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
> Checking the guest CPUID bit is not even necessary.

Right, what I'm saying is that rather than try and decipher specs to
determine what bits are supported, just throw the value at hardware and
go from there.  That's effectively what we end up doing for the guest writes
anyways.

Actually, the current behavior will break migration if there are ever legal
bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
allow and then migration fails when the destination tries to stuff the value
into KVM.
Paolo Bonzini July 7, 2020, 8:17 a.m. UTC | #6
On 07/07/20 10:14, Sean Christopherson wrote:
>>> One oddity with this whole thing is that by passing through the MSR, KVM is
>>> allowing the guest to write bits it doesn't know about, which is definitely
>>> not normal.  It also means the guest could write bits that the host VMM
>>> can't.
>> That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
>> check is to ensure that host-initiated writes are valid; this way, you
>> don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
>> Checking the guest CPUID bit is not even necessary.
> Right, what I'm saying is that rather than try and decipher specs to
> determine what bits are supported, just throw the value at hardware and
> go from there.  That's effectively what we end up doing for the guest writes
> anyways.

Yes, it would prevent the #GP.

> Actually, the current behavior will break migration if there are ever legal
> bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
> allow and then migration fails when the destination tries to stuff the value
> into KVM.

Yes, unfortunately migration would also be broken if the target (and the
guest CPUID) is an older CPU.  But that's not something we can fix
without trapping all writes which would be unacceptably slow.

Paolo
Sean Christopherson July 7, 2020, 8:26 a.m. UTC | #7
On Tue, Jul 07, 2020 at 10:17:14AM +0200, Paolo Bonzini wrote:
> On 07/07/20 10:14, Sean Christopherson wrote:
> >>> One oddity with this whole thing is that by passing through the MSR, KVM is
> >>> allowing the guest to write bits it doesn't know about, which is definitely
> >>> not normal.  It also means the guest could write bits that the host VMM
> >>> can't.
> >> That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
> >> check is to ensure that host-initiated writes are valid; this way, you
> >> don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
> >> Checking the guest CPUID bit is not even necessary.
> > Right, what I'm saying is that rather than try and decipher specs to
> > determine what bits are supported, just throw the value at hardware and
> > go from there.  That's effectively what we end up doing for the guest writes
> > anyways.
> 
> Yes, it would prevent the #GP.
> 
> > Actually, the current behavior will break migration if there are ever legal
> > bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
> > allow and then migration fails when the destination tries to stuff the value
> > into KVM.
> 
> Yes, unfortunately migration would also be broken if the target (and the
> guest CPUID) is an older CPU.  But that's not something we can fix
> without trapping all writes which would be unacceptably slow.

Ah, true, the guest would need to be setting bits that weren't enumerated
to it.
Wanpeng Li July 7, 2020, 8:56 a.m. UTC | #8
On Tue, 7 Jul 2020 at 16:15, Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> Aren't you supposed to be on vacation? :-)

A long vacation, enjoy!

>
> On Tue, Jul 07, 2020 at 10:04:22AM +0200, Paolo Bonzini wrote:
> > On 07/07/20 08:11, Sean Christopherson wrote:
> > > One oddity with this whole thing is that by passing through the MSR, KVM is
> > > allowing the guest to write bits it doesn't know about, which is definitely
> > > not normal.  It also means the guest could write bits that the host VMM
> > > can't.
> >
> > That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
> > check is to ensure that host-initiated writes are valid; this way, you
> > don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
> > Checking the guest CPUID bit is not even necessary.
>
> Right, what I'm saying is that rather than try and decipher specs to
> determine what bits are supported, just throw the value at hardware and
> go from there.  That's effectively what we end up doing for the guest writes
> anyways.
>
> Actually, the current behavior will break migration if there are ever legal
> bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
> allow and then migration fails when the destination tries to stuff the value
> into KVM.
Maxim Levitsky July 7, 2020, 11:30 a.m. UTC | #9
On Mon, 2020-07-06 at 23:11 -0700, Sean Christopherson wrote:
> On Sun, Jul 05, 2020 at 12:40:25PM +0300, Maxim Levitsky wrote:
> > > Rather than compute the mask every time, it can be computed once on module
> > > load and stashed in a global.  Note, there's a RFC series[*] to support
> > > reprobing bugs at runtime, but that has bigger issues with existing KVM
> > > functionality to be addressed, i.e. it's not our problem, yet :-).
> > > 
> > > [*] https://lkml.kernel.org/r/1593703107-8852-1-git-send-email-mihai.carabas@oracle.com
> > 
> > Thanks for the pointer!
> >  
> > Note though that the above code only runs once, since after a single
> > successful (non #GP) set of it to non-zero value, it is cleared in MSR bitmap
> > for both reads and writes on both VMX and SVM.
> 
> For me the performance is secondary to documenting the fact that the host
> valid bits are fixed for a given instance of the kernel.  There's enough
> going on in kvm_spec_ctrl_valid_bits_host() that's it's not super easy to
> see that it's a "constant" value.
> 
> > This is done because of performance reasons which in this case are more
> > important than absolute correctness.  Thus to some extent the guest checks in
> > the above are pointless.
> >  
> > If you ask me, I would just remove the kvm_spec_ctrl_valid_bits, and pass
> > this msr to guest right away and not on first access.
> 
> That would unnecessarily penalize guests that don't utilize the MSR as KVM
> would need to do a RDMSR on every VM-Exit to grab the guest's value.
I haven't thought about this, this makes sense.

> 
> One oddity with this whole thing is that by passing through the MSR, KVM is
> allowing the guest to write bits it doesn't know about, which is definitely
> not normal.  It also means the guest could write bits that the host VMM
> can't.
> 
> Somehwat crazy idea inbound... rather than calculating the valid bits in
> software, what if we throw the value at the CPU and see if it fails?  At
> least that way the host and guest are subject to the same rules.  E.g.
> 
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -2062,11 +2062,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>                         return 1;
> 
> -               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
> -                       return 1;
> -
> +               ret = 0;
>                 vmx->spec_ctrl = data;
> -               if (!data)
> +
> +               local_irq_disable();
> +               if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &data))
> +                       ret = 1;
> +               else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl))
> +                       ret = 1;
> +               else
> +                       wrmsrl(MSR_IA32_SPEC_CTRL, data))
> +               local_irq_enable();
> +
> +               if (ret || !vmx->spec_ctrl)
>                         break;
> 
>                 /*
> 
I don't mind this as well, knowing that this is done only one per VM run anyway.

Best regards,
	Maxim Levitsky
Maxim Levitsky July 7, 2020, 11:35 a.m. UTC | #10
On Tue, 2020-07-07 at 01:14 -0700, Sean Christopherson wrote:
> Aren't you supposed to be on vacation? :-)
> 
> On Tue, Jul 07, 2020 at 10:04:22AM +0200, Paolo Bonzini wrote:
> > On 07/07/20 08:11, Sean Christopherson wrote:
> > > One oddity with this whole thing is that by passing through the MSR, KVM is
> > > allowing the guest to write bits it doesn't know about, which is definitely
> > > not normal.  It also means the guest could write bits that the host VMM
> > > can't.
> > 
> > That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
> > check is to ensure that host-initiated writes are valid; this way, you
> > don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
> > Checking the guest CPUID bit is not even necessary.
> 
> Right, what I'm saying is that rather than try and decipher specs to
> determine what bits are supported, just throw the value at hardware and
> go from there.  That's effectively what we end up doing for the guest writes
> anyways.
> 
> Actually, the current behavior will break migration if there are ever legal
> bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
> allow and then migration fails when the destination tries to stuff the value
> into KVM.

After thinking about this, I am thinking that we should apply similar logic
as done with the 'cpu-pm' related features.
This way the user can choose between passing through the IA32_SPEC_CTRL,
(and in this case, we can since the user choose it, pass it right away, and thus
avoid using kvm_spec_ctrl_valid_bits completely), and between correctness,
in which case we can always emulate this msr, and therefore check all the bits,
both regard to guest and host supported values.
Does this makes sense, or do you think that this is overkill?

One thing for sure, we currently have a bug about wrong #GP in case STIBP is supported,
but IBRS isn't. I don't mind fixing it in any way that all of you agree upon.

Best regards,
	Maxim Levitsky
Paolo Bonzini July 7, 2020, 5:26 p.m. UTC | #11
On 07/07/20 13:35, Maxim Levitsky wrote:
> After thinking about this, I am thinking that we should apply similiar logic
> as done with the 'cpu-pm' related features.
> This way the user can choose between passing through the IA32_SPEC_CTRL,
> (and in this case, we can since the user choose it, pass it right away, and thus
> avoid using kvm_spec_ctrl_valid_bits completely), and between correctness,
> in which case we can always emulate this msr, and therefore check all the bits,
> both regard to guest and host supported values.

Unfortunately, passing it through is just too slow.  So I think it's
overkill.  There's two ways to deal with badly-behaved guests blocking
migration: 1) hide SPEC_CTRL altogether 2) kill them when migration
fails; both are acceptable depending on the situation.

Paolo

> Does this makes sense, or do you think that this is overkill?
Paolo Bonzini July 7, 2020, 5:26 p.m. UTC | #12
On 07/07/20 13:30, Maxim Levitsky wrote:
>> Somehwat crazy idea inbound... rather than calculating the valid bits in
>> software, what if we throw the value at the CPU and see if it fails?  At
>> least that way the host and guest are subject to the same rules.  E.g.
>>
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -2062,11 +2062,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>>                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
>>                         return 1;
>>
>> -               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
>> -                       return 1;
>> -
>> +               ret = 0;
>>                 vmx->spec_ctrl = data;
>> -               if (!data)
>> +
>> +               local_irq_disable();
>> +               if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &data))
>> +                       ret = 1;
>> +               else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl))
>> +                       ret = 1;
>> +               else
>> +                       wrmsrl(MSR_IA32_SPEC_CTRL, data))
>> +               local_irq_enable();
>> +
>> +               if (ret || !vmx->spec_ctrl)
>>                         break;
>>
>>                 /*
>>
> I don't mind this as well, knowing that this is done only one per VM run anyway.

Maxim, this is okay as well; can you send a patch for it?

Paolo
Sean Christopherson July 7, 2020, 5:27 p.m. UTC | #13
On Tue, Jul 07, 2020 at 02:35:59PM +0300, Maxim Levitsky wrote:
> On Tue, 2020-07-07 at 01:14 -0700, Sean Christopherson wrote:
> > Aren't you supposed to be on vacation? :-)
> > 
> > On Tue, Jul 07, 2020 at 10:04:22AM +0200, Paolo Bonzini wrote:
> > > On 07/07/20 08:11, Sean Christopherson wrote:
> > > > One oddity with this whole thing is that by passing through the MSR, KVM is
> > > > allowing the guest to write bits it doesn't know about, which is definitely
> > > > not normal.  It also means the guest could write bits that the host VMM
> > > > can't.
> > > 
> > > That's true.  However, the main purpose of the kvm_spec_ctrl_valid_bits
> > > check is to ensure that host-initiated writes are valid; this way, you
> > > don't get a #GP on the next vmentry's WRMSR to MSR_IA32_SPEC_CTRL.
> > > Checking the guest CPUID bit is not even necessary.
> > 
> > Right, what I'm saying is that rather than try and decipher specs to
> > determine what bits are supported, just throw the value at hardware and
> > go from there.  That's effectively what we end up doing for the guest writes
> > anyways.
> > 
> > Actually, the current behavior will break migration if there are ever legal
> > bits that KVM doesn't recognize, e.g. guest writes a value that KVM doesn't
> > allow and then migration fails when the destination tries to stuff the value
> > into KVM.
> 
> After thinking about this, I am thinking that we should apply similiar logic
> as done with the 'cpu-pm' related features.
> This way the user can choose between passing through the IA32_SPEC_CTRL,
> (and in this case, we can since the user choose it, pass it right away, and thus
> avoid using kvm_spec_ctrl_valid_bits completely), and between correctness,
> in which case we can always emulate this msr, and therefore check all the bits,
> both regard to guest and host supported values.
> Does this makes sense, or do you think that this is overkill?

It doesn't really work because the host doesn't have a priori knowledge of
whether or not the guest will use IA32_SPEC_CTRL.  For PM stuff, there's no
meaningful overhead in exposing capabilities and the features more or less
ubiquitous, i.e. odds are very good the guest will use the exposed features
and there's no penalty if it doesn't.

Patch
diff mbox series

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..a6bed4670b7f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10670,27 +10670,54 @@  bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
 
-u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+
+static u64 kvm_spec_ctrl_valid_bits_host(void)
+{
+	uint64_t bits = 0;
+
+	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+		bits |= SPEC_CTRL_IBRS;
+	if (boot_cpu_has(X86_FEATURE_INTEL_STIBP))
+		bits |= SPEC_CTRL_STIBP;
+	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+		bits |= SPEC_CTRL_SSBD;
+
+	if (boot_cpu_has(X86_FEATURE_AMD_IBRS) || boot_cpu_has(X86_FEATURE_AMD_STIBP))
+		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
+
+	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;
+
+	return bits;
+}
+
+static u64 kvm_spec_ctrl_valid_bits_guest(struct kvm_vcpu *vcpu)
 {
-	uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+	uint64_t bits = 0;
 
-	/* The STIBP bit doesn't fault even if it's not advertised */
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
-		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
-	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
-	    !boot_cpu_has(X86_FEATURE_AMD_IBRS))
-		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+		bits |= SPEC_CTRL_IBRS;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_INTEL_STIBP))
+		bits |= SPEC_CTRL_STIBP;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD))
+		bits |= SPEC_CTRL_SSBD;
 
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
-	    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
-		bits &= ~SPEC_CTRL_SSBD;
-	if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
-	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
-		bits &= ~SPEC_CTRL_SSBD;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+			guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP))
+		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+		bits |= SPEC_CTRL_STIBP | SPEC_CTRL_IBRS | SPEC_CTRL_SSBD;
 
 	return bits;
 }
+
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+{
+	return kvm_spec_ctrl_valid_bits_host() &
+	       kvm_spec_ctrl_valid_bits_guest(vcpu);
+}
+
+
 EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);