[v3,7/9] KVM: vmx/pmu: Add Arch LBR emulation and its VMCS field
diff mbox series

Message ID 20210303135756.1546253-8-like.xu@linux.intel.com
State New, archived
Headers show
Series
  • KVM: x86/pmu: Guest Architectural LBR Enabling
Related show

Commit Message

Like Xu March 3, 2021, 1:57 p.m. UTC
New VMX controls bits for Arch LBR are added. When bit 21 in vmentry_ctrl
is set, VM entry will write the value from the "Guest IA32_LBR_CTL" guest
state field to IA32_LBR_CTL. When bit 26 in vmexit_ctrl is set, VM exit
will clear IA32_LBR_CTL after the value has been saved to the "Guest
IA32_LBR_CTL" guest state field.

To enable guest Arch LBR, KVM should set both the "Load Guest IA32_LBR_CTL"
entry control and the "Clear IA32_LBR_CTL" exit control bits. If these two
conditions cannot be met, KVM will clear the LBR_FMT bits and will not
expose the Arch LBR feature.

If Arch LBR is exposed on KVM, the guest should set both the ARCH_LBR CPUID
and the same LBR_FMT value as the host via MSR_IA32_PERF_CAPABILITIES to
enable guest Arch LBR.

KVM will bypass the host/guest x86 cpu model check and the records msrs can
still be pass-through to guest as usual and work like a model-specific LBR.
KVM is consistent with the host and does not support the LER entry.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
---
 arch/x86/include/asm/vmx.h      |  2 ++
 arch/x86/kvm/vmx/capabilities.h | 25 +++++++++++++++++--------
 arch/x86/kvm/vmx/pmu_intel.c    | 27 ++++++++++++++++++++++-----
 arch/x86/kvm/vmx/vmx.c          |  9 +++++++--
 4 files changed, 48 insertions(+), 15 deletions(-)

Comments

Sean Christopherson March 3, 2021, 5:26 p.m. UTC | #1
On Wed, Mar 03, 2021, Like Xu wrote:
> New VMX controls bits for Arch LBR are added. When bit 21 in vmentry_ctrl
> is set, VM entry will write the value from the "Guest IA32_LBR_CTL" guest
> state field to IA32_LBR_CTL. When bit 26 in vmexit_ctrl is set, VM exit
> will clear IA32_LBR_CTL after the value has been saved to the "Guest
> IA32_LBR_CTL" guest state field.

...

> @@ -2529,7 +2532,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  	      VM_EXIT_LOAD_IA32_EFER |
>  	      VM_EXIT_CLEAR_BNDCFGS |
>  	      VM_EXIT_PT_CONCEAL_PIP |
> -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
> +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
> +	      VM_EXIT_CLEAR_IA32_LBR_CTL;

So, how does MSR_ARCH_LBR_CTL get restored on the host?  What if the host wants
to keep _its_ LBR recording active while the guest is running?

>  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>  				&_vmexit_control) < 0)
>  		return -EIO;
> @@ -2553,7 +2557,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>  	      VM_ENTRY_LOAD_IA32_EFER |
>  	      VM_ENTRY_LOAD_BNDCFGS |
>  	      VM_ENTRY_PT_CONCEAL_PIP |
> -	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
> +	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
> +	      VM_ENTRY_LOAD_IA32_LBR_CTL;
>  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
>  				&_vmentry_control) < 0)
>  		return -EIO;
> -- 
> 2.29.2
>
Xu, Like March 4, 2021, 3:02 a.m. UTC | #2
On 2021/3/4 1:26, Sean Christopherson wrote:
> On Wed, Mar 03, 2021, Like Xu wrote:
>> New VMX controls bits for Arch LBR are added. When bit 21 in vmentry_ctrl
>> is set, VM entry will write the value from the "Guest IA32_LBR_CTL" guest
>> state field to IA32_LBR_CTL. When bit 26 in vmexit_ctrl is set, VM exit
>> will clear IA32_LBR_CTL after the value has been saved to the "Guest
>> IA32_LBR_CTL" guest state field.
> ...
>
>> @@ -2529,7 +2532,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>   	      VM_EXIT_LOAD_IA32_EFER |
>>   	      VM_EXIT_CLEAR_BNDCFGS |
>>   	      VM_EXIT_PT_CONCEAL_PIP |
>> -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
>> +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
>> +	      VM_EXIT_CLEAR_IA32_LBR_CTL;
> So, how does MSR_ARCH_LBR_CTL get restored on the host?  What if the host wants
> to keep _its_ LBR recording active while the guest is running?

Thank you!

I will add "host_lbrctlmsr" field to "struct vcpu_vmx" and
repeat the update/get_debugctlmsr() stuff.

>>   	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
>>   				&_vmexit_control) < 0)
>>   		return -EIO;
>> @@ -2553,7 +2557,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>   	      VM_ENTRY_LOAD_IA32_EFER |
>>   	      VM_ENTRY_LOAD_BNDCFGS |
>>   	      VM_ENTRY_PT_CONCEAL_PIP |
>> -	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
>> +	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
>> +	      VM_ENTRY_LOAD_IA32_LBR_CTL;
>>   	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
>>   				&_vmentry_control) < 0)
>>   		return -EIO;
>> -- 
>> 2.29.2
>>
Sean Christopherson March 4, 2021, 5:23 p.m. UTC | #3
On Thu, Mar 04, 2021, Xu, Like wrote:
> On 2021/3/4 1:26, Sean Christopherson wrote:
> > On Wed, Mar 03, 2021, Like Xu wrote:
> > > New VMX controls bits for Arch LBR are added. When bit 21 in vmentry_ctrl
> > > is set, VM entry will write the value from the "Guest IA32_LBR_CTL" guest
> > > state field to IA32_LBR_CTL. When bit 26 in vmexit_ctrl is set, VM exit
> > > will clear IA32_LBR_CTL after the value has been saved to the "Guest
> > > IA32_LBR_CTL" guest state field.
> > ...
> > 
> > > @@ -2529,7 +2532,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
> > >   	      VM_EXIT_LOAD_IA32_EFER |
> > >   	      VM_EXIT_CLEAR_BNDCFGS |
> > >   	      VM_EXIT_PT_CONCEAL_PIP |
> > > -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
> > > +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
> > > +	      VM_EXIT_CLEAR_IA32_LBR_CTL;
> > So, how does MSR_ARCH_LBR_CTL get restored on the host?  What if the host wants
> > to keep _its_ LBR recording active while the guest is running?
> 
> Thank you!
> 
> I will add "host_lbrctlmsr" field to "struct vcpu_vmx" and
> repeat the update/get_debugctlmsr() stuff.

I am not remotely confident that tracking LBRCTL via vcpu_vmx is correct, and
I'm far less confident that the existing DEBUGCTL logic is correct.  As Jim
pointed out[*], intel_pmu_handle_irq() can run at any time, and it's not at all
clear to me that the DEBUGCTL coming out of the NMI handler is guaranteed to be
the same value going in.  Ditto for LBRCTL.

Actually, NMIs aside, KVM's DEBUGCTL handling is provably broken since writing
/sys/devices/cpu/freeze_on_smi is propagated to other CPUs via IRQ, and KVM
snapshots DEBUCTL on vCPU load, i.e. runs with IRQs enabled long after grabbing
the value.

  WARNING: CPU: 5 PID: 0 at arch/x86/events/intel/core.c:4066 flip_smm_bit+0xb/0x30
  RIP: 0010:flip_smm_bit+0xb/0x30
  Call Trace:
   <IRQ>
   flush_smp_call_function_queue+0x118/0x1a0
   __sysvec_call_function+0x2c/0x90
   asm_call_irq_on_stack+0x12/0x20
   </IRQ>


So, rather than pile on more MSR handling that is at best dubious, and at worst
broken, I would like to see KVM properly integrate with perf to ensure KVM
restores the correct, fresh values of all MSRs that are owned by perf.  Or at
least add something that guarantees that intel_pmu_handle_irq() preserves the
MSRs.  As is, it's impossible to review these KVM changes without deep, deep
knowledge of what perf is doing.

https://lkml.kernel.org/r/20210209225653.1393771-1-jmattson@google.com
Xu, Like March 5, 2021, 6:35 a.m. UTC | #4
On 2021/3/5 1:23, Sean Christopherson wrote:
> On Thu, Mar 04, 2021, Xu, Like wrote:
>> On 2021/3/4 1:26, Sean Christopherson wrote:
>>> On Wed, Mar 03, 2021, Like Xu wrote:
>>>> New VMX controls bits for Arch LBR are added. When bit 21 in vmentry_ctrl
>>>> is set, VM entry will write the value from the "Guest IA32_LBR_CTL" guest
>>>> state field to IA32_LBR_CTL. When bit 26 in vmexit_ctrl is set, VM exit
>>>> will clear IA32_LBR_CTL after the value has been saved to the "Guest
>>>> IA32_LBR_CTL" guest state field.
>>> ...
>>>
>>>> @@ -2529,7 +2532,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
>>>>    	      VM_EXIT_LOAD_IA32_EFER |
>>>>    	      VM_EXIT_CLEAR_BNDCFGS |
>>>>    	      VM_EXIT_PT_CONCEAL_PIP |
>>>> -	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
>>>> +	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
>>>> +	      VM_EXIT_CLEAR_IA32_LBR_CTL;
>>> So, how does MSR_ARCH_LBR_CTL get restored on the host?  What if the host wants
>>> to keep _its_ LBR recording active while the guest is running?
>> Thank you!
>>
>> I will add "host_lbrctlmsr" field to "struct vcpu_vmx" and
>> repeat the update/get_debugctlmsr() stuff.
> I am not remotely confident that tracking LBRCTL via vcpu_vmx is correct, and
> I'm far less confident that the existing DEBUGCTL logic is correct.  As Jim
> pointed out[*], intel_pmu_handle_irq() can run at any time, and it's not at all
> clear to me that the DEBUGCTL coming out of the NMI handler is guaranteed to be
> the same value going in.  Ditto for LBRCTL.

It's not true for "Ditto for LBRCTL".

Because the usage of ARCH_LBR_CTL is specified for LBR,
not the shared case of DEBUGCTL. And all LBR events created from
KVM or host perf syscall are all under the control of host perf subsystem.

The irq handler would restore the original value of the ARCH_LBR_CTL
even it's called after the KVM snapshots DEBUCTL on vCPU load.
The change is transparent to the update_lbrctlmsr() and get_lbrctlmsr().

> Actually, NMIs aside, KVM's DEBUGCTL handling is provably broken since writing
> /sys/devices/cpu/freeze_on_smi is propagated to other CPUs via IRQ, and KVM
> snapshots DEBUCTL on vCPU load, i.e. runs with IRQs enabled long after grabbing
> the value.
>
>    WARNING: CPU: 5 PID: 0 at arch/x86/events/intel/core.c:4066 flip_smm_bit+0xb/0x30
>    RIP: 0010:flip_smm_bit+0xb/0x30
>    Call Trace:
>     <IRQ>
>     flush_smp_call_function_queue+0x118/0x1a0
>     __sysvec_call_function+0x2c/0x90
>     asm_call_irq_on_stack+0x12/0x20
>     </IRQ>

This kind of bug with the keyword "flip_smm_bit" did not appear on the 
mailing list.
Would you mind to share testcases or more details about the steps to 
reproduce ?

>
> So, rather than pile on more MSR handling that is at best dubious, and at worst
> broken, I would like to see KVM properly integrate with perf to ensure KVM
> restores the correct, fresh values of all MSRs that are owned by perf.  Or at
> least add something that guarantees that intel_pmu_handle_irq() preserves the
> MSRs.  As is, it's impossible to review these KVM changes without deep, deep
> knowledge of what perf is doing.

Jim complained more about the inconsistent maintenance of
MSR_IA32_PEBS_ENABLE between KVM and perf subsystem.

The issue bothers the host due to the subsystem integration,
but the guest's use of PBES will be safe and reliable.

We could cover more details in the guest PEBS enabling thread.

>
> https://lkml.kernel.org/r/20210209225653.1393771-1-jmattson@google.com

Patch
diff mbox series

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 8ec7bc24b37a..c8186ec46fca 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -95,6 +95,7 @@ 
 #define VM_EXIT_CLEAR_BNDCFGS                   0x00800000
 #define VM_EXIT_PT_CONCEAL_PIP			0x01000000
 #define VM_EXIT_CLEAR_IA32_RTIT_CTL		0x02000000
+#define VM_EXIT_CLEAR_IA32_LBR_CTL		0x04000000
 
 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR	0x00036dff
 
@@ -108,6 +109,7 @@ 
 #define VM_ENTRY_LOAD_BNDCFGS                   0x00010000
 #define VM_ENTRY_PT_CONCEAL_PIP			0x00020000
 #define VM_ENTRY_LOAD_IA32_RTIT_CTL		0x00040000
+#define VM_ENTRY_LOAD_IA32_LBR_CTL			0x00200000
 
 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR	0x000011ff
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d1d77985e889..73fceb534c7c 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -378,20 +378,29 @@  static inline bool vmx_pt_mode_is_host_guest(void)
 	return pt_mode == PT_MODE_HOST_GUEST;
 }
 
-static inline u64 vmx_get_perf_capabilities(void)
+static inline bool cpu_has_vmx_arch_lbr(void)
 {
-	u64 perf_cap = 0;
-
-	if (boot_cpu_has(X86_FEATURE_PDCM))
-		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
-
-	perf_cap &= PMU_CAP_LBR_FMT;
+	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_LBR_CTL) &&
+		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_LBR_CTL);
+}
 
+static inline u64 vmx_get_perf_capabilities(void)
+{
 	/*
 	 * Since counters are virtualized, KVM would support full
 	 * width counting unconditionally, even if the host lacks it.
 	 */
-	return PMU_CAP_FW_WRITES | perf_cap;
+	u64 perf_cap = PMU_CAP_FW_WRITES;
+	u64 host_perf_cap = 0;
+
+	if (boot_cpu_has(X86_FEATURE_PDCM))
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+	perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+	if (boot_cpu_has(X86_FEATURE_ARCH_LBR) && !cpu_has_vmx_arch_lbr())
+		perf_cap &= ~PMU_CAP_LBR_FMT;
+
+	return perf_cap;
 }
 
 static inline u64 vmx_supported_debugctl(void)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index d14a14eb712d..48a817be60ab 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -176,12 +176,16 @@  static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
 
 bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
 {
+	if (kvm_cpu_cap_has(X86_FEATURE_ARCH_LBR))
+		return guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR);
+
 	/*
 	 * As a first step, a guest could only enable LBR feature if its
 	 * cpu model is the same as the host because the LBR registers
 	 * would be pass-through to the guest and they're model specific.
 	 */
-	return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+	return !boot_cpu_has(X86_FEATURE_ARCH_LBR) &&
+		boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
 }
 
 bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
@@ -199,8 +203,11 @@  static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
 	if (!intel_pmu_lbr_is_enabled(vcpu))
 		return ret;
 
-	ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
-		(index >= records->from && index < records->from + records->nr) ||
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
+		ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS);
+
+	if (!ret)
+		ret = (index >= records->from && index < records->from + records->nr) ||
 		(index >= records->to && index < records->to + records->nr);
 
 	if (!ret && records->info)
@@ -692,6 +699,9 @@  static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
 			vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
 	}
 
+	if (guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR))
+		return;
+
 	vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
 	vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
 }
@@ -735,10 +745,13 @@  void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+	bool lbr_enable = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR) ?
+		(vmcs_read64(GUEST_IA32_LBR_CTL) & ARCH_LBR_CTL_LBREN) :
+		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR);
 
 	if (!lbr_desc->event) {
 		vmx_disable_lbr_msrs_passthrough(vcpu);
-		if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+		if (lbr_enable)
 			goto warn;
 		if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
 			goto warn;
@@ -761,7 +774,11 @@  void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
 
 static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
 {
-	if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+	bool lbr_enable = guest_cpuid_has(vcpu, X86_FEATURE_ARCH_LBR) ?
+		(vmcs_read64(GUEST_IA32_LBR_CTL) & ARCH_LBR_CTL_LBREN) :
+		(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR);
+
+	if (!lbr_enable)
 		intel_pmu_release_guest_lbr_event(vcpu);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a0660b9934c6..2f307689a14b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -666,6 +666,9 @@  static bool is_valid_passthrough_msr(u32 msr)
 	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
 	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
 	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+	case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
+	case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
+	case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
 		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
 		return true;
 	}
@@ -2529,7 +2532,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_EXIT_LOAD_IA32_EFER |
 	      VM_EXIT_CLEAR_BNDCFGS |
 	      VM_EXIT_PT_CONCEAL_PIP |
-	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
+	      VM_EXIT_CLEAR_IA32_RTIT_CTL |
+	      VM_EXIT_CLEAR_IA32_LBR_CTL;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -2553,7 +2557,8 @@  static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 	      VM_ENTRY_LOAD_IA32_EFER |
 	      VM_ENTRY_LOAD_BNDCFGS |
 	      VM_ENTRY_PT_CONCEAL_PIP |
-	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
+	      VM_ENTRY_LOAD_IA32_RTIT_CTL |
+	      VM_ENTRY_LOAD_IA32_LBR_CTL;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;