* [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
@ 2013-09-04 15:22 Arthur Chunqi Li
  2013-09-05  7:45 ` Zhang, Yang Z
  0 siblings, 1 reply; 8+ messages in thread
From: Arthur Chunqi Li @ 2013-09-04 15:22 UTC (permalink / raw)
  To: kvm; +Cc: jan.kiszka, gleb, pbonzini, Arthur Chunqi Li

This patch contains the following two changes:
1. Fix a bug in nested preemption timer support: if a vmexit L2->L0
occurs for a reason that is not emulated by L1, the preemption timer
value should be saved across such exits.
2. Add support for the "Save VMX-preemption timer value" VM-Exit
control to nVMX.

With this patch, the nested VMX preemption timer feature is fully
supported.

Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
---
This series depends on queue.

 arch/x86/include/uapi/asm/msr-index.h |    1 +
 arch/x86/kvm/vmx.c                    |   51 ++++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index bb04650..b93e09a 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -536,6 +536,7 @@
 
 /* MSR_IA32_VMX_MISC bits */
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1f1da43..870caa8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2204,7 +2204,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #ifdef CONFIG_X86_64
 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
+		nested_vmx_exit_ctls_high &=
+			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+	if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+		nested_vmx_pinbased_ctls_high &=
+			(~PIN_BASED_VMX_PREEMPTION_TIMER);
 	nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 				      VM_EXIT_LOAD_IA32_EFER);
 
@@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
+static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	u64 delta_tsc_l1;
+	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
+
+	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
+			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
+	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
+			native_read_tsc()) - vcpu->arch.last_guest_tsc;
+	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
+	if (preempt_val_l2 - preempt_val_l1 < 0)
+		preempt_val_l2 = 0;
+	else
+		preempt_val_l2 -= preempt_val_l1;
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
+}
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
+	int ret;
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required)
@@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
-		return kvm_vmx_exit_handlers[exit_reason](vcpu);
+		ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
 	else {
 		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
 		vcpu->run->hw.hardware_exit_reason = exit_reason;
+		ret = 0;
 	}
-	return 0;
+	if (is_guest_mode(vcpu))
+		nested_adjust_preemption_timer(vcpu);
+	return ret;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
+	u32 exit_control;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 	 * bits are further modified by vmx_set_efer() below.
 	 */
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+	exit_control = vmcs_config.vmexit_ctrl;
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
@@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+	if (vmcs12->pin_based_vm_exec_control &
+			PIN_BASED_VMX_PREEMPTION_TIMER) {
+		if (vmcs12->vm_exit_controls &
+				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+			vmcs12->vmx_preemption_timer_value =
+				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+		else
+			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+					vmcs12->vmx_preemption_timer_value);
+	}
+
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
 	 * own CR3 without exiting. If it has changed it, we must keep it.
-- 
1.7.9.5
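
For reference, the arithmetic in nested_adjust_preemption_timer() above
boils down to the following toy calculation. It is a standalone
user-space sketch with made-up numbers (the scale value and the TSC
delta are assumptions for illustration only); the real code reads the
scale from IA32_VMX_MISC bits 4:0 and the delta from the TSC.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t scale = 5;             /* pretend IA32_VMX_MISC[4:0] == 5 */
	uint64_t delta_tsc_l1 = 100000; /* TSC cycles spent in L0 handling the exit */
	uint32_t val_l2 = 4000;         /* timer value saved at the L2->L0 exit */

	/* the preemption timer ticks once every 2^scale TSC cycles */
	uint32_t spent = (uint32_t)(delta_tsc_l1 >> scale);
	uint32_t new_val = (val_l2 > spent) ? val_l2 - spent : 0;

	printf("timer ticks consumed in L0: %u\n", spent);   /* 3125 */
	printf("value written back for L2: %u\n", new_val);  /* 875 */
	return 0;
}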



* RE: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-04 15:22 [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer Arthur Chunqi Li
@ 2013-09-05  7:45 ` Zhang, Yang Z
  2013-09-05  8:47   ` Arthur Chunqi Li
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang, Yang Z @ 2013-09-05  7:45 UTC (permalink / raw)
  To: Arthur Chunqi Li, kvm; +Cc: jan.kiszka, gleb, pbonzini

Arthur Chunqi Li wrote on 2013-09-04:
> This patch contains the following two changes:
> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0 with some
> reasons not emulated by L1, preemption timer value should be save in such
> exits.
> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls to
> nVMX.
> 
> With this patch, nested VMX preemption timer features are fully supported.
> 
> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> ---
> This series depends on queue.
> 
>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>  arch/x86/kvm/vmx.c                    |   51
> ++++++++++++++++++++++++++++++---
>  2 files changed, 48 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/msr-index.h
> b/arch/x86/include/uapi/asm/msr-index.h
> index bb04650..b93e09a 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -536,6 +536,7 @@
> 
>  /* MSR_IA32_VMX_MISC bits */
>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>  /* AMD-V MSRs */
> 
>  #define MSR_VM_CR                       0xc0010114
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f1da43..870caa8
> 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2204,7 +2204,14 @@ static __init void
> nested_vmx_setup_ctls_msrs(void)  #ifdef CONFIG_X86_64
>  		VM_EXIT_HOST_ADDR_SPACE_SIZE |
>  #endif
> -		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> +		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> +		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	if (!(nested_vmx_pinbased_ctls_high &
> PIN_BASED_VMX_PREEMPTION_TIMER))
> +		nested_vmx_exit_ctls_high &=
> +			(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> +	if (!(nested_vmx_exit_ctls_high &
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> +		nested_vmx_pinbased_ctls_high &=
> +			(~PIN_BASED_VMX_PREEMPTION_TIMER);
The following logic is clearer:
if (nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER)
	nested_vmx_exit_ctls_high |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;

BTW: I don't see nested_vmx_setup_ctls_msrs() considering the hardware's capability when exposing those VMX features (not just the preemption timer) to L1.

>  	nested_vmx_exit_ctls_high |=
> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>  				      VM_EXIT_LOAD_IA32_EFER);
> 
> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu
> *vcpu, u64 *info1, u64 *info2)
>  	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);  }
> 
> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) {
> +	u64 delta_tsc_l1;
> +	u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> +
> +	preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> +			MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> +	preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +	delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> +			native_read_tsc()) - vcpu->arch.last_guest_tsc;
> +	preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> +	if (preempt_val_l2 - preempt_val_l1 < 0)
> +		preempt_val_l2 = 0;
> +	else
> +		preempt_val_l2 -= preempt_val_l1;
> +	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); }
>  /*
>   * The guest has exited.  See if we can fix it or if we need userspace
>   * assistance.
> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exit_reason = vmx->exit_reason;
>  	u32 vectoring_info = vmx->idt_vectoring_info;
> +	int ret;
> 
>  	/* If guest state is invalid, start emulating */
>  	if (vmx->emulation_required)
> @@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu
> *vcpu)
> 
>  	if (exit_reason < kvm_vmx_max_exit_handlers
>  	    && kvm_vmx_exit_handlers[exit_reason])
> -		return kvm_vmx_exit_handlers[exit_reason](vcpu);
> +		ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
>  	else {
>  		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
>  		vcpu->run->hw.hardware_exit_reason = exit_reason;
> +		ret = 0;
>  	}
> -	return 0;
> +	if (is_guest_mode(vcpu))
> +		nested_adjust_preemption_timer(vcpu);
Move this earlier to avoid the changes needed for ret.
> +	return ret;
>  }
> 
>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@
> -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12)  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	u32 exec_control;
> +	u32 exit_control;
> 
>  	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>  	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@
> -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12)
>  	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>  	 * bits are further modified by vmx_set_efer() below.
>  	 */
> -	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> +	exit_control = vmcs_config.vmexit_ctrl;
> +	if (vmcs12->pin_based_vm_exec_control &
> PIN_BASED_VMX_PREEMPTION_TIMER)
> +		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> +	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
And there could be a problem here if the host doesn't support VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
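A guarded variant would look something like this (a sketch only, reusing the names from the hunk above; checking the advertised nested exit controls is just one possible way to express the guard):

	exit_control = vmcs_config.vmexit_ctrl;
	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    (nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
	vmcs_write32(VM_EXIT_CONTROLS, exit_control);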

> 
>  	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
> are
>  	 * emulated by vmx_set_efer(), below.
> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12)
>  	vmcs12->guest_pending_dbg_exceptions =
>  		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> 
> +	if (vmcs12->pin_based_vm_exec_control &
> +			PIN_BASED_VMX_PREEMPTION_TIMER) {
> +		if (vmcs12->vm_exit_controls &
> +				VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> +			vmcs12->vmx_preemption_timer_value =
> +				vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> +		else
> +			vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> +					vmcs12->vmx_preemption_timer_value);
Why write it to the VMCS directly if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not set?

> +	}
> +
>  	/*
>  	 * In some cases (usually, nested EPT), L2 is allowed to change its
>  	 * own CR3 without exiting. If it has changed it, we must keep it.
> --
> 1.7.9.5
> 

Best regards,
Yang




* Re: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05  7:45 ` Zhang, Yang Z
@ 2013-09-05  8:47   ` Arthur Chunqi Li
  2013-09-05  9:24     ` Zhang, Yang Z
  0 siblings, 1 reply; 8+ messages in thread
From: Arthur Chunqi Li @ 2013-09-05  8:47 UTC (permalink / raw)
  To: Zhang, Yang Z; +Cc: kvm, jan.kiszka, gleb, pbonzini

On Thu, Sep 5, 2013 at 3:45 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
> Arthur Chunqi Li wrote on 2013-09-04:
>> This patch contains the following two changes:
>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0 with some
>> reasons not emulated by L1, preemption timer value should be save in such
>> exits.
>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls to
>> nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> ---
>> This series depends on queue.
>>
>>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>>  arch/x86/kvm/vmx.c                    |   51
>> ++++++++++++++++++++++++++++++---
>>  2 files changed, 48 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/include/uapi/asm/msr-index.h
>> b/arch/x86/include/uapi/asm/msr-index.h
>> index bb04650..b93e09a 100644
>> --- a/arch/x86/include/uapi/asm/msr-index.h
>> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> @@ -536,6 +536,7 @@
>>
>>  /* MSR_IA32_VMX_MISC bits */
>>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>>  /* AMD-V MSRs */
>>
>>  #define MSR_VM_CR                       0xc0010114
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f1da43..870caa8
>> 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2204,7 +2204,14 @@ static __init void
>> nested_vmx_setup_ctls_msrs(void)  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     if (!(nested_vmx_pinbased_ctls_high &
>> PIN_BASED_VMX_PREEMPTION_TIMER))
>> +             nested_vmx_exit_ctls_high &=
>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> +     if (!(nested_vmx_exit_ctls_high &
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> +             nested_vmx_pinbased_ctls_high &=
>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
> The following logic is more clearly:
> if(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER)
>         nested_vmx_exit_ctls_high |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
Here my consideration is this: that logic would be wrong if the CPU
supported PIN_BASED_VMX_PREEMPTION_TIMER but not
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know whether that
case actually occurs. So the code above reads the MSR and keeps only
the features the hardware supports, and here I just check that these
two features are supported simultaneously.

You remind me that this piece of code can be written like this:

if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
    !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
	nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
	nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
}

This may better reflect the logic I describe above, namely that these
two flags must be supported together, and it brings less confusion.
>
> BTW: I don't see nested_vmx_setup_ctls_msrs() considers the hardware's capability when expose those vmx features(not just preemption timer) to L1.
The code just above here, when setting the pin-based controls for nested
VMX, first does "rdmsr MSR_IA32_VMX_PINBASED_CTLS" and then uses the
result to mask out the features the hardware does not support. The same
goes for the other control fields.
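Roughly, the pattern I mean looks like this (a simplified sketch, not the exact code in the tree):

	/* advertise to L1 only what the hardware reports in the capability MSR */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
		PIN_BASED_VMX_PREEMPTION_TIMER;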
>
>>       nested_vmx_exit_ctls_high |=
>> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu
>> *vcpu, u64 *info1, u64 *info2)
>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);  }
>>
>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) {
>> +     u64 delta_tsc_l1;
>> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>> +
>> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> +     if (preempt_val_l2 - preempt_val_l1 < 0)
>> +             preempt_val_l2 = 0;
>> +     else
>> +             preempt_val_l2 -= preempt_val_l1;
>> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); }
>>  /*
>>   * The guest has exited.  See if we can fix it or if we need userspace
>>   * assistance.
>> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exit_reason = vmx->exit_reason;
>>       u32 vectoring_info = vmx->idt_vectoring_info;
>> +     int ret;
>>
>>       /* If guest state is invalid, start emulating */
>>       if (vmx->emulation_required)
>> @@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu
>> *vcpu)
>>
>>       if (exit_reason < kvm_vmx_max_exit_handlers
>>           && kvm_vmx_exit_handlers[exit_reason])
>> -             return kvm_vmx_exit_handlers[exit_reason](vcpu);
>> +             ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
>>       else {
>>               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
>>               vcpu->run->hw.hardware_exit_reason = exit_reason;
>> +             ret = 0;
>>       }
>> -     return 0;
>> +     if (is_guest_mode(vcpu))
>> +             nested_adjust_preemption_timer(vcpu);
> Move this forward to avoid the changes for ret.
The previous code simply did "return
kvm_vmx_exit_handlers[exit_reason](vcpu);", and the exit handler itself
also consumes CPU time. So moving "nested_adjust_preemption_timer" ahead
of it could make the time accounting inaccurate.
>> +     return ret;
>>  }
>>
>>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@
>> -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)  {
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exec_control;
>> +     u32 exit_control;
>>
>>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@
>> -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     exit_control = vmcs_config.vmexit_ctrl;
>> +     if (vmcs12->pin_based_vm_exec_control &
>> PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
> And here should be problem if host doesn't support VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
Nested VMX does check the hardware support for these features in
"nested_vmx_setup_ctls_msrs"; see my comments above.
>
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
>> are
>>        * emulated by vmx_set_efer(), below.
>> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu,
>> struct vmcs12 *vmcs12)
>>       vmcs12->guest_pending_dbg_exceptions =
>>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>
>> +     if (vmcs12->pin_based_vm_exec_control &
>> +                     PIN_BASED_VMX_PREEMPTION_TIMER) {
>> +             if (vmcs12->vm_exit_controls &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> +                     vmcs12->vmx_preemption_timer_value =
>> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +             else
>> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> +                                     vmcs12->vmx_preemption_timer_value);
> Why write it to vmcs directly if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER not set?
Yes, the write is needless here since vmcs02 will be re-prepared via
prepare_vmcs02() on the next L1->L2 entry. This function only needs to
save the required information, so the vmcs_write is useless.

Arthur
>
>> +     }
>> +
>>       /*
>>        * In some cases (usually, nested EPT), L2 is allowed to change its
>>        * own CR3 without exiting. If it has changed it, we must keep it.
>> --
>> 1.7.9.5
>>
>
> Best regards,
> Yang
>
>


* RE: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05  8:47   ` Arthur Chunqi Li
@ 2013-09-05  9:24     ` Zhang, Yang Z
  2013-09-05  9:50       ` Arthur Chunqi Li
  2013-09-13 11:58       ` Paolo Bonzini
  0 siblings, 2 replies; 8+ messages in thread
From: Zhang, Yang Z @ 2013-09-05  9:24 UTC (permalink / raw)
  To: Arthur Chunqi Li; +Cc: kvm, jan.kiszka, gleb, pbonzini

Arthur Chunqi Li wrote on 2013-09-05:
> On Thu, Sep 5, 2013 at 3:45 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
> wrote:
> > Arthur Chunqi Li wrote on 2013-09-04:
> >> This patch contains the following two changes:
> >> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
> >> with some reasons not emulated by L1, preemption timer value should
> >> be save in such exits.
> >> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
> >> to nVMX.
> >>
> >> With this patch, nested VMX preemption timer features are fully supported.
> >>
> >> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >> ---
> >> This series depends on queue.
> >>
> >>  arch/x86/include/uapi/asm/msr-index.h |    1 +
> >>  arch/x86/kvm/vmx.c                    |   51
> >> ++++++++++++++++++++++++++++++---
> >>  2 files changed, 48 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/arch/x86/include/uapi/asm/msr-index.h
> >> b/arch/x86/include/uapi/asm/msr-index.h
> >> index bb04650..b93e09a 100644
> >> --- a/arch/x86/include/uapi/asm/msr-index.h
> >> +++ b/arch/x86/include/uapi/asm/msr-index.h
> >> @@ -536,6 +536,7 @@
> >>
> >>  /* MSR_IA32_VMX_MISC bits */
> >>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL <<
> 29)
> >> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
> >>  /* AMD-V MSRs */
> >>
> >>  #define MSR_VM_CR                       0xc0010114
> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> >> 1f1da43..870caa8
> >> 100644
> >> --- a/arch/x86/kvm/vmx.c
> >> +++ b/arch/x86/kvm/vmx.c
> >> @@ -2204,7 +2204,14 @@ static __init void
> >> nested_vmx_setup_ctls_msrs(void)  #ifdef CONFIG_X86_64
> >>               VM_EXIT_HOST_ADDR_SPACE_SIZE |  #endif
> >> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> >> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
> >> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> >> +     if (!(nested_vmx_pinbased_ctls_high &
> >> PIN_BASED_VMX_PREEMPTION_TIMER))
> >> +             nested_vmx_exit_ctls_high &=
> >> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> >> +     if (!(nested_vmx_exit_ctls_high &
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> >> +             nested_vmx_pinbased_ctls_high &=
> >> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
> > The following logic is more clearly:
> > if(nested_vmx_pinbased_ctls_high &
> PIN_BASED_VMX_PREEMPTION_TIMER)
> >         nested_vmx_exit_ctls_high |=
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
> Here I have such consideration: this logic is wrong if CPU support
> PIN_BASED_VMX_PREEMPTION_TIMER but doesn't support
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know if this does
> occurs. So the codes above reads the MSR and reserves the features it
> supports, and here I just check if these two features are supported
> simultaneously.
> 
No. Only VM_EXIT_SAVE_VMX_PREEMPTION_TIMER depends on PIN_BASED_VMX_PREEMPTION_TIMER; PIN_BASED_VMX_PREEMPTION_TIMER is an independent feature.
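In other words, only a one-way mask is needed (a sketch, reusing the patch's variable names):

	/* "save the timer on exit" is meaningless without the timer itself... */
	if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
		nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
	/* ...but the pin-based timer bit itself is not cleared in return */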

> You remind that this piece of codes can write like this:
> if (!(nested_vmx_pin_based_ctls_high &
> PIN_BASED_VMX_PREEMPTION_TIMER) ||
>                 !(nested_vmx_exit_ctls_high &
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>         nested_vmx_exit_ctls_high
> &=(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>         nested_vmx_pinbased_ctls_high &=
> (~PIN_BASED_VMX_PREEMPTION_TIMER);
> }
> 
> This may reflect the logic I describe above that these two flags should support
> simultaneously, and brings less confusion.
> >
> > BTW: I don't see nested_vmx_setup_ctls_msrs() considers the hardware's
> capability when expose those vmx features(not just preemption timer) to L1.
> The codes just above here, when setting pinbased control for nested vmx, it
> firstly "rdmsr MSR_IA32_VMX_PINBASED_CTLS", then use this to mask the
> features hardware not support. So does other control fields.
> >
Yes, I saw it.

> >>       nested_vmx_exit_ctls_high |=
> >> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
> >>                                     VM_EXIT_LOAD_IA32_EFER);
> >>
> >> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu
> >> *vcpu, u64 *info1, u64 *info2)
> >>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);  }
> >>
> >> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) {
> >> +     u64 delta_tsc_l1;
> >> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> >> +
> >> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> >> +
> MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> >> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> >> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
> >> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> >> +     if (preempt_val_l2 - preempt_val_l1 < 0)
> >> +             preempt_val_l2 = 0;
> >> +     else
> >> +             preempt_val_l2 -= preempt_val_l1;
> >> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> preempt_val_l2); }
> >>  /*
> >>   * The guest has exited.  See if we can fix it or if we need userspace
> >>   * assistance.
> >> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu
> *vcpu)
> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >>       u32 exit_reason = vmx->exit_reason;
> >>       u32 vectoring_info = vmx->idt_vectoring_info;
> >> +     int ret;
> >>
> >>       /* If guest state is invalid, start emulating */
> >>       if (vmx->emulation_required)
> >> @@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu
> >> *vcpu)
> >>
> >>       if (exit_reason < kvm_vmx_max_exit_handlers
> >>           && kvm_vmx_exit_handlers[exit_reason])
> >> -             return kvm_vmx_exit_handlers[exit_reason](vcpu);
> >> +             ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> >>       else {
> >>               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> >>               vcpu->run->hw.hardware_exit_reason = exit_reason;
> >> +             ret = 0;
> >>       }
> >> -     return 0;
> >> +     if (is_guest_mode(vcpu))
> >> +             nested_adjust_preemption_timer(vcpu);
> > Move this forward to avoid the changes for ret.
> The previous codes simply "return
> kvm_vmx_exit_handlers[exit_reason](vcpu);", which may also consumes CPU
> times. So put "nested_adjust_preemption_timer" ahead may cause the
> statistics inaccuracy.
Then you should put it right before vmentry. This point is still far from the actual vmentry.

> >> +     return ret;
> >>  }
> >>
> >>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int
> >> irr) @@
> >> -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
> >> struct vmcs12 *vmcs12)  {
> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >>       u32 exec_control;
> >> +     u32 exit_control;
> >>
> >>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
> >>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
> @@
> >> -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
> >> struct vmcs12 *vmcs12)
> >>        * we should use its exit controls. Note that
> VM_EXIT_LOAD_IA32_EFER
> >>        * bits are further modified by vmx_set_efer() below.
> >>        */
> >> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> >> +     exit_control = vmcs_config.vmexit_ctrl;
> >> +     if (vmcs12->pin_based_vm_exec_control &
> >> PIN_BASED_VMX_PREEMPTION_TIMER)
> >> +             exit_control |=
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> >> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
> > And here should be problem if host doesn't support
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
> Nested vmx does check the hardware support of these features in
> "nested_vmx_setup_ctls_msrs", see my comments above.
But here you don't check it. You just set it unconditionally.

> >
> >>
> >>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and
> VM_ENTRY_IA32E_MODE are
> >>        * emulated by vmx_set_efer(), below.
> >> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu
> >> *vcpu, struct vmcs12 *vmcs12)
> >>       vmcs12->guest_pending_dbg_exceptions =
> >>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> >>
> >> +     if (vmcs12->pin_based_vm_exec_control &
> >> +                     PIN_BASED_VMX_PREEMPTION_TIMER) {
> >> +             if (vmcs12->vm_exit_controls &
> >> +
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >> +                     vmcs12->vmx_preemption_timer_value =
> >> +
> vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> +             else
> >> +
> vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >> +
> >> + vmcs12->vmx_preemption_timer_value);
> > Why write it to vmcs directly if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
> not set?
> Yes, writing is needless here since vmcs02 will be re-prepared via
> prepare_vmcs02() when L1->L2. This function just save information needed,
> vmcs_write is useless.
> 
I'm not sure I see your point. From my point of view, even if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not set, you still need to save the value to vmcs12->vmx_preemption_timer_value. Otherwise, prepare_vmcs02() cannot get the right value.
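That is, something along these lines in prepare_vmcs12() (a sketch only, not a tested hunk):

	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
		/* always snapshot the hardware value so the next
		 * prepare_vmcs02() starts from where L2 actually stopped */
		vmcs12->vmx_preemption_timer_value =
			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);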

> Arthur
> >
> >> +     }
> >> +
> >>       /*
> >>        * In some cases (usually, nested EPT), L2 is allowed to change its
> >>        * own CR3 without exiting. If it has changed it, we must keep it.
> >> --
> >> 1.7.9.5
> >>
> >
> > Best regards,
> > Yang
> >
> >

Best regards,
Yang




* Re: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05  9:24     ` Zhang, Yang Z
@ 2013-09-05  9:50       ` Arthur Chunqi Li
  2013-09-05 11:05         ` Zhang, Yang Z
  2013-09-13 11:58       ` Paolo Bonzini
  1 sibling, 1 reply; 8+ messages in thread
From: Arthur Chunqi Li @ 2013-09-05  9:50 UTC (permalink / raw)
  To: Zhang, Yang Z; +Cc: kvm, jan.kiszka, gleb, pbonzini

On Thu, Sep 5, 2013 at 5:24 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
> Arthur Chunqi Li wrote on 2013-09-05:
>> On Thu, Sep 5, 2013 at 3:45 PM, Zhang, Yang Z <yang.z.zhang@intel.com>
>> wrote:
>> > Arthur Chunqi Li wrote on 2013-09-04:
>> >> This patch contains the following two changes:
>> >> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> >> with some reasons not emulated by L1, preemption timer value should
>> >> be save in such exits.
>> >> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> >> to nVMX.
>> >>
>> >> With this patch, nested VMX preemption timer features are fully supported.
>> >>
>> >> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> >> ---
>> >> This series depends on queue.
>> >>
>> >>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>> >>  arch/x86/kvm/vmx.c                    |   51
>> >> ++++++++++++++++++++++++++++++---
>> >>  2 files changed, 48 insertions(+), 4 deletions(-)
>> >>
>> >> diff --git a/arch/x86/include/uapi/asm/msr-index.h
>> >> b/arch/x86/include/uapi/asm/msr-index.h
>> >> index bb04650..b93e09a 100644
>> >> --- a/arch/x86/include/uapi/asm/msr-index.h
>> >> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> >> @@ -536,6 +536,7 @@
>> >>
>> >>  /* MSR_IA32_VMX_MISC bits */
>> >>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL <<
>> 29)
>> >> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>> >>  /* AMD-V MSRs */
>> >>
>> >>  #define MSR_VM_CR                       0xc0010114
>> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
>> >> 1f1da43..870caa8
>> >> 100644
>> >> --- a/arch/x86/kvm/vmx.c
>> >> +++ b/arch/x86/kvm/vmx.c
>> >> @@ -2204,7 +2204,14 @@ static __init void
>> >> nested_vmx_setup_ctls_msrs(void)  #ifdef CONFIG_X86_64
>> >>               VM_EXIT_HOST_ADDR_SPACE_SIZE |  #endif
>> >> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> >> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> >> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> >> +     if (!(nested_vmx_pinbased_ctls_high &
>> >> PIN_BASED_VMX_PREEMPTION_TIMER))
>> >> +             nested_vmx_exit_ctls_high &=
>> >> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> >> +     if (!(nested_vmx_exit_ctls_high &
>> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> >> +             nested_vmx_pinbased_ctls_high &=
>> >> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> > The following logic is more clearly:
>> > if(nested_vmx_pinbased_ctls_high &
>> PIN_BASED_VMX_PREEMPTION_TIMER)
>> >         nested_vmx_exit_ctls_high |=
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
>> Here I have such consideration: this logic is wrong if CPU support
>> PIN_BASED_VMX_PREEMPTION_TIMER but doesn't support
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know if this does
>> occurs. So the codes above reads the MSR and reserves the features it
>> supports, and here I just check if these two features are supported
>> simultaneously.
>>
> No. Only VM_EXIT_SAVE_VMX_PREEMPTION_TIMER depends on PIN_BASED_VMX_PREEMPTION_TIMER. PIN_BASED_VMX_PREEMPTION_TIMER is an independent feature
>
>> You remind that this piece of codes can write like this:
>> if (!(nested_vmx_pin_based_ctls_high &
>> PIN_BASED_VMX_PREEMPTION_TIMER) ||
>>                 !(nested_vmx_exit_ctls_high &
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>>         nested_vmx_exit_ctls_high
>> &=(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>>         nested_vmx_pinbased_ctls_high &=
>> (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> }
>>
>> This may reflect the logic I describe above that these two flags should support
>> simultaneously, and brings less confusion.
>> >
>> > BTW: I don't see nested_vmx_setup_ctls_msrs() considers the hardware's
>> capability when expose those vmx features(not just preemption timer) to L1.
>> The codes just above here, when setting pinbased control for nested vmx, it
>> firstly "rdmsr MSR_IA32_VMX_PINBASED_CTLS", then use this to mask the
>> features hardware not support. So does other control fields.
>> >
> Yes, I saw it.
>
>> >>       nested_vmx_exit_ctls_high |=
>> >> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>> >>                                     VM_EXIT_LOAD_IA32_EFER);
>> >>
>> >> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu
>> >> *vcpu, u64 *info1, u64 *info2)
>> >>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);  }
>> >>
>> >> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) {
>> >> +     u64 delta_tsc_l1;
>> >> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>> >> +
>> >> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> >> +
>> MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> >> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> >> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>> >> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> >> +     if (preempt_val_l2 - preempt_val_l1 < 0)
>> >> +             preempt_val_l2 = 0;
>> >> +     else
>> >> +             preempt_val_l2 -= preempt_val_l1;
>> >> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> preempt_val_l2); }
>> >>  /*
>> >>   * The guest has exited.  See if we can fix it or if we need userspace
>> >>   * assistance.
>> >> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu
>> *vcpu)
>> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >>       u32 exit_reason = vmx->exit_reason;
>> >>       u32 vectoring_info = vmx->idt_vectoring_info;
>> >> +     int ret;
>> >>
>> >>       /* If guest state is invalid, start emulating */
>> >>       if (vmx->emulation_required)
>> >> @@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu
>> >> *vcpu)
>> >>
>> >>       if (exit_reason < kvm_vmx_max_exit_handlers
>> >>           && kvm_vmx_exit_handlers[exit_reason])
>> >> -             return kvm_vmx_exit_handlers[exit_reason](vcpu);
>> >> +             ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
>> >>       else {
>> >>               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
>> >>               vcpu->run->hw.hardware_exit_reason = exit_reason;
>> >> +             ret = 0;
>> >>       }
>> >> -     return 0;
>> >> +     if (is_guest_mode(vcpu))
>> >> +             nested_adjust_preemption_timer(vcpu);
>> > Move this forward to avoid the changes for ret.
>> The previous codes simply "return
>> kvm_vmx_exit_handlers[exit_reason](vcpu);", which may also consumes CPU
>> times. So put "nested_adjust_preemption_timer" ahead may cause the
>> statistics inaccuracy.
> Then you should put it before vmentry. Here still far from the point of vmentry.
So where is the actual point of vmentry? I'm not quite familiar with
that piece of code.
>
>> >> +     return ret;
>> >>  }
>> >>
>> >>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int
>> >> irr) @@
>> >> -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> >> struct vmcs12 *vmcs12)  {
>> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >>       u32 exec_control;
>> >> +     u32 exit_control;
>> >>
>> >>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>> >>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@
>> >> -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu,
>> >> struct vmcs12 *vmcs12)
>> >>        * we should use its exit controls. Note that
>> VM_EXIT_LOAD_IA32_EFER
>> >>        * bits are further modified by vmx_set_efer() below.
>> >>        */
>> >> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> >> +     exit_control = vmcs_config.vmexit_ctrl;
>> >> +     if (vmcs12->pin_based_vm_exec_control &
>> >> PIN_BASED_VMX_PREEMPTION_TIMER)
>> >> +             exit_control |=
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> >> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>> > And here should be problem if host doesn't support
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>> Nested vmx does check the hardware support of these features in
>> "nested_vmx_setup_ctls_msrs", see my comments above.
> But here you don't check it. You just set it unconditionally.
What we check in nested_vmx_setup_ctls_msrs() confirms that nested VMX
enables the preemption timer only if both of these features are
supported by hardware. That is to say, if L2 can set
PIN_BASED_VMX_PREEMPTION_TIMER without failure, the hardware must
support VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, which should be guaranteed
by nested_vmx_setup_ctls_msrs(). (This is also why I wrote the code
that way in nested_vmx_setup_ctls_msrs().)
>
>> >
>> >>
>> >>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and
>> VM_ENTRY_IA32E_MODE are
>> >>        * emulated by vmx_set_efer(), below.
>> >> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu
>> >> *vcpu, struct vmcs12 *vmcs12)
>> >>       vmcs12->guest_pending_dbg_exceptions =
>> >>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>> >>
>> >> +     if (vmcs12->pin_based_vm_exec_control &
>> >> +                     PIN_BASED_VMX_PREEMPTION_TIMER) {
>> >> +             if (vmcs12->vm_exit_controls &
>> >> +
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> >> +                     vmcs12->vmx_preemption_timer_value =
>> >> +
>> vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >> +             else
>> >> +
>> vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >> +
>> >> + vmcs12->vmx_preemption_timer_value);
>> > Why write it to vmcs directly if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
>> not set?
>> Yes, writing is needless here since vmcs02 will be re-prepared via
>> prepare_vmcs02() when L1->L2. This function just save information needed,
>> vmcs_write is useless.
>>
> I don't sure I am seeing your point. Per my point, even the VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not setting, you still need to save the value to vmcs12->vmx_preemption_timer_value. Or else, prepare_vmcs02 cannot get the right value.
If L2 doesn't enable VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, its value will
be reset to what we set initially. This function is called on an L2->L1
exit, so we should emulate L2's real exit process here. What we do in
the other parts is handle the L2->L0->L2 vmexits, which are not "real"
L2 vmexits.
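For context, the call path I mean is roughly this (a simplified sketch; the body of nested_vmx_vmexit() is elided):

	/* emulated L2->L1 exit: snapshot L2 state into vmcs12 and switch
	 * back to vmcs01; L2->L0->L2 round trips never come through here */
	static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
	{
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
		/* ... */
		prepare_vmcs12(vcpu, vmcs12);
		/* ... load vmcs01 and resume L1 ... */
	}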

Arthur
>
>> Arthur
>> >
>> >> +     }
>> >> +
>> >>       /*
>> >>        * In some cases (usually, nested EPT), L2 is allowed to change its
>> >>        * own CR3 without exiting. If it has changed it, we must keep it.
>> >> --
>> >> 1.7.9.5
>> >>
>> >
>> > Best regards,
>> > Yang
>> >
>> >
>
> Best regards,
> Yang
>
>


* RE: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05  9:50       ` Arthur Chunqi Li
@ 2013-09-05 11:05         ` Zhang, Yang Z
  2013-09-05 14:19           ` Arthur Chunqi Li
  0 siblings, 1 reply; 8+ messages in thread
From: Zhang, Yang Z @ 2013-09-05 11:05 UTC (permalink / raw)
  To: Arthur Chunqi Li; +Cc: kvm, jan.kiszka, gleb, pbonzini

Arthur Chunqi Li wrote on 2013-09-05:
> > Arthur Chunqi Li wrote on 2013-09-05:
> >> On Thu, Sep 5, 2013 at 3:45 PM, Zhang, Yang Z
> >> <yang.z.zhang@intel.com>
> >> wrote:
> >> > Arthur Chunqi Li wrote on 2013-09-04:
> >> >> This patch contains the following two changes:
> >> >> 1. Fix the bug in nested preemption timer support. If vmexit
> >> >> L2->L0 with some reasons not emulated by L1, preemption timer
> >> >> value should be save in such exits.
> >> >> 2. Add support of "Save VMX-preemption timer value" VM-Exit
> >> >> controls to nVMX.
> >> >>
> >> >> With this patch, nested VMX preemption timer features are fully
> supported.
> >> >>
> >> >> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
> >> >> ---
> >> >> This series depends on queue.
> >> >>
> >> >>  arch/x86/include/uapi/asm/msr-index.h |    1 +
> >> >>  arch/x86/kvm/vmx.c                    |   51
> >> >> ++++++++++++++++++++++++++++++---
> >> >>  2 files changed, 48 insertions(+), 4 deletions(-)
> >> >>
> >> >> diff --git a/arch/x86/include/uapi/asm/msr-index.h
> >> >> b/arch/x86/include/uapi/asm/msr-index.h
> >> >> index bb04650..b93e09a 100644
> >> >> --- a/arch/x86/include/uapi/asm/msr-index.h
> >> >> +++ b/arch/x86/include/uapi/asm/msr-index.h
> >> >> @@ -536,6 +536,7 @@
> >> >>
> >> >>  /* MSR_IA32_VMX_MISC bits */
> >> >>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL
> <<
> >> 29)
> >> >> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
> >> >>  /* AMD-V MSRs */
> >> >>
> >> >>  #define MSR_VM_CR                       0xc0010114
> >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
> >> >> 1f1da43..870caa8
> >> >> 100644
> >> >> --- a/arch/x86/kvm/vmx.c
> >> >> +++ b/arch/x86/kvm/vmx.c
> >> >> @@ -2204,7 +2204,14 @@ static __init void
> >> >> nested_vmx_setup_ctls_msrs(void)  #ifdef CONFIG_X86_64
> >> >>               VM_EXIT_HOST_ADDR_SPACE_SIZE |  #endif
> >> >> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
> >> >> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT
> |
> >> >> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> >> >> +     if (!(nested_vmx_pinbased_ctls_high &
> >> >> PIN_BASED_VMX_PREEMPTION_TIMER))
> >> >> +             nested_vmx_exit_ctls_high &=
> >> >> +
> (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> >> >> +     if (!(nested_vmx_exit_ctls_high &
> >> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> >> >> +             nested_vmx_pinbased_ctls_high &=
> >> >> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
> >> > The following logic is more clearly:
> >> > if(nested_vmx_pinbased_ctls_high &
> >> PIN_BASED_VMX_PREEMPTION_TIMER)
> >> >         nested_vmx_exit_ctls_high |=
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
> >> Here I have such consideration: this logic is wrong if CPU support
> >> PIN_BASED_VMX_PREEMPTION_TIMER but doesn't support
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know if this
> does
> >> occurs. So the codes above reads the MSR and reserves the features it
> >> supports, and here I just check if these two features are supported
> >> simultaneously.
> >>
> > No. Only VM_EXIT_SAVE_VMX_PREEMPTION_TIMER depends on
> > PIN_BASED_VMX_PREEMPTION_TIMER.
> PIN_BASED_VMX_PREEMPTION_TIMER is an
> > independent feature
> >
> >> You remind that this piece of codes can write like this:
> >> if (!(nested_vmx_pin_based_ctls_high &
> >> PIN_BASED_VMX_PREEMPTION_TIMER) ||
> >>                 !(nested_vmx_exit_ctls_high &
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
> >>         nested_vmx_exit_ctls_high
> >> &=(~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
> >>         nested_vmx_pinbased_ctls_high &=
> >> (~PIN_BASED_VMX_PREEMPTION_TIMER);
> >> }
> >>
> >> This may reflect the logic I describe above that these two flags
> >> should support simultaneously, and brings less confusion.
> >> >
> >> > BTW: I don't see nested_vmx_setup_ctls_msrs() considers the
> >> > hardware's
> >> capability when expose those vmx features(not just preemption timer) to L1.
> >> The codes just above here, when setting pinbased control for nested
> >> vmx, it firstly "rdmsr MSR_IA32_VMX_PINBASED_CTLS", then use this to
> >> mask the features hardware not support. So does other control fields.
> >> >
> > Yes, I saw it.
> >
> >> >>       nested_vmx_exit_ctls_high |=
> >> >> (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
> >> >>
> VM_EXIT_LOAD_IA32_EFER);
> >> >>
> >> >> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct
> >> >> kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> >> >>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);  }
> >> >>
> >> >> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) {
> >> >> +     u64 delta_tsc_l1;
> >> >> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
> >> >> +
> >> >> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
> >> >> +
> >> MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
> >> >> +     preempt_val_l2 =
> vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> >> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
> >> >> +                     native_read_tsc()) -
> vcpu->arch.last_guest_tsc;
> >> >> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
> >> >> +     if (preempt_val_l2 - preempt_val_l1 < 0)
> >> >> +             preempt_val_l2 = 0;
> >> >> +     else
> >> >> +             preempt_val_l2 -= preempt_val_l1;
> >> >> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >> preempt_val_l2); }
> >> >>  /*
> >> >>   * The guest has exited.  See if we can fix it or if we need userspace
> >> >>   * assistance.
> >> >> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu
> >> *vcpu)
> >> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> >>       u32 exit_reason = vmx->exit_reason;
> >> >>       u32 vectoring_info = vmx->idt_vectoring_info;
> >> >> +     int ret;
> >> >>
> >> >>       /* If guest state is invalid, start emulating */
> >> >>       if (vmx->emulation_required) @@ -6795,12 +6820,15 @@ static
> >> >> int vmx_handle_exit(struct kvm_vcpu
> >> >> *vcpu)
> >> >>
> >> >>       if (exit_reason < kvm_vmx_max_exit_handlers
> >> >>           && kvm_vmx_exit_handlers[exit_reason])
> >> >> -             return kvm_vmx_exit_handlers[exit_reason](vcpu);
> >> >> +             ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
> >> >>       else {
> >> >>               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
> >> >>               vcpu->run->hw.hardware_exit_reason = exit_reason;
> >> >> +             ret = 0;
> >> >>       }
> >> >> -     return 0;
> >> >> +     if (is_guest_mode(vcpu))
> >> >> +             nested_adjust_preemption_timer(vcpu);
> >> > Move this forward to avoid the changes for ret.
> >> The previous codes simply "return
> >> kvm_vmx_exit_handlers[exit_reason](vcpu);", which may also consumes
> >> CPU times. So put "nested_adjust_preemption_timer" ahead may cause
> >> the statistics inaccuracy.
> > Then you should put it before vmentry. Here still far from the point of
> vmentry.
> So where is the actual point of vmentry? I'm not quite familiar with that piece
> of codes.
vmx_vcpu_run should be the right place.
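Something like this, as a sketch (the exact signature and surrounding code are from memory and the body is elided):

	static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		/* hypothetical placement: adjust L2's preemption timer right
		 * before the actual VMLAUNCH/VMRESUME, so the time spent in
		 * L0's exit handlers is charged to it as well */
		if (is_guest_mode(vcpu))
			nested_adjust_preemption_timer(vcpu);

		/* ... existing body: state sync, the asm VM-entry block, ... */
	}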

> >
> >> >> +     return ret;
> >> >>  }
> >> >>
> >> >>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr,
> >> >> int
> >> >> irr) @@
> >> >> -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu
> >> >> *vcpu, struct vmcs12 *vmcs12)  {
> >> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
> >> >>       u32 exec_control;
> >> >> +     u32 exit_control;
> >> >>
> >> >>       vmcs_write16(GUEST_ES_SELECTOR,
> vmcs12->guest_es_selector);
> >> >>       vmcs_write16(GUEST_CS_SELECTOR,
> vmcs12->guest_cs_selector);
> >> @@
> >> >> -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu
> >> >> *vcpu, struct vmcs12 *vmcs12)
> >> >>        * we should use its exit controls. Note that
> >> VM_EXIT_LOAD_IA32_EFER
> >> >>        * bits are further modified by vmx_set_efer() below.
> >> >>        */
> >> >> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
> >> >> +     exit_control = vmcs_config.vmexit_ctrl;
> >> >> +     if (vmcs12->pin_based_vm_exec_control &
> >> >> PIN_BASED_VMX_PREEMPTION_TIMER)
> >> >> +             exit_control |=
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
> >> >> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
> >> > And here should be problem if host doesn't support
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
> >> Nested vmx does check the hardware support of these features in
> >> "nested_vmx_setup_ctls_msrs", see my comments above.
> > But here you don't check it. You just set it unconditionally.
> What we have checked in nested_vmx_setup_ctls_msrs() confirms that only if
> these two features supported by hardware, nested vmx can enable preemption
> timer. That is to say, if L2 can set PIN_BASED_VMX_PREEMPTION_TIMER
> without failure, hardware must support
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, which should guarantee in
> nested_vmx_setup_ctls_msrs (). (This is also why I write code like that in
> nested_vmx_setup_ctls_msrs ()).
I see your point. But the fact is that PIN_BASED_VMX_PREEMPTION_TIMER is an independent feature, so your previous assumption is wrong: the hardware may support PIN_BASED_VMX_PREEMPTION_TIMER without VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.

> >
> >> >
> >> >>
> >> >>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and
> >> VM_ENTRY_IA32E_MODE are
> >> >>        * emulated by vmx_set_efer(), below.
> >> >> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu
> >> >> *vcpu, struct vmcs12 *vmcs12)
> >> >>       vmcs12->guest_pending_dbg_exceptions =
> >> >>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> >> >>
> >> >> +     if (vmcs12->pin_based_vm_exec_control &
> >> >> +                     PIN_BASED_VMX_PREEMPTION_TIMER) {
> >> >> +             if (vmcs12->vm_exit_controls &
> >> >> +
> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> >> >> +                     vmcs12->vmx_preemption_timer_value =
> >> >> +
> >> vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
> >> >> +             else
> >> >> +
> >> vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
> >> >> +
> >> >> + vmcs12->vmx_preemption_timer_value);
> >> > Why write it to vmcs directly if
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
> >> not set?
> >> Yes, writing is needless here since vmcs02 will be re-prepared via
> >> prepare_vmcs02() when L1->L2. This function just save information
> >> needed, vmcs_write is useless.
> >>
> > I don't sure I am seeing your point. Per my point, even the
> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not setting, you still need to
> save the value to vmcs12->vmx_preemption_timer_value. Or else,
> prepare_vmcs02 cannot get the right value.
> If L2 doesn't enable VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, its value will
> be reset to what we set initially. This function is called due to
> L2->L1, so we should emulate L2's real exit process here. What we have
> done in other parts is to handle vmexit of L2->L0->L2, which is not the "real" L2
> vmexit.
I am not describing it clearly. I mean that if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not set, then prepare_vmcs02() always writes the initial vmx_preemption_timer_value into vmcs02 before L2 runs anyway. So the vmcs write here is redundant. Am I wrong?
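That is, the L1->L2 path already does something like this (a sketch of the prepare_vmcs02() side, not the literal hunk):

	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
		/* whatever vmcs02 held at the L2->L1 exit is overwritten
		 * here before L2 runs again */
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
			     vmcs12->vmx_preemption_timer_value);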

> 
> Arthur
> >
> >> Arthur
> >> >
> >> >> +     }
> >> >> +
> >> >>       /*
> >> >>        * In some cases (usually, nested EPT), L2 is allowed to change
> its
> >> >>        * own CR3 without exiting. If it has changed it, we must keep it.
> >> >> --
> >> >> 1.7.9.5
> >> >>
> >> >
> >> > Best regards,
> >> > Yang
> >> >
> >> >
> >
> > Best regards,
> > Yang
> >
> >

Best regards,
Yang




* Re: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05 11:05         ` Zhang, Yang Z
@ 2013-09-05 14:19           ` Arthur Chunqi Li
  0 siblings, 0 replies; 8+ messages in thread
From: Arthur Chunqi Li @ 2013-09-05 14:19 UTC (permalink / raw)
  To: Zhang, Yang Z; +Cc: kvm, jan.kiszka, gleb, pbonzini

On Thu, Sep 5, 2013 at 7:05 PM, Zhang, Yang Z <yang.z.zhang@intel.com> wrote:
> Arthur Chunqi Li wrote on 2013-09-05:
>> > Arthur Chunqi Li wrote on 2013-09-05:
>> >> On Thu, Sep 5, 2013 at 3:45 PM, Zhang, Yang Z
>> >> <yang.z.zhang@intel.com>
>> >> wrote:
>> >> > Arthur Chunqi Li wrote on 2013-09-04:
>> >> >> This patch contains the following two changes:
>> >> >> 1. Fix the bug in nested preemption timer support. If vmexit
>> >> >> L2->L0 with some reasons not emulated by L1, preemption timer
>> >> >> value should be save in such exits.
>> >> >> 2. Add support of "Save VMX-preemption timer value" VM-Exit
>> >> >> controls to nVMX.
>> >> >>
>> >> >> With this patch, nested VMX preemption timer features are fully
>> supported.
>> >> >>
>> >> >> Signed-off-by: Arthur Chunqi Li <yzt356@gmail.com>
>> >> >> ---
>> >> >> This series depends on queue.
>> >> >>
>> >> >>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>> >> >>  arch/x86/kvm/vmx.c                    |   51
>> >> >> ++++++++++++++++++++++++++++++---
>> >> >>  2 files changed, 48 insertions(+), 4 deletions(-)
>> >> >>
>> >> >> diff --git a/arch/x86/include/uapi/asm/msr-index.h
>> >> >> b/arch/x86/include/uapi/asm/msr-index.h
>> >> >> index bb04650..b93e09a 100644
>> >> >> --- a/arch/x86/include/uapi/asm/msr-index.h
>> >> >> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> >> >> @@ -536,6 +536,7 @@
>> >> >>
>> >> >>  /* MSR_IA32_VMX_MISC bits */
>> >> >>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>> >> >> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>> >> >>  /* AMD-V MSRs */
>> >> >>
>> >> >>  #define MSR_VM_CR                       0xc0010114
>> >> >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index
>> >> >> 1f1da43..870caa8
>> >> >> 100644
>> >> >> --- a/arch/x86/kvm/vmx.c
>> >> >> +++ b/arch/x86/kvm/vmx.c
>> >> >> @@ -2204,7 +2204,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>> >> >>  #ifdef CONFIG_X86_64
>> >> >>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>> >> >>  #endif
>> >> >> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> >> >> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> >> >> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> >> >> +     if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER))
>> >> >> +             nested_vmx_exit_ctls_high &=
>> >> >> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> >> >> +     if (!(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> >> >> +             nested_vmx_pinbased_ctls_high &=
>> >> >> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> >> > The following logic is clearer:
>> >> > if (nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER)
>> >> >         nested_vmx_exit_ctls_high |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
>> >> Here I have such consideration: this logic is wrong if CPU support
>> >> PIN_BASED_VMX_PREEMPTION_TIMER but doesn't support
>> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know if this
>> does
>> >> occurs. So the codes above reads the MSR and reserves the features it
>> >> supports, and here I just check if these two features are supported
>> >> simultaneously.
>> >>
>> > No. Only VM_EXIT_SAVE_VMX_PREEMPTION_TIMER depends on
>> > PIN_BASED_VMX_PREEMPTION_TIMER. PIN_BASED_VMX_PREEMPTION_TIMER is an
>> > independent feature
>> >
>> >> You remind me that this piece of code can be written like this:
>> >> if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
>> >>     !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>> >>         nested_vmx_exit_ctls_high &= (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>> >>         nested_vmx_pinbased_ctls_high &= (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> >> }
>> >>
>> >> This may reflect the logic I describe above that these two flags
>> >> should support simultaneously, and brings less confusion.
>> >> >
>> >> > BTW: I don't see nested_vmx_setup_ctls_msrs() considers the
>> >> > hardware's
>> >> capability when expose those vmx features(not just preemption timer) to L1.
>> >> The codes just above here, when setting pinbased control for nested
>> >> vmx, it firstly "rdmsr MSR_IA32_VMX_PINBASED_CTLS", then use this to
>> >> mask the features hardware not support. So does other control fields.
>> >> >
>> > Yes, I saw it.
>> >
>> >> >>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>> >> >>                                     VM_EXIT_LOAD_IA32_EFER);
>> >> >>
>> >> >> @@ -6707,6 +6714,23 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>> >> >>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>> >> >>  }
>> >> >>
>> >> >> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>> >> >> +{
>> >> >> +     u64 delta_tsc_l1;
>> >> >> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>> >> >> +
>> >> >> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> >> >> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> >> >> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >> >> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> >> >> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>> >> >> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> >> >> +     if (preempt_val_l2 - preempt_val_l1 < 0)
>> >> >> +             preempt_val_l2 = 0;
>> >> >> +     else
>> >> >> +             preempt_val_l2 -= preempt_val_l1;
>> >> >> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>> >> >> +}
>> >> >>  /*
>> >> >>   * The guest has exited.  See if we can fix it or if we need userspace
>> >> >>   * assistance.
>> >> >> @@ -6716,6 +6740,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>> >> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >> >>       u32 exit_reason = vmx->exit_reason;
>> >> >>       u32 vectoring_info = vmx->idt_vectoring_info;
>> >> >> +     int ret;
>> >> >>
>> >> >>       /* If guest state is invalid, start emulating */
>> >> >>       if (vmx->emulation_required)
>> >> >> @@ -6795,12 +6820,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>> >> >>
>> >> >>       if (exit_reason < kvm_vmx_max_exit_handlers
>> >> >>           && kvm_vmx_exit_handlers[exit_reason])
>> >> >> -             return kvm_vmx_exit_handlers[exit_reason](vcpu);
>> >> >> +             ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
>> >> >>       else {
>> >> >>               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
>> >> >>               vcpu->run->hw.hardware_exit_reason = exit_reason;
>> >> >> +             ret = 0;
>> >> >>       }
>> >> >> -     return 0;
>> >> >> +     if (is_guest_mode(vcpu))
>> >> >> +             nested_adjust_preemption_timer(vcpu);
>> >> > Move this forward to avoid the changes for ret.
>> >> The previous codes simply "return
>> >> kvm_vmx_exit_handlers[exit_reason](vcpu);", which may also consumes
>> >> CPU times. So put "nested_adjust_preemption_timer" ahead may cause
>> >> the statistics inaccuracy.
>> > Then you should put it before vmentry. Here still far from the point of
>> vmentry.
>> So where is the actual point of vmentry? I'm not quite familiar with that piece
>> of codes.
> vmx_vcpu_run should be the right place.
>
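(For illustration only, not part of the posted patch: a minimal sketch of that
placement, assuming the helper keeps its current name and assuming a gating
condition on the nested preemption timer actually being enabled.

	/* Sketch: adjust the L2 preemption timer right before re-entering L2. */
	static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
	{
		if (is_guest_mode(vcpu) &&
		    (get_vmcs12(vcpu)->pin_based_vm_exec_control &
		     PIN_BASED_VMX_PREEMPTION_TIMER))
			nested_adjust_preemption_timer(vcpu);

		/* ... the existing vmentry path continues unchanged ... */
	}

The exact condition is an assumption here; the point is only where the call
would sit.)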
>> >
>> >> >> +     return ret;
>> >> >>  }
>> >> >>
>> >> >>  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
>> >> >> @@ -7518,6 +7546,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>> >> >>  {
>> >> >>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>> >> >>       u32 exec_control;
>> >> >> +     u32 exit_control;
>> >> >>
>> >> >>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>> >> >>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> >> >> @@ -7691,7 +7720,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>> >> >>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>> >> >>        * bits are further modified by vmx_set_efer() below.
>> >> >>        */
>> >> >> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> >> >> +     exit_control = vmcs_config.vmexit_ctrl;
>> >> >> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> >> >> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> >> >> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>> >> > And here should be problem if host doesn't support
>> >> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>> >> Nested vmx does check the hardware support of these features in
>> >> "nested_vmx_setup_ctls_msrs", see my comments above.
>> > But here you don't check it. You just set it unconditionally.
>> What we have checked in nested_vmx_setup_ctls_msrs() confirms that only if
>> these two features supported by hardware, nested vmx can enable preemption
>> timer. That is to say, if L2 can set PIN_BASED_VMX_PREEMPTION_TIMER
>> without failure, hardware must support
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, which should guarantee in
>> nested_vmx_setup_ctls_msrs (). (This is also why I write code like that in
>> nested_vmx_setup_ctls_msrs ()).
> I see your point. But the fact is that PIN_BASED_VMX_PREEMPTION_TIMER is an independent feature. So your previous assumption is wrong. The hardware may support PIN_BASED_VMX_PREEMPTION_TIMER without VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
I know what you mean, but see the changes to
nested_vmx_setup_ctls_msrs() in this patch.
nested_vmx_setup_ctls_msrs() is used to advertise features to the nested
VM. The changes to this function ensure that the nested guest can set
PIN_BASED_VMX_PREEMPTION_TIMER only if the hardware supports both
PIN_BASED_VMX_PREEMPTION_TIMER and VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
So I don't need to check here: if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is
not supported by the hardware, neither feature is exposed to the nested
VM. The nested VM cannot vmenter if it sets either of them, so it can
never reach this code.
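To illustrate that guarantee (a sketch only, not code from the patch): the
nested vmentry path validates vmcs12's controls against the MSR values
advertised by nested_vmx_setup_ctls_msrs(), conceptually something like the
hypothetical helper below, so a pin-based bit that was masked out there can
never be seen set here.

	/* Hypothetical check mirroring the allowed-0/allowed-1 MSR semantics. */
	static bool nested_pinbased_allowed(u32 requested)
	{
		return ((requested | nested_vmx_pinbased_ctls_low) == requested) &&
		       ((requested & nested_vmx_pinbased_ctls_high) == requested);
	}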
>
>> >
>> >> >
>> >> >>
>> >> >>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>> >> >>        * emulated by vmx_set_efer(), below.
>> >> >> @@ -8090,6 +8122,17 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>> >> >>       vmcs12->guest_pending_dbg_exceptions =
>> >> >>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>> >> >>
>> >> >> +     if (vmcs12->pin_based_vm_exec_control &
>> >> >> +                     PIN_BASED_VMX_PREEMPTION_TIMER) {
>> >> >> +             if (vmcs12->vm_exit_controls &
>> >> >> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>> >> >> +                     vmcs12->vmx_preemption_timer_value =
>> >> >> +                             vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> >> >> +             else
>> >> >> +                     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
>> >> >> +                             vmcs12->vmx_preemption_timer_value);
>> >> > Why write it to vmcs directly if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
>> >> > not set?
>> >> Yes, writing is needless here since vmcs02 will be re-prepared via
>> >> prepare_vmcs02() when L1->L2. This function just save information
>> >> needed, vmcs_write is useless.
>> >>
>> > I don't sure I am seeing your point. Per my point, even the
>> VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not setting, you still need to
>> save the value to vmcs12->vmx_preemption_timer_value. Or else,
>> prepare_vmcs02 cannot get the right value.
>> If L2 doesn't enable VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, its value will
>> be reset to what we set initially. This function is called due to
>> L2->L1, so we should emulate L2's real exit process here. What we have
>> done in other parts is to handle vmexit of L2->L0->L2, which is not the "real" L2
>> vmexit.
> I am not describing it clearly. I mean that if VM_EXIT_SAVE_VMX_PREEMPTION_TIMER is not set, then prepare_vmcs02() always writes the initial value, i.e. vmcs12->vmx_preemption_timer_value, to vmcs02 before L2 runs. So the vmcs write here is redundant. Am I wrong?
Yes, I get your point, and I meant the same thing: the vmcs_write is redundant.
I may have confused you with my poor description. ;)
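A minimal sketch of what the hunk looks like with the redundant write dropped
(field names as in the quoted patch; this only illustrates the point we agree
on, it is not the final code):

	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
		/* Save only; prepare_vmcs02() reloads vmcs02 on the next entry. */
		vmcs12->vmx_preemption_timer_value =
			vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);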

Arthur
>
>>
>> Arthur
>> >
>> >> Arthur
>> >> >
>> >> >> +     }
>> >> >> +
>> >> >>       /*
>> >> >>        * In some cases (usually, nested EPT), L2 is allowed to change its
>> >> >>        * own CR3 without exiting. If it has changed it, we must keep it.
>> >> >> --
>> >> >> 1.7.9.5
>> >> >>
>> >> >
>> >> > Best regards,
>> >> > Yang
>> >> >
>> >> >
>> >
>> > Best regards,
>> > Yang
>> >
>> >
>
> Best regards,
> Yang
>
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer
  2013-09-05  9:24     ` Zhang, Yang Z
  2013-09-05  9:50       ` Arthur Chunqi Li
@ 2013-09-13 11:58       ` Paolo Bonzini
  1 sibling, 0 replies; 8+ messages in thread
From: Paolo Bonzini @ 2013-09-13 11:58 UTC (permalink / raw)
  To: Zhang, Yang Z; +Cc: Arthur Chunqi Li, kvm, jan.kiszka, gleb

On 05/09/2013 11:24, Zhang, Yang Z wrote:
> > Here I have such consideration: this logic is wrong if CPU support
> > PIN_BASED_VMX_PREEMPTION_TIMER but doesn't support
> > VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, though I don't know if this does
> > occurs. So the codes above reads the MSR and reserves the features it
> > supports, and here I just check if these two features are supported
> > simultaneously.
> 
> No. Only VM_EXIT_SAVE_VMX_PREEMPTION_TIMER depends on
> PIN_BASED_VMX_PREEMPTION_TIMER. PIN_BASED_VMX_PREEMPTION_TIMER is an
> independent feature

Not for the current implementation of the preemption timer in nested VMX.

In order to emulate PIN_BASED_VMX_PREEMPTION_TIMER, you need to
decrement the preemption timer in vmcs02 whenever you have an L2->L0->L2
exit.  The current implementation chooses to use
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER to do this decrement.

It is possible to not use it.  As the Intel SDM says (31.13), "the VMM
may use another timer (e.g. the TSC) to track the amount of time the VM
has run while still using the VMX-preemption timer for VM preemption. In
this scenario the VMM would not save the VMX-preemption timer on each
VM-exit but instead would reload the VMX-preemption timer with initial
VM quantum less the time the VM has already run. This scenario includes
all the VM-entry and VM-exit latencies in the VM run time".  In fact,
perhaps it's even a better choice, but for Arthur's code it is correct
to depend on VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
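
For reference, a rough sketch of that alternative (purely illustrative;
"l2_entry_tsc" and "initial_quantum" are made-up bookkeeping fields, not
existing ones) would reload the remaining quantum before each entry instead
of saving the timer on each exit:

	/*
	 * Hypothetical helper: emulate the preemption timer without
	 * VM_EXIT_SAVE_VMX_PREEMPTION_TIMER by recomputing the remaining
	 * quantum from the TSC before every entry to L2.
	 */
	static void nested_reload_preemption_timer(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);
		u32 scale = native_read_msr(MSR_IA32_VMX_MISC) &
			    MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
		u64 ran = (kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()) -
			   vmx->nested.l2_entry_tsc) >> scale;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
			     ran < vmx->nested.initial_quantum ?
			     vmx->nested.initial_quantum - ran : 0);
	}

As the SDM passage above notes, this folds the VM-entry and VM-exit latencies
into the guest's run time, which is why it is only a possible alternative and
not what the patch does.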

Paolo

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2013-09-13 11:58 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-04 15:22 [PATCH v3] KVM: nVMX: Fully support of nested VMX preemption timer Arthur Chunqi Li
2013-09-05  7:45 ` Zhang, Yang Z
2013-09-05  8:47   ` Arthur Chunqi Li
2013-09-05  9:24     ` Zhang, Yang Z
2013-09-05  9:50       ` Arthur Chunqi Li
2013-09-05 11:05         ` Zhang, Yang Z
2013-09-05 14:19           ` Arthur Chunqi Li
2013-09-13 11:58       ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.