* [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
@ 2010-12-02 13:59 Anthony Liguori
  2010-12-02 14:39 ` lidong chen
                   ` (2 more replies)
  0 siblings, 3 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 13:59 UTC (permalink / raw)
  To: kvm
  Cc: Avi Kivity, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri,
	Anthony Liguori

In certain use-cases, we want to allocate guests fixed time slices where idle
guest cycles leave the machine idling.  There are many approaches to achieve
this but the most direct is to simply avoid trapping the HLT instruction which
lets the guest directly execute the instruction putting the processor to sleep.

Introduce this as a module-level option for kvm-vmx.ko since if you do this
for one guest, you probably want to do it for all.  A similar option is possible
for AMD but I don't have easy access to AMD test hardware.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
v1 -> v2
 - Rename parameter to yield_on_hlt
 - Remove __read_mostly

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index caa967e..d8310e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static int __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
+static int yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
@@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
-	min = CPU_BASED_HLT_EXITING |
+	min =
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING;
+
+	if (yield_on_hlt)
+		min |= CPU_BASED_HLT_EXITING;
+
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
-- 
1.7.0.4



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 13:59 [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2) Anthony Liguori
@ 2010-12-02 14:39 ` lidong chen
  2010-12-02 15:23   ` Anthony Liguori
  2010-12-02 15:23   ` Anthony Liguori
  2010-12-02 17:37 ` Marcelo Tosatti
  2010-12-02 19:14 ` Chris Wright
  2 siblings, 2 replies; 69+ messages in thread
From: lidong chen @ 2010-12-02 14:39 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, Avi Kivity, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

In certain use-cases, we want to allocate guests fixed time slices where idle
guest cycles leave the machine idling.

I could not understand why we need this. Could you explain in more detail?
thanks.


2010/12/2 Anthony Liguori <aliguori@us.ibm.com>:
> In certain use-cases, we want to allocate guests fixed time slices where idle
> guest cycles leave the machine idling.  There are many approaches to achieve
> this but the most direct is to simply avoid trapping the HLT instruction which
> lets the guest directly execute the instruction putting the processor to sleep.
>
> Introduce this as a module-level option for kvm-vmx.ko since if you do this
> for one guest, you probably want to do it for all.  A similar option is possible
> for AMD but I don't have easy access to AMD test hardware.
>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
> ---
> v1 -> v2
>  - Rename parameter to yield_on_hlt
>  - Remove __read_mostly
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index caa967e..d8310e4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>  static int __read_mostly vmm_exclusive = 1;
>  module_param(vmm_exclusive, bool, S_IRUGO);
>
> +static int yield_on_hlt = 1;
> +module_param(yield_on_hlt, bool, S_IRUGO);
> +
>  #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
>        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>  #define KVM_GUEST_CR0_MASK                                             \
> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>                                &_pin_based_exec_control) < 0)
>                return -EIO;
>
> -       min = CPU_BASED_HLT_EXITING |
> +       min =
>  #ifdef CONFIG_X86_64
>              CPU_BASED_CR8_LOAD_EXITING |
>              CPU_BASED_CR8_STORE_EXITING |
> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>              CPU_BASED_MWAIT_EXITING |
>              CPU_BASED_MONITOR_EXITING |
>              CPU_BASED_INVLPG_EXITING;
> +
> +       if (yield_on_hlt)
> +               min |= CPU_BASED_HLT_EXITING;
> +
>        opt = CPU_BASED_TPR_SHADOW |
>              CPU_BASED_USE_MSR_BITMAPS |
>              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> --
> 1.7.0.4
>


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 14:39 ` lidong chen
@ 2010-12-02 15:23   ` Anthony Liguori
  2010-12-02 15:23   ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 15:23 UTC (permalink / raw)
  To: lidong chen
  Cc: kvm, Avi Kivity, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 08:39 AM, lidong chen wrote:
> In certain use-cases, we want to allocate guests fixed time slices where idle
> guest cycles leave the machine idling.
>
> i could not understand why need this? can you tell more detailedly?
>    

If you run 4 guests on a CPU, and they're all trying to consume 100% 
CPU, all things being equal, you'll get ~25% CPU for each guest.

However, if one guest is idle, you'll get something like 1% 32% 33% 
32%.  This characteristic is usually desirable because it increases 
aggregate throughput, but in some circumstances, determinism is more 
desirable than aggregate throughput.

This patch essentially makes guest execution non-work conserving by 
making it appear to the scheduler that each guest wants 100% CPU even 
though they may be idling.

That means that regardless of what each guest is doing, if you have four 
guests on one CPU, each will get ~25% CPU[1].

[1] there are corner cases around things like forced sleep due to PFs 
and the like.  The goal is not 100% determinism but rather to obtain 
significantly more determinism than we have now.

Regards,

Anthony Liguori
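
To put numbers on that, here is a minimal user-space sketch of the share 
arithmetic (plain C, nothing KVM-specific; the demand values are made up 
purely for illustration):

#include <stdio.h>

#define NGUESTS 4

int main(void)
{
    /* fraction of its slice each guest wants: 1.0 = busy, 0.0 = idle */
    double demand[NGUESTS] = { 0.0, 1.0, 1.0, 1.0 };
    double busy = 0.0;
    int i;

    for (i = 0; i < NGUESTS; i++)
        busy += demand[i];

    printf("guest  work-conserving  non-work-conserving\n");
    for (i = 0; i < NGUESTS; i++) {
        /* work-conserving: an idle guest's cycles go to the busy guests */
        double wc = demand[i] > 0.0 ? demand[i] / busy : 0.0;
        /* non-work-conserving (hlt not trapped): everyone holds 1/N */
        double nwc = 1.0 / NGUESTS;

        printf("%5d  %14.0f%%  %18.0f%%\n", i, wc * 100, nwc * 100);
    }
    return 0;
}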

> thanks.
>
>
> 2010/12/2 Anthony Liguori<aliguori@us.ibm.com>:
>    
>> In certain use-cases, we want to allocate guests fixed time slices where idle
>> guest cycles leave the machine idling.  There are many approaches to achieve
>> this but the most direct is to simply avoid trapping the HLT instruction which
>> lets the guest directly execute the instruction putting the processor to sleep.
>>
>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>> for one guest, you probably want to do it for all.  A similar option is possible
>> for AMD but I don't have easy access to AMD test hardware.
>>
>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com>
>> ---
>> v1 ->  v2
>>   - Rename parameter to yield_on_hlt
>>   - Remove __read_mostly
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index caa967e..d8310e4 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>>   static int __read_mostly vmm_exclusive = 1;
>>   module_param(vmm_exclusive, bool, S_IRUGO);
>>
>> +static int yield_on_hlt = 1;
>> +module_param(yield_on_hlt, bool, S_IRUGO);
>> +
>>   #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
>>         (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>>   #define KVM_GUEST_CR0_MASK                                             \
>> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>                                 &_pin_based_exec_control)<  0)
>>                 return -EIO;
>>
>> -       min = CPU_BASED_HLT_EXITING |
>> +       min =
>>   #ifdef CONFIG_X86_64
>>               CPU_BASED_CR8_LOAD_EXITING |
>>               CPU_BASED_CR8_STORE_EXITING |
>> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>               CPU_BASED_MWAIT_EXITING |
>>               CPU_BASED_MONITOR_EXITING |
>>               CPU_BASED_INVLPG_EXITING;
>> +
>> +       if (yield_on_hlt)
>> +               min |= CPU_BASED_HLT_EXITING;
>> +
>>         opt = CPU_BASED_TPR_SHADOW |
>>               CPU_BASED_USE_MSR_BITMAPS |
>>               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
>> --
>> 1.7.0.4
>>



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 14:39 ` lidong chen
  2010-12-02 15:23   ` Anthony Liguori
@ 2010-12-02 15:23   ` Anthony Liguori
  2010-12-03  9:38     ` Avi Kivity
  1 sibling, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 15:23 UTC (permalink / raw)
  To: lidong chen
  Cc: kvm, Avi Kivity, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 08:39 AM, lidong chen wrote:
> In certain use-cases, we want to allocate guests fixed time slices where idle
> guest cycles leave the machine idling.
>
> i could not understand why need this? can you tell more detailedly?
>    

If you run 4 guests on a CPU, and they're all trying to consume 100% 
CPU, all things being equal, you'll get ~25% CPU for each guest.

However, if one guest is idle, you'll get something like 1% 32% 33% 
32%.  This characteristic is usually desirable because it increases 
aggregate throughput, but in some circumstances, determinism is more 
desirable than aggregate throughput.

This patch essentially makes guest execution non-work conserving by 
making it appear to the scheduler that each guest wants 100% CPU even 
though they may be idling.

That means that regardless of what each guest is doing, if you have four 
guests on one CPU, each will get ~25% CPU[1].

[1] there are corner cases around things like forced sleep due to PFs 
and the like.  The goal is not 100% determinism but rather to obtain 
significantly more determinism than we have now.

Regards,

Anthony Liguori

> thanks.
>
>
> 2010/12/2 Anthony Liguori<aliguori@us.ibm.com>:
>    
>> In certain use-cases, we want to allocate guests fixed time slices where idle
>> guest cycles leave the machine idling.  There are many approaches to achieve
>> this but the most direct is to simply avoid trapping the HLT instruction which
>> lets the guest directly execute the instruction putting the processor to sleep.
>>
>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>> for one guest, you probably want to do it for all.  A similar option is possible
>> for AMD but I don't have easy access to AMD test hardware.
>>
>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com>
>> ---
>> v1 ->  v2
>>   - Rename parameter to yield_on_hlt
>>   - Remove __read_mostly
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index caa967e..d8310e4 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>>   static int __read_mostly vmm_exclusive = 1;
>>   module_param(vmm_exclusive, bool, S_IRUGO);
>>
>> +static int yield_on_hlt = 1;
>> +module_param(yield_on_hlt, bool, S_IRUGO);
>> +
>>   #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
>>         (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>>   #define KVM_GUEST_CR0_MASK                                             \
>> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>                                 &_pin_based_exec_control)<  0)
>>                 return -EIO;
>>
>> -       min = CPU_BASED_HLT_EXITING |
>> +       min =
>>   #ifdef CONFIG_X86_64
>>               CPU_BASED_CR8_LOAD_EXITING |
>>               CPU_BASED_CR8_STORE_EXITING |
>> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>               CPU_BASED_MWAIT_EXITING |
>>               CPU_BASED_MONITOR_EXITING |
>>               CPU_BASED_INVLPG_EXITING;
>> +
>> +       if (yield_on_hlt)
>> +               min |= CPU_BASED_HLT_EXITING;
>> +
>>         opt = CPU_BASED_TPR_SHADOW |
>>               CPU_BASED_USE_MSR_BITMAPS |
>>               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
>> --
>> 1.7.0.4
>>



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 13:59 [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2) Anthony Liguori
  2010-12-02 14:39 ` lidong chen
@ 2010-12-02 17:37 ` Marcelo Tosatti
  2010-12-02 19:07   ` Anthony Liguori
  2010-12-03 22:42   ` Anthony Liguori
  2010-12-02 19:14 ` Chris Wright
  2 siblings, 2 replies; 69+ messages in thread
From: Marcelo Tosatti @ 2010-12-02 17:37 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On Thu, Dec 02, 2010 at 07:59:17AM -0600, Anthony Liguori wrote:
> In certain use-cases, we want to allocate guests fixed time slices where idle
> guest cycles leave the machine idling.  There are many approaches to achieve
> this but the most direct is to simply avoid trapping the HLT instruction which
> lets the guest directly execute the instruction putting the processor to sleep.
> 
> Introduce this as a module-level option for kvm-vmx.ko since if you do this
> for one guest, you probably want to do it for all.  A similar option is possible
> for AMD but I don't have easy access to AMD test hardware.
> 
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
> ---
> v1 -> v2
>  - Rename parameter to yield_on_hlt
>  - Remove __read_mostly
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index caa967e..d8310e4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>  static int __read_mostly vmm_exclusive = 1;
>  module_param(vmm_exclusive, bool, S_IRUGO);
>  
> +static int yield_on_hlt = 1;
> +module_param(yield_on_hlt, bool, S_IRUGO);
> +
>  #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
>  	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>  #define KVM_GUEST_CR0_MASK						\
> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  				&_pin_based_exec_control) < 0)
>  		return -EIO;
>  
> -	min = CPU_BASED_HLT_EXITING |
> +	min =
>  #ifdef CONFIG_X86_64
>  	      CPU_BASED_CR8_LOAD_EXITING |
>  	      CPU_BASED_CR8_STORE_EXITING |
> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  	      CPU_BASED_MWAIT_EXITING |
>  	      CPU_BASED_MONITOR_EXITING |
>  	      CPU_BASED_INVLPG_EXITING;
> +
> +	if (yield_on_hlt)
> +		min |= CPU_BASED_HLT_EXITING;
> +
>  	opt = CPU_BASED_TPR_SHADOW |
>  	      CPU_BASED_USE_MSR_BITMAPS |
>  	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> -- 
> 1.7.0.4

Breaks async PF (see "checks on guest state"), timer reinjection
probably. It should be possible to achieve determinism with 
a scheduler policy?





* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 17:37 ` Marcelo Tosatti
@ 2010-12-02 19:07   ` Anthony Liguori
  2010-12-02 20:12     ` Marcelo Tosatti
  2010-12-03 22:42   ` Anthony Liguori
  1 sibling, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 19:07 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 11:37 AM, Marcelo Tosatti wrote:
> On Thu, Dec 02, 2010 at 07:59:17AM -0600, Anthony Liguori wrote:
>    
>> In certain use-cases, we want to allocate guests fixed time slices where idle
>> guest cycles leave the machine idling.  There are many approaches to achieve
>> this but the most direct is to simply avoid trapping the HLT instruction which
>> lets the guest directly execute the instruction putting the processor to sleep.
>>
>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>> for one guest, you probably want to do it for all.  A similar option is possible
>> for AMD but I don't have easy access to AMD test hardware.
>>
>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com>
>> ---
>> v1 ->  v2
>>   - Rename parameter to yield_on_hlt
>>   - Remove __read_mostly
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index caa967e..d8310e4 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>>   static int __read_mostly vmm_exclusive = 1;
>>   module_param(vmm_exclusive, bool, S_IRUGO);
>>
>> +static int yield_on_hlt = 1;
>> +module_param(yield_on_hlt, bool, S_IRUGO);
>> +
>>   #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
>>   	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>>   #define KVM_GUEST_CR0_MASK						\
>> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>   				&_pin_based_exec_control)<  0)
>>   		return -EIO;
>>
>> -	min = CPU_BASED_HLT_EXITING |
>> +	min =
>>   #ifdef CONFIG_X86_64
>>   	      CPU_BASED_CR8_LOAD_EXITING |
>>   	      CPU_BASED_CR8_STORE_EXITING |
>> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>   	      CPU_BASED_MWAIT_EXITING |
>>   	      CPU_BASED_MONITOR_EXITING |
>>   	      CPU_BASED_INVLPG_EXITING;
>> +
>> +	if (yield_on_hlt)
>> +		min |= CPU_BASED_HLT_EXITING;
>> +
>>   	opt = CPU_BASED_TPR_SHADOW |
>>   	      CPU_BASED_USE_MSR_BITMAPS |
>>   	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
>> -- 
>> 1.7.0.4
>>      
> Breaks async PF (see "checks on guest state"),

Sorry, I don't follow what you mean here.  Can you elaborate?

>   timer reinjection
> probably.

Timer reinjection will continue to work as expected.  If a guest is 
halted and an external interrupt is delivered (by a timer), the guest 
will still exit as expected.

I can't think of anything that would be functionally correct and still 
depend on getting hlt exits because, ultimately, a guest never actually 
has to do a hlt (and certainly there are guests that won't).

>   It should be possible to achieve determinism with
> a scheduler policy?
>    

If the ultimate desire is to have the guests be scheduled in a 
non-work-conserving fashion, I can't see a more direct approach than to 
simply not have the guests yield (which is ultimately what hlt trapping 
does).

Anything the scheduler would do is after the fact and probably based on 
inference about why the guest yielded.

Regards,

Anthony Liguori




* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 13:59 [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2) Anthony Liguori
  2010-12-02 14:39 ` lidong chen
  2010-12-02 17:37 ` Marcelo Tosatti
@ 2010-12-02 19:14 ` Chris Wright
  2010-12-02 20:25   ` Anthony Liguori
                     ` (3 more replies)
  2 siblings, 4 replies; 69+ messages in thread
From: Chris Wright @ 2010-12-02 19:14 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: kvm, Avi Kivity, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

* Anthony Liguori (aliguori@us.ibm.com) wrote:
> In certain use-cases, we want to allocate guests fixed time slices where idle
> guest cycles leave the machine idling.  There are many approaches to achieve
> this but the most direct is to simply avoid trapping the HLT instruction which
> lets the guest directly execute the instruction putting the processor to sleep.

I like the idea, esp to keep from burning power.

> Introduce this as a module-level option for kvm-vmx.ko since if you do this
> for one guest, you probably want to do it for all.  A similar option is possible
> for AMD but I don't have easy access to AMD test hardware.

Perhaps it should be a VM level option.  And then invert the notion.
Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
(pin in place probably).  And have that VM do nothing other than hlt.
Then it's always runnable according to scheduler, and can "consume" the
extra work that CFS wants to give away.

What do you think?

thanks,
-chris
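
For illustration, a minimal sketch of the per-vcpu loop such a filler 
domain might run; this is hypothetical ring-0 guest code (HLT is a 
privileged instruction), e.g. a tiny kernel image, not an actual 
implementation:

/*
 * Hypothetical filler-guest idle loop.  With HLT exiting disabled for
 * this domain, each HLT halts the physical CPU until the next interrupt
 * instead of yielding the vcpu thread back to the host scheduler.
 */
void filler_vcpu_loop(void)
{
    for (;;)
        __asm__ volatile("hlt");
}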


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 19:07   ` Anthony Liguori
@ 2010-12-02 20:12     ` Marcelo Tosatti
  2010-12-02 20:51       ` Anthony Liguori
  2010-12-03 23:31       ` Anthony Liguori
  0 siblings, 2 replies; 69+ messages in thread
From: Marcelo Tosatti @ 2010-12-02 20:12 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

> >>  	opt = CPU_BASED_TPR_SHADOW |
> >>  	      CPU_BASED_USE_MSR_BITMAPS |
> >>  	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> >>-- 
> >>1.7.0.4
> >Breaks async PF (see "checks on guest state"),
> 
> Sorry, I don't follow what you mean here.  Can you elaborate?

A VCPU in the HLT activity state only allows injection of certain
events that would be delivered on HLT.  #PF is not one of them.

You'd have to handle this situation on event injection; vmentry fails
otherwise.  Or perhaps clear the HLT state on vmexit and vmentry.
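
As a rough sketch of that constraint (paraphrasing the VM-entry 
consistency checks from memory; treat the exact allowed-exception list 
as an assumption, not a quote of the SDM):

#include <stdbool.h>
#include <stdio.h>

enum evt_type { EXT_INTR, NMI, HW_EXCEPTION };

struct pending_event {
    enum evt_type type;
    int vector;             /* exception vector, e.g. 14 for #PF */
};

/* events that may be injected while the guest activity state is HLT */
static bool injectable_in_hlt(struct pending_event e)
{
    switch (e.type) {
    case EXT_INTR:
    case NMI:
        return true;        /* these would wake a halted CPU */
    case HW_EXCEPTION:
        /* only a few exceptions (e.g. #DB=1, #MC=18) are allowed */
        return e.vector == 1 || e.vector == 18;
    }
    return false;
}

int main(void)
{
    struct pending_event async_pf = { HW_EXCEPTION, 14 };

    printf("async PF (#PF) injectable while halted: %s\n",
           injectable_in_hlt(async_pf) ? "yes" : "no (vmentry would fail)");
    return 0;
}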

> >  timer reinjection
> >probably.
> 
> Timer reinjection will continue to work as expected.  If a guest is
> halting an external interrupt is delivered (by a timer), the guest
> will still exit as expected.
> 
> I can think of anything that would be functionally correct and still
> depend on getting hlt exits because ultimately, a guest never
> actually has to do a hlt (and certainly there are guests that
> won't).

Pending LAPIC timer events will be reinjected on the entry path, if
accumulated, so they depend on some exit happening.  If you disable
HLT-exiting, the delay will increase.  OK, maybe that's irrelevant.

> >  It should be possible to achieve determinism with
> >a scheduler policy?
> 
> If the desire is the ultimate desire is to have the guests be
> scheduled in a non-work conserving fashion, I can't see a more
> direct approach that to simply not have the guests yield (which is
> ultimately what hlt trapping does).
> 
> Anything the scheduler would do is after the fact and probably based
> on inference about why the yield.

Another issue is that you ignore the host's idea of the best way to sleep
(ACPI, or whatever).

And handling the inactive HLT state (which was never enabled) can be painful.



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 19:14 ` Chris Wright
@ 2010-12-02 20:25   ` Anthony Liguori
  2010-12-02 20:40     ` Chris Wright
  2010-12-02 20:40   ` Marcelo Tosatti
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 20:25 UTC (permalink / raw)
  To: Chris Wright; +Cc: kvm, Avi Kivity, Marcelo Tosatti, Srivatsa Vaddagiri

On 12/02/2010 01:14 PM, Chris Wright wrote:
> * Anthony Liguori (aliguori@us.ibm.com) wrote:
>    
>> In certain use-cases, we want to allocate guests fixed time slices where idle
>> guest cycles leave the machine idling.  There are many approaches to achieve
>> this but the most direct is to simply avoid trapping the HLT instruction which
>> lets the guest directly execute the instruction putting the processor to sleep.
>>      
> I like the idea, esp to keep from burning power.
>
>    
>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>> for one guest, you probably want to do it for all.  A similar option is possible
>> for AMD but I don't have easy access to AMD test hardware.
>>      
> Perhaps it should be a VM level option.  And then invert the notion.
> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> (pin in place probably).  And have that VM do nothing other than hlt.
> Then it's always runnable according to scheduler, and can "consume" the
> extra work that CFS wants to give away.
>    

That's an interesting idea.  I think Vatsa had some ideas about how to 
do this with existing mechanisms.

I'm interested in comparing behavior with fixed allocation because one 
thing the above relies upon is that the filler VM loses its time when 
one of the non-filler VCPUs needs to run.  This may all work correctly, 
but I think it's easier to reason about having each non-filler VCPU 
get a fixed (long) time slice.  If a VCPU needs to wake up to become 
non-idle, it can do so immediately because it already has the PCPU.

Regards,

Anthony Liguori



> What do you think?
>
> thanks,
> -chris



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 19:14 ` Chris Wright
  2010-12-02 20:25   ` Anthony Liguori
@ 2010-12-02 20:40   ` Marcelo Tosatti
  2010-12-02 21:07     ` Chris Wright
                       ` (2 more replies)
  2010-12-03  9:40   ` Avi Kivity
  2010-12-03 11:57   ` Srivatsa Vaddagiri
  3 siblings, 3 replies; 69+ messages in thread
From: Marcelo Tosatti @ 2010-12-02 20:40 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Srivatsa Vaddagiri

On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> * Anthony Liguori (aliguori@us.ibm.com) wrote:
> > In certain use-cases, we want to allocate guests fixed time slices where idle
> > guest cycles leave the machine idling.  There are many approaches to achieve
> > this but the most direct is to simply avoid trapping the HLT instruction which
> > lets the guest directly execute the instruction putting the processor to sleep.
> 
> I like the idea, esp to keep from burning power.
> 
> > Introduce this as a module-level option for kvm-vmx.ko since if you do this
> > for one guest, you probably want to do it for all.  A similar option is possible
> > for AMD but I don't have easy access to AMD test hardware.
> 
> Perhaps it should be a VM level option.  And then invert the notion.
> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> (pin in place probably).  And have that VM do nothing other than hlt.
> Then it's always runnable according to scheduler, and can "consume" the
> extra work that CFS wants to give away.
> 
> What do you think?
> 
> thanks,
> -chris

Consuming the timeslice outside guest mode is less intrusive and easier
to replace. Something like this should work?

if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
    /* halted: burn the remaining timeslice in the host idle routine
     * instead of sleeping, so the vcpu still looks busy to the scheduler */
    while (!need_resched())
        default_idle();
}

But you agree this is not KVM's business.



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:25   ` Anthony Liguori
@ 2010-12-02 20:40     ` Chris Wright
  0 siblings, 0 replies; 69+ messages in thread
From: Chris Wright @ 2010-12-02 20:40 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, kvm, Avi Kivity, Marcelo Tosatti, Srivatsa Vaddagiri

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> On 12/02/2010 01:14 PM, Chris Wright wrote:
> >* Anthony Liguori (aliguori@us.ibm.com) wrote:
> >>In certain use-cases, we want to allocate guests fixed time slices where idle
> >>guest cycles leave the machine idling.  There are many approaches to achieve
> >>this but the most direct is to simply avoid trapping the HLT instruction which
> >>lets the guest directly execute the instruction putting the processor to sleep.
> >I like the idea, esp to keep from burning power.
> >
> >>Introduce this as a module-level option for kvm-vmx.ko since if you do this
> >>for one guest, you probably want to do it for all.  A similar option is possible
> >>for AMD but I don't have easy access to AMD test hardware.
> >Perhaps it should be a VM level option.  And then invert the notion.
> >Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> >(pin in place probably).  And have that VM do nothing other than hlt.
> >Then it's always runnable according to scheduler, and can "consume" the
> >extra work that CFS wants to give away.
> 
> That's an interesting idea.  I think Vatsa had some ideas about how
> to do this with existing mechanisms.

Yeah, should Just Work (TM) w/ smth like evilcap.

> I'm interesting in comparing behavior with fixed allocation because
> one thing the above relies upon is that the filler VM loses it's
> time when one of the non-filler VCPU needs to run.

Priorities?

> This may all
> work correctly but I think it's easier to rationalize about having
> each non-filler VCPU have a fixed (long) time slice.  If a VCPU
> needs to wake up to become non-idle, it can do so immediately
> because it already has the PCPU.

The flipside... you don't have to worry about the issues that Marcelo
brought up.

Should be pretty easy to compare though.

thanks,
-chris


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:12     ` Marcelo Tosatti
@ 2010-12-02 20:51       ` Anthony Liguori
  2010-12-03  9:36         ` Avi Kivity
  2010-12-03 12:40         ` Gleb Natapov
  2010-12-03 23:31       ` Anthony Liguori
  1 sibling, 2 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 20:51 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 02:12 PM, Marcelo Tosatti wrote:
>>>>   	opt = CPU_BASED_TPR_SHADOW |
>>>>   	      CPU_BASED_USE_MSR_BITMAPS |
>>>>   	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
>>>> -- 
>>>> 1.7.0.4
>>>>          
>>> Breaks async PF (see "checks on guest state"),
>>>        
>> Sorry, I don't follow what you mean here.  Can you elaborate?
>>      
> VCPU in HLT state only allows injection of certain events that
> would be delivered on HLT. #PF is not one of them.
>    

But you can't inject an exception into a guest while the VMCS is active, 
can you?  So the guest takes an exit while in the hlt instruction but 
that's no different than if the guest has been interrupted because of 
hlt exiting.

> You'd have to handle this situation on event injection, vmentry fails
> otherwise. Or perhaps clear HLT state on vmexit and vmentry.
>    

So this works today because on a hlt exit, emulate_halt() will clear 
the HLT state, which then puts the vcpu into a state where it can 
receive an exception injection?

Regards,

Anthony Liguori

>>>   timer reinjection
>>> probably.
>>>        
>> Timer reinjection will continue to work as expected.  If a guest is
>> halting an external interrupt is delivered (by a timer), the guest
>> will still exit as expected.
>>
>> I can think of anything that would be functionally correct and still
>> depend on getting hlt exits because ultimately, a guest never
>> actually has to do a hlt (and certainly there are guests that
>> won't).
>>      
> LAPIC pending timer events will be reinjected on entry path, if
> accumulated. So they depend on any exit. If you disable HLT-exiting,
> delay will increase. OK, maybe thats irrelevant.
>
>    
>>>   It should be possible to achieve determinism with
>>> a scheduler policy?
>>>        
>> If the desire is the ultimate desire is to have the guests be
>> scheduled in a non-work conserving fashion, I can't see a more
>> direct approach that to simply not have the guests yield (which is
>> ultimately what hlt trapping does).
>>
>> Anything the scheduler would do is after the fact and probably based
>> on inference about why the yield.
>>      
> Another issue is you ignore the hosts idea of the best way to sleep
> (ACPI, or whatever).
>
> And handling inactive HLT state (which was never enabled) can be painful.
>
>    



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:40   ` Marcelo Tosatti
@ 2010-12-02 21:07     ` Chris Wright
  2010-12-02 22:37       ` Anthony Liguori
  2010-12-02 22:27     ` Anthony Liguori
  2010-12-03 22:49     ` Anthony Liguori
  2 siblings, 1 reply; 69+ messages in thread
From: Chris Wright @ 2010-12-02 21:07 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Srivatsa Vaddagiri

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > * Anthony Liguori (aliguori@us.ibm.com) wrote:
> > > In certain use-cases, we want to allocate guests fixed time slices where idle
> > > guest cycles leave the machine idling.  There are many approaches to achieve
> > > this but the most direct is to simply avoid trapping the HLT instruction which
> > > lets the guest directly execute the instruction putting the processor to sleep.
> > 
> > I like the idea, esp to keep from burning power.
> > 
> > > Introduce this as a module-level option for kvm-vmx.ko since if you do this
> > > for one guest, you probably want to do it for all.  A similar option is possible
> > > for AMD but I don't have easy access to AMD test hardware.
> > 
> > Perhaps it should be a VM level option.  And then invert the notion.
> > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > (pin in place probably).  And have that VM do nothing other than hlt.
> > Then it's always runnable according to scheduler, and can "consume" the
> > extra work that CFS wants to give away.
> > 
> > What do you think?
> > 
> > thanks,
> > -chris
> 
> Consuming the timeslice outside guest mode is less intrusive and easier
> to replace. Something like this should work?
> 
> if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
>     while (!need_resched())
>         default_idle();
> } 
> 
> But you agree this is no KVM business.

Like non-trapping hlt, that too will guarantee that the guest is preempted
by timeslice exhaustion (and is simpler than non-trapping hlt).  So it
may well be the simplest for the case where we are perfectly committed
(i.e. the vcpu fractional core count totals the pcpu count).  But once
we are undercommitted we still need some extra logic to handle the hard
cap and something to kick the running guest off the cpu and suck up the
extra cycles in a power conserving way.

thanks,
-chris


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:40   ` Marcelo Tosatti
  2010-12-02 21:07     ` Chris Wright
@ 2010-12-02 22:27     ` Anthony Liguori
  2010-12-03 22:49     ` Anthony Liguori
  2 siblings, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 22:27 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Chris Wright, kvm, Avi Kivity, Srivatsa Vaddagiri

On 12/02/2010 02:40 PM, Marcelo Tosatti wrote:
> Consuming the timeslice outside guest mode is less intrusive and easier
> to replace. Something like this should work?
>
> if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
>      while (!need_resched())
>          default_idle();
> }
>
> But you agree this is no KVM business.
>    

My initial inclination is that this would be inappropriate for KVM but I 
think I'm slowly convincing myself otherwise.

Ultimately, hard limits and deterministic scheduling are related goals 
but not quite the same.  A hard limit is very specific: you want to 
receive no more than an exact amount of CPU time per VCPU.  With 
deterministic scheduling, you want to make sure that the VMs in a set 
are not influenced by each other's behavior.

You want hard limits when you want to hide the density/capacity of a 
node from the end customer.  You want determinism when you simply want 
to isolate the performance of each customer from the other customers.

That is, the only thing that should affect the performance graph of a VM 
is how many neighbors it has (which is controlled by management 
software) rather than what its neighbors are doing.

If you have hard limits, you can approximate deterministic scheduling 
but it's complex in the face of changing numbers of guests.  
Additionally, hard limits present issues with directed yield that don't 
exist with a deterministic scheduling approach.

You can still donate your time slice to another VCPU because the VCPUs 
are not actually capped.  That may mean that an individual VCPU gets 
more PCPU time than an exact division, but for the VM overall, it won't 
get more than its total share.  So the principle of performance 
isolation for the guest isn't impacted.

Regards,

Anthony Liguori




* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 21:07     ` Chris Wright
@ 2010-12-02 22:37       ` Anthony Liguori
  2010-12-03  2:42         ` Chris Wright
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-02 22:37 UTC (permalink / raw)
  To: Chris Wright; +Cc: Marcelo Tosatti, kvm, Avi Kivity, Srivatsa Vaddagiri

On 12/02/2010 03:07 PM, Chris Wright wrote:
>> But you agree this is no KVM business.
>>      
> Like non-trapping hlt, that too will guarantee that the guest is preempted
> by timeslice exhaustion (and is simpler than non-trapping hlt).  So it
> may well be the simplest for the case where we are perfectly committed
> (i.e. the vcpu fractional core count totals the pcpu count).  But once
> we are undercommitted we still need some extra logic to handle the hard
> cap and something to kick the running guest off the cpu and suck up the
> extra cycles in a power conserving way.
>    

I'm not entirely sure TBH.

If you think of a cloud's per-VCPU capacity in terms of Compute Units, 
having a model where a VCPU maps to 1-3 units depending on total load is 
potentially interesting, particularly if the VCPU's capacity only changes 
in discrete amounts, the expected capacity is communicated to the 
guest, and the capacity only changes periodically.

Regards,

Anthony Liguori

> thanks,
> -chris



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 22:37       ` Anthony Liguori
@ 2010-12-03  2:42         ` Chris Wright
  2010-12-03  3:21           ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Chris Wright @ 2010-12-03  2:42 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Marcelo Tosatti, kvm, Avi Kivity, Srivatsa Vaddagiri

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> On 12/02/2010 03:07 PM, Chris Wright wrote:
> >Like non-trapping hlt, that too will guarantee that the guest is preempted
> >by timeslice exhaustion (and is simpler than non-trapping hlt).  So it
> >may well be the simplest for the case where we are perfectly committed
> >(i.e. the vcpu fractional core count totals the pcpu count).  But once
> >we are undercommitted we still need some extra logic to handle the hard
> >cap and something to kick the running guest off the cpu and suck up the
> >extra cycles in a power conserving way.
> 
> I'm not entirely sure TBH.
> 
> If you think of a cloud's per-VCPU capacity in terms of Compute
> Units, having a model where a VCPU maps to 1-3 units depending on
> total load is potentially interesting particularly if the VCPU's
> capacity only changes in discrete amounts,  that the expected
> capacity is communicated to the guest, and that the capacity only
> changes periodically.

OK, let's say a single PCPU == 12 Compute Units.

If the guest is the first to migrate to a newly added unused host, and
we are using either non-trapping hlt or Marcelo's non-yielding trapping
hlt, then that guest is going to get more CPU than it expected unless
there is some throttling mechanism.  Specifically, it will get 12CU
instead of 1-3CU.

Do you agree with that?

thanks,
-chris


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  2:42         ` Chris Wright
@ 2010-12-03  3:21           ` Anthony Liguori
  2010-12-03  3:44             ` Chris Wright
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03  3:21 UTC (permalink / raw)
  To: Chris Wright; +Cc: Marcelo Tosatti, kvm, Avi Kivity, Srivatsa Vaddagiri

On 12/02/2010 08:42 PM, Chris Wright wrote:
> OK, let's say a single PCPU == 12 Compute Units.
>
> If the guest is the first to migrate to a newly added unused host, and
> we are using either non-trapping hlt or Marcelo's non-yielding trapping
> hlt, then that guest is going to get more CPU than it expected unless
> there is some throttling mechanism.  Specifically, it will get 12CU
> instead of 1-3CU.
>
> Do you agree with that?
>    

Yes.

There's definitely a use-case to have a hard cap.

But I think another common use-case is really just performance 
isolation.  If over the course of a day, you go from 12CU, to 6CU, to 
4CU, that might not be that bad of a thing.

If the environment is designed correctly, of N nodes, N-1 will always be 
at capacity, so it's really just a single node that is underutilized.

Regards,

Anthony Liguori

> thanks,
> -chris
>    



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  3:21           ` Anthony Liguori
@ 2010-12-03  3:44             ` Chris Wright
  2010-12-03 14:25               ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Chris Wright @ 2010-12-03  3:44 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Marcelo Tosatti, kvm, Avi Kivity, Srivatsa Vaddagiri

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> On 12/02/2010 08:42 PM, Chris Wright wrote:
> >OK, let's say a single PCPU == 12 Compute Units.
> >
> >If the guest is the first to migrate to a newly added unused host, and
> >we are using either non-trapping hlt or Marcelo's non-yielding trapping
> >hlt, then that guest is going to get more CPU than it expected unless
> >there is some throttling mechanism.  Specifically, it will get 12CU
> >instead of 1-3CU.
> >
> >Do you agree with that?
> 
> Yes.
> 
> There's definitely a use-case to have a hard cap.

OK, good, just wanted to be clear.  Because this started as a discussion
of hard caps, and it began to sound as if you were no longer advocating
for them.

> But I think another common use-case is really just performance
> isolation.  If over the course of a day, you go from 12CU, to 6CU,
> to 4CU, that might not be that bad of a thing.

I guess it depends on your SLA.  We don't have to do anything to give
varying CU based on host load.  That's the one thing CFS will do for
us quite well ;)

> If the environment is designed correctly, of N nodes, N-1 will
> always be at capacity so it's really just a single node hat is under
> utilized.

Many clouds do a variation on Small, Medium, Large sizing.  So depending
on the scheduler (best fit, rr...) even the notion of at capacity may
change from node to node and during the time of day.

thanks,
-chris


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:51       ` Anthony Liguori
@ 2010-12-03  9:36         ` Avi Kivity
  2010-12-03 22:45           ` Anthony Liguori
  2010-12-03 12:40         ` Gleb Natapov
  1 sibling, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-03  9:36 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 10:51 PM, Anthony Liguori wrote:
>> VCPU in HLT state only allows injection of certain events that
>> would be delivered on HLT. #PF is not one of them.
>
> But you can't inject an exception into a guest while the VMCS is 
> active, can you? 

No, but this is irrelevant.

> So the guest takes an exit while in the hlt instruction but that's no 
> different than if the guest has been interrupted because of hlt exiting.

hlt exiting doesn't leave the vcpu in the halted state (since hlt has not 
been executed).  So currently we never see a vcpu in the halted state.

>
>> You'd have to handle this situation on event injection, vmentry fails
>> otherwise. Or perhaps clear HLT state on vmexit and vmentry.
>
> So this works today because on a hlt exit, emulate_halt() will clear 
> the the HLT state which then puts the the vcpu into a state where it 
> can receive an exception injection?

The halt state is never entered.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 15:23   ` Anthony Liguori
@ 2010-12-03  9:38     ` Avi Kivity
  2010-12-03 11:12       ` Srivatsa Vaddagiri
  2010-12-03 23:28       ` Anthony Liguori
  0 siblings, 2 replies; 69+ messages in thread
From: Avi Kivity @ 2010-12-03  9:38 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: lidong chen, kvm, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 05:23 PM, Anthony Liguori wrote:
> On 12/02/2010 08:39 AM, lidong chen wrote:
>> In certain use-cases, we want to allocate guests fixed time slices 
>> where idle
>> guest cycles leave the machine idling.
>>
>> i could not understand why need this? can you tell more detailedly?
>
> If you run 4 guests on a CPU, and they're all trying to consume 100% 
> CPU, all things being equal, you'll get ~25% CPU for each guest.
>
> However, if one guest is idle, you'll get something like 1% 32% 33% 
> 32%.  This characteristic is usually desirable because it increase 
> aggregate throughput but in some circumstances, determinism is more 
> desirable than aggregate throughput.
>
> This patch essentially makes guest execution non-work conserving by 
> making it appear to the scheduler that each guest wants 100% CPU even 
> though they may be idling.
>
> That means that regardless of what each guest is doing, if you have 
> four guests on one CPU, each will get ~25% CPU[1].
>

What if one of the guests crashes qemu or invokes a powerdown?  Suddenly 
the others get 33% each (with 1% going to my secret round-up account).  
Doesn't seem like a reliable way to limit cpu.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 19:14 ` Chris Wright
  2010-12-02 20:25   ` Anthony Liguori
  2010-12-02 20:40   ` Marcelo Tosatti
@ 2010-12-03  9:40   ` Avi Kivity
  2010-12-03 11:21     ` Srivatsa Vaddagiri
  2010-12-03 11:57   ` Srivatsa Vaddagiri
  3 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-03  9:40 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Marcelo Tosatti, Srivatsa Vaddagiri

On 12/02/2010 09:14 PM, Chris Wright wrote:
> Perhaps it should be a VM level option.  And then invert the notion.
> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> (pin in place probably).  And have that VM do nothing other than hlt.
> Then it's always runnable according to scheduler, and can "consume" the
> extra work that CFS wants to give away.

What's the difference between this and the Linux idle threads?

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  9:38     ` Avi Kivity
@ 2010-12-03 11:12       ` Srivatsa Vaddagiri
  2010-12-03 23:28       ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 11:12 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Anthony Liguori, lidong chen, kvm, Marcelo Tosatti, Chris Wright

On Fri, Dec 03, 2010 at 11:38:33AM +0200, Avi Kivity wrote:
> What if one of the guest crashes qemu or invokes a powerdown?
> Suddenly the others get 33% each (with 1% going to my secret
> round-up account).  Doesn't seem like a reliable way to limit cpu.

Some monitoring tool will need to catch that event and spawn a
"dummy" VM to consume 25% cpu, bringing everyone's share back to 25% as before.

That's admittedly not neat, but that's what we are thinking of atm in the absence
of a better solution to the problem (e.g. kernel scheduler support for hard limits).

- vatsa



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  9:40   ` Avi Kivity
@ 2010-12-03 11:21     ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 11:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Chris Wright, Anthony Liguori, kvm, Marcelo Tosatti

On Fri, Dec 03, 2010 at 11:40:27AM +0200, Avi Kivity wrote:
> On 12/02/2010 09:14 PM, Chris Wright wrote:
> >Perhaps it should be a VM level option.  And then invert the notion.
> >Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> >(pin in place probably).  And have that VM do nothing other than hlt.
> >Then it's always runnable according to scheduler, and can "consume" the
> >extra work that CFS wants to give away.
> 
> What's the difference between this and the Linux idle threads?

If we have 3 VMs and want to give them 25% each of a CPU, then having just the
idle thread would end up giving them 33% each. One way of achieving a 25% rate
limit is to create a "dummy" or "filler" VM and let it compete for the resource,
thus rate-limiting everyone to 25% in this case. Essentially we are tackling the
rate-limit problem by creating additional "filler" VMs/threads that compete for
the resource, thus keeping in check how much cpu is consumed by the "real" VMs.
Admittedly not as neat as having in-kernel support for rate limits.

- vatsa
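
A back-of-the-envelope helper for that arithmetic (illustrative only, 
assuming the scheduler splits a pcpu evenly among runnable entities):

#include <stdio.h>

/*
 * Capping each of n_real VMs at "share" of a pcpu needs enough
 * always-runnable fillers that the total number of runnable entities
 * is 1/share.
 */
static int fillers_needed(int n_real, double share)
{
    int total = (int)(1.0 / share + 0.5);   /* round to nearest */

    return total > n_real ? total - n_real : 0;
}

int main(void)
{
    /* the example above: 3 VMs capped at 25% each -> one filler VM */
    printf("fillers needed: %d\n", fillers_needed(3, 0.25));
    return 0;
}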



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 19:14 ` Chris Wright
                     ` (2 preceding siblings ...)
  2010-12-03  9:40   ` Avi Kivity
@ 2010-12-03 11:57   ` Srivatsa Vaddagiri
  2010-12-03 16:27     ` Srivatsa Vaddagiri
  2010-12-03 17:28     ` Chris Wright
  3 siblings, 2 replies; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 11:57 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> Perhaps it should be a VM level option.  And then invert the notion.
> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> (pin in place probably).  And have that VM do nothing other than hlt.
> Then it's always runnable according to scheduler, and can "consume" the
> extra work that CFS wants to give away.

That's not sufficient. Let's say we have 3 guests A, B, C that need to be
rate-limited to 25% on a single-cpu system. We create this idle guest D that is
a 100% cpu hog as per the above definition. Now when one of the guests is idle,
what ensures that the idle cycles of A are given only to D and not partly to B/C?

- vatsa


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:51       ` Anthony Liguori
  2010-12-03  9:36         ` Avi Kivity
@ 2010-12-03 12:40         ` Gleb Natapov
  1 sibling, 0 replies; 69+ messages in thread
From: Gleb Natapov @ 2010-12-03 12:40 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Marcelo Tosatti, kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On Thu, Dec 02, 2010 at 02:51:51PM -0600, Anthony Liguori wrote:
> On 12/02/2010 02:12 PM, Marcelo Tosatti wrote:
> >>>>  	opt = CPU_BASED_TPR_SHADOW |
> >>>>  	      CPU_BASED_USE_MSR_BITMAPS |
> >>>>  	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> >>>>-- 
> >>>>1.7.0.4
> >>>Breaks async PF (see "checks on guest state"),
> >>Sorry, I don't follow what you mean here.  Can you elaborate?
> >VCPU in HLT state only allows injection of certain events that
> >would be delivered on HLT. #PF is not one of them.
> 
> But you can't inject an exception into a guest while the VMCS is
> active, can you?  So the guest takes an exit while in the hlt
> instruction but that's no different than if the guest has been
> interrupted because of hlt exiting.
> 
Async PF completions do not kick the vcpu out of guest mode. They wake the
vcpu only if it is waiting on a waitqueue. This was done to avoid generating
unnecessary overhead.

> >You'd have to handle this situation on event injection, vmentry fails
> >otherwise. Or perhaps clear HLT state on vmexit and vmentry.
> 
> So this works today because on a hlt exit, emulate_halt() will clear
> the the HLT state which then puts the the vcpu into a state where it
> can receive an exception injection?
> 
> Regards,
> 
> Anthony Liguori
> 
> >>>  timer reinjection
> >>>probably.
> >>Timer reinjection will continue to work as expected.  If a guest is
> >>halting an external interrupt is delivered (by a timer), the guest
> >>will still exit as expected.
> >>
> >>I can think of anything that would be functionally correct and still
> >>depend on getting hlt exits because ultimately, a guest never
> >>actually has to do a hlt (and certainly there are guests that
> >>won't).
> >LAPIC pending timer events will be reinjected on entry path, if
> >accumulated. So they depend on any exit. If you disable HLT-exiting,
> >delay will increase. OK, maybe thats irrelevant.
> >
> >>>  It should be possible to achieve determinism with
> >>>a scheduler policy?
> >>If the desire is the ultimate desire is to have the guests be
> >>scheduled in a non-work conserving fashion, I can't see a more
> >>direct approach that to simply not have the guests yield (which is
> >>ultimately what hlt trapping does).
> >>
> >>Anything the scheduler would do is after the fact and probably based
> >>on inference about why the yield.
> >Another issue is you ignore the hosts idea of the best way to sleep
> >(ACPI, or whatever).
> >
> >And handling inactive HLT state (which was never enabled) can be painful.
> >
> 

--
			Gleb.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  3:44             ` Chris Wright
@ 2010-12-03 14:25               ` Anthony Liguori
  0 siblings, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 14:25 UTC (permalink / raw)
  To: Chris Wright; +Cc: Marcelo Tosatti, kvm, Avi Kivity, Srivatsa Vaddagiri

On 12/02/2010 09:44 PM, Chris Wright wrote:
>> Yes.
>>
>> There's definitely a use-case to have a hard cap.
>>      
> OK, good, just wanted to be clear.  Because this started as a discussion
> of hard caps, and it began to sound as if you were no longer advocating
> for them.
>
>    
>> But I think another common use-case is really just performance
>> isolation.  If over the course of a day, you go from 12CU, to 6CU,
>> to 4CU, that might not be that bad of a thing.
>>      
> I guess it depends on your SLA.  We don't have to do anything to give
> varying CU based on host load.  That's the one thing CFS will do for
> us quite well ;)
>    

I'm really anticipating things like the EC2 micro instance where the CPU 
allotment is variable.  Variable allotments are interesting from a 
density perspective but having interdependent performance is definitely 
a problem.

Another way to think about it: a customer reports a performance problem 
at 1PM.  With non-yielding guests, you can look at logs and see that the 
expected capacity was 2CU (it may have changed to 4CU at 3PM).  However, 
without something like non-yielding guests, the performance is almost 
entirely unpredictable, and unless you have an exact timestamp from the 
customer along with a fine-granularity performance log, there's no way 
to determine whether it's expected behavior.

>> If the environment is designed correctly, of N nodes, N-1 will
>> always be at capacity so it's really just a single node that is under
>> utilized.
>>      
> Many clouds do a variation on Small, Medium, Large sizing.  So depending
> on the scheduler (best fit, rr...) even the notion of at capacity may
> change from node to node and during the time of day.
>    

An ideal cloud will make sure that something like 4 Small == 2 Medium == 
1 Large instance, and that the machine capacity is always a multiple of 
the Large instance size.

With a division like this, you can always achieve maximum density 
provided that you can support live migration.

Regards,

Anthony Liguori


> thanks,
> -chris
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 11:57   ` Srivatsa Vaddagiri
@ 2010-12-03 16:27     ` Srivatsa Vaddagiri
  2010-12-03 17:29       ` Chris Wright
  2010-12-03 17:28     ` Chris Wright
  1 sibling, 1 reply; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 16:27 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 05:27:52PM +0530, Srivatsa Vaddagiri wrote:
> On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > Perhaps it should be a VM level option.  And then invert the notion.
> > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > (pin in place probably).  And have that VM do nothing other than hlt.
> > Then it's always runnable according to scheduler, and can "consume" the
> > extra work that CFS wants to give away.
> 
> That's not sufficient. Lets we have 3 guests A, B, C that need to be rate
> limited to 25% on a single cpu system. We create this idle guest D that is 100%
> cpu hog as per above definition. Now when one of the guest is idle, what ensures
> that the idle cycles of A is given only to D and not partly to B/C?

To tackle this problem, I was thinking of having a fill-thread associated with 
each vcpu (i.e. both belong to the same cgroup). The fill-thread consumes idle 
cycles left by the vcpu, but otherwise doesn't compete with it for cycles.

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 11:57   ` Srivatsa Vaddagiri
  2010-12-03 16:27     ` Srivatsa Vaddagiri
@ 2010-12-03 17:28     ` Chris Wright
  2010-12-03 17:36       ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 69+ messages in thread
From: Chris Wright @ 2010-12-03 17:28 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

* Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > Perhaps it should be a VM level option.  And then invert the notion.
> > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > (pin in place probably).  And have that VM do nothing other than hlt.
> > Then it's always runnable according to scheduler, and can "consume" the
> > extra work that CFS wants to give away.
> 
> That's not sufficient. Lets we have 3 guests A, B, C that need to be
> rate limited to 25% on a single cpu system. We create this idle guest
> D that is 100% cpu hog as per above definition. Now when one of the
> guest is idle, what ensures that the idle cycles of A is given only
> to D and not partly to B/C?

Yeah, I pictured priorities handling this.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 16:27     ` Srivatsa Vaddagiri
@ 2010-12-03 17:29       ` Chris Wright
  2010-12-03 17:33         ` Srivatsa Vaddagiri
  2010-12-03 17:57         ` Srivatsa Vaddagiri
  0 siblings, 2 replies; 69+ messages in thread
From: Chris Wright @ 2010-12-03 17:29 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

* Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> On Fri, Dec 03, 2010 at 05:27:52PM +0530, Srivatsa Vaddagiri wrote:
> > On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > > Perhaps it should be a VM level option.  And then invert the notion.
> > > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > > (pin in place probably).  And have that VM do nothing other than hlt.
> > > Then it's always runnable according to scheduler, and can "consume" the
> > > extra work that CFS wants to give away.
> > 
> > That's not sufficient. Lets we have 3 guests A, B, C that need to be rate
> > limited to 25% on a single cpu system. We create this idle guest D that is 100%
> > cpu hog as per above definition. Now when one of the guest is idle, what ensures
> > that the idle cycles of A is given only to D and not partly to B/C?
> 
> To tackle this problem, I was thinking of having a fill-thread associated with 
> each vcpu (i.e both belong to same cgroup). Fill-thread consumes idle cycles 
> left by vcpu, but otherwise doesn't compete with it for cycles.

That's what Marcelo's suggestion does w/out a fill thread.

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:29       ` Chris Wright
@ 2010-12-03 17:33         ` Srivatsa Vaddagiri
  2010-12-04  8:18           ` Avi Kivity
  2010-12-03 17:57         ` Srivatsa Vaddagiri
  1 sibling, 1 reply; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 17:33 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> That's what Marcelo's suggestion does w/out a fill thread.

Are we willing to add that to KVM sources?

I was working under the constraints of not modifying the kernel (especially 
avoid adding short term hacks that become unnecessary in longer run, in this
case when kernel-based hard limits goes in).

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:28     ` Chris Wright
@ 2010-12-03 17:36       ` Srivatsa Vaddagiri
  2010-12-03 17:38         ` Chris Wright
  0 siblings, 1 reply; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 17:36 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 09:28:25AM -0800, Chris Wright wrote:
> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> > On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > > Perhaps it should be a VM level option.  And then invert the notion.
> > > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > > (pin in place probably).  And have that VM do nothing other than hlt.
> > > Then it's always runnable according to scheduler, and can "consume" the
> > > extra work that CFS wants to give away.
> > 
> > That's not sufficient. Lets we have 3 guests A, B, C that need to be
> > rate limited to 25% on a single cpu system. We create this idle guest
> > D that is 100% cpu hog as per above definition. Now when one of the
> > guest is idle, what ensures that the idle cycles of A is given only
> > to D and not partly to B/C?
> 
> Yeah, I pictured priorties handling this.

All guests are of equal priority in this case (that's how we are able to divide 
time into 25% chunks), so unless we dynamically boost D's priority based on how
idle other VMs are, it's not going to be easy!

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:36       ` Srivatsa Vaddagiri
@ 2010-12-03 17:38         ` Chris Wright
  2010-12-03 17:43           ` Srivatsa Vaddagiri
  2010-12-03 17:47           ` Anthony Liguori
  0 siblings, 2 replies; 69+ messages in thread
From: Chris Wright @ 2010-12-03 17:38 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

* Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> On Fri, Dec 03, 2010 at 09:28:25AM -0800, Chris Wright wrote:
> > * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> > > On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
> > > > Perhaps it should be a VM level option.  And then invert the notion.
> > > > Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
> > > > (pin in place probably).  And have that VM do nothing other than hlt.
> > > > Then it's always runnable according to scheduler, and can "consume" the
> > > > extra work that CFS wants to give away.
> > > 
> > > That's not sufficient. Lets we have 3 guests A, B, C that need to be
> > > rate limited to 25% on a single cpu system. We create this idle guest
> > > D that is 100% cpu hog as per above definition. Now when one of the
> > > guest is idle, what ensures that the idle cycles of A is given only
> > > to D and not partly to B/C?
> > 
> > Yeah, I pictured priorties handling this.
> 
> All guest are of equal priorty in this case (that's how we are able to divide 
> time into 25% chunks), so unless we dynamically boost D's priority based on how
> idle other VMs are, its not going to be easy!

Right, I think there has to be an external mgmt entity.  Because num
vcpus is not static.  So priorities have to be rebalanced at vcpu
create/destroy time.

thanks,
-chris

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:38         ` Chris Wright
@ 2010-12-03 17:43           ` Srivatsa Vaddagiri
  2010-12-03 17:47           ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 17:43 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 09:38:05AM -0800, Chris Wright wrote:
> > All guest are of equal priorty in this case (that's how we are able to divide 
> > time into 25% chunks), so unless we dynamically boost D's priority based on how
> > idle other VMs are, its not going to be easy!
> 
> Right, I think there has to be an external mgmt entity.  Because num
> vcpus is not static.  So priorities have to be rebalanaced at vcpu
> create/destroy time.

and at idle/non-idle time as well, which makes the mgmt entity's job rather
harder? Anyway, if we are willing to take a patch to burn cycles upon halt (as
per Marcelo's patch), that'd be the best (short-term) solution .. otherwise,
something like a filler-thread per-vcpu is easier than dynamically changing
priorities ..

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:38         ` Chris Wright
  2010-12-03 17:43           ` Srivatsa Vaddagiri
@ 2010-12-03 17:47           ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 17:47 UTC (permalink / raw)
  To: Chris Wright; +Cc: Srivatsa Vaddagiri, kvm, Avi Kivity, Marcelo Tosatti

On 12/03/2010 11:38 AM, Chris Wright wrote:
> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
>    
>> On Fri, Dec 03, 2010 at 09:28:25AM -0800, Chris Wright wrote:
>>      
>>> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
>>>        
>>>> On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
>>>>          
>>>>> Perhaps it should be a VM level option.  And then invert the notion.
>>>>> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
>>>>> (pin in place probably).  And have that VM do nothing other than hlt.
>>>>> Then it's always runnable according to scheduler, and can "consume" the
>>>>> extra work that CFS wants to give away.
>>>>>            
>>>> That's not sufficient. Lets we have 3 guests A, B, C that need to be
>>>> rate limited to 25% on a single cpu system. We create this idle guest
>>>> D that is 100% cpu hog as per above definition. Now when one of the
>>>> guest is idle, what ensures that the idle cycles of A is given only
>>>> to D and not partly to B/C?
>>>>          
>>> Yeah, I pictured priorties handling this.
>>>        
>> All guest are of equal priorty in this case (that's how we are able to divide
>> time into 25% chunks), so unless we dynamically boost D's priority based on how
>> idle other VMs are, its not going to be easy!
>>      
> Right, I think there has to be an external mgmt entity.  Because num
> vcpus is not static.  So priorities have to be rebalanaced at vcpu
> create/destroy time.
>    

We've actually done a fair amount of testing using priorities like 
this.  The granularity is extremely poor because priorities don't map 
linearly to cpu time allotment.  The interaction with background tasks 
also gets extremely complicated.

Regards,

Anthony Liguori

> thanks,
> -chris
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:29       ` Chris Wright
  2010-12-03 17:33         ` Srivatsa Vaddagiri
@ 2010-12-03 17:57         ` Srivatsa Vaddagiri
  2010-12-03 17:58           ` Chris Wright
  1 sibling, 1 reply; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 17:57 UTC (permalink / raw)
  To: Chris Wright; +Cc: Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> That's what Marcelo's suggestion does w/out a fill thread.

There's one complication though even with that. How do we compute the
real utilization of the VM (given that it will appear to be burning 100% cycles)?
We need to have the scheduler discount the cycles burnt post halt-exit, so more
stuff is needed than those simple 3-4 lines!

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:57         ` Srivatsa Vaddagiri
@ 2010-12-03 17:58           ` Chris Wright
  2010-12-03 18:07             ` Anthony Liguori
  2010-12-03 18:10             ` Marcelo Tosatti
  0 siblings, 2 replies; 69+ messages in thread
From: Chris Wright @ 2010-12-03 17:58 UTC (permalink / raw)
  To: Srivatsa Vaddagiri
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

* Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> > That's what Marcelo's suggestion does w/out a fill thread.
> 
> There's one complication though even with that. How do we compute the
> real utilization of VM (given that it will appear to be burning 100% cycles)?
> We need to have scheduler discount the cycles burnt post halt-exit, so more
> stuff is needed than those simple 3-4 lines!

Heh, was just about to say the same thing ;)

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:58           ` Chris Wright
@ 2010-12-03 18:07             ` Anthony Liguori
  2010-12-03 18:12               ` Srivatsa Vaddagiri
  2010-12-03 18:20               ` Chris Wright
  2010-12-03 18:10             ` Marcelo Tosatti
  1 sibling, 2 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 18:07 UTC (permalink / raw)
  To: Chris Wright
  Cc: Srivatsa Vaddagiri, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On 12/03/2010 11:58 AM, Chris Wright wrote:
> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
>    
>> On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
>>      
>>> That's what Marcelo's suggestion does w/out a fill thread.
>>>        
>> There's one complication though even with that. How do we compute the
>> real utilization of VM (given that it will appear to be burning 100% cycles)?
>> We need to have scheduler discount the cycles burnt post halt-exit, so more
>> stuff is needed than those simple 3-4 lines!
>>      
> Heh, was just about to say the same thing ;)
>    

My first reaction is that it's not terribly important to account the 
non-idle time in the guest because of the use-case for this model.

Eventually, it might be nice to have idle time accounting but I don't 
see it as a critical feature here.

Non-idle time simply isn't as meaningful here as it normally would be.  
If you have 10 VMs in a normal environment and see that you have only 50% 
CPU utilization, you might be inclined to add more VMs.  But if you're 
offering deterministic execution, it doesn't matter if you only have 
"50%" utilization.  If you add another VM, the guests will get exactly 
the same impact as if they were using 100% utilization.

Regards,

Anthony Liguori

> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:58           ` Chris Wright
  2010-12-03 18:07             ` Anthony Liguori
@ 2010-12-03 18:10             ` Marcelo Tosatti
  2010-12-03 18:24               ` Marcelo Tosatti
  1 sibling, 1 reply; 69+ messages in thread
From: Marcelo Tosatti @ 2010-12-03 18:10 UTC (permalink / raw)
  To: Chris Wright; +Cc: Srivatsa Vaddagiri, Anthony Liguori, kvm, Avi Kivity

On Fri, Dec 03, 2010 at 09:58:54AM -0800, Chris Wright wrote:
> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> > On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> > > That's what Marcelo's suggestion does w/out a fill thread.
> > 
> > There's one complication though even with that. How do we compute the
> > real utilization of VM (given that it will appear to be burning 100% cycles)?
> > We need to have scheduler discount the cycles burnt post halt-exit, so more
> > stuff is needed than those simple 3-4 lines!
> 
> Heh, was just about to say the same thing ;)

Probably yes. The point is, you get the same effect as with the
non-trapping hlt but without the complications in the low-level VMX/SVM
code.

Even better if you can do it with the fill thread idea.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 18:07             ` Anthony Liguori
@ 2010-12-03 18:12               ` Srivatsa Vaddagiri
  2010-12-04  8:19                 ` Avi Kivity
  2010-12-03 18:20               ` Chris Wright
  1 sibling, 1 reply; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-03 18:12 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Anthony Liguori, kvm, Avi Kivity, Marcelo Tosatti

On Fri, Dec 03, 2010 at 12:07:15PM -0600, Anthony Liguori wrote:
> My first reaction is that it's not terribly important to account the
> non-idle time in the guest because of the use-case for this model.

Agreed ...but I was considering the larger user-base who may be surprised to see
their VMs being reported as 100% hogs when they had left them idle.

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 18:07             ` Anthony Liguori
  2010-12-03 18:12               ` Srivatsa Vaddagiri
@ 2010-12-03 18:20               ` Chris Wright
  2010-12-03 18:55                 ` Anthony Liguori
  1 sibling, 1 reply; 69+ messages in thread
From: Chris Wright @ 2010-12-03 18:20 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Chris Wright, Srivatsa Vaddagiri, Anthony Liguori, kvm,
	Avi Kivity, Marcelo Tosatti

* Anthony Liguori (anthony@codemonkey.ws) wrote:
> On 12/03/2010 11:58 AM, Chris Wright wrote:
> >* Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> >>On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> >>>That's what Marcelo's suggestion does w/out a fill thread.
> >>There's one complication though even with that. How do we compute the
> >>real utilization of VM (given that it will appear to be burning 100% cycles)?
> >>We need to have scheduler discount the cycles burnt post halt-exit, so more
> >>stuff is needed than those simple 3-4 lines!
> >Heh, was just about to say the same thing ;)
> 
> My first reaction is that it's not terribly important to account the
> non-idle time in the guest because of the use-case for this model.

Depends on the chargeback model.  This would put guest vcpu runtime vs
host running guest vcpu time really out of skew.  ('course w/out steal
and that time it's already out of skew).  But I think most models are
more uptime based rather than actual runtime now.

> Eventually, it might be nice to have idle time accounting but I
> don't see it as a critical feature here.
> 
> Non-idle time simply isn't as meaningful here as it normally would
> be.  If you have 10 VMs in a normal environment and saw that you had
> only 50% CPU utilization, you might be inclined to add more VMs.

Who is "you"?  cloud user, or cloud service provider's scheduler?
On the user side, 50% cpu utilization wouldn't trigger me to add new
VMs.  On the host side, 50% cpu utilization would have to be measured 
solely in terms of guest vcpu count.

> But if you're offering deterministic execution, it doesn't matter if
> you only have "50%" utilization.  If you add another VM, the guests
> will get exactly the same impact as if they were using 100%
> utilization.

Sorry, didn't follow here?

thanks,
-chris

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 18:10             ` Marcelo Tosatti
@ 2010-12-03 18:24               ` Marcelo Tosatti
  0 siblings, 0 replies; 69+ messages in thread
From: Marcelo Tosatti @ 2010-12-03 18:24 UTC (permalink / raw)
  To: Chris Wright; +Cc: Srivatsa Vaddagiri, Anthony Liguori, kvm, Avi Kivity

On Fri, Dec 03, 2010 at 04:10:43PM -0200, Marcelo Tosatti wrote:
> On Fri, Dec 03, 2010 at 09:58:54AM -0800, Chris Wright wrote:
> > * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
> > > On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> > > > That's what Marcelo's suggestion does w/out a fill thread.
> > > 
> > > There's one complication though even with that. How do we compute the
> > > real utilization of VM (given that it will appear to be burning 100% cycles)?
> > > We need to have scheduler discount the cycles burnt post halt-exit, so more
> > > stuff is needed than those simple 3-4 lines!
> > 
> > Heh, was just about to say the same thing ;)
> 
> Probably yes. The point is, you get the same effect as with the
> non-trapping hlt but without the complications on low-level VMX/SVM
> code.
> 
> Even better if you can do it with fill thread idea.

Well, no. Better to consume hlt time but yield if need_resched or in 
case of any event which breaks out of kvm_vcpu_block.
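
(A rough sketch of that idea, not a tested patch: spin away the slice in host
context instead of sleeping in kvm_vcpu_block(), but give the CPU back for the
usual wakeup conditions. The function name and its placement are assumptions.)

static void kvm_vcpu_consume_halt(struct kvm_vcpu *vcpu)
{
	/* Burn the remainder of the timeslice while the vcpu stays halted,
	 * bailing out for anything that would normally wake kvm_vcpu_block(). */
	while (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
	       !kvm_arch_vcpu_runnable(vcpu) &&
	       !need_resched() &&
	       !signal_pending(current))
		cpu_relax();
}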



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 18:20               ` Chris Wright
@ 2010-12-03 18:55                 ` Anthony Liguori
  0 siblings, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 18:55 UTC (permalink / raw)
  To: Chris Wright; +Cc: Srivatsa Vaddagiri, kvm, Avi Kivity, Marcelo Tosatti

On 12/03/2010 12:20 PM, Chris Wright wrote:
> * Anthony Liguori (anthony@codemonkey.ws) wrote:
>    
>> On 12/03/2010 11:58 AM, Chris Wright wrote:
>>      
>>> * Srivatsa Vaddagiri (vatsa@linux.vnet.ibm.com) wrote:
>>>        
>>>> On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
>>>>          
>>>>> That's what Marcelo's suggestion does w/out a fill thread.
>>>>>            
>>>> There's one complication though even with that. How do we compute the
>>>> real utilization of VM (given that it will appear to be burning 100% cycles)?
>>>> We need to have scheduler discount the cycles burnt post halt-exit, so more
>>>> stuff is needed than those simple 3-4 lines!
>>>>          
>>> Heh, was just about to say the same thing ;)
>>>        
>> My first reaction is that it's not terribly important to account the
>> non-idle time in the guest because of the use-case for this model.
>>      
> Depends on the chargeback model.  This would put guest vcpu runtime vs
> host running guest vcpu time really out of skew.  ('course w/out steal
> and that time it's already out of skew).  But I think most models are
> more uptime based rather then actual runtime now.
>    

Right.  I'm not familiar with any models that are actually based on 
CPU-consumption accounting.  In general, the feedback I've 
received is that predictable accounting is pretty critical, so I don't 
anticipate something as volatile as CPU consumption ever being something 
that's explicitly charged for in a granular fashion.

>> Eventually, it might be nice to have idle time accounting but I
>> don't see it as a critical feature here.
>>
>> Non-idle time simply isn't as meaningful here as it normally would
>> be.  If you have 10 VMs in a normal environment and saw that you had
>> only 50% CPU utilization, you might be inclined to add more VMs.
>>      
> Who is "you"?  cloud user, or cloud service provider's scheduler?
> On the user side, 50% cpu utilization wouldn't trigger me to add new
> VMs.  On the host side, 50% cpu utilization would have to be measure
> solely in terms of guest vcpu count.
>
>    
>> But if you're offering deterministic execution, it doesn't matter if
>> you only have "50%" utilization.  If you add another VM, the guests
>> will get exactly the same impact as if they were using 100%
>> utilization.
>>      
> Sorry, didn't follow here?
>    

The question is, why would something care about host CPU utilization?  
The answer I can think of is, something wants to measure host CPU 
utilization to identify an underutilized node.  Once the underutilized 
node is identified, more work can be given to it.

Adding more work to an underutilized node doesn't change the amount of 
work that can be done.  More concretely, one PCPU, four independent 
VCPUs.  They are consuming 25%, 25%, 25%, and 12% respectively.  My 
management software says, ah hah, I can stick a fifth VCPU that's only 
using 5% on this box.  The other VCPUs are unaffected.

However, in a no-yield-on-hlt model, if I have four VCPUs, they each get 
25%, 25%, 25%, 25% on the host.  Three of the VCPUs are running 100% in 
the guest and one is running 50%.

If I add a fifth VCPU, even if it's only using 5%, each VCPU drops to 
20%.  That means the three VCPUs that are consuming 100% now see a 20% 
drop in their performance even though you've added an idle guest.
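
(Spelling out that last step: each vcpu's share goes from 1/4 to 1/5 of the pcpu,
and (0.25 - 0.20) / 0.25 = 0.20, so the three fully busy guests each lose 20% of
the throughput they had before the fifth vcpu showed up.)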

Basically, the traditional view of density simply doesn't apply in this 
model.

Regards,

Anthony Liguori

> thanks,
> -chris
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 17:37 ` Marcelo Tosatti
  2010-12-02 19:07   ` Anthony Liguori
@ 2010-12-03 22:42   ` Anthony Liguori
  2010-12-04  8:16     ` Avi Kivity
  1 sibling, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 22:42 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 11:37 AM, Marcelo Tosatti wrote:
> On Thu, Dec 02, 2010 at 07:59:17AM -0600, Anthony Liguori wrote:
>    
>> In certain use-cases, we want to allocate guests fixed time slices where idle
>> guest cycles leave the machine idling.  There are many approaches to achieve
>> this but the most direct is to simply avoid trapping the HLT instruction which
>> lets the guest directly execute the instruction putting the processor to sleep.
>>
>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>> for one guest, you probably want to do it for all.  A similar option is possible
>> for AMD but I don't have easy access to AMD test hardware.
>>
>> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com>
>> ---
>> v1 ->  v2
>>   - Rename parameter to yield_on_hlt
>>   - Remove __read_mostly
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index caa967e..d8310e4 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
>>   static int __read_mostly vmm_exclusive = 1;
>>   module_param(vmm_exclusive, bool, S_IRUGO);
>>
>> +static int yield_on_hlt = 1;
>> +module_param(yield_on_hlt, bool, S_IRUGO);
>> +
>>   #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
>>   	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
>>   #define KVM_GUEST_CR0_MASK						\
>> @@ -1419,7 +1422,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>   				&_pin_based_exec_control)<  0)
>>   		return -EIO;
>>
>> -	min = CPU_BASED_HLT_EXITING |
>> +	min =
>>   #ifdef CONFIG_X86_64
>>   	      CPU_BASED_CR8_LOAD_EXITING |
>>   	      CPU_BASED_CR8_STORE_EXITING |
>> @@ -1432,6 +1435,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>   	      CPU_BASED_MWAIT_EXITING |
>>   	      CPU_BASED_MONITOR_EXITING |
>>   	      CPU_BASED_INVLPG_EXITING;
>> +
>> +	if (yield_on_hlt)
>> +		min |= CPU_BASED_HLT_EXITING;
>> +
>>   	opt = CPU_BASED_TPR_SHADOW |
>>   	      CPU_BASED_USE_MSR_BITMAPS |
>>   	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
>> -- 
>> 1.7.0.4
>>      
> Breaks async PF (see "checks on guest state"), timer reinjection
> probably.

In v3, I set the activity state to ACTIVE if the state is currently HLT 
when injecting an exception into a guest.

The effect is that after the exception is handled, if iret is executed, 
the hlt instruction will be restarted.  That seems like the correct 
semantics to me.
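
(The v3 patch itself isn't quoted in this thread; a minimal sketch of the idea,
using the existing GUEST_ACTIVITY_* VMCS definitions, might look like the
following, called from the exception-injection path. The helper name and the
exact call site are assumptions.)

static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/* If the guest was sitting in HLT (only possible when HLT exiting is
	 * disabled), move it back to ACTIVE so the vmentry guest-state checks
	 * accept the injected event. */
	if (!yield_on_hlt &&
	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}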

Regards,

Anthony Liguori

>   It should be possible to achieve determinism with
> a scheduler policy?
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  9:36         ` Avi Kivity
@ 2010-12-03 22:45           ` Anthony Liguori
  2010-12-04  8:13             ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 22:45 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/03/2010 03:36 AM, Avi Kivity wrote:
> On 12/02/2010 10:51 PM, Anthony Liguori wrote:
>>> VCPU in HLT state only allows injection of certain events that
>>> would be delivered on HLT. #PF is not one of them.
>>
>> But you can't inject an exception into a guest while the VMCS is 
>> active, can you? 
>
> No, but this is irrelevant.
>
>> So the guest takes an exit while in the hlt instruction but that's no 
>> different than if the guest has been interrupted because of hlt exiting.
>
> hlt exiting doesn't leave vcpu in the halted state (since hlt has not 
> been executed).  So currently we never see a vcpu in halted state.

Right, you mean the guest activity state being halt.

My understanding is that it just needs to be cleared on exception 
injection.  We could clear it at every vmentry but that would 
introduce a vmcs_read() to the fast path which is undesirable.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:40   ` Marcelo Tosatti
  2010-12-02 21:07     ` Chris Wright
  2010-12-02 22:27     ` Anthony Liguori
@ 2010-12-03 22:49     ` Anthony Liguori
  2010-12-04  5:43       ` Srivatsa Vaddagiri
  2 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 22:49 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Chris Wright, kvm, Avi Kivity, Srivatsa Vaddagiri

On 12/02/2010 02:40 PM, Marcelo Tosatti wrote:
> On Thu, Dec 02, 2010 at 11:14:16AM -0800, Chris Wright wrote:
>    
>> * Anthony Liguori (aliguori@us.ibm.com) wrote:
>>      
>>> In certain use-cases, we want to allocate guests fixed time slices where idle
>>> guest cycles leave the machine idling.  There are many approaches to achieve
>>> this but the most direct is to simply avoid trapping the HLT instruction which
>>> lets the guest directly execute the instruction putting the processor to sleep.
>>>        
>> I like the idea, esp to keep from burning power.
>>
>>      
>>> Introduce this as a module-level option for kvm-vmx.ko since if you do this
>>> for one guest, you probably want to do it for all.  A similar option is possible
>>> for AMD but I don't have easy access to AMD test hardware.
>>>        
>> Perhaps it should be a VM level option.  And then invert the notion.
>> Create one idle domain w/out hlt trap.  Give that VM a vcpu per pcpu
>> (pin in place probably).  And have that VM do nothing other than hlt.
>> Then it's always runnable according to scheduler, and can "consume" the
>> extra work that CFS wants to give away.
>>
>> What do you think?
>>
>> thanks,
>> -chris
>>      
> Consuming the timeslice outside guest mode is less intrusive and easier
> to replace. Something like this should work?
>
> if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
>      while (!need_resched())
>          default_idle();
> }
>    

This looked nice but the implementation in practice wasn't, unless I 
totally misunderstood what you were getting at.

default_idle() is not exported to modules and is not an interface meant 
to be called directly.  Plus, an idle loop like this delays the guest 
until the scheduler wants to run something else but it doesn't account 
for another thread trying to inject an event into the halting thread.  
It's not immediately clear to me how to have what's effectively a wait 
queue that hlts instead of calling the scheduler.  You could mess around 
with various signalling mechanisms but it gets ugly fast.

So I circled back to disabling hlt exiting this time taking care of 
updating GUEST_ACTIVITY_STATE when necessary.

Regards,

Anthony Liguori

> But you agree this is no KVM business.
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03  9:38     ` Avi Kivity
  2010-12-03 11:12       ` Srivatsa Vaddagiri
@ 2010-12-03 23:28       ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 23:28 UTC (permalink / raw)
  To: Avi Kivity
  Cc: lidong chen, kvm, Marcelo Tosatti, Chris Wright, Srivatsa Vaddagiri

On 12/03/2010 03:38 AM, Avi Kivity wrote:
> On 12/02/2010 05:23 PM, Anthony Liguori wrote:
>> On 12/02/2010 08:39 AM, lidong chen wrote:
>>> In certain use-cases, we want to allocate guests fixed time slices 
>>> where idle
>>> guest cycles leave the machine idling.
>>>
>>> i could not understand why need this? can you tell more detailedly?
>>
>> If you run 4 guests on a CPU, and they're all trying to consume 100% 
>> CPU, all things being equal, you'll get ~25% CPU for each guest.
>>
>> However, if one guest is idle, you'll get something like 1% 32% 33% 
>> 32%.  This characteristic is usually desirable because it increase 
>> aggregate throughput but in some circumstances, determinism is more 
>> desirable than aggregate throughput.
>>
>> This patch essentially makes guest execution non-work conserving by 
>> making it appear to the scheduler that each guest wants 100% CPU even 
>> though they may be idling.
>>
>> That means that regardless of what each guest is doing, if you have 
>> four guests on one CPU, each will get ~25% CPU[1].
>>
>
> What if one of the guest crashes qemu or invokes a powerdown?  
> Suddenly the others get 33% each (with 1% going to my secret round-up 
> account).  Doesn't seem like a reliable way to limit cpu.

A guest shutting down is a macro event.  Macro events are easy to track 
and are logged by even the most naive management tools.  Macro events 
affecting performance are a workable problem.  I agree, it would be 
ideal to make them not impact performance but perfection is the enemy of 
good.

The problem with the status quo is that there is no performance 
stability in a consolidation environment.

Regards,

Anthony Liguori



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-02 20:12     ` Marcelo Tosatti
  2010-12-02 20:51       ` Anthony Liguori
@ 2010-12-03 23:31       ` Anthony Liguori
  1 sibling, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-03 23:31 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm, Avi Kivity, Chris Wright, Srivatsa Vaddagiri

On 12/02/2010 02:12 PM, Marcelo Tosatti wrote:
>>>   It should be possible to achieve determinism with
>>> a scheduler policy?
>>>        
>> If the ultimate desire is to have the guests be
>> scheduled in a non-work conserving fashion, I can't see a more
>> direct approach than to simply not have the guests yield (which is
>> ultimately what hlt trapping does).
>>
>> Anything the scheduler would do is after the fact and probably based
>> on inference about why the yield.
>>      
> Another issue is you ignore the host's idea of the best way to sleep
> (ACPI, or whatever).
>    

Non-work conserving schedulers kill polar bears.  There's simply no way 
around it.

The best strategy for power savings is to complete your work as quickly 
as you can and then spend as much time in the deepest sleep mode you 
can.  If you're using a non-work conserving scheduler, you're going to 
take more time to complete a workload, spending needless cycles in 
shallow sleep states.

But that's the price we pay for determinism.  Maybe we can plant some 
trees at the next KVM Forum to offset CPU limits? :-)

> And handling inactive HLT state (which was never enabled) can be painful.
>    

Sorry, I'm not sure what you mean by this.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 22:49     ` Anthony Liguori
@ 2010-12-04  5:43       ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 69+ messages in thread
From: Srivatsa Vaddagiri @ 2010-12-04  5:43 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, Chris Wright, kvm, Avi Kivity

On Fri, Dec 03, 2010 at 04:49:20PM -0600, Anthony Liguori wrote:
> default_idle() is not exported to modules and is not an interface
> meant to be called directly.  Plus, an idle loop like this delays
> the guest until the scheduler wants to run something else but it
> doesn't account for another thread trying to inject an event into
> the halting thread.  It's not immediately clear to me how to have
> what's effectively a wait queue that hlts instead of calls the
> scheduler.  You could mess around with various signalling mechanisms
> but it gets ugly fast.

How about using MWAIT?
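
(For reference, a sketch of what an MWAIT-based wait in the halt path might look
like, using the kernel's __monitor()/__mwait() helpers. Whether mp_state is the
right thing to monitor, and how interrupts and C-state hints should be handled,
are open questions; the function name is made up.)

static void kvm_vcpu_mwait_halt(struct kvm_vcpu *vcpu)
{
	while (!kvm_arch_vcpu_runnable(vcpu) && !need_resched()) {
		/* Arm the monitor on something a waker is expected to write,
		 * then re-check the wakeup conditions before waiting. */
		__monitor(&vcpu->arch.mp_state, 0, 0);
		if (kvm_arch_vcpu_runnable(vcpu) || need_resched())
			break;
		__mwait(0, 0);
	}
}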

- vatsa

^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 22:45           ` Anthony Liguori
@ 2010-12-04  8:13             ` Avi Kivity
  2010-12-04 13:30               ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-04  8:13 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 12:45 AM, Anthony Liguori wrote:
>> hlt exiting doesn't leave vcpu in the halted state (since hlt has not 
>> been executed).  So currently we never see a vcpu in halted state.
>
>
> Right, you mean the guest activity state being halt.
>
> My understanding is that it just needs to be cleared on exception 
> injection.  Would could clear it at every vmentry but that would 
> introduce a vmcs_read() to the fast path which is undesirable.

Also need to skip the hlt instruction (by calling the emulator for example).

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 22:42   ` Anthony Liguori
@ 2010-12-04  8:16     ` Avi Kivity
  2010-12-04 13:48       ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-04  8:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 12:42 AM, Anthony Liguori wrote:
>
> In v3, I set the activity state to ACTIVE if the state is currently 
> HLT when injecting an exception into a guest.
>
> The effect is that after the exception is handled, if iret is 
> executed, the hlt instruction will be restarted.  The seems like the 
> correct semantics to me.

No, an interrupt causes the HLT to be executed, and rip advanced past 
it.  You need to preserve this (both for interrupts and for the apf 
completion exception).

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 17:33         ` Srivatsa Vaddagiri
@ 2010-12-04  8:18           ` Avi Kivity
  0 siblings, 0 replies; 69+ messages in thread
From: Avi Kivity @ 2010-12-04  8:18 UTC (permalink / raw)
  To: vatsa; +Cc: Chris Wright, Anthony Liguori, kvm, Marcelo Tosatti

On 12/03/2010 07:33 PM, Srivatsa Vaddagiri wrote:
> On Fri, Dec 03, 2010 at 09:29:06AM -0800, Chris Wright wrote:
> >  That's what Marcelo's suggestion does w/out a fill thread.
>
> Are we willing to add that to KVM sources?
>

I'd rather avoid it.

> I was working under the constraints of not modifying the kernel (especially
> avoid adding short term hacks that become unnecessary in longer run, in this
> case when kernel-based hard limits goes in).

Yes.  Allowing the guest to execute HLT is fine, but adding scheduling 
smarts to kvm is something else.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-03 18:12               ` Srivatsa Vaddagiri
@ 2010-12-04  8:19                 ` Avi Kivity
  0 siblings, 0 replies; 69+ messages in thread
From: Avi Kivity @ 2010-12-04  8:19 UTC (permalink / raw)
  To: vatsa
  Cc: Anthony Liguori, Chris Wright, Anthony Liguori, kvm, Marcelo Tosatti

On 12/03/2010 08:12 PM, Srivatsa Vaddagiri wrote:
> On Fri, Dec 03, 2010 at 12:07:15PM -0600, Anthony Liguori wrote:
> >  My first reaction is that it's not terribly important to account the
> >  non-idle time in the guest because of the use-case for this model.
>
> Agreed ...but I was considering the larger user-base who may be surprised to see
> their VMs being reported as 100% hogs when they had left it idle.

The larger user base won't enable this option.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-04  8:13             ` Avi Kivity
@ 2010-12-04 13:30               ` Anthony Liguori
  2010-12-06  8:28                 ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-04 13:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 02:13 AM, Avi Kivity wrote:
> On 12/04/2010 12:45 AM, Anthony Liguori wrote:
>>> hlt exiting doesn't leave vcpu in the halted state (since hlt has 
>>> not been executed).  So currently we never see a vcpu in halted state.
>>
>>
>> Right, you mean the guest activity state being halt.
>>
>> My understanding is that it just needs to be cleared on exception 
>> injection.  Would could clear it at every vmentry but that would 
>> introduce a vmcs_read() to the fast path which is undesirable.
>
> Also need to skip the hlt instruction (by calling the emulator for 
> example).

I wasn't sure about this.  Don't you want EIP to point to the beginning 
of the instruction such that the exception will cause the instruction to 
restart?

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-04  8:16     ` Avi Kivity
@ 2010-12-04 13:48       ` Anthony Liguori
  2010-12-06  8:32         ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-04 13:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 02:16 AM, Avi Kivity wrote:
> On 12/04/2010 12:42 AM, Anthony Liguori wrote:
>>
>> In v3, I set the activity state to ACTIVE if the state is currently 
>> HLT when injecting an exception into a guest.
>>
>> The effect is that after the exception is handled, if iret is 
>> executed, the hlt instruction will be restarted.  The seems like the 
>> correct semantics to me.
>
> No, an interrupt causes the HLT to be executed, and rip advanced past 
> it.  You need to preserve this (both for interrupts and for the apf 
> completion exception).

Yeah, I see in the architecture manual it specifically calls out the rip 
advancing after NMI.  It doesn't say anything about debug exceptions 
though.  It's not clear to me if some exceptions cause hlt to be skipped 
and others not.

But then again, skipping hlt seems to be the most conservative thing to 
do for all exceptions because it's always going to be run in a loop.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-04 13:30               ` Anthony Liguori
@ 2010-12-06  8:28                 ` Avi Kivity
  2010-12-06  8:35                   ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06  8:28 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 03:30 PM, Anthony Liguori wrote:
> On 12/04/2010 02:13 AM, Avi Kivity wrote:
>> On 12/04/2010 12:45 AM, Anthony Liguori wrote:
>>>> hlt exiting doesn't leave vcpu in the halted state (since hlt has 
>>>> not been executed).  So currently we never see a vcpu in halted state.
>>>
>>>
>>> Right, you mean the guest activity state being halt.
>>>
>>> My understanding is that it just needs to be cleared on exception 
>>> injection.  Would could clear it at every vmentry but that would 
>>> introduce a vmcs_read() to the fast path which is undesirable.
>>
>> Also need to skip the hlt instruction (by calling the emulator for 
>> example).
>
> I wasn't sure about this.  Don't you want EIP to point to the 
> beginning of the instruction such that the exception will cause the 
> instruction to restart?

An interrupt causes the HLT to complete execution.   APF completion 
counts as an interrupt in this case.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-04 13:48       ` Anthony Liguori
@ 2010-12-06  8:32         ` Avi Kivity
  0 siblings, 0 replies; 69+ messages in thread
From: Avi Kivity @ 2010-12-06  8:32 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/04/2010 03:48 PM, Anthony Liguori wrote:
>> No, an interrupt causes the HLT to be executed, and rip advanced past 
>> it.  You need to preserve this (both for interrupts and for the apf 
>> completion exception).
>
>
> Yeah, I see in the architecture manual it specifically calls out the 
> rip advancing after NMI.  It doesn't say anything about debug 
> exceptions though. 

Instruction breakpoints are triggered before HLT executes; and HLT can't 
trigger data breakpoints.

> It's not clear to me if some exceptions cause hlt to be skipped and 
> others not.

Faults take place before HLT is executed (and thus don't advance RIP).  
Interrupts take place during HLT execution and cause it to complete.

>
> But then again, skipping hlt seems to be the most conservative thing 
> to do for all exceptions because it's always going to be run in a loop.

There's no "all exceptions" here.  Only interrupts can happen, and the 
APF completion, which is wired to a fault vector, but we must treat it 
as an interrupt.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06  8:28                 ` Avi Kivity
@ 2010-12-06  8:35                   ` Avi Kivity
  2010-12-06 13:58                     ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06  8:35 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 10:28 AM, Avi Kivity wrote:
>> I wasn't sure about this.  Don't you want EIP to point to the 
>> beginning of the instruction such that the exception will cause the 
>> instruction to restart?
>
>
> An interrupt causes the HLT to complete execution.   APF completion 
> counts as an interrupt in this case.
>

btw, it's possible a VMEXIT during HLT returns RIP already advanced past 
the HLT instruction.  Please check if this is the case.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06  8:35                   ` Avi Kivity
@ 2010-12-06 13:58                     ` Anthony Liguori
  2010-12-06 14:01                       ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 13:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 02:35 AM, Avi Kivity wrote:
> On 12/06/2010 10:28 AM, Avi Kivity wrote:
>>> I wasn't sure about this.  Don't you want EIP to point to the 
>>> beginning of the instruction such that the exception will cause the 
>>> instruction to restart?
>>
>>
>> An interrupt causes the HLT to complete execution.   APF completion 
>> counts as an interrupt in this case.
>>
>
> btw, it's possible a VMEXIT during HLT returns RIP already advanced 
> past the HLT instruction.  Please check if this is the case.

It's not just possible, it appears to be exactly what happens.

I guess it makes sense that RIP gets advanced before HLT begins to wait.

Regards,

Anthony Liguori



^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 13:58                     ` Anthony Liguori
@ 2010-12-06 14:01                       ` Avi Kivity
  2010-12-06 14:02                         ` Avi Kivity
  2010-12-06 14:03                         ` Anthony Liguori
  0 siblings, 2 replies; 69+ messages in thread
From: Avi Kivity @ 2010-12-06 14:01 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 03:58 PM, Anthony Liguori wrote:
> On 12/06/2010 02:35 AM, Avi Kivity wrote:
>> On 12/06/2010 10:28 AM, Avi Kivity wrote:
>>>> I wasn't sure about this.  Don't you want EIP to point to the 
>>>> beginning of the instruction such that the exception will cause the 
>>>> instruction to restart?
>>>
>>>
>>> An interrupt causes the HLT to complete execution.   APF completion 
>>> counts as an interrupt in this case.
>>>
>>
>> btw, it's possible a VMEXIT during HLT returns RIP already advanced 
>> past the HLT instruction.  Please check if this is the case.
>
> It's not just possible, it appears to be exactly what happens.
>
> I guess it makes sense that RIP gets advanced before HLT begins to wait.
>

It does.  Good, it simplifies the patch.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:01                       ` Avi Kivity
@ 2010-12-06 14:02                         ` Avi Kivity
  2010-12-06 14:08                           ` Anthony Liguori
  2010-12-06 14:03                         ` Anthony Liguori
  1 sibling, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06 14:02 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 04:01 PM, Avi Kivity wrote:
>> It's not just possible, it appears to be exactly what happens.
>>
>> I guess it makes sense that RIP gets advanced before HLT begins to wait.
>>
>
>
> It does.  Good, it simplifies the patch.
>

btw, this is how kvm emulates HLT - first we advance rip, then we sleep 
the vcpu.
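
(Roughly, from the vmx.c/x86.c code of that era, modulo version drift: the exit
handler skips the instruction first, then kvm_emulate_halt() marks the vcpu
halted and the run loop puts it to sleep in kvm_vcpu_block().)

static int handle_halt(struct kvm_vcpu *vcpu)
{
	skip_emulated_instruction(vcpu);  /* advance rip past the hlt */
	return kvm_emulate_halt(vcpu);    /* mark the vcpu HALTED; the run
					     loop sleeps it later */
}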

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 69+ messages in thread

* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:01                       ` Avi Kivity
  2010-12-06 14:02                         ` Avi Kivity
@ 2010-12-06 14:03                         ` Anthony Liguori
  2010-12-06 14:33                           ` Avi Kivity
  1 sibling, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 14:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 08:01 AM, Avi Kivity wrote:
> On 12/06/2010 03:58 PM, Anthony Liguori wrote:
>> On 12/06/2010 02:35 AM, Avi Kivity wrote:
>>> On 12/06/2010 10:28 AM, Avi Kivity wrote:
>>>>> I wasn't sure about this.  Don't you want EIP to point to the 
>>>>> beginning of the instruction such that the exception will cause 
>>>>> the instruction to restart?
>>>>
>>>>
>>>> An interrupt causes the HLT to complete execution.   APF completion 
>>>> counts as an interrupt in this case.
>>>>
>>>
>>> btw, it's possible a VMEXIT during HLT returns RIP already advanced 
>>> past the HLT instruction.  Please check if this is the case.
>>
>> It's not just possible, it appears to be exactly what happens.
>>
>> I guess it makes sense that RIP gets advanced before HLT begins to wait.
>>
>
> It does.  Good, it simplifies the patch.

Any ideas if the unit test framework can be used to validate this?  The 
behavior makes perfect sense but I wanted an excuse to play around with 
it :-)

Not sure if there's a way to validate VMCS state after a hand crafted exit.

Regards,

Anthony Liguori



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:02                         ` Avi Kivity
@ 2010-12-06 14:08                           ` Anthony Liguori
  2010-12-06 14:14                             ` Gleb Natapov
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 14:08 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 08:02 AM, Avi Kivity wrote:
> On 12/06/2010 04:01 PM, Avi Kivity wrote:
>>> It's not just possible, it appears to be exactly what happens.
>>>
>>> I guess it makes sense that RIP gets advanced before HLT begins to 
>>> wait.
>>>
>>
>>
>> It does.  Good, it simplifies the patch.
>>
>
> btw, this is how kvm emulates HLT - first we advance rip, then we 
> sleep the vcpu.

Yes, I assume the hardware updates RIP immediately after fetching the 
instruction.  That's at least how some simpler architectures I'm aware 
of work.

Regards,

Anthony Liguori




* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:08                           ` Anthony Liguori
@ 2010-12-06 14:14                             ` Gleb Natapov
  0 siblings, 0 replies; 69+ messages in thread
From: Gleb Natapov @ 2010-12-06 14:14 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Avi Kivity, Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On Mon, Dec 06, 2010 at 08:08:08AM -0600, Anthony Liguori wrote:
> On 12/06/2010 08:02 AM, Avi Kivity wrote:
> >On 12/06/2010 04:01 PM, Avi Kivity wrote:
> >>>It's not just possible, it appears to be exactly what happens.
> >>>
> >>>I guess it makes sense that RIP gets advanced before HLT
> >>>begins to wait.
> >>>
> >>
> >>
> >>It does.  Good, it simplifies the patch.
> >>
> >
> >btw, this is how kvm emulates HLT - first we advance rip, then we
> >sleep the vcpu.
> 
> Yes, I assume the hardware updates RIP immediately after fetching
> the instruction.  That's at least how some simpler architectures I'm
> aware of work.
> 
By "simpler" you mean "saner"? x86 needs at least decode it before
updating RIP :)
 
--
			Gleb.


* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:03                         ` Anthony Liguori
@ 2010-12-06 14:33                           ` Avi Kivity
  2010-12-06 15:07                             ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06 14:33 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 04:03 PM, Anthony Liguori wrote:
>> It does.  Good, it simplifies the patch.
>
>
> Any ideas if the unit test framework can be used to validate this?  
> The behavior makes perfect sense but I wanted an excuse to play around 
> with it :-)
>

Not the user space one.  The exit we're interested in is external 
interrupt, and that one isn't delivered to userspace.

I guess you could have a loop

  1:  hlt
       jmp 1b

and enter it programming a timer to something close, and examine the 
vcpu state afterwards.  However you don't propagate the VMCS halted 
state to the corresponding kvm state, so there's no way to test it. (a 
minor bug in your patch)
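
As a rough guest-side sketch of that loop (program_oneshot_timer() is a
hypothetical placeholder for whatever LAPIC/PIT setup the test
environment provides, not an existing helper):

  static void hlt_loop(void)
  {
          /* arm a near-term one-shot timer, then spin on HLT so the
           * external-interrupt exit is taken while the vcpu is halted */
          program_oneshot_timer(1000);              /* ~1ms, illustrative */
          asm volatile("sti\n"
                       "1:      hlt\n"
                       "        jmp 1b");
  }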

> Not sure if there's a way to validate VMCS state after a hand crafted 
> exit.
>

KVM_GET_REGS and friends.
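
For example (illustrative only; vcpu_fd is assumed to be an open KVM
vcpu file descriptor, error handling omitted):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static unsigned long long guest_rip(int vcpu_fd)
  {
          struct kvm_regs regs;

          ioctl(vcpu_fd, KVM_GET_REGS, &regs);
          return regs.rip;        /* expect it just past the HLT */
  }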



-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 14:33                           ` Avi Kivity
@ 2010-12-06 15:07                             ` Anthony Liguori
  2010-12-06 15:16                               ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 15:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 08:33 AM, Avi Kivity wrote:
> On 12/06/2010 04:03 PM, Anthony Liguori wrote:
>>> It does.  Good, it simplifies the patch.
>>
>>
>> Any ideas if the unit test framework can be used to validate this?  
>> The behavior makes perfect sense but I wanted an excuse to play 
>> around with it :-)
>>
>
> Not the user space one.  The exit we're interested in is external 
> interrupt, and that one isn't delivered to userspace.
>
> I guess you could have a loop
>
>  1:  hlt
>       jmp 1b
>
> and enter it programming a timer to something close, and examine the 
> vcpu state afterwards.  However you don't propagate the VMCS halted 
> state to the corresponding kvm state, so there's no way to test it. (a 
> minor bug in your patch)

The activity state doesn't get propagated at all to the kvm state.  Can 
we get into a non-zero activity state today (either SHUTDOWN or WAIT_SIPI)?

Regards,

Anthony Liguori

>> Not sure if there's a way to validate VMCS state after a hand crafted 
>> exit.
>>
>
> KVM_GET_REGS and friends.
>
>
>



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 15:07                             ` Anthony Liguori
@ 2010-12-06 15:16                               ` Avi Kivity
  2010-12-06 16:21                                 ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06 15:16 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 05:07 PM, Anthony Liguori wrote:
>> Not the user space one.  The exit we're interested in is external 
>> interrupt, and that one isn't delivered to userspace.
>>
>> I guess you could have a loop
>>
>>  1:  hlt
>>       jmp 1b
>>
>> and enter it programming a timer to something close, and examine the 
>> vcpu state afterwards.  However you don't propagate the VMCS halted 
>> state to the corresponding kvm state, so there's no way to test it. 
>> (a minor bug in your patch)
>
>
> The activity state doesn't get propagated at all to the kvm state.  
> Can we get into a non-zero activity state today (either SHUTDOWN or 
> WAIT_SIPI)?

No, right now we intercept SHUTDOWN (and turn it back, in qemu, into a 
RESET), and we emulate all the SIPI stuff.  We also intercepted HLT so 
we couldn't get that activity state either.
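
For reference, the activity states being discussed are the VMCS guest
activity-state encodings defined by the Intel SDM (mirrored as
GUEST_ACTIVITY_* constants in arch/x86/include/asm/vmx.h and read via
vmcs_read32(GUEST_ACTIVITY_STATE); exact macro coverage varies by
kernel version):

  #define GUEST_ACTIVITY_ACTIVE     0   /* normal execution */
  #define GUEST_ACTIVITY_HLT        1   /* halted by HLT */
  #define GUEST_ACTIVITY_SHUTDOWN   2   /* e.g. after a triple fault */
  #define GUEST_ACTIVITY_WAIT_SIPI  3   /* AP waiting for a startup IPI */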

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 15:16                               ` Avi Kivity
@ 2010-12-06 16:21                                 ` Anthony Liguori
  2010-12-06 16:30                                   ` Avi Kivity
  0 siblings, 1 reply; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 16:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 09:16 AM, Avi Kivity wrote:
> On 12/06/2010 05:07 PM, Anthony Liguori wrote:
>>> Not the user space one.  The exit we're interested in is external 
>>> interrupt, and that one isn't delivered to userspace.
>>>
>>> I guess you could have a loop
>>>
>>>  1:  hlt
>>>       jmp 1b
>>>
>>> and enter it programming a timer to something close, and examine the 
>>> vcpu state afterwards.  However you don't propagate the VMCS halted 
>>> state to the corresponding kvm state, so there's no way to test it. 
>>> (a minor bug in your patch)
>>
>>
>> The activity state doesn't get propagated at all to the kvm state.  
>> Can we get into a non-zero activity state today (either SHUTDOWN or 
>> WAIT_SIPI)?
>
> No, right now we intercept SHUTDOWN (and turn it back, in qemu, into a 
> RESET), and we emulate all the SIPI stuff.  We also intercepted HLT so 
> we couldn't get that activity state either.

Ok, looks like I need to make this per-VM and use a CAP along with an 
interface to enable it, so that userspace ensures it gets 
saved/restored appropriately.  I'll rework it.

Regards,

Anthony Liguori




* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 16:21                                 ` Anthony Liguori
@ 2010-12-06 16:30                                   ` Avi Kivity
  2010-12-06 16:33                                     ` Anthony Liguori
  0 siblings, 1 reply; 69+ messages in thread
From: Avi Kivity @ 2010-12-06 16:30 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 06:21 PM, Anthony Liguori wrote:
>> No, right now we intercept SHUTDOWN (and turn it back, in qemu, into 
>> a RESET), and we emulate all the SIPI stuff.  We also intercepted HLT 
>> so we couldn't get that activity state either.
>
>
> Ok, looks like I need to make this per-VM and use a CAP along with an 
> interface to enable it, so that userspace ensures it gets 
> saved/restored appropriately.  I'll rework it.

Not really, it's not any different from the ordinary intercepted HLT.  
Put the halted state where we currently put it (hint: nowhere), and all 
is fine.

So we have a minor bug in that we don't save/restore the HLT state 
(either emulated or "real" with your patch), which causes a spurious 
wakeup after live migration.
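
For context, the existing save/restore channel for that state is the
mp_state vcpu ioctl pair; an illustrative userspace sketch (the vcpu_fd
arguments are assumed to be open KVM vcpu file descriptors, error
handling omitted):

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* carry the runnable/halted state across a save/restore cycle */
  static void transfer_mp_state(int src_vcpu_fd, int dst_vcpu_fd)
  {
          struct kvm_mp_state mp;

          ioctl(src_vcpu_fd, KVM_GET_MP_STATE, &mp);  /* e.g. KVM_MP_STATE_HALTED */
          ioctl(dst_vcpu_fd, KVM_SET_MP_STATE, &mp);
  }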

Note for your respin, yield_on_hlt became __read_often again, so please 
make it __read_mostly.
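I.e., back to something like:

  static int __read_mostly yield_on_hlt = 1;
  module_param(yield_on_hlt, bool, S_IRUGO);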

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2)
  2010-12-06 16:30                                   ` Avi Kivity
@ 2010-12-06 16:33                                     ` Anthony Liguori
  0 siblings, 0 replies; 69+ messages in thread
From: Anthony Liguori @ 2010-12-06 16:33 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm, Chris Wright, Srivatsa Vaddagiri

On 12/06/2010 10:30 AM, Avi Kivity wrote:
> On 12/06/2010 06:21 PM, Anthony Liguori wrote:
>>> No, right now we intercept SHUTDOWN (and turn it back, in qemu, into 
>>> a RESET), and we emulate all the SIPI stuff.  We also intercepted 
>>> HLT so we couldn't get that activity state either.
>>
>>
>> Ok, looks like I need to make this per-VM and use a CAP along with an 
>> interface to enable it, so that userspace ensures it gets 
>> saved/restored appropriately.  I'll rework it.
>
> Not really, it's not any different from the ordinary intercepted HLT.  
> Put the halted state where we currently put it (hint: nowhere), and 
> all is fine.
>
> So we have a minor bug that we don't save/restore HLT state (either 
> emulated or "real" with your patch), that causes a spurious wakeup 
> after live migration.

Fair enough.  I certainly like fixes that do nothing :-)

> Note for your respin, yield_on_hlt became __read_often again, so 
> please make it __read_mostly.

Done.

Regards,

Anthony Liguori



end of thread

Thread overview: 69+ messages
2010-12-02 13:59 [PATCH] kvm-vmx: add module parameter to avoid trapping HLT instructions (v2) Anthony Liguori
2010-12-02 14:39 ` lidong chen
2010-12-02 15:23   ` Anthony Liguori
2010-12-02 15:23   ` Anthony Liguori
2010-12-03  9:38     ` Avi Kivity
2010-12-03 11:12       ` Srivatsa Vaddagiri
2010-12-03 23:28       ` Anthony Liguori
2010-12-02 17:37 ` Marcelo Tosatti
2010-12-02 19:07   ` Anthony Liguori
2010-12-02 20:12     ` Marcelo Tosatti
2010-12-02 20:51       ` Anthony Liguori
2010-12-03  9:36         ` Avi Kivity
2010-12-03 22:45           ` Anthony Liguori
2010-12-04  8:13             ` Avi Kivity
2010-12-04 13:30               ` Anthony Liguori
2010-12-06  8:28                 ` Avi Kivity
2010-12-06  8:35                   ` Avi Kivity
2010-12-06 13:58                     ` Anthony Liguori
2010-12-06 14:01                       ` Avi Kivity
2010-12-06 14:02                         ` Avi Kivity
2010-12-06 14:08                           ` Anthony Liguori
2010-12-06 14:14                             ` Gleb Natapov
2010-12-06 14:03                         ` Anthony Liguori
2010-12-06 14:33                           ` Avi Kivity
2010-12-06 15:07                             ` Anthony Liguori
2010-12-06 15:16                               ` Avi Kivity
2010-12-06 16:21                                 ` Anthony Liguori
2010-12-06 16:30                                   ` Avi Kivity
2010-12-06 16:33                                     ` Anthony Liguori
2010-12-03 12:40         ` Gleb Natapov
2010-12-03 23:31       ` Anthony Liguori
2010-12-03 22:42   ` Anthony Liguori
2010-12-04  8:16     ` Avi Kivity
2010-12-04 13:48       ` Anthony Liguori
2010-12-06  8:32         ` Avi Kivity
2010-12-02 19:14 ` Chris Wright
2010-12-02 20:25   ` Anthony Liguori
2010-12-02 20:40     ` Chris Wright
2010-12-02 20:40   ` Marcelo Tosatti
2010-12-02 21:07     ` Chris Wright
2010-12-02 22:37       ` Anthony Liguori
2010-12-03  2:42         ` Chris Wright
2010-12-03  3:21           ` Anthony Liguori
2010-12-03  3:44             ` Chris Wright
2010-12-03 14:25               ` Anthony Liguori
2010-12-02 22:27     ` Anthony Liguori
2010-12-03 22:49     ` Anthony Liguori
2010-12-04  5:43       ` Srivatsa Vaddagiri
2010-12-03  9:40   ` Avi Kivity
2010-12-03 11:21     ` Srivatsa Vaddagiri
2010-12-03 11:57   ` Srivatsa Vaddagiri
2010-12-03 16:27     ` Srivatsa Vaddagiri
2010-12-03 17:29       ` Chris Wright
2010-12-03 17:33         ` Srivatsa Vaddagiri
2010-12-04  8:18           ` Avi Kivity
2010-12-03 17:57         ` Srivatsa Vaddagiri
2010-12-03 17:58           ` Chris Wright
2010-12-03 18:07             ` Anthony Liguori
2010-12-03 18:12               ` Srivatsa Vaddagiri
2010-12-04  8:19                 ` Avi Kivity
2010-12-03 18:20               ` Chris Wright
2010-12-03 18:55                 ` Anthony Liguori
2010-12-03 18:10             ` Marcelo Tosatti
2010-12-03 18:24               ` Marcelo Tosatti
2010-12-03 17:28     ` Chris Wright
2010-12-03 17:36       ` Srivatsa Vaddagiri
2010-12-03 17:38         ` Chris Wright
2010-12-03 17:43           ` Srivatsa Vaddagiri
2010-12-03 17:47           ` Anthony Liguori
