* [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-25  6:45 Dongli Zhang
  0 siblings, 0 replies; 8+ messages in thread
From: Dongli Zhang @ 2017-10-25  6:45 UTC (permalink / raw)
  To: xen-devel, linux-kernel; +Cc: jgross, boris.ostrovsky, joao.m.martins

After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because the steal value returned by
xen_steal_clock() might be less than this_rq()->prev_steal_time, which is
derived from the previous return value of xen_steal_clock().

For instance, steal time of each vcpu is 335 before live migration.

cpu  198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0

After live migration, steal time is reduced to 312.

cpu  200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0

Since runstate times are cumulative and are cleared by the xen hypervisor
during live migration, the idea of this patch is to accumulate the runstate
times into global percpu variables before the live-migration suspend. Once
the guest VM is resumed, xen_get_runstate_snapshot_cpu() always returns the
sum of the new runstate times and the previously accumulated times stored
in the global percpu variables.

A similar but more severe issue would impact earlier kernels (Linux
4.8-4.10), as discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
where the steal time would overflow and lead to 100% st usage in the top
command. A backport of this patch would fix that issue as well.

References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>

---
Changed since v1:
  * relocate modification to xen_get_runstate_snapshot_cpu

Changed since v2:
  * accumulate runstate times before live migration

---
 drivers/xen/manage.c  |  1 +
 drivers/xen/time.c    | 19 +++++++++++++++++++
 include/xen/xen-ops.h |  1 +
 3 files changed, 21 insertions(+)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c425d03..9aa2955 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -72,6 +72,7 @@ static int xen_suspend(void *data)
 	}
 
 	gnttab_suspend();
+	xen_accumulate_runstate_time();
 	xen_arch_pre_suspend();
 
 	/*
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index ac5f23f..6df3f82 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -19,6 +19,8 @@
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
 
+static DEFINE_PER_CPU(u64[4], old_runstate_time);
+
 /* return an consistent snapshot of 64-bit time/counter value */
 static u64 get64(const u64 *p)
 {
@@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
 {
 	u64 state_time;
 	struct vcpu_runstate_info *state;
+	int i;
 
 	BUG_ON(preemptible());
 
@@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
 		rmb();	/* Hypervisor might update data. */
 	} while (get64(&state->state_entry_time) != state_time ||
 		 (state_time & XEN_RUNSTATE_UPDATE));
+
+	for (i = 0; i < 4; i++)
+		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
+}
+
+void xen_accumulate_runstate_time(void)
+{
+	struct vcpu_runstate_info state;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		xen_get_runstate_snapshot_cpu(&state, cpu);
+		memcpy(per_cpu(old_runstate_time, cpu),
+				state.time,
+				4 * sizeof(u64));
+	}
 }
 
 /*
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 218e6aa..5680059 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
 bool xen_vcpu_stolen(int vcpu);
 void xen_setup_runstate_info(int cpu);
 void xen_time_setup_guest(void);
+void xen_accumulate_runstate_time(void);
 void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
 u64 xen_steal_clock(int cpu);
 
-- 
2.7.4



* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-27  7:40       ` Dongli Zhang
@ 2017-10-27  7:49         ` Juergen Gross
  0 siblings, 0 replies; 8+ messages in thread
From: Juergen Gross @ 2017-10-27  7:49 UTC (permalink / raw)
  To: Dongli Zhang, Boris Ostrovsky, xen-devel, linux-kernel; +Cc: joao.m.martins

On 27/10/17 09:40, Dongli Zhang wrote:
> Hi Juergen,
> 
> On 10/27/2017 03:31 PM, Juergen Gross wrote:
>> On 27/10/17 09:16, Dongli Zhang wrote:
>>> Hi Boris,
>>>
>>> On 10/25/2017 11:12 PM, Boris Ostrovsky wrote:
>>>> On 10/25/2017 02:45 AM, Dongli Zhang wrote:
>>>>> After guest live migration on xen, steal time in /proc/stat
>>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>>> derived from previous return value of xen_steal_clock().
>>>>>
>>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>>
>>>>> cpu  198 0 368 200064 1962 0 0 1340 0 0
>>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>>
>>>>> After live migration, steal time is reduced to 312.
>>>>>
>>>>> cpu  200 0 370 200330 1971 0 0 1248 0 0
>>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>>
>>>>> Since runstate times are cumulative and cleared during xen live migration
>>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>>> runstate times and previously accumulated times stored in global percpu
>>>>> variables.
>>>>>
>>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>>> discussed by Michael Las at
>>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>>> which would overflow steal time and lead to 100% st usage in top command
>>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>>
>>>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>>
>>>>> ---
>>>>> Changed since v1:
>>>>>   * relocate modification to xen_get_runstate_snapshot_cpu
>>>>>
>>>>> Changed since v2:
>>>>>   * accumulate runstate times before live migration
>>>>>
>>>>> ---
>>>>>  drivers/xen/manage.c  |  1 +
>>>>>  drivers/xen/time.c    | 19 +++++++++++++++++++
>>>>>  include/xen/xen-ops.h |  1 +
>>>>>  3 files changed, 21 insertions(+)
>>>>>
>>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>>> index c425d03..9aa2955 100644
>>>>> --- a/drivers/xen/manage.c
>>>>> +++ b/drivers/xen/manage.c
>>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>>>  	}
>>>>>  
>>>>>  	gnttab_suspend();
>>>>> +	xen_accumulate_runstate_time();
>>>>>  	xen_arch_pre_suspend();
>>>>>  
>>>>>  	/*
>>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>>> index ac5f23f..6df3f82 100644
>>>>> --- a/drivers/xen/time.c
>>>>> +++ b/drivers/xen/time.c
>>>>> @@ -19,6 +19,8 @@
>>>>>  /* runstate info updated by Xen */
>>>>>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>>>  
>>>>> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
>>>>> +
>>>>>  /* return an consistent snapshot of 64-bit time/counter value */
>>>>>  static u64 get64(const u64 *p)
>>>>>  {
>>>>> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>>>  {
>>>>>  	u64 state_time;
>>>>>  	struct vcpu_runstate_info *state;
>>>>> +	int i;
>>>>>  
>>>>>  	BUG_ON(preemptible());
>>>>>  
>>>>> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>>>  		rmb();	/* Hypervisor might update data. */
>>>>>  	} while (get64(&state->state_entry_time) != state_time ||
>>>>>  		 (state_time & XEN_RUNSTATE_UPDATE));
>>>>> +
>>>>> +	for (i = 0; i < 4; i++)
>>>>> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>>>> +}
>>>>> +
>>>>> +void xen_accumulate_runstate_time(void)
>>>>> +{
>>>>> +	struct vcpu_runstate_info state;
>>>>> +	int cpu;
>>>>> +
>>>>> +	for_each_possible_cpu(cpu) {
>>>>> +		xen_get_runstate_snapshot_cpu(&state, cpu);
>>>>> +		memcpy(per_cpu(old_runstate_time, cpu),
>>>>> +				state.time,
>>>>> +				4 * sizeof(u64));
>>>>
>>>> sizeof(old_runstate_time). (I think this should work for per_cpu variables)
>>>>
>>>>> +	}
>>>>
>>>> Hmm.. This may not perform as intended if we are merely checkpointing
>>>> (or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
>>>> double-account for the last interval that the guest has run.
>>>>
>>>> I'd rather not have yet another per-cpu variable but I can't think of
>>>> anything else. Perhaps you or others can come up with something better.
>>>
>>> I have 3 options so far.
>>>
>>> The 1st option is to another per-cpu variable while you do not like it.
>>>
>>> The 2nd option is to borrow from what do_stolen_accounting() used to do. Compute
>>> the delta of current and previous time and do nothing if delta is less than 0.
>>> The drawback of this option is guest might wait for the new time to catch up
>>> with previous time.
>>
>> This could be a rather long time. I don't think this is the way to go.
>>
>>> The 3rd option is to check the return value of HYPERVISOR_suspend() to different
>>> if this is a migration of checkpointing. As we will double-account the runstate
>>> time for checkpointing, why not just divide it by 2? The drawback of this option
>>> is the result is not accurate as we divide the incremental (time before and
>>> after checkpointing) by 2.
>>
>> And it is wrong if you do multiple migrations.
>>
>>> Would you please let me know which option we prefer?
>>
>> Perhaps option 4:
>>
>> Allocate a buffer at suspend time for the times to add up and do the
>> correction after suspend and free the buffer again.
> 
> With option 4, we need to allocate and free the buffer in xen_suspend() and we
> would not be able to encapsulate everything inside
> xen_accumulate_runstate_time(). (unless we use a array of static variables in
> xen_accumulate_runstate_time()).

Add a parameter to xen_accumulate_runstate_time() and call it before and
after suspend, e.g.:

--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -72,6 +72,7 @@ static int xen_suspend(void *data)
        }

        gnttab_suspend();
+       xen_accumulate_runstate_time(-1);
        xen_arch_pre_suspend();

        /*
@@ -84,6 +85,7 @@ static int xen_suspend(void *data)
                                            : 0);

        xen_arch_post_suspend(si->cancelled);
+       xen_accumulate_runstate_time(si->cancelled);
        gnttab_resume();

        if (!si->cancelled) {

Then the allocation and freeing of the buffer both can happen in
xen_accumulate_runstate_time() (which then might want to be called
xen_time_suspend() or such).
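
For concreteness, a rough sketch (untested) of what such a parameterized
function could look like; the static buffer pointer, the kcalloc()/GFP_ATOMIC
choice and the error handling are only assumptions to illustrate the idea,
not a worked-out implementation:

void xen_accumulate_runstate_time(int action)
{
	struct vcpu_runstate_info state;
	static struct vcpu_runstate_info *runstate_delta;
	int cpu;

	switch (action) {
	case -1: /* before suspend: snapshot the current per-cpu totals */
		runstate_delta = kcalloc(nr_cpu_ids, sizeof(*runstate_delta),
					 GFP_ATOMIC);
		if (!runstate_delta)
			return;

		for_each_possible_cpu(cpu) {
			xen_get_runstate_snapshot_cpu(&state, cpu);
			memcpy(&runstate_delta[cpu], &state, sizeof(state));
		}
		break;

	case 0: /* real migration: Xen has reset the raw counters */
		if (runstate_delta) {
			for_each_possible_cpu(cpu)
				memcpy(per_cpu(old_runstate_time, cpu),
				       runstate_delta[cpu].time,
				       sizeof(old_runstate_time));
		}
		/* fall through */
	default: /* cancelled suspend: old_runstate_time stays untouched */
		kfree(runstate_delta);
		runstate_delta = NULL;
		break;
	}
}

Since xen_get_runstate_snapshot_cpu() already folds in old_runstate_time,
copying the pre-suspend snapshot back on a real migration keeps the reported
times monotonic across repeated migrations, while a cancelled suspend (the
checkpointing case Boris mentioned) discards the snapshot and nothing is
double-accounted.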


Juergen


* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-27  7:31     ` Juergen Gross
  2017-10-27  7:40       ` Dongli Zhang
@ 2017-10-27  7:40       ` Dongli Zhang
  1 sibling, 0 replies; 8+ messages in thread
From: Dongli Zhang @ 2017-10-27  7:40 UTC (permalink / raw)
  To: Juergen Gross, Boris Ostrovsky, xen-devel, linux-kernel; +Cc: joao.m.martins

Hi Juergen,

On 10/27/2017 03:31 PM, Juergen Gross wrote:
> On 27/10/17 09:16, Dongli Zhang wrote:
>> Hi Boris,
>>
>> On 10/25/2017 11:12 PM, Boris Ostrovsky wrote:
>>> On 10/25/2017 02:45 AM, Dongli Zhang wrote:
>>>> After guest live migration on xen, steal time in /proc/stat
>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>> derived from previous return value of xen_steal_clock().
>>>>
>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>
>>>> cpu  198 0 368 200064 1962 0 0 1340 0 0
>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>
>>>> After live migration, steal time is reduced to 312.
>>>>
>>>> cpu  200 0 370 200330 1971 0 0 1248 0 0
>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>
>>>> Since runstate times are cumulative and cleared during xen live migration
>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>> runstate times and previously accumulated times stored in global percpu
>>>> variables.
>>>>
>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>> discussed by Michael Las at
>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>> which would overflow steal time and lead to 100% st usage in top command
>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>
>>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>
>>>> ---
>>>> Changed since v1:
>>>>   * relocate modification to xen_get_runstate_snapshot_cpu
>>>>
>>>> Changed since v2:
>>>>   * accumulate runstate times before live migration
>>>>
>>>> ---
>>>>  drivers/xen/manage.c  |  1 +
>>>>  drivers/xen/time.c    | 19 +++++++++++++++++++
>>>>  include/xen/xen-ops.h |  1 +
>>>>  3 files changed, 21 insertions(+)
>>>>
>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>> index c425d03..9aa2955 100644
>>>> --- a/drivers/xen/manage.c
>>>> +++ b/drivers/xen/manage.c
>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>>  	}
>>>>  
>>>>  	gnttab_suspend();
>>>> +	xen_accumulate_runstate_time();
>>>>  	xen_arch_pre_suspend();
>>>>  
>>>>  	/*
>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>> index ac5f23f..6df3f82 100644
>>>> --- a/drivers/xen/time.c
>>>> +++ b/drivers/xen/time.c
>>>> @@ -19,6 +19,8 @@
>>>>  /* runstate info updated by Xen */
>>>>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>>  
>>>> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
>>>> +
>>>>  /* return an consistent snapshot of 64-bit time/counter value */
>>>>  static u64 get64(const u64 *p)
>>>>  {
>>>> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>>  {
>>>>  	u64 state_time;
>>>>  	struct vcpu_runstate_info *state;
>>>> +	int i;
>>>>  
>>>>  	BUG_ON(preemptible());
>>>>  
>>>> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>>  		rmb();	/* Hypervisor might update data. */
>>>>  	} while (get64(&state->state_entry_time) != state_time ||
>>>>  		 (state_time & XEN_RUNSTATE_UPDATE));
>>>> +
>>>> +	for (i = 0; i < 4; i++)
>>>> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>>> +}
>>>> +
>>>> +void xen_accumulate_runstate_time(void)
>>>> +{
>>>> +	struct vcpu_runstate_info state;
>>>> +	int cpu;
>>>> +
>>>> +	for_each_possible_cpu(cpu) {
>>>> +		xen_get_runstate_snapshot_cpu(&state, cpu);
>>>> +		memcpy(per_cpu(old_runstate_time, cpu),
>>>> +				state.time,
>>>> +				4 * sizeof(u64));
>>>
>>> sizeof(old_runstate_time). (I think this should work for per_cpu variables)
>>>
>>>> +	}
>>>
>>> Hmm.. This may not perform as intended if we are merely checkpointing
>>> (or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
>>> double-account for the last interval that the guest has run.
>>>
>>> I'd rather not have yet another per-cpu variable but I can't think of
>>> anything else. Perhaps you or others can come up with something better.
>>
>> I have 3 options so far.
>>
>> The 1st option is to another per-cpu variable while you do not like it.
>>
>> The 2nd option is to borrow from what do_stolen_accounting() used to do. Compute
>> the delta of current and previous time and do nothing if delta is less than 0.
>> The drawback of this option is guest might wait for the new time to catch up
>> with previous time.
> 
> This could be a rather long time. I don't think this is the way to go.
> 
>> The 3rd option is to check the return value of HYPERVISOR_suspend() to different
>> if this is a migration of checkpointing. As we will double-account the runstate
>> time for checkpointing, why not just divide it by 2? The drawback of this option
>> is the result is not accurate as we divide the incremental (time before and
>> after checkpointing) by 2.
> 
> And it is wrong if you do multiple migrations.
> 
>> Would you please let me know which option we prefer?
> 
> Perhaps option 4:
> 
> Allocate a buffer at suspend time for the times to add up and do the
> correction after suspend and free the buffer again.

With option 4, we would need to allocate and free the buffer in xen_suspend(),
and we would not be able to encapsulate everything inside
xen_accumulate_runstate_time() (unless we use an array of static variables in
xen_accumulate_runstate_time()).

I would choose option 4 if that is fine with the reviewers.

Thank you very much!

Dongli Zhang

> 
> 
> Juergen
> 

* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-27  7:16   ` [Xen-devel] " Dongli Zhang
  2017-10-27  7:31     ` Juergen Gross
@ 2017-10-27  7:31     ` Juergen Gross
  1 sibling, 0 replies; 8+ messages in thread
From: Juergen Gross @ 2017-10-27  7:31 UTC (permalink / raw)
  To: Dongli Zhang, Boris Ostrovsky, xen-devel, linux-kernel; +Cc: joao.m.martins

On 27/10/17 09:16, Dongli Zhang wrote:
> Hi Boris,
> 
> On 10/25/2017 11:12 PM, Boris Ostrovsky wrote:
>> On 10/25/2017 02:45 AM, Dongli Zhang wrote:
>>> After guest live migration on xen, steal time in /proc/stat
>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>> derived from previous return value of xen_steal_clock().
>>>
>>> For instance, steal time of each vcpu is 335 before live migration.
>>>
>>> cpu  198 0 368 200064 1962 0 0 1340 0 0
>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>
>>> After live migration, steal time is reduced to 312.
>>>
>>> cpu  200 0 370 200330 1971 0 0 1248 0 0
>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>
>>> Since runstate times are cumulative and cleared during xen live migration
>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>> to global percpu variables before live migration suspend. Once guest VM is
>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>> runstate times and previously accumulated times stored in global percpu
>>> variables.
>>>
>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>> discussed by Michael Las at
>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>> which would overflow steal time and lead to 100% st usage in top command
>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>
>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>
>>> ---
>>> Changed since v1:
>>>   * relocate modification to xen_get_runstate_snapshot_cpu
>>>
>>> Changed since v2:
>>>   * accumulate runstate times before live migration
>>>
>>> ---
>>>  drivers/xen/manage.c  |  1 +
>>>  drivers/xen/time.c    | 19 +++++++++++++++++++
>>>  include/xen/xen-ops.h |  1 +
>>>  3 files changed, 21 insertions(+)
>>>
>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>> index c425d03..9aa2955 100644
>>> --- a/drivers/xen/manage.c
>>> +++ b/drivers/xen/manage.c
>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>  	}
>>>  
>>>  	gnttab_suspend();
>>> +	xen_accumulate_runstate_time();
>>>  	xen_arch_pre_suspend();
>>>  
>>>  	/*
>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>> index ac5f23f..6df3f82 100644
>>> --- a/drivers/xen/time.c
>>> +++ b/drivers/xen/time.c
>>> @@ -19,6 +19,8 @@
>>>  /* runstate info updated by Xen */
>>>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>  
>>> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
>>> +
>>>  /* return an consistent snapshot of 64-bit time/counter value */
>>>  static u64 get64(const u64 *p)
>>>  {
>>> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>  {
>>>  	u64 state_time;
>>>  	struct vcpu_runstate_info *state;
>>> +	int i;
>>>  
>>>  	BUG_ON(preemptible());
>>>  
>>> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>  		rmb();	/* Hypervisor might update data. */
>>>  	} while (get64(&state->state_entry_time) != state_time ||
>>>  		 (state_time & XEN_RUNSTATE_UPDATE));
>>> +
>>> +	for (i = 0; i < 4; i++)
>>> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>> +}
>>> +
>>> +void xen_accumulate_runstate_time(void)
>>> +{
>>> +	struct vcpu_runstate_info state;
>>> +	int cpu;
>>> +
>>> +	for_each_possible_cpu(cpu) {
>>> +		xen_get_runstate_snapshot_cpu(&state, cpu);
>>> +		memcpy(per_cpu(old_runstate_time, cpu),
>>> +				state.time,
>>> +				4 * sizeof(u64));
>>
>> sizeof(old_runstate_time). (I think this should work for per_cpu variables)
>>
>>> +	}
>>
>> Hmm.. This may not perform as intended if we are merely checkpointing
>> (or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
>> double-account for the last interval that the guest has run.
>>
>> I'd rather not have yet another per-cpu variable but I can't think of
>> anything else. Perhaps you or others can come up with something better.
> 
> I have 3 options so far.
> 
> The 1st option is to another per-cpu variable while you do not like it.
> 
> The 2nd option is to borrow from what do_stolen_accounting() used to do. Compute
> the delta of current and previous time and do nothing if delta is less than 0.
> The drawback of this option is guest might wait for the new time to catch up
> with previous time.

This could be a rather long time. I don't think this is the way to go.

> The 3rd option is to check the return value of HYPERVISOR_suspend() to different
> if this is a migration of checkpointing. As we will double-account the runstate
> time for checkpointing, why not just divide it by 2? The drawback of this option
> is the result is not accurate as we divide the incremental (time before and
> after checkpointing) by 2.

And it is wrong if you do multiple migrations.

> Would you please let me know which option we prefer?

Perhaps option 4:

Allocate a buffer at suspend time for the times to add up, do the
correction after suspend, and free the buffer again.


Juergen


* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-25 15:12 ` Boris Ostrovsky
@ 2017-10-27  7:16   ` Dongli Zhang
  2017-10-27  7:16   ` [Xen-devel] " Dongli Zhang
  1 sibling, 0 replies; 8+ messages in thread
From: Dongli Zhang @ 2017-10-27  7:16 UTC (permalink / raw)
  To: Boris Ostrovsky, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins

Hi Boris,

On 10/25/2017 11:12 PM, Boris Ostrovsky wrote:
> On 10/25/2017 02:45 AM, Dongli Zhang wrote:
>> After guest live migration on xen, steal time in /proc/stat
>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>> derived from previous return value of xen_steal_clock().
>>
>> For instance, steal time of each vcpu is 335 before live migration.
>>
>> cpu  198 0 368 200064 1962 0 0 1340 0 0
>> cpu0 38 0 81 50063 492 0 0 335 0 0
>> cpu1 65 0 97 49763 634 0 0 335 0 0
>> cpu2 38 0 81 50098 462 0 0 335 0 0
>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>
>> After live migration, steal time is reduced to 312.
>>
>> cpu  200 0 370 200330 1971 0 0 1248 0 0
>> cpu0 38 0 82 50123 500 0 0 312 0 0
>> cpu1 65 0 97 49832 634 0 0 312 0 0
>> cpu2 39 0 82 50167 462 0 0 312 0 0
>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>
>> Since runstate times are cumulative and cleared during xen live migration
>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>> to global percpu variables before live migration suspend. Once guest VM is
>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>> runstate times and previously accumulated times stored in global percpu
>> variables.
>>
>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>> discussed by Michael Las at
>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>> which would overflow steal time and lead to 100% st usage in top command
>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>
>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>
>> ---
>> Changed since v1:
>>   * relocate modification to xen_get_runstate_snapshot_cpu
>>
>> Changed since v2:
>>   * accumulate runstate times before live migration
>>
>> ---
>>  drivers/xen/manage.c  |  1 +
>>  drivers/xen/time.c    | 19 +++++++++++++++++++
>>  include/xen/xen-ops.h |  1 +
>>  3 files changed, 21 insertions(+)
>>
>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>> index c425d03..9aa2955 100644
>> --- a/drivers/xen/manage.c
>> +++ b/drivers/xen/manage.c
>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>  	}
>>  
>>  	gnttab_suspend();
>> +	xen_accumulate_runstate_time();
>>  	xen_arch_pre_suspend();
>>  
>>  	/*
>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>> index ac5f23f..6df3f82 100644
>> --- a/drivers/xen/time.c
>> +++ b/drivers/xen/time.c
>> @@ -19,6 +19,8 @@
>>  /* runstate info updated by Xen */
>>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>  
>> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
>> +
>>  /* return an consistent snapshot of 64-bit time/counter value */
>>  static u64 get64(const u64 *p)
>>  {
>> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>  {
>>  	u64 state_time;
>>  	struct vcpu_runstate_info *state;
>> +	int i;
>>  
>>  	BUG_ON(preemptible());
>>  
>> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>  		rmb();	/* Hypervisor might update data. */
>>  	} while (get64(&state->state_entry_time) != state_time ||
>>  		 (state_time & XEN_RUNSTATE_UPDATE));
>> +
>> +	for (i = 0; i < 4; i++)
>> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>> +}
>> +
>> +void xen_accumulate_runstate_time(void)
>> +{
>> +	struct vcpu_runstate_info state;
>> +	int cpu;
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		xen_get_runstate_snapshot_cpu(&state, cpu);
>> +		memcpy(per_cpu(old_runstate_time, cpu),
>> +				state.time,
>> +				4 * sizeof(u64));
> 
> sizeof(old_runstate_time). (I think this should work for per_cpu variables)
> 
>> +	}
> 
> Hmm.. This may not perform as intended if we are merely checkpointing
> (or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
> double-account for the last interval that the guest has run.
> 
> I'd rather not have yet another per-cpu variable but I can't think of
> anything else. Perhaps you or others can come up with something better.

I have 3 options so far.

The 1st option is to add another per-cpu variable, although you have said you
do not like that approach.

The 2nd option is to borrow from what do_stolen_accounting() used to do:
compute the delta between the current and the previous time, and do nothing
if the delta is less than 0. The drawback of this option is that the guest
might have to wait for the new time to catch up with the previous time.
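
A minimal sketch of how I picture option 2 (untested; the per-cpu
"last_steal" helper below is made up just for illustration, it is not
existing code):

static DEFINE_PER_CPU(u64, last_steal);

u64 xen_steal_clock(int cpu)
{
	struct vcpu_runstate_info state;
	u64 steal;

	xen_get_runstate_snapshot_cpu(&state, cpu);
	steal = state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline];

	/* never report less than the last value already handed out */
	if (steal < per_cpu(last_steal, cpu))
		steal = per_cpu(last_steal, cpu);
	else
		per_cpu(last_steal, cpu) = steal;

	return steal;
}

After migration the raw counters restart near zero, so the reported value
just sits at the old maximum until the new counters catch up, which is the
drawback mentioned above.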

The 3rd option is to check the return value of HYPERVISOR_suspend() to tell
whether this is a migration or a checkpoint. As we would double-account the
runstate time for checkpointing, why not just divide it by 2? The drawback of
this option is that the result is not accurate, as we divide the increment
(the time between before and after checkpointing) by 2.

Would you please let me know which option is preferred?

Thank you very much!

Dongli Zhang


> 
> -boris
> 
>>  }
>>  
>>  /*
>> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
>> index 218e6aa..5680059 100644
>> --- a/include/xen/xen-ops.h
>> +++ b/include/xen/xen-ops.h
>> @@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
>>  bool xen_vcpu_stolen(int vcpu);
>>  void xen_setup_runstate_info(int cpu);
>>  void xen_time_setup_guest(void);
>> +void xen_accumulate_runstate_time(void);
>>  void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
>>  u64 xen_steal_clock(int cpu);
>>  
> 
> 

* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-25  6:45 Dongli Zhang
@ 2017-10-25 15:12 ` Boris Ostrovsky
  2017-10-27  7:16   ` Dongli Zhang
  2017-10-27  7:16   ` [Xen-devel] " Dongli Zhang
  2017-10-25 15:12 ` Boris Ostrovsky
  1 sibling, 2 replies; 8+ messages in thread
From: Boris Ostrovsky @ 2017-10-25 15:12 UTC (permalink / raw)
  To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins

On 10/25/2017 02:45 AM, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
> derived from previous return value of xen_steal_clock().
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu  198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu  200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> Since runstate times are cumulative and cleared during xen live migration
> by xen hypervisor, the idea of this patch is to accumulate runstate times
> to global percpu variables before live migration suspend. Once guest VM is
> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
> runstate times and previously accumulated times stored in global percpu
> variables.
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
> which would overflow steal time and lead to 100% st usage in top command
> for linux 4.8-4.10. A backport of this patch would fix that issue.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>
> ---
> Changed since v1:
>   * relocate modification to xen_get_runstate_snapshot_cpu
>
> Changed since v2:
>   * accumulate runstate times before live migration
>
> ---
>  drivers/xen/manage.c  |  1 +
>  drivers/xen/time.c    | 19 +++++++++++++++++++
>  include/xen/xen-ops.h |  1 +
>  3 files changed, 21 insertions(+)
>
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index c425d03..9aa2955 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>  	}
>  
>  	gnttab_suspend();
> +	xen_accumulate_runstate_time();
>  	xen_arch_pre_suspend();
>  
>  	/*
> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
> index ac5f23f..6df3f82 100644
> --- a/drivers/xen/time.c
> +++ b/drivers/xen/time.c
> @@ -19,6 +19,8 @@
>  /* runstate info updated by Xen */
>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>  
> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
> +
>  /* return an consistent snapshot of 64-bit time/counter value */
>  static u64 get64(const u64 *p)
>  {
> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>  {
>  	u64 state_time;
>  	struct vcpu_runstate_info *state;
> +	int i;
>  
>  	BUG_ON(preemptible());
>  
> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>  		rmb();	/* Hypervisor might update data. */
>  	} while (get64(&state->state_entry_time) != state_time ||
>  		 (state_time & XEN_RUNSTATE_UPDATE));
> +
> +	for (i = 0; i < 4; i++)
> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
> +}
> +
> +void xen_accumulate_runstate_time(void)
> +{
> +	struct vcpu_runstate_info state;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		xen_get_runstate_snapshot_cpu(&state, cpu);
> +		memcpy(per_cpu(old_runstate_time, cpu),
> +				state.time,
> +				4 * sizeof(u64));

sizeof(old_runstate_time). (I think this should work for per_cpu variables)
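
i.e. something like this (my paraphrase of the suggestion, not
compile-tested):

void xen_accumulate_runstate_time(void)
{
	struct vcpu_runstate_info state;
	int cpu;

	for_each_possible_cpu(cpu) {
		xen_get_runstate_snapshot_cpu(&state, cpu);
		/* sizeof the u64[4] per-cpu array instead of 4 * sizeof(u64) */
		memcpy(per_cpu(old_runstate_time, cpu), state.time,
		       sizeof(old_runstate_time));
	}
}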

> +	}

Hmm.. This may not perform as intended if we are merely checkpointing
(or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
double-account for the last interval that the guest has run.

I'd rather not have yet another per-cpu variable but I can't think of
anything else. Perhaps you or others can come up with something better.

-boris

>  }
>  
>  /*
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 218e6aa..5680059 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
>  bool xen_vcpu_stolen(int vcpu);
>  void xen_setup_runstate_info(int cpu);
>  void xen_time_setup_guest(void);
> +void xen_accumulate_runstate_time(void);
>  void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
>  u64 xen_steal_clock(int cpu);
>  


* Re: [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
  2017-10-25  6:45 Dongli Zhang
  2017-10-25 15:12 ` Boris Ostrovsky
@ 2017-10-25 15:12 ` Boris Ostrovsky
  1 sibling, 0 replies; 8+ messages in thread
From: Boris Ostrovsky @ 2017-10-25 15:12 UTC (permalink / raw)
  To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins

On 10/25/2017 02:45 AM, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
> derived from previous return value of xen_steal_clock().
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu  198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu  200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> Since runstate times are cumulative and cleared during xen live migration
> by xen hypervisor, the idea of this patch is to accumulate runstate times
> to global percpu variables before live migration suspend. Once guest VM is
> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
> runstate times and previously accumulated times stored in global percpu
> variables.
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
> which would overflow steal time and lead to 100% st usage in top command
> for linux 4.8-4.10. A backport of this patch would fix that issue.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>
> ---
> Changed since v1:
>   * relocate modification to xen_get_runstate_snapshot_cpu
>
> Changed since v2:
>   * accumulate runstate times before live migration
>
> ---
>  drivers/xen/manage.c  |  1 +
>  drivers/xen/time.c    | 19 +++++++++++++++++++
>  include/xen/xen-ops.h |  1 +
>  3 files changed, 21 insertions(+)
>
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index c425d03..9aa2955 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>  	}
>  
>  	gnttab_suspend();
> +	xen_accumulate_runstate_time();
>  	xen_arch_pre_suspend();
>  
>  	/*
> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
> index ac5f23f..6df3f82 100644
> --- a/drivers/xen/time.c
> +++ b/drivers/xen/time.c
> @@ -19,6 +19,8 @@
>  /* runstate info updated by Xen */
>  static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>  
> +static DEFINE_PER_CPU(u64[4], old_runstate_time);
> +
>  /* return an consistent snapshot of 64-bit time/counter value */
>  static u64 get64(const u64 *p)
>  {
> @@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>  {
>  	u64 state_time;
>  	struct vcpu_runstate_info *state;
> +	int i;
>  
>  	BUG_ON(preemptible());
>  
> @@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>  		rmb();	/* Hypervisor might update data. */
>  	} while (get64(&state->state_entry_time) != state_time ||
>  		 (state_time & XEN_RUNSTATE_UPDATE));
> +
> +	for (i = 0; i < 4; i++)
> +		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
> +}
> +
> +void xen_accumulate_runstate_time(void)
> +{
> +	struct vcpu_runstate_info state;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		xen_get_runstate_snapshot_cpu(&state, cpu);
> +		memcpy(per_cpu(old_runstate_time, cpu),
> +				state.time,
> +				4 * sizeof(u64));

sizeof(old_runstate_time). (I think this should work for per_cpu variables)

> +	}

Hmm.. This may not perform as intended if we are merely checkpointing
(or pausing) the guest (i.e. if HYPERVISOR_suspend() returns 1). We will
double-account for the last interval that the guest has run.

I'd rather not have yet another per-cpu variable but I can't think of
anything else. Perhaps you or others can come up with something better.

-boris

>  }
>  
>  /*
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 218e6aa..5680059 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
>  bool xen_vcpu_stolen(int vcpu);
>  void xen_setup_runstate_info(int cpu);
>  void xen_time_setup_guest(void);
> +void xen_accumulate_runstate_time(void);
>  void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
>  u64 xen_steal_clock(int cpu);
>  



* [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-25  6:45 Dongli Zhang
  2017-10-25 15:12 ` Boris Ostrovsky
  2017-10-25 15:12 ` Boris Ostrovsky
  0 siblings, 2 replies; 8+ messages in thread
From: Dongli Zhang @ 2017-10-25  6:45 UTC (permalink / raw)
  To: xen-devel, linux-kernel; +Cc: boris.ostrovsky, jgross, joao.m.martins

After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because the steal value returned by
xen_steal_clock() might be less than this_rq()->prev_steal_time, which is
derived from the previous return value of xen_steal_clock().

For instance, steal time of each vcpu is 335 before live migration.

cpu  198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0

After live migration, steal time is reduced to 312.

cpu  200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0

Since runstate times are cumulative and are cleared by the xen hypervisor
during live migration, the idea of this patch is to accumulate the runstate
times into global percpu variables before the live-migration suspend. Once
the guest VM is resumed, xen_get_runstate_snapshot_cpu() always returns the
sum of the new runstate times and the previously accumulated times stored
in the global percpu variables.

A similar but more severe issue would impact earlier kernels (Linux
4.8-4.10), as discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
where the steal time would overflow and lead to 100% st usage in the top
command. A backport of this patch would fix that issue as well.

References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>

---
Changed since v1:
  * relocate modification to xen_get_runstate_snapshot_cpu

Changed since v2:
  * accumulate runstate times before live migration

---
 drivers/xen/manage.c  |  1 +
 drivers/xen/time.c    | 19 +++++++++++++++++++
 include/xen/xen-ops.h |  1 +
 3 files changed, 21 insertions(+)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c425d03..9aa2955 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -72,6 +72,7 @@ static int xen_suspend(void *data)
 	}
 
 	gnttab_suspend();
+	xen_accumulate_runstate_time();
 	xen_arch_pre_suspend();
 
 	/*
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index ac5f23f..6df3f82 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -19,6 +19,8 @@
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
 
+static DEFINE_PER_CPU(u64[4], old_runstate_time);
+
 /* return an consistent snapshot of 64-bit time/counter value */
 static u64 get64(const u64 *p)
 {
@@ -52,6 +54,7 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
 {
 	u64 state_time;
 	struct vcpu_runstate_info *state;
+	int i;
 
 	BUG_ON(preemptible());
 
@@ -64,6 +67,22 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
 		rmb();	/* Hypervisor might update data. */
 	} while (get64(&state->state_entry_time) != state_time ||
 		 (state_time & XEN_RUNSTATE_UPDATE));
+
+	for (i = 0; i < 4; i++)
+		res->time[i] += per_cpu(old_runstate_time, cpu)[i];
+}
+
+void xen_accumulate_runstate_time(void)
+{
+	struct vcpu_runstate_info state;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		xen_get_runstate_snapshot_cpu(&state, cpu);
+		memcpy(per_cpu(old_runstate_time, cpu),
+				state.time,
+				4 * sizeof(u64));
+	}
 }
 
 /*
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 218e6aa..5680059 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
 bool xen_vcpu_stolen(int vcpu);
 void xen_setup_runstate_info(int cpu);
 void xen_time_setup_guest(void);
+void xen_accumulate_runstate_time(void);
 void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
 u64 xen_steal_clock(int cpu);
 
-- 
2.7.4


Thread overview: 8+ messages
2017-10-25  6:45 [PATCH v3 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-25  6:45 Dongli Zhang
2017-10-25 15:12 ` Boris Ostrovsky
2017-10-27  7:16   ` Dongli Zhang
2017-10-27  7:16   ` [Xen-devel] " Dongli Zhang
2017-10-27  7:31     ` Juergen Gross
2017-10-27  7:40       ` Dongli Zhang
2017-10-27  7:49         ` Juergen Gross
2017-10-27  7:40       ` Dongli Zhang
2017-10-27  7:31     ` Juergen Gross
2017-10-25 15:12 ` Boris Ostrovsky
