* [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-30 8:03 Dongli Zhang
2017-10-30 13:34 ` Boris Ostrovsky
` (3 more replies)
0 siblings, 4 replies; 14+ messages in thread
From: Dongli Zhang @ 2017-10-30 8:03 UTC (permalink / raw)
To: xen-devel, linux-kernel; +Cc: boris.ostrovsky, jgross, joao.m.martins
After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
xen_steal_lock() might be less than this_rq()->prev_steal_time which is
derived from previous return value of xen_steal_clock().
For instance, steal time of each vcpu is 335 before live migration.
cpu 198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0
After live migration, steal time is reduced to 312.
cpu 200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0
Since runstate times are cumulative and cleared during xen live migration
by xen hypervisor, the idea of this patch is to accumulate runstate times
to global percpu variables before live migration suspend. Once guest VM is
resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
runstate times and previously accumulated times stored in global percpu
variables.
Similar and more severe issue would impact prior linux 4.8-4.10 as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
which would overflow steal time and lead to 100% st usage in top command
for linux 4.8-4.10. A backport of this patch would fix that issue.
References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
Changed since v1:
* relocate modification to xen_get_runstate_snapshot_cpu
Changed since v2:
* accumulate runstate times before live migration
Changed since v3:
* do not accumulate times in the case of guest checkpointing
Changed since v4:
* allocate array of vcpu_runstate_info to reduce number of memory allocation
---
drivers/xen/manage.c | 2 ++
drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
include/xen/interface/vcpu.h | 2 ++
include/xen/xen-ops.h | 1 +
4 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c425d03..3dc085d 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -72,6 +72,7 @@ static int xen_suspend(void *data)
}
gnttab_suspend();
+ xen_accumulate_runstate_time(-1);
xen_arch_pre_suspend();
/*
@@ -84,6 +85,7 @@ static int xen_suspend(void *data)
: 0);
xen_arch_post_suspend(si->cancelled);
+ xen_accumulate_runstate_time(si->cancelled);
gnttab_resume();
if (!si->cancelled) {
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index ac5f23f..cf3afb9 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -19,6 +19,9 @@
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
+static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
+static struct vcpu_runstate_info *runstate_delta;
+
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
@@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
return ret;
}
-static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
- unsigned int cpu)
+static void xen_get_runstate_snapshot_cpu_delta(
+ struct vcpu_runstate_info *res, unsigned int cpu)
{
u64 state_time;
struct vcpu_runstate_info *state;
@@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
(state_time & XEN_RUNSTATE_UPDATE));
}
+static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
+ unsigned int cpu)
+{
+ int i;
+
+ xen_get_runstate_snapshot_cpu_delta(res, cpu);
+
+ for (i = 0; i < RUNSTATE_max; i++)
+ res->time[i] += per_cpu(old_runstate_time, cpu)[i];
+}
+
+void xen_accumulate_runstate_time(int action)
+{
+ struct vcpu_runstate_info state;
+ int cpu, i;
+
+ switch (action) {
+ case -1: /* backup runstate time before suspend */
+ WARN_ON_ONCE(unlikely(runstate_delta));
+
+ runstate_delta = kcalloc(num_possible_cpus(),
+ sizeof(*runstate_delta),
+ GFP_KERNEL);
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: failed to allocate runstate_delta\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ xen_get_runstate_snapshot_cpu_delta(&state, cpu);
+ memcpy(runstate_delta[cpu].time, state.time,
+ RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
+ }
+
+ break;
+
+ case 0: /* backup runstate time after resume */
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RUNSTATE_max; i++)
+ per_cpu(old_runstate_time, cpu)[i] +=
+ runstate_delta[cpu].time[i];
+ }
+ break;
+
+ default: /* do not accumulate runstate time for checkpointing */
+ break;
+ }
+
+ if (action != -1 && runstate_delta) {
+ kfree(runstate_delta);
+ runstate_delta = NULL;
+ }
+}
+
/*
* Runstate accounting
*/
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index 98188c8..85e81ce 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
*/
#define RUNSTATE_offline 3
+#define RUNSTATE_max 4
+
/*
* Register a shared memory area from which the guest may obtain its own
* runstate information without needing to execute a hypercall.
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 218e6aa..b1f9ae9 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
bool xen_vcpu_stolen(int vcpu);
void xen_setup_runstate_info(int cpu);
void xen_time_setup_guest(void);
+void xen_accumulate_runstate_time(int action);
void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
u64 xen_steal_clock(int cpu);
--
2.7.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 8:03 [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
@ 2017-10-30 13:34 ` Boris Ostrovsky
2017-10-30 13:34 ` Boris Ostrovsky
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-30 13:34 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 04:03 AM, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
> derived from previous return value of xen_steal_clock().
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu 198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu 200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> Since runstate times are cumulative and cleared during xen live migration
> by xen hypervisor, the idea of this patch is to accumulate runstate times
> to global percpu variables before live migration suspend. Once guest VM is
> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
> runstate times and previously accumulated times stored in global percpu
> variables.
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
> which would overflow steal time and lead to 100% st usage in top command
> for linux 4.8-4.10. A backport of this patch would fix that issue.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>
> ---
> Changed since v1:
> * relocate modification to xen_get_runstate_snapshot_cpu
>
> Changed since v2:
> * accumulate runstate times before live migration
>
> Changed since v3:
> * do not accumulate times in the case of guest checkpointing
>
> Changed since v4:
> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>
> ---
> drivers/xen/manage.c | 2 ++
> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
> include/xen/interface/vcpu.h | 2 ++
> include/xen/xen-ops.h | 1 +
> 4 files changed, 71 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index c425d03..3dc085d 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
> }
>
> gnttab_suspend();
> + xen_accumulate_runstate_time(-1);
> xen_arch_pre_suspend();
>
> /*
> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
> : 0);
>
> xen_arch_post_suspend(si->cancelled);
> + xen_accumulate_runstate_time(si->cancelled);
I am not convinced that the comment above HYPERVISOR_suspend() is
correct. The call can return an error code and so if it returns -EPERM
(which AFAICS it can't now but might in the future) then
xen_accumulate_runstate_time() will do wrong thing.
> gnttab_resume();
>
> if (!si->cancelled) {
> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
> index ac5f23f..cf3afb9 100644
> --- a/drivers/xen/time.c
> +++ b/drivers/xen/time.c
> @@ -19,6 +19,9 @@
> /* runstate info updated by Xen */
> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>
> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
> +static struct vcpu_runstate_info *runstate_delta;
I'd move this inside xen_accumulate_runstate_time() since that's the
only function that uses it. And why does it need to be
vcpu_runstate_info and not u64[4]?
> +
> /* return an consistent snapshot of 64-bit time/counter value */
> static u64 get64(const u64 *p)
> {
> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
> return ret;
> }
>
> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> - unsigned int cpu)
> +static void xen_get_runstate_snapshot_cpu_delta(
> + struct vcpu_runstate_info *res, unsigned int cpu)
> {
> u64 state_time;
> struct vcpu_runstate_info *state;
> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> (state_time & XEN_RUNSTATE_UPDATE));
> }
>
> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> + unsigned int cpu)
> +{
> + int i;
> +
> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
> +
> + for (i = 0; i < RUNSTATE_max; i++)
> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
> +}
> +
> +void xen_accumulate_runstate_time(int action)
> +{
> + struct vcpu_runstate_info state;
> + int cpu, i;
> +
> + switch (action) {
> + case -1: /* backup runstate time before suspend */
> + WARN_ON_ONCE(unlikely(runstate_delta));
pr_warn_once(), to be consistent with the rest of the file. And then
should you return if this is true?
> +
> + runstate_delta = kcalloc(num_possible_cpus(),
> + sizeof(*runstate_delta),
> + GFP_KERNEL);
> + if (unlikely(!runstate_delta)) {
> + pr_alert("%s: failed to allocate runstate_delta\n",
> + __func__);
pr_warn() should be sufficient. Below too.
Also, as a side question --- can we do kmalloc() at this point?
> + return;
> + }
> +
> + for_each_possible_cpu(cpu) {
> + xen_get_runstate_snapshot_cpu_delta(&state, cpu);
> + memcpy(runstate_delta[cpu].time, state.time,
> + RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
sizeof(runstate_delta[cpu].time).
> + }
> +
> + break;
> +
> + case 0: /* backup runstate time after resume */
> + if (unlikely(!runstate_delta)) {
> + pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
> + __func__);
> + return;
> + }
> +
> + for_each_possible_cpu(cpu) {
> + for (i = 0; i < RUNSTATE_max; i++)
> + per_cpu(old_runstate_time, cpu)[i] +=
> + runstate_delta[cpu].time[i];
> + }
> + break;
> +
> + default: /* do not accumulate runstate time for checkpointing */
> + break;
> + }
> +
> + if (action != -1 && runstate_delta) {
> + kfree(runstate_delta);
> + runstate_delta = NULL;
> + }
> +}
> +
> /*
> * Runstate accounting
> */
> diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
> index 98188c8..85e81ce 100644
> --- a/include/xen/interface/vcpu.h
> +++ b/include/xen/interface/vcpu.h
> @@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
> */
> #define RUNSTATE_offline 3
>
> +#define RUNSTATE_max 4
This file is part of Xen ABI. While this macro technically doesn't
change anything I'd rather have those updates first appear in Xen code.
Besides, this change leaves vcpu_runstate_info.time[4] definition
intact. I think all RUNSTATE_* macros would need to be moved above
vcpu_runstate_info definition.
TBH, for purposes of this patch I'd use 4.
-boris
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 8:03 [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-30 13:34 ` Boris Ostrovsky
@ 2017-10-30 13:34 ` Boris Ostrovsky
2017-10-31 0:14 ` Dongli Zhang
2017-10-31 0:14 ` [Xen-devel] " Dongli Zhang
2017-11-01 20:05 ` kbuild test robot
2017-11-01 20:05 ` kbuild test robot
3 siblings, 2 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-30 13:34 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 04:03 AM, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
> derived from previous return value of xen_steal_clock().
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu 198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu 200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> Since runstate times are cumulative and cleared during xen live migration
> by xen hypervisor, the idea of this patch is to accumulate runstate times
> to global percpu variables before live migration suspend. Once guest VM is
> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
> runstate times and previously accumulated times stored in global percpu
> variables.
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
> which would overflow steal time and lead to 100% st usage in top command
> for linux 4.8-4.10. A backport of this patch would fix that issue.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>
> ---
> Changed since v1:
> * relocate modification to xen_get_runstate_snapshot_cpu
>
> Changed since v2:
> * accumulate runstate times before live migration
>
> Changed since v3:
> * do not accumulate times in the case of guest checkpointing
>
> Changed since v4:
> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>
> ---
> drivers/xen/manage.c | 2 ++
> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
> include/xen/interface/vcpu.h | 2 ++
> include/xen/xen-ops.h | 1 +
> 4 files changed, 71 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index c425d03..3dc085d 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
> }
>
> gnttab_suspend();
> + xen_accumulate_runstate_time(-1);
> xen_arch_pre_suspend();
>
> /*
> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
> : 0);
>
> xen_arch_post_suspend(si->cancelled);
> + xen_accumulate_runstate_time(si->cancelled);
I am not convinced that the comment above HYPERVISOR_suspend() is
correct. The call can return an error code and so if it returns -EPERM
(which AFAICS it can't now but might in the future) then
xen_accumulate_runstate_time() will do wrong thing.
> gnttab_resume();
>
> if (!si->cancelled) {
> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
> index ac5f23f..cf3afb9 100644
> --- a/drivers/xen/time.c
> +++ b/drivers/xen/time.c
> @@ -19,6 +19,9 @@
> /* runstate info updated by Xen */
> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>
> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
> +static struct vcpu_runstate_info *runstate_delta;
I'd move this inside xen_accumulate_runstate_time() since that's the
only function that uses it. And why does it need to be
vcpu_runstate_info and not u64[4]?
> +
> /* return an consistent snapshot of 64-bit time/counter value */
> static u64 get64(const u64 *p)
> {
> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
> return ret;
> }
>
> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> - unsigned int cpu)
> +static void xen_get_runstate_snapshot_cpu_delta(
> + struct vcpu_runstate_info *res, unsigned int cpu)
> {
> u64 state_time;
> struct vcpu_runstate_info *state;
> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> (state_time & XEN_RUNSTATE_UPDATE));
> }
>
> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
> + unsigned int cpu)
> +{
> + int i;
> +
> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
> +
> + for (i = 0; i < RUNSTATE_max; i++)
> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
> +}
> +
> +void xen_accumulate_runstate_time(int action)
> +{
> + struct vcpu_runstate_info state;
> + int cpu, i;
> +
> + switch (action) {
> + case -1: /* backup runstate time before suspend */
> + WARN_ON_ONCE(unlikely(runstate_delta));
pr_warn_once(), to be consistent with the rest of the file. And then
should you return if this is true?
> +
> + runstate_delta = kcalloc(num_possible_cpus(),
> + sizeof(*runstate_delta),
> + GFP_KERNEL);
> + if (unlikely(!runstate_delta)) {
> + pr_alert("%s: failed to allocate runstate_delta\n",
> + __func__);
pr_warn() should be sufficient. Below too.
Also, as a side question --- can we do kmalloc() at this point?
> + return;
> + }
> +
> + for_each_possible_cpu(cpu) {
> + xen_get_runstate_snapshot_cpu_delta(&state, cpu);
> + memcpy(runstate_delta[cpu].time, state.time,
> + RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
sizeof(runstate_delta[cpu].time).
> + }
> +
> + break;
> +
> + case 0: /* backup runstate time after resume */
> + if (unlikely(!runstate_delta)) {
> + pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
> + __func__);
> + return;
> + }
> +
> + for_each_possible_cpu(cpu) {
> + for (i = 0; i < RUNSTATE_max; i++)
> + per_cpu(old_runstate_time, cpu)[i] +=
> + runstate_delta[cpu].time[i];
> + }
> + break;
> +
> + default: /* do not accumulate runstate time for checkpointing */
> + break;
> + }
> +
> + if (action != -1 && runstate_delta) {
> + kfree(runstate_delta);
> + runstate_delta = NULL;
> + }
> +}
> +
> /*
> * Runstate accounting
> */
> diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
> index 98188c8..85e81ce 100644
> --- a/include/xen/interface/vcpu.h
> +++ b/include/xen/interface/vcpu.h
> @@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
> */
> #define RUNSTATE_offline 3
>
> +#define RUNSTATE_max 4
This file is part of Xen ABI. While this macro technically doesn't
change anything I'd rather have those updates first appear in Xen code.
Besides, this change leaves vcpu_runstate_info.time[4] definition
intact. I think all RUNSTATE_* macros would need to be moved above
vcpu_runstate_info definition.
TBH, for purposes of this patch I'd use 4.
-boris
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 13:34 ` Boris Ostrovsky
@ 2017-10-31 0:14 ` Dongli Zhang
2017-10-31 0:14 ` [Xen-devel] " Dongli Zhang
1 sibling, 0 replies; 14+ messages in thread
From: Dongli Zhang @ 2017-10-31 0:14 UTC (permalink / raw)
To: Boris Ostrovsky, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
Hi Boris,
On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>> After guest live migration on xen, steal time in /proc/stat
>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>> derived from previous return value of xen_steal_clock().
>>
>> For instance, steal time of each vcpu is 335 before live migration.
>>
>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>> cpu0 38 0 81 50063 492 0 0 335 0 0
>> cpu1 65 0 97 49763 634 0 0 335 0 0
>> cpu2 38 0 81 50098 462 0 0 335 0 0
>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>
>> After live migration, steal time is reduced to 312.
>>
>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>> cpu0 38 0 82 50123 500 0 0 312 0 0
>> cpu1 65 0 97 49832 634 0 0 312 0 0
>> cpu2 39 0 82 50167 462 0 0 312 0 0
>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>
>> Since runstate times are cumulative and cleared during xen live migration
>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>> to global percpu variables before live migration suspend. Once guest VM is
>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>> runstate times and previously accumulated times stored in global percpu
>> variables.
>>
>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>> discussed by Michael Las at
>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>> which would overflow steal time and lead to 100% st usage in top command
>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>
>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>
>> ---
>> Changed since v1:
>> * relocate modification to xen_get_runstate_snapshot_cpu
>>
>> Changed since v2:
>> * accumulate runstate times before live migration
>>
>> Changed since v3:
>> * do not accumulate times in the case of guest checkpointing
>>
>> Changed since v4:
>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>
>> ---
>> drivers/xen/manage.c | 2 ++
>> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
>> include/xen/interface/vcpu.h | 2 ++
>> include/xen/xen-ops.h | 1 +
>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>> index c425d03..3dc085d 100644
>> --- a/drivers/xen/manage.c
>> +++ b/drivers/xen/manage.c
>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>> }
>>
>> gnttab_suspend();
>> + xen_accumulate_runstate_time(-1);
>> xen_arch_pre_suspend();
>>
>> /*
>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>> : 0);
>>
>> xen_arch_post_suspend(si->cancelled);
>> + xen_accumulate_runstate_time(si->cancelled);
>
> I am not convinced that the comment above HYPERVISOR_suspend() is
> correct. The call can return an error code and so if it returns -EPERM
> (which AFAICS it can't now but might in the future) then
> xen_accumulate_runstate_time() will do wrong thing.
I would split xen_accumulate_runstate_time() into two functions to avoid the
-EPERM issue, as one is for saving and another is for accumulation, respectively.
Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
and xen_accumulate_runstate_time(si->cancelled) after resume?
>
>
>> gnttab_resume();
>>
>> if (!si->cancelled) {
>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>> index ac5f23f..cf3afb9 100644
>> --- a/drivers/xen/time.c
>> +++ b/drivers/xen/time.c
>> @@ -19,6 +19,9 @@
>> /* runstate info updated by Xen */
>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>
>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>> +static struct vcpu_runstate_info *runstate_delta;
>
> I'd move this inside xen_accumulate_runstate_time() since that's the
If we split xen_accumulate_runstate_time() into two functions, we would leave
runstate_delta as global static.
> only function that uses it. And why does it need to be
> vcpu_runstate_info and not u64[4]?
This was suggested by Juergen to avoid the allocation and reclaim of the second
dimensional array as in v4 of this patch?
Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
each iteration?
>
>> +
>> /* return an consistent snapshot of 64-bit time/counter value */
>> static u64 get64(const u64 *p)
>> {
>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>> return ret;
>> }
>>
>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> - unsigned int cpu)
>> +static void xen_get_runstate_snapshot_cpu_delta(
>> + struct vcpu_runstate_info *res, unsigned int cpu)
>> {
>> u64 state_time;
>> struct vcpu_runstate_info *state;
>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> (state_time & XEN_RUNSTATE_UPDATE));
>> }
>>
>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> + unsigned int cpu)
>> +{
>> + int i;
>> +
>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>> +
>> + for (i = 0; i < RUNSTATE_max; i++)
>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>> +}
>> +
>> +void xen_accumulate_runstate_time(int action)
>> +{
>> + struct vcpu_runstate_info state;
>> + int cpu, i;
>> +
>> + switch (action) {
>> + case -1: /* backup runstate time before suspend */
>> + WARN_ON_ONCE(unlikely(runstate_delta));
>
> pr_warn_once(), to be consistent with the rest of the file. And then
> should you return if this is true?
I would prefer to not return if it is true but just warn the administrator that
there is memory leakage issue while leaving runstate accumulation works normally.
>
>> +
>> + runstate_delta = kcalloc(num_possible_cpus(),
>> + sizeof(*runstate_delta),
>> + GFP_KERNEL);
>> + if (unlikely(!runstate_delta)) {
>> + pr_alert("%s: failed to allocate runstate_delta\n",
>> + __func__);
>
> pr_warn() should be sufficient. Below too.
>
> Also, as a side question --- can we do kmalloc() at this point?
Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
and we need to guarantee the value of first dimension is always 0.
>
>> + return;
>> + }
>> +
>> + for_each_possible_cpu(cpu) {
>> + xen_get_runstate_snapshot_cpu_delta(&state, cpu);
>> + memcpy(runstate_delta[cpu].time, state.time,
>> + RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
>
> sizeof(runstate_delta[cpu].time).
>
>> + }
>> +
>> + break;
>> +
>> + case 0: /* backup runstate time after resume */
>> + if (unlikely(!runstate_delta)) {
>> + pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
>> + __func__);
>> + return;
>> + }
>> +
>> + for_each_possible_cpu(cpu) {
>> + for (i = 0; i < RUNSTATE_max; i++)
>> + per_cpu(old_runstate_time, cpu)[i] +=
>> + runstate_delta[cpu].time[i];
>> + }
>> + break;
>> +
>> + default: /* do not accumulate runstate time for checkpointing */
>> + break;
>> + }
>> +
>> + if (action != -1 && runstate_delta) {
>> + kfree(runstate_delta);
>> + runstate_delta = NULL;
>> + }
>> +}
>> +
>> /*
>> * Runstate accounting
>> */
>> diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
>> index 98188c8..85e81ce 100644
>> --- a/include/xen/interface/vcpu.h
>> +++ b/include/xen/interface/vcpu.h
>> @@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
>> */
>> #define RUNSTATE_offline 3
>>
>> +#define RUNSTATE_max 4
>
> This file is part of Xen ABI. While this macro technically doesn't
> change anything I'd rather have those updates first appear in Xen code.
>
> Besides, this change leaves vcpu_runstate_info.time[4] definition
> intact. I think all RUNSTATE_* macros would need to be moved above
> vcpu_runstate_info definition.
>
> TBH, for purposes of this patch I'd use 4.
>
>
> -boris
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
>
Thank you very much for your suggestions!
Dongli Zhang
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Xen-devel] [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 13:34 ` Boris Ostrovsky
2017-10-31 0:14 ` Dongli Zhang
@ 2017-10-31 0:14 ` Dongli Zhang
2017-10-31 0:58 ` Boris Ostrovsky
1 sibling, 1 reply; 14+ messages in thread
From: Dongli Zhang @ 2017-10-31 0:14 UTC (permalink / raw)
To: Boris Ostrovsky, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
Hi Boris,
On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>> After guest live migration on xen, steal time in /proc/stat
>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>> derived from previous return value of xen_steal_clock().
>>
>> For instance, steal time of each vcpu is 335 before live migration.
>>
>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>> cpu0 38 0 81 50063 492 0 0 335 0 0
>> cpu1 65 0 97 49763 634 0 0 335 0 0
>> cpu2 38 0 81 50098 462 0 0 335 0 0
>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>
>> After live migration, steal time is reduced to 312.
>>
>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>> cpu0 38 0 82 50123 500 0 0 312 0 0
>> cpu1 65 0 97 49832 634 0 0 312 0 0
>> cpu2 39 0 82 50167 462 0 0 312 0 0
>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>
>> Since runstate times are cumulative and cleared during xen live migration
>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>> to global percpu variables before live migration suspend. Once guest VM is
>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>> runstate times and previously accumulated times stored in global percpu
>> variables.
>>
>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>> discussed by Michael Las at
>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>> which would overflow steal time and lead to 100% st usage in top command
>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>
>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>
>> ---
>> Changed since v1:
>> * relocate modification to xen_get_runstate_snapshot_cpu
>>
>> Changed since v2:
>> * accumulate runstate times before live migration
>>
>> Changed since v3:
>> * do not accumulate times in the case of guest checkpointing
>>
>> Changed since v4:
>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>
>> ---
>> drivers/xen/manage.c | 2 ++
>> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
>> include/xen/interface/vcpu.h | 2 ++
>> include/xen/xen-ops.h | 1 +
>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>> index c425d03..3dc085d 100644
>> --- a/drivers/xen/manage.c
>> +++ b/drivers/xen/manage.c
>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>> }
>>
>> gnttab_suspend();
>> + xen_accumulate_runstate_time(-1);
>> xen_arch_pre_suspend();
>>
>> /*
>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>> : 0);
>>
>> xen_arch_post_suspend(si->cancelled);
>> + xen_accumulate_runstate_time(si->cancelled);
>
> I am not convinced that the comment above HYPERVISOR_suspend() is
> correct. The call can return an error code and so if it returns -EPERM
> (which AFAICS it can't now but might in the future) then
> xen_accumulate_runstate_time() will do wrong thing.
I would split xen_accumulate_runstate_time() into two functions to avoid the
-EPERM issue, as one is for saving and another is for accumulation, respectively.
Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
and xen_accumulate_runstate_time(si->cancelled) after resume?
>
>
>> gnttab_resume();
>>
>> if (!si->cancelled) {
>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>> index ac5f23f..cf3afb9 100644
>> --- a/drivers/xen/time.c
>> +++ b/drivers/xen/time.c
>> @@ -19,6 +19,9 @@
>> /* runstate info updated by Xen */
>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>
>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>> +static struct vcpu_runstate_info *runstate_delta;
>
> I'd move this inside xen_accumulate_runstate_time() since that's the
If we split xen_accumulate_runstate_time() into two functions, we would leave
runstate_delta as global static.
> only function that uses it. And why does it need to be
> vcpu_runstate_info and not u64[4]?
This was suggested by Juergen to avoid the allocation and reclaim of the second
dimensional array as in v4 of this patch?
Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
each iteration?
>
>> +
>> /* return an consistent snapshot of 64-bit time/counter value */
>> static u64 get64(const u64 *p)
>> {
>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>> return ret;
>> }
>>
>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> - unsigned int cpu)
>> +static void xen_get_runstate_snapshot_cpu_delta(
>> + struct vcpu_runstate_info *res, unsigned int cpu)
>> {
>> u64 state_time;
>> struct vcpu_runstate_info *state;
>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> (state_time & XEN_RUNSTATE_UPDATE));
>> }
>>
>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>> + unsigned int cpu)
>> +{
>> + int i;
>> +
>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>> +
>> + for (i = 0; i < RUNSTATE_max; i++)
>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>> +}
>> +
>> +void xen_accumulate_runstate_time(int action)
>> +{
>> + struct vcpu_runstate_info state;
>> + int cpu, i;
>> +
>> + switch (action) {
>> + case -1: /* backup runstate time before suspend */
>> + WARN_ON_ONCE(unlikely(runstate_delta));
>
> pr_warn_once(), to be consistent with the rest of the file. And then
> should you return if this is true?
I would prefer to not return if it is true but just warn the administrator that
there is memory leakage issue while leaving runstate accumulation works normally.
>
>> +
>> + runstate_delta = kcalloc(num_possible_cpus(),
>> + sizeof(*runstate_delta),
>> + GFP_KERNEL);
>> + if (unlikely(!runstate_delta)) {
>> + pr_alert("%s: failed to allocate runstate_delta\n",
>> + __func__);
>
> pr_warn() should be sufficient. Below too.
>
> Also, as a side question --- can we do kmalloc() at this point?
Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
and we need to guarantee the value of first dimension is always 0.
>
>> + return;
>> + }
>> +
>> + for_each_possible_cpu(cpu) {
>> + xen_get_runstate_snapshot_cpu_delta(&state, cpu);
>> + memcpy(runstate_delta[cpu].time, state.time,
>> + RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
>
> sizeof(runstate_delta[cpu].time).
>
>> + }
>> +
>> + break;
>> +
>> + case 0: /* backup runstate time after resume */
>> + if (unlikely(!runstate_delta)) {
>> + pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
>> + __func__);
>> + return;
>> + }
>> +
>> + for_each_possible_cpu(cpu) {
>> + for (i = 0; i < RUNSTATE_max; i++)
>> + per_cpu(old_runstate_time, cpu)[i] +=
>> + runstate_delta[cpu].time[i];
>> + }
>> + break;
>> +
>> + default: /* do not accumulate runstate time for checkpointing */
>> + break;
>> + }
>> +
>> + if (action != -1 && runstate_delta) {
>> + kfree(runstate_delta);
>> + runstate_delta = NULL;
>> + }
>> +}
>> +
>> /*
>> * Runstate accounting
>> */
>> diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
>> index 98188c8..85e81ce 100644
>> --- a/include/xen/interface/vcpu.h
>> +++ b/include/xen/interface/vcpu.h
>> @@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
>> */
>> #define RUNSTATE_offline 3
>>
>> +#define RUNSTATE_max 4
>
> This file is part of Xen ABI. While this macro technically doesn't
> change anything I'd rather have those updates first appear in Xen code.
>
> Besides, this change leaves vcpu_runstate_info.time[4] definition
> intact. I think all RUNSTATE_* macros would need to be moved above
> vcpu_runstate_info definition.
>
> TBH, for purposes of this patch I'd use 4.
>
>
> -boris
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
>
Thank you very much for your suggestions!
Dongli Zhang
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Xen-devel] [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-31 0:14 ` [Xen-devel] " Dongli Zhang
@ 2017-10-31 0:58 ` Boris Ostrovsky
0 siblings, 0 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-31 0:58 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 08:14 PM, Dongli Zhang wrote:
> Hi Boris,
>
> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>> After guest live migration on xen, steal time in /proc/stat
>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>> derived from previous return value of xen_steal_clock().
>>>
>>> For instance, steal time of each vcpu is 335 before live migration.
>>>
>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>
>>> After live migration, steal time is reduced to 312.
>>>
>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>
>>> Since runstate times are cumulative and cleared during xen live migration
>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>> to global percpu variables before live migration suspend. Once guest VM is
>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>> runstate times and previously accumulated times stored in global percpu
>>> variables.
>>>
>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>> discussed by Michael Las at
>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>> which would overflow steal time and lead to 100% st usage in top command
>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>
>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>
>>> ---
>>> Changed since v1:
>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>
>>> Changed since v2:
>>> * accumulate runstate times before live migration
>>>
>>> Changed since v3:
>>> * do not accumulate times in the case of guest checkpointing
>>>
>>> Changed since v4:
>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>
>>> ---
>>> drivers/xen/manage.c | 2 ++
>>> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
>>> include/xen/interface/vcpu.h | 2 ++
>>> include/xen/xen-ops.h | 1 +
>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>> index c425d03..3dc085d 100644
>>> --- a/drivers/xen/manage.c
>>> +++ b/drivers/xen/manage.c
>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>> }
>>>
>>> gnttab_suspend();
>>> + xen_accumulate_runstate_time(-1);
>>> xen_arch_pre_suspend();
>>>
>>> /*
>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>> : 0);
>>>
>>> xen_arch_post_suspend(si->cancelled);
>>> + xen_accumulate_runstate_time(si->cancelled);
>>
>> I am not convinced that the comment above HYPERVISOR_suspend() is
>> correct. The call can return an error code and so if it returns -EPERM
>> (which AFAICS it can't now but might in the future) then
>> xen_accumulate_runstate_time() will do wrong thing.
>
> I would split xen_accumulate_runstate_time() into two functions to avoid the
> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>
> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
> and xen_accumulate_runstate_time(si->cancelled) after resume?
I'd probably just say something like
si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
and keep xen_accumulate_runstate_time() as is (maybe rename it to
xen_manage_runstate_time()). And also remove the comment above the
hypercall as it is incorrect (but please mention the reason in the
commit message)
>
>>
>>
>>> gnttab_resume();
>>>
>>> if (!si->cancelled) {
>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>> index ac5f23f..cf3afb9 100644
>>> --- a/drivers/xen/time.c
>>> +++ b/drivers/xen/time.c
>>> @@ -19,6 +19,9 @@
>>> /* runstate info updated by Xen */
>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>
>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>> +static struct vcpu_runstate_info *runstate_delta;
>>
>> I'd move this inside xen_accumulate_runstate_time() since that's the
>
> If we split xen_accumulate_runstate_time() into two functions, we would leave
> runstate_delta as global static.
>
>> only function that uses it. And why does it need to be
>> vcpu_runstate_info and not u64[4]?
>
> This was suggested by Juergen to avoid the allocation and reclaim of the second
> dimensional array as in v4 of this patch?
>
> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
> each iteration?
I was thinking of
u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
num_possible_cpus())
and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
>
>>
>>> +
>>> /* return an consistent snapshot of 64-bit time/counter value */
>>> static u64 get64(const u64 *p)
>>> {
>>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>>> return ret;
>>> }
>>>
>>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> - unsigned int cpu)
>>> +static void xen_get_runstate_snapshot_cpu_delta(
>>> + struct vcpu_runstate_info *res, unsigned int cpu)
>>> {
>>> u64 state_time;
>>> struct vcpu_runstate_info *state;
>>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> (state_time & XEN_RUNSTATE_UPDATE));
>>> }
>>>
>>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> + unsigned int cpu)
>>> +{
>>> + int i;
>>> +
>>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>>> +
>>> + for (i = 0; i < RUNSTATE_max; i++)
>>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>> +}
>>> +
>>> +void xen_accumulate_runstate_time(int action)
>>> +{
>>> + struct vcpu_runstate_info state;
>>> + int cpu, i;
>>> +
>>> + switch (action) {
>>> + case -1: /* backup runstate time before suspend */
>>> + WARN_ON_ONCE(unlikely(runstate_delta));
>>
>> pr_warn_once(), to be consistent with the rest of the file. And then
>> should you return if this is true?
>
> I would prefer to not return if it is true but just warn the administrator that
> there is memory leakage issue while leaving runstate accumulation works normally.
>
>>
>>> +
>>> + runstate_delta = kcalloc(num_possible_cpus(),
>>> + sizeof(*runstate_delta),
>>> + GFP_KERNEL);
>>> + if (unlikely(!runstate_delta)) {
>>> + pr_alert("%s: failed to allocate runstate_delta\n",
>>> + __func__);
>>
>> pr_warn() should be sufficient. Below too.
>>
>> Also, as a side question --- can we do kmalloc() at this point?
>
> Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
> and we need to guarantee the value of first dimension is always 0.
That's not what was thinking about. GFP_KERNEL may sleep and I don't
know how sleep is handled at this point. Everything is pretty much dead
now. Perhaps GFP_ATOMIC might be a better choice.
-boris
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-31 0:58 ` Boris Ostrovsky
0 siblings, 0 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-31 0:58 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 08:14 PM, Dongli Zhang wrote:
> Hi Boris,
>
> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>> After guest live migration on xen, steal time in /proc/stat
>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>> derived from previous return value of xen_steal_clock().
>>>
>>> For instance, steal time of each vcpu is 335 before live migration.
>>>
>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>
>>> After live migration, steal time is reduced to 312.
>>>
>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>
>>> Since runstate times are cumulative and cleared during xen live migration
>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>> to global percpu variables before live migration suspend. Once guest VM is
>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>> runstate times and previously accumulated times stored in global percpu
>>> variables.
>>>
>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>> discussed by Michael Las at
>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>> which would overflow steal time and lead to 100% st usage in top command
>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>
>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>
>>> ---
>>> Changed since v1:
>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>
>>> Changed since v2:
>>> * accumulate runstate times before live migration
>>>
>>> Changed since v3:
>>> * do not accumulate times in the case of guest checkpointing
>>>
>>> Changed since v4:
>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>
>>> ---
>>> drivers/xen/manage.c | 2 ++
>>> drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
>>> include/xen/interface/vcpu.h | 2 ++
>>> include/xen/xen-ops.h | 1 +
>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>> index c425d03..3dc085d 100644
>>> --- a/drivers/xen/manage.c
>>> +++ b/drivers/xen/manage.c
>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>> }
>>>
>>> gnttab_suspend();
>>> + xen_accumulate_runstate_time(-1);
>>> xen_arch_pre_suspend();
>>>
>>> /*
>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>> : 0);
>>>
>>> xen_arch_post_suspend(si->cancelled);
>>> + xen_accumulate_runstate_time(si->cancelled);
>>
>> I am not convinced that the comment above HYPERVISOR_suspend() is
>> correct. The call can return an error code and so if it returns -EPERM
>> (which AFAICS it can't now but might in the future) then
>> xen_accumulate_runstate_time() will do wrong thing.
>
> I would split xen_accumulate_runstate_time() into two functions to avoid the
> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>
> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
> and xen_accumulate_runstate_time(si->cancelled) after resume?
I'd probably just say something like
si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
and keep xen_accumulate_runstate_time() as is (maybe rename it to
xen_manage_runstate_time()). And also remove the comment above the
hypercall as it is incorrect (but please mention the reason in the
commit message)
>
>>
>>
>>> gnttab_resume();
>>>
>>> if (!si->cancelled) {
>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>> index ac5f23f..cf3afb9 100644
>>> --- a/drivers/xen/time.c
>>> +++ b/drivers/xen/time.c
>>> @@ -19,6 +19,9 @@
>>> /* runstate info updated by Xen */
>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>
>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>> +static struct vcpu_runstate_info *runstate_delta;
>>
>> I'd move this inside xen_accumulate_runstate_time() since that's the
>
> If we split xen_accumulate_runstate_time() into two functions, we would leave
> runstate_delta as global static.
>
>> only function that uses it. And why does it need to be
>> vcpu_runstate_info and not u64[4]?
>
> This was suggested by Juergen to avoid the allocation and reclaim of the second
> dimensional array as in v4 of this patch?
>
> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
> each iteration?
I was thinking of
u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
num_possible_cpus())
and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
>
>>
>>> +
>>> /* return an consistent snapshot of 64-bit time/counter value */
>>> static u64 get64(const u64 *p)
>>> {
>>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>>> return ret;
>>> }
>>>
>>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> - unsigned int cpu)
>>> +static void xen_get_runstate_snapshot_cpu_delta(
>>> + struct vcpu_runstate_info *res, unsigned int cpu)
>>> {
>>> u64 state_time;
>>> struct vcpu_runstate_info *state;
>>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> (state_time & XEN_RUNSTATE_UPDATE));
>>> }
>>>
>>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>> + unsigned int cpu)
>>> +{
>>> + int i;
>>> +
>>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>>> +
>>> + for (i = 0; i < RUNSTATE_max; i++)
>>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>> +}
>>> +
>>> +void xen_accumulate_runstate_time(int action)
>>> +{
>>> + struct vcpu_runstate_info state;
>>> + int cpu, i;
>>> +
>>> + switch (action) {
>>> + case -1: /* backup runstate time before suspend */
>>> + WARN_ON_ONCE(unlikely(runstate_delta));
>>
>> pr_warn_once(), to be consistent with the rest of the file. And then
>> should you return if this is true?
>
> I would prefer to not return if it is true but just warn the administrator that
> there is memory leakage issue while leaving runstate accumulation works normally.
>
>>
>>> +
>>> + runstate_delta = kcalloc(num_possible_cpus(),
>>> + sizeof(*runstate_delta),
>>> + GFP_KERNEL);
>>> + if (unlikely(!runstate_delta)) {
>>> + pr_alert("%s: failed to allocate runstate_delta\n",
>>> + __func__);
>>
>> pr_warn() should be sufficient. Below too.
>>
>> Also, as a side question --- can we do kmalloc() at this point?
>
> Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
> and we need to guarantee the value of first dimension is always 0.
That's not what was thinking about. GFP_KERNEL may sleep and I don't
know how sleep is handled at this point. Everything is pretty much dead
now. Perhaps GFP_ATOMIC might be a better choice.
-boris
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-31 0:58 ` Boris Ostrovsky
(?)
@ 2017-10-31 3:13 ` Dongli Zhang
-1 siblings, 0 replies; 14+ messages in thread
From: Dongli Zhang @ 2017-10-31 3:13 UTC (permalink / raw)
To: Boris Ostrovsky, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
Hi Boris,
On 10/31/2017 08:58 AM, Boris Ostrovsky wrote:
>
>
> On 10/30/2017 08:14 PM, Dongli Zhang wrote:
>> Hi Boris,
>>
>> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>>> After guest live migration on xen, steal time in /proc/stat
>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>> derived from previous return value of xen_steal_clock().
>>>>
>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>
>>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>
>>>> After live migration, steal time is reduced to 312.
>>>>
>>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>
>>>> Since runstate times are cumulative and cleared during xen live migration
>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>> runstate times and previously accumulated times stored in global percpu
>>>> variables.
>>>>
>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>> discussed by Michael Las at
>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>>
>>>> which would overflow steal time and lead to 100% st usage in top command
>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>
>>>> References:
>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>>
>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>
>>>> ---
>>>> Changed since v1:
>>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>>
>>>> Changed since v2:
>>>> * accumulate runstate times before live migration
>>>>
>>>> Changed since v3:
>>>> * do not accumulate times in the case of guest checkpointing
>>>>
>>>> Changed since v4:
>>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>>
>>>> ---
>>>> drivers/xen/manage.c | 2 ++
>>>> drivers/xen/time.c | 68
>>>> ++++++++++++++++++++++++++++++++++++++++++--
>>>> include/xen/interface/vcpu.h | 2 ++
>>>> include/xen/xen-ops.h | 1 +
>>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>> index c425d03..3dc085d 100644
>>>> --- a/drivers/xen/manage.c
>>>> +++ b/drivers/xen/manage.c
>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>> }
>>>> gnttab_suspend();
>>>> + xen_accumulate_runstate_time(-1);
>>>> xen_arch_pre_suspend();
>>>> /*
>>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>>> : 0);
>>>> xen_arch_post_suspend(si->cancelled);
>>>> + xen_accumulate_runstate_time(si->cancelled);
>>>
>>> I am not convinced that the comment above HYPERVISOR_suspend() is
>>> correct. The call can return an error code and so if it returns -EPERM
>>> (which AFAICS it can't now but might in the future) then
>>> xen_accumulate_runstate_time() will do wrong thing.
>>
>> I would split xen_accumulate_runstate_time() into two functions to avoid the
>> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>>
>> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
>> and xen_accumulate_runstate_time(si->cancelled) after resume?
>
>
> I'd probably just say something like
>
> si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
As the call of HYPERVISOR_suspend() takes 3 lines, I would make it as below and
I think gcc would optimize it.
- /*
- * This hypercall returns 1 if suspend was cancelled
- * or the domain was merely checkpointed, and 0 if it
- * is resuming in a new domain.
- */
si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
? virt_to_gfn(xen_start_info)
: 0);
+ si->cancelled = si->cancelled ? 1 : 0;
>
> and keep xen_accumulate_runstate_time() as is (maybe rename it to
> xen_manage_runstate_time()). And also remove the comment above the hypercall as
> it is incorrect (but please mention the reason in the commit message)
>
>>
>>>
>>>
>>>> gnttab_resume();
>>>> if (!si->cancelled) {
>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>> index ac5f23f..cf3afb9 100644
>>>> --- a/drivers/xen/time.c
>>>> +++ b/drivers/xen/time.c
>>>> @@ -19,6 +19,9 @@
>>>> /* runstate info updated by Xen */
>>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>>> +static struct vcpu_runstate_info *runstate_delta;
>>>
>>> I'd move this inside xen_accumulate_runstate_time() since that's the
>>
>> If we split xen_accumulate_runstate_time() into two functions, we would leave
>> runstate_delta as global static.
>>
>>> only function that uses it. And why does it need to be
>>> vcpu_runstate_info and not u64[4]?
>>
>> This was suggested by Juergen to avoid the allocation and reclaim of the second
>> dimensional array as in v4 of this patch?
>>
>> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
>> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
>> each iteration?
>
>
> I was thinking of
>
> u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
> num_possible_cpus())
>
> and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
Would the above code work? You are allocating data only for 1st dimension of a
2d array. How can you access the 2nd array with runstate_delta[cpu][RUNSTATE_*]?
Is there any option in gcc that support this?
I have actually tested this with below code in linux and kernel panic with page
fault when doing memcpy.
+void xen_manage_runstate_time(int action)
+{
+ static u64 **runstate_delta;
+ struct vcpu_runstate_info state;
+ int cpu, i;
+
+ switch (action) {
+ case -1: /* backup runstate time before suspend */
+ WARN_ON_ONCE(unlikely(runstate_delta));
+
+ runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
num_possible_cpus(), GFP_ATOMIC);
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: failed to allocate runstate_delta\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ xen_get_runstate_snapshot_cpu_delta(&state, cpu);
+ memcpy(runstate_delta[cpu], state.time,
+ sizeof(xen_runstate.time));
+ }
+
+ break;
>
>>
>>>
>>>> +
>>>> /* return an consistent snapshot of 64-bit time/counter value */
>>>> static u64 get64(const u64 *p)
>>>> {
>>>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>>>> return ret;
>>>> }
>>>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>> - unsigned int cpu)
>>>> +static void xen_get_runstate_snapshot_cpu_delta(
>>>> + struct vcpu_runstate_info *res, unsigned int cpu)
>>>> {
>>>> u64 state_time;
>>>> struct vcpu_runstate_info *state;
>>>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct
>>>> vcpu_runstate_info *res,
>>>> (state_time & XEN_RUNSTATE_UPDATE));
>>>> }
>>>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>> + unsigned int cpu)
>>>> +{
>>>> + int i;
>>>> +
>>>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>>>> +
>>>> + for (i = 0; i < RUNSTATE_max; i++)
>>>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>>> +}
>>>> +
>>>> +void xen_accumulate_runstate_time(int action)
>>>> +{
>>>> + struct vcpu_runstate_info state;
>>>> + int cpu, i;
>>>> +
>>>> + switch (action) {
>>>> + case -1: /* backup runstate time before suspend */
>>>> + WARN_ON_ONCE(unlikely(runstate_delta));
>>>
>>> pr_warn_once(), to be consistent with the rest of the file. And then
>>> should you return if this is true?
>>
>> I would prefer to not return if it is true but just warn the administrator that
>> there is memory leakage issue while leaving runstate accumulation works normally.
>>
>>>
>>>> +
>>>> + runstate_delta = kcalloc(num_possible_cpus(),
>>>> + sizeof(*runstate_delta),
>>>> + GFP_KERNEL);
>>>> + if (unlikely(!runstate_delta)) {
>>>> + pr_alert("%s: failed to allocate runstate_delta\n",
>>>> + __func__);
>>>
>>> pr_warn() should be sufficient. Below too.
>>>
>>> Also, as a side question --- can we do kmalloc() at this point?
>>
>> Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
>> and we need to guarantee the value of first dimension is always 0.
>
>
> That's not what was thinking about. GFP_KERNEL may sleep and I don't know how
> sleep is handled at this point. Everything is pretty much dead now. Perhaps
> GFP_ATOMIC might be a better choice.
>
> -boris
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
Thank you very much for your suggestion!
Dongli Zhang
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Xen-devel] [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-31 0:58 ` Boris Ostrovsky
(?)
(?)
@ 2017-10-31 3:13 ` Dongli Zhang
2017-10-31 11:05 ` Boris Ostrovsky
-1 siblings, 1 reply; 14+ messages in thread
From: Dongli Zhang @ 2017-10-31 3:13 UTC (permalink / raw)
To: Boris Ostrovsky, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
Hi Boris,
On 10/31/2017 08:58 AM, Boris Ostrovsky wrote:
>
>
> On 10/30/2017 08:14 PM, Dongli Zhang wrote:
>> Hi Boris,
>>
>> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>>> After guest live migration on xen, steal time in /proc/stat
>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>> derived from previous return value of xen_steal_clock().
>>>>
>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>
>>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>
>>>> After live migration, steal time is reduced to 312.
>>>>
>>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>
>>>> Since runstate times are cumulative and cleared during xen live migration
>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>> runstate times and previously accumulated times stored in global percpu
>>>> variables.
>>>>
>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>> discussed by Michael Las at
>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>>
>>>> which would overflow steal time and lead to 100% st usage in top command
>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>
>>>> References:
>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>>
>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>
>>>> ---
>>>> Changed since v1:
>>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>>
>>>> Changed since v2:
>>>> * accumulate runstate times before live migration
>>>>
>>>> Changed since v3:
>>>> * do not accumulate times in the case of guest checkpointing
>>>>
>>>> Changed since v4:
>>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>>
>>>> ---
>>>> drivers/xen/manage.c | 2 ++
>>>> drivers/xen/time.c | 68
>>>> ++++++++++++++++++++++++++++++++++++++++++--
>>>> include/xen/interface/vcpu.h | 2 ++
>>>> include/xen/xen-ops.h | 1 +
>>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>> index c425d03..3dc085d 100644
>>>> --- a/drivers/xen/manage.c
>>>> +++ b/drivers/xen/manage.c
>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>> }
>>>> gnttab_suspend();
>>>> + xen_accumulate_runstate_time(-1);
>>>> xen_arch_pre_suspend();
>>>> /*
>>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>>> : 0);
>>>> xen_arch_post_suspend(si->cancelled);
>>>> + xen_accumulate_runstate_time(si->cancelled);
>>>
>>> I am not convinced that the comment above HYPERVISOR_suspend() is
>>> correct. The call can return an error code and so if it returns -EPERM
>>> (which AFAICS it can't now but might in the future) then
>>> xen_accumulate_runstate_time() will do wrong thing.
>>
>> I would split xen_accumulate_runstate_time() into two functions to avoid the
>> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>>
>> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
>> and xen_accumulate_runstate_time(si->cancelled) after resume?
>
>
> I'd probably just say something like
>
> si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
As the call of HYPERVISOR_suspend() takes 3 lines, I would make it as below and
I think gcc would optimize it.
- /*
- * This hypercall returns 1 if suspend was cancelled
- * or the domain was merely checkpointed, and 0 if it
- * is resuming in a new domain.
- */
si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
? virt_to_gfn(xen_start_info)
: 0);
+ si->cancelled = si->cancelled ? 1 : 0;
>
> and keep xen_accumulate_runstate_time() as is (maybe rename it to
> xen_manage_runstate_time()). And also remove the comment above the hypercall as
> it is incorrect (but please mention the reason in the commit message)
>
>>
>>>
>>>
>>>> gnttab_resume();
>>>> if (!si->cancelled) {
>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>> index ac5f23f..cf3afb9 100644
>>>> --- a/drivers/xen/time.c
>>>> +++ b/drivers/xen/time.c
>>>> @@ -19,6 +19,9 @@
>>>> /* runstate info updated by Xen */
>>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>>> +static struct vcpu_runstate_info *runstate_delta;
>>>
>>> I'd move this inside xen_accumulate_runstate_time() since that's the
>>
>> If we split xen_accumulate_runstate_time() into two functions, we would leave
>> runstate_delta as global static.
>>
>>> only function that uses it. And why does it need to be
>>> vcpu_runstate_info and not u64[4]?
>>
>> This was suggested by Juergen to avoid the allocation and reclaim of the second
>> dimensional array as in v4 of this patch?
>>
>> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
>> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
>> each iteration?
>
>
> I was thinking of
>
> u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
> num_possible_cpus())
>
> and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
Would the above code work? You are allocating data only for 1st dimension of a
2d array. How can you access the 2nd array with runstate_delta[cpu][RUNSTATE_*]?
Is there any option in gcc that support this?
I have actually tested this with below code in linux and kernel panic with page
fault when doing memcpy.
+void xen_manage_runstate_time(int action)
+{
+ static u64 **runstate_delta;
+ struct vcpu_runstate_info state;
+ int cpu, i;
+
+ switch (action) {
+ case -1: /* backup runstate time before suspend */
+ WARN_ON_ONCE(unlikely(runstate_delta));
+
+ runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
num_possible_cpus(), GFP_ATOMIC);
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: failed to allocate runstate_delta\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ xen_get_runstate_snapshot_cpu_delta(&state, cpu);
+ memcpy(runstate_delta[cpu], state.time,
+ sizeof(xen_runstate.time));
+ }
+
+ break;
>
>>
>>>
>>>> +
>>>> /* return an consistent snapshot of 64-bit time/counter value */
>>>> static u64 get64(const u64 *p)
>>>> {
>>>> @@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
>>>> return ret;
>>>> }
>>>> -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>> - unsigned int cpu)
>>>> +static void xen_get_runstate_snapshot_cpu_delta(
>>>> + struct vcpu_runstate_info *res, unsigned int cpu)
>>>> {
>>>> u64 state_time;
>>>> struct vcpu_runstate_info *state;
>>>> @@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct
>>>> vcpu_runstate_info *res,
>>>> (state_time & XEN_RUNSTATE_UPDATE));
>>>> }
>>>> +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
>>>> + unsigned int cpu)
>>>> +{
>>>> + int i;
>>>> +
>>>> + xen_get_runstate_snapshot_cpu_delta(res, cpu);
>>>> +
>>>> + for (i = 0; i < RUNSTATE_max; i++)
>>>> + res->time[i] += per_cpu(old_runstate_time, cpu)[i];
>>>> +}
>>>> +
>>>> +void xen_accumulate_runstate_time(int action)
>>>> +{
>>>> + struct vcpu_runstate_info state;
>>>> + int cpu, i;
>>>> +
>>>> + switch (action) {
>>>> + case -1: /* backup runstate time before suspend */
>>>> + WARN_ON_ONCE(unlikely(runstate_delta));
>>>
>>> pr_warn_once(), to be consistent with the rest of the file. And then
>>> should you return if this is true?
>>
>> I would prefer to not return if it is true but just warn the administrator that
>> there is memory leakage issue while leaving runstate accumulation works normally.
>>
>>>
>>>> +
>>>> + runstate_delta = kcalloc(num_possible_cpus(),
>>>> + sizeof(*runstate_delta),
>>>> + GFP_KERNEL);
>>>> + if (unlikely(!runstate_delta)) {
>>>> + pr_alert("%s: failed to allocate runstate_delta\n",
>>>> + __func__);
>>>
>>> pr_warn() should be sufficient. Below too.
>>>
>>> Also, as a side question --- can we do kmalloc() at this point?
>>
>> Yes. kmalloc_array() is better than kcalloc, unless we have 2 dimensional array
>> and we need to guarantee the value of first dimension is always 0.
>
>
> That's not what was thinking about. GFP_KERNEL may sleep and I don't know how
> sleep is handled at this point. Everything is pretty much dead now. Perhaps
> GFP_ATOMIC might be a better choice.
>
> -boris
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
Thank you very much for your suggestion!
Dongli Zhang
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Xen-devel] [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-31 3:13 ` [Xen-devel] " Dongli Zhang
@ 2017-10-31 11:05 ` Boris Ostrovsky
0 siblings, 0 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-31 11:05 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 11:13 PM, Dongli Zhang wrote:
> Hi Boris,
>
> On 10/31/2017 08:58 AM, Boris Ostrovsky wrote:
>>
>>
>> On 10/30/2017 08:14 PM, Dongli Zhang wrote:
>>> Hi Boris,
>>>
>>> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>>>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>>>> After guest live migration on xen, steal time in /proc/stat
>>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>>> derived from previous return value of xen_steal_clock().
>>>>>
>>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>>
>>>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>>
>>>>> After live migration, steal time is reduced to 312.
>>>>>
>>>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>>
>>>>> Since runstate times are cumulative and cleared during xen live migration
>>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>>> runstate times and previously accumulated times stored in global percpu
>>>>> variables.
>>>>>
>>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>>> discussed by Michael Las at
>>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>>>
>>>>> which would overflow steal time and lead to 100% st usage in top command
>>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>>
>>>>> References:
>>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>>>
>>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>>
>>>>> ---
>>>>> Changed since v1:
>>>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>>>
>>>>> Changed since v2:
>>>>> * accumulate runstate times before live migration
>>>>>
>>>>> Changed since v3:
>>>>> * do not accumulate times in the case of guest checkpointing
>>>>>
>>>>> Changed since v4:
>>>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>>>
>>>>> ---
>>>>> drivers/xen/manage.c | 2 ++
>>>>> drivers/xen/time.c | 68
>>>>> ++++++++++++++++++++++++++++++++++++++++++--
>>>>> include/xen/interface/vcpu.h | 2 ++
>>>>> include/xen/xen-ops.h | 1 +
>>>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>>> index c425d03..3dc085d 100644
>>>>> --- a/drivers/xen/manage.c
>>>>> +++ b/drivers/xen/manage.c
>>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>>> }
>>>>> gnttab_suspend();
>>>>> + xen_accumulate_runstate_time(-1);
>>>>> xen_arch_pre_suspend();
>>>>> /*
>>>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>>>> : 0);
>>>>> xen_arch_post_suspend(si->cancelled);
>>>>> + xen_accumulate_runstate_time(si->cancelled);
>>>>
>>>> I am not convinced that the comment above HYPERVISOR_suspend() is
>>>> correct. The call can return an error code and so if it returns -EPERM
>>>> (which AFAICS it can't now but might in the future) then
>>>> xen_accumulate_runstate_time() will do wrong thing.
>>>
>>> I would split xen_accumulate_runstate_time() into two functions to avoid the
>>> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>>>
>>> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
>>> and xen_accumulate_runstate_time(si->cancelled) after resume?
>>
>>
>> I'd probably just say something like
>>
>> si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
>
> As the call of HYPERVISOR_suspend() takes 3 lines, I would make it as below and
> I think gcc would optimize it.
>
> - /*
> - * This hypercall returns 1 if suspend was cancelled
> - * or the domain was merely checkpointed, and 0 if it
> - * is resuming in a new domain.
> - */
> si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
> ? virt_to_gfn(xen_start_info)
> : 0);
> + si->cancelled = si->cancelled ? 1 : 0;
Or xen_manage_runstate_time(si->cancelled ? 1 : 0) --- that way you
preserve si->cancelled if anyone cares later (which at the moment nooone
does) Either way I think is fine.
>
>>
>> and keep xen_accumulate_runstate_time() as is (maybe rename it to
>> xen_manage_runstate_time()). And also remove the comment above the hypercall as
>> it is incorrect (but please mention the reason in the commit message)
>>
>>>
>>>>
>>>>
>>>>> gnttab_resume();
>>>>> if (!si->cancelled) {
>>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>>> index ac5f23f..cf3afb9 100644
>>>>> --- a/drivers/xen/time.c
>>>>> +++ b/drivers/xen/time.c
>>>>> @@ -19,6 +19,9 @@
>>>>> /* runstate info updated by Xen */
>>>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>>>> +static struct vcpu_runstate_info *runstate_delta;
>>>>
>>>> I'd move this inside xen_accumulate_runstate_time() since that's the
>>>
>>> If we split xen_accumulate_runstate_time() into two functions, we would leave
>>> runstate_delta as global static.
>>>
>>>> only function that uses it. And why does it need to be
>>>> vcpu_runstate_info and not u64[4]?
>>>
>>> This was suggested by Juergen to avoid the allocation and reclaim of the second
>>> dimensional array as in v4 of this patch?
>>>
>>> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
>>> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
>>> each iteration?
>>
>>
>> I was thinking of
>>
>> u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
>> num_possible_cpus())
>>
>> and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
>
> Would the above code work?
Ugh... If course it will not, it's clearly wrong. So nevermind, sorry.
Keep it the way it was originally.
-boris
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-31 11:05 ` Boris Ostrovsky
0 siblings, 0 replies; 14+ messages in thread
From: Boris Ostrovsky @ 2017-10-31 11:05 UTC (permalink / raw)
To: Dongli Zhang, xen-devel, linux-kernel; +Cc: jgross, joao.m.martins
On 10/30/2017 11:13 PM, Dongli Zhang wrote:
> Hi Boris,
>
> On 10/31/2017 08:58 AM, Boris Ostrovsky wrote:
>>
>>
>> On 10/30/2017 08:14 PM, Dongli Zhang wrote:
>>> Hi Boris,
>>>
>>> On 10/30/2017 09:34 PM, Boris Ostrovsky wrote:
>>>> On 10/30/2017 04:03 AM, Dongli Zhang wrote:
>>>>> After guest live migration on xen, steal time in /proc/stat
>>>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>>>> xen_steal_lock() might be less than this_rq()->prev_steal_time which is
>>>>> derived from previous return value of xen_steal_clock().
>>>>>
>>>>> For instance, steal time of each vcpu is 335 before live migration.
>>>>>
>>>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>>>
>>>>> After live migration, steal time is reduced to 312.
>>>>>
>>>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>>>
>>>>> Since runstate times are cumulative and cleared during xen live migration
>>>>> by xen hypervisor, the idea of this patch is to accumulate runstate times
>>>>> to global percpu variables before live migration suspend. Once guest VM is
>>>>> resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
>>>>> runstate times and previously accumulated times stored in global percpu
>>>>> variables.
>>>>>
>>>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>>>> discussed by Michael Las at
>>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
>>>>>
>>>>> which would overflow steal time and lead to 100% st usage in top command
>>>>> for linux 4.8-4.10. A backport of this patch would fix that issue.
>>>>>
>>>>> References:
>>>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>>>>
>>>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>>>>
>>>>> ---
>>>>> Changed since v1:
>>>>> * relocate modification to xen_get_runstate_snapshot_cpu
>>>>>
>>>>> Changed since v2:
>>>>> * accumulate runstate times before live migration
>>>>>
>>>>> Changed since v3:
>>>>> * do not accumulate times in the case of guest checkpointing
>>>>>
>>>>> Changed since v4:
>>>>> * allocate array of vcpu_runstate_info to reduce number of memory allocation
>>>>>
>>>>> ---
>>>>> drivers/xen/manage.c | 2 ++
>>>>> drivers/xen/time.c | 68
>>>>> ++++++++++++++++++++++++++++++++++++++++++--
>>>>> include/xen/interface/vcpu.h | 2 ++
>>>>> include/xen/xen-ops.h | 1 +
>>>>> 4 files changed, 71 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>>>>> index c425d03..3dc085d 100644
>>>>> --- a/drivers/xen/manage.c
>>>>> +++ b/drivers/xen/manage.c
>>>>> @@ -72,6 +72,7 @@ static int xen_suspend(void *data)
>>>>> }
>>>>> gnttab_suspend();
>>>>> + xen_accumulate_runstate_time(-1);
>>>>> xen_arch_pre_suspend();
>>>>> /*
>>>>> @@ -84,6 +85,7 @@ static int xen_suspend(void *data)
>>>>> : 0);
>>>>> xen_arch_post_suspend(si->cancelled);
>>>>> + xen_accumulate_runstate_time(si->cancelled);
>>>>
>>>> I am not convinced that the comment above HYPERVISOR_suspend() is
>>>> correct. The call can return an error code and so if it returns -EPERM
>>>> (which AFAICS it can't now but might in the future) then
>>>> xen_accumulate_runstate_time() will do wrong thing.
>>>
>>> I would split xen_accumulate_runstate_time() into two functions to avoid the
>>> -EPERM issue, as one is for saving and another is for accumulation, respectively.
>>>
>>> Otherwise, can you use xen_accumulate_runstate_time(2) for saving before suspend
>>> and xen_accumulate_runstate_time(si->cancelled) after resume?
>>
>>
>> I'd probably just say something like
>>
>> si->cancelled = HYPERVISOR_suspend() ? 1 : 0;
>
> As the call of HYPERVISOR_suspend() takes 3 lines, I would make it as below and
> I think gcc would optimize it.
>
> - /*
> - * This hypercall returns 1 if suspend was cancelled
> - * or the domain was merely checkpointed, and 0 if it
> - * is resuming in a new domain.
> - */
> si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
> ? virt_to_gfn(xen_start_info)
> : 0);
> + si->cancelled = si->cancelled ? 1 : 0;
Or xen_manage_runstate_time(si->cancelled ? 1 : 0) --- that way you
preserve si->cancelled if anyone cares later (which at the moment nooone
does) Either way I think is fine.
>
>>
>> and keep xen_accumulate_runstate_time() as is (maybe rename it to
>> xen_manage_runstate_time()). And also remove the comment above the hypercall as
>> it is incorrect (but please mention the reason in the commit message)
>>
>>>
>>>>
>>>>
>>>>> gnttab_resume();
>>>>> if (!si->cancelled) {
>>>>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>>>>> index ac5f23f..cf3afb9 100644
>>>>> --- a/drivers/xen/time.c
>>>>> +++ b/drivers/xen/time.c
>>>>> @@ -19,6 +19,9 @@
>>>>> /* runstate info updated by Xen */
>>>>> static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
>>>>> +static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
>>>>> +static struct vcpu_runstate_info *runstate_delta;
>>>>
>>>> I'd move this inside xen_accumulate_runstate_time() since that's the
>>>
>>> If we split xen_accumulate_runstate_time() into two functions, we would leave
>>> runstate_delta as global static.
>>>
>>>> only function that uses it. And why does it need to be
>>>> vcpu_runstate_info and not u64[4]?
>>>
>>> This was suggested by Juergen to avoid the allocation and reclaim of the second
>>> dimensional array as in v4 of this patch?
>>>
>>> Or would you like to allocate sizeof(u64[4]) * num_possible_cpus() and emulate
>>> the 2d array with this 1d array and move the pointer forward sizeof(u64[4]) in
>>> each iteration?
>>
>>
>> I was thinking of
>>
>> u64 **runstate_delta = (u64 **)kmalloc(sizeof(xen_runstate.time) *
>> num_possible_cpus())
>>
>> and then you should be able to access runstate_delta[cpu][RUNSTATE_*].
>
> Would the above code work?
Ugh... If course it will not, it's clearly wrong. So nevermind, sorry.
Keep it the way it was originally.
-boris
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 8:03 [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-30 13:34 ` Boris Ostrovsky
2017-10-30 13:34 ` Boris Ostrovsky
@ 2017-11-01 20:05 ` kbuild test robot
2017-11-01 20:05 ` kbuild test robot
3 siblings, 0 replies; 14+ messages in thread
From: kbuild test robot @ 2017-11-01 20:05 UTC (permalink / raw)
To: Dongli Zhang
Cc: jgross, linux-kernel, kbuild-all, xen-devel, boris.ostrovsky,
joao.m.martins
[-- Attachment #1: Type: text/plain, Size: 3270 bytes --]
Hi Dongli,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on xen-tip/linux-next]
[also build test ERROR on v4.14-rc7 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Dongli-Zhang/xen-time-do-not-decrease-steal-time-after-live-migration-on-xen/20171102-011408
base: https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm64
All error/warnings (new ones prefixed by >>):
drivers//xen/time.c: In function 'xen_accumulate_runstate_time':
>> drivers//xen/time.c:92:20: error: implicit declaration of function 'kcalloc' [-Werror=implicit-function-declaration]
runstate_delta = kcalloc(num_possible_cpus(),
^~~~~~~
>> drivers//xen/time.c:92:18: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
runstate_delta = kcalloc(num_possible_cpus(),
^
>> drivers//xen/time.c:128:3: error: implicit declaration of function 'kfree' [-Werror=implicit-function-declaration]
kfree(runstate_delta);
^~~~~
cc1: some warnings being treated as errors
vim +/kcalloc +92 drivers//xen/time.c
82
83 void xen_accumulate_runstate_time(int action)
84 {
85 struct vcpu_runstate_info state;
86 int cpu, i;
87
88 switch (action) {
89 case -1: /* backup runstate time before suspend */
90 WARN_ON_ONCE(unlikely(runstate_delta));
91
> 92 runstate_delta = kcalloc(num_possible_cpus(),
93 sizeof(*runstate_delta),
94 GFP_KERNEL);
95 if (unlikely(!runstate_delta)) {
96 pr_alert("%s: failed to allocate runstate_delta\n",
97 __func__);
98 return;
99 }
100
101 for_each_possible_cpu(cpu) {
102 xen_get_runstate_snapshot_cpu_delta(&state, cpu);
103 memcpy(runstate_delta[cpu].time, state.time,
104 RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
105 }
106
107 break;
108
109 case 0: /* backup runstate time after resume */
110 if (unlikely(!runstate_delta)) {
111 pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
112 __func__);
113 return;
114 }
115
116 for_each_possible_cpu(cpu) {
117 for (i = 0; i < RUNSTATE_max; i++)
118 per_cpu(old_runstate_time, cpu)[i] +=
119 runstate_delta[cpu].time[i];
120 }
121 break;
122
123 default: /* do not accumulate runstate time for checkpointing */
124 break;
125 }
126
127 if (action != -1 && runstate_delta) {
> 128 kfree(runstate_delta);
129 runstate_delta = NULL;
130 }
131 }
132
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 36753 bytes --]
[-- Attachment #3: Type: text/plain, Size: 127 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
2017-10-30 8:03 [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
` (2 preceding siblings ...)
2017-11-01 20:05 ` kbuild test robot
@ 2017-11-01 20:05 ` kbuild test robot
3 siblings, 0 replies; 14+ messages in thread
From: kbuild test robot @ 2017-11-01 20:05 UTC (permalink / raw)
To: Dongli Zhang
Cc: kbuild-all, xen-devel, linux-kernel, boris.ostrovsky, jgross,
joao.m.martins
[-- Attachment #1: Type: text/plain, Size: 3270 bytes --]
Hi Dongli,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on xen-tip/linux-next]
[also build test ERROR on v4.14-rc7 next-20171018]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Dongli-Zhang/xen-time-do-not-decrease-steal-time-after-live-migration-on-xen/20171102-011408
base: https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next
config: arm64-defconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm64
All error/warnings (new ones prefixed by >>):
drivers//xen/time.c: In function 'xen_accumulate_runstate_time':
>> drivers//xen/time.c:92:20: error: implicit declaration of function 'kcalloc' [-Werror=implicit-function-declaration]
runstate_delta = kcalloc(num_possible_cpus(),
^~~~~~~
>> drivers//xen/time.c:92:18: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
runstate_delta = kcalloc(num_possible_cpus(),
^
>> drivers//xen/time.c:128:3: error: implicit declaration of function 'kfree' [-Werror=implicit-function-declaration]
kfree(runstate_delta);
^~~~~
cc1: some warnings being treated as errors
vim +/kcalloc +92 drivers//xen/time.c
82
83 void xen_accumulate_runstate_time(int action)
84 {
85 struct vcpu_runstate_info state;
86 int cpu, i;
87
88 switch (action) {
89 case -1: /* backup runstate time before suspend */
90 WARN_ON_ONCE(unlikely(runstate_delta));
91
> 92 runstate_delta = kcalloc(num_possible_cpus(),
93 sizeof(*runstate_delta),
94 GFP_KERNEL);
95 if (unlikely(!runstate_delta)) {
96 pr_alert("%s: failed to allocate runstate_delta\n",
97 __func__);
98 return;
99 }
100
101 for_each_possible_cpu(cpu) {
102 xen_get_runstate_snapshot_cpu_delta(&state, cpu);
103 memcpy(runstate_delta[cpu].time, state.time,
104 RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
105 }
106
107 break;
108
109 case 0: /* backup runstate time after resume */
110 if (unlikely(!runstate_delta)) {
111 pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
112 __func__);
113 return;
114 }
115
116 for_each_possible_cpu(cpu) {
117 for (i = 0; i < RUNSTATE_max; i++)
118 per_cpu(old_runstate_time, cpu)[i] +=
119 runstate_delta[cpu].time[i];
120 }
121 break;
122
123 default: /* do not accumulate runstate time for checkpointing */
124 break;
125 }
126
127 if (action != -1 && runstate_delta) {
> 128 kfree(runstate_delta);
129 runstate_delta = NULL;
130 }
131 }
132
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 36753 bytes --]
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen
@ 2017-10-30 8:03 Dongli Zhang
0 siblings, 0 replies; 14+ messages in thread
From: Dongli Zhang @ 2017-10-30 8:03 UTC (permalink / raw)
To: xen-devel, linux-kernel; +Cc: jgross, boris.ostrovsky, joao.m.martins
After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
xen_steal_lock() might be less than this_rq()->prev_steal_time which is
derived from previous return value of xen_steal_clock().
For instance, steal time of each vcpu is 335 before live migration.
cpu 198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0
After live migration, steal time is reduced to 312.
cpu 200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0
Since runstate times are cumulative and cleared during xen live migration
by xen hypervisor, the idea of this patch is to accumulate runstate times
to global percpu variables before live migration suspend. Once guest VM is
resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
runstate times and previously accumulated times stored in global percpu
variables.
Similar and more severe issue would impact prior linux 4.8-4.10 as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
which would overflow steal time and lead to 100% st usage in top command
for linux 4.8-4.10. A backport of this patch would fix that issue.
References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
Changed since v1:
* relocate modification to xen_get_runstate_snapshot_cpu
Changed since v2:
* accumulate runstate times before live migration
Changed since v3:
* do not accumulate times in the case of guest checkpointing
Changed since v4:
* allocate array of vcpu_runstate_info to reduce number of memory allocation
---
drivers/xen/manage.c | 2 ++
drivers/xen/time.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
include/xen/interface/vcpu.h | 2 ++
include/xen/xen-ops.h | 1 +
4 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c425d03..3dc085d 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -72,6 +72,7 @@ static int xen_suspend(void *data)
}
gnttab_suspend();
+ xen_accumulate_runstate_time(-1);
xen_arch_pre_suspend();
/*
@@ -84,6 +85,7 @@ static int xen_suspend(void *data)
: 0);
xen_arch_post_suspend(si->cancelled);
+ xen_accumulate_runstate_time(si->cancelled);
gnttab_resume();
if (!si->cancelled) {
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index ac5f23f..cf3afb9 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -19,6 +19,9 @@
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
+static DEFINE_PER_CPU(u64[RUNSTATE_max], old_runstate_time);
+static struct vcpu_runstate_info *runstate_delta;
+
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
@@ -47,8 +50,8 @@ static u64 get64(const u64 *p)
return ret;
}
-static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
- unsigned int cpu)
+static void xen_get_runstate_snapshot_cpu_delta(
+ struct vcpu_runstate_info *res, unsigned int cpu)
{
u64 state_time;
struct vcpu_runstate_info *state;
@@ -66,6 +69,67 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
(state_time & XEN_RUNSTATE_UPDATE));
}
+static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res,
+ unsigned int cpu)
+{
+ int i;
+
+ xen_get_runstate_snapshot_cpu_delta(res, cpu);
+
+ for (i = 0; i < RUNSTATE_max; i++)
+ res->time[i] += per_cpu(old_runstate_time, cpu)[i];
+}
+
+void xen_accumulate_runstate_time(int action)
+{
+ struct vcpu_runstate_info state;
+ int cpu, i;
+
+ switch (action) {
+ case -1: /* backup runstate time before suspend */
+ WARN_ON_ONCE(unlikely(runstate_delta));
+
+ runstate_delta = kcalloc(num_possible_cpus(),
+ sizeof(*runstate_delta),
+ GFP_KERNEL);
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: failed to allocate runstate_delta\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ xen_get_runstate_snapshot_cpu_delta(&state, cpu);
+ memcpy(runstate_delta[cpu].time, state.time,
+ RUNSTATE_max * sizeof(*runstate_delta[cpu].time));
+ }
+
+ break;
+
+ case 0: /* backup runstate time after resume */
+ if (unlikely(!runstate_delta)) {
+ pr_alert("%s: cannot accumulate runstate time as runstate_delta is NULL\n",
+ __func__);
+ return;
+ }
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RUNSTATE_max; i++)
+ per_cpu(old_runstate_time, cpu)[i] +=
+ runstate_delta[cpu].time[i];
+ }
+ break;
+
+ default: /* do not accumulate runstate time for checkpointing */
+ break;
+ }
+
+ if (action != -1 && runstate_delta) {
+ kfree(runstate_delta);
+ runstate_delta = NULL;
+ }
+}
+
/*
* Runstate accounting
*/
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index 98188c8..85e81ce 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -110,6 +110,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
*/
#define RUNSTATE_offline 3
+#define RUNSTATE_max 4
+
/*
* Register a shared memory area from which the guest may obtain its own
* runstate information without needing to execute a hypercall.
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 218e6aa..b1f9ae9 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
bool xen_vcpu_stolen(int vcpu);
void xen_setup_runstate_info(int cpu);
void xen_time_setup_guest(void);
+void xen_accumulate_runstate_time(int action);
void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
u64 xen_steal_clock(int cpu);
--
2.7.4
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply related [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-11-01 20:06 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-30 8:03 [PATCH v5 1/1] xen/time: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-30 13:34 ` Boris Ostrovsky
2017-10-30 13:34 ` Boris Ostrovsky
2017-10-31 0:14 ` Dongli Zhang
2017-10-31 0:14 ` [Xen-devel] " Dongli Zhang
2017-10-31 0:58 ` Boris Ostrovsky
2017-10-31 0:58 ` Boris Ostrovsky
2017-10-31 3:13 ` Dongli Zhang
2017-10-31 3:13 ` [Xen-devel] " Dongli Zhang
2017-10-31 11:05 ` Boris Ostrovsky
2017-10-31 11:05 ` Boris Ostrovsky
2017-11-01 20:05 ` kbuild test robot
2017-11-01 20:05 ` kbuild test robot
2017-10-30 8:03 Dongli Zhang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.