* [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
@ 2017-10-10 9:14 Dongli Zhang
2017-10-10 10:07 ` Jan Beulich
` (5 more replies)
0 siblings, 6 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-10 9:14 UTC (permalink / raw)
To: linux-kernel, xen-devel
Cc: mingo, peterz, dario.faggioli, bevan, xen.list, joao.m.martins
After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
For instance, steal time of each vcpu is 335 before live migration.
cpu 198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0
After live migration, steal time is reduced to 312.
cpu 200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0
The code in this patch is borrowed from do_stolen_accounting() which has
already been removed from linux source code since commit ecb23dc6 ("xen:
add steal_clock support on x86").
Similar and more severe issue would impact prior linux 4.8-4.10 as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
Unlike the issue discussed by Michael Las which would overflow steal time
and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
linux 4.11+ would only decrease but not overflow steal time after live
migration.
References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
kernel/sched/cputime.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf..57d09cab 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal;
+ u64 steal, steal_time;
+ s64 steal_delta;
+
+ steal_time = paravirt_steal_clock(smp_processor_id());
+ steal = steal_delta = steal_time - this_rq()->prev_steal_time;
+
+ if (unlikely(steal_delta < 0)) {
+ this_rq()->prev_steal_time = steal_time;
+ return 0;
+ }
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
steal = min(steal, maxtime);
account_steal_time(steal);
this_rq()->prev_steal_time += steal;
--
2.7.4
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [Xen-devel] [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-10 10:07 ` Jan Beulich
@ 2017-10-10 10:07 ` Jan Beulich
2017-10-10 10:59 ` Ingo Molnar
` (3 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Jan Beulich @ 2017-10-10 10:07 UTC (permalink / raw)
To: Dongli Zhang
Cc: bevan, dario.faggioli, xen.list, peterz, xen-devel,
joao.m.martins, mingo, linux-kernel
>>> On 10.10.17 at 11:14, <dongli.zhang@oracle.com> wrote:
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> {
> #ifdef CONFIG_PARAVIRT
> if (static_key_false(&paravirt_steal_enabled)) {
> - u64 steal;
> + u64 steal, steal_time;
> + s64 steal_delta;
> +
> + steal_time = paravirt_steal_clock(smp_processor_id());
> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> +
> + if (unlikely(steal_delta < 0)) {
> + this_rq()->prev_steal_time = steal_time;
> + return 0;
> + }
>
> - steal = paravirt_steal_clock(smp_processor_id());
> - steal -= this_rq()->prev_steal_time;
> steal = min(steal, maxtime);
> account_steal_time(steal);
> this_rq()->prev_steal_time += steal;
While I can see this making the issue less pronounced, I don't see
how it fully addresses it: Why would only a negative delta represent
a discontinuity? In our old XenoLinux derived kernel we had the
change below (unlikely to be upstreamable as is, so just to give you
an idea).
Jan
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -112,6 +112,47 @@ static inline void task_group_account_fi
cpuacct_account_field(p, index, tmp);
}
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define _cputime_adjust(t) (t)
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 _cputime_adjust(u64 t)
+{
+ u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+ unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+ + __this_cpu_read(steal_residual),
+ NS_PER_TICK,
+ this_cpu_ptr(&steal_residual));
+
+ __this_cpu_write(steal_snapshot, s);
+ if (t < jiffies_to_nsecs(adj))
+ return 0;
+
+ return t - jiffies_to_nsecs(adj);
+}
+
+static void steal_resume(void)
+{
+ _cputime_adjust((1ULL << 63) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+ .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+ register_syscore_ops(&steal_syscore_ops);
+ return 0;
+}
+core_initcall(steal_register);
+#endif
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -128,7 +169,7 @@ void account_user_time(struct task_struc
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
- task_group_account_field(p, index, cputime);
+ task_group_account_field(p, index, _cputime_adjust(cputime));
/* Account for user time used */
acct_account_cputime(p);
@@ -172,7 +213,7 @@ void account_system_index_time(struct ta
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- task_group_account_field(p, index, cputime);
+ task_group_account_field(p, index, _cputime_adjust(cputime));
/* Account for system time used */
acct_account_cputime(p);
@@ -224,9 +265,9 @@ void account_idle_time(u64 cputime)
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += cputime;
+ cpustat[CPUTIME_IOWAIT] += _cputime_adjust(cputime);
else
- cpustat[CPUTIME_IDLE] += cputime;
+ cpustat[CPUTIME_IDLE] += _cputime_adjust(cputime);
}
/*
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
@ 2017-10-10 10:07 ` Jan Beulich
2017-10-10 10:07 ` [Xen-devel] " Jan Beulich
` (4 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Jan Beulich @ 2017-10-10 10:07 UTC (permalink / raw)
To: Dongli Zhang
Cc: xen.list, peterz, dario.faggioli, bevan, linux-kernel, xen-devel,
mingo, joao.m.martins
>>> On 10.10.17 at 11:14, <dongli.zhang@oracle.com> wrote:
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> {
> #ifdef CONFIG_PARAVIRT
> if (static_key_false(&paravirt_steal_enabled)) {
> - u64 steal;
> + u64 steal, steal_time;
> + s64 steal_delta;
> +
> + steal_time = paravirt_steal_clock(smp_processor_id());
> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> +
> + if (unlikely(steal_delta < 0)) {
> + this_rq()->prev_steal_time = steal_time;
> + return 0;
> + }
>
> - steal = paravirt_steal_clock(smp_processor_id());
> - steal -= this_rq()->prev_steal_time;
> steal = min(steal, maxtime);
> account_steal_time(steal);
> this_rq()->prev_steal_time += steal;
While I can see this making the issue less pronounced, I don't see
how it fully addresses it: Why would only a negative delta represent
a discontinuity? In our old XenoLinux derived kernel we had the
change below (unlikely to be upstreamable as is, so just to give you
an idea).
Jan
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -112,6 +112,47 @@ static inline void task_group_account_fi
cpuacct_account_field(p, index, tmp);
}
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define _cputime_adjust(t) (t)
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+static u64 _cputime_adjust(u64 t)
+{
+ u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+ unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+ + __this_cpu_read(steal_residual),
+ NS_PER_TICK,
+ this_cpu_ptr(&steal_residual));
+
+ __this_cpu_write(steal_snapshot, s);
+ if (t < jiffies_to_nsecs(adj))
+ return 0;
+
+ return t - jiffies_to_nsecs(adj);
+}
+
+static void steal_resume(void)
+{
+ _cputime_adjust((1ULL << 63) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+ .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+ register_syscore_ops(&steal_syscore_ops);
+ return 0;
+}
+core_initcall(steal_register);
+#endif
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -128,7 +169,7 @@ void account_user_time(struct task_struc
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
- task_group_account_field(p, index, cputime);
+ task_group_account_field(p, index, _cputime_adjust(cputime));
/* Account for user time used */
acct_account_cputime(p);
@@ -172,7 +213,7 @@ void account_system_index_time(struct ta
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- task_group_account_field(p, index, cputime);
+ task_group_account_field(p, index, _cputime_adjust(cputime));
/* Account for system time used */
acct_account_cputime(p);
@@ -224,9 +265,9 @@ void account_idle_time(u64 cputime)
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += cputime;
+ cpustat[CPUTIME_IOWAIT] += _cputime_adjust(cputime);
else
- cpustat[CPUTIME_IDLE] += cputime;
+ cpustat[CPUTIME_IDLE] += _cputime_adjust(cputime);
}
/*
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
` (2 preceding siblings ...)
2017-10-10 10:59 ` Ingo Molnar
@ 2017-10-10 10:59 ` Ingo Molnar
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 11:58 ` Peter Zijlstra
2017-10-10 11:58 ` Peter Zijlstra
5 siblings, 2 replies; 18+ messages in thread
From: Ingo Molnar @ 2017-10-10 10:59 UTC (permalink / raw)
To: Dongli Zhang, Wanpeng Li, Rik van Riel, Xiaolong Ye,
Frederic Weisbecker, Stanislaw Gruszka
Cc: linux-kernel, xen-devel, mingo, peterz, dario.faggioli, bevan,
xen.list, joao.m.martins
(Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
below.)
* Dongli Zhang <dongli.zhang@oracle.com> wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu 198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu 200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> The code in this patch is borrowed from do_stolen_accounting() which has
> already been removed from linux source code since commit ecb23dc6 ("xen:
> add steal_clock support on x86").
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
> Unlike the issue discussed by Michael Las which would overflow steal time
> and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
> linux 4.11+ would only decrease but not overflow steal time after live
> migration.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> ---
> kernel/sched/cputime.c | 13 ++++++++++---
> 1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 14d2dbf..57d09cab 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> {
> #ifdef CONFIG_PARAVIRT
> if (static_key_false(&paravirt_steal_enabled)) {
> - u64 steal;
> + u64 steal, steal_time;
> + s64 steal_delta;
> +
> + steal_time = paravirt_steal_clock(smp_processor_id());
> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> +
> + if (unlikely(steal_delta < 0)) {
> + this_rq()->prev_steal_time = steal_time;
> + return 0;
> + }
>
> - steal = paravirt_steal_clock(smp_processor_id());
> - steal -= this_rq()->prev_steal_time;
> steal = min(steal, maxtime);
> account_steal_time(steal);
> this_rq()->prev_steal_time += steal;
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-10 10:07 ` Jan Beulich
2017-10-10 10:07 ` [Xen-devel] " Jan Beulich
@ 2017-10-10 10:59 ` Ingo Molnar
2017-10-10 10:59 ` Ingo Molnar
` (2 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Ingo Molnar @ 2017-10-10 10:59 UTC (permalink / raw)
To: Dongli Zhang, Wanpeng Li, Rik van Riel, Xiaolong Ye,
Frederic Weisbecker, Stanislaw Gruszka
Cc: xen.list, peterz, dario.faggioli, bevan, linux-kernel, xen-devel,
mingo, joao.m.martins
(Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
below.)
* Dongli Zhang <dongli.zhang@oracle.com> wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
>
> For instance, steal time of each vcpu is 335 before live migration.
>
> cpu 198 0 368 200064 1962 0 0 1340 0 0
> cpu0 38 0 81 50063 492 0 0 335 0 0
> cpu1 65 0 97 49763 634 0 0 335 0 0
> cpu2 38 0 81 50098 462 0 0 335 0 0
> cpu3 56 0 107 50138 374 0 0 335 0 0
>
> After live migration, steal time is reduced to 312.
>
> cpu 200 0 370 200330 1971 0 0 1248 0 0
> cpu0 38 0 82 50123 500 0 0 312 0 0
> cpu1 65 0 97 49832 634 0 0 312 0 0
> cpu2 39 0 82 50167 462 0 0 312 0 0
> cpu3 56 0 107 50207 374 0 0 312 0 0
>
> The code in this patch is borrowed from do_stolen_accounting() which has
> already been removed from linux source code since commit ecb23dc6 ("xen:
> add steal_clock support on x86").
>
> Similar and more severe issue would impact prior linux 4.8-4.10 as
> discussed by Michael Las at
> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
> Unlike the issue discussed by Michael Las which would overflow steal time
> and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
> linux 4.11+ would only decrease but not overflow steal time after live
> migration.
>
> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> ---
> kernel/sched/cputime.c | 13 ++++++++++---
> 1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 14d2dbf..57d09cab 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> {
> #ifdef CONFIG_PARAVIRT
> if (static_key_false(&paravirt_steal_enabled)) {
> - u64 steal;
> + u64 steal, steal_time;
> + s64 steal_delta;
> +
> + steal_time = paravirt_steal_clock(smp_processor_id());
> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> +
> + if (unlikely(steal_delta < 0)) {
> + this_rq()->prev_steal_time = steal_time;
> + return 0;
> + }
>
> - steal = paravirt_steal_clock(smp_processor_id());
> - steal -= this_rq()->prev_steal_time;
> steal = min(steal, maxtime);
> account_steal_time(steal);
> this_rq()->prev_steal_time += steal;
> --
> 2.7.4
>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
` (3 preceding siblings ...)
2017-10-10 10:59 ` Ingo Molnar
@ 2017-10-10 11:58 ` Peter Zijlstra
2017-10-10 11:58 ` Peter Zijlstra
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2017-10-10 11:58 UTC (permalink / raw)
To: Dongli Zhang
Cc: linux-kernel, xen-devel, mingo, dario.faggioli, bevan, xen.list,
joao.m.martins
On Tue, Oct 10, 2017 at 05:14:08PM +0800, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
So why not fix paravirt_steal_clock() to not be broken?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
` (4 preceding siblings ...)
2017-10-10 11:58 ` Peter Zijlstra
@ 2017-10-10 11:58 ` Peter Zijlstra
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2017-10-10 11:58 UTC (permalink / raw)
To: Dongli Zhang
Cc: xen.list, dario.faggioli, bevan, linux-kernel, xen-devel, mingo,
joao.m.martins
On Tue, Oct 10, 2017 at 05:14:08PM +0800, Dongli Zhang wrote:
> After guest live migration on xen, steal time in /proc/stat
> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
So why not fix paravirt_steal_clock() to not be broken?
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 10:59 ` Ingo Molnar
2017-10-10 12:42 ` Stanislaw Gruszka
@ 2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:48 ` Peter Zijlstra
` (3 more replies)
1 sibling, 4 replies; 18+ messages in thread
From: Stanislaw Gruszka @ 2017-10-10 12:42 UTC (permalink / raw)
To: Ingo Molnar
Cc: Dongli Zhang, Wanpeng Li, Rik van Riel, Xiaolong Ye,
Frederic Weisbecker, linux-kernel, xen-devel, mingo, peterz,
dario.faggioli, bevan, xen.list, joao.m.martins
On Tue, Oct 10, 2017 at 12:59:26PM +0200, Ingo Molnar wrote:
>
> (Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
> below.)
>
> * Dongli Zhang <dongli.zhang@oracle.com> wrote:
>
> > After guest live migration on xen, steal time in /proc/stat
> > (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> > paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
> >
> > For instance, steal time of each vcpu is 335 before live migration.
> >
> > cpu 198 0 368 200064 1962 0 0 1340 0 0
> > cpu0 38 0 81 50063 492 0 0 335 0 0
> > cpu1 65 0 97 49763 634 0 0 335 0 0
> > cpu2 38 0 81 50098 462 0 0 335 0 0
> > cpu3 56 0 107 50138 374 0 0 335 0 0
> >
> > After live migration, steal time is reduced to 312.
> >
> > cpu 200 0 370 200330 1971 0 0 1248 0 0
> > cpu0 38 0 82 50123 500 0 0 312 0 0
> > cpu1 65 0 97 49832 634 0 0 312 0 0
> > cpu2 39 0 82 50167 462 0 0 312 0 0
> > cpu3 56 0 107 50207 374 0 0 312 0 0
> >
> > The code in this patch is borrowed from do_stolen_accounting() which has
> > already been removed from linux source code since commit ecb23dc6 ("xen:
> > add steal_clock support on x86").
> >
> > Similar and more severe issue would impact prior linux 4.8-4.10 as
> > discussed by Michael Las at
> > https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
> > Unlike the issue discussed by Michael Las which would overflow steal time
> > and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
> > linux 4.11+ would only decrease but not overflow steal time after live
> > migration.
> >
> > References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> > Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> > ---
> > kernel/sched/cputime.c | 13 ++++++++++---
> > 1 file changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> > index 14d2dbf..57d09cab 100644
> > --- a/kernel/sched/cputime.c
> > +++ b/kernel/sched/cputime.c
> > @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> > {
> > #ifdef CONFIG_PARAVIRT
> > if (static_key_false(&paravirt_steal_enabled)) {
> > - u64 steal;
> > + u64 steal, steal_time;
> > + s64 steal_delta;
> > +
> > + steal_time = paravirt_steal_clock(smp_processor_id());
> > + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> > +
> > + if (unlikely(steal_delta < 0)) {
> > + this_rq()->prev_steal_time = steal_time;
I don't think setting prev_steal_time to smaller value is right
thing to do.
Beside, I don't think we need to check for overflow condition for
cputime variables (it will happen after 279 years :-). So instead
of introducing signed steal_delta variable I would just add
below check, which should be sufficient to fix the problem:
if (unlikely(steal <= this_rq()->prev_steal_time))
return 0;
Thanks
Stanislaw
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 10:59 ` Ingo Molnar
@ 2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:42 ` Stanislaw Gruszka
1 sibling, 0 replies; 18+ messages in thread
From: Stanislaw Gruszka @ 2017-10-10 12:42 UTC (permalink / raw)
To: Ingo Molnar
Cc: xen.list, Rik van Riel, peterz, Dongli Zhang, dario.faggioli,
bevan, linux-kernel, Xiaolong Ye, mingo, Frederic Weisbecker,
joao.m.martins, xen-devel, Wanpeng Li
On Tue, Oct 10, 2017 at 12:59:26PM +0200, Ingo Molnar wrote:
>
> (Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
> below.)
>
> * Dongli Zhang <dongli.zhang@oracle.com> wrote:
>
> > After guest live migration on xen, steal time in /proc/stat
> > (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
> > paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
> >
> > For instance, steal time of each vcpu is 335 before live migration.
> >
> > cpu 198 0 368 200064 1962 0 0 1340 0 0
> > cpu0 38 0 81 50063 492 0 0 335 0 0
> > cpu1 65 0 97 49763 634 0 0 335 0 0
> > cpu2 38 0 81 50098 462 0 0 335 0 0
> > cpu3 56 0 107 50138 374 0 0 335 0 0
> >
> > After live migration, steal time is reduced to 312.
> >
> > cpu 200 0 370 200330 1971 0 0 1248 0 0
> > cpu0 38 0 82 50123 500 0 0 312 0 0
> > cpu1 65 0 97 49832 634 0 0 312 0 0
> > cpu2 39 0 82 50167 462 0 0 312 0 0
> > cpu3 56 0 107 50207 374 0 0 312 0 0
> >
> > The code in this patch is borrowed from do_stolen_accounting() which has
> > already been removed from linux source code since commit ecb23dc6 ("xen:
> > add steal_clock support on x86").
> >
> > Similar and more severe issue would impact prior linux 4.8-4.10 as
> > discussed by Michael Las at
> > https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
> > Unlike the issue discussed by Michael Las which would overflow steal time
> > and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
> > linux 4.11+ would only decrease but not overflow steal time after live
> > migration.
> >
> > References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
> > Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> > ---
> > kernel/sched/cputime.c | 13 ++++++++++---
> > 1 file changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> > index 14d2dbf..57d09cab 100644
> > --- a/kernel/sched/cputime.c
> > +++ b/kernel/sched/cputime.c
> > @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
> > {
> > #ifdef CONFIG_PARAVIRT
> > if (static_key_false(&paravirt_steal_enabled)) {
> > - u64 steal;
> > + u64 steal, steal_time;
> > + s64 steal_delta;
> > +
> > + steal_time = paravirt_steal_clock(smp_processor_id());
> > + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> > +
> > + if (unlikely(steal_delta < 0)) {
> > + this_rq()->prev_steal_time = steal_time;
I don't think setting prev_steal_time to smaller value is right
thing to do.
Beside, I don't think we need to check for overflow condition for
cputime variables (it will happen after 279 years :-). So instead
of introducing signed steal_delta variable I would just add
below check, which should be sufficient to fix the problem:
if (unlikely(steal <= this_rq()->prev_steal_time))
return 0;
Thanks
Stanislaw
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:48 ` Peter Zijlstra
@ 2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 14:01 ` Rik van Riel
2017-10-10 14:01 ` Rik van Riel
2017-10-11 7:29 ` Dongli Zhang
2017-10-11 7:29 ` Dongli Zhang
3 siblings, 2 replies; 18+ messages in thread
From: Peter Zijlstra @ 2017-10-10 12:48 UTC (permalink / raw)
To: Stanislaw Gruszka
Cc: Ingo Molnar, Dongli Zhang, Wanpeng Li, Rik van Riel, Xiaolong Ye,
Frederic Weisbecker, linux-kernel, xen-devel, mingo,
dario.faggioli, bevan, xen.list, joao.m.martins
On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
> > > + u64 steal, steal_time;
> > > + s64 steal_delta;
> > > +
> > > + steal_time = paravirt_steal_clock(smp_processor_id());
> > > + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> > > +
> > > + if (unlikely(steal_delta < 0)) {
> > > + this_rq()->prev_steal_time = steal_time;
>
> I don't think setting prev_steal_time to smaller value is right
> thing to do.
>
> Beside, I don't think we need to check for overflow condition for
> cputime variables (it will happen after 279 years :-). So instead
> of introducing signed steal_delta variable I would just add
> below check, which should be sufficient to fix the problem:
>
> if (unlikely(steal <= this_rq()->prev_steal_time))
> return 0;
How about you just fix up paravirt_steal_time() on migration and not
muck with the users ?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:42 ` Stanislaw Gruszka
@ 2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 12:48 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2017-10-10 12:48 UTC (permalink / raw)
To: Stanislaw Gruszka
Cc: xen.list, Rik van Riel, Dongli Zhang, dario.faggioli, bevan,
Xiaolong Ye, linux-kernel, mingo, Frederic Weisbecker,
joao.m.martins, xen-devel, Ingo Molnar, Wanpeng Li
On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
> > > + u64 steal, steal_time;
> > > + s64 steal_delta;
> > > +
> > > + steal_time = paravirt_steal_clock(smp_processor_id());
> > > + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
> > > +
> > > + if (unlikely(steal_delta < 0)) {
> > > + this_rq()->prev_steal_time = steal_time;
>
> I don't think setting prev_steal_time to smaller value is right
> thing to do.
>
> Beside, I don't think we need to check for overflow condition for
> cputime variables (it will happen after 279 years :-). So instead
> of introducing signed steal_delta variable I would just add
> below check, which should be sufficient to fix the problem:
>
> if (unlikely(steal <= this_rq()->prev_steal_time))
> return 0;
How about you just fix up paravirt_steal_time() on migration and not
muck with the users ?
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 14:01 ` Rik van Riel
@ 2017-10-10 14:01 ` Rik van Riel
2017-10-11 7:47 ` Dongli Zhang
2017-10-11 7:47 ` Dongli Zhang
1 sibling, 2 replies; 18+ messages in thread
From: Rik van Riel @ 2017-10-10 14:01 UTC (permalink / raw)
To: Peter Zijlstra, Stanislaw Gruszka
Cc: Ingo Molnar, Dongli Zhang, Wanpeng Li, Xiaolong Ye,
Frederic Weisbecker, linux-kernel, xen-devel, mingo,
dario.faggioli, bevan, xen.list, joao.m.martins
On Tue, 2017-10-10 at 14:48 +0200, Peter Zijlstra wrote:
> On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
> > > > + u64 steal, steal_time;
> > > > + s64 steal_delta;
> > > > +
> > > > + steal_time =
> > > > paravirt_steal_clock(smp_processor_id());
> > > > + steal = steal_delta = steal_time - this_rq()-
> > > > >prev_steal_time;
> > > > +
> > > > + if (unlikely(steal_delta < 0)) {
> > > > + this_rq()->prev_steal_time =
> > > > steal_time;
> >
> > I don't think setting prev_steal_time to smaller value is right
> > thing to do.
> >
> > Beside, I don't think we need to check for overflow condition for
> > cputime variables (it will happen after 279 years :-). So instead
> > of introducing signed steal_delta variable I would just add
> > below check, which should be sufficient to fix the problem:
> >
> > if (unlikely(steal <= this_rq()->prev_steal_time))
> > return 0;
>
> How about you just fix up paravirt_steal_time() on migration and not
> muck with the users ?
Not just migration, either. CPU hotplug is another time to fix up
the steal time.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:48 ` Peter Zijlstra
@ 2017-10-10 14:01 ` Rik van Riel
2017-10-10 14:01 ` Rik van Riel
1 sibling, 0 replies; 18+ messages in thread
From: Rik van Riel @ 2017-10-10 14:01 UTC (permalink / raw)
To: Peter Zijlstra, Stanislaw Gruszka
Cc: xen.list, Dongli Zhang, dario.faggioli, bevan, linux-kernel,
Xiaolong Ye, mingo, Frederic Weisbecker, joao.m.martins,
xen-devel, Ingo Molnar, Wanpeng Li
On Tue, 2017-10-10 at 14:48 +0200, Peter Zijlstra wrote:
> On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
> > > > + u64 steal, steal_time;
> > > > + s64 steal_delta;
> > > > +
> > > > + steal_time =
> > > > paravirt_steal_clock(smp_processor_id());
> > > > + steal = steal_delta = steal_time - this_rq()-
> > > > >prev_steal_time;
> > > > +
> > > > + if (unlikely(steal_delta < 0)) {
> > > > + this_rq()->prev_steal_time =
> > > > steal_time;
> >
> > I don't think setting prev_steal_time to smaller value is right
> > thing to do.
> >
> > Beside, I don't think we need to check for overflow condition for
> > cputime variables (it will happen after 279 years :-). So instead
> > of introducing signed steal_delta variable I would just add
> > below check, which should be sufficient to fix the problem:
> >
> > if (unlikely(steal <= this_rq()->prev_steal_time))
> > return 0;
>
> How about you just fix up paravirt_steal_time() on migration and not
> muck with the users ?
Not just migration, either. CPU hotplug is another time to fix up
the steal time.
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 12:48 ` Peter Zijlstra
@ 2017-10-11 7:29 ` Dongli Zhang
2017-10-11 7:29 ` Dongli Zhang
3 siblings, 0 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-11 7:29 UTC (permalink / raw)
To: Stanislaw Gruszka, Ingo Molnar
Cc: Wanpeng Li, Rik van Riel, Xiaolong Ye, Frederic Weisbecker,
linux-kernel, xen-devel, mingo, peterz, dario.faggioli, bevan,
xen.list, joao.m.martins
Hi Stanislaw and Peter,
On 10/10/2017 08:42 PM, Stanislaw Gruszka wrote:
> On Tue, Oct 10, 2017 at 12:59:26PM +0200, Ingo Molnar wrote:
>>
>> (Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
>> below.)
>>
>> * Dongli Zhang <dongli.zhang@oracle.com> wrote:
>>
>>> After guest live migration on xen, steal time in /proc/stat
>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
>>>
>>> For instance, steal time of each vcpu is 335 before live migration.
>>>
>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>
>>> After live migration, steal time is reduced to 312.
>>>
>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>
>>> The code in this patch is borrowed from do_stolen_accounting() which has
>>> already been removed from linux source code since commit ecb23dc6 ("xen:
>>> add steal_clock support on x86").
>>>
>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>> discussed by Michael Las at
>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
>>> Unlike the issue discussed by Michael Las which would overflow steal time
>>> and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
>>> linux 4.11+ would only decrease but not overflow steal time after live
>>> migration.
>>>
>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>> ---
>>> kernel/sched/cputime.c | 13 ++++++++++---
>>> 1 file changed, 10 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
>>> index 14d2dbf..57d09cab 100644
>>> --- a/kernel/sched/cputime.c
>>> +++ b/kernel/sched/cputime.c
>>> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
>>> {
>>> #ifdef CONFIG_PARAVIRT
>>> if (static_key_false(¶virt_steal_enabled)) {
>>> - u64 steal;
>>> + u64 steal, steal_time;
>>> + s64 steal_delta;
>>> +
>>> + steal_time = paravirt_steal_clock(smp_processor_id());
>>> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
>>> +
>>> + if (unlikely(steal_delta < 0)) {
>>> + this_rq()->prev_steal_time = steal_time;
>
> I don't think setting prev_steal_time to smaller value is right
> thing to do.
If we do not set prev_steal_time to the smaller steal value (obtained from
paravirt_steal_clock()), it will take a while for the new steal value
to catch up with this_rq()->prev_steal_time, and cpustat[CPUTIME_STEAL] will
stay unchanged until steal exceeds this_rq()->prev_steal_time again. Do you
think that is acceptable?
If it is fine, I will try to limit the fix to xen-specific code in
drivers/xen/time.c so that we do not taint kernel/sched/cputime.c, as Peter
has asked why not just fix up paravirt_steal_time() on migration.
Thank you very much!
Dongli Zhang
>
> Beside, I don't think we need to check for overflow condition for
> cputime variables (it will happen after 279 years :-). So instead
> of introducing signed steal_delta variable I would just add
> below check, which should be sufficient to fix the problem:
>
> if (unlikely(steal <= this_rq()->prev_steal_time))
> return 0;
>
> Thanks
> Stanislaw
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 12:42 ` Stanislaw Gruszka
` (2 preceding siblings ...)
2017-10-11 7:29 ` Dongli Zhang
@ 2017-10-11 7:29 ` Dongli Zhang
3 siblings, 0 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-11 7:29 UTC (permalink / raw)
To: Stanislaw Gruszka, Ingo Molnar
Cc: xen.list, Rik van Riel, linux-kernel, peterz,
Frederic Weisbecker, dario.faggioli, bevan, Xiaolong Ye,
xen-devel, mingo, joao.m.martins, Wanpeng Li
Hi Stanislaw and Peter,
On 10/10/2017 08:42 PM, Stanislaw Gruszka wrote:
> On Tue, Oct 10, 2017 at 12:59:26PM +0200, Ingo Molnar wrote:
>>
>> (Cc:-ed more gents involved in kernel/sched/cputime.c work. Full patch quoted
>> below.)
>>
>> * Dongli Zhang <dongli.zhang@oracle.com> wrote:
>>
>>> After guest live migration on xen, steal time in /proc/stat
>>> (cpustat[CPUTIME_STEAL]) might decrease because steal returned by
>>> paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
>>>
>>> For instance, steal time of each vcpu is 335 before live migration.
>>>
>>> cpu 198 0 368 200064 1962 0 0 1340 0 0
>>> cpu0 38 0 81 50063 492 0 0 335 0 0
>>> cpu1 65 0 97 49763 634 0 0 335 0 0
>>> cpu2 38 0 81 50098 462 0 0 335 0 0
>>> cpu3 56 0 107 50138 374 0 0 335 0 0
>>>
>>> After live migration, steal time is reduced to 312.
>>>
>>> cpu 200 0 370 200330 1971 0 0 1248 0 0
>>> cpu0 38 0 82 50123 500 0 0 312 0 0
>>> cpu1 65 0 97 49832 634 0 0 312 0 0
>>> cpu2 39 0 82 50167 462 0 0 312 0 0
>>> cpu3 56 0 107 50207 374 0 0 312 0 0
>>>
>>> The code in this patch is borrowed from do_stolen_accounting() which has
>>> already been removed from linux source code since commit ecb23dc6 ("xen:
>>> add steal_clock support on x86").
>>>
>>> Similar and more severe issue would impact prior linux 4.8-4.10 as
>>> discussed by Michael Las at
>>> https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
>>> Unlike the issue discussed by Michael Las which would overflow steal time
>>> and lead to 100% st usage in top command for linux 4.8-4.10, the issue for
>>> linux 4.11+ would only decrease but not overflow steal time after live
>>> migration.
>>>
>>> References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>> ---
>>> kernel/sched/cputime.c | 13 ++++++++++---
>>> 1 file changed, 10 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
>>> index 14d2dbf..57d09cab 100644
>>> --- a/kernel/sched/cputime.c
>>> +++ b/kernel/sched/cputime.c
>>> @@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
>>> {
>>> #ifdef CONFIG_PARAVIRT
>>> if (static_key_false(¶virt_steal_enabled)) {
>>> - u64 steal;
>>> + u64 steal, steal_time;
>>> + s64 steal_delta;
>>> +
>>> + steal_time = paravirt_steal_clock(smp_processor_id());
>>> + steal = steal_delta = steal_time - this_rq()->prev_steal_time;
>>> +
>>> + if (unlikely(steal_delta < 0)) {
>>> + this_rq()->prev_steal_time = steal_time;
>
> I don't think setting prev_steal_time to smaller value is right
> thing to do.
If we do not set prev_steal_time to the smaller steal value (obtained from
paravirt_steal_clock()), it will take a while for the new steal value
to catch up with this_rq()->prev_steal_time, and cpustat[CPUTIME_STEAL] will
stay unchanged until steal exceeds this_rq()->prev_steal_time again. Do you
think that is acceptable?
If it is fine, I will try to limit the fix to xen-specific code in
drivers/xen/time.c so that we do not taint kernel/sched/cputime.c, as Peter
has asked why not just fix up paravirt_steal_time() on migration.
Thank you very much!
Dongli Zhang
>
> Beside, I don't think we need to check for overflow condition for
> cputime variables (it will happen after 279 years :-). So instead
> of introducing signed steal_delta variable I would just add
> below check, which should be sufficient to fix the problem:
>
> if (unlikely(steal <= this_rq()->prev_steal_time))
> return 0;
>
> Thanks
> Stanislaw
>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 14:01 ` Rik van Riel
2017-10-11 7:47 ` Dongli Zhang
@ 2017-10-11 7:47 ` Dongli Zhang
1 sibling, 0 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-11 7:47 UTC (permalink / raw)
To: Rik van Riel, Peter Zijlstra, Stanislaw Gruszka
Cc: Ingo Molnar, Wanpeng Li, Xiaolong Ye, Frederic Weisbecker,
linux-kernel, xen-devel, mingo, dario.faggioli, bevan, xen.list,
joao.m.martins
Hi Rik,
On 10/10/2017 10:01 PM, Rik van Riel wrote:
> On Tue, 2017-10-10 at 14:48 +0200, Peter Zijlstra wrote:
>> On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
>>>>> + u64 steal, steal_time;
>>>>> + s64 steal_delta;
>>>>> +
>>>>> + steal_time =
>>>>> paravirt_steal_clock(smp_processor_id());
>>>>> + steal = steal_delta = steal_time - this_rq()-
>>>>>> prev_steal_time;
>>>>> +
>>>>> + if (unlikely(steal_delta < 0)) {
>>>>> + this_rq()->prev_steal_time =
>>>>> steal_time;
>>>
>>> I don't think setting prev_steal_time to smaller value is right
>>> thing to do.
>>>
>>> Beside, I don't think we need to check for overflow condition for
>>> cputime variables (it will happen after 279 years :-). So instead
>>> of introducing signed steal_delta variable I would just add
>>> below check, which should be sufficient to fix the problem:
>>>
>>> if (unlikely(steal <= this_rq()->prev_steal_time))
>>> return 0;
>>
>> How about you just fix up paravirt_steal_time() on migration and not
>> muck with the users ?
>
> Not just migration, either. CPU hotplug is another time to fix up
> the steal time.
I think this issue might also be hit when we add and online a vcpu a very
long time after boot (or after the last time the vcpu went offline). Please
correct me if I am wrong.
Thank you very much!
Dongli Zhang
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
2017-10-10 14:01 ` Rik van Riel
@ 2017-10-11 7:47 ` Dongli Zhang
2017-10-11 7:47 ` Dongli Zhang
1 sibling, 0 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-11 7:47 UTC (permalink / raw)
To: Rik van Riel, Peter Zijlstra, Stanislaw Gruszka
Cc: xen.list, Frederic Weisbecker, dario.faggioli, bevan,
linux-kernel, xen-devel, Xiaolong Ye, joao.m.martins, mingo,
Ingo Molnar, Wanpeng Li
Hi Rik,
On 10/10/2017 10:01 PM, Rik van Riel wrote:
> On Tue, 2017-10-10 at 14:48 +0200, Peter Zijlstra wrote:
>> On Tue, Oct 10, 2017 at 02:42:01PM +0200, Stanislaw Gruszka wrote:
>>>>> + u64 steal, steal_time;
>>>>> + s64 steal_delta;
>>>>> +
>>>>> + steal_time =
>>>>> paravirt_steal_clock(smp_processor_id());
>>>>> + steal = steal_delta = steal_time - this_rq()-
>>>>>> prev_steal_time;
>>>>> +
>>>>> + if (unlikely(steal_delta < 0)) {
>>>>> + this_rq()->prev_steal_time =
>>>>> steal_time;
>>>
>>> I don't think setting prev_steal_time to smaller value is right
>>> thing to do.
>>>
>>> Beside, I don't think we need to check for overflow condition for
>>> cputime variables (it will happen after 279 years :-). So instead
>>> of introducing signed steal_delta variable I would just add
>>> below check, which should be sufficient to fix the problem:
>>>
>>> if (unlikely(steal <= this_rq()->prev_steal_time))
>>> return 0;
>>
>> How about you just fix up paravirt_steal_time() on migration and not
>> muck with the users ?
>
> Not just migration, either. CPU hotplug is another time to fix up
> the steal time.
I think this issue might also be hit when we add and online a vcpu a very
long time after boot (or after the last time the vcpu went offline). Please
correct me if I am wrong.
Thank you very much!
Dongli Zhang
>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen
@ 2017-10-10 9:14 Dongli Zhang
0 siblings, 0 replies; 18+ messages in thread
From: Dongli Zhang @ 2017-10-10 9:14 UTC (permalink / raw)
To: linux-kernel, xen-devel
Cc: xen.list, peterz, dario.faggioli, bevan, mingo, joao.m.martins
After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
paravirt_steal_clock() might be less than this_rq()->prev_steal_time.
For instance, steal time of each vcpu is 335 before live migration.
cpu 198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0
After live migration, steal time is reduced to 312.
cpu 200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0
The code in this patch is borrowed from do_stolen_accounting(), which was
removed from the Linux source code by commit ecb23dc6 ("xen:
add steal_clock support on x86").
A similar and more severe issue impacts Linux 4.8-4.10, as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest.
Unlike the issue discussed by Michael Las, which would overflow steal time
and lead to 100% st usage in the top command on Linux 4.8-4.10, the issue on
Linux 4.11+ only decreases, but does not overflow, steal time after live
migration.
References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
kernel/sched/cputime.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf..57d09cab 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -238,10 +238,17 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(¶virt_steal_enabled)) {
- u64 steal;
+ u64 steal, steal_time;
+ s64 steal_delta;
+
+ steal_time = paravirt_steal_clock(smp_processor_id());
+ steal = steal_delta = steal_time - this_rq()->prev_steal_time;
+
+ if (unlikely(steal_delta < 0)) {
+ this_rq()->prev_steal_time = steal_time;
+ return 0;
+ }
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
steal = min(steal, maxtime);
account_steal_time(steal);
this_rq()->prev_steal_time += steal;
--
2.7.4
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
^ permalink raw reply related [flat|nested] 18+ messages in thread
end of thread, other threads:[~2017-10-11 7:48 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-10 9:14 [PATCH 1/1] sched/cputime: do not decrease steal time after live migration on xen Dongli Zhang
2017-10-10 10:07 ` Jan Beulich
2017-10-10 10:07 ` [Xen-devel] " Jan Beulich
2017-10-10 10:59 ` Ingo Molnar
2017-10-10 10:59 ` Ingo Molnar
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:42 ` Stanislaw Gruszka
2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 12:48 ` Peter Zijlstra
2017-10-10 14:01 ` Rik van Riel
2017-10-10 14:01 ` Rik van Riel
2017-10-11 7:47 ` Dongli Zhang
2017-10-11 7:47 ` Dongli Zhang
2017-10-11 7:29 ` Dongli Zhang
2017-10-11 7:29 ` Dongli Zhang
2017-10-10 11:58 ` Peter Zijlstra
2017-10-10 11:58 ` Peter Zijlstra
2017-10-10 9:14 Dongli Zhang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.