* [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
@ 2017-04-25 14:10 Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2017-04-25 14:10 UTC (permalink / raw)
To: Ingo Molnar
Cc: LKML, Frederic Weisbecker, Jesper Dangaard Brouer,
Peter Zijlstra, Linus Torvalds, Wanpeng Li, Thomas Gleixner,
Rik van Riel, Stanislaw Gruszka
irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to subtract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would subtract its own CPU time every time
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.
But this behaviour got broken by commit a499a5a14db:
("sched/cputime: Increment kcpustat directly on irqtime account")
which now includes ksoftirqd softirq time in the time returned by
irq_time_read().
This has resulted in wrong ksoftirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.
ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().
To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.
Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
kernel/sched/cputime.c | 27 ++++++++++++++++-----------
kernel/sched/sched.h | 9 +++++++--
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..aea3135 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf922..767aab3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
--
2.7.4
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
@ 2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: Jesper Dangaard Brouer @ 2017-04-26 14:30 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Ingo Molnar, LKML, Peter Zijlstra, Linus Torvalds, Wanpeng Li,
Thomas Gleixner, Rik van Riel, Stanislaw Gruszka, brouer
On Tue, 25 Apr 2017 16:10:48 +0200
Frederic Weisbecker <fweisbec@gmail.com> wrote:
> irq_time_read() returns the irqtime minus the ksoftirqd time. This
> is necessary because irq_time_read() is used to substract the IRQ time
> from the sum_exec_runtime of a task. If we were to include the softirq
> time of ksoftirqd, this task would substract its own CPU time everytime
> it updates ksoftirqd->sum_exec_runtime which would therefore never
> progress.
>
> But this behaviour got broken by commit a499a5a14db:
> ("sched/cputime: Increment kcpustat directly on irqtime account")
> which now includes ksoftirqd softirq time in the time returned by
> irq_time_read().
>
> This has resulted in wrong ksotfirqd cputime reported to userspace
^^^^^^^^^
Typo in commit message, guess maintainer can fix/amend that.
> through /proc/stat and thus "top" not showing ksoftirqd when it should
> after intense networking load.
>
> ksoftirqd->stime happens to be correct but it gets scaled down by
> sum_exec_runtime through task_cputime_adjusted().
>
> To fix this, just account the strict IRQ time in a separate counter and
> use it to report the IRQ time.
>
> Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Confirming I have tested patch before Frederic posted it, and have been
running with this applied on my net-next testlab kernels since Monday.
And it does fix the bisected regression I reported here:
http://lkml.kernel.org/r/20170328101403.34a82fbf@redhat.com
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Stanislaw Gruszka <sgruszka@redhat.com>
> Cc: Wanpeng Li <wanpeng.li@hotmail.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
Please add:
Fixes: a499a5a14dbd ("sched/cputime: Increment kcpustat directly on irqtime account")
--Jesper
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> ---
> kernel/sched/cputime.c | 27 ++++++++++++++++-----------
> kernel/sched/sched.h | 9 +++++++--
> 2 files changed, 23 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f3778e2b..aea3135 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
> sched_clock_irqtime = 0;
> }
>
> +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> + enum cpu_usage_stat idx)
> +{
> + u64 *cpustat = kcpustat_this_cpu->cpustat;
> +
> + u64_stats_update_begin(&irqtime->sync);
> + cpustat[idx] += delta;
> + irqtime->total += delta;
> + irqtime->tick_delta += delta;
> + u64_stats_update_end(&irqtime->sync);
> +}
> +
> /*
> * Called before incrementing preempt_count on {soft,}irq_enter
> * and before decrementing preempt_count on {soft,}irq_exit.
> @@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
> void irqtime_account_irq(struct task_struct *curr)
> {
> struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
> - u64 *cpustat = kcpustat_this_cpu->cpustat;
> s64 delta;
> int cpu;
>
> @@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
> delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
> irqtime->irq_start_time += delta;
>
> - u64_stats_update_begin(&irqtime->sync);
> /*
> * We do not account for softirq time from ksoftirqd here.
> * We want to continue accounting softirq time to ksoftirqd thread
> * in that case, so as not to confuse scheduler with a special task
> * that do not consume any time, but still wants to run.
> */
> - if (hardirq_count()) {
> - cpustat[CPUTIME_IRQ] += delta;
> - irqtime->tick_delta += delta;
> - } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
> - cpustat[CPUTIME_SOFTIRQ] += delta;
> - irqtime->tick_delta += delta;
> - }
> -
> - u64_stats_update_end(&irqtime->sync);
> + if (hardirq_count())
> + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
> + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
> + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
> }
> EXPORT_SYMBOL_GPL(irqtime_account_irq);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 5cbf922..767aab3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
>
> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> struct irqtime {
> + u64 total;
> u64 tick_delta;
> u64 irq_start_time;
> struct u64_stats_sync sync;
> @@ -1876,16 +1877,20 @@ struct irqtime {
>
> DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
>
> +/*
> + * Returns the irqtime minus the softirq time computed by ksoftirqd.
> + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
> + * and never move forward.
> + */
> static inline u64 irq_time_read(int cpu)
> {
> struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
> - u64 *cpustat = kcpustat_cpu(cpu).cpustat;
> unsigned int seq;
> u64 total;
>
> do {
> seq = __u64_stats_fetch_begin(&irqtime->sync);
> - total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
> + total = irqtime->total;
> } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
>
> return total;
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
@ 2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: Rik van Riel @ 2017-04-26 15:44 UTC (permalink / raw)
To: Frederic Weisbecker, Ingo Molnar
Cc: LKML, Jesper Dangaard Brouer, Peter Zijlstra, Linus Torvalds,
Wanpeng Li, Thomas Gleixner, Stanislaw Gruszka
On Tue, 2017-04-25 at 16:10 +0200, Frederic Weisbecker wrote:
> irq_time_read() returns the irqtime minus the ksoftirqd time. This
> is necessary because irq_time_read() is used to substract the IRQ
> time
> from the sum_exec_runtime of a task. If we were to include the
> softirq
> time of ksoftirqd, this task would substract its own CPU time
> everytime
> it updates ksoftirqd->sum_exec_runtime which would therefore never
> progress.
>
> But this behaviour got broken by commit a499a5a14db:
> ("sched/cputime: Increment kcpustat directly on irqtime
> account")
> which now includes ksoftirqd softirq time in the time returned by
> irq_time_read().
Ahhh, after a look at account_system_time, it becomes
clear why that was happening!
Reviewed-by: Rik van Riel <riel@redhat.com>
^ permalink raw reply [flat|nested] 4+ messages in thread
* [tip:sched/urgent] sched/cputime: Fix ksoftirqd cputime accounting regression
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
@ 2017-04-27 14:16 ` tip-bot for Frederic Weisbecker
2 siblings, 0 replies; 4+ messages in thread
From: tip-bot for Frederic Weisbecker @ 2017-04-27 14:16 UTC (permalink / raw)
To: linux-tip-commits
Cc: sgruszka, wanpeng.li, torvalds, hpa, linux-kernel, riel,
fweisbec, peterz, tglx, brouer, mingo
Commit-ID: 25e2d8c1b9e327ed260edd13169cc22bc7a78bc6
Gitweb: http://git.kernel.org/tip/25e2d8c1b9e327ed260edd13169cc22bc7a78bc6
Author: Frederic Weisbecker <fweisbec@gmail.com>
AuthorDate: Tue, 25 Apr 2017 16:10:48 +0200
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 27 Apr 2017 09:08:26 +0200
sched/cputime: Fix ksoftirqd cputime accounting regression
irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to substract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would substract its own CPU time everytime
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.
But this behaviour got broken by:
a499a5a14db ("sched/cputime: Increment kcpustat directly on irqtime account")
... which now includes ksoftirqd softirq time in the time returned by
irq_time_read().
This has resulted in wrong ksoftirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.
ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().
To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.
Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1493129448-5356-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/sched/cputime.c | 27 ++++++++++++++++-----------
kernel/sched/sched.h | 9 +++++++--
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..aea3135 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- u64_stats_update_begin(&irqtime->sync);
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count()) {
- cpustat[CPUTIME_IRQ] += delta;
- irqtime->tick_delta += delta;
- } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
- cpustat[CPUTIME_SOFTIRQ] += delta;
- irqtime->tick_delta += delta;
- }
-
- u64_stats_update_end(&irqtime->sync);
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf922..767aab3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
+ u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
- u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
- total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+ total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-04-27 14:22 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-25 14:10 [PATCH] sched/cputime: Fix ksoftirqd cputime accounting regression Frederic Weisbecker
2017-04-26 14:30 ` Jesper Dangaard Brouer
2017-04-26 15:44 ` Rik van Riel
2017-04-27 14:16 ` [tip:sched/urgent] " tip-bot for Frederic Weisbecker
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).