All of lore.kernel.org
 help / color / mirror / Atom feed
From: riel@redhat.com
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, williams@redhat.com, luto@kernel.org,
	mingo@kernel.org, bonzini@redhat.com, fweisbec@redhat.com,
	peterz@infradead.org, heiko.carstens@de.ibm.com,
	tglx@linutronix.de, Rik van Riel <riel@redhat.com>,
	Andy Lutomirski <amluto@amacapital.com>
Subject: [PATCH 2/3] remove local_irq_save from __acct_update_integrals
Date: Thu, 30 Apr 2015 17:23:54 -0400	[thread overview]
Message-ID: <1430429035-25563-3-git-send-email-riel@redhat.com> (raw)
In-Reply-To: <1430429035-25563-1-git-send-email-riel@redhat.com>

From: Rik van Riel <riel@redhat.com>

The function __acct_update_integrals() is called both from irq context
and task context. This creates a race where irq context can advance
tsk->acct_timexpd to a value larger than time, leading to a negative
dtime, which causes a divide error. See commit 6d5b5acca9e5
("Fix fixpoint divide exception in acct_update_integrals").

In 2012, __acct_update_integrals() was changed to get utime and stime
as function parameters. This re-introduced the bug, because an irq
can hit in-between the call to task_cputime() and where irqs actually
get disabled.

However, this race condition was originally reproduced on Hercules,
and I have not seen any reports of it re-occurring since it was
re-introduced 3 years ago.

On the other hand, the irq disabling and re-enabling, which no longer
even protect us against the race today, show up prominently in the
perf profile of a program that makes a very large number of system calls
in a short period of time, when nohz_full= (and context tracking) is
enabled.

This patch replaces the (now ineffective) irq blocking with a cheaper
way to test for the race condition, and speeds up my microbenchmark
with 10 million iterations, average of 5 runs, tiny stddev:

		run time	system time
vanilla		5.49s		2.08s
patch		5.21s		1.92s

Cc: Andy Lutomirski <amluto@amacapital.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rik van Riel <riel@redhat.com>
---
 arch/powerpc/include/asm/cputime.h    |  3 +++
 arch/s390/include/asm/cputime.h       |  3 +++
 include/asm-generic/cputime_jiffies.h |  2 ++
 include/asm-generic/cputime_nsecs.h   |  3 +++
 kernel/tsacct.c                       | 16 ++++++++++++----
 5 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index e2452550bcb1..e41b32f68a2c 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,6 +32,9 @@ static inline void setup_cputime_one_jiffy(void) { }
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+typedef s64 signed_cputime_t;
+typedef s64 signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
 
 #ifdef __KERNEL__
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 221b454c734a..2e8c268cc2a7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -18,6 +18,9 @@
 typedef unsigned long long __nocast cputime_t;
 typedef unsigned long long __nocast cputime64_t;
 
+typedef signed long long signed_cputime_t;
+typedef signed long long signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
 
 static inline unsigned long __div(unsigned long long n, unsigned long base)
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index fe386fc6e85e..b96b6a1b6c97 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -2,6 +2,7 @@
 #define _ASM_GENERIC_CPUTIME_JIFFIES_H
 
 typedef unsigned long __nocast cputime_t;
+typedef signed long signed_cputime_t;
 
 #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
 
@@ -11,6 +12,7 @@ typedef unsigned long __nocast cputime_t;
 #define jiffies_to_cputime(__hz)	(__force cputime_t)(__hz)
 
 typedef u64 __nocast cputime64_t;
+typedef s64 signed_cputime_t;
 
 #define cputime64_to_jiffies64(__ct)	(__force u64)(__ct)
 #define jiffies64_to_cputime64(__jif)	(__force cputime64_t)(__jif)
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 0419485891f2..c1ad2f90a4d9 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -21,6 +21,9 @@
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+typedef s64 signed_cputime_t;
+typedef s64 signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
 
 #define cputime_one_jiffy		jiffies_to_cputime(1)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 9e225425bc3a..e497c1c05675 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -125,15 +125,24 @@ static void __acct_update_integrals(struct task_struct *tsk,
 {
 	cputime_t time, dtime;
 	struct timeval value;
-	unsigned long flags;
 	u64 delta;
 
 	if (unlikely(!tsk->mm))
 		return;
 
-	local_irq_save(flags);
+	/*
+	 * This code is called both from task context and irq context.
+	 * There is a rare race where irq context advances tsk->acct_timexpd
+	 * to a value larger than time, leading to a negative dtime, which
+	 * could lead to a divide error in cputime_to_jiffies.
+	 * The statistics updated here are fairly rough estimates; just
+	 * ignore irq and task double accounting the same timer tick.
+	 */
 	time = stime + utime;
-	dtime = time - tsk->acct_timexpd;
+	dtime = time - READ_ONCE(tsk->acct_timexpd);
+	if (unlikely((signed_cputime_t)dtime <= 0))
+		return;
+
 	jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
 	delta = value.tv_sec;
 	delta = delta * USEC_PER_SEC + value.tv_usec;
@@ -143,7 +152,6 @@ static void __acct_update_integrals(struct task_struct *tsk,
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
-	local_irq_restore(flags);
 }
 
 /**
-- 
2.1.0


  parent reply	other threads:[~2015-04-30 21:24 UTC|newest]

Thread overview: 83+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-30 21:23 [PATCH 0/3] reduce nohz_full syscall overhead by 10% riel
2015-04-30 21:23 ` [PATCH 1/3] reduce indentation in __acct_update_integrals riel
2015-04-30 21:23 ` riel [this message]
2015-04-30 21:23 ` [PATCH 3/3] context_tracking,x86: remove extraneous irq disable & enable from context tracking on syscall entry riel
2015-04-30 21:56   ` Andy Lutomirski
2015-05-01  6:40   ` Ingo Molnar
2015-05-01 15:20     ` Rik van Riel
2015-05-01 15:59       ` Ingo Molnar
2015-05-01 16:03         ` Andy Lutomirski
2015-05-01 16:21           ` Ingo Molnar
2015-05-01 16:26             ` Rik van Riel
2015-05-01 16:34               ` Ingo Molnar
2015-05-01 18:05                 ` Rik van Riel
2015-05-01 18:40                   ` Ingo Molnar
2015-05-01 19:11                     ` Rik van Riel
2015-05-01 19:37                       ` Andy Lutomirski
2015-05-02  5:27                         ` Ingo Molnar
2015-05-02 18:27                           ` Rik van Riel
2015-05-03 18:41                           ` Andy Lutomirski
2015-05-07 10:35                             ` Ingo Molnar
2015-05-04  9:26                           ` Paolo Bonzini
2015-05-04 13:30                             ` Rik van Riel
2015-05-04 14:06                             ` Rik van Riel
2015-05-04 14:19                             ` Rik van Riel
2015-05-04 15:59                             ` question about RCU dynticks_nesting Rik van Riel
2015-05-04 18:39                               ` Paul E. McKenney
2015-05-04 19:39                                 ` Rik van Riel
2015-05-04 20:02                                   ` Paul E. McKenney
2015-05-04 20:13                                     ` Rik van Riel
2015-05-04 20:38                                       ` Paul E. McKenney
2015-05-04 20:53                                         ` Rik van Riel
2015-05-05  5:54                                           ` Paul E. McKenney
2015-05-06  1:49                                             ` Mike Galbraith
2015-05-06  3:44                                               ` Mike Galbraith
2015-05-06  6:06                                                 ` Paul E. McKenney
2015-05-06  6:52                                                   ` Mike Galbraith
2015-05-06  7:01                                                     ` Mike Galbraith
2015-05-07  0:59                                           ` Frederic Weisbecker
2015-05-07 15:44                                             ` Rik van Riel
2015-05-04 19:00                               ` Rik van Riel
2015-05-04 19:39                                 ` Paul E. McKenney
2015-05-04 19:59                                   ` Rik van Riel
2015-05-04 20:40                                     ` Paul E. McKenney
2015-05-05 10:53                                   ` Peter Zijlstra
2015-05-05 12:34                                     ` Paul E. McKenney
2015-05-05 13:00                                       ` Peter Zijlstra
2015-05-05 18:35                                         ` Paul E. McKenney
2015-05-05 21:09                                           ` Rik van Riel
2015-05-06  5:41                                             ` Paul E. McKenney
2015-05-05 10:48                                 ` Peter Zijlstra
2015-05-05 10:51                                   ` Peter Zijlstra
2015-05-05 12:30                                     ` Paul E. McKenney
2015-05-02  4:06                   ` [PATCH 3/3] context_tracking,x86: remove extraneous irq disable & enable from context tracking on syscall entry Mike Galbraith
2015-05-01 16:37             ` Ingo Molnar
2015-05-01 16:40               ` Rik van Riel
2015-05-01 16:45                 ` Ingo Molnar
2015-05-01 16:54                   ` Rik van Riel
2015-05-01 17:12                     ` Ingo Molnar
2015-05-01 17:22                       ` Rik van Riel
2015-05-01 17:59                         ` Ingo Molnar
2015-05-01 16:22           ` Rik van Riel
2015-05-01 16:27             ` Ingo Molnar
2015-05-03 13:23       ` Mike Galbraith
2015-05-03 17:30         ` Rik van Riel
2015-05-03 18:24           ` Andy Lutomirski
2015-05-03 18:52             ` Rik van Riel
2015-05-07 10:48               ` Ingo Molnar
2015-05-07 12:18                 ` Frederic Weisbecker
2015-05-07 12:29                   ` Ingo Molnar
2015-05-07 15:47                     ` Rik van Riel
2015-05-08  7:58                       ` Ingo Molnar
2015-05-07 12:22                 ` Andy Lutomirski
2015-05-07 12:44                   ` Ingo Molnar
2015-05-07 12:49                     ` Ingo Molnar
2015-05-08  6:17                       ` Paul E. McKenney
2015-05-07 12:52                     ` Andy Lutomirski
2015-05-07 15:08                       ` Ingo Molnar
2015-05-07 17:47                         ` Andy Lutomirski
2015-05-08  6:37                           ` Ingo Molnar
2015-05-08 10:59                             ` Andy Lutomirski
2015-05-08 11:27                               ` Ingo Molnar
2015-05-08 12:56                                 ` Andy Lutomirski
2015-05-08 13:27                                   ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1430429035-25563-3-git-send-email-riel@redhat.com \
    --to=riel@redhat.com \
    --cc=amluto@amacapital.com \
    --cc=bonzini@redhat.com \
    --cc=fweisbec@redhat.com \
    --cc=heiko.carstens@de.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=williams@redhat.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.