linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
@ 2015-04-24 15:16 Rik van Riel
  2015-04-25  9:43 ` Heiko Carstens
  0 siblings, 1 reply; 6+ messages in thread
From: Rik van Riel @ 2015-04-24 15:16 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andy Lutomirsky, Frederic Weisbecker, Peter Zijlstra,
	Heiko Carstens, williams

The function __acct_update_integrals() is called both from irq context
and task context. This creates a race where irq context can advance
tsk->acct_timexpd to a value larger than time, leading to a negative
value, which causes a divide error. See commit 6d5b5acca9e5
("Fix fixpoint divide exception in acct_update_integrals")

In 2012, __acct_update_integrals() was changed to get utime and stime
as function parameters. This re-introduced the bug, because an irq
can hit in-between the call to task_cputime() and where irqs actually
get disabled.

However, this race condition was originally reproduced on Hercules,
and I have not seen any reports of it re-occurring since it was
re-introduced 3 years ago.

On the other hand, the irq disabling and re-enabling, which no longer
even protects us against the race today, show up prominently in the
perf profile of a program that makes a very large number of system calls
in a short period of time, when nohz_full= (and context tracking) is
enabled.

This patch replaces the (now ineffective) irq blocking with a cheaper
way to test for the race condition, and speeds up my microbenchmark
with 10 million iterations, average of 5 runs, tiny stddev:

		run time	system time
vanilla		5.49s		2.08s
patch		5.21s		1.92s

Cc: Andy Lutomirsky <amluto@amacapital.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
---
V2: introduce signed_cputime_t to deal with 64 bit cputime_t on
    32 bit architectures, and use READ_ONCE to ensure the value
    is always read atomically (Heiko Karstens)

 arch/powerpc/include/asm/cputime.h    |  3 +++
 arch/s390/include/asm/cputime.h       |  3 +++
 include/asm-generic/cputime_jiffies.h |  2 ++
 include/asm-generic/cputime_nsecs.h   |  3 +++
 kernel/tsacct.c                       | 18 ++++++++++++------
 5 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index e2452550bcb1..e41b32f68a2c 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -32,6 +32,9 @@ static inline void setup_cputime_one_jiffy(void) { }
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+typedef s64 signed_cputime_t;
+typedef s64 signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
 
 #ifdef __KERNEL__
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 221b454c734a..2e8c268cc2a7 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -18,6 +18,9 @@
 typedef unsigned long long __nocast cputime_t;
 typedef unsigned long long __nocast cputime64_t;
 
+typedef signed long long signed_cputime_t;
+typedef signed long long signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
 
 static inline unsigned long __div(unsigned long long n, unsigned long base)
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index fe386fc6e85e..b96b6a1b6c97 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -2,6 +2,7 @@
 #define _ASM_GENERIC_CPUTIME_JIFFIES_H
 
 typedef unsigned long __nocast cputime_t;
+typedef signed long signed_cputime_t;
 
 #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
 
@@ -11,6 +12,7 @@ typedef unsigned long __nocast cputime_t;
 #define jiffies_to_cputime(__hz)	(__force cputime_t)(__hz)
 
 typedef u64 __nocast cputime64_t;
+typedef s64 signed_cputime_t;
 
 #define cputime64_to_jiffies64(__ct)	(__force u64)(__ct)
 #define jiffies64_to_cputime64(__jif)	(__force cputime64_t)(__jif)
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index 0419485891f2..c1ad2f90a4d9 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -21,6 +21,9 @@
 typedef u64 __nocast cputime_t;
 typedef u64 __nocast cputime64_t;
 
+typedef s64 signed_cputime_t;
+typedef s64 signed_cputime64_t;
+
 #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new)
 
 #define cputime_one_jiffy		jiffies_to_cputime(1)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49e32bf..1bebe0339ac1 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -126,23 +126,29 @@ static void __acct_update_integrals(struct task_struct *tsk,
 	if (likely(tsk->mm)) {
 		cputime_t time, dtime;
 		struct timeval value;
-		unsigned long flags;
 		u64 delta;
 
-		local_irq_save(flags);
 		time = stime + utime;
-		dtime = time - tsk->acct_timexpd;
+		dtime = time - READ_ONCE(tsk->acct_timexpd);
+		/*
+		 * This code is called both from irq context and from
+		 * task context. There is a race where irq context advances
+		 * tsk->acct_timexpd to a value larger than time, creating
+		 * a negative value. In that case, the irq has already
+		 * updated the statistics.
+		 */
+		if (unlikely((signed_cputime_t)dtime <= 0))
+			return;
+
 		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
 		delta = value.tv_sec;
 		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
-			goto out;
+			return;
 		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-	out:
-		local_irq_restore(flags);
 	}
 }
 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
  2015-04-24 15:16 [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals Rik van Riel
@ 2015-04-25  9:43 ` Heiko Carstens
  2015-04-25 12:50   ` Rik van Riel
  0 siblings, 1 reply; 6+ messages in thread
From: Heiko Carstens @ 2015-04-25  9:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: linux-kernel, Andy Lutomirsky, Frederic Weisbecker,
	Peter Zijlstra, williams

On Fri, Apr 24, 2015 at 11:16:53AM -0400, Rik van Riel wrote:
> V2: introduce signed_cputime_t to deal with 64 bit cputime_t on
>     32 bit architectures, and use READ_ONCE to ensure the value
>     is always read atomically (Heiko Karstens)

Erm, that's not what I said ;)
READ_ONCE() only fixes the isssue that with your previous code the
compiler was free to generate code that accesses the memory value
several times.

But..

> -		local_irq_save(flags);
>  		time = stime + utime;
> -		dtime = time - tsk->acct_timexpd;
> +		dtime = time - READ_ONCE(tsk->acct_timexpd);
> +		/*
> +		 * This code is called both from irq context and from
> +		 * task context. There is a race where irq context advances
> +		 * tsk->acct_timexpd to a value larger than time, creating
> +		 * a negative value. In that case, the irq has already
> +		 * updated the statistics.
> +		 */
> +		if (unlikely((signed_cputime_t)dtime <= 0))
> +			return;
> +

...the READ_ONCE() doesn't give you any guarantees about reading
tsk->acct_timexpd in an atomic way.
Well, actually you don't need atomic semantics, but only to make sure that
the read access happens with a single instruction, since you want to protect
against interrupts.
But still: if the size of acct_timexpd is 64 bit READ_ONCE() may still result
in two instructions on 32 bit architectures.
(or isn't there currently no 32 bit architecture with 64 bit cputime_t left?)


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
  2015-04-25  9:43 ` Heiko Carstens
@ 2015-04-25 12:50   ` Rik van Riel
  2015-04-27 11:18     ` Heiko Carstens
  0 siblings, 1 reply; 6+ messages in thread
From: Rik van Riel @ 2015-04-25 12:50 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: linux-kernel, Andy Lutomirsky, Frederic Weisbecker,
	Peter Zijlstra, williams

On 04/25/2015 05:43 AM, Heiko Carstens wrote:
> On Fri, Apr 24, 2015 at 11:16:53AM -0400, Rik van Riel wrote:
>> V2: introduce signed_cputime_t to deal with 64 bit cputime_t on
>>     32 bit architectures, and use READ_ONCE to ensure the value
>>     is always read atomically (Heiko Karstens)
> 
> Erm, that's not what I said ;)
> READ_ONCE() only fixes the isssue that with your previous code the
> compiler was free to generate code that accesses the memory value
> several times.

Ah indeed, you are right.

>> -		local_irq_save(flags);
>>  		time = stime + utime;
>> -		dtime = time - tsk->acct_timexpd;
>> +		dtime = time - READ_ONCE(tsk->acct_timexpd);
>> +		/*
>> +		 * This code is called both from irq context and from
>> +		 * task context. There is a race where irq context advances
>> +		 * tsk->acct_timexpd to a value larger than time, creating
>> +		 * a negative value. In that case, the irq has already
>> +		 * updated the statistics.
>> +		 */
>> +		if (unlikely((signed_cputime_t)dtime <= 0))
>> +			return;
>> +
> 
> ...the READ_ONCE() doesn't give you any guarantees about reading
> tsk->acct_timexpd in an atomic way.
> Well, actually you don't need atomic semantics, but only to make sure that
> the read access happens with a single instruction, since you want to protect
> against interrupts.
> But still: if the size of acct_timexpd is 64 bit READ_ONCE() may still result
> in two instructions on 32 bit architectures.
> (or isn't there currently no 32 bit architecture with 64 bit cputime_t left?)

Even if there is (maybe some ARM system?), can we even guarantee
that a single instruction to read 64 bits exists on such a system?

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
  2015-04-25 12:50   ` Rik van Riel
@ 2015-04-27 11:18     ` Heiko Carstens
  2015-04-28 12:53       ` Rik van Riel
  0 siblings, 1 reply; 6+ messages in thread
From: Heiko Carstens @ 2015-04-27 11:18 UTC (permalink / raw)
  To: Rik van Riel
  Cc: linux-kernel, Andy Lutomirsky, Frederic Weisbecker,
	Peter Zijlstra, williams

On Sat, Apr 25, 2015 at 08:50:49AM -0400, Rik van Riel wrote:
> On 04/25/2015 05:43 AM, Heiko Carstens wrote:
> > ...the READ_ONCE() doesn't give you any guarantees about reading
> > tsk->acct_timexpd in an atomic way.
> > Well, actually you don't need atomic semantics, but only to make sure that
> > the read access happens with a single instruction, since you want to protect
> > against interrupts.
> > But still: if the size of acct_timexpd is 64 bit READ_ONCE() may still result
> > in two instructions on 32 bit architectures.
> > (or isn't there currently no 32 bit architecture with 64 bit cputime_t left?)
> 
> Even if there is (maybe some ARM system?), can we even guarantee
> that a single instruction to read 64 bits exists on such a system?

I wouldn't bet on it. I can only talk for s390 and there is an instruction
available which would do that. But since s390 is now a 64 bit only architecture
it doesn't matter anyway.
For other architectures I'd say: no, you can't rely on that.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
  2015-04-27 11:18     ` Heiko Carstens
@ 2015-04-28 12:53       ` Rik van Riel
  2015-04-28 13:57         ` Peter Zijlstra
  0 siblings, 1 reply; 6+ messages in thread
From: Rik van Riel @ 2015-04-28 12:53 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: linux-kernel, Andy Lutomirsky, Frederic Weisbecker,
	Peter Zijlstra, williams

On 04/27/2015 07:18 AM, Heiko Carstens wrote:
> On Sat, Apr 25, 2015 at 08:50:49AM -0400, Rik van Riel wrote:
>> On 04/25/2015 05:43 AM, Heiko Carstens wrote:
>>> ...the READ_ONCE() doesn't give you any guarantees about reading
>>> tsk->acct_timexpd in an atomic way.
>>> Well, actually you don't need atomic semantics, but only to make sure that
>>> the read access happens with a single instruction, since you want to protect
>>> against interrupts.
>>> But still: if the size of acct_timexpd is 64 bit READ_ONCE() may still result
>>> in two instructions on 32 bit architectures.
>>> (or isn't there currently no 32 bit architecture with 64 bit cputime_t left?)
>>
>> Even if there is (maybe some ARM system?), can we even guarantee
>> that a single instruction to read 64 bits exists on such a system?
> 
> I wouldn't bet on it. I can only talk for s390 and there is an instruction
> available which would do that. But since s390 is now a 64 bit only architecture
> it doesn't matter anyway.
> For other architectures I'd say: no, you can't rely on that.

So what can I do to move forward with this patch?

It speeds up syscall entry / exit by 7% when nohz_full
is enabled on a CPU...

Should I have the irq block compiled in only when
sizeof(cputime_t) > sizeof(long) ?

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals
  2015-04-28 12:53       ` Rik van Riel
@ 2015-04-28 13:57         ` Peter Zijlstra
  0 siblings, 0 replies; 6+ messages in thread
From: Peter Zijlstra @ 2015-04-28 13:57 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Heiko Carstens, linux-kernel, Andy Lutomirsky,
	Frederic Weisbecker, williams

On Tue, Apr 28, 2015 at 08:53:18AM -0400, Rik van Riel wrote:
> So what can I do to move forward with this patch?
> 
> It speeds up syscall entry / exit by 7% when nohz_full
> is enabled on a CPU...
> 
> Should I have the irq block compiled in only when
> sizeof(cputime_t) > sizeof(long) ?

So I'm not sure how the IRQ disable even works today (as you write in
the Changelog), it appears racy.

At which point you already have the problem of load/store tearing.

So either you (the pural) need to go fix that, or we should collectively
say sod it and just do your patch.

That said, your patch would probably want a WRITE_ONCE() around the
assignment to acct_timexpd; ensuring the read happens in a single load
doesn't really help if then the store is done in two :-)

Also, we should probably reduce indent in that function, something like
the below.


---
 kernel/tsacct.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49e32bf..9e225425bc3a 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -123,27 +123,27 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 static void __acct_update_integrals(struct task_struct *tsk,
 				    cputime_t utime, cputime_t stime)
 {
-	if (likely(tsk->mm)) {
-		cputime_t time, dtime;
-		struct timeval value;
-		unsigned long flags;
-		u64 delta;
-
-		local_irq_save(flags);
-		time = stime + utime;
-		dtime = time - tsk->acct_timexpd;
-		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
-		delta = value.tv_sec;
-		delta = delta * USEC_PER_SEC + value.tv_usec;
-
-		if (delta == 0)
-			goto out;
+	cputime_t time, dtime;
+	struct timeval value;
+	unsigned long flags;
+	u64 delta;
+
+	if (unlikely(!tsk->mm))
+		return;
+
+	local_irq_save(flags);
+	time = stime + utime;
+	dtime = time - tsk->acct_timexpd;
+	jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+	delta = value.tv_sec;
+	delta = delta * USEC_PER_SEC + value.tv_usec;
+
+	if (delta) {
 		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-	out:
-		local_irq_restore(flags);
 	}
+	local_irq_restore(flags);
 }
 
 /**

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-04-28 13:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-24 15:16 [PATCH v2] context_tracking: remove local_irq_save from __acct_update_integrals Rik van Riel
2015-04-25  9:43 ` Heiko Carstens
2015-04-25 12:50   ` Rik van Riel
2015-04-27 11:18     ` Heiko Carstens
2015-04-28 12:53       ` Rik van Riel
2015-04-28 13:57         ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).