From: Peter Zijlstra <peterz@infradead.org>
To: Yuyang Du <yuyang.du@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>,
	Andrey Ryabinin <aryabinin@virtuozzo.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Mike Galbraith <efault@gmx.de>,
	Thomas Gleixner <tglx@linutronix.de>,
	bsegall@google.com, morten.rasmussen@arm.com, pjt@google.com,
	steve.muckle@linaro.org, linux-kernel@vger.kernel.org,
	kernel@kyup.com
Subject: Re: Divide-by-zero in post_init_entity_util_avg
Date: Thu, 16 Jun 2016 10:50:40 +0200
Message-ID: <20160616085040.GF30927@twins.programming.kicks-ass.net>
In-Reply-To: <20160609130750.GQ30909@twins.programming.kicks-ass.net>

On Thu, Jun 09, 2016 at 03:07:50PM +0200, Peter Zijlstra wrote:
> Which given the lack of serialization, and the code generated from
> update_cfs_rq_load_avg() is entirely possible.
> 
> 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> 		removed_load = 1;
> 	}
> 
> turns into:
> 
> ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> ffffffff8108706b:       48 85 c0                test   %rax,%rax
> ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
> 
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.
> 
> Ludicrous code generation if you ask me; I'd have expected something
> like (note, r15 holds 0):
> 
> 	mov	%r15, %rax
> 	xchg	%rax, cfs_rq->removed_load_avg
> 	mov	sa->load_avg, %rcx
> 	sub	%rax, %rcx
> 	cmovs	%r15, %rcx
> 	mov	%rcx, sa->load_avg

So I _should_ have looked at other unserialized users of ->load_avg, but
alas. Luckily nikbor reported a similar divide-by-zero from task_h_load(),
which instantly triggered recollection of this here problem.

Now, we really do not want to go grab rq->lock there, so I did the
below, which actually ends up generating the 'right' code as per the
above.

    3133:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
    313a:       49 8b 4d 70             mov    0x70(%r13),%rcx
    313e:       bb 01 00 00 00          mov    $0x1,%ebx
    3143:       48 29 c1                sub    %rax,%rcx
    3146:       49 0f 48 cf             cmovs  %r15,%rcx
    314a:       48 69 c0 82 45 ff ff    imul   $0xffffffffffff4582,%rax,%rax
    3151:       49 89 4d 70             mov    %rcx,0x70(%r13)

This ensures the negative value never hits memory and allows the
unserialized use.
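
Illustration only, not part of the patch: a minimal userspace sketch of
the difference (helper names made up for the example). The first helper
mirrors the old max_t() pattern, where nothing stops the compiler from
storing the underflowed intermediate before the clamp (which, per the
asm above, is exactly what happened here), so a lockless reader can
observe the transient value. The second mirrors sub_positive(): compute
in a local, clamp, publish once.

	#include <stdio.h>

	/* Old style: update *ptr in place, clamp afterwards; the underflowed
	 * intermediate may be stored before the clamp runs. */
	static void sub_clamp_in_place(unsigned long *ptr, long val)
	{
		*ptr -= val;			/* may hit memory underflowed */
		if ((long)*ptr < 0)
			*ptr = 0;
	}

	/* New style: clamp in a local temporary, store only the final value. */
	static void sub_positive_sketch(unsigned long *ptr, long val)
	{
		long tmp = (long)*ptr - val;

		if (tmp < 0)
			tmp = 0;
		*ptr = (unsigned long)tmp;	/* only the clamped value is stored */
	}

	int main(void)
	{
		unsigned long a = 100, b = 100;

		sub_clamp_in_place(&a, 150);
		sub_positive_sketch(&b, 150);
		/* Both end up at 0; the difference is what a concurrent
		 * reader could have seen in between. */
		printf("in place: %lu, via local: %lu\n", a, b);
		return 0;
	}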

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f75930bdd326..3fd3d903e6b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,6 +2878,20 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Explicitly do a load-store to ensure the temporary value never hits memory.
+ * This allows lockless observations without ever seeing the negative values.
+ *
+ * Incidentally, this also generates much saner code for x86.
+ */
+#define sub_positive(type, ptr, val) do {			\
+	type tmp = READ_ONCE(*ptr);				\
+	tmp -= (val);						\
+	if (tmp < 0)						\
+		tmp = 0;					\
+	WRITE_ONCE(*ptr, tmp);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2887,15 +2901,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->load_avg, r);
+		sub_positive(s64,  &sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(long, &sa->util_avg, r);
+		sub_positive(s32,  &sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2968,10 +2982,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(long, &cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(s64,  &cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(long, &cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(s32,  &cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
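
Purely as a reference for reading the patch: roughly what one of the
converted call sites, sub_positive(long, &sa->load_avg, r), expands to.
This shows the shape of the update, not exact preprocessor output.

	do {
		long tmp = READ_ONCE(sa->load_avg);	/* one load of the current value */
		tmp -= r;				/* may go negative, but only in the local */
		if (tmp < 0)
			tmp = 0;			/* clamp before anything is published */
		WRITE_ONCE(sa->load_avg, tmp);		/* one store of the clamped value */
	} while (0);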
