From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1751876AbcFINIV (ORCPT <rfc822;w@1wt.eu>);
	Thu, 9 Jun 2016 09:08:21 -0400
Received: from merlin.infradead.org ([205.233.59.134]:59337 "EHLO
	merlin.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1750810AbcFINIU (ORCPT
	<rfc822;linux-kernel@vger.kernel.org>);
	Thu, 9 Jun 2016 09:08:20 -0400
Date: Thu, 9 Jun 2016 15:07:50 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: Yuyang Du <yuyang.du@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>,
        Andrey Ryabinin <aryabinin@virtuozzo.com>,
        Linus Torvalds <torvalds@linux-foundation.org>,
        Mike Galbraith <efault@gmx.de>, Thomas Gleixner <tglx@linutronix.de>,
        bsegall@google.com, morten.rasmussen@arm.com, pjt@google.com,
        steve.muckle@linaro.org, linux-kernel@vger.kernel.org
Subject: Re: Divide-by-zero in post_init_entity_util_avg
Message-ID: <20160609130750.GQ30909@twins.programming.kicks-ass.net>
References: <20160609090142.GS32344@nuc-i3427.alporthouse.com>
 <20160609013324.GH8105@intel.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20160609013324.GH8105@intel.com>
User-Agent: Mutt/1.5.23.1 (2014-03-12)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

Chris Wilson reported a divide by 0 at:

post_init_entity_util_avg():

>    725			if (cfs_rq->avg.util_avg != 0) {
>    726				sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
> -> 727				sa->util_avg /= (cfs_rq->avg.load_avg + 1);
>    728	
>    729				if (sa->util_avg > cap)
>    730					sa->util_avg = cap;
>    731			} else {

Which given the lack of serialization, and the code generated from
update_cfs_rq_load_avg() is entirely possible.

	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
		sa->load_avg = max_t(long, sa->load_avg - r, 0);
		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
		removed_load = 1;
	}

turns into:

ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
ffffffff8108706b:       48 85 c0                test   %rax,%rax
ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
ffffffff81087070:       4c 89 f8                mov    %r15,%rax
ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx

Which you'll note ends up with sa->load_avg -= r in memory at
ffffffff8108707a.

Ludicrous code generation if you ask me; I'd have expected something
like (note, r15 holds 0):

	mov	%r15, %rax
	xchg	%rax, cfs_rq->removed_load_avg
	mov	sa->load_avg, %rcx
	sub	%rax, %rcx
	cmovs	%r15, %rcx
	mov	%rcx, sa->load_avg

Adding the serialization (to _both_ call sites) should fix this.

Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value")
Reported-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c | 3 +--
 kernel/sched/fair.c | 8 +++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 385c947482e1..4aff10e3bd14 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c6dd8bab010c..f379da14c7dd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8468,8 +8468,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8484,6 +8485,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8497,7 +8500,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;