From: Wanpeng Li
Date: Tue, 2 May 2017 18:01:46 +0800
Subject: Re: [BUG nohz]: wrong user and system time accounting
To: Frederic Weisbecker
Cc: Thomas Gleixner, Mike Galbraith, Rik van Riel, Luiz Capitulino, "linux-kernel@vger.kernel.org", Peter Zijlstra, Paolo Bonzini
In-Reply-To: <20170413133255.GA2608@lerouge>

Cc Paolo,

2017-04-13 21:32 GMT+08:00 Frederic Weisbecker:
> On Thu, Apr 13, 2017 at 12:31:12PM +0800, Wanpeng Li wrote:
>> 2017-04-12 22:57 GMT+08:00 Thomas Gleixner:
>> > On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>> >> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>> >> > It's not different from the current jiffies based stuff at all. Same
>> >> > failure mode.
>> >>
>> >> Yes, you're right, I got confused again. So to fix this we could take our snapshots
>> >> at a frequency lower than HZ but still high enough to avoid overhead.
>> >>
>> >> Something like TICK_NSEC / 2 ?
>> >
>> > If you are using the TSC anyway, then you can do proper accumulation for both
>> > system and user time and only account the data when the accumulation is more
>> > than a jiffy.
>>
>> So I implemented it as below:
>>
>> - HZ=1000
>>   1) two cpu hogs on a CPU in nohz_full mode: 100% user time
>>   2) Luiz's testcase: ~95% user time, ~5% idle time (as expected)
>> - HZ=250
>>   1) two cpu hogs on a CPU in nohz_full mode: 100% user time
>>   2) Luiz's testcase: 100% idle
>>
>> So the code below still does not work correctly for HZ=250; any suggestions?
>
> Right, so first let's reorder that code a bit so we can see clearly inside :-)
>
>>
>> -------------------------------------->8-----------------------------------------------------
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d67eee8..6a11771 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -668,6 +668,8 @@ struct task_struct {
>>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>>          seqcount_t vtime_seqcount;
>>          unsigned long long vtime_snap;
>> +        u64 vtime_acct_utime;
>> +        u64 vtime_acct_stime;
>
> You need to accumulate guest and steal time as well.

Hi Frederic,

Sorry for the delayed reply; I have been very busy recently. I have now added guest time and idle time accumulation, as in the patch after the scissors line below. It works as expected on a native kernel; however, the testcase fails when it runs in a KVM guest: top shows ~99% sys for Luiz's testcase "./acct-bug 1 995", where we expect ~95% user and ~5% idle.

In addition, what design do you have in mind for steal time accumulation? Passing the tsk parameter from get_vtime_delta() down to steal_account_process_time()?
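To make that last question concrete, here is a rough, untested sketch of the kind of thing I am imagining. The vtime_acct_steal_time field and the vtime_steal_delta() helper are made up for illustration and do not exist in the tree; upstream only has steal_account_process_time(), which accounts the steal time immediately rather than returning a raw delta:

static u64 get_vtime_delta(struct task_struct *tsk)
{
        u64 delta = vtime_delta(tsk);
        u64 steal;

        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
        tsk->vtime_snap += delta;

        /*
         * Accumulate steal time per task, like the other buckets, and
         * only account it once a full tick worth has built up. Steal
         * time can never exceed the elapsed vtime delta.
         */
        steal = min(vtime_steal_delta(), delta);        /* hypothetical helper */
        tsk->vtime_acct_steal_time += steal;            /* hypothetical field */
        if (tsk->vtime_acct_steal_time >= TICK_NSEC) {
                account_steal_time(tsk->vtime_acct_steal_time);
                tsk->vtime_acct_steal_time = 0;
        }

        /* Don't charge stolen time to the task's user/system buckets. */
        return delta - steal;
}

Or would you rather keep the steal handling inside steal_account_process_time() and just pass tsk down so it can accumulate there? Either way the irq time part of account_other_time() still needs handling, which the sketch above ignores.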
-------------------------------------->8-----------------------------------------------------

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cf9a59..56815cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -672,6 +672,10 @@ struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
         seqcount_t vtime_seqcount;
         unsigned long long vtime_snap;
+        u64 vtime_acct_utime;
+        u64 vtime_acct_stime;
+        u64 vtime_acct_idle_time;
+        u64 vtime_acct_guest_time;
         enum {
                 /* Task is sleeping or running in a CPU with VTIME inactive: */
                 VTIME_INACTIVE = 0,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..2d950c6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -676,18 +676,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static u64 vtime_delta(struct task_struct *tsk)
 {
-        unsigned long now = READ_ONCE(jiffies);
+        unsigned long long clock;
 
-        if (time_before(now, (unsigned long)tsk->vtime_snap))
+        clock = sched_clock();
+        if (clock < tsk->vtime_snap)
                 return 0;
 
-        return jiffies_to_nsecs(now - tsk->vtime_snap);
+        return clock - tsk->vtime_snap;
 }
 
 static u64 get_vtime_delta(struct task_struct *tsk)
 {
-        unsigned long now = READ_ONCE(jiffies);
-        u64 delta, other;
+        u64 delta = vtime_delta(tsk);
+        u64 other;
 
         /*
          * Unlike tick based timing, vtime based timing never has lost
@@ -696,17 +697,16 @@ static u64 get_vtime_delta(struct task_struct *tsk)
          * elapsed time. Limit account_other_time to prevent rounding
          * errors from causing elapsed vtime to go negative.
          */
-        delta = jiffies_to_nsecs(now - tsk->vtime_snap);
         other = account_other_time(delta);
         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-        tsk->vtime_snap = now;
+        tsk->vtime_snap += delta;
 
         return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
 {
-        account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+        account_system_time(tsk, irq_count(), tsk->vtime_acct_stime);
 }
 
 void vtime_account_system(struct task_struct *tsk)
@@ -715,7 +715,11 @@ void vtime_account_system(struct task_struct *tsk)
                 return;
 
         write_seqcount_begin(&tsk->vtime_seqcount);
-        __vtime_account_system(tsk);
+        tsk->vtime_acct_stime += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_stime >= TICK_NSEC) {
+                __vtime_account_system(tsk);
+                tsk->vtime_acct_stime = 0;
+        }
         write_seqcount_end(&tsk->vtime_seqcount);
 }
 
@@ -723,16 +727,22 @@ void vtime_account_user(struct task_struct *tsk)
 {
         write_seqcount_begin(&tsk->vtime_seqcount);
         tsk->vtime_snap_whence = VTIME_SYS;
-        if (vtime_delta(tsk))
-                account_user_time(tsk, get_vtime_delta(tsk));
+        tsk->vtime_acct_utime += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_utime >= TICK_NSEC) {
+                account_user_time(tsk, tsk->vtime_acct_utime);
+                tsk->vtime_acct_utime = 0;
+        }
         write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
         write_seqcount_begin(&tsk->vtime_seqcount);
-        if (vtime_delta(tsk))
+        tsk->vtime_acct_stime += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_stime >= TICK_NSEC) {
                 __vtime_account_system(tsk);
+                tsk->vtime_acct_stime = 0;
+        }
         tsk->vtime_snap_whence = VTIME_USER;
         write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -747,8 +757,11 @@ void vtime_guest_enter(struct task_struct *tsk)
          * that can thus safely catch up with a tickless delta.
          */
         write_seqcount_begin(&tsk->vtime_seqcount);
-        if (vtime_delta(tsk))
+        tsk->vtime_acct_stime += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_stime >= TICK_NSEC) {
                 __vtime_account_system(tsk);
+                tsk->vtime_acct_stime = 0;
+        }
         current->flags |= PF_VCPU;
         write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -757,7 +770,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_enter);
 void vtime_guest_exit(struct task_struct *tsk)
 {
         write_seqcount_begin(&tsk->vtime_seqcount);
-        __vtime_account_system(tsk);
+        tsk->vtime_acct_stime += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_stime >= TICK_NSEC) {
+                __vtime_account_system(tsk);
+                tsk->vtime_acct_stime = 0;
+        }
         current->flags &= ~PF_VCPU;
         write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -765,7 +782,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-        account_idle_time(get_vtime_delta(tsk));
+        tsk->vtime_acct_idle_time += get_vtime_delta(tsk);
+        if (tsk->vtime_acct_idle_time >= TICK_NSEC) {
+                account_idle_time(tsk->vtime_acct_idle_time);
+                tsk->vtime_acct_idle_time = 0;
+        }
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
@@ -776,7 +797,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
         write_seqcount_begin(&current->vtime_seqcount);
         current->vtime_snap_whence = VTIME_SYS;
-        current->vtime_snap = jiffies;
+        current->vtime_snap = sched_clock_cpu(smp_processor_id());
         write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -787,7 +808,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
         local_irq_save(flags);
         write_seqcount_begin(&t->vtime_seqcount);
         t->vtime_snap_whence = VTIME_SYS;
-        t->vtime_snap = jiffies;
+        t->vtime_snap = sched_clock_cpu(cpu);
         write_seqcount_end(&t->vtime_seqcount);
         local_irq_restore(flags);
 }

Regards,
Wanpeng Li