Re: [BUG nohz]: wrong user and system time accounting

From: Wanpeng Li <kernellwp@gmail.com>
To: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Mike Galbraith <efault@gmx.de>, Rik van Riel <riel@redhat.com>,
	Luiz Capitulino <lcapitulino@redhat.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Paolo Bonzini <pbonzini@redhat.com>
Subject: Re: [BUG nohz]: wrong user and system time accounting
Date: Tue, 2 May 2017 18:01:46 +0800	[thread overview]
Message-ID: <CANRm+CwHObjZZ4M=_kMYN9fUg2SEZzmfYitVjRdBs4eZyueyHA@mail.gmail.com> (raw)
In-Reply-To: <20170413133255.GA2608@lerouge>

Cc Paolo,
2017-04-13 21:32 GMT+08:00 Frederic Weisbecker <fweisbec@gmail.com>:
> On Thu, Apr 13, 2017 at 12:31:12PM +0800, Wanpeng Li wrote:
>> 2017-04-12 22:57 GMT+08:00 Thomas Gleixner <tglx@linutronix.de>:
>> > On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>> >> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>> >> > It's not different from the current jiffies based stuff at all. Same
>> >> > failure mode.
>> >>
>> >> Yes you're right, I got confused again. So to fix this we could do our snapshots
>> >> at a frequency lower than HZ but still high enough to avoid overhead.
>> >>
>> >> Something like TICK_NSEC / 2 ?
>> >
>> > If you are using TSC anyway then you can do proper accumulation for both
>> > system and user and only account the data when the accumulation is more
>> > than a jiffie.
>>
>> So I implement it as below:
>>
>> - HZ=1000.
>>   1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>   2) Luzi's testcase, ~95% user time, ~5% idle time (as we expected)
>> - HZ=250
>>    1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>    2) Luzi's testcase, 100% idle
>>
>> So the codes below still not work correctly for HZ=250, any suggestions?
>
> Right, so first lets reorder that code a bit so we can see clear inside :-)
>
>>
>> -------------------------------------->8-----------------------------------------------------
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d67eee8..6a11771 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -668,6 +668,8 @@ struct task_struct {
>>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>>      seqcount_t            vtime_seqcount;
>>      unsigned long long        vtime_snap;
>> +    u64                vtime_acct_utime;
>> +    u64                vtime_acct_stime;
>
> You need to accumulate guest and steal time as well.
>

Hi Frederic,

Sorry for the delay since I'm too busy recently, I just add guest time
and idle time accumulations as below, the code work as we expected for
native kernel, however, the testcase fails when it runs in kvm guest.
Top shows ~99% sys for Luzi's testcase "./acct-bug 1 995" which we
expect 95% user  and %5 idle. In addition, what's the design idea of
steal time accumluation in your mind? Pass the tsk parameter in the
function get_vtime_delta() down to the function
steal_account_process_time()?

-------------------------------------->8-----------------------------------------------------

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cf9a59..56815cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -672,6 +672,10 @@ struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
     seqcount_t            vtime_seqcount;
     unsigned long long        vtime_snap;
+    u64                vtime_acct_utime;
+    u64                vtime_acct_stime;
+    u64                vtime_acct_idle_time;
+    u64                vtime_acct_guest_time;
     enum {
         /* Task is sleeping or running in a CPU with VTIME inactive: */
         VTIME_INACTIVE = 0,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..2d950c6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -676,18 +676,19 @@ void thread_group_cputime_adjusted(struct
task_struct *p, u64 *ut, u64 *st)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static u64 vtime_delta(struct task_struct *tsk)
 {
-    unsigned long now = READ_ONCE(jiffies);
+    unsigned long long clock;

-    if (time_before(now, (unsigned long)tsk->vtime_snap))
+    clock = sched_clock();
+    if (clock < tsk->vtime_snap)
         return 0;

-    return jiffies_to_nsecs(now - tsk->vtime_snap);
+    return clock - tsk->vtime_snap;
 }

 static u64 get_vtime_delta(struct task_struct *tsk)
 {
-    unsigned long now = READ_ONCE(jiffies);
-    u64 delta, other;
+    u64 delta = vtime_delta(tsk);
+    u64 other;

     /*
      * Unlike tick based timing, vtime based timing never has lost
@@ -696,17 +697,16 @@ static u64 get_vtime_delta(struct task_struct *tsk)
      * elapsed time. Limit account_other_time to prevent rounding
      * errors from causing elapsed vtime to go negative.
      */
-    delta = jiffies_to_nsecs(now - tsk->vtime_snap);
     other = account_other_time(delta);
     WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-    tsk->vtime_snap = now;
+    tsk->vtime_snap += delta;

     return delta - other;
 }

 static void __vtime_account_system(struct task_struct *tsk)
 {
-    account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+    account_system_time(tsk, irq_count(), tsk->vtime_acct_stime);
 }

 void vtime_account_system(struct task_struct *tsk)
@@ -715,7 +715,11 @@ void vtime_account_system(struct task_struct *tsk)
         return;

     write_seqcount_begin(&tsk->vtime_seqcount);
-    __vtime_account_system(tsk);
+    tsk->vtime_acct_stime += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_stime >= TICK_NSEC) {
+        __vtime_account_system(tsk);
+        tsk->vtime_acct_stime = 0;
+    }
     write_seqcount_end(&tsk->vtime_seqcount);
 }

@@ -723,16 +727,22 @@ void vtime_account_user(struct task_struct *tsk)
 {
     write_seqcount_begin(&tsk->vtime_seqcount);
     tsk->vtime_snap_whence = VTIME_SYS;
-    if (vtime_delta(tsk))
-        account_user_time(tsk, get_vtime_delta(tsk));
+    tsk->vtime_acct_utime += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_utime >= TICK_NSEC) {
+        account_user_time(tsk, tsk->vtime_acct_utime);
+        tsk->vtime_acct_utime = 0;
+    }
     write_seqcount_end(&tsk->vtime_seqcount);
 }

 void vtime_user_enter(struct task_struct *tsk)
 {
     write_seqcount_begin(&tsk->vtime_seqcount);
-    if (vtime_delta(tsk))
+    tsk->vtime_acct_stime += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_stime >= TICK_NSEC) {
         __vtime_account_system(tsk);
+        tsk->vtime_acct_stime = 0;
+    }
     tsk->vtime_snap_whence = VTIME_USER;
     write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -747,8 +757,11 @@ void vtime_guest_enter(struct task_struct *tsk)
      * that can thus safely catch up with a tickless delta.
      */
     write_seqcount_begin(&tsk->vtime_seqcount);
-    if (vtime_delta(tsk))
+    tsk->vtime_acct_stime += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_stime >= TICK_NSEC) {
         __vtime_account_system(tsk);
+        tsk->vtime_acct_stime = 0;
+    }
     current->flags |= PF_VCPU;
     write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -757,7 +770,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_enter);
 void vtime_guest_exit(struct task_struct *tsk)
 {
     write_seqcount_begin(&tsk->vtime_seqcount);
-    __vtime_account_system(tsk);
+    tsk->vtime_acct_stime += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_stime >= TICK_NSEC) {
+        __vtime_account_system(tsk);
+        tsk->vtime_acct_stime = 0;
+    }
     current->flags &= ~PF_VCPU;
     write_seqcount_end(&tsk->vtime_seqcount);
 }
@@ -765,7 +782,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);

 void vtime_account_idle(struct task_struct *tsk)
 {
-    account_idle_time(get_vtime_delta(tsk));
+    tsk->vtime_acct_idle_time += get_vtime_delta(tsk);
+    if (tsk->vtime_acct_idle_time >= TICK_NSEC) {
+        account_idle_time(tsk->vtime_acct_idle_time);
+        tsk->vtime_acct_idle_time = 0;
+    }
 }

 void arch_vtime_task_switch(struct task_struct *prev)
@@ -776,7 +797,7 @@ void arch_vtime_task_switch(struct task_struct *prev)

     write_seqcount_begin(&current->vtime_seqcount);
     current->vtime_snap_whence = VTIME_SYS;
-    current->vtime_snap = jiffies;
+    current->vtime_snap = sched_clock_cpu(smp_processor_id());
     write_seqcount_end(&current->vtime_seqcount);
 }

@@ -787,7 +808,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
     local_irq_save(flags);
     write_seqcount_begin(&t->vtime_seqcount);
     t->vtime_snap_whence = VTIME_SYS;
-    t->vtime_snap = jiffies;
+    t->vtime_snap = sched_clock_cpu(cpu);
     write_seqcount_end(&t->vtime_seqcount);
     local_irq_restore(flags);
 }

Regards,
Wanpeng Li