All of lore.kernel.org
 help / color / mirror / Atom feed
* [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
@ 2013-11-16 21:37 Christian Engelmayer
  2013-11-18 14:02 ` Stanislaw Gruszka
  0 siblings, 1 reply; 9+ messages in thread
From: Christian Engelmayer @ 2013-11-16 21:37 UTC (permalink / raw)
  To: Ingo Molnar, Stanislaw Gruszka; +Cc: linux-kernel

Since upgrading from v3.8 to v3.12 I see random crashes in function scale_stime()
in kernel/sched/cputime.c:

   divide error: 0000 [#1] SMP 
   Modules linked in: parport_pc(F) ppdev(F) bnep rfcomm bluetooth binfmt_misc(F)
   zl10353 cx88_dvb cx88_vp3054_i2c videobuf_dvb dvb_core intel_powerclamp coretemp
   kvm_intel(F) tuner_xc2028 kvm(F) i915 snd_hda_codec_hdmi snd_hda_codec_realtek
   cx8800 cx8802 tuner snd_hda_intel snd_hda_codec cx88_alsa crct10dif_pclmul(F)
   crc32_pclmul(F) snd_hwdep(F) snd_pcm(F) snd_page_alloc(F) ghash_clmulni_intel(F)
   aesni_intel(F) snd_seq_midi(F) snd_seq_midi_event(F) snd_rawmidi(F) snd_seq(F)
   joydev(F) cx88xx snd_seq_device(F) snd_timer(F) aes_x86_64(F) lrw(F) gf128mul(F)
   glue_helper(F) video(F) btcx_risc drm_kms_helper ablk_helper(F) tveeprom cryptd(F)
   lp(F) videobuf_dma_sg rc_core drm v4l2_common videobuf_core mei_me parport(F)
   snd(F) mei soundcore(F) videodev i2c_algo_bit serio_raw(F) microcode(F) mac_hid
   lpc_ich asus_atk0110 hid_generic usbhid hid usb_storage(F) firewire_ohci ahci(F)
   libahci(F) firewire_core r8169 crc_itu_t(F) mii(F)
   CPU: 3 PID: 15367 Comm: htop Tainted: GF            3.12.0-031200-generic #201311031935
   Hardware name: System manufacturer System Product Name/P7H55-M PRO, BIOS 1709    01/04/2011
   task: ffff8800cc09e000 ti: ffff8800af620000 task.ti: ffff8800af620000
   RIP: 0010:[<ffffffff81099de0>]  [<ffffffff81099de0>] cputime_adjust+0xf0/0x110
   RSP: 0018:ffff8800af621cc8  EFLAGS: 00010847
   RAX: 85fdc1fef4047c00 RBX: 0000000000000000 RCX: ffff8800af621df8
   RDX: 0000000000000000 RSI: ffff8800cc0634d0 RDI: 0000000000000000
   RBP: ffff8800af621cd8 R08: 00000000fffffffe R09: 0000000000000000
   R10: 0000000000000000 R11: fffffffe03427acc R12: ffff8800af621df0
   R13: ffff8800af621df8 R14: 0000000000000000 R15: ffff8800cc063300
   FS:  00007f22a387d740(0000) GS:ffff880117c60000(0000) knlGS:0000000000000000
   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   CR2: 00007f22a3892000 CR3: 0000000097023000 CR4: 00000000000007e0
   Stack:
    ffff8800c37f0000 ffff8800af621df0 ffff8800af621d18 ffffffff8109aa51
    000000000a7d8c00 0000000042fee100 fffffffe03427acc ffff8800bf112a80
    ffff8800c37f0000 ffff8800c307c280 ffff8800af621e50 ffffffff8121f74b
   Call Trace:
    [<ffffffff8109aa51>] thread_group_cputime_adjusted+0x41/0x50
    [<ffffffff8121f74b>] do_task_stat+0x8eb/0xb60
    [<ffffffff81176400>] ? vma_compute_subtree_gap+0x50/0x50
    [<ffffffff81220314>] proc_tgid_stat+0x14/0x20
    [<ffffffff8121b16d>] proc_single_show+0x4d/0x90
    [<ffffffff811d6eee>] seq_read+0x14e/0x390
    [<ffffffff811b3725>] vfs_read+0x95/0x160
    [<ffffffff811b4239>] SyS_read+0x49/0xa0
    [<ffffffff81723ced>] system_call_fastpath+0x1a/0x1f
   Code: 89 fa 49 c1 ea 20 4d 85 d2 74 ca 4c 89 c2 48 d1 ef 49 89 c0 48 d1 ea 48
         89 d0 eb 9f 0f 1f 80 00 00 00 00 89 c0 31 d2 49 0f af c0 <48> f7 f7 4c
         89 df 48 29 c7 49 89 c3 e9 31 ff ff ff 66 66 66 66 
   RIP  [<ffffffff81099de0>] cputime_adjust+0xf0/0x110
    RSP <ffff8800af621cc8>
   ---[ end trace dbafd2159a385dd6 ]---

The affected LOC performing the division by 0 was introduced in commit

   commit 55eaa7c1f511af5fb6ef808b5328804f4d4e5243
   Author: Stanislaw Gruszka <sgruszka@redhat.com>
   Date:   Tue Apr 30 17:14:42 2013 +0200
   sched: Avoid cputime scaling overflow

For the problem to occur the function is called eg. with the following
input parameters

   stime: 0x3567e00
   rtime: 0xffffffffbf1abfdb
   total: 0x3938700

which leads to 'total' being shifted to 0 during the adaption of the precision
and is then used without further check in

   scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);

The root cause triggering this issue seems to be an overflowed value of

   rtime = nsecs_to_cputime(curr->sum_exec_runtime);

On the affected machine the problem can be triggered by loading the
previously idle system by starting a full kernel build. The problem occurs
within a minute after the ondemand frequency scaling governor adjusts the
frequency from the minimum to the maximum.

The x86 init check whether all booted CPUs have their TSC's synchronized, never
failed so far, however, the tsc clocksource is sporadically marked unstable.

   Clocksource tsc unstable (delta = -74994678 ns)

The used CPU provides an Intel Invariant TSC as stated by CPUID.80000007H:EDX[8]:

    eax in    eax      ebx      ecx      edx
   00000000 0000000b 756e6547 6c65746e 49656e69
   00000001 00020652 04100800 0298e3ff bfebfbff
   00000002 55035a01 00f0b2e3 00000000 09ca212c
   00000003 00000000 00000000 00000000 00000000
   00000004 00000000 00000000 00000000 00000000
   00000005 00000040 00000040 00000003 00001120
   00000006 00000005 00000002 00000001 00000000
   00000007 00000000 00000000 00000000 00000000
   00000008 00000000 00000000 00000000 00000000
   00000009 00000000 00000000 00000000 00000000
   0000000a 07300403 00000004 00000000 00000603
   0000000b 00000000 00000000 0000002c 00000004
   80000000 80000008 00000000 00000000 00000000
   80000001 00000000 00000000 00000001 28100800
   80000002 65746e49 2952286c 726f4320 4d542865
   80000003 35692029 55504320 20202020 20202020
   80000004 30353620 20402020 30322e33 007a4847
   80000005 00000000 00000000 00000000 00000000
   80000006 00000000 00000000 01006040 00000000
   80000007 00000000 00000000 00000000 00000100
   80000008 00003024 00000000 00000000 00000000

   Vendor ID: "GenuineIntel"; CPUID level 11

   Intel-specific functions:
   Version 00020652:
   Type 0 - Original OEM
   Family 6 - Pentium Pro
   Model 5 - Pentium II Model 5/Xeon/Celeron
   Stepping 2
   Reserved 8

   Extended brand string: "Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz"
   CLFLUSH instruction cache line size: 8
   Initial APIC ID: 4
   Hyper threading siblings: 16

   Feature flags bfebfbff:
   FPU    Floating Point Unit
   VME    Virtual 8086 Mode Enhancements
   DE     Debugging Extensions
   PSE    Page Size Extensions
   TSC    Time Stamp Counter
   MSR    Model Specific Registers
   PAE    Physical Address Extension
   MCE    Machine Check Exception
   CX8    COMPXCHG8B Instruction
   APIC   On-chip Advanced Programmable Interrupt Controller present and enabled
   SEP    Fast System Call
   MTRR   Memory Type Range Registers
   PGE    PTE Global Flag
   MCA    Machine Check Architecture
   CMOV   Conditional Move and Compare Instructions
   FGPAT  Page Attribute Table
   PSE-36 36-bit Page Size Extension
   CLFSH  CFLUSH instruction
   DS     Debug store
   ACPI   Thermal Monitor and Clock Ctrl
   MMX    MMX instruction set
   FXSR   Fast FP/MMX Streaming SIMD Extensions save/restore
   SSE    Streaming SIMD Extensions instruction set
   SSE2   SSE2 extensions
   SS     Self Snoop
   HT     Hyper Threading
   TM     Thermal monitor
   31     reserved

Nevertheless, when looking into the issue I saw occurences of sched_clock going
backwards as if the TSCs were read out of sync. Accordingly the problem does
not occur when either booting with option 'nosmp' or when forcing the TSC to be
marked as unstable for sched_clock. Booting with 'acpi=off' and no frequency
scaling works too.

Having a look at the scheduler code I see the following pattern that would also
catch a time warp, eg. kernel/sched/rt.c update_curr_rt():

   u64 delta_exec;
   delta_exec = rq_clock_task(rq) - curr->se.exec_start;
   if (unlikely((s64)delta_exec <= 0))
           return;

However, there are still vulnerable places, eg. kernel/sched/fair.c update_curr():

   /*
    * Get the amount of time the current task was running
    * since the last time we changed load (this cannot
    * overflow on 32 bits):
    */
   delta_exec = (unsigned long)(now - curr->exec_start);
   if (!delta_exec)
           return;

Regards,
Christian

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-16 21:37 [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime() Christian Engelmayer
@ 2013-11-18 14:02 ` Stanislaw Gruszka
  2013-11-18 14:19   ` Peter Zijlstra
  2013-11-18 17:27   ` Peter Zijlstra
  0 siblings, 2 replies; 9+ messages in thread
From: Stanislaw Gruszka @ 2013-11-18 14:02 UTC (permalink / raw)
  To: Christian Engelmayer
  Cc: Ingo Molnar, linux-kernel, Peter Zijlstra, Thomas Gleixner

Hi Christian 

On Sat, Nov 16, 2013 at 10:37:40PM +0100, Christian Engelmayer wrote:
> Since upgrading from v3.8 to v3.12 I see random crashes in function scale_stime()
> in kernel/sched/cputime.c:
> 
>    divide error: 0000 [#1] SMP 
>    Modules linked in: parport_pc(F) ppdev(F) bnep rfcomm bluetooth binfmt_misc(F)
>    zl10353 cx88_dvb cx88_vp3054_i2c videobuf_dvb dvb_core intel_powerclamp coretemp
>    kvm_intel(F) tuner_xc2028 kvm(F) i915 snd_hda_codec_hdmi snd_hda_codec_realtek
>    cx8800 cx8802 tuner snd_hda_intel snd_hda_codec cx88_alsa crct10dif_pclmul(F)
>    crc32_pclmul(F) snd_hwdep(F) snd_pcm(F) snd_page_alloc(F) ghash_clmulni_intel(F)
>    aesni_intel(F) snd_seq_midi(F) snd_seq_midi_event(F) snd_rawmidi(F) snd_seq(F)
>    joydev(F) cx88xx snd_seq_device(F) snd_timer(F) aes_x86_64(F) lrw(F) gf128mul(F)
>    glue_helper(F) video(F) btcx_risc drm_kms_helper ablk_helper(F) tveeprom cryptd(F)
>    lp(F) videobuf_dma_sg rc_core drm v4l2_common videobuf_core mei_me parport(F)
>    snd(F) mei soundcore(F) videodev i2c_algo_bit serio_raw(F) microcode(F) mac_hid
>    lpc_ich asus_atk0110 hid_generic usbhid hid usb_storage(F) firewire_ohci ahci(F)
>    libahci(F) firewire_core r8169 crc_itu_t(F) mii(F)
>    CPU: 3 PID: 15367 Comm: htop Tainted: GF            3.12.0-031200-generic #201311031935
>    Hardware name: System manufacturer System Product Name/P7H55-M PRO, BIOS 1709    01/04/2011
>    task: ffff8800cc09e000 ti: ffff8800af620000 task.ti: ffff8800af620000
>    RIP: 0010:[<ffffffff81099de0>]  [<ffffffff81099de0>] cputime_adjust+0xf0/0x110
>    RSP: 0018:ffff8800af621cc8  EFLAGS: 00010847
>    RAX: 85fdc1fef4047c00 RBX: 0000000000000000 RCX: ffff8800af621df8
>    RDX: 0000000000000000 RSI: ffff8800cc0634d0 RDI: 0000000000000000
>    RBP: ffff8800af621cd8 R08: 00000000fffffffe R09: 0000000000000000
>    R10: 0000000000000000 R11: fffffffe03427acc R12: ffff8800af621df0
>    R13: ffff8800af621df8 R14: 0000000000000000 R15: ffff8800cc063300
>    FS:  00007f22a387d740(0000) GS:ffff880117c60000(0000) knlGS:0000000000000000
>    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>    CR2: 00007f22a3892000 CR3: 0000000097023000 CR4: 00000000000007e0
>    Stack:
>     ffff8800c37f0000 ffff8800af621df0 ffff8800af621d18 ffffffff8109aa51
>     000000000a7d8c00 0000000042fee100 fffffffe03427acc ffff8800bf112a80
>     ffff8800c37f0000 ffff8800c307c280 ffff8800af621e50 ffffffff8121f74b
>    Call Trace:
>     [<ffffffff8109aa51>] thread_group_cputime_adjusted+0x41/0x50
>     [<ffffffff8121f74b>] do_task_stat+0x8eb/0xb60
>     [<ffffffff81176400>] ? vma_compute_subtree_gap+0x50/0x50
>     [<ffffffff81220314>] proc_tgid_stat+0x14/0x20
>     [<ffffffff8121b16d>] proc_single_show+0x4d/0x90
>     [<ffffffff811d6eee>] seq_read+0x14e/0x390
>     [<ffffffff811b3725>] vfs_read+0x95/0x160
>     [<ffffffff811b4239>] SyS_read+0x49/0xa0
>     [<ffffffff81723ced>] system_call_fastpath+0x1a/0x1f
>    Code: 89 fa 49 c1 ea 20 4d 85 d2 74 ca 4c 89 c2 48 d1 ef 49 89 c0 48 d1 ea 48
>          89 d0 eb 9f 0f 1f 80 00 00 00 00 89 c0 31 d2 49 0f af c0 <48> f7 f7 4c
>          89 df 48 29 c7 49 89 c3 e9 31 ff ff ff 66 66 66 66 
>    RIP  [<ffffffff81099de0>] cputime_adjust+0xf0/0x110
>     RSP <ffff8800af621cc8>
>    ---[ end trace dbafd2159a385dd6 ]---
> 
> The affected LOC performing the division by 0 was introduced in commit
> 
>    commit 55eaa7c1f511af5fb6ef808b5328804f4d4e5243
>    Author: Stanislaw Gruszka <sgruszka@redhat.com>
>    Date:   Tue Apr 30 17:14:42 2013 +0200
>    sched: Avoid cputime scaling overflow
> 
> For the problem to occur the function is called eg. with the following
> input parameters
> 
>    stime: 0x3567e00
>    rtime: 0xffffffffbf1abfdb
>    total: 0x3938700
> 
> which leads to 'total' being shifted to 0 during the adaption of the precision
> and is then used without further check in
> 
>    scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
> 
> The root cause triggering this issue seems to be an overflowed value of
> 
>    rtime = nsecs_to_cputime(curr->sum_exec_runtime);
> 
> On the affected machine the problem can be triggered by loading the
> previously idle system by starting a full kernel build. The problem occurs
> within a minute after the ondemand frequency scaling governor adjusts the
> frequency from the minimum to the maximum.
> 
> The x86 init check whether all booted CPUs have their TSC's synchronized, never
> failed so far, however, the tsc clocksource is sporadically marked unstable.
> 
>    Clocksource tsc unstable (delta = -74994678 ns)
> 
> The used CPU provides an Intel Invariant TSC as stated by CPUID.80000007H:EDX[8]:
> 
>     eax in    eax      ebx      ecx      edx
>    00000000 0000000b 756e6547 6c65746e 49656e69
>    00000001 00020652 04100800 0298e3ff bfebfbff
>    00000002 55035a01 00f0b2e3 00000000 09ca212c
>    00000003 00000000 00000000 00000000 00000000
>    00000004 00000000 00000000 00000000 00000000
>    00000005 00000040 00000040 00000003 00001120
>    00000006 00000005 00000002 00000001 00000000
>    00000007 00000000 00000000 00000000 00000000
>    00000008 00000000 00000000 00000000 00000000
>    00000009 00000000 00000000 00000000 00000000
>    0000000a 07300403 00000004 00000000 00000603
>    0000000b 00000000 00000000 0000002c 00000004
>    80000000 80000008 00000000 00000000 00000000
>    80000001 00000000 00000000 00000001 28100800
>    80000002 65746e49 2952286c 726f4320 4d542865
>    80000003 35692029 55504320 20202020 20202020
>    80000004 30353620 20402020 30322e33 007a4847
>    80000005 00000000 00000000 00000000 00000000
>    80000006 00000000 00000000 01006040 00000000
>    80000007 00000000 00000000 00000000 00000100
>    80000008 00003024 00000000 00000000 00000000
> 
>    Vendor ID: "GenuineIntel"; CPUID level 11
> 
>    Intel-specific functions:
>    Version 00020652:
>    Type 0 - Original OEM
>    Family 6 - Pentium Pro
>    Model 5 - Pentium II Model 5/Xeon/Celeron
>    Stepping 2
>    Reserved 8
> 
>    Extended brand string: "Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz"
>    CLFLUSH instruction cache line size: 8
>    Initial APIC ID: 4
>    Hyper threading siblings: 16
> 
>    Feature flags bfebfbff:
>    FPU    Floating Point Unit
>    VME    Virtual 8086 Mode Enhancements
>    DE     Debugging Extensions
>    PSE    Page Size Extensions
>    TSC    Time Stamp Counter
>    MSR    Model Specific Registers
>    PAE    Physical Address Extension
>    MCE    Machine Check Exception
>    CX8    COMPXCHG8B Instruction
>    APIC   On-chip Advanced Programmable Interrupt Controller present and enabled
>    SEP    Fast System Call
>    MTRR   Memory Type Range Registers
>    PGE    PTE Global Flag
>    MCA    Machine Check Architecture
>    CMOV   Conditional Move and Compare Instructions
>    FGPAT  Page Attribute Table
>    PSE-36 36-bit Page Size Extension
>    CLFSH  CFLUSH instruction
>    DS     Debug store
>    ACPI   Thermal Monitor and Clock Ctrl
>    MMX    MMX instruction set
>    FXSR   Fast FP/MMX Streaming SIMD Extensions save/restore
>    SSE    Streaming SIMD Extensions instruction set
>    SSE2   SSE2 extensions
>    SS     Self Snoop
>    HT     Hyper Threading
>    TM     Thermal monitor
>    31     reserved
> 
> Nevertheless, when looking into the issue I saw occurences of sched_clock going
> backwards as if the TSCs were read out of sync. Accordingly the problem does
> not occur when either booting with option 'nosmp' or when forcing the TSC to be
> marked as unstable for sched_clock. Booting with 'acpi=off' and no frequency
> scaling works too.
> 
> Having a look at the scheduler code I see the following pattern that would also
> catch a time warp, eg. kernel/sched/rt.c update_curr_rt():
> 
>    u64 delta_exec;
>    delta_exec = rq_clock_task(rq) - curr->se.exec_start;
>    if (unlikely((s64)delta_exec <= 0))
>            return;
> 
> However, there are still vulnerable places, eg. kernel/sched/fair.c update_curr():
> 
>    /*
>     * Get the amount of time the current task was running
>     * since the last time we changed load (this cannot
>     * overflow on 32 bits):
>     */
>    delta_exec = (unsigned long)(now - curr->exec_start);
>    if (!delta_exec)
>            return;

Thanks for great analyse. I'm not sure where this problem should be
fixed (in TSC clocksource or sched/fair.c or sched/cputime.c), however
since RT scheduler is protected for sched_clock going backward, we
probably can protect fair scheduler as well (also do_task_delta_exec()
has similar check).

Does the below patch fixes the issue on affected machine?

Stanislaw

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201..f02a567 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -727,7 +727,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	unsigned long delta_exec;
 
-	if (unlikely(!curr))
+	if (unlikely(!curr || now <= curr->exec_start))
 		return;
 
 	/*
@@ -736,8 +736,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	 * overflow on 32 bits):
 	 */
 	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
-		return;
 
 	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-18 14:02 ` Stanislaw Gruszka
@ 2013-11-18 14:19   ` Peter Zijlstra
  2013-11-18 15:39     ` Ingo Molnar
  2013-11-18 17:27   ` Peter Zijlstra
  1 sibling, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2013-11-18 14:19 UTC (permalink / raw)
  To: Stanislaw Gruszka
  Cc: Christian Engelmayer, Ingo Molnar, linux-kernel, Thomas Gleixner

On Mon, Nov 18, 2013 at 03:02:24PM +0100, Stanislaw Gruszka wrote:
> > The x86 init check whether all booted CPUs have their TSC's synchronized, never
> > failed so far, however, the tsc clocksource is sporadically marked unstable.
> > 
> >    Clocksource tsc unstable (delta = -74994678 ns)

> >    Version 00020652:
> >    Type 0 - Original OEM
> >    Family 6 - Pentium Pro
> >    Model 5 - Pentium II Model 5/Xeon/Celeron
> >    Extended brand string: "Intel(R) Core(TM) i5 CPU         650  @ 3.20GHz"

I'm not sure what tool you used to generate that, but its broken, that's
model 0x25 (37), it somehow truncates the upper model bits.

That said, its a westmere core and I've seen wsm-ep (dual socket)
machines loose their TSC sync quite regularly, but this would be the
first case a single socket wsm would loose its TSC sync.

That leads me to believe your BIOS is screwing you over with SMIs or the
like.

I would be tempted to say you should simply mark the tsc unstable on
boot and live with that -- we fully assume the sched_clock stuff is not
going backwards in an 'observable' way.

That said, it might be nice to not crash either.. 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-18 14:19   ` Peter Zijlstra
@ 2013-11-18 15:39     ` Ingo Molnar
  0 siblings, 0 replies; 9+ messages in thread
From: Ingo Molnar @ 2013-11-18 15:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stanislaw Gruszka, Christian Engelmayer, Ingo Molnar,
	linux-kernel, Thomas Gleixner


* Peter Zijlstra <peterz@infradead.org> wrote:

> I would be tempted to say you should simply mark the tsc unstable on 
> boot and live with that -- we fully assume the sched_clock stuff is 
> not going backwards in an 'observable' way.

BIOS crap and actual hardware bugs do happen - so kernel code needs to 
consider TSC input with a pinch of salt, assuming that it's untrusted 
external data.

> That said, it might be nice to not crash either..

Indeed, not crashing on weird TSC input is absolutely required!

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-18 14:02 ` Stanislaw Gruszka
  2013-11-18 14:19   ` Peter Zijlstra
@ 2013-11-18 17:27   ` Peter Zijlstra
  2013-11-20 12:08     ` Stanislaw Gruszka
  2013-12-12  9:52     ` [tip:sched/urgent] sched/fair: " tip-bot for Peter Zijlstra
  1 sibling, 2 replies; 9+ messages in thread
From: Peter Zijlstra @ 2013-11-18 17:27 UTC (permalink / raw)
  To: Stanislaw Gruszka
  Cc: Christian Engelmayer, Ingo Molnar, linux-kernel, Thomas Gleixner,
	fweisbec, Paul Turner

On Mon, Nov 18, 2013 at 03:02:24PM +0100, Stanislaw Gruszka wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7c70201..f02a567 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -727,7 +727,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	u64 now = rq_clock_task(rq_of(cfs_rq));
>  	unsigned long delta_exec;
>  
> -	if (unlikely(!curr))
> +	if (unlikely(!curr || now <= curr->exec_start))
>  		return;

That is not actually correct in the case time wraps.

There's a further problem with this code though -- ever since Frederic
added NO_HZ_FULL a CPU can in fact aggregate a runtime delta larger than
4 seconds, due to running without a tick.

Therefore we need to be able to deal with u64 deltas.

The below is a compile tested only attempt to deal with both these
problems. Comments?

---
 arch/x86/Kconfig       |   1 +
 include/linux/math64.h |  32 +++++++++++
 init/Kconfig           |   6 +++
 kernel/sched/fair.c    | 144 ++++++++++++++++++++++---------------------------
 4 files changed, 102 insertions(+), 81 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c84cf90ca693..33544dbed7bc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_INT128
 	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 69ed5f5e9f6e..67d8c8258fbd 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,4 +133,36 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return ret;
 }
 
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+	return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u32_shr */
+
+#else
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+	u32 ah, al;
+	u64 t1, t2;
+
+	al = a;
+	ah = a >> 32;
+
+	t1 = ((u64)al * mul) >> shift;
+	if (ah) {
+		t2 = ((u64)ah * mul) << (32 - shift);
+		t1 += t2;
+	}
+
+	return t1;
+}
+#endif /* mul_u64_u32_shr */
+
+#endif
+
 #endif /* _LINUX_MATH64_H */
diff --git a/init/Kconfig b/init/Kconfig
index 3fc8a2f2fac4..8654d2c94625 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -823,6 +823,12 @@ config GENERIC_SCHED_CLOCK
 config ARCH_SUPPORTS_NUMA_BALANCING
 	bool
 
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+	bool
+
 # For architectures that (ab)use NUMA to represent different memory regions
 # all cpu-local but of different latencies, such as SuperH.
 #
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8b652ebe027..ceee8421720f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,59 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-
-/*
- * delta *= weight / lw
- */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static void __update_inv_weight(struct load_weight *lw)
 {
-	u64 tmp;
+	unsigned long w;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
+	if (lw->inv_weight)
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
 	else
-		tmp = (u64)delta_exec;
+		lw->inv_weight = WMULT_CONST / w;
+}
+
+/* 
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * XXX mind got twisted, but I'm fairly sure shift will stay positive.
+ *
+ */
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+	int shift = WMULT_SHIFT;
+	u64 fact = scale_load_down(weight);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
+	__update_inv_weight(lw);
 
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	fact *= lw->inv_weight;
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
+
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +443,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +612,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +664,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +702,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-		      max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if ((s64)delta_exec < 0)
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max((u64)delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
@@ -3015,8 +2999,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3017,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3557,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-18 17:27   ` Peter Zijlstra
@ 2013-11-20 12:08     ` Stanislaw Gruszka
  2013-11-25  0:43       ` Christian Engelmayer
  2013-12-12  9:52     ` [tip:sched/urgent] sched/fair: " tip-bot for Peter Zijlstra
  1 sibling, 1 reply; 9+ messages in thread
From: Stanislaw Gruszka @ 2013-11-20 12:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christian Engelmayer, Ingo Molnar, linux-kernel, Thomas Gleixner,
	fweisbec, Paul Turner

On Mon, Nov 18, 2013 at 06:27:06PM +0100, Peter Zijlstra wrote:
> The below is a compile tested only attempt to deal with both these
> problems. Comments?

Just two nits as I don't understand vast of the patch.

> +	delta_exec = now - curr->exec_start;
> +	if ((s64)delta_exec < 0)
>  		return;

Check here should probably use <= and also unlikely() would be
reasonable. 

> +	schedstat_set(curr->statistics.exec_max,
> +		      max((u64)delta_exec, curr->statistics.exec_max));

(u64) cast not needed.

Stanislaw

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime()
  2013-11-20 12:08     ` Stanislaw Gruszka
@ 2013-11-25  0:43       ` Christian Engelmayer
  2013-11-26 14:44         ` [PATCH] sched, fair: Rework sched_fair time accounting Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Christian Engelmayer @ 2013-11-25  0:43 UTC (permalink / raw)
  To: Stanislaw Gruszka, Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Thomas Gleixner, fweisbec, Paul Turner

On Mon, 18 Nov 2013 18:27:06 +0100, Peter Zijlstra <peterz@infradead.org> wrote:
> That is not actually correct in the case time wraps.
> 
> There's a further problem with this code though -- ever since Frederic
> added NO_HZ_FULL a CPU can in fact aggregate a runtime delta larger than
> 4 seconds, due to running without a tick.
> 
> Therefore we need to be able to deal with u64 deltas.
> 
> The below is a compile tested only attempt to deal with both these
> problems. Comments?

I had this patch applied during daily use. No systematic testing, but no user
perceived regressions either. The originally reported divide by 0 scenario
could no longer be reproduced with this change.

> +/* 
> + * delta_exec * weight / lw.weight
> + *   OR
> + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
> + *
> + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
> + * we're guaranteed shift stays positive because inv_weight is guaranteed to
> + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
> + *
> + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
> + * XXX mind got twisted, but I'm fairly sure shift will stay positive.
> + *
> + */
> +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)

The patch itself seems comprehensible to me, although I have to admit that I
would have to read into the code more deeply in order to understand why the
changed __calc_delta() will always prove correct.

On Mon, 18 Nov 2013 15:19:56 +0100, Peter Zijlstra <peterz@infradead.org> wrote:
> I'm not sure what tool you used to generate that, but its broken, that's
> model 0x25 (37), it somehow truncates the upper model bits.

Correct, that was the fairly outdated cpuid (http://www.ka9q.net/code/cpuid)
currently shipped with Ubuntu 13.10. Debian already switched to packaging a
maintained version (http://www.etallen.com/cpuid.html).

> That said, its a westmere core and I've seen wsm-ep (dual socket)
> machines loose their TSC sync quite regularly, but this would be the
> first case a single socket wsm would loose its TSC sync.
>
> That leads me to believe your BIOS is screwing you over with SMIs or the
> like.

Having rechecked the running microcode as hinted by Henrique de Moraes Holschuh
off-list and running the Intel BIOS Implementation Test Suite (http://biosbits.org)
that seems to be an educated guess.

Regards,
Christian

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] sched, fair: Rework sched_fair time accounting
  2013-11-25  0:43       ` Christian Engelmayer
@ 2013-11-26 14:44         ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2013-11-26 14:44 UTC (permalink / raw)
  To: Christian Engelmayer
  Cc: Stanislaw Gruszka, Ingo Molnar, linux-kernel, Thomas Gleixner,
	fweisbec, Paul Turner, Andy Lutomirski

On Mon, Nov 25, 2013 at 01:43:02AM +0100, Christian Engelmayer wrote:
> I had this patch applied during daily use. No systematic testing, but no user
> perceived regressions either. The originally reported divide by 0 scenario
> could no longer be reproduced with this change.

Thanks! slightly updated version below.

I took some care to make sure the patch compiles into two 32x32->64
multiplications in the common case for 32bit.

Sadly GCC is too smart and completely fails to get the hint; the line:

 fact = (u64)(u32)fact * lw->inv_weight;

Is converted into a double imul+mul monster for -O2. The only way to get
the proper single mul is by forcing it into a noinline function, but
that adds call/stack overhead.

That said, given a sensible compiler (ha!) the common case for 32bit
should be equal or better than before.

---
Subject: sched, fair: Rework sched_fair time accounting
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 Nov 2013 18:27:06 +0100

Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards.

Most of our time accounting can actually handle that except the most
common one; the tick time update of sched_fair.

There is a further problem with that code; previously we assumed that
because we get a tick every TICK_NSEC our time delta could never
exceed 32bits and math was simpler.

However, ever since Frederic managed to get NO_HZ_FULL merged; this is
no longer the case since now a task can run for a long time indeed
without getting a tick. It only takes about ~4.2 seconds to overflow
our u32 in nanoseconds.

This means we not only need to better deal with time going backwards;
but also means we need to be able to deal with large deltas.

This patch reworks the entire code and introduces mul_u64_u32_shr() as
proposed by Andy a long while ago.

We express our virtual time scale factor in a u32 multiplier and shift
right and the 32bit mul_u64_u32_shr() implementation reduces to a
single 32x32->64 multiply if the time delta is still short (common
case).

For 64bit a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128; I
didn't want to write fallback code for now, but we could.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@redhat.com>
Reported-by: Christian Engelmayer <cengelma@gmx.at>
Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/Kconfig       |    1 
 include/linux/math64.h |   30 ++++++++++
 include/linux/sched.h  |    3 -
 init/Kconfig           |    6 ++
 kernel/sched/fair.c    |  144 +++++++++++++++++++++----------------------------
 5 files changed, 103 insertions(+), 81 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,7 @@ config X86
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_SUPPORTS_INT128 if X86_64
 	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select HAVE_IDE
 	select HAVE_OPROFILE
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 div
 	return ret;
 }
 
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+	return (u64)(((unsigned __int128)a * mul) >> shift);
+}
+#endif /* mul_u64_u32_shr */
+
+#else
+
+#ifndef mul_u64_u32_shr
+static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+{
+	u32 ah, al;
+	u64 ret;
+
+	al = a;
+	ah = a >> 32;
+
+	ret = ((u64)al * mul) >> shift;
+	if (ah)
+		ret += ((u64)ah * mul) << (32 - shift);
+
+	return ret;
+}
+#endif /* mul_u64_u32_shr */
+
+#endif
+
 #endif /* _LINUX_MATH64_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,7 +932,8 @@ struct pipe_inode_info;
 struct uts_namespace;
 
 struct load_weight {
-	unsigned long weight, inv_weight;
+	unsigned long weight;
+	u32 inv_weight;
 };
 
 struct sched_avg {
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK
 config ARCH_SUPPORTS_NUMA_BALANCING
 	bool
 
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+	bool
+
 # For architectures that (ab)use NUMA to represent different memory regions
 # all cpu-local but of different latencies, such as SuperH.
 #
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
-
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
+	__update_inv_weight(lw);
 
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
+
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **s
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cf
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct t
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-		      max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(str
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(stru
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip:sched/urgent] sched/fair: Rework sched_fair time accounting
  2013-11-18 17:27   ` Peter Zijlstra
  2013-11-20 12:08     ` Stanislaw Gruszka
@ 2013-12-12  9:52     ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Peter Zijlstra @ 2013-12-12  9:52 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, pjt, peterz, luto, akpm,
	tglx, cengelma, sgruszka

Commit-ID:  9dbdb155532395ba000c5d5d187658b0e17e529f
Gitweb:     http://git.kernel.org/tip/9dbdb155532395ba000c5d5d187658b0e17e529f
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Mon, 18 Nov 2013 18:27:06 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 11 Dec 2013 15:52:35 +0100

sched/fair: Rework sched_fair time accounting

Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This
results in him occasionally seeing time going backwards - which
crashes the scheduler ...

Most of our time accounting can actually handle that except the most
common one; the tick time update of sched_fair.

There is a further problem with that code; previously we assumed that
because we get a tick every TICK_NSEC our time delta could never
exceed 32bits and math was simpler.

However, ever since Frederic managed to get NO_HZ_FULL merged; this is
no longer the case since now a task can run for a long time indeed
without getting a tick. It only takes about ~4.2 seconds to overflow
our u32 in nanoseconds.

This means we not only need to better deal with time going backwards;
but also means we need to be able to deal with large deltas.

This patch reworks the entire code and uses mul_u64_u32_shr() as
proposed by Andy a long while ago.

We express our virtual time scale factor in a u32 multiplier and shift
right and the 32bit mul_u64_u32_shr() implementation reduces to a
single 32x32->64 multiply if the time delta is still short (common
case).

For 64bit a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128.

Reported-and-Tested-by: Christian Engelmayer <cengelma@gmx.at>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: fweisbec@gmail.com
Cc: Paul Turner <pjt@google.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +-
 kernel/sched/fair.c   | 144 ++++++++++++++++++++++----------------------------
 2 files changed, 66 insertions(+), 81 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96d674b..53f97eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -930,7 +930,8 @@ struct pipe_inode_info;
 struct uts_namespace;
 
 struct load_weight {
-	unsigned long weight, inv_weight;
+	unsigned long weight;
+	u32 inv_weight;
 };
 
 struct sched_avg {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd773ad..9030da7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
 	update_sysctl();
 }
 
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
+#define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+	unsigned long w;
+
+	if (likely(lw->inv_weight))
+		return;
+
+	w = scale_load_down(lw->weight);
+
+	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+		lw->inv_weight = 1;
+	else if (unlikely(!w))
+		lw->inv_weight = WMULT_CONST;
+	else
+		lw->inv_weight = WMULT_CONST / w;
+}
 
 /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
  */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
-	u64 tmp;
+	u64 fact = scale_load_down(weight);
+	int shift = WMULT_SHIFT;
 
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
+	__update_inv_weight(lw);
 
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
+	if (unlikely(fact >> 32)) {
+		while (fact >> 32) {
+			fact >>= 1;
+			shift--;
+		}
 	}
 
-	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
-	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	/* hint to use a 32x32->64 mul */
+	fact = (u64)(u32)fact * lw->inv_weight;
+
+	while (fact >> 32) {
+		fact >>= 1;
+		shift--;
+	}
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
 
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 /*
  * delta /= w
  */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
 		}
-		slice = calc_delta_mine(slice, se->load.weight, load);
+		slice = __calc_delta(slice, se->load.weight, load);
 	}
 	return slice;
 }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
 #endif
 
 /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
  */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-	      unsigned long delta_exec)
-{
-	unsigned long delta_exec_weighted;
-
-	schedstat_set(curr->statistics.exec_max,
-		      max((u64)delta_exec, curr->statistics.exec_max));
-
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-	curr->vruntime += delta_exec_weighted;
-	update_min_vruntime(cfs_rq);
-}
-
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
-	unsigned long delta_exec;
+	u64 delta_exec;
 
 	if (unlikely(!curr))
 		return;
 
-	/*
-	 * Get the amount of time the current task was running
-	 * since the last time we changed load (this cannot
-	 * overflow on 32 bits):
-	 */
-	delta_exec = (unsigned long)(now - curr->exec_start);
-	if (!delta_exec)
+	delta_exec = now - curr->exec_start;
+	if (unlikely((s64)delta_exec <= 0))
 		return;
 
-	__update_curr(cfs_rq, curr, delta_exec);
 	curr->exec_start = now;
 
+	schedstat_set(curr->statistics.exec_max,
+		      max(delta_exec, curr->statistics.exec_max));
+
+	curr->sum_exec_runtime += delta_exec;
+	schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_min_vruntime(cfs_rq);
+
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	}
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
 		return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 	return rq_clock_task(rq_of(cfs_rq));
 }
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2013-12-12  9:52 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-16 21:37 [PROBLEM] possible divide by 0 in kernel/sched/cputime.c scale_stime() Christian Engelmayer
2013-11-18 14:02 ` Stanislaw Gruszka
2013-11-18 14:19   ` Peter Zijlstra
2013-11-18 15:39     ` Ingo Molnar
2013-11-18 17:27   ` Peter Zijlstra
2013-11-20 12:08     ` Stanislaw Gruszka
2013-11-25  0:43       ` Christian Engelmayer
2013-11-26 14:44         ` [PATCH] sched, fair: Rework sched_fair time accounting Peter Zijlstra
2013-12-12  9:52     ` [tip:sched/urgent] sched/fair: " tip-bot for Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.