On kernel 4.14.86 we had a kernel crash in the fair scheduler. Kernel 4.14.86 is from 4.14 LTS series, and current 4.14 has no change in the affected code path. Additionally we compared v5.4-rc2 source, there are changes but these wouldn't affect the result and lead to the same crash as a similar execution path would be taken. So we concentrate on the 4.14.86 source here. The callstack of the crash: [28439.656848] BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 [28439.665662] IP: set_next_entity+0x17/0x490 [28439.670244] PGD 0 P4D 0 [28439.673096] Oops: 0000 [#1] PREEMPT SMP NOPTI [28439.677972] Modules linked in: bcmdhd(O) sr_mod cdrom tntfs(PO) texfat(PO) usb_storage configfs squashfs zlib_inflate xz_dec ah4 xfrm4_mode_transport ebt_ip ebtable_filter ebtables xfrm_user xfrm_algo cls_u32 sch_htb intel_tfm_governor ecryptfs cfq_iosched sd_mod snd_soc_apl_mgu_hu intel_ipu4_psys intel_ipu4_psys_csslib intel_xhci_usb_role_switch dwc3 roles udc_core adv728x snd_soc_skl sdw_cnl snd_soc_acpi_intel_match coretemp snd_soc_acpi sbi_apl intel_ipu4_isys snd_soc_core videobuf2_dma_contig snd_compress videobuf2_memops ipu4_acpi i2c_i801 intel_ipu4_isys_csslib videobuf2_v4l2 snd_soc_skl_ipc videobuf2_core ahci libahci sdw_bus xhci_pci libata xhci_hcd crc8 snd_soc_sst_ipc cfg80211 snd_soc_sst_dsp scsi_mod usbcore snd_hda_ext_core rfkill usb_common dwc3_pci snd_hda_core intel_ipu4_mmu snd_pcm [28439.757185] mei_me snd_timer intel_ipu4 mei snd iova soundcore nfsd auth_rpcgss lockd grace sunrpc loop fuse 8021q bridge stp llc inap560t(O) i915 video backlight intel_gtt i2c_algo_bit drm_kms_helper igb_avb(O) drm firmware_class ptp hwmon spi_pxa2xx_platform pps_core [last unloaded: bcmdhd] [28439.786334] CPU: 3 PID: 0 Comm: swapper/3 Tainted: P U O 4.14.86-apl #1 [28439.794807] task: ffff99a97895a580 task.stack: ffffb21000078000 [28439.801430] RIP: 0010:set_next_entity+0x17/0x490 [28439.806597] RSP: 0018:ffffb2100007bdb0 EFLAGS: 00010082 [28439.812444] RAX: 
0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [28439.820428] RDX: 00000000000003e1 RSI: 0000000000000000 RDI: ffff99a96dda9800 [28439.828408] RBP: ffffb2100007bdf0 R08: 0000000000000000 R09: 0000000000000000 [28439.836395] R10: 0000000000000000 R11: 0000042df10058da R12: ffff99a97895a580 [28439.844383] R13: ffff99a96dda9800 R14: 0000000000000000 R15: ffff99a96dda9800 [28439.852371] FS: 0000000000000000(0000) GS:ffff99a97fd80000(0000) knlGS:0000000000000000 [28439.861429] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [28439.867864] CR2: 0000000000000038 CR3: 00000001f7a64000 CR4: 00000000003406a0 [28439.875852] Call Trace: [28439.878590] pick_next_task_fair+0x517/0x560 [28439.883369] __schedule+0xfa/0x7b0 [28439.887172] schedule_idle+0x1d/0x40 [28439.891170] do_idle+0x134/0x1a0 [28439.894779] cpu_startup_entry+0x6e/0x70 [28439.899168] start_secondary+0x1a7/0x1f0 [28439.903555] secondary_startup_64+0xa5/0xb0 [28439.908233] Code: 70 44 29 f6 41 89 d5 e8 58 f9 ff ff e9 11 ff ff ff 0f 1f 00 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 89 f3 48 83 ec 18 <8b> 4e 38 85 c9 75 3d 4d 8b a5 d0 00 00 00 41 83 bc 24 f8 08 00 [28439.929423] RIP: set_next_entity+0x17/0x490 RSP: ffffb2100007bdb0 [28439.936238] CR2: 0000000000000038 A dump of cfs_rq which is used here (a bit compressed to ease reading): crash> * cfs_rq ffff99a96dda9800 struct cfs_rq { load = { weight = 1048576, inv_weight = 0 }, nr_running = 1, h_nr_running = 1, exec_clock = 0, min_vruntime = 190894920101, tasks_timeline = { rb_root = { rb_node = 0xffff99a9502e0d10 }, rb_leftmost = 0x0 }, curr = 0x0, next = 0x0, last = 0x0, skip = 0x0, nr_spread_over = 0, avg = { last_update_time = 28439656814592, load_sum = 7948862, util_sum = 1838088, period_contrib = 278, load_avg = 167, util_avg = 37 }, runnable_load_sum = 5920382, runnable_load_avg = 124, tg_load_avg_contrib = 167, propagate_avg = 0, removed_load_avg = { counter = 0 }, removed_util_avg = { counter = 0 }, h_load = 26, last_h_load_update = 
4323103347, h_load_next = 0x0, rq = 0xffff99a97fd9f500, on_list = 1, leaf_cfs_rq_list = { next = 0xffff99a9782ad4e0, prev = 0xffff99a9779bece0 }, tg = 0xffff99a977b35d40, runtime_enabled = 0, runtime_expires = 0, runtime_remaining = 0, throttled_clock = 0, throttled_clock_task = 0, throttled_clock_task_time = 0, throttled = 0, throttle_count = 0, throttled_list = { next = 0xffff99a96dda9930, prev = 0xffff99a96dda9930 } } and the complete stack frame of the crash (with some comments added, removed the crash related TOS): crash> bt -FF PID: 0 TASK: ffff99a97895a580 CPU: 3 COMMAND: "swapper/3" #0 [ffffb2100007ba20] machine_kexec at ffffffffaa035a5f --- snip --- #9 [ffffb2100007bd00] page_fault at ffffffffaa601215 [exception RIP: set_next_entity+23] RIP: ffffffffaa0ad2e7 RSP: ffffb2100007bdb0 RFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 00000000000003e1 RSI: 0000000000000000 RDI: ffff99a96dda9800 RBP: ffffb2100007bdf0 R8: 0000000000000000 R9: 0000000000000000 R10: 0000000000000000 R11: 0000042df10058da R12: ffff99a97895a580 R13: ffff99a96dda9800 R14: 0000000000000000 R15: ffff99a96dda9800 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 ffffb2100007bd08: [ffff99a96dda9800:kmalloc-512] 0000000000000000 ffffb2100007bd18: [ffff99a96dda9800:kmalloc-512] [ffff99a97895a580:task_struct] ffffb2100007bd28: ffffb2100007bdf0 0000000000000000 ffffb2100007bd38: 0000042df10058da 0000000000000000 ffffb2100007bd48: 0000000000000000 0000000000000000 ffffb2100007bd58: 0000000000000000 0000000000000000 ffffb2100007bd68: 00000000000003e1 0000000000000000 ffffb2100007bd78: [ffff99a96dda9800:kmalloc-512] ffffffffffffffff ffffb2100007bd88: set_next_entity+23 0000000000000010 ffffb2100007bd98: 0000000000010082 ffffb2100007bdb0 ffffb2100007bda8: 0000000000000018 0000000000000002 ffffb2100007bdb8: 0000000000000000 0000000000000000 ffffb2100007bdc8: ffff99a97fd9f500 [ffff99a97895a580:task_struct] ffffb2100007bdd8: [ffff99a97895aab8:task_struct] 
0000000000000000 ffffb2100007bde8: [ffff99a96dda9800:kmalloc-512] ffffb2100007be60:rbp ffffb2100007bdf8: pick_next_task_fair+1303 #10 [ffffb2100007bdf8] pick_next_task_fair at ffffffffaa0b5457 ffffb2100007be00: ffffb2100007be10 0000000000000018 ffffb2100007be10: ffff99a97fd9f560 QWORD PTR gs:0x28 ffffb2100007be80 ffffb2100007be20: [ffff99a97895a580:task_struct] saved param *prev ffffb2100007be48 ffffb2100007be30: f048ce7809464d00 ffff99a97fd9f500=rbx ffffb2100007be40: [ffff99a97895a580:task_struct]=r12 [ffff99a97895aab8:task_struct]=r13 ffffb2100007be50: 0000000000000000=r14 0000000000000000=r15 ffffb2100007be60: ffffb2100007bec0=rbp __schedule+250 The failing piece of code is in kernel/sched/fair.c (Line 3873, Kernel 4.14.86; Line 4153 Kernel v5.4-rc2), because se is NULL: static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* 'current' is not kept within the tree. */ if (se->on_rq) { <<<<<<< crash here set_next_entity is called from within pick_next_task_fair, from the following piece of code: static struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; again: if (!cfs_rq->nr_running) // this is 1, so we are not going to idle goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED if (prev->sched_class != &fair_sched_class) <<<<< this is true: crash> p &fair_sched_class $1 = (const struct sched_class *) 0xffffffffaaa10cc0 <<<< crash> $prev=ffff99a97895a580 crash> gdb set $prev=(struct task_struct *)0xffff99a97895a580 crash> p $prev->sched_class $2 = (const struct sched_class *) 0xffffffffaaa10b40 <<<< goto simple; <<<< so we go to simple .... 
(Line 6360, Kernel 4.14.86; Line 6820 Kernel v5.4-rc2) simple: #endif put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); <<<< this returns se=NULL set_next_entity(cfs_rq, se); <<<<<<<< here we crash cfs_rq = group_cfs_rq(se); } while (cfs_rq); So why is se = NULL returned? Looking at pick_next_entity and cfs_rq fed into it together with curr = NULL: (Line 3915, Kernel 4.14.86; Line 4196 Kernel v5.4-rc2) /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups * 2) pick the "next" process, since someone really wants that to run * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { struct sched_entity *left = __pick_first_entity(cfs_rq); <<<< returns NULL struct sched_entity *se; &cfs_rq->tasks_timeline->rb_leftmost tasks_timeline = { rb_root = { rb_node = 0xffff99a9502e0d10 }, rb_leftmost = 0x0 }, include/linux/rbtree.h:91:#define rb_first_cached(root) (root)->rb_leftmost struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; <<<<<<<<<< the case return rb_entry(left, struct sched_entity, run_node); } Back in pick_next_entity, we now have curr = NULL left = NULL /* * If curr is set we have to see if its left of the leftmost entity * still in the tree, provided there was anything in the tree at all. */ if (!left <<< this is the case here >>>> || (curr && entity_before(curr, left))) left = curr; <<<< so we still get left = curr = NULL se = left; /* ideally we run the leftmost entity */ <<<< but we now have se = NULL ???? /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. 
*/ if (cfs_rq->skip == se) { <<<< that's equal here, se = NULL = 0x0 = cfs_rq->skip struct sched_entity *second; if (se == curr) { <<<< and this too, NULL = NULL second = __pick_first_entity(cfs_rq); <<<< so we get again NULL like we did for *left } else { second = __pick_next_entity(se); if (!second || (curr && entity_before(curr, second))) second = curr; } if (second && wakeup_preempt_entity(second, left) < 1) <<<< as second is also NULL nothing is done here, se unchanged se = second; } /* * Prefer last buddy, try to return the CPU to a preempted task. */ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) <<<< cfs_rq->last = 0x0, not true, se unchanged se = cfs_rq->last; /* * Someone really wants this to run. If it's not unfair, run it. */ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) <<<< cfs_rq->next = 0x0, not true, se unchanged se = cfs_rq->next; <<<< we reach this point with se = NULL >>>> clear_buddies(cfs_rq, se); <<<< this doesn't change anything in se return se; <<<< so we get se=NULL as a result. } Is this a corner case nobody thought of or do we have cfs_rq data that is unexpected in its content?