From: Thomas Gleixner

Preeti reported a cpu down race with hrtimer based broadcasting:

Assume CPU1 is the CPU which holds the hrtimer broadcasting duty
before it is taken down.

CPU0					CPU1
cpu_down()				takedown_cpu()
					disable_interrupts()
cpu_die()
  while (CPU1 != DEAD) {
	msleep(100);
	switch_to_idle()
	stop_cpu_timer()
	schedule_broadcast()
  }

tick_cleanup_dead_cpu()
  take_over_broadcast()

So after CPU1 disabled interrupts it cannot handle the broadcast
hrtimer anymore, so CPU0 will be stuck forever.

Doing a "while (CPU1 != DEAD) msleep(100);" periodic poll is silly at
best, but we need to fix that nevertheless.

Split the tick cleanup into two pieces:

1) Shut down and remove all per cpu clockevent devices from
   takedown_cpu()

   This is done carefully with respect to existing arch code which
   works around the shortcomings of the clockevents core code in
   interesting ways. We really want a separate callback for this to
   clean up the workarounds, but that's not in the scope of this
   patch.

2) Take over the broadcast duty explicitly before calling cpu_die()

   This is a temporary workaround as well. What we really want is a
   callback in the clockevent device which allows us to do that from
   the dying CPU by pushing the hrtimer onto a different cpu. That
   might involve an IPI and is definitely more complex than this
   immediate fix.

Reported-by: Preeti U Murthy
Signed-off-by: Thomas Gleixner
---
 include/linux/tick.h         |  9 +++++----
 kernel/cpu.c                 |  6 +++---
 kernel/time/clockevents.c    | 30 ++++++++++++++++++------------
 kernel/time/tick-broadcast.c | 32 ++++++++++++++++++++++----------
 kernel/time/tick-common.c    | 34 ++++++++++++----------------------
 kernel/time/tick-internal.h  |  6 +++---
 6 files changed, 63 insertions(+), 54 deletions(-)

Index: linux/include/linux/tick.h
===================================================================
--- linux.orig/include/linux/tick.h
+++ linux/include/linux/tick.h
@@ -29,13 +29,12 @@ extern struct tick_device *tick_get_devi
 extern void __init tick_init(void);
 /* Should be core only, but XEN resume magic requires this */
 extern void tick_resume_local(void);
-extern void tick_handover_do_timer(void);
-extern void tick_cleanup_dead_cpu(int cpu);
+/* CPU hotplug */
+extern void tick_shutdown_local(void);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
 static inline void tick_resume_local(void) { }
-static inline void tick_handover_do_timer(void) { }
-static inline void tick_cleanup_dead_cpu(int cpu) { }
+static inline void tick_shutdown_local(void) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 #ifdef CONFIG_TICK_ONESHOT
@@ -66,8 +65,10 @@ static inline void tick_broadcast_contro
 
 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+extern void tick_takeover(int deadcpu);
 #else
 static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
+static inline void tick_takeover(int deadcpu) { }
 #endif
 
 static inline void tick_broadcast_enable(void)
Index: linux/kernel/cpu.c
===================================================================
--- linux.orig/kernel/cpu.c
+++ linux/kernel/cpu.c
@@ -349,8 +349,8 @@ static int __ref take_cpu_down(void *_pa
 		return err;
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
-	/* Give up timekeeping duties */
-	tick_handover_do_timer();
+	/* Shutdown the per cpu tick */
+	tick_shutdown_local();
 	/* Park the stopper thread */
 	kthread_park(current);
 	return 0;
 }
@@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone. Too late to complain. */
-	tick_cleanup_dead_cpu(cpu);
+	tick_takeover(cpu);
 
 	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 	check_for_tasks(cpu);
Index: linux/kernel/time/clockevents.c
===================================================================
--- linux.orig/kernel/time/clockevents.c
+++ linux/kernel/time/clockevents.c
@@ -541,26 +541,32 @@ void clockevents_resume(void)
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+/*
+ * Cleanup the clock events devices on the dying cpu. curdev is the
+ * currently installed tick device on that cpu.
  */
-void tick_cleanup_dead_cpu(int cpu)
+void clockevents_cleanup_dying_cpu(struct clock_event_device *curdev)
 {
 	struct clock_event_device *dev, *tmp;
 	unsigned long flags;
+	int cpu;
 
 	raw_spin_lock_irqsave(&clockevents_lock, flags);
-
-	tick_shutdown(cpu);
-	/*
-	 * Unregister the clock event devices which were
-	 * released from the users in the notify chain.
-	 */
-	list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
-		list_del(&dev->list);
+	if (!curdev)
+		goto cleanup;
 	/*
-	 * Now check whether the CPU has left unused per cpu devices
+	 * We cannot call the set mode function here at the moment
+	 * because existing architecture cpu down code shuts down
+	 * stuff already and we cannot interfere with that. So we just
+	 * set the mode to unused for now.
 	 */
+	curdev->mode = CLOCK_EVT_MODE_UNUSED;
+	list_del(&curdev->list);
+	module_put(curdev->owner);
+
+cleanup:
+	/* Remove the unused percpu devices from the list */
+	cpu = smp_processor_id();
 	list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
 		if (cpumask_test_cpu(cpu, dev->cpumask) &&
 		    cpumask_weight(dev->cpumask) == 1 &&
Index: linux/kernel/time/tick-broadcast.c
===================================================================
--- linux.orig/kernel/time/tick-broadcast.c
+++ linux/kernel/time/tick-broadcast.c
@@ -421,15 +421,17 @@ void tick_set_periodic_handler(struct cl
 
 #ifdef CONFIG_HOTPLUG_CPU
 /*
- * Remove a CPU from broadcasting
+ * Remove a CPU from broadcasting. Called from the dying cpu.
  */
-void tick_shutdown_broadcast(unsigned int cpu)
+void tick_shutdown_broadcast(void)
 {
 	struct clock_event_device *bc;
 	unsigned long flags;
+	int cpu;
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
+	cpu = smp_processor_id();
 	cpumask_clear_cpu(cpu, tick_broadcast_mask);
 	cpumask_clear_cpu(cpu, tick_broadcast_on);
 
@@ -906,14 +908,26 @@ void tick_broadcast_switch_to_oneshot(vo
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void broadcast_move_bc(int deadcpu)
+/*
+ * Called from the cpu hotplug code after a cpu is dead. This ensures
+ * that a hrtimer based broadcast device is taken over.
+ *
+ * FIXME: This should go away. We should replace this by a mechanism
+ * which pushes the hrtimer over to a different cpu from
+ * tick_shutdown_broadcast_oneshot()
+ */
+void tick_broadcast_takeover_bc(int deadcpu)
 {
-	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+	struct clock_event_device *bc;
+	unsigned long flags;
 
-	if (!bc || !broadcast_needs_cpu(bc, deadcpu))
-		return;
-	/* This moves the broadcast assignment to this cpu */
-	clockevents_program_event(bc, bc->next_event, 1);
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+	bc = tick_broadcast_device.evtdev;
+	if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+		/* This moves the broadcast assignment to this cpu */
+		clockevents_program_event(bc, bc->next_event, 1);
+	}
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
 /*
@@ -929,8 +943,6 @@ static void tick_shutdown_broadcast_ones
 	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
 	cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
-	broadcast_move_bc(cpu);
 }
 #endif
 
Index: linux/kernel/time/tick-common.c
===================================================================
--- linux.orig/kernel/time/tick-common.c
+++ linux/kernel/time/tick-common.c
@@ -336,10 +336,10 @@ out_bc:
 /*
  * Transfer the do_timer job away from a dying cpu.
  *
- * Called with interrupts disabled. Not locking required. If
- * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ * No locking required. If tick_do_timer_cpu is owned by this cpu,
+ * nothing can change it.
  */
-void tick_handover_do_timer(void)
+static void tick_handover_do_timer(void)
 {
 	if (tick_do_timer_cpu == smp_processor_id()) {
 		int cpu = cpumask_first(cpu_online_mask);
@@ -349,32 +349,22 @@ void tick_handover_do_timer(void)
 	}
 }
 
-/*
- * Shutdown an event device on a given cpu:
+/**
+ * tick_shutdown_local - Shutdown the tick related functions on a cpu
  *
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * This is called from the dying cpu.
  */
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown_local(void)
 {
-	struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
-	struct clock_event_device *dev = td->evtdev;
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 
 	/* Remove the CPU from the broadcast machinery */
-	tick_shutdown_broadcast(cpu);
+	tick_shutdown_broadcast();
+	clockevents_cleanup_dying_cpu(td->evtdev);
 	td->mode = TICKDEV_MODE_PERIODIC;
-	if (dev) {
-		/*
-		 * Prevent that the clock events layer tries to call
-		 * the set mode function!
-		 */
-		dev->mode = CLOCK_EVT_MODE_UNUSED;
-		clockevents_exchange_device(dev, NULL);
-		dev->event_handler = clockevents_handle_noop;
-		td->evtdev = NULL;
-	}
+
+	tick_handover_do_timer();
 }
 
 #endif
 
Index: linux/kernel/time/tick-internal.h
===================================================================
--- linux.orig/kernel/time/tick-internal.h
+++ linux/kernel/time/tick-internal.h
@@ -20,7 +20,6 @@ extern int tick_do_timer_cpu __read_most
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_shutdown(unsigned int cpu);
 extern void tick_suspend(void);
 extern void tick_resume(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -38,6 +37,7 @@ extern void clockevents_shutdown(struct
 extern void clockevents_exchange_device(struct clock_event_device *old,
 					struct clock_event_device *new);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern void clockevents_cleanup_dying_cpu(struct clock_event_device *dev);
 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern void clockevents_suspend(void);
 extern void clockevents_resume(void);
@@ -82,7 +82,7 @@ static inline int tick_check_oneshot_cha
 extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
 extern void tick_install_broadcast_device(struct clock_event_device *dev);
 extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_shutdown_broadcast(void);
 extern void tick_suspend_broadcast(void);
 extern void tick_resume_broadcast(void);
 extern bool tick_resume_check_broadcast(void);
@@ -96,7 +96,7 @@ static inline void tick_install_broadcas
 static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
 static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
 static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_shutdown_broadcast(void) { }
 static inline void tick_suspend_broadcast(void) { }
 static inline void tick_resume_broadcast(void) { }
 static inline bool tick_resume_check_broadcast(void) { return false; }
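
As a footnote for readers who want to poke at the ordering problem
outside the kernel, the changelog diagram above can be modeled with a
few lines of user space C. This is only a sketch: the flags, the
thread body and the takeover step below are invented stand-ins for
the hotplug machinery, not kernel interfaces, and the model merely
reports the deadlock window instead of actually sleeping in it.

/* cc -pthread tick_race_sketch.c -o tick_race_sketch */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int cpu1_irqs_on = 1;	/* can CPU1 still fire the broadcast hrtimer? */
static atomic_int cpu1_dead = 0;
static atomic_int bc_owner = 1;		/* which CPU owns the broadcast duty */

/* Model of CPU1 in takedown_cpu(): interrupts go off well before death */
static void *cpu1_takedown(void *unused)
{
	(void)unused;
	atomic_store(&cpu1_irqs_on, 0);
	usleep(10 * 1000);
	atomic_store(&cpu1_dead, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, cpu1_takedown, NULL);

	/* Model of CPU0 in cpu_die(): while (CPU1 != DEAD) msleep(100); */
	while (!atomic_load(&cpu1_dead)) {
		/*
		 * Waking from the msleep() needs a timer. If the
		 * broadcast duty still sits on CPU1, which no longer
		 * takes interrupts, the wakeup never arrives.
		 */
		if (atomic_load(&bc_owner) == 1 && !atomic_load(&cpu1_irqs_on)) {
			printf("deadlock window: bc owner is CPU1 with irqs off\n");
			atomic_store(&bc_owner, 0);	/* ~ explicit takeover */
		}
		usleep(1000);	/* stands in for msleep(100) */
	}
	pthread_join(t, NULL);
	printf("CPU1 dead, broadcast duty on CPU%d\n", atomic_load(&bc_owner));
	return 0;
}

The printf marks the window in which the real CPU0 would have gone to
sleep with nobody left to wake it; the atomic_store standing in for
the takeover is the step this patch makes explicit instead of leaving
it to the too-late cleanup after the death poll.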