* [PATCH v3 0/1] sched: watchdog: Touch kernel watchdog with sched count
@ 2020-10-24 22:57 Xi Wang
  2020-10-24 22:57 ` [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count Xi Wang
  0 siblings, 1 reply; 4+ messages in thread
From: Xi Wang @ 2020-10-24 22:57 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: Thomas Gleixner, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Josh Don, linux-kernel

Instead of periodically resetting watchdogs from thread context,
this patch simply forces resched and checks rq->sched_count.
Watchdog is reset if the sched count increases. If the same thread
is picked by pick_next_task during resched, there is no context
switch.

With the new method we lose coverage on: a migration/n thread
actually gets picked and we actually context switch to the
migration/n thread. These steps are unlikely to silently fail.
The change would provide nearly the same level of protection with
less latency / jitter.

v3:
- Removed the old method and boot option
- Still need to check resched

v2:
- Use sched_count instead of having sched calling into watchdog code
- Remove the sysctl and add a boot option, which can be removed later
- Changed the subject line

Xi Wang (1):
  sched: watchdog: Touch kernel watchdog with sched count

 include/linux/sched.h |  4 ++++
 kernel/sched/core.c   | 23 +++++++++++++++++++--
 kernel/sched/sched.h  |  6 +++++-
 kernel/watchdog.c     | 47 +++++++++++++------------------------
 4 files changed, 44 insertions(+), 36 deletions(-)

-- 
2.29.0.rc2.309.g374f81d7ae-goog

^ permalink raw reply	[flat|nested] 4+ messages in thread
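The reset condition this series relies on is an unsigned counter delta rather than an equality or ordering test, so it stays correct when rq->sched_count wraps around. Below is a minimal userspace sketch of that property; it is plain C rather than kernel code, and while the names mirror the patch, the helpers are illustrative stand-ins:

#include <stdio.h>

/* Stand-ins for rq->sched_count and the per-CPU snapshot taken by
 * __touch_watchdog(). */
static unsigned int sched_count;
static unsigned int watchdog_sched_prev;

/* Nonzero iff the count advanced since the last touch; unsigned
 * subtraction keeps the test correct across u32 wraparound. */
static int watchdog_saw_progress(void)
{
	return sched_count - watchdog_sched_prev != 0;
}

static void touch_watchdog(void)
{
	watchdog_sched_prev = sched_count;
}

int main(void)
{
	sched_count = 0xffffffffu;	/* one increment from wrapping */
	touch_watchdog();
	printf("progress: %d\n", watchdog_saw_progress());	/* prints 0 */
	sched_count++;			/* wraps to 0 */
	printf("progress: %d\n", watchdog_saw_progress());	/* prints 1 */
	return 0;
}

The delta only reads as "no progress" if exactly 2^32 passes through __schedule() happen between two samples, which a sample period of a few seconds rules out in practice.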
* [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count
  2020-10-24 22:57 [PATCH v3 0/1] sched: watchdog: Touch kernel watchdog with sched count Xi Wang
@ 2020-10-24 22:57 ` Xi Wang
  2020-10-26  8:32   ` Vincent Guittot
  0 siblings, 1 reply; 4+ messages in thread
From: Xi Wang @ 2020-10-24 22:57 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: Thomas Gleixner, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Josh Don, linux-kernel

Instead of periodically resetting watchdogs from thread context,
this patch simply forces resched and checks rq->sched_count.
Watchdog is reset if the sched count increases. If the same thread
is picked by pick_next_task during resched, there is no context
switch.

With the new method we lose coverage on: a migration/n thread
actually gets picked and we actually context switch to the
migration/n thread. These steps are unlikely to silently fail.
The change would provide nearly the same level of protection with
less latency / jitter.

Suggested-by: Paul Turner <pjt@google.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Xi Wang <xii@google.com>
---
 include/linux/sched.h |  4 ++++
 kernel/sched/core.c   | 23 +++++++++++++++++++--
 kernel/sched/sched.h  |  6 +++++-
 kernel/watchdog.c     | 47 +++++++++++++------------------------
 4 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d383cf09e78f..1e3bef9a9cdb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1662,6 +1662,10 @@ extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
 
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+extern unsigned int sched_get_count(int cpu);
+#endif
+
 /**
  * is_idle_task - is the specified task an idle task?
  * @p: the task in question.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8160ab5263f8..378f0f36c402 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4293,8 +4293,6 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 	rcu_sleep_check();
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-
-	schedstat_inc(this_rq()->sched_count);
 }
 
 static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
@@ -4492,6 +4490,12 @@ static void __sched notrace __schedule(bool preempt)
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+	this_rq()->sched_count++; /* sched count is also used by watchdog */
+#else
+	schedstat_inc(this_rq()->sched_count);
+#endif
+
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		/*
@@ -5117,6 +5121,21 @@ struct task_struct *idle_task(int cpu)
 	return cpu_rq(cpu)->idle;
 }
 
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+
+/**
+ * sched_get_count - get the sched count of a CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: sched count.
+ */
+unsigned int sched_get_count(int cpu)
+{
+	return cpu_rq(cpu)->sched_count;
+}
+
+#endif
+
 /**
  * find_process_by_pid - find a process with a matching PID value.
  * @pid: the pid in question.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..f23255981d52 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -959,6 +959,11 @@ struct rq {
 	u64 clock_pelt;
 	unsigned long lost_idle_time;
 
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_SOFTLOCKUP_DETECTOR)
+	/* Also used by watchdog - no longer grouping with other sched stats */
+	unsigned int sched_count;
+#endif
+
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_MEMBARRIER
@@ -1036,7 +1041,6 @@ struct rq {
 	unsigned int yld_count;
 
 	/* schedule() stats */
-	unsigned int sched_count;
 	unsigned int sched_goidle;
 
 	/* try_to_wake_up() stats */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b22ad13..22f87aded95a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -170,6 +170,7 @@ static bool softlockup_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(unsigned int, watchdog_sched_prev);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -239,6 +240,7 @@ static void set_sample_period(void)
 static void __touch_watchdog(void)
 {
 	__this_cpu_write(watchdog_touch_ts, get_timestamp());
+	__this_cpu_write(watchdog_sched_prev, sched_get_count(smp_processor_id()));
 }
 
 /**
@@ -318,25 +320,6 @@ static void watchdog_interrupt_count(void)
 	__this_cpu_inc(hrtimer_interrupts);
 }
 
-static DEFINE_PER_CPU(struct completion, softlockup_completion);
-static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
-
-/*
- * The watchdog thread function - touches the timestamp.
- *
- * It only runs once every sample_period seconds (4 seconds by
- * default) to reset the softlockup timestamp. If this gets delayed
- * for more than 2*watchdog_thresh seconds then the debug-printout
- * triggers in watchdog_timer_fn().
- */
-static int softlockup_fn(void *data)
-{
-	__touch_watchdog();
-	complete(this_cpu_ptr(&softlockup_completion));
-
-	return 0;
-}
-
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -351,15 +334,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
 
-	/* kick the softlockup detector */
-	if (completion_done(this_cpu_ptr(&softlockup_completion))) {
-		reinit_completion(this_cpu_ptr(&softlockup_completion));
-		stop_one_cpu_nowait(smp_processor_id(),
-				softlockup_fn, NULL,
-				this_cpu_ptr(&softlockup_stop_work));
-	}
-
-	/* .. and repeat */
 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
 
 	if (touch_ts == SOFTLOCKUP_RESET) {
@@ -378,6 +352,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		return HRTIMER_RESTART;
 	}
 
+	/* Trigger reschedule for the next round */
+	set_tsk_need_resched(current);
+	set_preempt_need_resched();
+
+	/* sched_count increase in __schedule is taken as watchdog touched */
+	if (sched_get_count(smp_processor_id()) -
+	    __this_cpu_read(watchdog_sched_prev)) {
+		__touch_watchdog();
+		__this_cpu_write(soft_watchdog_warn, false);
+		return HRTIMER_RESTART;
+	}
+
 	/* check for a softlockup
 	 * This is done by making sure a high priority task is
 	 * being scheduled.  The task touches the watchdog to
@@ -443,13 +429,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 static void watchdog_enable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
-	struct completion *done = this_cpu_ptr(&softlockup_completion);
 
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
-	init_completion(done);
-	complete(done);
-
 	/*
 	 * Start the timer first to prevent the NMI watchdog triggering
 	 * before the timer has a chance to fire.
@@ -479,7 +461,6 @@ static void watchdog_disable(unsigned int cpu)
 	 */
 	watchdog_nmi_disable(cpu);
 	hrtimer_cancel(hrtimer);
-	wait_for_completion(this_cpu_ptr(&softlockup_completion));
 }
 
 static int softlockup_stop_fn(void *data)
-- 
2.29.0.rc2.309.g374f81d7ae-goog

^ permalink raw reply related	[flat|nested] 4+ messages in thread
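The pieces the patch leaves in watchdog_timer_fn() combine into a simple loop: force a resched, and on the next sample treat an advanced sched count as a touch. Here is a toy single-threaded model of that control flow, an illustration under simplified assumptions (integer-second timestamps, a scripted CPU), not the kernel implementation:

#include <stdio.h>

#define SAMPLE_PERIOD	4	/* seconds; the kernel default */
#define THRESHOLD	20	/* stand-in for the softlockup threshold */

static unsigned int sched_count;	/* what "__schedule()" increments */
static unsigned int sched_prev;		/* models watchdog_sched_prev */
static int touch_ts;			/* models watchdog_touch_ts */

static void watchdog_timer_fn(int now)
{
	if (sched_count - sched_prev) {		/* progress counts as a touch */
		touch_ts = now;
		sched_prev = sched_count;
	} else if (now - touch_ts > THRESHOLD) {
		printf("BUG: soft lockup - CPU stuck for %ds!\n",
		       now - touch_ts);
	}
	/* The real callback also calls set_tsk_need_resched(current)
	 * and set_preempt_need_resched() here, so a healthy CPU is
	 * forced through __schedule() before the next sample. */
}

int main(void)
{
	int now;

	for (now = SAMPLE_PERIOD; now <= 40; now += SAMPLE_PERIOD) {
		if (now <= 12)		/* schedules normally at first, */
			sched_count++;	/* then the CPU gets stuck */
		watchdog_timer_fn(now);
	}
	return 0;
}

The model first warns at t=36, 24 seconds after the last observed schedule, mirroring how the real detector only fires once the touch timestamp ages past the threshold.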
* Re: [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count
  2020-10-24 22:57 ` [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count Xi Wang
@ 2020-10-26  8:32   ` Vincent Guittot
  2020-10-27 17:39     ` Xi Wang
  0 siblings, 1 reply; 4+ messages in thread
From: Vincent Guittot @ 2020-10-26  8:32 UTC (permalink / raw)
  To: Xi Wang
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Josh Don, linux-kernel

On Sun, 25 Oct 2020 at 00:57, Xi Wang <xii@google.com> wrote:
>
> Instead of periodically resetting watchdogs from thread context,
> this patch simply forces resched and checks rq->sched_count.
> Watchdog is reset if the sched count increases. If the same thread
> is picked by pick_next_task during resched, there is no context
> switch.
>
> With the new method we lose coverage on: a migration/n thread
> actually gets picked and we actually context switch to the
> migration/n thread. These steps are unlikely to silently fail.
> The change would provide nearly the same level of protection with
> less latency / jitter.

When a patch provides an improvement, it's usually good to give
figures that show the improvement

>
> Suggested-by: Paul Turner <pjt@google.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Xi Wang <xii@google.com>
> ---
>  include/linux/sched.h |  4 ++++
>  kernel/sched/core.c   | 23 +++++++++++++++++++--
>  kernel/sched/sched.h  |  6 +++++-
>  kernel/watchdog.c     | 47 +++++++++++++------------------------
>  4 files changed, 44 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d383cf09e78f..1e3bef9a9cdb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1662,6 +1662,10 @@ extern int sched_setattr(struct task_struct *, const struct sched_attr *);
>  extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
>  extern struct task_struct *idle_task(int cpu);
>
> +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> +extern unsigned int sched_get_count(int cpu);
> +#endif
> +
>  /**
>   * is_idle_task - is the specified task an idle task?
>   * @p: the task in question.
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8160ab5263f8..378f0f36c402 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4293,8 +4293,6 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
>         rcu_sleep_check();
>
>         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> -
> -       schedstat_inc(this_rq()->sched_count);
>  }
>
>  static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
> @@ -4492,6 +4490,12 @@ static void __sched notrace __schedule(bool preempt)
>         clear_tsk_need_resched(prev);
>         clear_preempt_need_resched();
>
> +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> +       this_rq()->sched_count++; /* sched count is also used by watchdog */
> +#else
> +       schedstat_inc(this_rq()->sched_count);
> +#endif
> +
>         if (likely(prev != next)) {
>                 rq->nr_switches++;
>                 /*
> @@ -5117,6 +5121,21 @@ struct task_struct *idle_task(int cpu)
>         return cpu_rq(cpu)->idle;
>  }
>
> +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> +
> +/**
> + * sched_get_count - get the sched count of a CPU.
> + * @cpu: the CPU in question.
> + *
> + * Return: sched count.
> + */
> +unsigned int sched_get_count(int cpu)
> +{
> +       return cpu_rq(cpu)->sched_count;
> +}
> +
> +#endif
> +
>  /**
>   * find_process_by_pid - find a process with a matching PID value.
>   * @pid: the pid in question.
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 28709f6b0975..f23255981d52 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -959,6 +959,11 @@ struct rq {
>         u64 clock_pelt;
>         unsigned long lost_idle_time;
>
> +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_SOFTLOCKUP_DETECTOR)
> +       /* Also used by watchdog - no longer grouping with other sched stats */
> +       unsigned int sched_count;
> +#endif
> +
>         atomic_t nr_iowait;
>
>  #ifdef CONFIG_MEMBARRIER
> @@ -1036,7 +1041,6 @@ struct rq {
>         unsigned int yld_count;
>
>         /* schedule() stats */
> -       unsigned int sched_count;
>         unsigned int sched_goidle;
>
>         /* try_to_wake_up() stats */
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 5abb5b22ad13..22f87aded95a 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -170,6 +170,7 @@ static bool softlockup_initialized __read_mostly;
>  static u64 __read_mostly sample_period;
>
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
> +static DEFINE_PER_CPU(unsigned int, watchdog_sched_prev);
>  static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
>  static DEFINE_PER_CPU(bool, softlockup_touch_sync);
>  static DEFINE_PER_CPU(bool, soft_watchdog_warn);
> @@ -239,6 +240,7 @@ static void set_sample_period(void)
>  static void __touch_watchdog(void)
>  {
>         __this_cpu_write(watchdog_touch_ts, get_timestamp());
> +       __this_cpu_write(watchdog_sched_prev, sched_get_count(smp_processor_id()));
>  }
>
>  /**
> @@ -318,25 +320,6 @@ static void watchdog_interrupt_count(void)
>         __this_cpu_inc(hrtimer_interrupts);
>  }
>
> -static DEFINE_PER_CPU(struct completion, softlockup_completion);
> -static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
> -
> -/*
> - * The watchdog thread function - touches the timestamp.
> - *
> - * It only runs once every sample_period seconds (4 seconds by
> - * default) to reset the softlockup timestamp. If this gets delayed
> - * for more than 2*watchdog_thresh seconds then the debug-printout
> - * triggers in watchdog_timer_fn().
> - */
> -static int softlockup_fn(void *data)
> -{
> -       __touch_watchdog();
> -       complete(this_cpu_ptr(&softlockup_completion));
> -
> -       return 0;
> -}
> -
>  /* watchdog kicker functions */
>  static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  {
> @@ -351,15 +334,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>         /* kick the hardlockup detector */
>         watchdog_interrupt_count();
>
> -       /* kick the softlockup detector */
> -       if (completion_done(this_cpu_ptr(&softlockup_completion))) {
> -               reinit_completion(this_cpu_ptr(&softlockup_completion));
> -               stop_one_cpu_nowait(smp_processor_id(),
> -                               softlockup_fn, NULL,
> -                               this_cpu_ptr(&softlockup_stop_work));
> -       }
> -
> -       /* .. and repeat */
>         hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
>
>         if (touch_ts == SOFTLOCKUP_RESET) {
> @@ -378,6 +352,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>                 return HRTIMER_RESTART;
>         }
>
> +       /* Trigger reschedule for the next round */
> +       set_tsk_need_resched(current);
> +       set_preempt_need_resched();
> +
> +       /* sched_count increase in __schedule is taken as watchdog touched */
> +       if (sched_get_count(smp_processor_id()) -
> +           __this_cpu_read(watchdog_sched_prev)) {
> +               __touch_watchdog();
> +               __this_cpu_write(soft_watchdog_warn, false);
> +               return HRTIMER_RESTART;
> +       }
> +
>         /* check for a softlockup
>          * This is done by making sure a high priority task is
>          * being scheduled.  The task touches the watchdog to
> @@ -443,13 +429,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
>  static void watchdog_enable(unsigned int cpu)
>  {
>         struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
> -       struct completion *done = this_cpu_ptr(&softlockup_completion);
>
>         WARN_ON_ONCE(cpu != smp_processor_id());
>
> -       init_completion(done);
> -       complete(done);
> -
>         /*
>          * Start the timer first to prevent the NMI watchdog triggering
>          * before the timer has a chance to fire.
> @@ -479,7 +461,6 @@ static void watchdog_disable(unsigned int cpu)
>          */
>         watchdog_nmi_disable(cpu);
>         hrtimer_cancel(hrtimer);
> -       wait_for_completion(this_cpu_ptr(&softlockup_completion));
>  }
>
>  static int softlockup_stop_fn(void *data)
> --
> 2.29.0.rc2.309.g374f81d7ae-goog
>

^ permalink raw reply	[flat|nested] 4+ messages in thread
* Re: [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count
  2020-10-26  8:32   ` Vincent Guittot
@ 2020-10-27 17:39     ` Xi Wang
  0 siblings, 0 replies; 4+ messages in thread
From: Xi Wang @ 2020-10-27 17:39 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Juri Lelli,
	Dietmar Eggemann, Steven Rostedt, Josh Don, linux-kernel

On Mon, Oct 26, 2020 at 1:32 AM Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> On Sun, 25 Oct 2020 at 00:57, Xi Wang <xii@google.com> wrote:
> >
> > Instead of periodically resetting watchdogs from thread context,
> > this patch simply forces resched and checks rq->sched_count.
> > Watchdog is reset if the sched count increases. If the same thread
> > is picked by pick_next_task during resched, there is no context
> > switch.
> >
> > With the new method we lose coverage on: a migration/n thread
> > actually gets picked and we actually context switch to the
> > migration/n thread. These steps are unlikely to silently fail.
> > The change would provide nearly the same level of protection with
> > less latency / jitter.
>
> When a patch provides an improvement, it's usually good to give
> figures that show the improvement

This change would reduce jitters for a continuously running thread.
The difference is likely too small to tell for sched latency
benchmarks. will-it-scale mmap1 reported 15.8% improvement for the
v1 patch:

https://lkml.org/lkml/2020/3/10/129

The performance change is actually unexpected. If it's not noise
there might be contentions related to watchdog thread wake up or
context switch.

-Xi

>
> >
> > Suggested-by: Paul Turner <pjt@google.com>
> > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > Signed-off-by: Xi Wang <xii@google.com>
> > ---
> >  include/linux/sched.h |  4 ++++
> >  kernel/sched/core.c   | 23 +++++++++++++++++++--
> >  kernel/sched/sched.h  |  6 +++++-
> >  kernel/watchdog.c     | 47 +++++++++++++------------------------
> >  4 files changed, 44 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d383cf09e78f..1e3bef9a9cdb 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1662,6 +1662,10 @@ extern int sched_setattr(struct task_struct *, const struct sched_attr *);
> >  extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
> >  extern struct task_struct *idle_task(int cpu);
> >
> > +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> > +extern unsigned int sched_get_count(int cpu);
> > +#endif
> > +
> >  /**
> >   * is_idle_task - is the specified task an idle task?
> >   * @p: the task in question.
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 8160ab5263f8..378f0f36c402 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -4293,8 +4293,6 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
> >         rcu_sleep_check();
> >
> >         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
> > -
> > -       schedstat_inc(this_rq()->sched_count);
> >  }
> >
> >  static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
> > @@ -4492,6 +4490,12 @@ static void __sched notrace __schedule(bool preempt)
> >         clear_tsk_need_resched(prev);
> >         clear_preempt_need_resched();
> >
> > +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> > +       this_rq()->sched_count++; /* sched count is also used by watchdog */
> > +#else
> > +       schedstat_inc(this_rq()->sched_count);
> > +#endif
> > +
> >         if (likely(prev != next)) {
> >                 rq->nr_switches++;
> >                 /*
> > @@ -5117,6 +5121,21 @@ struct task_struct *idle_task(int cpu)
> >         return cpu_rq(cpu)->idle;
> >  }
> >
> > +#ifdef CONFIG_SOFTLOCKUP_DETECTOR
> > +
> > +/**
> > + * sched_get_count - get the sched count of a CPU.
> > + * @cpu: the CPU in question.
> > + *
> > + * Return: sched count.
> > + */
> > +unsigned int sched_get_count(int cpu)
> > +{
> > +       return cpu_rq(cpu)->sched_count;
> > +}
> > +
> > +#endif
> > +
> >  /**
> >   * find_process_by_pid - find a process with a matching PID value.
> >   * @pid: the pid in question.
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 28709f6b0975..f23255981d52 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -959,6 +959,11 @@ struct rq {
> >         u64 clock_pelt;
> >         unsigned long lost_idle_time;
> >
> > +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_SOFTLOCKUP_DETECTOR)
> > +       /* Also used by watchdog - no longer grouping with other sched stats */
> > +       unsigned int sched_count;
> > +#endif
> > +
> >         atomic_t nr_iowait;
> >
> >  #ifdef CONFIG_MEMBARRIER
> > @@ -1036,7 +1041,6 @@ struct rq {
> >         unsigned int yld_count;
> >
> >         /* schedule() stats */
> > -       unsigned int sched_count;
> >         unsigned int sched_goidle;
> >
> >         /* try_to_wake_up() stats */
> > diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> > index 5abb5b22ad13..22f87aded95a 100644
> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -170,6 +170,7 @@ static bool softlockup_initialized __read_mostly;
> >  static u64 __read_mostly sample_period;
> >
> >  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
> > +static DEFINE_PER_CPU(unsigned int, watchdog_sched_prev);
> >  static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
> >  static DEFINE_PER_CPU(bool, softlockup_touch_sync);
> >  static DEFINE_PER_CPU(bool, soft_watchdog_warn);
> > @@ -239,6 +240,7 @@ static void set_sample_period(void)
> >  static void __touch_watchdog(void)
> >  {
> >         __this_cpu_write(watchdog_touch_ts, get_timestamp());
> > +       __this_cpu_write(watchdog_sched_prev, sched_get_count(smp_processor_id()));
> >  }
> >
> >  /**
> > @@ -318,25 +320,6 @@ static void watchdog_interrupt_count(void)
> >         __this_cpu_inc(hrtimer_interrupts);
> >  }
> >
> > -static DEFINE_PER_CPU(struct completion, softlockup_completion);
> > -static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
> > -
> > -/*
> > - * The watchdog thread function - touches the timestamp.
> > - *
> > - * It only runs once every sample_period seconds (4 seconds by
> > - * default) to reset the softlockup timestamp. If this gets delayed
> > - * for more than 2*watchdog_thresh seconds then the debug-printout
> > - * triggers in watchdog_timer_fn().
> > - */
> > -static int softlockup_fn(void *data)
> > -{
> > -       __touch_watchdog();
> > -       complete(this_cpu_ptr(&softlockup_completion));
> > -
> > -       return 0;
> > -}
> > -
> >  /* watchdog kicker functions */
> >  static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> >  {
> > @@ -351,15 +334,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> >         /* kick the hardlockup detector */
> >         watchdog_interrupt_count();
> >
> > -       /* kick the softlockup detector */
> > -       if (completion_done(this_cpu_ptr(&softlockup_completion))) {
> > -               reinit_completion(this_cpu_ptr(&softlockup_completion));
> > -               stop_one_cpu_nowait(smp_processor_id(),
> > -                               softlockup_fn, NULL,
> > -                               this_cpu_ptr(&softlockup_stop_work));
> > -       }
> > -
> > -       /* .. and repeat */
> >         hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
> >
> >         if (touch_ts == SOFTLOCKUP_RESET) {
> > @@ -378,6 +352,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> >                 return HRTIMER_RESTART;
> >         }
> >
> > +       /* Trigger reschedule for the next round */
> > +       set_tsk_need_resched(current);
> > +       set_preempt_need_resched();
> > +
> > +       /* sched_count increase in __schedule is taken as watchdog touched */
> > +       if (sched_get_count(smp_processor_id()) -
> > +           __this_cpu_read(watchdog_sched_prev)) {
> > +               __touch_watchdog();
> > +               __this_cpu_write(soft_watchdog_warn, false);
> > +               return HRTIMER_RESTART;
> > +       }
> > +
> >         /* check for a softlockup
> >          * This is done by making sure a high priority task is
> >          * being scheduled.  The task touches the watchdog to
> > @@ -443,13 +429,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
> >  static void watchdog_enable(unsigned int cpu)
> >  {
> >         struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
> > -       struct completion *done = this_cpu_ptr(&softlockup_completion);
> >
> >         WARN_ON_ONCE(cpu != smp_processor_id());
> >
> > -       init_completion(done);
> > -       complete(done);
> > -
> >         /*
> >          * Start the timer first to prevent the NMI watchdog triggering
> >          * before the timer has a chance to fire.
> > @@ -479,7 +461,6 @@ static void watchdog_disable(unsigned int cpu)
> >          */
> >         watchdog_nmi_disable(cpu);
> >         hrtimer_cancel(hrtimer);
> > -       wait_for_completion(this_cpu_ptr(&softlockup_completion));
> >  }
> >
> >  static int softlockup_stop_fn(void *data)
> > --
> > 2.29.0.rc2.309.g374f81d7ae-goog
> >

^ permalink raw reply	[flat|nested] 4+ messages in thread
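One way to produce the figures asked for above is a busy-loop jitter sampler pinned to a single CPU: under the old mechanism, the periodic preemption by migration/N should show up as a spike in the largest gap between consecutive clock reads roughly once per sample period. The following is a hypothetical sketch, not a benchmark used in this thread:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	/* Pin externally (e.g. "taskset -c 3 ./jitter") and run for
	 * longer than one watchdog sample period (4s by default). */
	uint64_t prev = now_ns(), max_gap = 0;
	uint64_t end = prev + 10ull * 1000000000ull;	/* 10 s run */

	while (prev < end) {
		uint64_t t = now_ns();

		if (t - prev > max_gap)	/* track the worst stall seen */
			max_gap = t - prev;
		prev = t;
	}
	printf("max gap: %llu ns\n", (unsigned long long)max_gap);
	return 0;
}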
end of thread, other threads:[~2020-10-27 17:41 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-24 22:57 [PATCH v3 0/1] sched: watchdog: Touch kernel watchdog with sched count Xi Wang
2020-10-24 22:57 ` [PATCH v3 1/1] sched: watchdog: Touch kernel watchdog with sched count Xi Wang
2020-10-26  8:32   ` Vincent Guittot
2020-10-27 17:39     ` Xi Wang