From mboxrd@z Thu Jan 1 00:00:00 1970 References: From: Jan Kiszka Message-ID: <40e76345-95a5-8425-e787-b90d6655f672@siemens.com> Date: Tue, 4 Sep 2018 20:01:20 +0200 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [Xenomai] [PATCH] cobalt/sched: improve watchdog accuracy List-Id: Discussions about the Xenomai project List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Philippe Gerum , xenomai@xenomai.org On 2018-09-04 16:24, Philippe Gerum wrote: > The original watchdog mechanism was based on a sampling method: every > second (built-in value), it used to check the runtime mode of the > current task preempted on the ticking CPU. A per-cpu counter was > increased by one every time rt/primary mode was detected, then checked > against the trigger limit (CONFIG_XENO_OPT_WATCHDOG_TIMEOUT). > Otherwise, the counter was reset to zero. > > With this fairly naive approach, it only takes a single hit with > CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=1 to trigger the watchdog, i.e. if > the system-fixed 1s watchdog tick preempts any Xenomai task when it is > running in primary mode on the current CPU, the watchdog fires. > > The default value of 4s papered over the inherent imprecision of such > a coarse-grained method, lengthening the odds of observing false > positive triggers. > > To improve the accuracy of the watchdog, arm the watchdog timer to > fire at the final trigger date directly, right before switching the > CPU to primary mode (leave_root()), disarming it when the CPU is about > to switch back to secondary mode (enter_root()). > > Better accuracy comes at the expense of slightly more overhead when > transitioning between primary and secondary modes, which should be > acceptable for a debug feature which is not affecting the hot path > anyway (i.e. there is no added cost for strictly rt context switches). > > Signed-off-by: Philippe Gerum > --- > include/cobalt/kernel/sched.h | 13 ------------- > kernel/cobalt/sched.c | 41 ++++++++++++++++++++++++++--------------- > kernel/cobalt/timer.c | 7 ------- > 3 files changed, 26 insertions(+), 35 deletions(-) > > diff --git a/include/cobalt/kernel/sched.h b/include/cobalt/kernel/sched.h > index 03436ee82..0d0c71b42 100644 > --- a/include/cobalt/kernel/sched.h > +++ b/include/cobalt/kernel/sched.h > @@ -104,8 +104,6 @@ struct xnsched { > #ifdef CONFIG_XENO_OPT_WATCHDOG > /*!< Watchdog timer object. */ > struct xntimer wdtimer; > - /*!< Watchdog tick count. */ > - int wdcount; > #endif > #ifdef CONFIG_XENO_OPT_STATS > /*!< Last account switch date (ticks). */ > @@ -361,17 +359,6 @@ xnsched_maybe_resched_after_unlocked_switch(struct xnsched *sched) > > #endif /* !CONFIG_XENO_ARCH_UNLOCKED_SWITCH */ > > -#ifdef CONFIG_XENO_OPT_WATCHDOG > -static inline void xnsched_reset_watchdog(struct xnsched *sched) > -{ > - sched->wdcount = 0; > -} > -#else /* !CONFIG_XENO_OPT_WATCHDOG */ > -static inline void xnsched_reset_watchdog(struct xnsched *sched) > -{ > -} > -#endif /* CONFIG_XENO_OPT_WATCHDOG */ > - > #include > #include > > diff --git a/kernel/cobalt/sched.c b/kernel/cobalt/sched.c > index 76e3bc5d7..dd14d4975 100644 > --- a/kernel/cobalt/sched.c > +++ b/kernel/cobalt/sched.c > @@ -93,14 +93,20 @@ void xnsched_register_classes(void) > static unsigned long wd_timeout_arg = CONFIG_XENO_OPT_WATCHDOG_TIMEOUT; > module_param_named(watchdog_timeout, wd_timeout_arg, ulong, 0644); > > +static inline xnticks_t get_watchdog_timeout(void) > +{ > + return wd_timeout_arg * 1000000000ULL; > +} > + > /** > * @internal > * @fn void watchdog_handler(struct xntimer *timer) > * @brief Process watchdog ticks. > * > - * This internal routine handles incoming watchdog ticks to detect > - * software lockups. It kills any offending thread which is found to > - * monopolize the CPU so as to starve the Linux kernel for too long. > + * This internal routine handles incoming watchdog triggers to detect > + * software lockups. It forces the offending thread to stop > + * monopolizing the CPU, either by kicking it out of primary mode if > + * running in user space, or cancelling it if kernel-based. > * > * @coretags{coreirq-only, atomic-entry} > */ > @@ -109,14 +115,15 @@ static void watchdog_handler(struct xntimer *timer) > struct xnsched *sched = xnsched_current(); > struct xnthread *curr = sched->curr; > > - if (likely(xnthread_test_state(curr, XNROOT))) { > - xnsched_reset_watchdog(sched); > - return; > - } > - > - if (likely(++sched->wdcount < wd_timeout_arg)) > + /* > + * CAUTION: The watchdog tick might have been delayed while we > + * were busy switching the CPU to secondary mode at the > + * trigger date eventually. Make sure that we are not about to > + * kick the incoming root thread. > + */ > + if (xnthread_test_state(curr, XNROOT)) > return; > - > + Stray whitespace change. Could fix that on merge but... > trace_cobalt_watchdog_signal(curr); > > if (xnthread_test_state(curr, XNUSER)) { > @@ -137,8 +144,6 @@ static void watchdog_handler(struct xntimer *timer) > */ > xnthread_set_info(curr, XNKICKED|XNCANCELD); > } > - > - xnsched_reset_watchdog(sched); > } > > #endif /* CONFIG_XENO_OPT_WATCHDOG */ > @@ -818,6 +823,10 @@ static inline void enter_root(struct xnthread *root) > { > struct xnarchtcb *rootcb __maybe_unused = xnthread_archtcb(root); > > +#ifdef CONFIG_XENO_OPT_WATCHDOG > + xntimer_stop(&root->sched->wdtimer); > +#endif > + > #ifdef CONFIG_XENO_ARCH_UNLOCKED_SWITCH > if (rootcb->core.mm == NULL) > set_ti_thread_flag(rootcb->core.tip, TIF_MMSWITCH_INT); > @@ -840,6 +849,11 @@ static inline void leave_root(struct xnthread *root) > rootcb->core.tip = task_thread_info(p); > #endif > xnarch_leave_root(root); > + > +#ifdef CONFIG_XENO_OPT_WATCHDOG > + xntimer_start(&root->sched->wdtimer, get_watchdog_timeout(), > + XN_INFINITE, XN_RELATIVE); > +#endif > } > > void __xnsched_run_handler(void) /* hw interrupts off. */ > @@ -889,9 +903,6 @@ reschedule: > > trace_cobalt_switch_context(prev, next); > > - if (xnthread_test_state(next, XNROOT)) > - xnsched_reset_watchdog(sched); > - > sched->curr = next; > shadow = 1; > > diff --git a/kernel/cobalt/timer.c b/kernel/cobalt/timer.c > index 6d985a995..5d8b3bc76 100644 > --- a/kernel/cobalt/timer.c > +++ b/kernel/cobalt/timer.c > @@ -893,10 +893,6 @@ int xntimer_grab_hardware(void) > else if (ret == 1) > xntimer_start(&sched->htimer, 0, 0, XN_RELATIVE); > > -#ifdef CONFIG_XENO_OPT_WATCHDOG > - xntimer_start(&sched->wdtimer, 1000000000UL, 1000000000UL, XN_RELATIVE); > - xnsched_reset_watchdog(sched); > -#endif > xnlock_put_irqrestore(&nklock, s); > } > > @@ -908,9 +904,6 @@ fail: > xnlock_get_irqsave(&nklock, s); > sched = xnsched_struct(cpu); > xntimer_stop(&sched->htimer); > -#ifdef CONFIG_XENO_OPT_WATCHDOG > - xntimer_stop(&sched->wdtimer); > -#endif > xnlock_put_irqrestore(&nklock, s); > ipipe_timer_stop(_cpu); > } > ... it does not apply to next. There must be something between this patch and current next (and it's not "cobalt/sched: group high-level init/cleanup code") which breaks things. Jan -- Siemens AG, Corporate Technology, CT RDA IOT SES-DE Corporate Competence Center Embedded Linux