All of lore.kernel.org
 help / color / mirror / Atom feed
From: Daniel Bristot de Oliveira <bristot@kernel.org>
To: Guenter Roeck <linux@roeck-us.net>,
	Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>,
	Wim Van Sebroeck <wim@linux-watchdog.org>,
	Jonathan Corbet <corbet@lwn.net>, Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Peter Zijlstra <peterz@infradead.org>,
	Will Deacon <will@kernel.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Marco Elver <elver@google.com>,
	Dmitry Vyukov <dvyukov@google.com>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Clark Williams <williams@redhat.com>,
	"open list:DOCUMENTATION" <linux-doc@vger.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	linux-trace-devel@vger.kernel.org
Subject: Re: [PATCH V4 18/20] rv/monitor: Add safe watchdog monitor
Date: Fri, 17 Jun 2022 19:06:46 +0200	[thread overview]
Message-ID: <9003420b-a74b-dfa4-37b5-ed7bb952a816@kernel.org> (raw)
In-Reply-To: <17ade9e2-8fbe-ea80-93c1-9f1e291805b6@roeck-us.net>

On 6/17/22 01:53, Guenter Roeck wrote:
>>
>>     struct watchdog_device {
>>     ...
>>              #ifdef CONFIG_RV_MON_SAFE_WTD
>>              struct da_monitor da_mon;
>>              #endif
>>     ...
>>     }
>>
> 
> In my opinion shis should be dynamically allocated and not waste space in driver
> code if unused.

ack

>>     A simplified version of the the "per task" monitor, in the patch 01, changes in
>>
>>     include/linux/sched.h.
>>
>>      >> +
>>      >> +static void handle_nowayout(void *data, struct watchdog_device *wdd)
>>      >> +{
>>      >> +    if (wdd->id != watchdog_id)
>>      >> +        return;
>>      >> +
>>      >> +    da_handle_init_run_event_safe_wtd(nowayout_safe_wtd);
>>      >> +}
>>      >> +
>>      >> +static void handle_close(void *data, struct watchdog_device *wdd)
>>      >> +{
>>      >> +    if (wdd->id != watchdog_id)
>>      >> +        return;
>>      >> +
>>      >> +    if (open_pid && current->pid != open_pid) {
>>      >> +        da_handle_init_run_event_safe_wtd(other_threads_safe_wtd);
>>      >> +    } else {
>>      >> +        da_handle_event_safe_wtd(close_safe_wtd);
>>      >> +        open_pid = 0;
>>      >> +    }
>>      >> +}
>>      >> +
>>      >> +static void handle_open(void *data, struct watchdog_device *wdd)
>>      >> +{
>>      >> +    if (wdd->id != watchdog_id)
>>      >> +        return;
>>      >> +
>>      >> +    if (open_pid && current->pid != open_pid) {
>>      >> +        da_handle_init_run_event_safe_wtd(other_threads_safe_wtd);
>>      >> +    } else {
>>      >> +        da_handle_init_run_event_safe_wtd(open_safe_wtd);
>>      >> +        open_pid = current->pid;
>>      >> +    }
>>      >> +}
>>      >> +
>>      >> +static void blocked_events(void *data, struct watchdog_device *wdd)
>>      >> +{
>>      >> +    if (wdd->id != watchdog_id)
>>      >> +        return;
>>      >> +
>>      >> +    if (open_pid && current->pid != open_pid) {
>>      >> +        da_handle_init_run_event_safe_wtd(other_threads_safe_wtd);
>>      >> +        return;
>>      >> +    }
>>      >> +    da_handle_event_safe_wtd(other_threads_safe_wtd);
>>      >> +}
>>      >> +
>>      >> +static void blocked_events_timeout(void *data, struct watchdog_device *wdd,
>>
>>      >> u64 timeout)
>>      >> +{
>>      >> +    blocked_events(data, wdd);
>>      >> +}
>>      >> +
>>      >> +static void handle_ping(void *data, struct watchdog_device *wdd)
>>      >> +{
>>      >> +    char msg[128];
>>      >> +    unsigned int timeout;
>>      >> +
>>      >> +    if (wdd->id != watchdog_id)
>>      >> +        return;
>>      >> +
>>      >> +    if (open_pid && current->pid != open_pid) {
>>      >> +        da_handle_init_run_event_safe_wtd(other_threads_safe_wtd);
>>      >> +        return;
>>      >> +    }
>>      >> +
>>      >> +    da_handle_event_safe_wtd(ping_safe_wtd);
>>      >> +
>>      >> +    if (!check_timeout)
>>      >> +        return;
>>      >> +
>>      >> +    if (wdd->ops->get_timeleft) {
>>      >> +        timeout = wdd->ops->get_timeleft(wdd);
>>      >> +        if (timeout > last_timeout_set) {
>>      >> +            snprintf(msg, 128,
>>      >> +                 "watchdog timeout is %u > than previously set (%d)\n",
>>      >> +                 timeout, last_timeout_set);
>>      >> +            cond_react(msg);
>>      >> +        }
>>      >> +    } else {
>>      >> +        snprintf(msg, 128, "error getting timeout: option not supported\n");
>>
>>      >
>>      > This is not an error. The get_timeleft callback is optional.
>>
>>     Right... but this part of the code is only reachable if the user explicitly
>>     asked to check the timeout (if (!check_timeout)...return before this code).
>>
>>     So, if the user only considers the system safe if the monitor also checks the
>>     written timeout, but the watchdog is one of those that do not have the callback
>>
>>     implemented (which is ok for a Linux watchdog), the monitor captures this
>>     "undesired" behavior.
>>
>>     This monitor is not checking if the watchdog subsystem is correct at its
>>     plenitude, it is checking if the watchdog usage is following a set of
>>     specifications (raised by people in the LF Elisa workgroup).
>>
> 
> The kernel is not intended for special use cases. The callback is optional,
> period. The test for check_timeout is way too late. A check like this should
> be made when the check is requested, not when it is executed - in other words,
> when the user requests it. That request should fail.
> 
>>      >> +        cond_react(msg);
>>      >> +    }

[...]

>>      >
>>      > I find this event table all but impossible to verify.
>>
>>     It is a matrix. Lines are states, and columns are events.
>>
>>     On a given state/line, receiving a given event/column, the data is the next
>>     state/row.
>>
> 
> I am aware of that, and I did program state machines before.
> 
>>     For instance, let's say "init" (row 0), event "nwo" (column 1), and the next
>>     state is the "nwo" (row 3).
>>
>>     -1 means invalid/blocked state (yeah, maybe it is better to have an #define
>>     INVALID_STATE -1).
>>
>>     This is the C representation of an automaton, following the formal definition of
>>
>>     a deterministic automaton. I've added an explanation of this representation in
>>
>>     the documentation (patch 15, file da_monitor_synthesis.rst).
>>
>>     A deeper look into this subject is here (peer-reviewed conference paper at
>>     Software Engineer and Formal Methods 2019):
>>     https://bristot.me/wp-content/uploads/2019/09/paper.pdf <https://bristot.me/wp-content/uploads/2019/09/paper.pdf>
>>
>>
>>     One could translate it back to the automaton's graphical format... to a format
>>
>>     of by a tool used to analyze automaton properties... that is the good point of
>>
>>     using a well-established formalism. (The bad part is that they are often
>>     boring... c'est la vie :-)).
>>
> 
> If the above state machine fails, no one but the authors will be able to even
> remotely figure out what happened, and if the watchdog driver is at fault or
> its monitor.

That is a point we can improve. If we look only for the matrix, yes, it is not
the best way to try to understand what is going on. I agree.

The idea here is that one can describe the expected behavior and check. But we fail
to explain the reasons for these choices... 

The patch "20/20 Documentation/rv: Add watchdog-monitor documentation" attempts
to explain the idea behind the monitor, and includes the ASCII representation
of the monitor. But indeed, it fails on explaining why those assumptions were
made, what it is trying to monitor, and what it is not...

It is a state machine making assumptions about state transitions,
> sure, but who knows if those asssumptions are even remotely correct or match
> reality. For example, I have no idea if the lack of a 'ping' function is handled
> correctly,  if the lack of a 'stop' function is handled correctly, or what
> happens if any of the driver functions returns an error.
> 
> I already found three assumptions which do not or not necessarily match
> reality:
> 
> - The function to read the remaining timeout is optional and must not be
>   used unconditionally, and its lack is not an error.

Ack, that can be removed.

> - The requested timeout (and pretimeout) do not have to match the actually
>   configured timeout, and userspace must not rely on the assumption that
>   the values match.

I agree, the model is lacking that, and it needs it.

> - The code assumes that the process opening the watchdog and the process
>   accessing it are the same. While that is in general the case, it might
>   well be that some application opens the watchdog and then handles it
>   from a child process.

What is not clear on our documentation are the reasons behind these choices.
They are based on a hazard analysis made by people safety experts in the
Elisa group, that need to be clarified.

/me looks at people from Elisa...

-- Daniel

> And that is just after briefly browsing through the code> I am open to suggestions from others, but at this point I have serious doubts
> that this code is maintainable in the kernel.
> 
> Guenter


  reply	other threads:[~2022-06-17 17:07 UTC|newest]

Thread overview: 82+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-16  8:44 [PATCH V4 00/20] The Runtime Verification (RV) interface Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 01/20] rv: Add " Daniel Bristot de Oliveira
2022-06-23 17:21   ` Punit Agrawal
2022-07-01 13:24     ` Daniel Bristot de Oliveira
2022-06-23 20:26   ` Steven Rostedt
2022-07-04 19:49     ` Daniel Bristot de Oliveira
2022-07-06 17:49   ` Tao Zhou
2022-07-06 17:53     ` Matthew Wilcox
2022-07-08 15:36       ` Tao Zhou
2022-07-08 15:55         ` Matthew Wilcox
2022-07-08 14:39     ` Daniel Bristot de Oliveira
2022-07-10 15:11       ` Tao Zhou
2022-07-10 15:42         ` Steven Rostedt
2022-07-10 22:28           ` Tao Zhou
2022-06-16  8:44 ` [PATCH V4 02/20] rv: Add runtime reactors interface Daniel Bristot de Oliveira
2022-06-23 20:40   ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 03/20] rv/include: Add helper functions for deterministic automata Daniel Bristot de Oliveira
2022-06-28 17:48   ` Steven Rostedt
2022-07-06 18:35   ` Tao Zhou
2022-07-13 18:38     ` Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 04/20] rv/include: Add deterministic automata monitor definition via C macros Daniel Bristot de Oliveira
2022-07-06 18:56   ` Tao Zhou
2022-07-13 18:39     ` Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 05/20] rv/include: Add instrumentation helper functions Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 06/20] tools/rv: Add dot2c Daniel Bristot de Oliveira
2022-06-28 18:10   ` Steven Rostedt
2022-06-28 18:16   ` Steven Rostedt
2022-07-13 18:41     ` Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 07/20] tools/rv: Add dot2k Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 08/20] rv/monitor: Add the wip monitor skeleton created by dot2k Daniel Bristot de Oliveira
2022-06-28 19:02   ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 09/20] rv/monitor: wip instrumentation and Makefile/Kconfig entries Daniel Bristot de Oliveira
2022-06-16 11:21   ` kernel test robot
2022-06-16 21:00   ` Randy Dunlap
2022-06-17 16:07     ` Daniel Bristot de Oliveira
2022-06-28 19:02     ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 10/20] rv/monitor: Add the wwnr monitor skeleton created by dot2k Daniel Bristot de Oliveira
2022-07-06 20:08   ` Tao Zhou
2022-06-16  8:44 ` [PATCH V4 11/20] rv/monitor: wwnr instrumentation and Makefile/Kconfig entries Daniel Bristot de Oliveira
2022-06-16 13:47   ` kernel test robot
2022-06-28 19:05   ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 12/20] rv/reactor: Add the printk reactor Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 13/20] rv/reactor: Add the panic reactor Daniel Bristot de Oliveira
2022-06-16 15:20   ` kernel test robot
2022-06-16 21:03   ` Randy Dunlap
2022-06-17 16:09     ` Daniel Bristot de Oliveira
2022-07-13 18:47     ` Daniel Bristot de Oliveira
2022-06-28 19:06   ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 14/20] Documentation/rv: Add a basic documentation Daniel Bristot de Oliveira
2022-06-29  3:35   ` Bagas Sanjaya
2022-07-13 19:30     ` Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 15/20] Documentation/rv: Add deterministic automata monitor synthesis documentation Daniel Bristot de Oliveira
2022-06-28 19:09   ` Steven Rostedt
2022-06-16  8:44 ` [PATCH V4 16/20] Documentation/rv: Add deterministic automata instrumentation documentation Daniel Bristot de Oliveira
2022-06-16  8:44 ` [PATCH V4 17/20] watchdog/dev: Add tracepoints Daniel Bristot de Oliveira
2022-06-16 13:44   ` Guenter Roeck
2022-06-16 15:47     ` Daniel Bristot de Oliveira
2022-06-16 23:55       ` Guenter Roeck
2022-06-17 16:16         ` Daniel Bristot de Oliveira
2022-07-13 18:49         ` Daniel Bristot de Oliveira
2022-06-16  8:45 ` [PATCH V4 18/20] rv/monitor: Add safe watchdog monitor Daniel Bristot de Oliveira
2022-06-16 13:36   ` Guenter Roeck
2022-06-16 15:29     ` Daniel Bristot de Oliveira
     [not found]       ` <CA+wEVJbvcMZbCroO2_rdVxLvYkUo-ePxCwsp5vbDpoqys4HGWQ@mail.gmail.com>
2022-06-16 23:53         ` Guenter Roeck
2022-06-17 17:06           ` Daniel Bristot de Oliveira [this message]
2022-06-28 19:32           ` Steven Rostedt
2022-07-01 14:45             ` Guenter Roeck
2022-07-01 15:38               ` Steven Rostedt
2022-07-04 12:41                 ` Daniel Bristot de Oliveira
2022-06-16 20:57   ` Randy Dunlap
2022-06-17 16:17     ` Daniel Bristot de Oliveira
2022-07-13 19:13   ` Daniel Bristot de Oliveira
2022-06-16  8:45 ` [PATCH V4 19/20] rv/safety_app: Add a safety_app sample Daniel Bristot de Oliveira
2022-06-16  8:45 ` [PATCH V4 20/20] Documentation/rv: Add watchdog-monitor documentation Daniel Bristot de Oliveira
2022-07-07 12:41   ` Tao Zhou
2022-07-13 18:51     ` Daniel Bristot de Oliveira
2022-06-22  7:24 ` [PATCH V4 00/20] The Runtime Verification (RV) interface Song Liu
2022-06-23 16:41   ` Daniel Bristot de Oliveira
2022-06-23 17:52     ` Song Liu
2022-06-23 20:29       ` Daniel Bristot de Oliveira
2022-06-23 21:10         ` Song Liu
2022-07-06 16:18 ` Tao Zhou

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9003420b-a74b-dfa4-37b5-ed7bb952a816@kernel.org \
    --to=bristot@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=corbet@lwn.net \
    --cc=dvyukov@google.com \
    --cc=elver@google.com \
    --cc=gpaoloni@redhat.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-devel@vger.kernel.org \
    --cc=linux@roeck-us.net \
    --cc=mingo@redhat.com \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=skhan@linuxfoundation.org \
    --cc=tglx@linutronix.de \
    --cc=will@kernel.org \
    --cc=williams@redhat.com \
    --cc=wim@linux-watchdog.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.