* [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure
@ 2019-05-20 10:16 Petri Latvala
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 10:16 UTC (permalink / raw)
To: igt-dev; +Cc: Tomi Sarvela, Petri Latvala, Daniel Vetter
If the network goes down while testing, CI tends to interpret that as
the device being down, cutting its power after a while. This causes an
incomplete to an innocent test, increasing noise in the results.
A new flag to --abort-on-monitored-error, "ping", uses liboping to
ping a host configured in .igtrc with one ping after each test
execution and aborts the run if there is no reply in a hardcoded
amount of time.
v2:
- Use a higher timeout
- Allow hostname configuration from environment
v3:
- Use runner_c_args for holding c args for runner
- Handle runner's meson options in runner/meson.build
- Instead of one ping with 20 second timeout, ping with 1 second timeout
for a duration of 20 seconds
Signed-off-by: Petri Latvala <petri.latvala@intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
Cc: Martin Peres <martin.peres@linux.intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
---
meson_options.txt | 6 ++
runner/executor.c | 150 +++++++++++++++++++++++++++++++++++++++++++++
runner/meson.build | 14 ++++-
runner/settings.c | 4 ++
runner/settings.h | 5 +-
5 files changed, 176 insertions(+), 3 deletions(-)
diff --git a/meson_options.txt b/meson_options.txt
index 888efe56..935f883e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -58,6 +58,12 @@ option('build_runner',
choices : ['auto', 'true', 'false'],
description : 'Build test runner')
+option('with_oping',
+ type : 'combo',
+ value : 'auto',
+ choices : ['auto', 'true', 'false'],
+ description : 'Build igt_runner with liboping')
+
option('use_rpath',
type : 'boolean',
value : false,
diff --git a/runner/executor.c b/runner/executor.c
index 7e5fbe8f..c07e53fa 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -1,6 +1,10 @@
#include <errno.h>
#include <fcntl.h>
+#include <glib.h>
#include <linux/watchdog.h>
+#if HAVE_OPING
+#include <oping.h>
+#endif
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
@@ -16,6 +20,7 @@
#include <time.h>
#include <unistd.h>
+#include "igt_aux.h"
#include "igt_core.h"
#include "executor.h"
#include "output_strings.h"
@@ -108,6 +113,147 @@ static void ping_watchdogs(void)
}
}
+#if HAVE_OPING
+static pingobj_t *pingobj = NULL;
+
+static bool load_ping_config_from_file(void)
+{
+ char *key_file_env = NULL;
+ char *key_file_loc = NULL;
+ GError *error = NULL;
+ GKeyFile *key_file = NULL;
+ const char *ping_hostname;
+ int ret;
+
+ /* Determine igt config path */
+ key_file_env = getenv("IGT_CONFIG_PATH");
+ if (key_file_env) {
+ key_file_loc = strdup(key_file_env);
+ } else {
+ asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
+ }
+
+ /* Load igt config file */
+ key_file = g_key_file_new();
+ ret = g_key_file_load_from_file(key_file, key_file_loc,
+ G_KEY_FILE_NONE, &error);
+ free(key_file_loc);
+
+ if (!ret) {
+ g_error_free(error);
+ g_key_file_free(key_file);
+ return false;
+ }
+
+ g_clear_error(&error);
+
+ ping_hostname =
+ g_key_file_get_string(key_file, "DUT",
+ "PingHostName", &error);
+
+ g_clear_error(&error);
+ g_key_file_free(key_file);
+
+ if (!ping_hostname)
+ return false;
+
+ if (ping_host_add(pingobj, ping_hostname)) {
+ fprintf(stderr,
+ "abort on ping: Cannot use hostname from config file\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool load_ping_config_from_env(void)
+{
+ const char *ping_hostname;
+
+ ping_hostname = getenv("IGT_PING_HOSTNAME");
+ if (!ping_hostname)
+ return false;
+
+ if (ping_host_add(pingobj, ping_hostname)) {
+ fprintf(stderr,
+ "abort on ping: Cannot use hostname from environment\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool can_ping(void)
+{
+ /*
+ * On some hosts, getting network back up after suspend takes
+ * upwards of 10 seconds. 20 seconds should be enough to see
+ * if network comes back at all, and hopefully not too long to
+ * make external monitoring freak out.
+ */
+ igt_until_timeout(20) {
+ pingobj_iter_t *iter;
+
+ ping_send(pingobj);
+
+ for (iter = ping_iterator_get(pingobj);
+ iter != NULL;
+ iter = ping_iterator_next(iter)) {
+ double latency;
+ size_t len = sizeof(latency);
+
+ ping_iterator_get_info(iter,
+ PING_INFO_LATENCY,
+ &latency,
+ &len);
+ if (latency >= 0.0)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#endif
+
+static void ping_config(void)
+{
+#if HAVE_OPING
+ double timeout = 1.0;
+
+ if (pingobj)
+ return;
+
+ pingobj = ping_construct();
+
+ /* Try env first, then config file */
+ if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
+ fprintf(stderr,
+ "abort on ping: No host to ping configured\n");
+ ping_destroy(pingobj);
+ pingobj = NULL;
+ return;
+ }
+
+ ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
+#endif
+}
+
+static char *handle_ping(void)
+{
+#if HAVE_OPING
+ if (pingobj && !can_ping()) {
+ char *reason;
+
+ asprintf(&reason,
+ "Ping host did not respond to ping, network down");
+ return reason;
+ }
+#endif
+
+ return NULL;
+}
+
static char *handle_lockdep(void)
{
const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
@@ -197,6 +343,7 @@ static const struct {
} abort_handlers[] = {
{ ABORT_LOCKDEP, handle_lockdep },
{ ABORT_TAINT, handle_taint },
+ { ABORT_PING, handle_ping },
{ 0, 0 },
};
@@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
init_watchdogs(settings);
+ if (settings->abort_mask & ABORT_PING)
+ ping_config();
+
if (!uname(&unamebuf)) {
dprintf(unamefd, "%s %s %s %s %s\n",
unamebuf.sysname,
diff --git a/runner/meson.build b/runner/meson.build
index b658f1d2..6d60c530 100644
--- a/runner/meson.build
+++ b/runner/meson.build
@@ -1,4 +1,15 @@
jsonc = dependency('json-c', required: _runner_required)
+runner_deps = [jsonc, glib]
+runner_c_args = []
+
+with_oping = get_option('with_oping')
+if with_oping != 'false'
+ oping = dependency('liboping', required: with_oping == 'true')
+ if oping.found()
+ runner_deps += oping
+ runner_c_args += '-DHAVE_OPING=1'
+ endif
+endif
runnerlib_sources = [ 'settings.c',
'job_list.c',
@@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
runnerlib = static_library('igt_runner', runnerlib_sources,
include_directories : inc,
- dependencies : [jsonc, glib])
+ c_args : runner_c_args,
+ dependencies : runner_deps)
runner = executable('igt_runner', runner_sources,
link_with : runnerlib,
diff --git a/runner/settings.c b/runner/settings.c
index ad38ae8d..b57d1a6a 100644
--- a/runner/settings.c
+++ b/runner/settings.c
@@ -48,6 +48,7 @@ static struct {
} abort_conditions[] = {
{ ABORT_TAINT, "taint" },
{ ABORT_LOCKDEP, "lockdep" },
+ { ABORT_PING, "ping" },
{ ABORT_ALL, "all" },
{ 0, 0 },
};
@@ -136,6 +137,9 @@ static const char *usage_str =
" Possible conditions:\n"
" lockdep - abort when kernel lockdep has been angered.\n"
" taint - abort when kernel becomes fatally tainted.\n"
+ " ping - abort when a host configured in .igtrc or\n"
+ " environment variable IGT_PING_HOSTNAME does\n"
+ " not respond to ping.\n"
" all - abort for all of the above.\n"
" -s, --sync Sync results to disk after every test\n"
" -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
diff --git a/runner/settings.h b/runner/settings.h
index 0a1ee08d..2b6e65d0 100644
--- a/runner/settings.h
+++ b/runner/settings.h
@@ -15,9 +15,10 @@ enum {
#define ABORT_TAINT (1 << 0)
#define ABORT_LOCKDEP (1 << 1)
-#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP)
+#define ABORT_PING (1 << 2)
+#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
-_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
+_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
struct regex_list {
char **regex_strings;
--
2.19.1
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
@ 2019-05-20 10:16 ` Petri Latvala
2019-05-20 11:51 ` Petri Latvala
2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 10:16 UTC (permalink / raw)
To: igt-dev; +Cc: Petri Latvala
---
runner/executor.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..4a44c633 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
for (it = abort_handlers; it->condition; it++) {
char *abort;
- if (!(settings->abort_mask & it->condition))
+ if (false && !(settings->abort_mask & it->condition))
continue;
abort = it->handler();
--
2.19.1
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
@ 2019-05-20 11:51 ` Petri Latvala
2019-05-20 12:16 ` Petri Latvala
0 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 11:51 UTC (permalink / raw)
To: igt-dev; +Cc: Petri Latvala
... and remember to HAX the ping setup as well
---
runner/executor.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..af87e312 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
for (it = abort_handlers; it->condition; it++) {
char *abort;
- if (!(settings->abort_mask & it->condition))
+ if (false && !(settings->abort_mask & it->condition))
continue;
abort = it->handler();
@@ -1435,7 +1435,7 @@ bool execute(struct execute_state *state,
init_watchdogs(settings);
- if (settings->abort_mask & ABORT_PING)
+ if (true && settings->abort_mask & ABORT_PING)
ping_config();
if (!uname(&unamebuf)) {
--
2.19.1
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2)
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
@ 2019-05-20 12:09 ` Patchwork
2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2019-05-20 12:09 UTC (permalink / raw)
To: Petri Latvala; +Cc: igt-dev
== Series Details ==
Series: series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2)
URL : https://patchwork.freedesktop.org/series/60854/
State : success
== Summary ==
CI Bug Log - changes from CI_DRM_6099 -> IGTPW_3007
====================================================
Summary
-------
**SUCCESS**
No regressions found.
External URL: https://patchwork.freedesktop.org/api/1.0/series/60854/revisions/2/mbox/
Known issues
------------
Here are the changes found in IGTPW_3007 that come from known issues:
### IGT changes ###
#### Issues hit ####
* igt@prime_vgem@basic-fence-flip:
- fi-ilk-650: [PASS][1] -> [DMESG-WARN][2] ([fdo#106387]) +1 similar issue
[1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6099/fi-ilk-650/igt@prime_vgem@basic-fence-flip.html
[2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/fi-ilk-650/igt@prime_vgem@basic-fence-flip.html
#### Possible fixes ####
* igt@gem_exec_suspend@basic-s4-devices:
- fi-kbl-7500u: [DMESG-WARN][3] ([fdo#105128] / [fdo#107139]) -> [PASS][4]
[3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6099/fi-kbl-7500u/igt@gem_exec_suspend@basic-s4-devices.html
[4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/fi-kbl-7500u/igt@gem_exec_suspend@basic-s4-devices.html
[fdo#105128]: https://bugs.freedesktop.org/show_bug.cgi?id=105128
[fdo#106387]: https://bugs.freedesktop.org/show_bug.cgi?id=106387
[fdo#107139]: https://bugs.freedesktop.org/show_bug.cgi?id=107139
Participating hosts (52 -> 43)
------------------------------
Missing (9): fi-kbl-soraka fi-ilk-m540 fi-bxt-dsi fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-apl-guc fi-byt-clapper fi-bdw-samus
Build changes
-------------
* IGT: IGT_4997 -> IGTPW_3007
CI_DRM_6099: 3cef0eacdd3d3448dfefb7011f1e66821c5fd41d @ git://anongit.freedesktop.org/gfx-ci/linux
IGTPW_3007: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/
IGT_4997: eff5d0db3248734845b78fcc2e2772dd4012e5af @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply [flat|nested] 9+ messages in thread
* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
2019-05-20 11:51 ` Petri Latvala
@ 2019-05-20 12:16 ` Petri Latvala
0 siblings, 0 replies; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 12:16 UTC (permalink / raw)
To: igt-dev; +Cc: Petri Latvala
... and remember to HAX the ping setup as well
... correctly
---
runner/executor.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..8cb09dda 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
for (it = abort_handlers; it->condition; it++) {
char *abort;
- if (!(settings->abort_mask & it->condition))
+ if (false && !(settings->abort_mask & it->condition))
continue;
abort = it->handler();
@@ -1435,7 +1435,7 @@ bool execute(struct execute_state *state,
init_watchdogs(settings);
- if (settings->abort_mask & ABORT_PING)
+ if (true || settings->abort_mask & ABORT_PING)
ping_config();
if (!uname(&unamebuf)) {
--
2.19.1
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3)
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
@ 2019-05-20 13:33 ` Patchwork
2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2019-05-20 13:33 UTC (permalink / raw)
To: Petri Latvala; +Cc: igt-dev
== Series Details ==
Series: series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3)
URL : https://patchwork.freedesktop.org/series/60854/
State : success
== Summary ==
CI Bug Log - changes from CI_DRM_6100 -> IGTPW_3009
====================================================
Summary
-------
**SUCCESS**
No regressions found.
External URL: https://patchwork.freedesktop.org/api/1.0/series/60854/revisions/3/mbox/
Known issues
------------
Here are the changes found in IGTPW_3009 that come from known issues:
### IGT changes ###
#### Issues hit ####
* igt@i915_selftest@live_hangcheck:
- fi-skl-iommu: [PASS][1] -> [INCOMPLETE][2] ([fdo#108602] / [fdo#108744])
[1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-skl-iommu/igt@i915_selftest@live_hangcheck.html
[2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-skl-iommu/igt@i915_selftest@live_hangcheck.html
* igt@kms_frontbuffer_tracking@basic:
- fi-hsw-peppy: [PASS][3] -> [DMESG-WARN][4] ([fdo#102614])
[3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-hsw-peppy/igt@kms_frontbuffer_tracking@basic.html
[4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-hsw-peppy/igt@kms_frontbuffer_tracking@basic.html
#### Possible fixes ####
* igt@i915_selftest@live_contexts:
- fi-skl-gvtdvm: [DMESG-FAIL][5] ([fdo#110235]) -> [PASS][6]
[5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-skl-gvtdvm/igt@i915_selftest@live_contexts.html
[6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-skl-gvtdvm/igt@i915_selftest@live_contexts.html
[fdo#102614]: https://bugs.freedesktop.org/show_bug.cgi?id=102614
[fdo#108602]: https://bugs.freedesktop.org/show_bug.cgi?id=108602
[fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
[fdo#110235]: https://bugs.freedesktop.org/show_bug.cgi?id=110235
Participating hosts (50 -> 43)
------------------------------
Additional (1): fi-icl-y
Missing (8): fi-kbl-soraka fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-apl-guc fi-pnv-d510 fi-byt-clapper
Build changes
-------------
* IGT: IGT_4997 -> IGTPW_3009
CI_DRM_6100: 6cd6c39683ceb3e48b98871fd25aea47f291f2b0 @ git://anongit.freedesktop.org/gfx-ci/linux
IGTPW_3009: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/
IGT_4997: eff5d0db3248734845b78fcc2e2772dd4012e5af @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
` (2 preceding siblings ...)
2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
@ 2019-09-09 10:53 ` Arkadiusz Hiler
2019-09-09 11:27 ` Petri Latvala
3 siblings, 1 reply; 9+ messages in thread
From: Arkadiusz Hiler @ 2019-09-09 10:53 UTC (permalink / raw)
To: Petri Latvala; +Cc: igt-dev, Tomi Sarvela, Daniel Vetter
On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> If the network goes down while testing, CI tends to interpret that as
> the device being down, cutting its power after a while. This causes an
> incomplete to an innocent test, increasing noise in the results.
>
> A new flag to --abort-on-monitored-error, "ping", uses liboping to
> ping a host configured in .igtrc with one ping after each test
> execution and aborts the run if there is no reply in a hardcoded
> amount of time.
>
> v2:
> - Use a higher timeout
> - Allow hostname configuration from environment
> v3:
> - Use runner_c_args for holding c args for runner
> - Handle runner's meson options in runner/meson.build
> - Instead of one ping with 20 second timeout, ping with 1 second timeout
> for a duration of 20 seconds
>
> Signed-off-by: Petri Latvala <petri.latvala@intel.com>
> Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
> Cc: Martin Peres <martin.peres@linux.intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> ---
> meson_options.txt | 6 ++
> runner/executor.c | 150 +++++++++++++++++++++++++++++++++++++++++++++
> runner/meson.build | 14 ++++-
> runner/settings.c | 4 ++
> runner/settings.h | 5 +-
> 5 files changed, 176 insertions(+), 3 deletions(-)
>
> diff --git a/meson_options.txt b/meson_options.txt
> index 888efe56..935f883e 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -58,6 +58,12 @@ option('build_runner',
> choices : ['auto', 'true', 'false'],
> description : 'Build test runner')
>
> +option('with_oping',
> + type : 'combo',
> + value : 'auto',
> + choices : ['auto', 'true', 'false'],
> + description : 'Build igt_runner with liboping')
> +
> option('use_rpath',
> type : 'boolean',
> value : false,
> diff --git a/runner/executor.c b/runner/executor.c
> index 7e5fbe8f..c07e53fa 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -1,6 +1,10 @@
> #include <errno.h>
> #include <fcntl.h>
> +#include <glib.h>
> #include <linux/watchdog.h>
> +#if HAVE_OPING
> +#include <oping.h>
> +#endif
> #include <signal.h>
> #include <stdio.h>
> #include <stdlib.h>
> @@ -16,6 +20,7 @@
> #include <time.h>
> #include <unistd.h>
>
> +#include "igt_aux.h"
> #include "igt_core.h"
> #include "executor.h"
> #include "output_strings.h"
> @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
> }
> }
>
> +#if HAVE_OPING
> +static pingobj_t *pingobj = NULL;
> +
> +static bool load_ping_config_from_file(void)
> +{
> + char *key_file_env = NULL;
> + char *key_file_loc = NULL;
> + GError *error = NULL;
> + GKeyFile *key_file = NULL;
> + const char *ping_hostname;
> + int ret;
> +
> + /* Determine igt config path */
> + key_file_env = getenv("IGT_CONFIG_PATH");
> + if (key_file_env) {
> + key_file_loc = strdup(key_file_env);
> + } else {
> + asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> + }
> +
> + /* Load igt config file */
> + key_file = g_key_file_new();
> + ret = g_key_file_load_from_file(key_file, key_file_loc,
> + G_KEY_FILE_NONE, &error);
> + free(key_file_loc);
> +
> + if (!ret) {
> + g_error_free(error);
> + g_key_file_free(key_file);
> + return false;
> + }
> +
> + g_clear_error(&error);
put that in
GKeyFile *igt_open_igtrc();
and use it in igt_core, so we have only one implementation
> +
> + ping_hostname =
> + g_key_file_get_string(key_file, "DUT",
> + "PingHostName", &error);
> +
> + g_clear_error(&error);
> + g_key_file_free(key_file);
> +
> + if (!ping_hostname)
> + return false;
> +
> + if (ping_host_add(pingobj, ping_hostname)) {
> + fprintf(stderr,
> + "abort on ping: Cannot use hostname from config file\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool load_ping_config_from_env(void)
> +{
> + const char *ping_hostname;
> +
> + ping_hostname = getenv("IGT_PING_HOSTNAME");
> + if (!ping_hostname)
> + return false;
> +
> + if (ping_host_add(pingobj, ping_hostname)) {
> + fprintf(stderr,
> + "abort on ping: Cannot use hostname from environment\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool can_ping(void)
> +{
> + /*
> + * On some hosts, getting network back up after suspend takes
> + * upwards of 10 seconds. 20 seconds should be enough to see
> + * if network comes back at all, and hopefully not too long to
> + * make external monitoring freak out.
> + */
> + igt_until_timeout(20) {
make 20 a #define
> + pingobj_iter_t *iter;
> +
> + ping_send(pingobj);
> +
> + for (iter = ping_iterator_get(pingobj);
> + iter != NULL;
> + iter = ping_iterator_next(iter)) {
> + double latency;
> + size_t len = sizeof(latency);
> +
> + ping_iterator_get_info(iter,
> + PING_INFO_LATENCY,
> + &latency,
> + &len);
> + if (latency >= 0.0)
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> +#endif
> +
> +static void ping_config(void)
> +{
> +#if HAVE_OPING
> + double timeout = 1.0;
single_attempt_timeout
> +
> + if (pingobj)
> + return;
> +
> + pingobj = ping_construct();
> +
> + /* Try env first, then config file */
> + if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> + fprintf(stderr,
> + "abort on ping: No host to ping configured\n");
> + ping_destroy(pingobj);
> + pingobj = NULL;
> + return;
> + }
> +
> + ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> +#endif
> +}
> +
> +static char *handle_ping(void)
> +{
> +#if HAVE_OPING
> + if (pingobj && !can_ping()) {
> + char *reason;
> +
> + asprintf(&reason,
> + "Ping host did not respond to ping, network down");
> + return reason;
> + }
> +#endif
> +
> + return NULL;
> +}
> +
> static char *handle_lockdep(void)
> {
> const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> @@ -197,6 +343,7 @@ static const struct {
> } abort_handlers[] = {
> { ABORT_LOCKDEP, handle_lockdep },
> { ABORT_TAINT, handle_taint },
> + { ABORT_PING, handle_ping },
> { 0, 0 },
> };
So the handlers are used need_to_abort():
* when we start execution
* after each tests
What worries me here are the suspend tests. You have mentioned that
getting network up can take up to 17s, so potentially we are adding
17s * numer_of_executed_suspend_tests to the total execution time.
Have you checked the actual impact?
Also this does not cover any network issues that happen when executing
longer tests - we still can get killed by any external watchdogs.
What is the maximum ping timeout on those external watchdogs? Maybe we
should get strict on tests' time budget for BAT/shards first...
- Arek
>
> @@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
>
> init_watchdogs(settings);
>
> + if (settings->abort_mask & ABORT_PING)
> + ping_config();
> +
> if (!uname(&unamebuf)) {
> dprintf(unamefd, "%s %s %s %s %s\n",
> unamebuf.sysname,
> diff --git a/runner/meson.build b/runner/meson.build
> index b658f1d2..6d60c530 100644
> --- a/runner/meson.build
> +++ b/runner/meson.build
> @@ -1,4 +1,15 @@
> jsonc = dependency('json-c', required: _runner_required)
> +runner_deps = [jsonc, glib]
> +runner_c_args = []
> +
> +with_oping = get_option('with_oping')
> +if with_oping != 'false'
> + oping = dependency('liboping', required: with_oping == 'true')
> + if oping.found()
> + runner_deps += oping
> + runner_c_args += '-DHAVE_OPING=1'
> + endif
> +endif
>
> runnerlib_sources = [ 'settings.c',
> 'job_list.c',
> @@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
>
> runnerlib = static_library('igt_runner', runnerlib_sources,
> include_directories : inc,
> - dependencies : [jsonc, glib])
> + c_args : runner_c_args,
> + dependencies : runner_deps)
>
> runner = executable('igt_runner', runner_sources,
> link_with : runnerlib,
> diff --git a/runner/settings.c b/runner/settings.c
> index ad38ae8d..b57d1a6a 100644
> --- a/runner/settings.c
> +++ b/runner/settings.c
> @@ -48,6 +48,7 @@ static struct {
> } abort_conditions[] = {
> { ABORT_TAINT, "taint" },
> { ABORT_LOCKDEP, "lockdep" },
> + { ABORT_PING, "ping" },
> { ABORT_ALL, "all" },
> { 0, 0 },
> };
> @@ -136,6 +137,9 @@ static const char *usage_str =
> " Possible conditions:\n"
> " lockdep - abort when kernel lockdep has been angered.\n"
> " taint - abort when kernel becomes fatally tainted.\n"
> + " ping - abort when a host configured in .igtrc or\n"
> + " environment variable IGT_PING_HOSTNAME does\n"
> + " not respond to ping.\n"
> " all - abort for all of the above.\n"
> " -s, --sync Sync results to disk after every test\n"
> " -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
> diff --git a/runner/settings.h b/runner/settings.h
> index 0a1ee08d..2b6e65d0 100644
> --- a/runner/settings.h
> +++ b/runner/settings.h
> @@ -15,9 +15,10 @@ enum {
>
> #define ABORT_TAINT (1 << 0)
> #define ABORT_LOCKDEP (1 << 1)
> -#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP)
> +#define ABORT_PING (1 << 2)
> +#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
>
> -_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
> +_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
>
> struct regex_list {
> char **regex_strings;
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
@ 2019-09-09 11:27 ` Petri Latvala
2019-09-10 7:31 ` Tomi Sarvela
0 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-09-09 11:27 UTC (permalink / raw)
To: Arkadiusz Hiler; +Cc: igt-dev, Tomi Sarvela, Daniel Vetter
On Mon, Sep 09, 2019 at 01:53:44PM +0300, Arkadiusz Hiler wrote:
> On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> > If the network goes down while testing, CI tends to interpret that as
> > the device being down, cutting its power after a while. This causes an
> > incomplete to an innocent test, increasing noise in the results.
> >
> > A new flag to --abort-on-monitored-error, "ping", uses liboping to
> > ping a host configured in .igtrc with one ping after each test
> > execution and aborts the run if there is no reply in a hardcoded
> > amount of time.
> >
> > v2:
> > - Use a higher timeout
> > - Allow hostname configuration from environment
> > v3:
> > - Use runner_c_args for holding c args for runner
> > - Handle runner's meson options in runner/meson.build
> > - Instead of one ping with 20 second timeout, ping with 1 second timeout
> > for a duration of 20 seconds
> >
> > Signed-off-by: Petri Latvala <petri.latvala@intel.com>
> > Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
> > Cc: Martin Peres <martin.peres@linux.intel.com>
> > Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> > Cc: Daniel Vetter <daniel@ffwll.ch>
> > ---
> > meson_options.txt | 6 ++
> > runner/executor.c | 150 +++++++++++++++++++++++++++++++++++++++++++++
> > runner/meson.build | 14 ++++-
> > runner/settings.c | 4 ++
> > runner/settings.h | 5 +-
> > 5 files changed, 176 insertions(+), 3 deletions(-)
> >
> > diff --git a/meson_options.txt b/meson_options.txt
> > index 888efe56..935f883e 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -58,6 +58,12 @@ option('build_runner',
> > choices : ['auto', 'true', 'false'],
> > description : 'Build test runner')
> >
> > +option('with_oping',
> > + type : 'combo',
> > + value : 'auto',
> > + choices : ['auto', 'true', 'false'],
> > + description : 'Build igt_runner with liboping')
> > +
> > option('use_rpath',
> > type : 'boolean',
> > value : false,
> > diff --git a/runner/executor.c b/runner/executor.c
> > index 7e5fbe8f..c07e53fa 100644
> > --- a/runner/executor.c
> > +++ b/runner/executor.c
> > @@ -1,6 +1,10 @@
> > #include <errno.h>
> > #include <fcntl.h>
> > +#include <glib.h>
> > #include <linux/watchdog.h>
> > +#if HAVE_OPING
> > +#include <oping.h>
> > +#endif
> > #include <signal.h>
> > #include <stdio.h>
> > #include <stdlib.h>
> > @@ -16,6 +20,7 @@
> > #include <time.h>
> > #include <unistd.h>
> >
> > +#include "igt_aux.h"
> > #include "igt_core.h"
> > #include "executor.h"
> > #include "output_strings.h"
> > @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
> > }
> > }
> >
> > +#if HAVE_OPING
> > +static pingobj_t *pingobj = NULL;
> > +
> > +static bool load_ping_config_from_file(void)
> > +{
> > + char *key_file_env = NULL;
> > + char *key_file_loc = NULL;
> > + GError *error = NULL;
> > + GKeyFile *key_file = NULL;
> > + const char *ping_hostname;
> > + int ret;
> > +
> > + /* Determine igt config path */
> > + key_file_env = getenv("IGT_CONFIG_PATH");
> > + if (key_file_env) {
> > + key_file_loc = strdup(key_file_env);
> > + } else {
> > + asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> > + }
> > +
> > + /* Load igt config file */
> > + key_file = g_key_file_new();
> > + ret = g_key_file_load_from_file(key_file, key_file_loc,
> > + G_KEY_FILE_NONE, &error);
> > + free(key_file_loc);
> > +
> > + if (!ret) {
> > + g_error_free(error);
> > + g_key_file_free(key_file);
> > + return false;
> > + }
> > +
> > + g_clear_error(&error);
>
> put that in
>
> GKeyFile *igt_open_igtrc();
>
> and use it in igt_core, so we have only one implementation
>
> > +
> > + ping_hostname =
> > + g_key_file_get_string(key_file, "DUT",
> > + "PingHostName", &error);
> > +
> > + g_clear_error(&error);
> > + g_key_file_free(key_file);
> > +
> > + if (!ping_hostname)
> > + return false;
> > +
> > + if (ping_host_add(pingobj, ping_hostname)) {
> > + fprintf(stderr,
> > + "abort on ping: Cannot use hostname from config file\n");
> > + return false;
> > + }
> > +
> > + return true;
> > +}
> > +
> > +static bool load_ping_config_from_env(void)
> > +{
> > + const char *ping_hostname;
> > +
> > + ping_hostname = getenv("IGT_PING_HOSTNAME");
> > + if (!ping_hostname)
> > + return false;
> > +
> > + if (ping_host_add(pingobj, ping_hostname)) {
> > + fprintf(stderr,
> > + "abort on ping: Cannot use hostname from environment\n");
> > + return false;
> > + }
> > +
> > + return true;
> > +}
> > +
> > +static bool can_ping(void)
> > +{
> > + /*
> > + * On some hosts, getting network back up after suspend takes
> > + * upwards of 10 seconds. 20 seconds should be enough to see
> > + * if network comes back at all, and hopefully not too long to
> > + * make external monitoring freak out.
> > + */
> > + igt_until_timeout(20) {
>
> make 20 a #define
>
> > + pingobj_iter_t *iter;
> > +
> > + ping_send(pingobj);
> > +
> > + for (iter = ping_iterator_get(pingobj);
> > + iter != NULL;
> > + iter = ping_iterator_next(iter)) {
> > + double latency;
> > + size_t len = sizeof(latency);
> > +
> > + ping_iterator_get_info(iter,
> > + PING_INFO_LATENCY,
> > + &latency,
> > + &len);
> > + if (latency >= 0.0)
> > + return true;
> > + }
> > + }
> > +
> > + return false;
> > +}
> > +
> > +#endif
> > +
> > +static void ping_config(void)
> > +{
> > +#if HAVE_OPING
> > + double timeout = 1.0;
>
> single_attempt_timeout
>
> > +
> > + if (pingobj)
> > + return;
> > +
> > + pingobj = ping_construct();
> > +
> > + /* Try env first, then config file */
> > + if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> > + fprintf(stderr,
> > + "abort on ping: No host to ping configured\n");
> > + ping_destroy(pingobj);
> > + pingobj = NULL;
> > + return;
> > + }
> > +
> > + ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> > +#endif
> > +}
> > +
> > +static char *handle_ping(void)
> > +{
> > +#if HAVE_OPING
> > + if (pingobj && !can_ping()) {
> > + char *reason;
> > +
> > + asprintf(&reason,
> > + "Ping host did not respond to ping, network down");
> > + return reason;
> > + }
> > +#endif
> > +
> > + return NULL;
> > +}
> > +
> > static char *handle_lockdep(void)
> > {
> > const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> > @@ -197,6 +343,7 @@ static const struct {
> > } abort_handlers[] = {
> > { ABORT_LOCKDEP, handle_lockdep },
> > { ABORT_TAINT, handle_taint },
> > + { ABORT_PING, handle_ping },
> > { 0, 0 },
> > };
>
> So the handlers are used need_to_abort():
> * when we start execution
> * after each tests
>
> What worries me here are the suspend tests. You have mentioned that
> getting network up can take up to 17s, so potentially we are adding
> 17s * numer_of_executed_suspend_tests to the total execution time.
>
> Have you checked the actual impact?
No, let's take a look at that after I send v4.
Even with that extra overhead, an argument can be made that getting
incompletes without clear indicators why they happen is worse.
> Also this does not cover any network issues that happen when executing
> longer tests - we still can get killed by any external watchdogs.
Yeah, and we don't get easily machine-parseable timing data on how
long those running tests were fine for. Note to self: Make runner dump
periodical still-alive marks to journal. Unrelated to this patch
though...
Needs to be said that this patch won't make much of a difference if
you only look at the passrates or amount of incompletes. What this
does is allowing us to close that one silly semi-catchall cibuglog
filter for incompletes without good data, and thus forcing proper
issue binning.
>
> What is the maximum ping timeout on those external watchdogs? Maybe we
> should get strict on tests' time budget for BAT/shards first...
Don't know actually. Tomi?
--
Petri Latvala
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
2019-09-09 11:27 ` Petri Latvala
@ 2019-09-10 7:31 ` Tomi Sarvela
0 siblings, 0 replies; 9+ messages in thread
From: Tomi Sarvela @ 2019-09-10 7:31 UTC (permalink / raw)
To: Arkadiusz Hiler, igt-dev, Daniel Vetter
On 9/9/19 2:27 PM, Petri Latvala wrote:
> On Mon, Sep 09, 2019 at 01:53:44PM +0300, Arkadiusz Hiler wrote:
>> On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
>>> If the network goes down while testing, CI tends to interpret that as
>>> the device being down, cutting its power after a while. This causes an
>>> incomplete to an innocent test, increasing noise in the results.
>>>
>>> A new flag to --abort-on-monitored-error, "ping", uses liboping to
>>> ping a host configured in .igtrc with one ping after each test
>>> execution and aborts the run if there is no reply in a hardcoded
>>> amount of time.
>>>
>>> v2:
>>> - Use a higher timeout
>>> - Allow hostname configuration from environment
>>> v3:
>>> - Use runner_c_args for holding c args for runner
>>> - Handle runner's meson options in runner/meson.build
>>> - Instead of one ping with 20 second timeout, ping with 1 second timeout
>>> for a duration of 20 seconds
>>>
>>> Signed-off-by: Petri Latvala <petri.latvala@intel.com>
>>> Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
>>> Cc: Martin Peres <martin.peres@linux.intel.com>
>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>> Cc: Daniel Vetter <daniel@ffwll.ch>
>>> ---
>>> meson_options.txt | 6 ++
>>> runner/executor.c | 150 +++++++++++++++++++++++++++++++++++++++++++++
>>> runner/meson.build | 14 ++++-
>>> runner/settings.c | 4 ++
>>> runner/settings.h | 5 +-
>>> 5 files changed, 176 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/meson_options.txt b/meson_options.txt
>>> index 888efe56..935f883e 100644
>>> --- a/meson_options.txt
>>> +++ b/meson_options.txt
>>> @@ -58,6 +58,12 @@ option('build_runner',
>>> choices : ['auto', 'true', 'false'],
>>> description : 'Build test runner')
>>>
>>> +option('with_oping',
>>> + type : 'combo',
>>> + value : 'auto',
>>> + choices : ['auto', 'true', 'false'],
>>> + description : 'Build igt_runner with liboping')
>>> +
>>> option('use_rpath',
>>> type : 'boolean',
>>> value : false,
>>> diff --git a/runner/executor.c b/runner/executor.c
>>> index 7e5fbe8f..c07e53fa 100644
>>> --- a/runner/executor.c
>>> +++ b/runner/executor.c
>>> @@ -1,6 +1,10 @@
>>> #include <errno.h>
>>> #include <fcntl.h>
>>> +#include <glib.h>
>>> #include <linux/watchdog.h>
>>> +#if HAVE_OPING
>>> +#include <oping.h>
>>> +#endif
>>> #include <signal.h>
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>> @@ -16,6 +20,7 @@
>>> #include <time.h>
>>> #include <unistd.h>
>>>
>>> +#include "igt_aux.h"
>>> #include "igt_core.h"
>>> #include "executor.h"
>>> #include "output_strings.h"
>>> @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
>>> }
>>> }
>>>
>>> +#if HAVE_OPING
>>> +static pingobj_t *pingobj = NULL;
>>> +
>>> +static bool load_ping_config_from_file(void)
>>> +{
>>> + char *key_file_env = NULL;
>>> + char *key_file_loc = NULL;
>>> + GError *error = NULL;
>>> + GKeyFile *key_file = NULL;
>>> + const char *ping_hostname;
>>> + int ret;
>>> +
>>> + /* Determine igt config path */
>>> + key_file_env = getenv("IGT_CONFIG_PATH");
>>> + if (key_file_env) {
>>> + key_file_loc = strdup(key_file_env);
>>> + } else {
>>> + asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
>>> + }
>>> +
>>> + /* Load igt config file */
>>> + key_file = g_key_file_new();
>>> + ret = g_key_file_load_from_file(key_file, key_file_loc,
>>> + G_KEY_FILE_NONE, &error);
>>> + free(key_file_loc);
>>> +
>>> + if (!ret) {
>>> + g_error_free(error);
>>> + g_key_file_free(key_file);
>>> + return false;
>>> + }
>>> +
>>> + g_clear_error(&error);
>>
>> put that in
>>
>> GKeyFile *igt_open_igtrc();
>>
>> and use it in igt_core, so we have only one implementation
>>
>>> +
>>> + ping_hostname =
>>> + g_key_file_get_string(key_file, "DUT",
>>> + "PingHostName", &error);
>>> +
>>> + g_clear_error(&error);
>>> + g_key_file_free(key_file);
>>> +
>>> + if (!ping_hostname)
>>> + return false;
>>> +
>>> + if (ping_host_add(pingobj, ping_hostname)) {
>>> + fprintf(stderr,
>>> + "abort on ping: Cannot use hostname from config file\n");
>>> + return false;
>>> + }
>>> +
>>> + return true;
>>> +}
>>> +
>>> +static bool load_ping_config_from_env(void)
>>> +{
>>> + const char *ping_hostname;
>>> +
>>> + ping_hostname = getenv("IGT_PING_HOSTNAME");
>>> + if (!ping_hostname)
>>> + return false;
>>> +
>>> + if (ping_host_add(pingobj, ping_hostname)) {
>>> + fprintf(stderr,
>>> + "abort on ping: Cannot use hostname from environment\n");
>>> + return false;
>>> + }
>>> +
>>> + return true;
>>> +}
>>> +
>>> +static bool can_ping(void)
>>> +{
>>> + /*
>>> + * On some hosts, getting network back up after suspend takes
>>> + * upwards of 10 seconds. 20 seconds should be enough to see
>>> + * if network comes back at all, and hopefully not too long to
>>> + * make external monitoring freak out.
>>> + */
>>> + igt_until_timeout(20) {
>>
>> make 20 a #define
>>
>>> + pingobj_iter_t *iter;
>>> +
>>> + ping_send(pingobj);
>>> +
>>> + for (iter = ping_iterator_get(pingobj);
>>> + iter != NULL;
>>> + iter = ping_iterator_next(iter)) {
>>> + double latency;
>>> + size_t len = sizeof(latency);
>>> +
>>> + ping_iterator_get_info(iter,
>>> + PING_INFO_LATENCY,
>>> + &latency,
>>> + &len);
>>> + if (latency >= 0.0)
>>> + return true;
>>> + }
>>> + }
>>> +
>>> + return false;
>>> +}
>>> +
>>> +#endif
>>> +
>>> +static void ping_config(void)
>>> +{
>>> +#if HAVE_OPING
>>> + double timeout = 1.0;
>>
>> single_attempt_timeout
>>
>>> +
>>> + if (pingobj)
>>> + return;
>>> +
>>> + pingobj = ping_construct();
>>> +
>>> + /* Try env first, then config file */
>>> + if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
>>> + fprintf(stderr,
>>> + "abort on ping: No host to ping configured\n");
>>> + ping_destroy(pingobj);
>>> + pingobj = NULL;
>>> + return;
>>> + }
>>> +
>>> + ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
>>> +#endif
>>> +}
>>> +
>>> +static char *handle_ping(void)
>>> +{
>>> +#if HAVE_OPING
>>> + if (pingobj && !can_ping()) {
>>> + char *reason;
>>> +
>>> + asprintf(&reason,
>>> + "Ping host did not respond to ping, network down");
>>> + return reason;
>>> + }
>>> +#endif
>>> +
>>> + return NULL;
>>> +}
>>> +
>>> static char *handle_lockdep(void)
>>> {
>>> const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
>>> @@ -197,6 +343,7 @@ static const struct {
>>> } abort_handlers[] = {
>>> { ABORT_LOCKDEP, handle_lockdep },
>>> { ABORT_TAINT, handle_taint },
>>> + { ABORT_PING, handle_ping },
>>> { 0, 0 },
>>> };
>>
>> So the handlers are used need_to_abort():
>> * when we start execution
>> * after each tests
>>
>> What worries me here are the suspend tests. You have mentioned that
>> getting network up can take up to 17s, so potentially we are adding
>> 17s * numer_of_executed_suspend_tests to the total execution time.
>>
>> Have you checked the actual impact?
>
> No, let's take a look at that after I send v4.
>
> Even with that extra overhead, an argument can be made that getting
> incompletes without clear indicators why they happen is worse.
>
>
>> Also this does not cover any network issues that happen when executing
>> longer tests - we still can get killed by any external watchdogs.
>
> Yeah, and we don't get easily machine-parseable timing data on how
> long those running tests were fine for. Note to self: Make runner dump
> periodical still-alive marks to journal. Unrelated to this patch
> though...
>
> Needs to be said that this patch won't make much of a difference if
> you only look at the passrates or amount of incompletes. What this
> does is allowing us to close that one silly semi-catchall cibuglog
> filter for incompletes without good data, and thus forcing proper
> issue binning.
>
>>
>> What is the maximum ping timeout on those external watchdogs? Maybe we
>> should get strict on tests' time budget for BAT/shards first...
>
> Don't know actually. Tomi?
SSH timeout (TCP timeout) is first, ~2 minutes. To make sure that the
external watchdog doesn't mess up live system, it only activates when
the target is offline 6 minutes (pinged several times during this time).
Tomi
--
Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2019-09-10 7:32 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
2019-05-20 11:51 ` Petri Latvala
2019-05-20 12:16 ` Petri Latvala
2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
2019-09-09 11:27 ` Petri Latvala
2019-09-10 7:31 ` Tomi Sarvela
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.