All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure
@ 2019-05-20 10:16 Petri Latvala
  2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 10:16 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Petri Latvala, Daniel Vetter

If the network goes down while testing, CI tends to interpret that as
the device being down, cutting its power after a while. This causes an
incomplete to an innocent test, increasing noise in the results.

A new flag to --abort-on-monitored-error, "ping", uses liboping to
ping a host configured in .igtrc with one ping after each test
execution and aborts the run if there is no reply in a hardcoded
amount of time.

v2:
 - Use a higher timeout
 - Allow hostname configuration from environment
v3:
 - Use runner_c_args for holding c args for runner
 - Handle runner's meson options in runner/meson.build
 - Instead of one ping with 20 second timeout, ping with 1 second timeout
   for a duration of 20 seconds

Signed-off-by: Petri Latvala <petri.latvala@intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
Cc: Martin Peres <martin.peres@linux.intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
---
 meson_options.txt  |   6 ++
 runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
 runner/meson.build |  14 ++++-
 runner/settings.c  |   4 ++
 runner/settings.h  |   5 +-
 5 files changed, 176 insertions(+), 3 deletions(-)

diff --git a/meson_options.txt b/meson_options.txt
index 888efe56..935f883e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -58,6 +58,12 @@ option('build_runner',
        choices : ['auto', 'true', 'false'],
        description : 'Build test runner')
 
+option('with_oping',
+       type : 'combo',
+       value : 'auto',
+       choices : ['auto', 'true', 'false'],
+       description : 'Build igt_runner with liboping')
+
 option('use_rpath',
        type : 'boolean',
        value : false,
diff --git a/runner/executor.c b/runner/executor.c
index 7e5fbe8f..c07e53fa 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -1,6 +1,10 @@
 #include <errno.h>
 #include <fcntl.h>
+#include <glib.h>
 #include <linux/watchdog.h>
+#if HAVE_OPING
+#include <oping.h>
+#endif
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -16,6 +20,7 @@
 #include <time.h>
 #include <unistd.h>
 
+#include "igt_aux.h"
 #include "igt_core.h"
 #include "executor.h"
 #include "output_strings.h"
@@ -108,6 +113,147 @@ static void ping_watchdogs(void)
 	}
 }
 
+#if HAVE_OPING
+static pingobj_t *pingobj = NULL;
+
+static bool load_ping_config_from_file(void)
+{
+	char *key_file_env = NULL;
+	char *key_file_loc = NULL;
+	GError *error = NULL;
+	GKeyFile *key_file = NULL;
+	const char *ping_hostname;
+	int ret;
+
+	/* Determine igt config path */
+	key_file_env = getenv("IGT_CONFIG_PATH");
+	if (key_file_env) {
+		key_file_loc = strdup(key_file_env);
+	} else {
+		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
+	}
+
+	/* Load igt config file */
+	key_file = g_key_file_new();
+	ret = g_key_file_load_from_file(key_file, key_file_loc,
+					G_KEY_FILE_NONE, &error);
+	free(key_file_loc);
+
+	if (!ret) {
+		g_error_free(error);
+		g_key_file_free(key_file);
+		return false;
+	}
+
+	g_clear_error(&error);
+
+	ping_hostname =
+		g_key_file_get_string(key_file, "DUT",
+				      "PingHostName", &error);
+
+	g_clear_error(&error);
+	g_key_file_free(key_file);
+
+	if (!ping_hostname)
+		return false;
+
+	if (ping_host_add(pingobj, ping_hostname)) {
+		fprintf(stderr,
+			"abort on ping: Cannot use hostname from config file\n");
+		return false;
+	}
+
+	return true;
+}
+
+static bool load_ping_config_from_env(void)
+{
+	const char *ping_hostname;
+
+	ping_hostname = getenv("IGT_PING_HOSTNAME");
+	if (!ping_hostname)
+		return false;
+
+	if (ping_host_add(pingobj, ping_hostname)) {
+		fprintf(stderr,
+			"abort on ping: Cannot use hostname from environment\n");
+		return false;
+	}
+
+	return true;
+}
+
+static bool can_ping(void)
+{
+	/*
+	 * On some hosts, getting network back up after suspend takes
+	 * upwards of 10 seconds. 20 seconds should be enough to see
+	 * if network comes back at all, and hopefully not too long to
+	 * make external monitoring freak out.
+	 */
+	igt_until_timeout(20) {
+		pingobj_iter_t *iter;
+
+		ping_send(pingobj);
+
+		for (iter = ping_iterator_get(pingobj);
+		     iter != NULL;
+		     iter = ping_iterator_next(iter)) {
+			double latency;
+			size_t len = sizeof(latency);
+
+			ping_iterator_get_info(iter,
+					       PING_INFO_LATENCY,
+					       &latency,
+					       &len);
+			if (latency >= 0.0)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+#endif
+
+static void ping_config(void)
+{
+#if HAVE_OPING
+	double timeout = 1.0;
+
+	if (pingobj)
+		return;
+
+	pingobj = ping_construct();
+
+	/* Try env first, then config file */
+	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
+		fprintf(stderr,
+			"abort on ping: No host to ping configured\n");
+		ping_destroy(pingobj);
+		pingobj = NULL;
+		return;
+	}
+
+	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
+#endif
+}
+
+static char *handle_ping(void)
+{
+#if HAVE_OPING
+	if (pingobj && !can_ping()) {
+		char *reason;
+
+		asprintf(&reason,
+			 "Ping host did not respond to ping, network down");
+		return reason;
+	}
+#endif
+
+	return NULL;
+}
+
 static char *handle_lockdep(void)
 {
 	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
@@ -197,6 +343,7 @@ static const struct {
 } abort_handlers[] = {
 	{ ABORT_LOCKDEP, handle_lockdep },
 	{ ABORT_TAINT, handle_taint },
+	{ ABORT_PING, handle_ping },
 	{ 0, 0 },
 };
 
@@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
 
 	init_watchdogs(settings);
 
+	if (settings->abort_mask & ABORT_PING)
+		ping_config();
+
 	if (!uname(&unamebuf)) {
 		dprintf(unamefd, "%s %s %s %s %s\n",
 			unamebuf.sysname,
diff --git a/runner/meson.build b/runner/meson.build
index b658f1d2..6d60c530 100644
--- a/runner/meson.build
+++ b/runner/meson.build
@@ -1,4 +1,15 @@
 jsonc = dependency('json-c', required: _runner_required)
+runner_deps = [jsonc, glib]
+runner_c_args = []
+
+with_oping = get_option('with_oping')
+if with_oping != 'false'
+	oping = dependency('liboping', required: with_oping == 'true')
+	if oping.found()
+		runner_deps += oping
+		runner_c_args += '-DHAVE_OPING=1'
+	endif
+endif
 
 runnerlib_sources = [ 'settings.c',
 		      'job_list.c',
@@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
 
 	runnerlib = static_library('igt_runner', runnerlib_sources,
 				   include_directories : inc,
-				   dependencies : [jsonc, glib])
+				   c_args : runner_c_args,
+				   dependencies : runner_deps)
 
 	runner = executable('igt_runner', runner_sources,
 			    link_with : runnerlib,
diff --git a/runner/settings.c b/runner/settings.c
index ad38ae8d..b57d1a6a 100644
--- a/runner/settings.c
+++ b/runner/settings.c
@@ -48,6 +48,7 @@ static struct {
 } abort_conditions[] = {
 	{ ABORT_TAINT, "taint" },
 	{ ABORT_LOCKDEP, "lockdep" },
+	{ ABORT_PING, "ping" },
 	{ ABORT_ALL, "all" },
 	{ 0, 0 },
 };
@@ -136,6 +137,9 @@ static const char *usage_str =
 	"                        Possible conditions:\n"
 	"                         lockdep - abort when kernel lockdep has been angered.\n"
 	"                         taint   - abort when kernel becomes fatally tainted.\n"
+	"                         ping    - abort when a host configured in .igtrc or\n"
+	"                                   environment variable IGT_PING_HOSTNAME does\n"
+	"                                   not respond to ping.\n"
 	"                         all     - abort for all of the above.\n"
 	"  -s, --sync            Sync results to disk after every test\n"
 	"  -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
diff --git a/runner/settings.h b/runner/settings.h
index 0a1ee08d..2b6e65d0 100644
--- a/runner/settings.h
+++ b/runner/settings.h
@@ -15,9 +15,10 @@ enum {
 
 #define ABORT_TAINT   (1 << 0)
 #define ABORT_LOCKDEP (1 << 1)
-#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP)
+#define ABORT_PING    (1 << 2)
+#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
 
-_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
+_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
 
 struct regex_list {
 	char **regex_strings;
-- 
2.19.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
  2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
@ 2019-05-20 10:16 ` Petri Latvala
  2019-05-20 11:51   ` Petri Latvala
  2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 10:16 UTC (permalink / raw)
  To: igt-dev; +Cc: Petri Latvala

---
 runner/executor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..4a44c633 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
 	for (it = abort_handlers; it->condition; it++) {
 		char *abort;
 
-		if (!(settings->abort_mask & it->condition))
+		if (false && !(settings->abort_mask & it->condition))
 			continue;
 
 		abort = it->handler();
-- 
2.19.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
  2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
@ 2019-05-20 11:51   ` Petri Latvala
  2019-05-20 12:16     ` Petri Latvala
  0 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 11:51 UTC (permalink / raw)
  To: igt-dev; +Cc: Petri Latvala

... and remember to HAX the ping setup as well
---
 runner/executor.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..af87e312 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
 	for (it = abort_handlers; it->condition; it++) {
 		char *abort;
 
-		if (!(settings->abort_mask & it->condition))
+		if (false && !(settings->abort_mask & it->condition))
 			continue;
 
 		abort = it->handler();
@@ -1435,7 +1435,7 @@ bool execute(struct execute_state *state,
 
 	init_watchdogs(settings);
 
-	if (settings->abort_mask & ABORT_PING)
+	if (true && settings->abort_mask & ABORT_PING)
 		ping_config();
 
 	if (!uname(&unamebuf)) {
-- 
2.19.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2)
  2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
  2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
@ 2019-05-20 12:09 ` Patchwork
  2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
  2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
  3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2019-05-20 12:09 UTC (permalink / raw)
  To: Petri Latvala; +Cc: igt-dev

== Series Details ==

Series: series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2)
URL   : https://patchwork.freedesktop.org/series/60854/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_6099 -> IGTPW_3007
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/60854/revisions/2/mbox/

Known issues
------------

  Here are the changes found in IGTPW_3007 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@prime_vgem@basic-fence-flip:
    - fi-ilk-650:         [PASS][1] -> [DMESG-WARN][2] ([fdo#106387]) +1 similar issue
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6099/fi-ilk-650/igt@prime_vgem@basic-fence-flip.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/fi-ilk-650/igt@prime_vgem@basic-fence-flip.html

  
#### Possible fixes ####

  * igt@gem_exec_suspend@basic-s4-devices:
    - fi-kbl-7500u:       [DMESG-WARN][3] ([fdo#105128] / [fdo#107139]) -> [PASS][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6099/fi-kbl-7500u/igt@gem_exec_suspend@basic-s4-devices.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/fi-kbl-7500u/igt@gem_exec_suspend@basic-s4-devices.html

  
  [fdo#105128]: https://bugs.freedesktop.org/show_bug.cgi?id=105128
  [fdo#106387]: https://bugs.freedesktop.org/show_bug.cgi?id=106387
  [fdo#107139]: https://bugs.freedesktop.org/show_bug.cgi?id=107139


Participating hosts (52 -> 43)
------------------------------

  Missing    (9): fi-kbl-soraka fi-ilk-m540 fi-bxt-dsi fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-apl-guc fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * IGT: IGT_4997 -> IGTPW_3007

  CI_DRM_6099: 3cef0eacdd3d3448dfefb7011f1e66821c5fd41d @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_3007: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/
  IGT_4997: eff5d0db3248734845b78fcc2e2772dd4012e5af @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3007/
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort
  2019-05-20 11:51   ` Petri Latvala
@ 2019-05-20 12:16     ` Petri Latvala
  0 siblings, 0 replies; 9+ messages in thread
From: Petri Latvala @ 2019-05-20 12:16 UTC (permalink / raw)
  To: igt-dev; +Cc: Petri Latvala

... and remember to HAX the ping setup as well

... correctly
---
 runner/executor.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runner/executor.c b/runner/executor.c
index c07e53fa..8cb09dda 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -354,7 +354,7 @@ static char *need_to_abort(const struct settings* settings)
 	for (it = abort_handlers; it->condition; it++) {
 		char *abort;
 
-		if (!(settings->abort_mask & it->condition))
+		if (false && !(settings->abort_mask & it->condition))
 			continue;
 
 		abort = it->handler();
@@ -1435,7 +1435,7 @@ bool execute(struct execute_state *state,
 
 	init_watchdogs(settings);
 
-	if (settings->abort_mask & ABORT_PING)
+	if (true || settings->abort_mask & ABORT_PING)
 		ping_config();
 
 	if (!uname(&unamebuf)) {
-- 
2.19.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3)
  2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
  2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
  2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
@ 2019-05-20 13:33 ` Patchwork
  2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
  3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Petri Latvala; +Cc: igt-dev

== Series Details ==

Series: series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3)
URL   : https://patchwork.freedesktop.org/series/60854/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_6100 -> IGTPW_3009
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/60854/revisions/3/mbox/

Known issues
------------

  Here are the changes found in IGTPW_3009 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live_hangcheck:
    - fi-skl-iommu:       [PASS][1] -> [INCOMPLETE][2] ([fdo#108602] / [fdo#108744])
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-skl-iommu/igt@i915_selftest@live_hangcheck.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-skl-iommu/igt@i915_selftest@live_hangcheck.html

  * igt@kms_frontbuffer_tracking@basic:
    - fi-hsw-peppy:       [PASS][3] -> [DMESG-WARN][4] ([fdo#102614])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-hsw-peppy/igt@kms_frontbuffer_tracking@basic.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-hsw-peppy/igt@kms_frontbuffer_tracking@basic.html

  
#### Possible fixes ####

  * igt@i915_selftest@live_contexts:
    - fi-skl-gvtdvm:      [DMESG-FAIL][5] ([fdo#110235]) -> [PASS][6]
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_6100/fi-skl-gvtdvm/igt@i915_selftest@live_contexts.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/fi-skl-gvtdvm/igt@i915_selftest@live_contexts.html

  
  [fdo#102614]: https://bugs.freedesktop.org/show_bug.cgi?id=102614
  [fdo#108602]: https://bugs.freedesktop.org/show_bug.cgi?id=108602
  [fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
  [fdo#110235]: https://bugs.freedesktop.org/show_bug.cgi?id=110235


Participating hosts (50 -> 43)
------------------------------

  Additional (1): fi-icl-y 
  Missing    (8): fi-kbl-soraka fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-apl-guc fi-pnv-d510 fi-byt-clapper 


Build changes
-------------

  * IGT: IGT_4997 -> IGTPW_3009

  CI_DRM_6100: 6cd6c39683ceb3e48b98871fd25aea47f291f2b0 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_3009: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/
  IGT_4997: eff5d0db3248734845b78fcc2e2772dd4012e5af @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_3009/
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
  2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
                   ` (2 preceding siblings ...)
  2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
@ 2019-09-09 10:53 ` Arkadiusz Hiler
  2019-09-09 11:27   ` Petri Latvala
  3 siblings, 1 reply; 9+ messages in thread
From: Arkadiusz Hiler @ 2019-09-09 10:53 UTC (permalink / raw)
  To: Petri Latvala; +Cc: igt-dev, Tomi Sarvela, Daniel Vetter

On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> If the network goes down while testing, CI tends to interpret that as
> the device being down, cutting its power after a while. This causes an
> incomplete to an innocent test, increasing noise in the results.
> 
> A new flag to --abort-on-monitored-error, "ping", uses liboping to
> ping a host configured in .igtrc with one ping after each test
> execution and aborts the run if there is no reply in a hardcoded
> amount of time.
> 
> v2:
>  - Use a higher timeout
>  - Allow hostname configuration from environment
> v3:
>  - Use runner_c_args for holding c args for runner
>  - Handle runner's meson options in runner/meson.build
>  - Instead of one ping with 20 second timeout, ping with 1 second timeout
>    for a duration of 20 seconds
> 
> Signed-off-by: Petri Latvala <petri.latvala@intel.com>
> Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
> Cc: Martin Peres <martin.peres@linux.intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> ---
>  meson_options.txt  |   6 ++
>  runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
>  runner/meson.build |  14 ++++-
>  runner/settings.c  |   4 ++
>  runner/settings.h  |   5 +-
>  5 files changed, 176 insertions(+), 3 deletions(-)
> 
> diff --git a/meson_options.txt b/meson_options.txt
> index 888efe56..935f883e 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -58,6 +58,12 @@ option('build_runner',
>         choices : ['auto', 'true', 'false'],
>         description : 'Build test runner')
>  
> +option('with_oping',
> +       type : 'combo',
> +       value : 'auto',
> +       choices : ['auto', 'true', 'false'],
> +       description : 'Build igt_runner with liboping')
> +
>  option('use_rpath',
>         type : 'boolean',
>         value : false,
> diff --git a/runner/executor.c b/runner/executor.c
> index 7e5fbe8f..c07e53fa 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -1,6 +1,10 @@
>  #include <errno.h>
>  #include <fcntl.h>
> +#include <glib.h>
>  #include <linux/watchdog.h>
> +#if HAVE_OPING
> +#include <oping.h>
> +#endif
>  #include <signal.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -16,6 +20,7 @@
>  #include <time.h>
>  #include <unistd.h>
>  
> +#include "igt_aux.h"
>  #include "igt_core.h"
>  #include "executor.h"
>  #include "output_strings.h"
> @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
>  	}
>  }
>  
> +#if HAVE_OPING
> +static pingobj_t *pingobj = NULL;
> +
> +static bool load_ping_config_from_file(void)
> +{
> +	char *key_file_env = NULL;
> +	char *key_file_loc = NULL;
> +	GError *error = NULL;
> +	GKeyFile *key_file = NULL;
> +	const char *ping_hostname;
> +	int ret;
> +
> +	/* Determine igt config path */
> +	key_file_env = getenv("IGT_CONFIG_PATH");
> +	if (key_file_env) {
> +		key_file_loc = strdup(key_file_env);
> +	} else {
> +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> +	}
> +
> +	/* Load igt config file */
> +	key_file = g_key_file_new();
> +	ret = g_key_file_load_from_file(key_file, key_file_loc,
> +					G_KEY_FILE_NONE, &error);
> +	free(key_file_loc);
> +
> +	if (!ret) {
> +		g_error_free(error);
> +		g_key_file_free(key_file);
> +		return false;
> +	}
> +
> +	g_clear_error(&error);

put that in

GKeyFile *igt_open_igtrc();

and use it in igt_core, so we have only one implementation

> +
> +	ping_hostname =
> +		g_key_file_get_string(key_file, "DUT",
> +				      "PingHostName", &error);
> +
> +	g_clear_error(&error);
> +	g_key_file_free(key_file);
> +
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from config file\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool load_ping_config_from_env(void)
> +{
> +	const char *ping_hostname;
> +
> +	ping_hostname = getenv("IGT_PING_HOSTNAME");
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from environment\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool can_ping(void)
> +{
> +	/*
> +	 * On some hosts, getting network back up after suspend takes
> +	 * upwards of 10 seconds. 20 seconds should be enough to see
> +	 * if network comes back at all, and hopefully not too long to
> +	 * make external monitoring freak out.
> +	 */
> +	igt_until_timeout(20) {

make 20 a #define

> +		pingobj_iter_t *iter;
> +
> +		ping_send(pingobj);
> +
> +		for (iter = ping_iterator_get(pingobj);
> +		     iter != NULL;
> +		     iter = ping_iterator_next(iter)) {
> +			double latency;
> +			size_t len = sizeof(latency);
> +
> +			ping_iterator_get_info(iter,
> +					       PING_INFO_LATENCY,
> +					       &latency,
> +					       &len);
> +			if (latency >= 0.0)
> +				return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +#endif
> +
> +static void ping_config(void)
> +{
> +#if HAVE_OPING
> +	double timeout = 1.0;

single_attempt_timeout

> +
> +	if (pingobj)
> +		return;
> +
> +	pingobj = ping_construct();
> +
> +	/* Try env first, then config file */
> +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> +		fprintf(stderr,
> +			"abort on ping: No host to ping configured\n");
> +		ping_destroy(pingobj);
> +		pingobj = NULL;
> +		return;
> +	}
> +
> +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> +#endif
> +}
> +
> +static char *handle_ping(void)
> +{
> +#if HAVE_OPING
> +	if (pingobj && !can_ping()) {
> +		char *reason;
> +
> +		asprintf(&reason,
> +			 "Ping host did not respond to ping, network down");
> +		return reason;
> +	}
> +#endif
> +
> +	return NULL;
> +}
> +
>  static char *handle_lockdep(void)
>  {
>  	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> @@ -197,6 +343,7 @@ static const struct {
>  } abort_handlers[] = {
>  	{ ABORT_LOCKDEP, handle_lockdep },
>  	{ ABORT_TAINT, handle_taint },
> +	{ ABORT_PING, handle_ping },
>  	{ 0, 0 },
>  };

So the handlers are used need_to_abort():
 * when we start execution
 * after each tests

What worries me here are the suspend tests. You have mentioned that
getting network up can take up to 17s, so potentially we are adding
17s * numer_of_executed_suspend_tests to the total execution time.

Have you checked the actual impact?

Also this does not cover any network issues that happen when executing
longer tests - we still can get killed by any external watchdogs.

What is the maximum ping timeout on those external watchdogs? Maybe we
should get strict on tests' time budget for BAT/shards first...

- Arek

>  
> @@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
>  
>  	init_watchdogs(settings);
>  
> +	if (settings->abort_mask & ABORT_PING)
> +		ping_config();
> +
>  	if (!uname(&unamebuf)) {
>  		dprintf(unamefd, "%s %s %s %s %s\n",
>  			unamebuf.sysname,
> diff --git a/runner/meson.build b/runner/meson.build
> index b658f1d2..6d60c530 100644
> --- a/runner/meson.build
> +++ b/runner/meson.build
> @@ -1,4 +1,15 @@
>  jsonc = dependency('json-c', required: _runner_required)
> +runner_deps = [jsonc, glib]
> +runner_c_args = []
> +
> +with_oping = get_option('with_oping')
> +if with_oping != 'false'
> +	oping = dependency('liboping', required: with_oping == 'true')
> +	if oping.found()
> +		runner_deps += oping
> +		runner_c_args += '-DHAVE_OPING=1'
> +	endif
> +endif
>  
>  runnerlib_sources = [ 'settings.c',
>  		      'job_list.c',
> @@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
>  
>  	runnerlib = static_library('igt_runner', runnerlib_sources,
>  				   include_directories : inc,
> -				   dependencies : [jsonc, glib])
> +				   c_args : runner_c_args,
> +				   dependencies : runner_deps)
>  
>  	runner = executable('igt_runner', runner_sources,
>  			    link_with : runnerlib,
> diff --git a/runner/settings.c b/runner/settings.c
> index ad38ae8d..b57d1a6a 100644
> --- a/runner/settings.c
> +++ b/runner/settings.c
> @@ -48,6 +48,7 @@ static struct {
>  } abort_conditions[] = {
>  	{ ABORT_TAINT, "taint" },
>  	{ ABORT_LOCKDEP, "lockdep" },
> +	{ ABORT_PING, "ping" },
>  	{ ABORT_ALL, "all" },
>  	{ 0, 0 },
>  };
> @@ -136,6 +137,9 @@ static const char *usage_str =
>  	"                        Possible conditions:\n"
>  	"                         lockdep - abort when kernel lockdep has been angered.\n"
>  	"                         taint   - abort when kernel becomes fatally tainted.\n"
> +	"                         ping    - abort when a host configured in .igtrc or\n"
> +	"                                   environment variable IGT_PING_HOSTNAME does\n"
> +	"                                   not respond to ping.\n"
>  	"                         all     - abort for all of the above.\n"
>  	"  -s, --sync            Sync results to disk after every test\n"
>  	"  -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
> diff --git a/runner/settings.h b/runner/settings.h
> index 0a1ee08d..2b6e65d0 100644
> --- a/runner/settings.h
> +++ b/runner/settings.h
> @@ -15,9 +15,10 @@ enum {
>  
>  #define ABORT_TAINT   (1 << 0)
>  #define ABORT_LOCKDEP (1 << 1)
> -#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP)
> +#define ABORT_PING    (1 << 2)
> +#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
>  
> -_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
> +_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
>  
>  struct regex_list {
>  	char **regex_strings;
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
  2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
@ 2019-09-09 11:27   ` Petri Latvala
  2019-09-10  7:31     ` Tomi Sarvela
  0 siblings, 1 reply; 9+ messages in thread
From: Petri Latvala @ 2019-09-09 11:27 UTC (permalink / raw)
  To: Arkadiusz Hiler; +Cc: igt-dev, Tomi Sarvela, Daniel Vetter

On Mon, Sep 09, 2019 at 01:53:44PM +0300, Arkadiusz Hiler wrote:
> On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> > If the network goes down while testing, CI tends to interpret that as
> > the device being down, cutting its power after a while. This causes an
> > incomplete to an innocent test, increasing noise in the results.
> > 
> > A new flag to --abort-on-monitored-error, "ping", uses liboping to
> > ping a host configured in .igtrc with one ping after each test
> > execution and aborts the run if there is no reply in a hardcoded
> > amount of time.
> > 
> > v2:
> >  - Use a higher timeout
> >  - Allow hostname configuration from environment
> > v3:
> >  - Use runner_c_args for holding c args for runner
> >  - Handle runner's meson options in runner/meson.build
> >  - Instead of one ping with 20 second timeout, ping with 1 second timeout
> >    for a duration of 20 seconds
> > 
> > Signed-off-by: Petri Latvala <petri.latvala@intel.com>
> > Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
> > Cc: Martin Peres <martin.peres@linux.intel.com>
> > Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
> > Cc: Daniel Vetter <daniel@ffwll.ch>
> > ---
> >  meson_options.txt  |   6 ++
> >  runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
> >  runner/meson.build |  14 ++++-
> >  runner/settings.c  |   4 ++
> >  runner/settings.h  |   5 +-
> >  5 files changed, 176 insertions(+), 3 deletions(-)
> > 
> > diff --git a/meson_options.txt b/meson_options.txt
> > index 888efe56..935f883e 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -58,6 +58,12 @@ option('build_runner',
> >         choices : ['auto', 'true', 'false'],
> >         description : 'Build test runner')
> >  
> > +option('with_oping',
> > +       type : 'combo',
> > +       value : 'auto',
> > +       choices : ['auto', 'true', 'false'],
> > +       description : 'Build igt_runner with liboping')
> > +
> >  option('use_rpath',
> >         type : 'boolean',
> >         value : false,
> > diff --git a/runner/executor.c b/runner/executor.c
> > index 7e5fbe8f..c07e53fa 100644
> > --- a/runner/executor.c
> > +++ b/runner/executor.c
> > @@ -1,6 +1,10 @@
> >  #include <errno.h>
> >  #include <fcntl.h>
> > +#include <glib.h>
> >  #include <linux/watchdog.h>
> > +#if HAVE_OPING
> > +#include <oping.h>
> > +#endif
> >  #include <signal.h>
> >  #include <stdio.h>
> >  #include <stdlib.h>
> > @@ -16,6 +20,7 @@
> >  #include <time.h>
> >  #include <unistd.h>
> >  
> > +#include "igt_aux.h"
> >  #include "igt_core.h"
> >  #include "executor.h"
> >  #include "output_strings.h"
> > @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
> >  	}
> >  }
> >  
> > +#if HAVE_OPING
> > +static pingobj_t *pingobj = NULL;
> > +
> > +static bool load_ping_config_from_file(void)
> > +{
> > +	char *key_file_env = NULL;
> > +	char *key_file_loc = NULL;
> > +	GError *error = NULL;
> > +	GKeyFile *key_file = NULL;
> > +	const char *ping_hostname;
> > +	int ret;
> > +
> > +	/* Determine igt config path */
> > +	key_file_env = getenv("IGT_CONFIG_PATH");
> > +	if (key_file_env) {
> > +		key_file_loc = strdup(key_file_env);
> > +	} else {
> > +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> > +	}
> > +
> > +	/* Load igt config file */
> > +	key_file = g_key_file_new();
> > +	ret = g_key_file_load_from_file(key_file, key_file_loc,
> > +					G_KEY_FILE_NONE, &error);
> > +	free(key_file_loc);
> > +
> > +	if (!ret) {
> > +		g_error_free(error);
> > +		g_key_file_free(key_file);
> > +		return false;
> > +	}
> > +
> > +	g_clear_error(&error);
> 
> put that in
> 
> GKeyFile *igt_open_igtrc();
> 
> and use it in igt_core, so we have only one implementation
> 
> > +
> > +	ping_hostname =
> > +		g_key_file_get_string(key_file, "DUT",
> > +				      "PingHostName", &error);
> > +
> > +	g_clear_error(&error);
> > +	g_key_file_free(key_file);
> > +
> > +	if (!ping_hostname)
> > +		return false;
> > +
> > +	if (ping_host_add(pingobj, ping_hostname)) {
> > +		fprintf(stderr,
> > +			"abort on ping: Cannot use hostname from config file\n");
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool load_ping_config_from_env(void)
> > +{
> > +	const char *ping_hostname;
> > +
> > +	ping_hostname = getenv("IGT_PING_HOSTNAME");
> > +	if (!ping_hostname)
> > +		return false;
> > +
> > +	if (ping_host_add(pingobj, ping_hostname)) {
> > +		fprintf(stderr,
> > +			"abort on ping: Cannot use hostname from environment\n");
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool can_ping(void)
> > +{
> > +	/*
> > +	 * On some hosts, getting network back up after suspend takes
> > +	 * upwards of 10 seconds. 20 seconds should be enough to see
> > +	 * if network comes back at all, and hopefully not too long to
> > +	 * make external monitoring freak out.
> > +	 */
> > +	igt_until_timeout(20) {
> 
> make 20 a #define
> 
> > +		pingobj_iter_t *iter;
> > +
> > +		ping_send(pingobj);
> > +
> > +		for (iter = ping_iterator_get(pingobj);
> > +		     iter != NULL;
> > +		     iter = ping_iterator_next(iter)) {
> > +			double latency;
> > +			size_t len = sizeof(latency);
> > +
> > +			ping_iterator_get_info(iter,
> > +					       PING_INFO_LATENCY,
> > +					       &latency,
> > +					       &len);
> > +			if (latency >= 0.0)
> > +				return true;
> > +		}
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +#endif
> > +
> > +static void ping_config(void)
> > +{
> > +#if HAVE_OPING
> > +	double timeout = 1.0;
> 
> single_attempt_timeout
> 
> > +
> > +	if (pingobj)
> > +		return;
> > +
> > +	pingobj = ping_construct();
> > +
> > +	/* Try env first, then config file */
> > +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> > +		fprintf(stderr,
> > +			"abort on ping: No host to ping configured\n");
> > +		ping_destroy(pingobj);
> > +		pingobj = NULL;
> > +		return;
> > +	}
> > +
> > +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> > +#endif
> > +}
> > +
> > +static char *handle_ping(void)
> > +{
> > +#if HAVE_OPING
> > +	if (pingobj && !can_ping()) {
> > +		char *reason;
> > +
> > +		asprintf(&reason,
> > +			 "Ping host did not respond to ping, network down");
> > +		return reason;
> > +	}
> > +#endif
> > +
> > +	return NULL;
> > +}
> > +
> >  static char *handle_lockdep(void)
> >  {
> >  	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> > @@ -197,6 +343,7 @@ static const struct {
> >  } abort_handlers[] = {
> >  	{ ABORT_LOCKDEP, handle_lockdep },
> >  	{ ABORT_TAINT, handle_taint },
> > +	{ ABORT_PING, handle_ping },
> >  	{ 0, 0 },
> >  };
> 
> So the handlers are used need_to_abort():
>  * when we start execution
>  * after each tests
> 
> What worries me here are the suspend tests. You have mentioned that
> getting network up can take up to 17s, so potentially we are adding
> 17s * numer_of_executed_suspend_tests to the total execution time.
> 
> Have you checked the actual impact?

No, let's take a look at that after I send v4.

Even with that extra overhead, an argument can be made that getting
incompletes without clear indicators why they happen is worse.


> Also this does not cover any network issues that happen when executing
> longer tests - we still can get killed by any external watchdogs.

Yeah, and we don't get easily machine-parseable timing data on how
long those running tests were fine for. Note to self: Make runner dump
periodical still-alive marks to journal. Unrelated to this patch
though...

Needs to be said that this patch won't make much of a difference if
you only look at the passrates or amount of incompletes. What this
does is allowing us to close that one silly semi-catchall cibuglog
filter for incompletes without good data, and thus forcing proper
issue binning.

> 
> What is the maximum ping timeout on those external watchdogs? Maybe we
> should get strict on tests' time budget for BAT/shards first...

Don't know actually. Tomi?


-- 
Petri Latvala
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure
  2019-09-09 11:27   ` Petri Latvala
@ 2019-09-10  7:31     ` Tomi Sarvela
  0 siblings, 0 replies; 9+ messages in thread
From: Tomi Sarvela @ 2019-09-10  7:31 UTC (permalink / raw)
  To: Arkadiusz Hiler, igt-dev, Daniel Vetter

On 9/9/19 2:27 PM, Petri Latvala wrote:
> On Mon, Sep 09, 2019 at 01:53:44PM +0300, Arkadiusz Hiler wrote:
>> On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
>>> If the network goes down while testing, CI tends to interpret that as
>>> the device being down, cutting its power after a while. This causes an
>>> incomplete to an innocent test, increasing noise in the results.
>>>
>>> A new flag to --abort-on-monitored-error, "ping", uses liboping to
>>> ping a host configured in .igtrc with one ping after each test
>>> execution and aborts the run if there is no reply in a hardcoded
>>> amount of time.
>>>
>>> v2:
>>>   - Use a higher timeout
>>>   - Allow hostname configuration from environment
>>> v3:
>>>   - Use runner_c_args for holding c args for runner
>>>   - Handle runner's meson options in runner/meson.build
>>>   - Instead of one ping with 20 second timeout, ping with 1 second timeout
>>>     for a duration of 20 seconds
>>>
>>> Signed-off-by: Petri Latvala <petri.latvala@intel.com>
>>> Cc: Arkadiusz Hiler <arkadiusz.hiler@intel.com>
>>> Cc: Martin Peres <martin.peres@linux.intel.com>
>>> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
>>> Cc: Daniel Vetter <daniel@ffwll.ch>
>>> ---
>>>   meson_options.txt  |   6 ++
>>>   runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
>>>   runner/meson.build |  14 ++++-
>>>   runner/settings.c  |   4 ++
>>>   runner/settings.h  |   5 +-
>>>   5 files changed, 176 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/meson_options.txt b/meson_options.txt
>>> index 888efe56..935f883e 100644
>>> --- a/meson_options.txt
>>> +++ b/meson_options.txt
>>> @@ -58,6 +58,12 @@ option('build_runner',
>>>          choices : ['auto', 'true', 'false'],
>>>          description : 'Build test runner')
>>>   
>>> +option('with_oping',
>>> +       type : 'combo',
>>> +       value : 'auto',
>>> +       choices : ['auto', 'true', 'false'],
>>> +       description : 'Build igt_runner with liboping')
>>> +
>>>   option('use_rpath',
>>>          type : 'boolean',
>>>          value : false,
>>> diff --git a/runner/executor.c b/runner/executor.c
>>> index 7e5fbe8f..c07e53fa 100644
>>> --- a/runner/executor.c
>>> +++ b/runner/executor.c
>>> @@ -1,6 +1,10 @@
>>>   #include <errno.h>
>>>   #include <fcntl.h>
>>> +#include <glib.h>
>>>   #include <linux/watchdog.h>
>>> +#if HAVE_OPING
>>> +#include <oping.h>
>>> +#endif
>>>   #include <signal.h>
>>>   #include <stdio.h>
>>>   #include <stdlib.h>
>>> @@ -16,6 +20,7 @@
>>>   #include <time.h>
>>>   #include <unistd.h>
>>>   
>>> +#include "igt_aux.h"
>>>   #include "igt_core.h"
>>>   #include "executor.h"
>>>   #include "output_strings.h"
>>> @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
>>>   	}
>>>   }
>>>   
>>> +#if HAVE_OPING
>>> +static pingobj_t *pingobj = NULL;
>>> +
>>> +static bool load_ping_config_from_file(void)
>>> +{
>>> +	char *key_file_env = NULL;
>>> +	char *key_file_loc = NULL;
>>> +	GError *error = NULL;
>>> +	GKeyFile *key_file = NULL;
>>> +	const char *ping_hostname;
>>> +	int ret;
>>> +
>>> +	/* Determine igt config path */
>>> +	key_file_env = getenv("IGT_CONFIG_PATH");
>>> +	if (key_file_env) {
>>> +		key_file_loc = strdup(key_file_env);
>>> +	} else {
>>> +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
>>> +	}
>>> +
>>> +	/* Load igt config file */
>>> +	key_file = g_key_file_new();
>>> +	ret = g_key_file_load_from_file(key_file, key_file_loc,
>>> +					G_KEY_FILE_NONE, &error);
>>> +	free(key_file_loc);
>>> +
>>> +	if (!ret) {
>>> +		g_error_free(error);
>>> +		g_key_file_free(key_file);
>>> +		return false;
>>> +	}
>>> +
>>> +	g_clear_error(&error);
>>
>> put that in
>>
>> GKeyFile *igt_open_igtrc();
>>
>> and use it in igt_core, so we have only one implementation
>>
>>> +
>>> +	ping_hostname =
>>> +		g_key_file_get_string(key_file, "DUT",
>>> +				      "PingHostName", &error);
>>> +
>>> +	g_clear_error(&error);
>>> +	g_key_file_free(key_file);
>>> +
>>> +	if (!ping_hostname)
>>> +		return false;
>>> +
>>> +	if (ping_host_add(pingobj, ping_hostname)) {
>>> +		fprintf(stderr,
>>> +			"abort on ping: Cannot use hostname from config file\n");
>>> +		return false;
>>> +	}
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static bool load_ping_config_from_env(void)
>>> +{
>>> +	const char *ping_hostname;
>>> +
>>> +	ping_hostname = getenv("IGT_PING_HOSTNAME");
>>> +	if (!ping_hostname)
>>> +		return false;
>>> +
>>> +	if (ping_host_add(pingobj, ping_hostname)) {
>>> +		fprintf(stderr,
>>> +			"abort on ping: Cannot use hostname from environment\n");
>>> +		return false;
>>> +	}
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +static bool can_ping(void)
>>> +{
>>> +	/*
>>> +	 * On some hosts, getting network back up after suspend takes
>>> +	 * upwards of 10 seconds. 20 seconds should be enough to see
>>> +	 * if network comes back at all, and hopefully not too long to
>>> +	 * make external monitoring freak out.
>>> +	 */
>>> +	igt_until_timeout(20) {
>>
>> make 20 a #define
>>
>>> +		pingobj_iter_t *iter;
>>> +
>>> +		ping_send(pingobj);
>>> +
>>> +		for (iter = ping_iterator_get(pingobj);
>>> +		     iter != NULL;
>>> +		     iter = ping_iterator_next(iter)) {
>>> +			double latency;
>>> +			size_t len = sizeof(latency);
>>> +
>>> +			ping_iterator_get_info(iter,
>>> +					       PING_INFO_LATENCY,
>>> +					       &latency,
>>> +					       &len);
>>> +			if (latency >= 0.0)
>>> +				return true;
>>> +		}
>>> +	}
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +#endif
>>> +
>>> +static void ping_config(void)
>>> +{
>>> +#if HAVE_OPING
>>> +	double timeout = 1.0;
>>
>> single_attempt_timeout
>>
>>> +
>>> +	if (pingobj)
>>> +		return;
>>> +
>>> +	pingobj = ping_construct();
>>> +
>>> +	/* Try env first, then config file */
>>> +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
>>> +		fprintf(stderr,
>>> +			"abort on ping: No host to ping configured\n");
>>> +		ping_destroy(pingobj);
>>> +		pingobj = NULL;
>>> +		return;
>>> +	}
>>> +
>>> +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
>>> +#endif
>>> +}
>>> +
>>> +static char *handle_ping(void)
>>> +{
>>> +#if HAVE_OPING
>>> +	if (pingobj && !can_ping()) {
>>> +		char *reason;
>>> +
>>> +		asprintf(&reason,
>>> +			 "Ping host did not respond to ping, network down");
>>> +		return reason;
>>> +	}
>>> +#endif
>>> +
>>> +	return NULL;
>>> +}
>>> +
>>>   static char *handle_lockdep(void)
>>>   {
>>>   	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
>>> @@ -197,6 +343,7 @@ static const struct {
>>>   } abort_handlers[] = {
>>>   	{ ABORT_LOCKDEP, handle_lockdep },
>>>   	{ ABORT_TAINT, handle_taint },
>>> +	{ ABORT_PING, handle_ping },
>>>   	{ 0, 0 },
>>>   };
>>
>> So the handlers are used need_to_abort():
>>   * when we start execution
>>   * after each tests
>>
>> What worries me here are the suspend tests. You have mentioned that
>> getting network up can take up to 17s, so potentially we are adding
>> 17s * numer_of_executed_suspend_tests to the total execution time.
>>
>> Have you checked the actual impact?
> 
> No, let's take a look at that after I send v4.
> 
> Even with that extra overhead, an argument can be made that getting
> incompletes without clear indicators why they happen is worse.
> 
> 
>> Also this does not cover any network issues that happen when executing
>> longer tests - we still can get killed by any external watchdogs.
> 
> Yeah, and we don't get easily machine-parseable timing data on how
> long those running tests were fine for. Note to self: Make runner dump
> periodical still-alive marks to journal. Unrelated to this patch
> though...
> 
> Needs to be said that this patch won't make much of a difference if
> you only look at the passrates or amount of incompletes. What this
> does is allowing us to close that one silly semi-catchall cibuglog
> filter for incompletes without good data, and thus forcing proper
> issue binning.
> 
>>
>> What is the maximum ping timeout on those external watchdogs? Maybe we
>> should get strict on tests' time budget for BAT/shards first...
> 
> Don't know actually. Tomi?

SSH timeout (TCP timeout) is first, ~2 minutes. To make sure that the 
external watchdog doesn't mess up live system, it only activates when 
the target is offline 6 minutes (pinged several times during this time).

Tomi

-- 
Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-09-10  7:32 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-20 10:16 [igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure Petri Latvala
2019-05-20 10:16 ` [igt-dev] [PATCH i-g-t 2/2] HAX: Check all conditions to abort Petri Latvala
2019-05-20 11:51   ` Petri Latvala
2019-05-20 12:16     ` Petri Latvala
2019-05-20 12:09 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev2) Patchwork
2019-05-20 13:33 ` [igt-dev] ✓ Fi.CI.BAT: success for series starting with [i-g-t,v3,1/2] runner: Add support for aborting on network failure (rev3) Patchwork
2019-09-09 10:53 ` [igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure Arkadiusz Hiler
2019-09-09 11:27   ` Petri Latvala
2019-09-10  7:31     ` Tomi Sarvela

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.