All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-23 11:34 ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-23 11:34 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/Makefile.sources |  2 ++
 lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  2 ++
 tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
 5 files changed, 90 insertions(+), 8 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..fe3d2e344ff1
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,22 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index 8f14f6320ecf..63afd3ddb535 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -22,6 +22,7 @@ lib_headers = [
 	'igt_stats.h',
 	'igt_syncobj.h',
 	'igt_sysfs.h',
+	'igt_sysrq.h',
 	'igt_x86.h',
 	'igt_vgem.h',
 	'instdone.h',
@@ -69,6 +70,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..658d0976137f 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -41,6 +41,7 @@
 #include "igt_core.h"
 #include "igt_perf.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_pm.h"
 #include "sw_sync.h"
 
@@ -965,6 +966,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -994,7 +996,7 @@ static void cpu_hotplug(int gem_fd)
 
 		for (;;) {
 			char name[128];
-			int cpufd;
+			int cpufd, ret;
 
 			igt_assert_lt(snprintf(name, sizeof(name),
 					       "/sys/devices/system/cpu/cpu%d/online",
@@ -1011,9 +1013,33 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so reboot
+				 * immediately.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_sysrq_reboot();
+				igt_assert(0);
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1053,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1077,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [Intel-gfx] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-23 11:34 ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-23 11:34 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 lib/Makefile.sources |  2 ++
 lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  2 ++
 tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
 5 files changed, 90 insertions(+), 8 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..fe3d2e344ff1
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,22 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index 8f14f6320ecf..63afd3ddb535 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -22,6 +22,7 @@ lib_headers = [
 	'igt_stats.h',
 	'igt_syncobj.h',
 	'igt_sysfs.h',
+	'igt_sysrq.h',
 	'igt_x86.h',
 	'igt_vgem.h',
 	'instdone.h',
@@ -69,6 +70,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..658d0976137f 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -41,6 +41,7 @@
 #include "igt_core.h"
 #include "igt_perf.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_pm.h"
 #include "sw_sync.h"
 
@@ -965,6 +966,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -994,7 +996,7 @@ static void cpu_hotplug(int gem_fd)
 
 		for (;;) {
 			char name[128];
-			int cpufd;
+			int cpufd, ret;
 
 			igt_assert_lt(snprintf(name, sizeof(name),
 					       "/sys/devices/system/cpu/cpu%d/online",
@@ -1011,9 +1013,33 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so reboot
+				 * immediately.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_sysrq_reboot();
+				igt_assert(0);
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1053,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1077,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-02-23 11:58   ` Petri Latvala
  -1 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-02-23 11:58 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Intel-gfx

On Fri, Feb 23, 2018 at 11:34:53AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and response times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  2 ++
>  tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
>  5 files changed, 90 insertions(+), 8 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +
> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}


While the cause for taking this action might be dire, rebooting
people's machines can be kind of a dick move, even considering they're
running tests that can be fatal to the machine in other ways.

We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
in/out of some fatal behaviour already. I'm fine with auto-rebooting,
even as the default, if users can opt out of it with
IGT_NO_REBOOT_PRETTY_PLEASE or so.


-- 
Petri Latvala



> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index 8f14f6320ecf..63afd3ddb535 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -22,6 +22,7 @@ lib_headers = [
>  	'igt_stats.h',
>  	'igt_syncobj.h',
>  	'igt_sysfs.h',
> +	'igt_sysrq.h',
>  	'igt_x86.h',
>  	'igt_vgem.h',
>  	'instdone.h',
> @@ -69,6 +70,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..658d0976137f 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -41,6 +41,7 @@
>  #include "igt_core.h"
>  #include "igt_perf.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_pm.h"
>  #include "sw_sync.h"
>  
> @@ -965,6 +966,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -994,7 +996,7 @@ static void cpu_hotplug(int gem_fd)
>  
>  		for (;;) {
>  			char name[128];
> -			int cpufd;
> +			int cpufd, ret;
>  
>  			igt_assert_lt(snprintf(name, sizeof(name),
>  					       "/sys/devices/system/cpu/cpu%d/online",
> @@ -1011,9 +1013,33 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failed to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */
> +				igt_warn("Failed to online cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_sysrq_reboot();
> +				igt_assert(0);
> +			}
>  
>  			close(cpufd);
>  			cpu++;
> @@ -1027,15 +1053,12 @@ static void cpu_hotplug(int gem_fd)
>  	 * until the CPU core shuffler finishes one loop.
>  	 */
>  	for (;;) {
> -		char buf;
> -		int ret2;
> -
>  		usleep(500e3);
>  		end_spin(gem_fd, spin[cur], 0);
>  
>  		/* Check if the child is signaling completion. */
> -		ret2 = read(link[0], &buf, 1);
> -		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
> +		ret = read(link[0], &buf, 1);
> +		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
>  			break;
>  
>  		igt_spin_batch_free(gem_fd, spin[cur]);
> @@ -1054,6 +1077,9 @@ static void cpu_hotplug(int gem_fd)
>  	close(fd);
>  	close(link[0]);
>  
> +	/* Skip if child signals a problem with offlining a CPU. */
> +	igt_skip_on(buf == 's');
> +
>  	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
>  }
>  
> -- 
> 2.14.1
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-23 11:58   ` Petri Latvala
  0 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-02-23 11:58 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Intel-gfx, Tvrtko Ursulin

On Fri, Feb 23, 2018 at 11:34:53AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and response times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  2 ++
>  tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
>  5 files changed, 90 insertions(+), 8 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +
> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}


While the cause for taking this action might be dire, rebooting
people's machines can be kind of a dick move, even considering they're
running tests that can be fatal to the machine in other ways.

We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
in/out of some fatal behaviour already. I'm fine with auto-rebooting,
even as the default, if users can opt out of it with
IGT_NO_REBOOT_PRETTY_PLEASE or so.


-- 
Petri Latvala



> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index 8f14f6320ecf..63afd3ddb535 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -22,6 +22,7 @@ lib_headers = [
>  	'igt_stats.h',
>  	'igt_syncobj.h',
>  	'igt_sysfs.h',
> +	'igt_sysrq.h',
>  	'igt_x86.h',
>  	'igt_vgem.h',
>  	'instdone.h',
> @@ -69,6 +70,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..658d0976137f 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -41,6 +41,7 @@
>  #include "igt_core.h"
>  #include "igt_perf.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_pm.h"
>  #include "sw_sync.h"
>  
> @@ -965,6 +966,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -994,7 +996,7 @@ static void cpu_hotplug(int gem_fd)
>  
>  		for (;;) {
>  			char name[128];
> -			int cpufd;
> +			int cpufd, ret;
>  
>  			igt_assert_lt(snprintf(name, sizeof(name),
>  					       "/sys/devices/system/cpu/cpu%d/online",
> @@ -1011,9 +1013,33 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failed to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */
> +				igt_warn("Failed to online cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_sysrq_reboot();
> +				igt_assert(0);
> +			}
>  
>  			close(cpufd);
>  			cpu++;
> @@ -1027,15 +1053,12 @@ static void cpu_hotplug(int gem_fd)
>  	 * until the CPU core shuffler finishes one loop.
>  	 */
>  	for (;;) {
> -		char buf;
> -		int ret2;
> -
>  		usleep(500e3);
>  		end_spin(gem_fd, spin[cur], 0);
>  
>  		/* Check if the child is signaling completion. */
> -		ret2 = read(link[0], &buf, 1);
> -		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
> +		ret = read(link[0], &buf, 1);
> +		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
>  			break;
>  
>  		igt_spin_batch_free(gem_fd, spin[cur]);
> @@ -1054,6 +1077,9 @@ static void cpu_hotplug(int gem_fd)
>  	close(fd);
>  	close(link[0]);
>  
> +	/* Skip if child signals a problem with offlining a CPU. */
> +	igt_skip_on(buf == 's');
> +
>  	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
>  }
>  
> -- 
> 2.14.1
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
  (?)
  (?)
@ 2018-02-23 12:37 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-02-23 12:37 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better
URL   : https://patchwork.freedesktop.org/series/38855/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
f9b6fd624be9692e9a892c0976fa72cedefc9fed meson: Make cairo mandatory

with latest DRM-Tip kernel build CI_DRM_3828
562dc33a969d drm-tip: 2018y-02m-23d-09h-04m-20s UTC integration manifest

No testlist changes.

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                fail       -> PASS       (fi-gdg-551) fdo#102575

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:417s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:434s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:374s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:487s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:285s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:479s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:483s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:469s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:458s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:393s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:568s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:414s
fi-gdg-551       total:288  pass:180  dwarn:0   dfail:0   fail:0   skip:108 time:283s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:511s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:388s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:415s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:447s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:450s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:494s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:456s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:494s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:587s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:424s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:500s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:521s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:490s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:470s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:408s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:432s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:518s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:394s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_996/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-23 11:58   ` Petri Latvala
@ 2018-02-23 14:20     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-23 14:20 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev, Intel-gfx, Tvrtko Ursulin


On 23/02/2018 11:58, Petri Latvala wrote:
> On Fri, Feb 23, 2018 at 11:34:53AM +0000, Tvrtko Ursulin wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
>>
> >> To improve test reliability and response times when testing larger runs we
>> need to handle those cases better.
>>
>> Handle failures to off-line a CPU by immediately skipping the test, and
>> failures to on-line a CPU by immediately rebooting the machine.
>>
>> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   lib/Makefile.sources |  2 ++
>>   lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>>   lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>>   lib/meson.build      |  2 ++
>>   tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
>>   5 files changed, 90 insertions(+), 8 deletions(-)
>>   create mode 100644 lib/igt_sysrq.c
>>   create mode 100644 lib/igt_sysrq.h
>>
>> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
>> index 5b13ef8896c0..3d37ef1d1984 100644
>> --- a/lib/Makefile.sources
>> +++ b/lib/Makefile.sources
>> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>>   	igt_stats.h		\
>>   	igt_sysfs.c		\
>>   	igt_sysfs.h		\
>> +	igt_sysrq.c		\
>> +	igt_sysrq.h		\
>>   	igt_x86.h		\
>>   	igt_x86.c		\
>>   	igt_vgem.c		\
>> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
>> new file mode 100644
>> index 000000000000..fe3d2e344ff1
>> --- /dev/null
>> +++ b/lib/igt_sysrq.c
>> @@ -0,0 +1,22 @@
>> +#include <unistd.h>
>> +#include <fcntl.h>
>> +#include <stdlib.h>
>> +#include <sys/reboot.h>
>> +
>> +#include "igt_core.h"
>> +
>> +#include "igt_sysrq.h"
>> +
>> +void igt_sysrq_reboot(void)
>> +{
>> +	sync();
>> +
>> +	/* Try to be nice at first, and if that fails pull the trigger */
>> +	if (reboot(RB_AUTOBOOT)) {
>> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
>> +		igt_ignore_warn(write(fd, "b", 2));
>> +		close(fd);
>> +	}
>> +
>> +	abort();
>> +}
> 
> 
> While the cause for taking this action might be dire, rebooting
> people's machines can be kind of a dick move, even considering they're
> running tests that can be fatal to the machine in other ways.
> 
> We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
> in/out of some fatal behaviour already. I'm fine with auto-rebooting,
> even as the default, if users can opt out of it with
> IGT_NO_REBOOT_PRETTY_PLEASE or so.

I am fine with something like that. Just let's define how to call the env 
variable and what the default should be?

Do we have a return code from a test which stops the test runner?

I am thinking that the best approach would be not to reboot but to halt 
testing, unless this environment option is set.

But then it is up to CI people to say if they want to be setting this 
option across all systems, or would actually prefer to reboot by default.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [Intel-gfx] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-23 14:20     ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-23 14:20 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev, Intel-gfx, Tvrtko Ursulin


On 23/02/2018 11:58, Petri Latvala wrote:
> On Fri, Feb 23, 2018 at 11:34:53AM +0000, Tvrtko Ursulin wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
>>
> >> To improve test reliability and response times when testing larger runs we
>> need to handle those cases better.
>>
>> Handle failures to off-line a CPU by immediately skipping the test, and
>> failures to on-line a CPU by immediately rebooting the machine.
>>
>> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   lib/Makefile.sources |  2 ++
>>   lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>>   lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>>   lib/meson.build      |  2 ++
>>   tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
>>   5 files changed, 90 insertions(+), 8 deletions(-)
>>   create mode 100644 lib/igt_sysrq.c
>>   create mode 100644 lib/igt_sysrq.h
>>
>> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
>> index 5b13ef8896c0..3d37ef1d1984 100644
>> --- a/lib/Makefile.sources
>> +++ b/lib/Makefile.sources
>> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>>   	igt_stats.h		\
>>   	igt_sysfs.c		\
>>   	igt_sysfs.h		\
>> +	igt_sysrq.c		\
>> +	igt_sysrq.h		\
>>   	igt_x86.h		\
>>   	igt_x86.c		\
>>   	igt_vgem.c		\
>> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
>> new file mode 100644
>> index 000000000000..fe3d2e344ff1
>> --- /dev/null
>> +++ b/lib/igt_sysrq.c
>> @@ -0,0 +1,22 @@
>> +#include <unistd.h>
>> +#include <fcntl.h>
>> +#include <stdlib.h>
>> +#include <sys/reboot.h>
>> +
>> +#include "igt_core.h"
>> +
>> +#include "igt_sysrq.h"
>> +
>> +void igt_sysrq_reboot(void)
>> +{
>> +	sync();
>> +
>> +	/* Try to be nice at first, and if that fails pull the trigger */
>> +	if (reboot(RB_AUTOBOOT)) {
>> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
>> +		igt_ignore_warn(write(fd, "b", 2));
>> +		close(fd);
>> +	}
>> +
>> +	abort();
>> +}
> 
> 
> While the cause for taking this action might be dire, rebooting
> people's machines can be kind of a dick move, even considering they're
> running tests that can be fatal to the machine in other ways.
> 
> We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
> in/out of some fatal behaviour already. I'm fine with auto-rebooting,
> even as the default, if users can opt out of it with
> IGT_NO_REBOOT_PRETTY_PLEASE or so.

I am fine with something like that. Just let's define how to call the env 
variable and what the default should be?

Do we have a return code from a test which stops the test runner?

I am thinking that the best approach would be not to reboot but to halt 
testing, unless this environment option is set.

But then it is up to CI people to say if they want to be setting this 
option across all systems, or would actually prefer to reboot by default.

Regards,

Tvrtko
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (2 preceding siblings ...)
  (?)
@ 2018-02-23 15:34 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-02-23 15:34 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better
URL   : https://patchwork.freedesktop.org/series/38855/
State : failure

== Summary ==

Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925
Test prime_vgem:
        Subgroup basic-fence-flip:
                pass       -> FAIL       (shard-apl)
Test kms_cursor_crc:
        Subgroup cursor-64x64-suspend:
                incomplete -> PASS       (shard-hsw) fdo#103540
Test kms_mmap_write_crc:
                pass       -> SKIP       (shard-apl)
Test kms_flip:
        Subgroup 2x-modeset-vs-vblank-race-interruptible:
                fail       -> PASS       (shard-hsw) fdo#103060 +1
Test kms_setmode:
        Subgroup basic:
                fail       -> PASS       (shard-hsw) fdo#99912
Test kms_vblank:
        Subgroup pipe-b-accuracy-idle:
                pass       -> FAIL       (shard-hsw) fdo#102583

fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#102583 https://bugs.freedesktop.org/show_bug.cgi?id=102583

shard-apl        total:3465 pass:1818 dwarn:1   dfail:0   fail:13  skip:1632 time:12234s
shard-hsw        total:3465 pass:1768 dwarn:1   dfail:0   fail:2   skip:1693 time:11464s
shard-snb        total:3465 pass:1358 dwarn:1   dfail:0   fail:2   skip:2104 time:6652s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_996/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-23 14:20     ` [igt-dev] [Intel-gfx] " Tvrtko Ursulin
@ 2018-02-26 10:03       ` Petri Latvala
  -1 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-02-26 10:03 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tomi Sarvela; +Cc: igt-dev, Intel-gfx

On Fri, Feb 23, 2018 at 02:20:59PM +0000, Tvrtko Ursulin wrote:
> 
> On 23/02/2018 11:58, Petri Latvala wrote:
> > We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
> > in/out of some fatal behaviour already. I'm fine with auto-rebooting,
> > even as the default, if users can opt out of it with
> > IGT_NO_REBOOT_PRETTY_PLEASE or so.
> 
> I am fine with something like that. Just lets define how to call the env
> variable and what the default should be?


IGT_REBOOT_ON_FATAL_ERROR=1, default behaviour is not to reboot?

Tomi, preference?



> Do we have a return code from a test which stops the test runner?
> 
> I am thinking that the best approach would be not to reboot but to halt
> testing, unless this environment option is set.


Yeah, that would be the best out of all the options. Unfortunately
that's not going to materialize in the near future. I have an
implementation of aborting support hacked up on my test machines, but
it's quite ugly and requires hacking deeply into the bowels of piglit.

Piglit is a hammer and we have screws...



-- 
Petri Latvala


> But then it is up to CI people to say if they want to be setting this option
> across all systems, or would actually prefer to reboot by default.
> 
> Regards,
> 
> Tvrtko
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [Intel-gfx] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-26 10:03       ` Petri Latvala
  0 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-02-26 10:03 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tomi Sarvela; +Cc: igt-dev, Intel-gfx, Tvrtko Ursulin

On Fri, Feb 23, 2018 at 02:20:59PM +0000, Tvrtko Ursulin wrote:
> 
> On 23/02/2018 11:58, Petri Latvala wrote:
> > We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
> > in/out of some fatal behaviour already. I'm fine with auto-rebooting,
> > even as the default, if users can opt out of it with
> > IGT_NO_REBOOT_PRETTY_PLEASE or so.
> 
> I am fine with something like that. Just lets define how to call the env
> variable and what the default should be?


IGT_REBOOT_ON_FATAL_ERROR=1, default behaviour is not to reboot?

Tomi, preference?



> Do we have a return code from a test which stops the test runner?
> 
> I am thinking that the best approach would be not to reboot but to halt
> testing, unless this environment option is set.


Yeah, that would be the best out of all the options. Unfortunately
that's not going to materialize in the near future. I have an
implementation of aborting support hacked up on my test machines, but
it's quite ugly and requires hacking deeply into the bowels of piglit.

Piglit is a hammer and we have screws...



-- 
Petri Latvala


> But then it is up to CI people to say if they want to be setting this option
> across all systems, or would actually prefer to reboot by default.
> 
> Regards,
> 
> Tvrtko
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-26 10:03       ` [igt-dev] [Intel-gfx] " Petri Latvala
@ 2018-02-26 10:14         ` Tomi Sarvela
  -1 siblings, 0 replies; 30+ messages in thread
From: Tomi Sarvela @ 2018-02-26 10:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev, Intel-gfx, Tvrtko Ursulin

On 02/26/2018 12:03 PM, Petri Latvala wrote:
> On Fri, Feb 23, 2018 at 02:20:59PM +0000, Tvrtko Ursulin wrote:
>>
>> On 23/02/2018 11:58, Petri Latvala wrote:
>>> We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
>>> in/out of some fatal behaviour already. I'm fine with auto-rebooting,
>>> even as the default, if users can opt out of it with
>>> IGT_NO_REBOOT_PRETTY_PLEASE or so.
>>
>> I am fine with something like that. Just lets define how to call the env
>> variable and what the default should be?
> 
> 
> IGT_REBOOT_ON_FATAL_ERROR=1, default behaviour is not to reboot?

Default behaviour is not to reboot on panic either, but we want to have 
that.

So something that adds forced panic / reboot when test spectacularly 
fails is probably the sanest default: only if we have asked for that 
behaviour.

Turning that on with ENV or piglit.conf entry are both ok, either way 
it's something that user needs to know and explicitly turn on.

Tomi

> 
> Tomi, preference?
> 
> 
> 
>> Do we have a return code from a test which stops the test runner?
>>
>> I am thinking that the best approach would be not to reboot but to halt
>> testing, unless this environment option is set.
> 
> 
> Yeah, that would be the best out of all the options. Unfortunately
> that's not going to materialize in the near future. I have an
> implementation of aborting support hacked up on my test machines, but
> it's quite ugly and requires hacking deeply into the bowels of piglit.
> 
> Piglit is a hammer and we have screws...
> 
> 
> 


Tomi
-- 
Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-26 10:14         ` Tomi Sarvela
  0 siblings, 0 replies; 30+ messages in thread
From: Tomi Sarvela @ 2018-02-26 10:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev, Intel-gfx, Tvrtko Ursulin

On 02/26/2018 12:03 PM, Petri Latvala wrote:
> On Fri, Feb 23, 2018 at 02:20:59PM +0000, Tvrtko Ursulin wrote:
>>
>> On 23/02/2018 11:58, Petri Latvala wrote:
>>> We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
>>> in/out of some fatal behaviour already. I'm fine with auto-rebooting,
>>> even as the default, if users can opt out of it with
>>> IGT_NO_REBOOT_PRETTY_PLEASE or so.
>>
>> I am fine with something like that. Just lets define how to call the env
>> variable and what the default should be?
> 
> 
> IGT_REBOOT_ON_FATAL_ERROR=1, default behaviour is not to reboot?

Default behaviour is not to reboot on panic either, but we want to have 
that.

So something that adds forced panic / reboot when test spectacularly 
fails is probably the sanest default: only if we have asked for that 
behaviour.

Turning that on with ENV or piglit.conf entry are both ok, either way 
it's something that user needs to know and explicitly turn on.

Tomi

> 
> Tomi, preference?
> 
> 
> 
>> Do we have a return code from a test which stops the test runner?
>>
>> I am thinking that the best approach would be not to reboot but to halt
>> testing, unless this environment option is set.
> 
> 
> Yeah, that would be the best out of all the options. Unfortunately
> that's not going to materialize in the near future. I have an
> implementation of aborting support hacked up on my test machines, but
> it's quite ugly and requires hacking deeply into the bowels of piglit.
> 
> Piglit is a hammer and we have screws...
> 
> 
> 


Tomi
-- 
Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-26 10:14         ` [Intel-gfx] " Tomi Sarvela
@ 2018-02-28 10:05           ` Tvrtko Ursulin
  -1 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-28 10:05 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 110 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..3fd9f529f09f 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error:
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			sleep(60);
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..fe3d2e344ff1
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,22 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..8c75b0641785 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so reboot
+				 * immediately.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-02-28 10:05           ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-02-28 10:05 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 110 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..3fd9f529f09f 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error:
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			sleep(60);
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..fe3d2e344ff1
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,22 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..8c75b0641785 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so reboot
+				 * immediately.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev2)
  2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (3 preceding siblings ...)
  (?)
@ 2018-02-28 11:54 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-02-28 11:54 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better (rev2)
URL   : https://patchwork.freedesktop.org/series/38855/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
d3dc9c619791c8cb88d176fc7b3d8aa5e802055d lib/igt_draw: Fix bo leak in gpu draw routines

with latest DRM-Tip kernel build CI_DRM_3846
9a02ae14ae02 drm-tip: 2018y-02m-28d-10h-13m-18s UTC integration manifest

No testlist changes.

---- Known issues:

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                pass       -> DMESG-WARN (fi-cnl-y3) fdo#103191
        Subgroup suspend-read-crc-pipe-b:
                pass       -> DMESG-WARN (fi-cnl-y3) fdo#104951
                pass       -> INCOMPLETE (fi-snb-2520m) fdo#103713

fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191
fdo#104951 https://bugs.freedesktop.org/show_bug.cgi?id=104951
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:414s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:426s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:375s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:490s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:285s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:477s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:482s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:468s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:459s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:391s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:562s
fi-cnl-y3        total:288  pass:260  dwarn:2   dfail:0   fail:0   skip:26  time:577s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:414s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:285s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:513s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:387s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:411s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:455s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:413s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:451s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:492s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:449s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:493s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:585s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:428s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:500s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:521s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:489s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:476s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:403s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:432s
fi-snb-2520m     total:245  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:394s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1019/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev2)
  2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (4 preceding siblings ...)
  (?)
@ 2018-02-28 14:50 ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-02-28 14:50 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better (rev2)
URL   : https://patchwork.freedesktop.org/series/38855/
State : success

== Summary ==

---- Possible new issues:

Test kms_rotation_crc:
        Subgroup primary-rotation-270:
                fail       -> PASS       (shard-apl)

---- Known issues:

Test gem_eio:
        Subgroup in-flight:
                pass       -> INCOMPLETE (shard-apl) fdo#104945
Test gem_softpin:
        Subgroup noreloc-s3:
                incomplete -> PASS       (shard-hsw) fdo#103540
Test kms_chv_cursor_fail:
        Subgroup pipe-b-256x256-left-edge:
                dmesg-warn -> PASS       (shard-snb) fdo#105185
Test kms_flip:
        Subgroup plain-flip-ts-check-interruptible:
                fail       -> PASS       (shard-hsw) fdo#100368
Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925
Test perf:
        Subgroup buffer-fill:
                pass       -> FAIL       (shard-apl) fdo#103755
        Subgroup polling:
                fail       -> PASS       (shard-hsw) fdo#102252

fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#103755 https://bugs.freedesktop.org/show_bug.cgi?id=103755
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252

shard-apl        total:3304 pass:1728 dwarn:1   dfail:0   fail:9   skip:1563 time:11640s
shard-hsw        total:3460 pass:1767 dwarn:1   dfail:0   fail:1   skip:1690 time:11658s
shard-snb        total:3460 pass:1359 dwarn:1   dfail:0   fail:1   skip:2099 time:6640s
Blacklisted hosts:
shard-kbl        total:3460 pass:1902 dwarn:30  dfail:1   fail:7   skip:1520 time:9393s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1019/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
  2018-02-28 10:05           ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-02 11:12             ` Petri Latvala
  -1 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-03-02 11:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Tomi Sarvela, Intel-gfx

On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and response times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
>     set. (Petri Latvala)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>


Reviewed-by: Petri Latvala <petri.latvala@intel.com>

with two nitpicks below.




> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_core.c       | 23 +++++++++++++++++++++++
>  lib/igt_core.h       |  1 +
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  1 +
>  tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
>  7 files changed, 110 insertions(+), 7 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_core.c b/lib/igt_core.c
> index c292343de09e..3fd9f529f09f 100644
> --- a/lib/igt_core.c
> +++ b/lib/igt_core.c
> @@ -70,6 +70,7 @@
>  #include "igt_core.h"
>  #include "igt_aux.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_rc.h"
>  
>  #define UNW_LOCAL_ONLY
> @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
>  	}
>  }
>  
> +/**
> + * igt_fatal_error:
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since our test runner (piglit) does not support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +		igt_warn("FATAL ERROR - REBOOTING");
> +		igt_sysrq_reboot();
> +	} else {
> +		igt_warn("FATAL ERROR");
> +		for (;;)
> +			sleep(60);
> +	}
> +}
> +
> +
>  /**
>   * igt_can_fail:
>   *
> diff --git a/lib/igt_core.h b/lib/igt_core.h
> index 7af2b4c109fe..66523a208c31 100644
> --- a/lib/igt_core.h
> +++ b/lib/igt_core.h
> @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
>  		       const char *format, ...)
>  	__attribute__((noreturn));
>  void igt_exit(void) __attribute__((noreturn));
> +void igt_fatal_error(void) __attribute__((noreturn));
>  
>  /**
>   * igt_ignore_warn:
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +


Docs for igt_sysrq_reboot?


> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}
> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index a9e53689b35d..b3b8b14a3f01 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -14,6 +14,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..8c75b0641785 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failure to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */

This is assuming what the user has configured igt_fatal_error() to
do. Just a s/so reboot immediately// maybe?


-- 
Petri Latvala
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [Intel-gfx] [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:12             ` Petri Latvala
  0 siblings, 0 replies; 30+ messages in thread
From: Petri Latvala @ 2018-03-02 11:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Tomi Sarvela, Intel-gfx

On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and response times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
>     set. (Petri Latvala)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>


Reviewed-by: Petri Latvala <petri.latvala@intel.com>

with two nitpicks below.




> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_core.c       | 23 +++++++++++++++++++++++
>  lib/igt_core.h       |  1 +
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  1 +
>  tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
>  7 files changed, 110 insertions(+), 7 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_core.c b/lib/igt_core.c
> index c292343de09e..3fd9f529f09f 100644
> --- a/lib/igt_core.c
> +++ b/lib/igt_core.c
> @@ -70,6 +70,7 @@
>  #include "igt_core.h"
>  #include "igt_aux.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_rc.h"
>  
>  #define UNW_LOCAL_ONLY
> @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
>  	}
>  }
>  
> +/**
> + * igt_fatal_error:
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since our test runner (piglit) does not support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +		igt_warn("FATAL ERROR - REBOOTING");
> +		igt_sysrq_reboot();
> +	} else {
> +		igt_warn("FATAL ERROR");
> +		for (;;)
> +			sleep(60);
> +	}
> +}
> +
> +
>  /**
>   * igt_can_fail:
>   *
> diff --git a/lib/igt_core.h b/lib/igt_core.h
> index 7af2b4c109fe..66523a208c31 100644
> --- a/lib/igt_core.h
> +++ b/lib/igt_core.h
> @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
>  		       const char *format, ...)
>  	__attribute__((noreturn));
>  void igt_exit(void) __attribute__((noreturn));
> +void igt_fatal_error(void) __attribute__((noreturn));
>  
>  /**
>   * igt_ignore_warn:
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +


Docs for igt_sysrq_reboot?


> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}
> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index a9e53689b35d..b3b8b14a3f01 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -14,6 +14,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..8c75b0641785 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failure to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */

This is assuming what the user has configured igt_fatal_error() to
do. Just a s/so reboot immediately// maybe?


-- 
Petri Latvala
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH i-g-t v3] tests/perf_pmu: Handle CPU hotplug failures better
  2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
@ 2018-03-02 11:28               ` Tvrtko Ursulin
  -1 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:28 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..d64c25e1217b 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			sleep(60);
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failure to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] [PATCH i-g-t v3] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:28               ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:28 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..d64c25e1217b 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			sleep(60);
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
  2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
@ 2018-03-02 11:32               ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2018-03-02 11:32 UTC (permalink / raw)
  To: Petri Latvala, Tvrtko Ursulin; +Cc: igt-dev, Tomi Sarvela, Intel-gfx

Quoting Petri Latvala (2018-03-02 11:12:19)
> On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> > +/**
> > + * igt_fatal_error:
> > + *
> > + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> > + * environment variable is set, reboot the machine.
> > + *
> > + * Since out test runner (piglit) does support fatal test exit codes, we
> > + * implement the default behaviour by waiting endlessly.
> > + */
> > +void  __attribute__((noreturn)) igt_fatal_error(void)
> > +{
> > +     if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> > +             igt_warn("FATAL ERROR - REBOOTING");
> > +             igt_sysrq_reboot();
> > +     } else {
> > +             igt_warn("FATAL ERROR");
> > +             for (;;)
> > +                     sleep(60);

s/sleep/pause/
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:32               ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2018-03-02 11:32 UTC (permalink / raw)
  To: Petri Latvala, Tvrtko Ursulin
  Cc: igt-dev, Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

Quoting Petri Latvala (2018-03-02 11:12:19)
> On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> > +/**
> > + * igt_fatal_error:
> > + *
> > + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> > + * environment variable is set, reboot the machine.
> > + *
> > + * Since out test runner (piglit) does support fatal test exit codes, we
> > + * implement the default behaviour by waiting endlessly.
> > + */
> > +void  __attribute__((noreturn)) igt_fatal_error(void)
> > +{
> > +     if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> > +             igt_warn("FATAL ERROR - REBOOTING");
> > +             igt_sysrq_reboot();
> > +     } else {
> > +             igt_warn("FATAL ERROR");
> > +             for (;;)
> > +                     sleep(60);

s/sleep/pause/
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH i-g-t v4] tests/perf_pmu: Handle CPU hotplug failures better
  2018-03-02 11:28               ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-02 11:39                 ` Tvrtko Ursulin
  -1 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:39 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

v4: Use pause instead of sleep. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..46285ac32432 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			pause();
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] [PATCH i-g-t v4] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:39                 ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:39 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

v4: Use pause instead of sleep. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..46285ac32432 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR");
+		for (;;)
+			pause();
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failed to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH i-g-t v4] tests/perf_pmu: Handle CPU hotplug failures better
  2018-03-02 11:39                 ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-02 11:42                   ` Chris Wilson
  -1 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2018-03-02 11:42 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Tomi Sarvela, Intel-gfx

Quoting Tvrtko Ursulin (2018-03-02 11:39:12)
> +/**
> + * igt_fatal_error: Stop test execution on fatal errors
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since out test runner (piglit) does support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +       if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +               igt_warn("FATAL ERROR - REBOOTING");

+ "\n"

> +               igt_sysrq_reboot();
> +       } else {
> +               igt_warn("FATAL ERROR");

+ "\n"

igt_emergency() ? Would help with mapping kmsg into igt loglevels :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:42                   ` Chris Wilson
  0 siblings, 0 replies; 30+ messages in thread
From: Chris Wilson @ 2018-03-02 11:42 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

Quoting Tvrtko Ursulin (2018-03-02 11:39:12)
> +/**
> + * igt_fatal_error: Stop test execution on fatal errors
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since out test runner (piglit) does support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +       if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +               igt_warn("FATAL ERROR - REBOOTING");

+ "\n"

> +               igt_sysrq_reboot();
> +       } else {
> +               igt_warn("FATAL ERROR");

+ "\n"

igt_emergency() ? Would help with mapping kmsg into igt loglevels :)
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH i-g-t v5] tests/perf_pmu: Handle CPU hotplug failures better
  2018-03-02 11:42                   ` [igt-dev] " Chris Wilson
@ 2018-03-02 11:55                     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:55 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

v4: Use pause instead of sleep. (Chris Wilson)
v5: Newlines! (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..e52b806bdb01 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING\n");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR\n");
+		for (;;)
+			pause();
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failure to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] [PATCH i-g-t v5] tests/perf_pmu: Handle CPU hotplug failures better
@ 2018-03-02 11:55                     ` Tvrtko Ursulin
  0 siblings, 0 replies; 30+ messages in thread
From: Tvrtko Ursulin @ 2018-03-02 11:55 UTC (permalink / raw)
  To: igt-dev; +Cc: Tomi Sarvela, Intel-gfx, Tvrtko Ursulin

From: Chris Wilson <chris@chris-wilson.co.uk>

CPU hotplug, especially CPU0, can be flaky on commodity hardware.

To improve test reliability and response times when testing larger runs we
need to handle those cases better.

Handle failures to off-line a CPU by immediately skipping the test, and
failures to on-line a CPU by immediately rebooting the machine.

This patch includes igt_sysrq_reboot implementation from Chris Wilson.

v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
    set. (Petri Latvala)

v3: Add missing docs and update stale comment. (Petri Latvala)

v4: Use pause instead of sleep. (Chris Wilson)
v5: Newlines! (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>
Reviewed-by: Petri Latvala <petri.latvala@intel.com>
---
 lib/Makefile.sources |  2 ++
 lib/igt_core.c       | 23 +++++++++++++++++++++++
 lib/igt_core.h       |  1 +
 lib/igt_sysrq.c      | 27 +++++++++++++++++++++++++++
 lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
 lib/meson.build      |  1 +
 tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
 7 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 lib/igt_sysrq.c
 create mode 100644 lib/igt_sysrq.h

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 5b13ef8896c0..3d37ef1d1984 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -35,6 +35,8 @@ lib_source_list =	 	\
 	igt_stats.h		\
 	igt_sysfs.c		\
 	igt_sysfs.h		\
+	igt_sysrq.c		\
+	igt_sysrq.h		\
 	igt_x86.h		\
 	igt_x86.c		\
 	igt_vgem.c		\
diff --git a/lib/igt_core.c b/lib/igt_core.c
index c292343de09e..e52b806bdb01 100644
--- a/lib/igt_core.c
+++ b/lib/igt_core.c
@@ -70,6 +70,7 @@
 #include "igt_core.h"
 #include "igt_aux.h"
 #include "igt_sysfs.h"
+#include "igt_sysrq.h"
 #include "igt_rc.h"
 
 #define UNW_LOCAL_ONLY
@@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
 	}
 }
 
+/**
+ * igt_fatal_error: Stop test execution on fatal errors
+ *
+ * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
+ * environment variable is set, reboot the machine.
+ *
+ * Since our test runner (piglit) does not support fatal test exit codes, we
+ * implement the default behaviour by waiting endlessly.
+ */
+void  __attribute__((noreturn)) igt_fatal_error(void)
+{
+	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
+		igt_warn("FATAL ERROR - REBOOTING\n");
+		igt_sysrq_reboot();
+	} else {
+		igt_warn("FATAL ERROR\n");
+		for (;;)
+			pause();
+	}
+}
+
+
 /**
  * igt_can_fail:
  *
diff --git a/lib/igt_core.h b/lib/igt_core.h
index 7af2b4c109fe..66523a208c31 100644
--- a/lib/igt_core.h
+++ b/lib/igt_core.h
@@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
 		       const char *format, ...)
 	__attribute__((noreturn));
 void igt_exit(void) __attribute__((noreturn));
+void igt_fatal_error(void) __attribute__((noreturn));
 
 /**
  * igt_ignore_warn:
diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
new file mode 100644
index 000000000000..3bda321f7c5b
--- /dev/null
+++ b/lib/igt_sysrq.c
@@ -0,0 +1,27 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/reboot.h>
+
+#include "igt_core.h"
+
+#include "igt_sysrq.h"
+
+/**
+ * igt_sysrq_reboot: Reboots the machine
+ *
+ * Syncs filesystems and immediately reboots the machine.
+ */
+void igt_sysrq_reboot(void)
+{
+	sync();
+
+	/* Try to be nice at first, and if that fails pull the trigger */
+	if (reboot(RB_AUTOBOOT)) {
+		int fd = open("/proc/sysrq-trigger", O_WRONLY);
+		igt_ignore_warn(write(fd, "b", 2));
+		close(fd);
+	}
+
+	abort();
+}
diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
new file mode 100644
index 000000000000..422473d2a480
--- /dev/null
+++ b/lib/igt_sysrq.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __IGT_SYSRQ_H__
+#define __IGT_SYSRQ_H__
+
+void igt_sysrq_reboot(void) __attribute__((noreturn));
+
+#endif /* __IGT_SYSRQ_H__ */
diff --git a/lib/meson.build b/lib/meson.build
index a9e53689b35d..b3b8b14a3f01 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,6 +14,7 @@ lib_sources = [
 	'igt_stats.c',
 	'igt_syncobj.c',
 	'igt_sysfs.c',
+	'igt_sysrq.c',
 	'igt_vgem.c',
 	'igt_x86.c',
 	'instdone.c',
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
index 3bbb18d2f216..87e7e782d05e 100644
--- a/tests/perf_pmu.c
+++ b/tests/perf_pmu.c
@@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
 	int link[2];
 	int fd, ret;
 	int cur = 0;
+	char buf;
 
 	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
 	igt_require(cpu0_hotplug_support());
@@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
 			}
 
 			/* Offline followed by online a CPU. */
-			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			ret = write(cpufd, "0", 2);
+			if (ret < 0) {
+				/*
+				 * If we failed to offline a CPU we don't want
+				 * to proceed.
+				 */
+				igt_warn("Failed to offline cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_assert_eq(write(link[1], "s", 1), 1);
+				break;
+			}
+
 			usleep(1e6);
-			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			ret = write(cpufd, "1", 2);
+			if (ret < 0) {
+				/*
+				 * Failure to bring a CPU back online is fatal
+				 * for the sanity of a test run so stop further
+				 * testing.
+				 */
+				igt_warn("Failed to online cpu%u! (%d)\n",
+					 cpu, errno);
+				igt_fatal_error();
+			}
 
 			close(cpufd);
 			cpu++;
@@ -1027,15 +1051,12 @@ static void cpu_hotplug(int gem_fd)
 	 * until the CPU core shuffler finishes one loop.
 	 */
 	for (;;) {
-		char buf;
-		int ret2;
-
 		usleep(500e3);
 		end_spin(gem_fd, spin[cur], 0);
 
 		/* Check if the child is signaling completion. */
-		ret2 = read(link[0], &buf, 1);
-		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
+		ret = read(link[0], &buf, 1);
+		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
 			break;
 
 		igt_spin_batch_free(gem_fd, spin[cur]);
@@ -1054,6 +1075,9 @@ static void cpu_hotplug(int gem_fd)
 	close(fd);
 	close(link[0]);
 
+	/* Skip if child signals a problem with offlining a CPU. */
+	igt_skip_on(buf == 's');
+
 	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
 }
 
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev5)
  2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
                               ` (2 preceding siblings ...)
  (?)
@ 2018-03-02 20:16             ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-03-02 20:16 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better (rev5)
URL   : https://patchwork.freedesktop.org/series/38855/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
bddfb8dd3c1767f13d2af578d5c3d897fddf0dcd igt/gem_ctx_switch: Exercise all engines at once

with latest DRM-Tip kernel build CI_DRM_3867
4f4e4dd52a30 drm-tip: 2018y-03m-02d-16h-28m-21s UTC integration manifest

No testlist changes.

---- Known issues:

Test prime_vgem:
        Subgroup basic-fence-flip:
                pass       -> FAIL       (fi-skl-6260u) fdo#104008

fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:414s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:423s
fi-blb-e6850     total:288  pass:223  dwarn:1   dfail:0   fail:0   skip:64  time:374s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:484s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:279s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:480s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:485s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:468s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:458s
fi-cfl-8700k     total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:398s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:574s
fi-cfl-u         total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:500s
fi-cnl-y3        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:571s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:414s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:288s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:509s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:385s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:410s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:453s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:417s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:452s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:490s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:451s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:493s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:584s
fi-skl-6260u     total:288  pass:267  dwarn:0   dfail:0   fail:1   skip:20  time:424s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:502s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:518s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:486s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:489s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:408s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:433s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:524s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:394s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1047/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: warning for tests/perf_pmu: Handle CPU hotplug failures better (rev5)
  2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
                               ` (3 preceding siblings ...)
  (?)
@ 2018-03-02 23:33             ` Patchwork
  -1 siblings, 0 replies; 30+ messages in thread
From: Patchwork @ 2018-03-02 23:33 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: tests/perf_pmu: Handle CPU hotplug failures better (rev5)
URL   : https://patchwork.freedesktop.org/series/38855/
State : warning

== Summary ==

---- Possible new issues:

Test gem_pwrite:
        Subgroup big-gtt-random:
                pass       -> SKIP       (shard-apl)

---- Known issues:

Test gem_eio:
        Subgroup in-flight:
                incomplete -> PASS       (shard-apl) fdo#104945
Test kms_chv_cursor_fail:
        Subgroup pipe-b-256x256-bottom-edge:
                dmesg-warn -> PASS       (shard-snb) fdo#105185
Test kms_fbcon_fbt:
        Subgroup fbc-suspend:
                incomplete -> PASS       (shard-hsw) fdo#105087
Test kms_flip:
        Subgroup flip-vs-modeset-vs-hang-interruptible:
                pass       -> DMESG-WARN (shard-snb) fdo#104311
Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                pass       -> FAIL       (shard-snb) fdo#103925

fdo#104945 https://bugs.freedesktop.org/show_bug.cgi?id=104945
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#105087 https://bugs.freedesktop.org/show_bug.cgi?id=105087
fdo#104311 https://bugs.freedesktop.org/show_bug.cgi?id=104311
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925

shard-apl        total:3463 pass:1821 dwarn:1   dfail:0   fail:7   skip:1633 time:12522s
shard-hsw        total:3463 pass:1770 dwarn:1   dfail:0   fail:1   skip:1690 time:12100s
shard-snb        total:3463 pass:1360 dwarn:2   dfail:0   fail:2   skip:2099 time:7104s
Blacklisted hosts:
shard-kbl        total:3435 pass:1929 dwarn:8   dfail:0   fail:7   skip:1490 time:9578s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1047/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2018-03-02 23:33 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-23 11:34 [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better Tvrtko Ursulin
2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
2018-02-23 11:58 ` [igt-dev] " Petri Latvala
2018-02-23 11:58   ` Petri Latvala
2018-02-23 14:20   ` Tvrtko Ursulin
2018-02-23 14:20     ` [igt-dev] [Intel-gfx] " Tvrtko Ursulin
2018-02-26 10:03     ` [igt-dev] " Petri Latvala
2018-02-26 10:03       ` [igt-dev] [Intel-gfx] " Petri Latvala
2018-02-26 10:14       ` [igt-dev] " Tomi Sarvela
2018-02-26 10:14         ` [Intel-gfx] " Tomi Sarvela
2018-02-28 10:05         ` [PATCH i-g-t v2] " Tvrtko Ursulin
2018-02-28 10:05           ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:12           ` Petri Latvala
2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
2018-03-02 11:28             ` [PATCH i-g-t v3] " Tvrtko Ursulin
2018-03-02 11:28               ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:39               ` [PATCH i-g-t v4] " Tvrtko Ursulin
2018-03-02 11:39                 ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:42                 ` Chris Wilson
2018-03-02 11:42                   ` [igt-dev] " Chris Wilson
2018-03-02 11:55                   ` [PATCH i-g-t v5] " Tvrtko Ursulin
2018-03-02 11:55                     ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:32             ` [PATCH i-g-t v2] " Chris Wilson
2018-03-02 11:32               ` [igt-dev] " Chris Wilson
2018-03-02 20:16             ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev5) Patchwork
2018-03-02 23:33             ` [igt-dev] ✗ Fi.CI.IGT: warning " Patchwork
2018-02-23 12:37 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better Patchwork
2018-02-23 15:34 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28 11:54 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev2) Patchwork
2018-02-28 14:50 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.