All of lore.kernel.org
 help / color / mirror / Atom feed
From: Petri Latvala <petri.latvala@intel.com>
To: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: igt-dev@lists.freedesktop.org,
	Tomi Sarvela <tomi.p.sarvela@intel.com>,
	Intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
Date: Fri, 2 Mar 2018 13:12:19 +0200	[thread overview]
Message-ID: <20180302111219.wjas3yhqqbafc4ju@platvala-desk.ger.corp.intel.com> (raw)
In-Reply-To: <20180228100555.11734-1-tvrtko.ursulin@linux.intel.com>

On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and reponse times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
>     set. (Petri Latvala)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>


Reviewed-by: Petri Latvala <petri.latvala@intel.com>

with two nitpicks below.




> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_core.c       | 23 +++++++++++++++++++++++
>  lib/igt_core.h       |  1 +
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  1 +
>  tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
>  7 files changed, 110 insertions(+), 7 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_core.c b/lib/igt_core.c
> index c292343de09e..3fd9f529f09f 100644
> --- a/lib/igt_core.c
> +++ b/lib/igt_core.c
> @@ -70,6 +70,7 @@
>  #include "igt_core.h"
>  #include "igt_aux.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_rc.h"
>  
>  #define UNW_LOCAL_ONLY
> @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
>  	}
>  }
>  
> +/**
> + * igt_fatal_error:
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since out test runner (piglit) does support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +		igt_warn("FATAL ERROR - REBOOTING");
> +		igt_sysrq_reboot();
> +	} else {
> +		igt_warn("FATAL ERROR");
> +		for (;;)
> +			sleep(60);
> +	}
> +}
> +
> +
>  /**
>   * igt_can_fail:
>   *
> diff --git a/lib/igt_core.h b/lib/igt_core.h
> index 7af2b4c109fe..66523a208c31 100644
> --- a/lib/igt_core.h
> +++ b/lib/igt_core.h
> @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
>  		       const char *format, ...)
>  	__attribute__((noreturn));
>  void igt_exit(void) __attribute__((noreturn));
> +void igt_fatal_error(void) __attribute__((noreturn));
>  
>  /**
>   * igt_ignore_warn:
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +


Docs for igt_sysrq_reboot?


> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}
> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index a9e53689b35d..b3b8b14a3f01 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -14,6 +14,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..8c75b0641785 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failed to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */

This is assuming what the user has configured igt_fatal_error() to
do. Just a s/so reboot immediately// maybe?


-- 
Petri Latvala
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

WARNING: multiple messages have this Message-ID (diff)
From: Petri Latvala <petri.latvala@intel.com>
To: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: igt-dev@lists.freedesktop.org,
	Tomi Sarvela <tomi.p.sarvela@intel.com>,
	Intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH i-g-t v2] tests/perf_pmu: Handle CPU hotplug failures better
Date: Fri, 2 Mar 2018 13:12:19 +0200	[thread overview]
Message-ID: <20180302111219.wjas3yhqqbafc4ju@platvala-desk.ger.corp.intel.com> (raw)
In-Reply-To: <20180228100555.11734-1-tvrtko.ursulin@linux.intel.com>

On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and reponse times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
>     set. (Petri Latvala)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela@intel.com>


Reviewed-by: Petri Latvala <petri.latvala@intel.com>

with two nitpicks below.




> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_core.c       | 23 +++++++++++++++++++++++
>  lib/igt_core.h       |  1 +
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  1 +
>  tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
>  7 files changed, 110 insertions(+), 7 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_core.c b/lib/igt_core.c
> index c292343de09e..3fd9f529f09f 100644
> --- a/lib/igt_core.c
> +++ b/lib/igt_core.c
> @@ -70,6 +70,7 @@
>  #include "igt_core.h"
>  #include "igt_aux.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_rc.h"
>  
>  #define UNW_LOCAL_ONLY
> @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
>  	}
>  }
>  
> +/**
> + * igt_fatal_error:
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since out test runner (piglit) does support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +	if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +		igt_warn("FATAL ERROR - REBOOTING");
> +		igt_sysrq_reboot();
> +	} else {
> +		igt_warn("FATAL ERROR");
> +		for (;;)
> +			sleep(60);
> +	}
> +}
> +
> +
>  /**
>   * igt_can_fail:
>   *
> diff --git a/lib/igt_core.h b/lib/igt_core.h
> index 7af2b4c109fe..66523a208c31 100644
> --- a/lib/igt_core.h
> +++ b/lib/igt_core.h
> @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file,
>  		       const char *format, ...)
>  	__attribute__((noreturn));
>  void igt_exit(void) __attribute__((noreturn));
> +void igt_fatal_error(void) __attribute__((noreturn));
>  
>  /**
>   * igt_ignore_warn:
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +


Docs for igt_sysrq_reboot?


> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}
> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index a9e53689b35d..b3b8b14a3f01 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -14,6 +14,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..8c75b0641785 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failed to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */

This is assuming what the user has configured igt_fatal_error() to
do. Just a s/so reboot immediately// maybe?


-- 
Petri Latvala
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2018-03-02 11:12 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-23 11:34 [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better Tvrtko Ursulin
2018-02-23 11:34 ` [Intel-gfx] " Tvrtko Ursulin
2018-02-23 11:58 ` [igt-dev] " Petri Latvala
2018-02-23 11:58   ` Petri Latvala
2018-02-23 14:20   ` Tvrtko Ursulin
2018-02-23 14:20     ` [igt-dev] [Intel-gfx] " Tvrtko Ursulin
2018-02-26 10:03     ` [igt-dev] " Petri Latvala
2018-02-26 10:03       ` [igt-dev] [Intel-gfx] " Petri Latvala
2018-02-26 10:14       ` [igt-dev] " Tomi Sarvela
2018-02-26 10:14         ` [Intel-gfx] " Tomi Sarvela
2018-02-28 10:05         ` [PATCH i-g-t v2] " Tvrtko Ursulin
2018-02-28 10:05           ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:12           ` Petri Latvala [this message]
2018-03-02 11:12             ` [Intel-gfx] " Petri Latvala
2018-03-02 11:28             ` [PATCH i-g-t v3] " Tvrtko Ursulin
2018-03-02 11:28               ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:39               ` [PATCH i-g-t v4] " Tvrtko Ursulin
2018-03-02 11:39                 ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:42                 ` Chris Wilson
2018-03-02 11:42                   ` [igt-dev] " Chris Wilson
2018-03-02 11:55                   ` [PATCH i-g-t v5] " Tvrtko Ursulin
2018-03-02 11:55                     ` [igt-dev] " Tvrtko Ursulin
2018-03-02 11:32             ` [PATCH i-g-t v2] " Chris Wilson
2018-03-02 11:32               ` [igt-dev] " Chris Wilson
2018-03-02 20:16             ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev5) Patchwork
2018-03-02 23:33             ` [igt-dev] ✗ Fi.CI.IGT: warning " Patchwork
2018-02-23 12:37 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better Patchwork
2018-02-23 15:34 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28 11:54 ` [igt-dev] ✓ Fi.CI.BAT: success for tests/perf_pmu: Handle CPU hotplug failures better (rev2) Patchwork
2018-02-28 14:50 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180302111219.wjas3yhqqbafc4ju@platvala-desk.ger.corp.intel.com \
    --to=petri.latvala@intel.com \
    --cc=Intel-gfx@lists.freedesktop.org \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=tomi.p.sarvela@intel.com \
    --cc=tursulin@ursulin.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.