All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog
@ 2019-04-18 16:53 Carlos Santa
  2019-04-18 17:39 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev4) Patchwork
  2019-04-19 21:29 ` [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Antonio Argenziano
  0 siblings, 2 replies; 4+ messages in thread
From: Carlos Santa @ 2019-04-18 16:53 UTC (permalink / raw)
  To: igt-dev; +Cc: Ursulin Tvrtko

This test adds basic set of tests to reset the different
GPU engines through the gpu watchdog timer.

Credits to Antonio for the original codebase this is based on.

v2: remove gem_context_get_param() during set (Antonio)
    remove clearing of the engines_threshold[] in the default case
    inside context_set_watchdog(). (Antonio)
    fix indexing when creating low/high priority contexts
    get rid of 2 threads idea (Antonio)
    fix context prio bug due to wrong indexing (Antonio)

Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 tests/Makefile.sources    |   3 +
 tests/i915/gem_watchdog.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/meson.build         |   1 +
 3 files changed, 370 insertions(+)
 create mode 100644 tests/i915/gem_watchdog.c

diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 214698d..7f17f20 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
 TESTS_progs += gem_wait
 gem_wait_SOURCES = i915/gem_wait.c
 
+TESTS_progs += gem_watchdog
+gem_watchdog_SOURCES = i915/gem_watchdog.c
+
 TESTS_progs += gem_workarounds
 gem_workarounds_SOURCES = i915/gem_workarounds.c
 
diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
new file mode 100644
index 0000000..e6c1abe
--- /dev/null
+++ b/tests/i915/gem_watchdog.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "igt.h"
+#include "igt_sysfs.h"
+#include "sw_sync.h"
+
+#include <pthread.h>
+#include <fcntl.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/signal.h>
+#include "i915/gem_ring.h"
+
+#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
+#define LOCAL_I915_EXEC_BSD_RING1 	(1 << LOCAL_I915_EXEC_BSD_SHIFT)
+#define LOCAL_I915_EXEC_BSD_RING2 	(2 << LOCAL_I915_EXEC_BSD_SHIFT)
+
+#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
+#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
+#define HIGH 1
+#define LOW 0
+#define WATCHDOG_THRESHOLD (100)
+#define MAX_ENGINES 5
+#define RENDER_CLASS 0
+#define VIDEO_DECODE_CLASS 1
+#define VIDEO_ENHANCEMENT_CLASS 2
+#define COPY_ENGINE_CLASS 3
+#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
+
+/* 100 ms expressed in nanoseconds, used to scale spinner timeouts */
+static const uint64_t timeout_100ms = 100000000LL;
+
+/*
+ * Local copy of the proposed i915 uAPI payload for the per-engine GPU
+ * watchdog, passed via LOCAL_I915_CONTEXT_PARAM_WATCHDOG (defined above).
+ * One entry per engine; an array of these is handed to
+ * gem_context_set_param() in context_set_watchdog().
+ */
+struct drm_i915_gem_watchdog_timeout {
+	union {
+		struct {
+			/*
+			 * Engine class & instance to be configured or queried.
+			 */
+			__u16 engine_class;
+			__u16 engine_instance;
+		};
+		/* Index based addressing mode */
+		__u32 index;
+	};
+	/* GPU Engine watchdog resets timeout in us */
+	__u32 timeout_us;
+};
+
+/*
+ * Discard any pending GPU error state so a later hang produces a fresh
+ * capture.  Writing anything to the sysfs "error" node clears it.
+ */
+static void clear_error_state(int fd)
+{
+	int sysfs_dir = igt_sysfs_open(fd);
+
+	if (sysfs_dir < 0)
+		return;
+
+	igt_sysfs_set(sysfs_dir, "error", "");
+	close(sysfs_dir);
+}
+
+/*
+ * Arm the GPU watchdog for @ctx_id: the engine class selected by
+ * @engine_id gets @threshold microseconds, every other class stays at 0
+ * (disabled) courtesy of the memset below.
+ */
+static void context_set_watchdog(int fd, int engine_id,
+                                 unsigned ctx_id, unsigned threshold)
+{
+	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
+	struct drm_i915_gem_context_param arg = {
+		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
+		.ctx_id = ctx_id,
+		.size = sizeof(engines_threshold),
+		/* to_user_pointer() avoids a raw cast that breaks on 32-bit */
+		.value = to_user_pointer(&engines_threshold),
+	};
+
+	memset(&engines_threshold, 0, sizeof(engines_threshold));
+
+	switch (engine_id & I915_EXEC_RING_MASK) {
+	case I915_EXEC_RENDER:
+		engines_threshold[RENDER_CLASS].timeout_us = threshold;
+		break;
+	case I915_EXEC_BSD:
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
+		break;
+	case I915_EXEC_VEBOX:
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
+		break;
+	default:
+		break;
+	}
+
+	gem_context_set_param(fd, &arg);
+}
+
+/*
+ * Build and submit a batch on @exec_id from @ctx_id that relocates into
+ * @target, returning the execbuf out-fence in fence[fence_index].  If
+ * @handle is non-NULL, only create the batch object and hand it back
+ * without submitting.  A spinner keeps @target busy for @timeout ns
+ * before the execbuf is queued.
+ */
+static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout, int *fence, int fence_index)
+{
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_execbuffer2 execbuf;
+	igt_spin_t *spin = NULL;
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+	int i = 0;
+
+	gem_quiescent_gpu(fd);
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	memset(&obj, 0, sizeof(obj));
+	memset(&reloc, 0, sizeof(reloc));
+
+	execbuf.buffers_ptr = to_user_pointer(obj);
+
+	execbuf.buffer_count = 2;
+	execbuf.flags = exec_id | I915_EXEC_FENCE_OUT;
+
+	obj[0].handle = target;
+	obj[1].handle = gem_create(fd, 4096);
+
+	obj[1].relocation_count = 1;
+	obj[1].relocs_ptr = to_user_pointer(&reloc);
+
+	reloc.target_handle = obj[0].handle;
+	reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
+	reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
+	reloc.delta = offset * sizeof(uint32_t);
+
+	reloc.offset = i * sizeof(uint32_t);
+	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+
+	__sync_synchronize();
+
+	if (handle) {
+		*handle = obj[1].handle;
+		return;
+	}
+
+	gem_sync(fd, obj[1].handle);
+	execbuf.rsvd1 = ctx_id;
+	execbuf.rsvd2 = -1;
+
+	/*
+	 * igt_spin_batch_*() was renamed in lib/igt_dummyload; use the
+	 * current igt_spin_*() API so the test builds against tip-of-tree
+	 * IGT (this is what broke the Fi.CI.BAT build).
+	 */
+	spin = igt_spin_new(fd, .dependency = obj[0].handle);
+	igt_spin_set_timeout(spin, timeout);
+	igt_assert(gem_bo_busy(fd, obj[0].handle));
+
+	gem_execbuf_wr(fd, &execbuf);
+	igt_spin_free(fd, spin);
+
+	/* The out-fence fd lives in the upper 32 bits of rsvd2 */
+	fence[fence_index] = execbuf.rsvd2 >> 32;
+
+	gem_close(fd, obj[1].handle);
+	gem_quiescent_gpu(fd);
+}
+
+/*
+ * Create a fresh context at MAX_PRIO (HIGH), MIN_PRIO (LOW) or the
+ * default priority (any other @ctx_prio value) and return its id.
+ */
+static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
+{
+	uint32_t ctx = gem_context_create(fd);
+
+	switch (ctx_prio) {
+	case HIGH:
+		/* asserting variant: fail loudly if the ioctl errors out */
+		gem_context_set_priority(fd, ctx, MAX_PRIO);
+		igt_info("Setting MAX priority %d\n", ctx_prio);
+		break;
+	case LOW:
+		gem_context_set_priority(fd, ctx, MIN_PRIO);
+		igt_info("Setting MIN priority %d\n", ctx_prio);
+		break;
+	default:
+		igt_info("Ignoring context priority %d\n", ctx_prio);
+		break;
+	}
+	igt_debug("ctx id: %u\n", ctx);
+	return ctx;
+}
+
+/* Submit a hanging batch on @ring from @ctx_id and wait for it to end. */
+static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id,  unsigned flags)
+{
+	igt_hang_t hang = igt_hang_ctx(fd, ctx_id, ring, flags);
+
+	gem_sync(fd, hang.spin->handle);
+}
+
+/*
+ * Run long batches on RCS0 and VECS0 from two contexts with the given
+ * priorities, arm the GPU watchdog and inject hangs so the batches are
+ * cancelled.  Negative priorities mean "default" and also gate whether
+ * the RCS0 batch is cancelled.
+ */
+static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, int prio_ctx1, int prio_ctx2)
+{
+	uint32_t ctx[2];
+	uint32_t scratch[2];
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	const uint64_t batch_timeout_ms = timeout_100ms * 3;
+	int i = 0, engine_id;
+	int *fence;
+
+	igt_require(nengine);
+
+	fence = malloc(sizeof(int) * 2);
+	igt_assert(fence);	/* abort instead of silently exit(1) on OOM */
+
+	for (i = 0; i < 2; i++)
+		scratch[i] = gem_create(fd, 4096);
+
+	/* Create some work on RCS0 */
+	engine_id = 1;
+	ctx[0] = create_ctx_with_priority(fd, prio_ctx1);
+	batch_buffer_factory(fd, ctx[0], engine_id, scratch[0], 0, NULL, batch_timeout_ms, fence, 0);
+
+	/* Cancel batch on RCS0 w/ gpu watchdog timeout */
+	if (prio_ctx1 < 0 && prio_ctx2 < 0) {
+		context_set_watchdog(fd, engine_id, ctx[0], WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, ctx[0], flags);
+	}
+
+#if 0
+	/* Now check the engine was reset successfully*/
+	igt_assert_eq(sync_fence_status(*fence), -EIO);
+#endif
+	close(fence[0]);
+
+	/* Create some work on VECS0 */
+	engine_id = 4;
+	ctx[1] = create_ctx_with_priority(fd, prio_ctx2);
+	batch_buffer_factory(fd, ctx[1], engine_id, scratch[1], 0, NULL, batch_timeout_ms, fence, 1);
+
+	/* Cancel batch on VECS0 w/ gpu watchdog timeout */
+	context_set_watchdog(fd, engine_id, ctx[1], WATCHDOG_THRESHOLD);
+	clear_error_state(fd);
+	inject_hang(fd, engine_id, ctx[1], flags);
+
+#if 0
+	/* Now check the engine was reset successfully */
+	igt_assert_eq(sync_fence_status(*fence), -EIO);
+#endif
+	close(fence[1]);
+
+	for (i = 0; i < 2; i++) {
+		gem_context_destroy(fd, ctx[i]);
+		gem_close(fd, scratch[i]);
+	}
+
+	free(fence);	/* previously leaked */
+}
+
+/*
+ * For each physical engine (BLT excluded — no watchdog support there),
+ * queue work, arm the GPU watchdog, and inject a hang only on the engine
+ * whose name matches @name.
+ *
+ * NOTE(review): @engine_id is currently unused; kept so the subtest
+ * call sites remain unchanged.  Function name carries a typo
+ * ("watchodg") preserved for the same reason.
+ */
+static void gpu_watchodg_hang_long_batch_single_engine(int fd, unsigned engine_id, const char *name)
+{
+	uint32_t ctx[16];
+	uint32_t scratch[16];
+	int *fence;
+	unsigned nengine = 0;
+	unsigned engine;
+	int i;
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	const uint64_t batch_timeout_ms = timeout_100ms * 4;
+
+	fence = malloc(sizeof(int) * 16);
+	igt_assert(fence);	/* abort instead of silently exit(1) on OOM */
+
+	for_each_physical_engine(fd, engine) {
+		/* no support for gpu watchdog on BLT */
+		if (strncmp(e__->name, "blt", 3) == 0)
+			continue;
+
+		scratch[nengine] = gem_create(fd, 4096);
+		ctx[nengine] = create_ctx_with_priority(fd, -1);
+
+		/* Create some work on the engine using the same ctx */
+		batch_buffer_factory(fd, ctx[nengine], e__->exec_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
+
+		/* Set the gpu watchdog timeout */
+		context_set_watchdog(fd, e__->exec_id, ctx[nengine], WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+
+		/* Cancel only the batch requested */
+		if (strncmp(e__->name, name, 4) == 0)
+			inject_hang(fd, e__->exec_id, ctx[nengine], flags);
+#if 0
+		igt_info("fence:%d, fence status : %d EIO: %d ctx_id:%d\n", fence[nengine], sync_fence_status(fence[nengine]), -EIO, ctx[nengine]);
+		/* Now check the engine was reset */
+		igt_assert_eq(sync_fence_status(fence[nengine]), -EIO);
+#endif
+		nengine++;
+	}
+
+	for (i = 0; i < nengine; i++) {
+		close(fence[i]);
+		gem_context_destroy(fd, ctx[i]);
+		gem_close(fd, scratch[i]);
+	}
+
+	free(fence);	/* previously leaked */
+}
+
+igt_main
+{
+	int fd;
+	unsigned int nengine = 0;
+	unsigned int engine;
+
+	igt_skip_on_simulation();
+
+	igt_fixture {
+		fd = drm_open_driver(DRIVER_INTEL);
+		igt_require_gem(fd);
+
+		/* Count the physical engines present on this GPU */
+		for_each_physical_engine(fd, engine)
+			nengine++;
+		igt_require(nengine);
+	}
+
+	igt_subtest_group {
+
+		igt_subtest_f("low-prio-ctx-wo-gpu-watchdog-and-high-prio-ctx-with-gpu-watchdog") {
+			int prio1 = LOW;
+			int prio2 = HIGH;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio1, prio2);
+		}
+
+		for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {
+			/* no support for gpu watchdog on BLT */
+			if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
+				continue;
+
+			igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
+				igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
+				igt_debug("exec_id: %d\n", e->exec_id);
+				gpu_watchodg_hang_long_batch_single_engine(fd, e->exec_id | e->flags, e->name);
+			}
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
+			int prio = -1;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio, prio);
+		}
+	}
+
+	igt_fixture {
+		close(fd);
+	}
+}
diff --git a/tests/meson.build b/tests/meson.build
index 5167a6c..b281b75 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -210,6 +210,7 @@ i915_progs = [
 	'gem_unref_active_buffers',
 	'gem_userptr_blits',
 	'gem_wait',
+        'gem_watchdog',
 	'gem_workarounds',
 	'gem_write_read_ring_switch',
 	'i915_fb_tiling',
-- 
2.7.4

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev4)
  2019-04-18 16:53 [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
@ 2019-04-18 17:39 ` Patchwork
  2019-04-19 21:29 ` [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Antonio Argenziano
  1 sibling, 0 replies; 4+ messages in thread
From: Patchwork @ 2019-04-18 17:39 UTC (permalink / raw)
  To: Carlos Santa; +Cc: igt-dev

== Series Details ==

Series: tests/gem_watchdog: Initial set of tests for GPU watchdog (rev4)
URL   : https://patchwork.freedesktop.org/series/50041/
State : failure

== Summary ==

IGT patchset build failed on latest successful build
a765aa108105804c19096554447ad0cb71f64fc3 lib/igt_dummyload: Get rid of 'batch' on spinner accessors

[334/458] Linking target tests/gem_ring_sync_loop.
[335/458] Linking target tests/gem_softpin.
[336/458] Linking target tests/gem_set_tiling_vs_pwrite.
[337/458] Linking target tests/gem_set_tiling_vs_gtt.
[338/458] Linking target tests/gem_shrink.
[339/458] Linking target tests/gem_storedw_loop.
[340/458] Linking target tests/gem_stolen.
[341/458] Linking target tests/gem_spin_batch.
[342/458] Linking target tests/gem_storedw_batches_loop.
[343/458] Linking target tests/gem_streaming_writes.
[344/458] Linking target tests/gem_threaded_access_tiled.
[345/458] Linking target tests/gem_sync.
[346/458] Linking target tests/gem_tiled_partial_pwrite_pread.
[347/458] Linking target tests/gem_tiled_blits.
[348/458] Linking target tests/gem_tiled_fence_blits.
[349/458] Linking target tests/gem_tiled_pread_basic.
[350/458] Linking target tests/gem_userptr_blits.
[351/458] Linking target tests/gem_tiled_pread_pwrite.
[352/458] Linking target tests/gem_tiled_wb.
[353/458] Linking target tests/gem_tiled_swapping.
[354/458] Linking target tests/gem_tiling_max_stride.
[355/458] Linking target tests/gem_tiled_wc.
[356/458] Linking target tests/gem_unfence_active_buffers.
[357/458] Linking target tests/gem_unref_active_buffers.
[358/458] Linking target tests/i915_fb_tiling.
[359/458] Linking target tests/gem_write_read_ring_switch.
[360/458] Compiling C object 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o'.
FAILED: tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o 
ccache cc -Itests/tests@@gem_watchdog@exe -Itests -I../tests -I../include/drm-uapi -Ilib -I../lib -I../lib/stubs/syscalls -I. -I../ -I/usr/include/cairo -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include -I/usr/include/pixman-1 -I/usr/include/libpng16 -I/usr/include/freetype2 -I/usr/include/libpng12 -I/opt/igt/include -I/opt/igt/include/libdrm -I/usr/include/x86_64-linux-gnu -I/usr/include/alsa -I/usr/include -I/usr/include/libdrm -I/usr/include/libdrm/nouveau -I/home/cidrm/kernel_headers/include -fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=gnu11 -O0 -g -D_GNU_SOURCE -include config.h -Wbad-function-cast -Wdeclaration-after-statement -Wformat=2 -Wimplicit-fallthrough=0 -Wlogical-op -Wmissing-declarations -Wmissing-format-attribute -Wmissing-noreturn -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wpointer-arith -Wredundant-decls -Wshadow -Wstrict-prototypes -Wuninitialized -Wunused -Wno-clobbered -Wno-maybe-uninitialized -Wno-missing-field-initializers -Wno-pointer-arith -Wno-sign-compare -Wno-type-limits -Wno-unused-parameter -Wno-unused-result -Werror=address -Werror=array-bounds -Werror=implicit -Werror=init-self -Werror=int-to-pointer-cast -Werror=main -Werror=missing-braces -Werror=nonnull -Werror=pointer-to-int-cast -Werror=return-type -Werror=sequence-point -Werror=trigraphs -Werror=write-strings -pthread  -MD -MQ 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o' -MF 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o.d' -o 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o' -c ../tests/i915/gem_watchdog.c
../tests/i915/gem_watchdog.c: In function ‘batch_buffer_factory’:
../tests/i915/gem_watchdog.c:165:12: error: implicit declaration of function ‘igt_spin_batch_new’; did you mean ‘igt_spin_new’? [-Werror=implicit-function-declaration]
     spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
            ^~~~~~~~~~~~~~~~~~
            igt_spin_new
../tests/i915/gem_watchdog.c:165:12: warning: nested extern declaration of ‘igt_spin_batch_new’ [-Wnested-externs]
../tests/i915/gem_watchdog.c:165:35: error: expected expression before ‘.’ token
     spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
                                   ^
../tests/i915/gem_watchdog.c:166:5: error: implicit declaration of function ‘igt_spin_batch_set_timeout’; did you mean ‘igt_spin_set_timeout’? [-Werror=implicit-function-declaration]
     igt_spin_batch_set_timeout(spin, timeout);
     ^~~~~~~~~~~~~~~~~~~~~~~~~~
     igt_spin_set_timeout
../tests/i915/gem_watchdog.c:166:5: warning: nested extern declaration of ‘igt_spin_batch_set_timeout’ [-Wnested-externs]
../tests/i915/gem_watchdog.c:170:5: error: implicit declaration of function ‘igt_spin_batch_free’; did you mean ‘igt_spin_free’? [-Werror=implicit-function-declaration]
     igt_spin_batch_free(fd, spin);
     ^~~~~~~~~~~~~~~~~~~
     igt_spin_free
../tests/i915/gem_watchdog.c:170:5: warning: nested extern declaration of ‘igt_spin_batch_free’ [-Wnested-externs]
cc1: some warnings being treated as errors
ninja: build stopped: subcommand failed.

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-18 16:53 [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
  2019-04-18 17:39 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev4) Patchwork
@ 2019-04-19 21:29 ` Antonio Argenziano
  2019-04-23  0:04   ` Carlos Santa
  1 sibling, 1 reply; 4+ messages in thread
From: Antonio Argenziano @ 2019-04-19 21:29 UTC (permalink / raw)
  To: Carlos Santa, igt-dev; +Cc: Ursulin Tvrtko



On 18/04/19 09:53, Carlos Santa wrote:
> This test adds basic set of tests to reset the different
> GPU engines through the gpu watchdog timer.
> 
> Credits to Antonio for the original codebase this is based on.
> 
> v2: remove gem_context_get_param() during set (Antonio)
>      remove clearing of the engines_threshold[] in the default case
>      inside context_set_watchdog(). (Antonio)
>      fix indexing when creating low/high priority contexts
>      get rid of 2 threads idea (Antonio)
>      fix context prio bug due to wrong indexing (Antonio)
> 
> Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>   tests/Makefile.sources    |   3 +
>   tests/i915/gem_watchdog.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++
>   tests/meson.build         |   1 +
>   3 files changed, 370 insertions(+)
>   create mode 100644 tests/i915/gem_watchdog.c
> 
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 214698d..7f17f20 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
>   TESTS_progs += gem_wait
>   gem_wait_SOURCES = i915/gem_wait.c
>   
> +TESTS_progs += gem_watchdog
> +gem_watchdog_SOURCES = i915/gem_watchdog.c
> +
>   TESTS_progs += gem_workarounds
>   gem_workarounds_SOURCES = i915/gem_workarounds.c
>   
> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> new file mode 100644
> index 0000000..e6c1abe
> --- /dev/null
> +++ b/tests/i915/gem_watchdog.c
> @@ -0,0 +1,366 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +#include "igt.h"
> +#include "igt_sysfs.h"
> +#include "sw_sync.h"
> +
> +#include <pthread.h>
> +#include <fcntl.h>
> +
> +#include <sys/ioctl.h>
> +#include <sys/poll.h>
> +#include <sys/signal.h>
> +#include "i915/gem_ring.h"
> +
> +#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
> +#define LOCAL_I915_EXEC_BSD_RING1 	(1 << LOCAL_I915_EXEC_BSD_SHIFT)
> +#define LOCAL_I915_EXEC_BSD_RING2 	(2 << LOCAL_I915_EXEC_BSD_SHIFT)
> +
> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> +#define HIGH 1
> +#define LOW 0
> +#define WATCHDOG_THRESHOLD (100)
> +#define MAX_ENGINES 5
> +#define RENDER_CLASS 0
> +#define VIDEO_DECODE_CLASS 1
> +#define VIDEO_ENHANCEMENT_CLASS 2
> +#define COPY_ENGINE_CLASS 3
> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> +
> +const uint64_t timeout_100ms = 100000000LL;
> +int num;
> +
> +struct drm_i915_gem_watchdog_timeout {
> +	union {
> +		struct {
> +			/*
> +			 * Engine class & instance to be configured or queried.
> +			 */
> +			__u16 engine_class;
> +			__u16 engine_instance;
> +		};
> +		/* Index based addressing mode */
> +		__u32 index;
> +	};
> +	/* GPU Engine watchdog resets timeout in us */
> +	__u32 timeout_us;
> +};
> +
> +static void clear_error_state(int fd)
> +{
> +	int dir;
> +
> +	dir = igt_sysfs_open(fd);
> +
> +	if (dir < 0)
> +		return;
> +
> +	/* Any write to the error state clears it */
> +	igt_sysfs_set(dir, "error", "");
> +	close(dir);
> +}
> +
> +static void context_set_watchdog(int fd, int engine_id,
> +                                 unsigned ctx_id, unsigned threshold)
> +{
> +	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
> +	struct drm_i915_gem_context_param arg = {
> +		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> +		.ctx_id = ctx_id,
> +		.size = sizeof(engines_threshold),
> +		.value = (uint64_t)&engines_threshold
> +	};
> +
> +	memset(&engines_threshold, 0, sizeof(engines_threshold));
> +
> +	switch (engine_id & I915_EXEC_RING_MASK) {
> +	case I915_EXEC_RENDER:
> +		engines_threshold[RENDER_CLASS].timeout_us = threshold;
> +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;

nit: struct already initialized to 0, no need to explicitly zero its 
elements.

> +		break;
> +	case I915_EXEC_BSD:
> +		engines_threshold[RENDER_CLASS].timeout_us = 0;
> +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
> +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> +		break;
> +	case I915_EXEC_VEBOX:
> +		engines_threshold[RENDER_CLASS].timeout_us = 0;
> +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	gem_context_set_param(fd, &arg);
> +}
> +
> +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout, int *fence, int fence_index)
> +{
> +    struct drm_i915_gem_exec_object2 obj[2];
> +    struct drm_i915_gem_relocation_entry reloc;
> +    struct drm_i915_gem_execbuffer2 execbuf;
> +    igt_spin_t *spin = NULL;
> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
> +    int i = 0;
> +
> +    gem_quiescent_gpu(fd);
> +
> +    memset(&execbuf, 0, sizeof(execbuf));
> +    memset(&obj, 0, sizeof(obj));
> +    memset(&reloc, 0, sizeof(reloc));
> +
> +    execbuf.buffers_ptr = to_user_pointer(obj);
> +
> +    execbuf.buffer_count = 2;
> +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
> +
> +    obj[0].handle = target;
> +    obj[1].handle = gem_create(fd, 4096);
> +
> +    obj[1].relocation_count = 1;
> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
> +
> +    reloc.target_handle = obj[0].handle;
> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> +    reloc.delta = offset * sizeof(uint32_t);
> +
> +    reloc.offset = i * sizeof(uint32_t);
> +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> +
> +    __sync_synchronize();
> +
> +    if (handle) {
> +        *handle = obj[1].handle;
> +        return;
> +    }
> +
> +    gem_sync(fd, obj[1].handle);
> +    execbuf.rsvd1 = ctx_id;
> +    execbuf.rsvd2 = -1;
> +
> +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> +    igt_spin_batch_set_timeout(spin, timeout);
> +    igt_assert(gem_bo_busy(fd, obj[0].handle));
> +
> +    gem_execbuf_wr(fd, &execbuf);
> +    igt_spin_batch_free(fd, spin);
> +
> +    fence[fence_index] = execbuf.rsvd2 >> 32;
> +
> +    gem_close(fd, obj[1].handle);
> +    gem_quiescent_gpu(fd);
> +}
> +
> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> +{
> +	uint32_t ctx = gem_context_create(fd);
> +
> +	switch (ctx_prio) {
> +	case HIGH:
> +		__gem_context_set_priority(fd, ctx, MAX_PRIO);

This helper returns an error code, need to check it or use 
gem_context_set_priority() which will assert if the IOCTL fails.

> +		igt_info("Setting MAX priority %d\n", ctx_prio);
> +		break;
> +	case LOW:
> +		__gem_context_set_priority(fd, ctx, MIN_PRIO);
> +		igt_info("Setting MIN priority %d\n", ctx_prio);
> +		break;
> +	default:
> +		igt_info("Ignoring context priority %d\n", ctx_prio);
> +		break;
> +	}
> +	printf("ctx id: %u\n",ctx);

Remember to change this to a log once you send the PATCH version.

> +	return ctx;
> +}
> +
> +static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id,  unsigned flags)
> +{
> +	igt_hang_t hang;
> +	hang = igt_hang_ctx(fd, ctx_id, ring, flags);
> +	gem_sync(fd, hang.spin->handle);
> +}
> +
> +static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, int prio_ctx1, int prio_ctx2)
> +{
> +	uint32_t ctx[2];
> +	uint32_t scratch[2];
> +	unsigned flags = HANG_ALLOW_CAPTURE;
> +	const uint64_t batch_timeout_ms = timeout_100ms * 3;
> +	int i = 0, engine_id;
> +	int *fence = 0;
> +
> +	igt_require(nengine);
> +
> +	fence = (int *)malloc(sizeof(int)*2);
> +
> +	if (!fence) {
> +		igt_info("Out of memory\n");
> +		exit(1);
> +	}
> +
> +	for (i = 0; i < 2; i++) {
> +		scratch[i] = gem_create(fd, 4096);
> +	}
> +
> +	/* Create some work on RCS0 */
> +	engine_id = 1;

Not that many combinations I think we can iterate over all engines in a 
reasonable time.

> +	ctx[0] = create_ctx_with_priority(fd, prio_ctx1);
> +	batch_buffer_factory(fd, ctx[0], engine_id, scratch[0], 0, NULL, batch_timeout_ms, fence, 0);

Note that the execbuf used in batch_buffer_factory() is not the same 
used in inject_hang().

> +
> +	/* Cancel batch on RCS0 w/ gpu watchdog timeout */
> +	if(prio_ctx1 < 0 && prio_ctx2 < 0) {
> +		context_set_watchdog(fd, engine_id, ctx[0], WATCHDOG_THRESHOLD);
> +		clear_error_state(fd);
> +		inject_hang(fd, engine_id, ctx[0], flags);
> +	}
> +
> +#if 0

Is the code wrapped in if 0 statements going to be included in the final 
patch?

> +	/* Now check the engine was reset successfully*/
> +	igt_assert_eq(sync_fence_status(*fence), -EIO);
> +#endif
> +	close(fence[0]);

nit: you can close all fences at the very end of the test.

> +
> +	/* Create some work on VECS0 */
> +	engine_id = 4;
> +	ctx[1] = create_ctx_with_priority(fd, prio_ctx2);
> +	batch_buffer_factory(fd, ctx[1], engine_id, scratch[1], 0, NULL, batch_timeout_ms, fence, 1);
> +
> +	/* Cancel batch on RCS0 w/ gpu watchdog timeout */
> +	context_set_watchdog(fd, engine_id, ctx[1], WATCHDOG_THRESHOLD);
> +	clear_error_state(fd);
> +	inject_hang(fd, engine_id, ctx[1], flags);

You don't want to sync after the hang in this test I think. It would be 
better to send plenty of contexts with different thresholds and have 
them active at ~ the same time.

> +
> +#if 0
> +	/* Now check the engine was reset successfully */
> +	igt_assert_eq(sync_fence_status(*fence), -EIO);
> +#endif
> +	close(fence[1]);
> +
> +	for (i = 0; i < 2; i++) {
> +		gem_context_destroy(fd, ctx[i]);
> +		gem_close(fd, scratch[i]);
> +	}
> +}
> +
> +static void gpu_watchodg_hang_long_batch_single_engine(int fd, unsigned engine_id, const char *name)

engine_id is unused in this function.

> +{
> +	uint32_t ctx[16];
> +	uint32_t scratch[16];
> +	int *fence;
> +	unsigned nengine = 0;
> +	unsigned engine;
> +
> +	int i;
> +	unsigned flags = HANG_ALLOW_CAPTURE;
> +	const uint64_t batch_timeout_ms = timeout_100ms*4;
> +
> +	fence = (int *)malloc(sizeof(int)*16);
> +
> +	if (!fence) {
> +		igt_info("Out of memory\n");
> +		exit(1);
> +	}
> +
> +	for_each_physical_engine(fd, engine) {

Just a suggestion: you could use for_each_engine_class_instance() and 
skip based on the class. This should fit in better with the new engine 
interface Andi is working on.

> +		/* no support for gpu watchdog on BLT */
> +		if ( strncmp(e__->name, "blt", 3) == 0 )
> +			continue;
> +
> +		scratch[nengine] = gem_create(fd, 4096);
> +		ctx[nengine] = create_ctx_with_priority(fd, -1);
> +
> +		/* Create some work on the engine using the same ctx*/
> +		batch_buffer_factory(fd, ctx[nengine], e__->exec_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);

Why are we sending work before sending the hanging batch? I think it 
would be better not to send anything down and maybe have a different 
test where you:

	- Submit work from context A
	- Set a non 0 threshold on context A
	- Submit more work on context A

And expect the work before you set the threshold is not reset and what 
was sent after is.

> +
> +		/* Set the gpu watchdog timeout */
> +		context_set_watchdog(fd, e__->exec_id, ctx[nengine], WATCHDOG_THRESHOLD);
> +		clear_error_state(fd);
> +
> +		/* Cancel only the batch requested */
> +		if ( strncmp(e__->name, name, 4) == 0 )
> +			inject_hang(fd, e__->exec_id, ctx[nengine], flags);
> +#if 0
> +		igt_info("fence:%d, fence status : %d EIO: %d ctx_id:%d\n",fence[nengine], sync_fence_status(fence[nengine]),-EIO, ctx[nengine]);
> +		/* Now check the engine was reset */
> +		igt_assert_eq(sync_fence_status(fence[nengine]), -EIO);
> +#endif
> +		nengine++;
> +	}
> +
> +	for (i = 0; i < nengine; i++) {
> +		close(fence[i]);
> +		gem_context_destroy(fd, ctx[i]);
> +		gem_close(fd, scratch[i]);
> +	}
> +}
> +
> +igt_main
> +{
> +	int fd;
> +	unsigned int nengine = 0;
> +	unsigned int engine;
> +
> +	igt_skip_on_simulation();
> +
> +	igt_fixture {
> +		fd = drm_open_driver(DRIVER_INTEL);
> +		igt_require_gem(fd);
> +
> +		for_each_physical_engine(fd, engine)
> +			nengine++;
> +		igt_require(nengine);
> +	}
> +
> +	igt_subtest_group {
> +
> +		igt_subtest_f("low-prio-ctx-wo-gpu-watchdog-and-high-prio-ctx-with-gpu-watchdog") {
> +			int prio1 = LOW;
> +			int prio2 = HIGH;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio1, prio2);
> +		}
> +
> +		for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {

You are looping on each engine inside the test as well.

Antonio

> +			/* no support for gpu watchdog on BLT */
> +			if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
> +				continue;
> +
> +			igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
> +				igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
> +				printf("below id: %d\n",e->exec_id);
> +				gpu_watchodg_hang_long_batch_single_engine(fd, e->exec_id | e->flags, e->name);
> +			}
> +		}
> +
> +		igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
> +			int prio = -1;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio, prio);
> +		}
> +    }
> +
> +    igt_fixture {
> +	close(fd);
> +    }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 5167a6c..b281b75 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -210,6 +210,7 @@ i915_progs = [
>   	'gem_unref_active_buffers',
>   	'gem_userptr_blits',
>   	'gem_wait',
> +        'gem_watchdog',
>   	'gem_workarounds',
>   	'gem_write_read_ring_switch',
>   	'i915_fb_tiling',
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-19 21:29 ` [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Antonio Argenziano
@ 2019-04-23  0:04   ` Carlos Santa
  0 siblings, 0 replies; 4+ messages in thread
From: Carlos Santa @ 2019-04-23  0:04 UTC (permalink / raw)
  To: Antonio Argenziano, igt-dev; +Cc: Ursulin Tvrtko

On Fri, 2019-04-19 at 14:29 -0700, Antonio Argenziano wrote:
> 
> On 18/04/19 09:53, Carlos Santa wrote:
> > This test adds basic set of tests to reset the different
> > GPU engines through the gpu watchdog timer.
> > 
> > Credits to Antonio for the original codebase this is based on.
> > 
> > v2: remove gem_context_get_param() during set (Antonio)
> >      remove clearing of the engines_threshold[] in the default case
> >      inside context_set_watchdog(). (Antonio)
> >      fix indexing when creating low/high priority contexts
> >      get rid of 2 threads idea (Antonio)
> >      fix context prio bug due to wrong indexing (Antonio)
> > 
> > Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
> > Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> > ---
> >   tests/Makefile.sources    |   3 +
> >   tests/i915/gem_watchdog.c | 366
> > ++++++++++++++++++++++++++++++++++++++++++++++
> >   tests/meson.build         |   1 +
> >   3 files changed, 370 insertions(+)
> >   create mode 100644 tests/i915/gem_watchdog.c
> > 
> > diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> > index 214698d..7f17f20 100644
> > --- a/tests/Makefile.sources
> > +++ b/tests/Makefile.sources
> > @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES =
> > i915/gem_userptr_blits.c
> >   TESTS_progs += gem_wait
> >   gem_wait_SOURCES = i915/gem_wait.c
> >   
> > +TESTS_progs += gem_watchdog
> > +gem_watchdog_SOURCES = i915/gem_watchdog.c
> > +
> >   TESTS_progs += gem_workarounds
> >   gem_workarounds_SOURCES = i915/gem_workarounds.c
> >   
> > diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> > new file mode 100644
> > index 0000000..e6c1abe
> > --- /dev/null
> > +++ b/tests/i915/gem_watchdog.c
> > @@ -0,0 +1,366 @@
> > +/*
> > + * Copyright © 2016 Intel Corporation
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> > obtaining a
> > + * copy of this software and associated documentation files (the
> > "Software"),
> > + * to deal in the Software without restriction, including without
> > limitation
> > + * the rights to use, copy, modify, merge, publish, distribute,
> > sublicense,
> > + * and/or sell copies of the Software, and to permit persons to
> > whom the
> > + * Software is furnished to do so, subject to the following
> > conditions:
> > + *
> > + * The above copyright notice and this permission notice
> > (including the next
> > + * paragraph) shall be included in all copies or substantial
> > portions of the
> > + * Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> > EVENT SHALL
> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
> > DAMAGES OR OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > ARISING
> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > OTHER DEALINGS
> > + * IN THE SOFTWARE.
> > + */
> > +#include "igt.h"
> > +#include "igt_sysfs.h"
> > +#include "sw_sync.h"
> > +
> > +#include <pthread.h>
> > +#include <fcntl.h>
> > +
> > +#include <sys/ioctl.h>
> > +#include <sys/poll.h>
> > +#include <sys/signal.h>
> > +#include "i915/gem_ring.h"
> > +
> > +#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
> > +#define LOCAL_I915_EXEC_BSD_RING1 	(1 <<
> > LOCAL_I915_EXEC_BSD_SHIFT)
> > +#define LOCAL_I915_EXEC_BSD_RING2 	(2 <<
> > LOCAL_I915_EXEC_BSD_SHIFT)
> > +
> > +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> > +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> > +#define HIGH 1
> > +#define LOW 0
> > +#define WATCHDOG_THRESHOLD (100)
> > +#define MAX_ENGINES 5
> > +#define RENDER_CLASS 0
> > +#define VIDEO_DECODE_CLASS 1
> > +#define VIDEO_ENHANCEMENT_CLASS 2
> > +#define COPY_ENGINE_CLASS 3
> > +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> > +
> > +const uint64_t timeout_100ms = 100000000LL;
> > +int num;
> > +
> > +struct drm_i915_gem_watchdog_timeout {
> > +	union {
> > +		struct {
> > +			/*
> > +			 * Engine class & instance to be configured or
> > queried.
> > +			 */
> > +			__u16 engine_class;
> > +			__u16 engine_instance;
> > +		};
> > +		/* Index based addressing mode */
> > +		__u32 index;
> > +	};
> > +	/* GPU Engine watchdog resets timeout in us */
> > +	__u32 timeout_us;
> > +};
> > +
> > +static void clear_error_state(int fd)
> > +{
> > +	int dir;
> > +
> > +	dir = igt_sysfs_open(fd);
> > +
> > +	if (dir < 0)
> > +		return;
> > +
> > +	/* Any write to the error state clears it */
> > +	igt_sysfs_set(dir, "error", "");
> > +	close(dir);
> > +}
> > +
> > +static void context_set_watchdog(int fd, int engine_id,
> > +                                 unsigned ctx_id, unsigned
> > threshold)
> > +{
> > +	struct drm_i915_gem_watchdog_timeout
> > engines_threshold[MAX_ENGINES];
> > +	struct drm_i915_gem_context_param arg = {
> > +		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> > +		.ctx_id = ctx_id,
> > +		.size = sizeof(engines_threshold),
> > +		.value = (uint64_t)&engines_threshold
> > +	};
> > +
> > +	memset(&engines_threshold, 0, sizeof(engines_threshold));
> > +
> > +	switch (engine_id & I915_EXEC_RING_MASK) {
> > +	case I915_EXEC_RENDER:
> > +		engines_threshold[RENDER_CLASS].timeout_us = threshold;
> > +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> > +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us =
> > 0;
> 
> nit: struct already initialized to 0, no need to explicitly zero its 
> elements.

Agree.

> 
> > +		break;
> > +	case I915_EXEC_BSD:
> > +		engines_threshold[RENDER_CLASS].timeout_us = 0;
> > +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us =
> > threshold;
> > +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us =
> > 0;
> > +		break;
> > +	case I915_EXEC_VEBOX:
> > +		engines_threshold[RENDER_CLASS].timeout_us = 0;
> > +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> > +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us =
> > threshold;
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	gem_context_set_param(fd, &arg);
> > +}
> > +
> > +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id,
> > unsigned exec_id, uint32_t target, uint32_t offset, uint32_t
> > *handle, uint64_t timeout, int *fence, int fence_index)
> > +{
> > +    struct drm_i915_gem_exec_object2 obj[2];
> > +    struct drm_i915_gem_relocation_entry reloc;
> > +    struct drm_i915_gem_execbuffer2 execbuf;
> > +    igt_spin_t *spin = NULL;
> > +    const uint32_t bbe = MI_BATCH_BUFFER_END;
> > +    int i = 0;
> > +
> > +    gem_quiescent_gpu(fd);
> > +
> > +    memset(&execbuf, 0, sizeof(execbuf));
> > +    memset(&obj, 0, sizeof(obj));
> > +    memset(&reloc, 0, sizeof(reloc));
> > +
> > +    execbuf.buffers_ptr = to_user_pointer(obj);
> > +
> > +    execbuf.buffer_count = 2;
> > +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
> > +
> > +    obj[0].handle = target;
> > +    obj[1].handle = gem_create(fd, 4096);
> > +
> > +    obj[1].relocation_count = 1;
> > +    obj[1].relocs_ptr = to_user_pointer(&reloc);
> > +
> > +    reloc.target_handle = obj[0].handle;
> > +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> > +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> > +    reloc.delta = offset * sizeof(uint32_t);
> > +
> > +    reloc.offset = i * sizeof(uint32_t);
> > +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> > +
> > +    __sync_synchronize();
> > +
> > +    if (handle) {
> > +        *handle = obj[1].handle;
> > +        return;
> > +    }
> > +
> > +    gem_sync(fd, obj[1].handle);
> > +    execbuf.rsvd1 = ctx_id;
> > +    execbuf.rsvd2 = -1;
> > +
> > +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> > +    igt_spin_batch_set_timeout(spin, timeout);
> > +    igt_assert(gem_bo_busy(fd, obj[0].handle));
> > +
> > +    gem_execbuf_wr(fd, &execbuf);
> > +    igt_spin_batch_free(fd, spin);
> > +
> > +    fence[fence_index] = execbuf.rsvd2 >> 32;
> > +
> > +    gem_close(fd, obj[1].handle);
> > +    gem_quiescent_gpu(fd);
> > +}
> > +
> > +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> > +{
> > +	uint32_t ctx = gem_context_create(fd);
> > +
> > +	switch (ctx_prio) {
> > +	case HIGH:
> > +		__gem_context_set_priority(fd, ctx, MAX_PRIO);
> 
> This helper returns an error code, need to check it or use 
> gem_context_set_priority() which will assert if the IOCTL fails.

Agree.
 
> 
> > +		igt_info("Setting MAX priority %d\n", ctx_prio);
> > +		break;
> > +	case LOW:
> > +		__gem_context_set_priority(fd, ctx, MIN_PRIO);
> > +		igt_info("Setting MIN priority %d\n", ctx_prio);
> > +		break;
> > +	default:
> > +		igt_info("Ignoring context priority %d\n", ctx_prio);
> > +		break;
> > +	}
> > +	printf("ctx id: %u\n",ctx);
> 
> Remember to change this to a log once you send the PATCH version.
> 
> > +	return ctx;
> > +}
> > +
> > +static void inject_hang(uint32_t fd, unsigned ring, uint32_t
> > ctx_id,  unsigned flags)
> > +{
> > +	igt_hang_t hang;
> > +	hang = igt_hang_ctx(fd, ctx_id, ring, flags);
> > +	gem_sync(fd, hang.spin->handle);
> > +}
> > +
> > +static void gpu_watchdog_long_batch_2_contexts(int fd, int
> > nengine, int prio_ctx1, int prio_ctx2)
> > +{
> > +	uint32_t ctx[2];
> > +	uint32_t scratch[2];
> > +	unsigned flags = HANG_ALLOW_CAPTURE;
> > +	const uint64_t batch_timeout_ms = timeout_100ms * 3;
> > +	int i = 0, engine_id;
> > +	int *fence = 0;
> > +
> > +	igt_require(nengine);
> > +
> > +	fence = (int *)malloc(sizeof(int)*2);
> > +
> > +	if (!fence) {
> > +		igt_info("Out of memory\n");
> > +		exit(1);
> > +	}
> > +
> > +	for (i = 0; i < 2; i++) {
> > +		scratch[i] = gem_create(fd, 4096);
> > +	}
> > +
> > +	/* Create some work on RCS0 */
> > +	engine_id = 1;
> 
> Not that many combinations I think we can iterate over all engines in
> a 
> reasonable time.

Ok, now it's a looper in version #3.

> 
> > +	ctx[0] = create_ctx_with_priority(fd, prio_ctx1);
> > +	batch_buffer_factory(fd, ctx[0], engine_id, scratch[0], 0,
> > NULL, batch_timeout_ms, fence, 0);
> 
> Note that the execbuf used in batch_buffer_factory() is not the same 
> used in inject_hang().
> 
> > +
> > +	/* Cancel batch on RCS0 w/ gpu watchdog timeout */
> > +	if(prio_ctx1 < 0 && prio_ctx2 < 0) {
> > +		context_set_watchdog(fd, engine_id, ctx[0],
> > WATCHDOG_THRESHOLD);
> > +		clear_error_state(fd);
> > +		inject_hang(fd, engine_id, ctx[0], flags);
> > +	}
> > +
> > +#if 0
> 
> Is the code wrapped in if 0 statements going to be included in the
> final 
> patch?
> 
> > +	/* Now check the engine was reset successfully*/
> > +	igt_assert_eq(sync_fence_status(*fence), -EIO); > +#endif
> > +	close(fence[0]);
> 
> nit: you can close all fences at the very end of the test.

ok.
> 
> > +
> > +	/* Create some work on VECS0 */
> > +	engine_id = 4;
> > +	ctx[1] = create_ctx_with_priority(fd, prio_ctx2);
> > +	batch_buffer_factory(fd, ctx[1], engine_id, scratch[1], 0,
> > NULL, batch_timeout_ms, fence, 1);
> > +
> > +	/* Cancel batch on RCS0 w/ gpu watchdog timeout */
> > +	context_set_watchdog(fd, engine_id, ctx[1],
> > WATCHDOG_THRESHOLD);
> > +	clear_error_state(fd);
> > +	inject_hang(fd, engine_id, ctx[1], flags);
> 
> You don't want to sync after the hang in this test I think. It would
> be 
> better to send plenty of contexts with different thresholds and have 
> them active at ~ the same time.

Ok, did this in the version #3.

> 
> > +
> > +#if 0
> > +	/* Now check the engine was reset successfully */
> > +	igt_assert_eq(sync_fence_status(*fence), -EIO);
> > +#endif
> > +	close(fence[1]);
> > +
> > +	for (i = 0; i < 2; i++) {
> > +		gem_context_destroy(fd, ctx[i]);
> > +		gem_close(fd, scratch[i]);
> > +	}
> > +}
> > +
> > +static void gpu_watchodg_hang_long_batch_single_engine(int fd,
> > unsigned engine_id, const char *name)
> 
> engine_id is unused in this function.
> 
> > +{
> > +	uint32_t ctx[16];
> > +	uint32_t scratch[16];
> > +	int *fence;
> > +	unsigned nengine = 0;
> > +	unsigned engine;
> > +
> > +	int i;
> > +	unsigned flags = HANG_ALLOW_CAPTURE;
> > +	const uint64_t batch_timeout_ms = timeout_100ms*4;
> > +
> > +	fence = (int *)malloc(sizeof(int)*16);
> > +
> > +	if (!fence) {
> > +		igt_info("Out of memory\n");
> > +		exit(1);
> > +	}
> > +
> > +	for_each_physical_engine(fd, engine) {
> 
> Just a suggestion: you could use for_each_engine_class_instance()
> end 
> skip based on the class. This should fit in better with the new
> engine 
> interface Andi is working on.

Ok, done in v3.

> 
> > +		/* no support for gpu watchdog on BLT */
> > +		if ( strncmp(e__->name, "blt", 3) == 0 )
> > +			continue;
> > +
> > +		scratch[nengine] = gem_create(fd, 4096);
> > +		ctx[nengine] = create_ctx_with_priority(fd, -1);
> > +
> > +		/* Create some work on the engine using the same ctx*/
> > +		batch_buffer_factory(fd, ctx[nengine], e__->exec_id,
> > scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
> 
> Why are we sending work before sending the hanging batch? I think it 
> would be better not to send anything down and maybe have a different 
> test where you:
> 
> 	- Submit work from context A
> 	- Set a non 0 threshold on context A
> 	- Submit more work on context A
> 
> And expect the work before you set the threshold is not reset and
> what 
> was sent after is.

ok, "inject_hang()" is the hanging batch in itself, and sending that
before any batch buffer simply hangs the engine... That's why I
usually create some work first and then send the hanging batch last.

Having said that, I took your suggestion for the new test on version 3.

> 
> > +
> > +		/* Set the gpu watchdog timeout */
> > +		context_set_watchdog(fd, e__->exec_id, ctx[nengine],
> > WATCHDOG_THRESHOLD);
> > +		clear_error_state(fd);
> > +
> > +		/* Cancel only the batch requested */
> > +		if ( strncmp(e__->name, name, 4) == 0 )
> > +			inject_hang(fd, e__->exec_id, ctx[nengine],
> > flags);
> > +#if 0
> > +		igt_info("fence:%d, fence status : %d EIO: %d
> > ctx_id:%d\n",fence[nengine], sync_fence_status(fence[nengine]),-
> > EIO, ctx[nengine]);
> > +		/* Now check the engine was reset */
> > +		igt_assert_eq(sync_fence_status(fence[nengine]), -EIO);
> > +#endif
> > +		nengine++;
> > +	}
> > +
> > +	for (i = 0; i < nengine; i++) {
> > +		close(fence[i]);
> > +		gem_context_destroy(fd, ctx[i]);
> > +		gem_close(fd, scratch[i]);
> > +	}
> > +}
> > +
> > +igt_main
> > +{
> > +	int fd;
> > +	unsigned int nengine = 0;
> > +	unsigned int engine;
> > +
> > +	igt_skip_on_simulation();
> > +
> > +	igt_fixture {
> > +		fd = drm_open_driver(DRIVER_INTEL);
> > +		igt_require_gem(fd);
> > +
> > +		for_each_physical_engine(fd, engine)
> > +			nengine++;
> > +		igt_require(nengine);
> > +	}
> > +
> > +	igt_subtest_group {
> > +
> > +		igt_subtest_f("low-prio-ctx-wo-gpu-watchdog-and-high-
> > prio-ctx-with-gpu-watchdog") {
> > +			int prio1 = LOW;
> > +			int prio2 = HIGH;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio1, prio2);
> > +		}
> > +
> > +		for (const struct intel_execution_engine *e =
> > intel_execution_engines; e->name; e++) {
> 
> You are looping on each engine inside the test as well.

Yes, this is done on purpose: the test cancels the batch based on the
engine picked by the user and then it continues looping through all the
remaining engines...

Regards,
Carlos

> 
> Antonio
> 
> > +			/* no support for gpu watchdog on BLT */
> > +			if (e->exec_id == 0 || e->exec_id ==
> > I915_EXEC_BLT)
> > +				continue;
> > +
> > +			igt_subtest_f("gpu-watchdog-long-batch-%s", e-
> > >name) {
> > +				igt_require(gem_ring_has_physical_engin
> > e(fd, e->exec_id | e->flags));
> > +				printf("below id: %d\n",e->exec_id);
> > +				gpu_watchodg_hang_long_batch_single_eng
> > ine(fd, e->exec_id | e->flags, e->name);
> > +			}
> > +		}
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
> > +			int prio = -1;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio, prio);
> > +		}
> > +    }
> > +
> > +    igt_fixture {
> > +	close(fd);
> > +    }
> > +}
> > diff --git a/tests/meson.build b/tests/meson.build
> > index 5167a6c..b281b75 100644
> > --- a/tests/meson.build
> > +++ b/tests/meson.build
> > @@ -210,6 +210,7 @@ i915_progs = [
> >   	'gem_unref_active_buffers',
> >   	'gem_userptr_blits',
> >   	'gem_wait',
> > +        'gem_watchdog',
> >   	'gem_workarounds',
> >   	'gem_write_read_ring_switch',
> >   	'i915_fb_tiling',
> > 

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2019-04-23  0:05 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-18 16:53 [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
2019-04-18 17:39 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev4) Patchwork
2019-04-19 21:29 ` [igt-dev] [RFC v2] tests/gem_watchdog: Initial set of tests for GPU watchdog Antonio Argenziano
2019-04-23  0:04   ` Carlos Santa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.