All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
@ 2019-04-23  0:05 Carlos Santa
  2019-04-23  0:24 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5) Patchwork
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Carlos Santa @ 2019-04-23  0:05 UTC (permalink / raw)
  To: igt-dev; +Cc: Ursulin Tvrtko

This test adds basic set of tests to reset the different
GPU engines through the gpu watchdog timer.

Credits to Antonio for the original codebase this is based on.

v2: remove gem_context_get_param() during set (Antonio)
    remove clearing of the engines_threshold[] in the default case
    inside context_set_watchdog(). (Antonio)
    fix indexing when creating low/high priority contexts
    get rid of 2 threads idea (Antonio)
    fix context prio bug due to wrong indexing (Antonio)
v3: no need to force clear drm_i915_gem_watchdog_timeout struct
    when setting the watchdog timeouts (Antonio)
    -use gem_context_set_priority() instead to use error checking
    -always use a looper to test all engines (Antonio)
    -modify test gpu_watchodg_hang_long_batch_single_engine()
    to use a couple of batches and set the timeout in between them (Antonio)
    -make use of for_each_engine_class_instance() (Antonio)

Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 tests/Makefile.sources    |   3 +
 tests/i915/gem_watchdog.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/meson.build         |   1 +
 3 files changed, 521 insertions(+)
 create mode 100644 tests/i915/gem_watchdog.c

diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 214698d..7f17f20 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
 TESTS_progs += gem_wait
 gem_wait_SOURCES = i915/gem_wait.c
 
+TESTS_progs += gem_watchdog
+gem_watchdog_SOURCES = i915/gem_watchdog.c
+
 TESTS_progs += gem_workarounds
 gem_workarounds_SOURCES = i915/gem_workarounds.c
 
diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
new file mode 100644
index 0000000..8f789ab
--- /dev/null
+++ b/tests/i915/gem_watchdog.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "igt.h"
+#include "igt_sysfs.h"
+#include "sw_sync.h"
+
+#include <pthread.h>
+#include <fcntl.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/signal.h>
+#include "i915/gem_ring.h"
+
+#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
+#define LOCAL_I915_EXEC_BSD_RING1 	(1 << LOCAL_I915_EXEC_BSD_SHIFT)
+#define LOCAL_I915_EXEC_BSD_RING2 	(2 << LOCAL_I915_EXEC_BSD_SHIFT)
+
+#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
+#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
+#define HIGH 1
+#define LOW 0
+#define TRUE 1
+#define FALSE 0
+#define WATCHDOG_THRESHOLD (100)
+#define MAX_ENGINES 5
+#define RENDER_CLASS 0
+#define VIDEO_DECODE_CLASS 1
+#define VIDEO_ENHANCEMENT_CLASS 2
+#define COPY_ENGINE_CLASS 3
+#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
+
+#define RS_NO_ERROR      0
+#define RS_BATCH_ACTIVE  (1 << 0)
+#define RS_BATCH_PENDING (1 << 1)
+#define RS_UNKNOWN       (1 << 2)
+
+#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32, struct local_drm_i915_reset_stats)
+struct local_drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+	__u32 reset_count;
+	__u32 batch_active;
+	__u32 batch_pending;
+	__u32 pad;
+};
+
+/* 100ms expressed in nanoseconds; used to scale batch/spinner timeouts */
+const uint64_t timeout_100ms = 100000000LL;
+int num; /* NOTE(review): appears unused anywhere in this file — candidate for removal */
+
+/*
+ * Userspace mirror of the proposed i915 watchdog uAPI payload, passed as
+ * an array (one slot per engine class) via I915_CONTEXT_PARAM_WATCHDOG.
+ */
+struct drm_i915_gem_watchdog_timeout {
+	union {
+		struct {
+			/*
+			 * Engine class & instance to be configured or queried.
+			 */
+			__u16 engine_class;
+			__u16 engine_instance;
+		};
+		/* Index based addressing mode */
+		__u32 index;
+	};
+	/* GPU Engine watchdog resets timeout in us */
+	__u32 timeout_us;
+};
+
+static void clear_error_state(int fd)
+{
+	int dir;
+
+	dir = igt_sysfs_open(fd);
+
+	if (dir < 0)
+		return;
+
+	/* Any write to the error state clears it */
+	igt_sysfs_set(dir, "error", "");
+	close(dir);
+}
+
+/* Translate an engine class/instance descriptor into execbuf ring flags. */
+static unsigned int e2ring(int fd, const struct intel_execution_engine2 *e)
+{
+	return gem_class_instance_to_eb_flags(fd, e->class, e->instance);
+}
+
+/*
+ * Configure the per-engine-class watchdog timeout (in us) on a context.
+ *
+ * Only the class selected by the execbuf ring bits of @engine_id gets a
+ * non-zero threshold; all other classes stay 0 (watchdog disabled).
+ */
+static void context_set_watchdog(int fd, unsigned engine_id,
+				 unsigned ctx_id, unsigned threshold)
+{
+	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
+	struct drm_i915_gem_context_param arg = {
+		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
+		.ctx_id = ctx_id,
+		.size = sizeof(engines_threshold),
+		/* to_user_pointer() avoids pointer-to-u64 cast warnings on 32-bit */
+		.value = to_user_pointer(engines_threshold)
+	};
+
+	memset(engines_threshold, 0, sizeof(engines_threshold));
+
+	switch (engine_id & I915_EXEC_RING_MASK) {
+	case I915_EXEC_RENDER:
+		engines_threshold[RENDER_CLASS].timeout_us = threshold;
+		break;
+	case I915_EXEC_BSD:
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
+		break;
+	case I915_EXEC_VEBOX:
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
+		break;
+	default:
+		/* blitter and unknown rings: leave every timeout disabled */
+		break;
+	}
+
+	gem_context_set_param(fd, &arg);
+}
+
+/*
+ * Build and (optionally) submit a batch that relocates against @target.
+ *
+ * If @handle is non-NULL the batch object is only created and returned
+ * through it without being executed.  Otherwise the batch is submitted on
+ * @exec_id with an output fence while a timed spinner (lifetime @timeout
+ * ns) keeps @target busy; the out-fence fd is stored in fence[fence_index].
+ */
+static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout, int *fence, int fence_index)
+{
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_execbuffer2 execbuf;
+	igt_spin_t *spin = NULL;
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+
+	gem_quiescent_gpu(fd);
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	memset(&obj, 0, sizeof(obj));
+	memset(&reloc, 0, sizeof(reloc));
+
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.buffer_count = 2;
+	execbuf.flags = exec_id | I915_EXEC_FENCE_OUT;
+
+	obj[0].handle = target;
+	obj[1].handle = gem_create(fd, 4096);
+
+	obj[1].relocation_count = 1;
+	obj[1].relocs_ptr = to_user_pointer(&reloc);
+
+	reloc.target_handle = obj[0].handle;
+	reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
+	reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
+	reloc.delta = offset * sizeof(uint32_t);
+	reloc.offset = 0; /* relocation lives in the first dword of the batch */
+
+	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+
+	__sync_synchronize();
+
+	if (handle) {
+		*handle = obj[1].handle;
+		return;
+	}
+
+	gem_sync(fd, obj[1].handle);
+	execbuf.rsvd1 = ctx_id;
+	execbuf.rsvd2 = -1;
+
+	/*
+	 * igt_spin_batch_*() was renamed igt_spin_*() in current IGT; the
+	 * old names no longer compile (see the CI build failure for rev5).
+	 */
+	spin = igt_spin_new(fd, .dependency = obj[0].handle);
+	igt_spin_set_timeout(spin, timeout);
+	igt_assert(gem_bo_busy(fd, obj[0].handle));
+
+	gem_execbuf_wr(fd, &execbuf);
+	igt_spin_free(fd, spin);
+
+	/* the out-fence fd is returned in the upper 32 bits of rsvd2 */
+	fence[fence_index] = execbuf.rsvd2 >> 32;
+
+	gem_close(fd, obj[1].handle);
+	gem_quiescent_gpu(fd);
+}
+
+/* Create a fresh context; HIGH/LOW clamp it to max/min user priority,
+ * any other value leaves the default priority untouched. */
+static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
+{
+	uint32_t ctx = gem_context_create(fd);
+
+	if (ctx_prio == HIGH) {
+		gem_context_set_priority(fd, ctx, MAX_PRIO);
+		igt_info("Setting MAX priority %d\n", ctx_prio);
+	} else if (ctx_prio == LOW) {
+		gem_context_set_priority(fd, ctx, MIN_PRIO);
+		igt_info("Setting MIN priority %d\n", ctx_prio);
+	} else {
+		igt_info("Ignoring context priority %d\n", ctx_prio);
+	}
+
+	return ctx;
+}
+
+/*
+ * Query GET_RESET_STATS for @ctx_id (0 == global stats).
+ *
+ * Returns 0 on success, -errno from the ioctl on failure.
+ */
+static int gem_reset_stats(int fd, int ctx_id,
+			   struct local_drm_i915_reset_stats *rs)
+{
+	memset(rs, 0, sizeof(*rs));
+	rs->ctx_id = ctx_id;
+	rs->reset_count = -1; /* poison so we can tell the kernel wrote it */
+
+	if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
+		/* save errno first: printf() itself may overwrite it */
+		int err = -errno;
+
+		printf("error:%d\n", err);
+		return err;
+	}
+
+	/* the kernel must have replaced our poisoned reset_count */
+	igt_assert(rs->reset_count != -1);
+	return 0;
+}
+
+/* Map a 1-based engine index (1..4) onto execbuf engine selector flags;
+ * anything out of range falls back to the render ring. */
+static unsigned index_to_engine_id(int fd, int index)
+{
+	switch (index) {
+	case 2:
+		if (gem_has_bsd2(fd))
+			return I915_EXEC_BSD | I915_EXEC_BSD_RING1;
+		return I915_EXEC_BSD;
+	case 3:
+		if (gem_has_bsd2(fd))
+			return I915_EXEC_BSD | I915_EXEC_BSD_RING2;
+		return I915_EXEC_BSD;
+	case 4:
+		return I915_EXEC_VEBOX;
+	case 1:
+	default:
+		return I915_EXEC_RENDER;
+	}
+}
+
+/* Submit a hanging batch on @engine_id within @ctx_id and wait for the
+ * spinner to be retired (i.e. for the reset to have happened). */
+static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t ctx_id,  unsigned flags)
+{
+	igt_hang_t hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
+
+	gem_sync(fd, hang.spin->handle);
+}
+
+/*
+ * Two-context watchdog test: for every engine, ctx1 queues work on a
+ * randomly chosen engine, then an inner pass (ctx2) queues work on each
+ * engine in turn.  Either context may additionally be cancelled via an
+ * injected hang, in which case the global batch_active reset count must
+ * grow by exactly one per hang.
+ */
+static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, int prio_ctx1, int reset_ctx1, int prio_ctx2, int reset_ctx2)
+{
+	uint32_t ctx[48];
+	uint32_t scratch[48];
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	const uint64_t batch_timeout_ms = timeout_100ms * 3;
+	const struct intel_execution_engine2 *e;
+	const struct intel_execution_engine2 *e2;
+	int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
+	int *fence, active_count = 0;
+	struct local_drm_i915_reset_stats rs;
+
+	igt_require(nengine);
+
+	fence = malloc(sizeof(*fence) * 48);
+	igt_assert(fence);
+
+	/* Initializes random number generator */
+	srand((unsigned) time(NULL));
+
+	/* The outer looper is ctx1 and the inner looper is ctx2
+	 * Both contexts can set their priority, create work, and cancel themselves
+	 */
+	for_each_engine_class_instance(fd, e) {
+		/* no gpu watchdog support on the blitter */
+		if (strcmp(e->name, "bcs0") == 0)
+			continue;
+
+		igt_assert(ctx_cnt < 48); /* fixed-size ctx[]/scratch[] arrays */
+		scratch[ctx_cnt] = gem_create(fd, 4096);
+		index = rand() % 4 + 1;
+		engine_id = index_to_engine_id(fd, index);
+
+		/* baseline batch_active count before this iteration */
+		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+		active_count = rs.batch_active;
+
+		ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
+		batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
+		context_set_watchdog(fd, engine_id, ctx[ctx_cnt], WATCHDOG_THRESHOLD);
+		sleep(1);
+
+		if (reset_ctx1) {
+			clear_error_state(fd);
+			inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
+			active_count++;
+		}
+
+		/* the hang (if any) must bump batch_active by exactly one */
+		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+		igt_assert_eq(rs.batch_active, active_count);
+
+		ctx_cnt++;
+		i = 1;
+
+		for_each_engine_class_instance(fd, e2) {
+			/* bug fix: filter on the inner iterator e2, not the outer e */
+			if (strcmp(e2->name, "bcs0") == 0)
+				continue;
+
+			igt_assert(ctx_cnt < 48);
+			scratch[ctx_cnt] = gem_create(fd, 4096);
+			engine_id = index_to_engine_id(fd, i);
+
+			/* baseline batch_active count before this iteration */
+			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+			active_count = rs.batch_active;
+
+			ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx2);
+			batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
+			context_set_watchdog(fd, engine_id, ctx[ctx_cnt], WATCHDOG_THRESHOLD);
+
+			if (reset_ctx2) {
+				clear_error_state(fd);
+				inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
+				active_count++;
+			}
+
+			/* the hang (if any) must bump batch_active by exactly one */
+			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+			igt_assert_eq(rs.batch_active, active_count);
+
+			ctx_cnt++;
+			i++;
+		}
+	}
+
+	for (i = 0; i < ctx_cnt; i++) {
+		close(fence[i]);
+		gem_context_destroy(fd, ctx[i]);
+		gem_close(fd, scratch[i]);
+	}
+
+	free(fence); /* was leaked in the original */
+}
+
+/*
+ * For every engine (except bcs0) queue two long batches with the watchdog
+ * timeout set in between, and inject a hang only on the engine named by
+ * @name.  The global batch_active reset count must grow only for that
+ * engine.
+ */
+static void gpu_watchodg_hang_long_batch_single_engine(int fd, const char *name)
+{
+	uint32_t ctx[16];
+	uint32_t scratch[16];
+	int *fence;
+	const struct intel_execution_engine2 *e;
+	unsigned nengine = 0;
+	unsigned engine_id = 0;
+	int active_count = 0;
+	struct local_drm_i915_reset_stats rs;
+	int i;
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	const uint64_t batch_timeout_ms = timeout_100ms * 4;
+
+	fence = malloc(sizeof(*fence) * 16);
+	igt_assert(fence);
+
+	for_each_engine_class_instance(fd, e) {
+		/* no gpu watchdog support on the blitter */
+		if (strcmp(e->name, "bcs0") == 0)
+			continue;
+
+		igt_assert(nengine < 16); /* fixed-size ctx[]/scratch[] arrays */
+		engine_id = e2ring(fd, e);
+
+		scratch[nengine] = gem_create(fd, 4096);
+		ctx[nengine] = create_ctx_with_priority(fd, -1);
+
+		/* baseline batch_active count for this engine */
+		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+		active_count = rs.batch_active;
+
+		/* create some work on this engine using the above ctx */
+		batch_buffer_factory(fd, ctx[nengine], engine_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
+
+		/* set the gpu watchdog timeout */
+		context_set_watchdog(fd, engine_id, ctx[nengine], WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+
+		/*
+		 * give enough time for the batch buffer to run
+		 */
+		nanosleep(&(struct timespec){ .tv_sec = 0,
+					      .tv_nsec = batch_timeout_ms * 2 }, NULL);
+
+		/* second batch on the same ctx, timeout already set above */
+		batch_buffer_factory(fd, ctx[nengine], engine_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
+		context_set_watchdog(fd, engine_id, ctx[nengine], WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+
+		/* cancel only the batch requested */
+		if (strcmp(e->name, name) == 0) {
+			inject_hang(fd, engine_id, ctx[nengine], flags);
+			active_count++;
+		}
+
+		/*
+		 * bug fix: do NOT refresh active_count from the kernel here
+		 * (as the original did), or the assert below becomes a
+		 * tautology and can never catch a missing/spurious reset.
+		 */
+		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
+		igt_assert_eq(rs.batch_active, active_count);
+
+		nengine++;
+	}
+
+	for (i = 0; i < nengine; i++) {
+		close(fence[i]);
+		gem_context_destroy(fd, ctx[i]);
+		gem_close(fd, scratch[i]);
+	}
+
+	free(fence); /* was leaked in the original */
+}
+
+igt_main
+{
+	int fd;
+	unsigned int nengine = 0;
+	unsigned int engine;
+	const struct intel_execution_engine2 *e;
+
+	igt_skip_on_simulation();
+
+	igt_fixture {
+		fd = drm_open_driver(DRIVER_INTEL);
+		igt_require_gem(fd);
+
+		for_each_physical_engine(fd, engine)
+			nengine++;
+		igt_require(nengine);
+	}
+
+	igt_subtest_group {
+		igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-reset-ctx2-high-prio-execute") {
+			int prio_ctx1 = LOW;
+			int prio_ctx2 = HIGH;
+			int reset_ctx1 = TRUE;
+			int reset_ctx2 = FALSE;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-reset-ctx2-low-prio-execute") {
+			int prio_ctx1 = HIGH;
+			int prio_ctx2 = LOW;
+			int reset_ctx1 = TRUE;
+			int reset_ctx2 = FALSE;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-execute") {
+			int prio_ctx1 = -1;
+			int prio_ctx2 = -1;
+			int reset_ctx1 = TRUE;
+			int reset_ctx2 = FALSE;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-reset") {
+			int prio_ctx1 = -1;
+			int prio_ctx2 = -1;
+			int reset_ctx1 = FALSE;
+			int reset_ctx2 = TRUE;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-execute") {
+			int prio_ctx1 = -1;
+			int prio_ctx2 = -1;
+			int reset_ctx1 = FALSE;
+			int reset_ctx2 = FALSE;
+			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
+		}
+
+		__for_each_engine_class_instance(e) {
+			/*
+			 * bug fix: skip bcs0 (no gpu watchdog support) BEFORE
+			 * entering the subtest group.  The original put the
+			 * "continue" inside igt_subtest_group, which is itself
+			 * a hidden loop construct, so the continue affected
+			 * the wrong loop and the fixture still ran for bcs0.
+			 */
+			if (strcmp(e->name, "bcs0") == 0)
+				continue;
+
+			igt_subtest_group {
+				igt_fixture
+					gem_require_engine(fd, e->class,
+							   e->instance);
+
+				igt_subtest_f("gpu-watchdog-long-batch-single-engine-reset-%s", e->name)
+					gpu_watchodg_hang_long_batch_single_engine(fd, e->name);
+			}
+		}
+	}
+
+	igt_fixture {
+		close(fd);
+	}
+}
diff --git a/tests/meson.build b/tests/meson.build
index 5167a6c..b281b75 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -210,6 +210,7 @@ i915_progs = [
 	'gem_unref_active_buffers',
 	'gem_userptr_blits',
 	'gem_wait',
+        'gem_watchdog',
 	'gem_workarounds',
 	'gem_write_read_ring_switch',
 	'i915_fb_tiling',
-- 
2.7.4

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5)
  2019-04-23  0:05 [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
@ 2019-04-23  0:24 ` Patchwork
  2019-04-23  9:54 ` [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Petri Latvala
  2019-04-23 16:25 ` Tvrtko Ursulin
  2 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2019-04-23  0:24 UTC (permalink / raw)
  To: Carlos Santa; +Cc: igt-dev

== Series Details ==

Series: tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5)
URL   : https://patchwork.freedesktop.org/series/50041/
State : failure

== Summary ==

IGT patchset build failed on latest successful build
504367d33b787de2ba8e007a5b620cfd6f0b3074 tests/kms_plane_scaling: Skip testing unsupported fp16 features

[322/460] Linking target tests/gem_pwrite_pread.
[323/460] Linking target tests/gem_read_read_speed.
[324/460] Linking target tests/gem_reg_read.
[325/460] Linking target tests/gem_reloc_overflow.
[326/460] Linking target tests/gem_reloc_vs_gpu.
[327/460] Linking target tests/gem_render_copy_redux.
[328/460] Linking target tests/gem_render_copy.
[329/460] Linking target tests/gem_render_tiled_blits.
[330/460] Linking target tests/gem_request_retire.
[331/460] Linking target tests/gem_ring_sync_copy.
[332/460] Linking target tests/gem_reset_stats.
[333/460] Linking target tests/gem_ring_sync_loop.
[334/460] Linking target tests/gem_ringfill.
[335/460] Linking target tests/gem_set_tiling_vs_blt.
[336/460] Linking target tests/gem_set_tiling_vs_gtt.
[337/460] Linking target tests/gem_shrink.
[338/460] Linking target tests/gem_set_tiling_vs_pwrite.
[339/460] Linking target tests/gem_storedw_batches_loop.
[340/460] Linking target tests/gem_softpin.
[341/460] Linking target tests/gem_spin_batch.
[342/460] Linking target tests/gem_stolen.
[343/460] Linking target tests/gem_storedw_loop.
[344/460] Linking target tests/gem_threaded_access_tiled.
[345/460] Linking target tests/gem_tiled_partial_pwrite_pread.
[346/460] Linking target tests/gem_sync.
[347/460] Linking target tests/gem_streaming_writes.
[348/460] Compiling C object 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o'.
FAILED: tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o 
ccache cc -Itests/tests@@gem_watchdog@exe -Itests -I../tests -I../include/drm-uapi -Ilib -I../lib -I../lib/stubs/syscalls -I. -I../ -I/usr/include/cairo -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include -I/usr/include/pixman-1 -I/usr/include/libpng16 -I/usr/include/freetype2 -I/usr/include/libpng12 -I/opt/igt/include -I/opt/igt/include/libdrm -I/usr/include/x86_64-linux-gnu -I/usr/include/alsa -I/usr/include -I/usr/include/libdrm -I/usr/include/libdrm/nouveau -I/home/cidrm/kernel_headers/include -fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=gnu11 -O0 -g -D_GNU_SOURCE -include config.h -Wbad-function-cast -Wdeclaration-after-statement -Wformat=2 -Wimplicit-fallthrough=0 -Wlogical-op -Wmissing-declarations -Wmissing-format-attribute -Wmissing-noreturn -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wpointer-arith -Wredundant-decls -Wshadow -Wstrict-prototypes -Wuninitialized -Wunused -Wno-clobbered -Wno-maybe-uninitialized -Wno-missing-field-initializers -Wno-pointer-arith -Wno-sign-compare -Wno-type-limits -Wno-unused-parameter -Wno-unused-result -Werror=address -Werror=array-bounds -Werror=implicit -Werror=init-self -Werror=int-to-pointer-cast -Werror=main -Werror=missing-braces -Werror=nonnull -Werror=pointer-to-int-cast -Werror=return-type -Werror=sequence-point -Werror=trigraphs -Werror=write-strings -pthread  -MD -MQ 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o' -MF 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o.d' -o 'tests/tests@@gem_watchdog@exe/i915_gem_watchdog.c.o' -c ../tests/i915/gem_watchdog.c
../tests/i915/gem_watchdog.c: In function ‘batch_buffer_factory’:
../tests/i915/gem_watchdog.c:181:12: error: implicit declaration of function ‘igt_spin_batch_new’; did you mean ‘igt_spin_new’? [-Werror=implicit-function-declaration]
     spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
            ^~~~~~~~~~~~~~~~~~
            igt_spin_new
../tests/i915/gem_watchdog.c:181:12: warning: nested extern declaration of ‘igt_spin_batch_new’ [-Wnested-externs]
../tests/i915/gem_watchdog.c:181:35: error: expected expression before ‘.’ token
     spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
                                   ^
../tests/i915/gem_watchdog.c:182:5: error: implicit declaration of function ‘igt_spin_batch_set_timeout’; did you mean ‘igt_spin_set_timeout’? [-Werror=implicit-function-declaration]
     igt_spin_batch_set_timeout(spin, timeout);
     ^~~~~~~~~~~~~~~~~~~~~~~~~~
     igt_spin_set_timeout
../tests/i915/gem_watchdog.c:182:5: warning: nested extern declaration of ‘igt_spin_batch_set_timeout’ [-Wnested-externs]
../tests/i915/gem_watchdog.c:186:5: error: implicit declaration of function ‘igt_spin_batch_free’; did you mean ‘igt_spin_free’? [-Werror=implicit-function-declaration]
     igt_spin_batch_free(fd, spin);
     ^~~~~~~~~~~~~~~~~~~
     igt_spin_free
../tests/i915/gem_watchdog.c:186:5: warning: nested extern declaration of ‘igt_spin_batch_free’ [-Wnested-externs]
cc1: some warnings being treated as errors
ninja: build stopped: subcommand failed.

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-23  0:05 [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
  2019-04-23  0:24 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5) Patchwork
@ 2019-04-23  9:54 ` Petri Latvala
  2019-04-23 16:25 ` Tvrtko Ursulin
  2 siblings, 0 replies; 7+ messages in thread
From: Petri Latvala @ 2019-04-23  9:54 UTC (permalink / raw)
  To: Carlos Santa; +Cc: igt-dev, Ursulin Tvrtko

On Mon, Apr 22, 2019 at 05:05:54PM -0700, Carlos Santa wrote:
> This test adds basic set of tests to reset the different
> GPU engines through the gpu watchdog timer.
> 
> Credits to Antonio for the original codebase this is based on.
> 
> v2: remove gem_context_get_param() during set (Antonio)
>     remove clearing of the engines_threshold[] in the default case
>     inside context_set_watchdog(). (Antonio)
>     fix indexing when creating low/high priority contexts
>     get rid of 2 threads idea (Antonio)
>     fix context prio bug due to wrong indexing (Antonio)
> v3: no need to force clear drm_i915_gem_watchdog_timeout struct
>     when setting the watchdog timeouts (Antonio)
>     -use gem_context_set_priority() instead to use error checking
>     -always use a looper to test all engines (Antonio)
>     -modify test gpu_watchodg_hang_long_batch_single_engine()
>     to use a couple of batches and set the timeout in between them (Antonio)
>     -make use of for_each_engine_class_instance() (Antonio)
> 
> Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>  tests/Makefile.sources    |   3 +
>  tests/i915/gem_watchdog.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++
>  tests/meson.build         |   1 +
>  3 files changed, 521 insertions(+)
>  create mode 100644 tests/i915/gem_watchdog.c
> 
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 214698d..7f17f20 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
>  TESTS_progs += gem_wait
>  gem_wait_SOURCES = i915/gem_wait.c
>  
> +TESTS_progs += gem_watchdog
> +gem_watchdog_SOURCES = i915/gem_watchdog.c
> +
>  TESTS_progs += gem_workarounds
>  gem_workarounds_SOURCES = i915/gem_workarounds.c
>  
> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> new file mode 100644
> index 0000000..8f789ab
> --- /dev/null
> +++ b/tests/i915/gem_watchdog.c
> @@ -0,0 +1,517 @@
> +/*
> + * Copyright © 2016 Intel Corporation


2019, surely?


-- 
Petri Latvala
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-23  0:05 [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
  2019-04-23  0:24 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5) Patchwork
  2019-04-23  9:54 ` [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Petri Latvala
@ 2019-04-23 16:25 ` Tvrtko Ursulin
  2019-04-23 16:31   ` Tvrtko Ursulin
  2019-04-24  2:31   ` Carlos Santa
  2 siblings, 2 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2019-04-23 16:25 UTC (permalink / raw)
  To: Carlos Santa, igt-dev; +Cc: Ursulin Tvrtko


Hi,

How is the test doing against the code in progress? Any issues 
discovered etc?

In general it looks a bit untidy. Coding style is off, alignment is 
random etc.. so a bit difficult to read. But I'll make a pass and see if 
I can make sense of it. I won't leave comments on coding style and 
formatting since there are too many. But please do fix that before 
re-posting.

On 23/04/2019 01:05, Carlos Santa wrote:
> This test adds basic set of tests to reset the different
> GPU engines through the gpu watchdog timer.
> 
> Credits to Antonio for the original codebase this is based on.
> 
> v2: remove gem_context_get_param() during set (Antonio)
>      remove clearing of the engines_threshold[] in the default case
>      inside context_set_watchdog(). (Antonio)
>      fix indexing when creating low/high priority contexts
>      get rid of 2 threads idea (Antonio)
>      fix context prio bug due to wrong indexing (Antonio)
> v3: no need to force clear drm_i915_gem_watchdog_timeout struct
>      when setting the watchdog timeouts (Antonio)
>      -use gem_context_set_priority() instead to use error checking
>      -always use a looper to test all engines (Antonio)
>      -modify test gpu_watchodg_hang_long_batch_single_engine()
>      to use a couple of batches and set the timeout in between them (Antonio)
>      -make use of for_each_engine_class_instance() (Antonio)
> 
> Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> ---
>   tests/Makefile.sources    |   3 +
>   tests/i915/gem_watchdog.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++
>   tests/meson.build         |   1 +
>   3 files changed, 521 insertions(+)
>   create mode 100644 tests/i915/gem_watchdog.c
> 
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 214698d..7f17f20 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
>   TESTS_progs += gem_wait
>   gem_wait_SOURCES = i915/gem_wait.c
>   
> +TESTS_progs += gem_watchdog
> +gem_watchdog_SOURCES = i915/gem_watchdog.c
> +
>   TESTS_progs += gem_workarounds
>   gem_workarounds_SOURCES = i915/gem_workarounds.c
>   
> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> new file mode 100644
> index 0000000..8f789ab
> --- /dev/null
> +++ b/tests/i915/gem_watchdog.c
> @@ -0,0 +1,517 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +#include "igt.h"
> +#include "igt_sysfs.h"
> +#include "sw_sync.h"
> +
> +#include <pthread.h>
> +#include <fcntl.h>
> +
> +#include <sys/ioctl.h>
> +#include <sys/poll.h>
> +#include <sys/signal.h>
> +#include "i915/gem_ring.h"
> +
> +#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
> +#define LOCAL_I915_EXEC_BSD_RING1 	(1 << LOCAL_I915_EXEC_BSD_SHIFT)
> +#define LOCAL_I915_EXEC_BSD_RING2 	(2 << LOCAL_I915_EXEC_BSD_SHIFT)

These are in i915_drm.h already so there is no need for local definitions.

> +
> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> +#define HIGH 1
> +#define LOW 0
> +#define TRUE 1
> +#define FALSE 0

true and false are fair game in igt.

> +#define WATCHDOG_THRESHOLD (100)
> +#define MAX_ENGINES 5
> +#define RENDER_CLASS 0
> +#define VIDEO_DECODE_CLASS 1
> +#define VIDEO_ENHANCEMENT_CLASS 2
> +#define COPY_ENGINE_CLASS 3

Classes are in i915_drm.h as well.

> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> +
> +#define RS_NO_ERROR      0
> +#define RS_BATCH_ACTIVE  (1 << 0)
> +#define RS_BATCH_PENDING (1 << 1)
> +#define RS_UNKNOWN       (1 << 2)

What are these? Look unused in code.

> +
> +#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32, struct local_drm_i915_reset_stats)
> +struct local_drm_i915_reset_stats {
> +	__u32 ctx_id;
> +	__u32 flags;
> +	__u32 reset_count;
> +	__u32 batch_active;
> +	__u32 batch_pending;
> +	__u32 pad;
> +};
> +
> +const uint64_t timeout_100ms = 100000000LL;
> +int num;
> +
> +struct drm_i915_gem_watchdog_timeout {
> +	union {
> +		struct {
> +			/*
> +			 * Engine class & instance to be configured or queried.
> +			 */
> +			__u16 engine_class;
> +			__u16 engine_instance;
> +		};
> +		/* Index based addressing mode */
> +		__u32 index;
> +	};
> +	/* GPU Engine watchdog resets timeout in us */
> +	__u32 timeout_us;
> +};
> +
> +static void clear_error_state(int fd)
> +{
> +	int dir;
> +
> +	dir = igt_sysfs_open(fd);
> +
> +	if (dir < 0)
> +		return;

igt_require(dir >= 0) if test relies on this being present?

> +
> +	/* Any write to the error state clears it */
> +	igt_sysfs_set(dir, "error", "");
> +	close(dir);
> +}
> +
> +static unsigned int e2ring(int fd, const struct intel_execution_engine2 *e)
> +{
> +    return gem_class_instance_to_eb_flags(fd, e->class, e->instance);
> +}
> +
> +static void context_set_watchdog(int fd, unsigned engine_id,
> +                                 unsigned ctx_id, unsigned threshold)
> +{
> +	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
> +	struct drm_i915_gem_context_param arg = {
> +		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> +		.ctx_id = ctx_id,
> +		.size = sizeof(engines_threshold),
> +		.value = (uint64_t)&engines_threshold
> +	};
> +
> +	memset(&engines_threshold, 0, sizeof(engines_threshold));
> +
> +	switch (engine_id & I915_EXEC_RING_MASK) {
> +	case I915_EXEC_RENDER:
> +		engines_threshold[RENDER_CLASS].timeout_us = threshold;

Looks like uAPI is in some interim state? I was expecting to see here 
something like:

	engines_threshold.engine_class = RENDER_CLASS;
	engines_threshold.engine_instance = 0;
	engines_threshold.timeout_us = threshold;

But you seem to be passing an array of structs which carry 
class/instance data, don't use it but instead infer the class from the 
position in the array.

> +		break;
> +	case I915_EXEC_BSD:
> +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
> +		break;
> +	case I915_EXEC_VEBOX:
> +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	gem_context_set_param(fd, &arg);
> +}
> +
> +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout, int *fence, int fence_index)
> +{
> +    struct drm_i915_gem_exec_object2 obj[2];
> +    struct drm_i915_gem_relocation_entry reloc;
> +    struct drm_i915_gem_execbuffer2 execbuf;
> +    igt_spin_t *spin = NULL;
> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
> +    int i = 0;
> +
> +    gem_quiescent_gpu(fd);
> +
> +    memset(&execbuf, 0, sizeof(execbuf));
> +    memset(&obj, 0, sizeof(obj));
> +    memset(&reloc, 0, sizeof(reloc));
> +
> +    execbuf.buffers_ptr = to_user_pointer(obj);
> +
> +    execbuf.buffer_count = 2;
> +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
> +
> +    obj[0].handle = target;
> +    obj[1].handle = gem_create(fd, 4096);
> +
> +    obj[1].relocation_count = 1;
> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
> +
> +    reloc.target_handle = obj[0].handle;
> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> +    reloc.delta = offset * sizeof(uint32_t);
> +
> +    reloc.offset = i * sizeof(uint32_t);
> +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> +
> +    __sync_synchronize();
> +
> +    if (handle) {
> +        *handle = obj[1].handle;
> +        return;

Is this path used by the callers?

> +    }
> +
> +    gem_sync(fd, obj[1].handle);
> +    execbuf.rsvd1 = ctx_id;
> +    execbuf.rsvd2 = -1;
> +
> +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> +    igt_spin_batch_set_timeout(spin, timeout);
> +    igt_assert(gem_bo_busy(fd, obj[0].handle));
> +
> +    gem_execbuf_wr(fd, &execbuf);
> +    igt_spin_batch_free(fd, spin);
> +
> +    fence[fence_index] = execbuf.rsvd2 >> 32;
> +
> +    gem_close(fd, obj[1].handle);
> +    gem_quiescent_gpu(fd);
> +}
> +
> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> +{
> +	uint32_t ctx = gem_context_create(fd);
> +
> +	switch (ctx_prio) {
> +	case HIGH:
> +		gem_context_set_priority(fd, ctx, MAX_PRIO);
> +		igt_info("Setting MAX priority %d\n", ctx_prio);
> +		break;
> +	case LOW:
> +		gem_context_set_priority(fd, ctx, MIN_PRIO);
> +		igt_info("Setting MIN priority %d\n", ctx_prio);
> +		break;
> +	default:
> +		igt_info("Ignoring context priority %d\n", ctx_prio);
> +		break;
> +	}
> +
> +	return ctx;

Why do you need this helper which translates an integer to an integer? 
Couldn't you just pass in the real priority to start with?

> +}
> +
> +static int gem_reset_stats(int fd, int ctx_id,
> +			   struct local_drm_i915_reset_stats *rs)
> +{
> +	memset(rs, 0, sizeof(*rs));
> +	rs->ctx_id = ctx_id;
> +	rs->reset_count = -1;
> +
> +	if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
> +		printf("error:%d\n",-errno);
> +		return -errno;

Make it assert success here since all callers do that anyway and it 
matches the convention for gem prefixed ioctl helpers?

> +	}
> +	igt_assert(rs->reset_count != -1);
> +	return 0;
> +}
> +
> +static unsigned index_to_engine_id(int fd, int index)
> +{
> +	unsigned engine_id = I915_EXEC_RENDER;
> +
> +	switch (index) {
> +	case 1:
> +		engine_id = I915_EXEC_RENDER;
> +		break;
> +	case 2:
> +		if (gem_has_bsd2(fd))
> +			engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
> +		else
> +			engine_id = I915_EXEC_BSD;
> +		break;
> +	case 3:
> +		if (gem_has_bsd2(fd))
> +			engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING2;
> +		else
> +			engine_id = I915_EXEC_BSD;
> +		break;
> +	case 4:
> +		engine_id = I915_EXEC_VEBOX;
> +		break;
> +	default:
> +		break;
> +	}

I hope we can avoid this maintenance burden, will see later in patch how.

> +
> +	return engine_id;
> +}
> +
> +static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t ctx_id,  unsigned flags)
> +{
> +	igt_hang_t hang;
> +	hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
> +	gem_sync(fd, hang.spin->handle);
> +}
> +
> +static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, int prio_ctx1, int reset_ctx1, int prio_ctx2, int reset_ctx2)
> +{
> +	uint32_t ctx[48];
> +	uint32_t scratch[48];
> +	unsigned flags = HANG_ALLOW_CAPTURE;
> +	const uint64_t batch_timeout_ms = timeout_100ms * 3;

The ms suffix usually means the unit stored is milliseconds, while here it
seems to be something else.

const unsigned long batch_timeout_ms = 300;
const unsigned long batch_timeout_us = 300e3;
const unsigned long batch_timeout_ns = 300e6;

Not sure which one you want. Probably use ms and scale up at the use site.

> +	const struct intel_execution_engine2 *e;
> +	const struct intel_execution_engine2 *e2;
> +	int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
> +	int *fence = 0, active_count = 0;
> +	struct local_drm_i915_reset_stats rs;
> +
> +	igt_require(nengine);
> +
> +	fence = (int *)malloc(sizeof(int) * 48);
> +
> +	if (!fence) {
> +		igt_info("Out of memory\n");
> +		exit(1);
> +	}

Replace with igt_assert(fence).

> +
> +	/* Intializes random number generator */
> +	srand((unsigned) time(NULL));
> +
> +	/* The outer looper is ctx1 and the inner looper is ctx2
> +	 * Both contexts can set their priority, create work, and cancel themselves
> +	 */
> +	for_each_engine_class_instance(fd, e) {
> +		if ( strcmp(e->name, "bcs0") == 0 )
> +			continue;

Use -ENODEV or -EINVAL from context_set_watchdog to skip.

> +
> +		scratch[ctx_cnt] = gem_create(fd, 4096);
> +		index = rand() % 4 + 1;
> +		engine_id = index_to_engine_id(fd, index);
> +
> +		/* get current batch_active count */
> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +		active_count = rs.batch_active;
> +		igt_assert_eq(rs.batch_active, active_count);
> +
> +		ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
> +		batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
> +		context_set_watchdog(fd, engine_id, ctx[ctx_cnt], WATCHDOG_THRESHOLD);
> +		sleep(1);

What's the sleep for? If it is unavoidable it at least needs a comment.

> +
> +		if (reset_ctx1) {
> +			clear_error_state(fd);
> +			inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
> +			active_count++;
> +		}
> +
> +		/* check batch_active count once again after the hang */
> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +		igt_assert_eq(rs.batch_active, active_count);
> +
> +		ctx_cnt++;
> +		i = 1;
> +
> +		for_each_engine_class_instance(fd, e2) {
> +			if ( strcmp(e->name, "bcs0") == 0 )

e2, but remove it altogether as said above.

I don't get the purpose of the two for_each_engine_class_instance 
loops when neither e nor e2 appears to be used.

Since either I am missing something critical or the test really is very 
confused, until clarified, I am skipping a bit lower down.

> +				continue;
> +
> +			scratch[ctx_cnt] = gem_create(fd, 4096);
> +			engine_id = index_to_engine_id(fd, i);
> +
> +			/* get current batch_active count */
> +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +			active_count = rs.batch_active;
> +			igt_assert_eq(rs.batch_active, active_count);
> +
> +			ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx2);
> +			batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
> +			context_set_watchdog(fd, engine_id, ctx[ctx_cnt], WATCHDOG_THRESHOLD);
> +
> +			if (reset_ctx2) {
> +				clear_error_state(fd);
> +				inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
> +				active_count++;
> +			}
> +
> +			/* check batch_active count once again after the hang */
> +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +			igt_assert_eq(rs.batch_active, active_count);
> +
> +			ctx_cnt++;
> +			i++;
> +		}
> +	}
> +
> +	for (i = 0; i < ctx_cnt; i++) {
> +		close(fence[i]);
> +		gem_context_destroy(fd, ctx[i]);
> +		gem_close(fd, scratch[i]);
> +	}
> +}
> +
> +static void gpu_watchodg_hang_long_batch_single_engine(int fd, const char *name)
> +{
> +	uint32_t ctx[16];
> +	uint32_t scratch[16];
> +	int *fence;
> +	const struct intel_execution_engine2 *e;
> +	unsigned nengine = 0;
> +	unsigned engine_id = 0;
> +	int active_count = 0;
> +	struct local_drm_i915_reset_stats rs;
> +	int i;
> +	unsigned flags = HANG_ALLOW_CAPTURE;
> +	const uint64_t batch_timeout_ms = timeout_100ms * 4;
> +
> +	fence = (int *)malloc(sizeof(int)*16);
> +
> +	if (!fence) {
> +		igt_info("Out of memory\n");
> +		exit(1);
> +	}
> +
> +	for_each_engine_class_instance(fd, e) {
> +		if ( strcmp(e->name, "bcs0") == 0 )
> +			continue;
> +
> +		engine_id = e2ring(fd, e);
> +
> +		scratch[nengine] = gem_create(fd, 4096);
> +		ctx[nengine] = create_ctx_with_priority(fd, -1);
> +
> +		/* get current batch_active count */
> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +		active_count = rs.batch_active;
> +		igt_assert_eq(rs.batch_active, active_count);
> +
> +		/* create some work on this engine using the above ctx*/
> +		batch_buffer_factory(fd, ctx[nengine], engine_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
> +
> +		/* set the gpu watchdog timeout */
> +		context_set_watchdog(fd, engine_id, ctx[nengine], WATCHDOG_THRESHOLD);
> +		clear_error_state(fd);
> +
> +		/*
> +		 * give enough time for the batch buffer to run
> +		 */
> +		nanosleep(&(struct timespec){ .tv_sec = 0,
> +					      .tv_nsec = batch_timeout_ms * 2 }, NULL);
> +
> +		/* create some work on this engine using the above ctx*/
> +		batch_buffer_factory(fd, ctx[nengine], engine_id, scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
> +		context_set_watchdog(fd, engine_id, ctx[nengine], WATCHDOG_THRESHOLD);
> +		clear_error_state(fd);
> +
> +		/* cancel only the batch requested */
> +		if ( strcmp(e->name, name) == 0 ) {
> +			inject_hang(fd, engine_id, ctx[nengine], flags);
> +			active_count++;
> +		}
> +
> +		/* check batch_active count once again after the hang */
> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> +		active_count = rs.batch_active;
> +		igt_assert_eq(rs.batch_active, active_count);
> +
> +		nengine++;
> +	}
> +
> +	for (i = 0; i < nengine; i++) {
> +		close(fence[i]);
> +		gem_context_destroy(fd, ctx[i]);
> +		gem_close(fd, scratch[i]);
> +	}
> +}
> +
> +igt_main
> +{
> +	int fd;
> +	unsigned int nengine = 0;
> +	unsigned int engine;
> +	const struct intel_execution_engine2 *e;
> +
> +	igt_skip_on_simulation();
> +
> +	igt_fixture {
> +		fd = drm_open_driver(DRIVER_INTEL);
> +		igt_require_gem(fd);
> +
> +		for_each_physical_engine(fd, engine)
> +			nengine++;
> +		igt_require(nengine);

Do we have GPUs with no engines?

> +	}
> +
> +	igt_subtest_group {
> +
> +		igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-reset-ctx2-high-prio-execute") {
> +			int prio_ctx1 = LOW;
> +			int prio_ctx2 = HIGH;
> +			int reset_ctx1 = TRUE;
> +			int reset_ctx2 = FALSE;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> +		}

This is kind of verbose. I recommend our usual pattern along the lines of:

struct {
	char *name;
	int prio[2];
	bool reset[2];
} tests[] = {
	{ "change me", {0, 0}, {false, false} },
	{ "change me", {-1, 0}, {true, false} },
	...
};

for (i = 0; i < ARRAY_SIZE(tests); i++) {
	igt_subtest_f("long-batch-%s", tests[i].name)
		long_batch(fd, tests[i].prio[0], tests[i].reset[0], ...);
}

So you run all subtests from a single loop.

Regards,

Tvrtko

> +
> +		igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-reset-ctx2-low-prio-execute") {
> +			int prio_ctx1 = HIGH;
> +			int prio_ctx2 = LOW;
> +			int reset_ctx1 = TRUE;
> +			int reset_ctx2 = FALSE;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> +		}
> +
> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-execute") {
> +			int prio_ctx1 = -1;
> +			int prio_ctx2 = -1;
> +			int reset_ctx1 = TRUE;
> +			int reset_ctx2 = FALSE;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> +		}
> +
> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-reset") {
> +			int prio_ctx1 = -1;
> +			int prio_ctx2 = -1;
> +			int reset_ctx1 = FALSE;
> +			int reset_ctx2 = TRUE;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> +		}
> +
> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-execute") {
> +			int prio_ctx1 = -1;
> +			int prio_ctx2 = -1;
> +			int reset_ctx1 = FALSE;
> +			int reset_ctx2 = FALSE;
> +			gpu_watchdog_long_batch_2_contexts(fd, nengine, prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> +		}
> +
> +	       __for_each_engine_class_instance(e) {
> +			igt_subtest_group {
> +				igt_fixture {
> +				gem_require_engine(fd,
> +					e->class,
> +					e->instance);
> +				}
> +			/* no support for gpu watchdog on bcs0 */
> +			if (strcmp(e->name, "bcs0") == 0)
> +				continue;

Software implementation can support it. So don't skip here but skip in 
the subtest if you get -ENODEV when you try to configure it.

> +
> +		            igt_subtest_f("gpu-watchdog-long-batch-single-engine-reset-%s", e->name)

Don't prefix all tests with gpu-watchdog, it is very verbose to read and 
the test itself is called gem_watchdog already.

> +		                gpu_watchodg_hang_long_batch_single_engine(fd, e->name);
> +			}
> +		}
> +	}
> +
> +    igt_fixture {
> +	close(fd);
> +    }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 5167a6c..b281b75 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -210,6 +210,7 @@ i915_progs = [
>   	'gem_unref_active_buffers',
>   	'gem_userptr_blits',
>   	'gem_wait',
> +        'gem_watchdog',
>   	'gem_workarounds',
>   	'gem_write_read_ring_switch',
>   	'i915_fb_tiling',
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-23 16:25 ` Tvrtko Ursulin
@ 2019-04-23 16:31   ` Tvrtko Ursulin
  2019-04-24  2:31   ` Carlos Santa
  1 sibling, 0 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2019-04-23 16:31 UTC (permalink / raw)
  To: Carlos Santa, igt-dev; +Cc: Ursulin Tvrtko


On 23/04/2019 17:25, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> How is the test doing against the code in progress? Any issues 
> discovered etc?
> 
> In general it looks a bit untidy. Coding style is off, alignment is 
> random etc.. so a bit difficult to read. But I'll make a pass and see if 
> I can make sense of it. I won't leave comments on coding style and 
> formatting since there are too many. But please do fix that before 
> re-posting.
> 
> On 23/04/2019 01:05, Carlos Santa wrote:
>> This test adds basic set of tests to reset the different
>> GPU engines through the gpu watchdog timer.
>>
>> Credits to Antonio for the original codebase this is based on.
>>
>> v2: remove gem_context_get_param() during set (Antonio)
>>      remove clearing of the engines_threshold[] in the default case
>>      inside context_set_watchdog(). (Antonio)
>>      fix indexing when creating low/high priority contexts
>>      get rid of 2 threads idea (Antonio)
>>      fix context prio bug due to wrong indexing (Antonio)
>> v3: no need to force clear drm_i915_gem_watchdog_timeout struct
>>      when setting the watchdog timeouts (Antonio)
>>      -use gem_context_set_priority() instead to use error checking
>>      -always use a looper to test all engines (Antonio)
>>      -modify test gpu_watchodg_hang_long_batch_single_engine()
>>      to use a couple of batches and set the timeout in between them 
>> (Antonio)
>>      -make use of for_each_engine_class_instance() (Antonio)
>>
>> Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
>> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
>> ---
>>   tests/Makefile.sources    |   3 +
>>   tests/i915/gem_watchdog.c | 517 
>> ++++++++++++++++++++++++++++++++++++++++++++++
>>   tests/meson.build         |   1 +
>>   3 files changed, 521 insertions(+)
>>   create mode 100644 tests/i915/gem_watchdog.c
>>
>> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
>> index 214698d..7f17f20 100644
>> --- a/tests/Makefile.sources
>> +++ b/tests/Makefile.sources
>> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
>>   TESTS_progs += gem_wait
>>   gem_wait_SOURCES = i915/gem_wait.c
>> +TESTS_progs += gem_watchdog
>> +gem_watchdog_SOURCES = i915/gem_watchdog.c
>> +
>>   TESTS_progs += gem_workarounds
>>   gem_workarounds_SOURCES = i915/gem_workarounds.c
>> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
>> new file mode 100644
>> index 0000000..8f789ab
>> --- /dev/null
>> +++ b/tests/i915/gem_watchdog.c
>> @@ -0,0 +1,517 @@
>> +/*
>> + * Copyright © 2016 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>> + * copy of this software and associated documentation files (the 
>> "Software"),
>> + * to deal in the Software without restriction, including without 
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, 
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including 
>> the next
>> + * paragraph) shall be included in all copies or substantial portions 
>> of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT 
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + */
>> +#include "igt.h"
>> +#include "igt_sysfs.h"
>> +#include "sw_sync.h"
>> +
>> +#include <pthread.h>
>> +#include <fcntl.h>
>> +
>> +#include <sys/ioctl.h>
>> +#include <sys/poll.h>
>> +#include <sys/signal.h>
>> +#include "i915/gem_ring.h"
>> +
>> +#define LOCAL_I915_EXEC_BSD_SHIFT    (13)
>> +#define LOCAL_I915_EXEC_BSD_RING1     (1 << LOCAL_I915_EXEC_BSD_SHIFT)
>> +#define LOCAL_I915_EXEC_BSD_RING2     (2 << LOCAL_I915_EXEC_BSD_SHIFT)
> 
> These are in old so no need for local definitions.
> 
>> +
>> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
>> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
>> +#define HIGH 1
>> +#define LOW 0
>> +#define TRUE 1
>> +#define FALSE 0
> 
> true and false are fair game in igt.
> 
>> +#define WATCHDOG_THRESHOLD (100)
>> +#define MAX_ENGINES 5
>> +#define RENDER_CLASS 0
>> +#define VIDEO_DECODE_CLASS 1
>> +#define VIDEO_ENHANCEMENT_CLASS 2
>> +#define COPY_ENGINE_CLASS 3
> 
> Classes are in i915_drm.h as well.
> 
>> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
>> +
>> +#define RS_NO_ERROR      0
>> +#define RS_BATCH_ACTIVE  (1 << 0)
>> +#define RS_BATCH_PENDING (1 << 1)
>> +#define RS_UNKNOWN       (1 << 2)
> 
> What are these? Look unused in code.
> 
>> +
>> +#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32, 
>> struct local_drm_i915_reset_stats)
>> +struct local_drm_i915_reset_stats {
>> +    __u32 ctx_id;
>> +    __u32 flags;
>> +    __u32 reset_count;
>> +    __u32 batch_active;
>> +    __u32 batch_pending;
>> +    __u32 pad;
>> +};
>> +
>> +const uint64_t timeout_100ms = 100000000LL;
>> +int num;
>> +
>> +struct drm_i915_gem_watchdog_timeout {
>> +    union {
>> +        struct {
>> +            /*
>> +             * Engine class & instance to be configured or queried.
>> +             */
>> +            __u16 engine_class;
>> +            __u16 engine_instance;
>> +        };
>> +        /* Index based addressing mode */
>> +        __u32 index;
>> +    };
>> +    /* GPU Engine watchdog resets timeout in us */
>> +    __u32 timeout_us;
>> +};
>> +
>> +static void clear_error_state(int fd)
>> +{
>> +    int dir;
>> +
>> +    dir = igt_sysfs_open(fd);
>> +
>> +    if (dir < 0)
>> +        return;
> 
> igt_require(dir >= 0) if test relies on this being present?
> 
>> +
>> +    /* Any write to the error state clears it */
>> +    igt_sysfs_set(dir, "error", "");
>> +    close(dir);
>> +}
>> +
>> +static unsigned int e2ring(int fd, const struct 
>> intel_execution_engine2 *e)
>> +{
>> +    return gem_class_instance_to_eb_flags(fd, e->class, e->instance);
>> +}
>> +
>> +static void context_set_watchdog(int fd, unsigned engine_id,
>> +                                 unsigned ctx_id, unsigned threshold)
>> +{
>> +    struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
>> +    struct drm_i915_gem_context_param arg = {
>> +        .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
>> +        .ctx_id = ctx_id,
>> +        .size = sizeof(engines_threshold),
>> +        .value = (uint64_t)&engines_threshold
>> +    };
>> +
>> +    memset(&engines_threshold, 0, sizeof(engines_threshold));
>> +
>> +    switch (engine_id & I915_EXEC_RING_MASK) {
>> +    case I915_EXEC_RENDER:
>> +        engines_threshold[RENDER_CLASS].timeout_us = threshold;
> 
> Looks like uAPI is in some interim state? I was expecting to see here 
> something like:
> 
>      engines_threshold.engine_class = RENDER_CLASS;
>      engines_threshold.engine_instance = 0;
>      engines_threshold.timeout_us = threshold;
> 
> But you seem to be passing an array of structs which carry 
> class/instance data, don't use it but instead infer the class from the 
> position in the array.
> 
>> +        break;
>> +    case I915_EXEC_BSD:
>> +        engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
>> +        break;
>> +    case I915_EXEC_VEBOX:
>> +        engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 
>> threshold;
>> +        break;
>> +    default:
>> +        break;
>> +    }
>> +
>> +    gem_context_set_param(fd, &arg);
>> +}
>> +
>> +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id, 
>> unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, 
>> uint64_t timeout, int *fence, int fence_index)
>> +{
>> +    struct drm_i915_gem_exec_object2 obj[2];
>> +    struct drm_i915_gem_relocation_entry reloc;
>> +    struct drm_i915_gem_execbuffer2 execbuf;
>> +    igt_spin_t *spin = NULL;
>> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +    int i = 0;
>> +
>> +    gem_quiescent_gpu(fd);
>> +
>> +    memset(&execbuf, 0, sizeof(execbuf));
>> +    memset(&obj, 0, sizeof(obj));
>> +    memset(&reloc, 0, sizeof(reloc));
>> +
>> +    execbuf.buffers_ptr = to_user_pointer(obj);
>> +
>> +    execbuf.buffer_count = 2;
>> +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
>> +
>> +    obj[0].handle = target;
>> +    obj[1].handle = gem_create(fd, 4096);
>> +
>> +    obj[1].relocation_count = 1;
>> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
>> +
>> +    reloc.target_handle = obj[0].handle;
>> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
>> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
>> +    reloc.delta = offset * sizeof(uint32_t);
>> +
>> +    reloc.offset = i * sizeof(uint32_t);
>> +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
>> +
>> +    __sync_synchronize();
>> +
>> +    if (handle) {
>> +        *handle = obj[1].handle;
>> +        return;
> 
> Is this path used by the callers?
> 
>> +    }
>> +
>> +    gem_sync(fd, obj[1].handle);
>> +    execbuf.rsvd1 = ctx_id;
>> +    execbuf.rsvd2 = -1;
>> +
>> +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
>> +    igt_spin_batch_set_timeout(spin, timeout);
>> +    igt_assert(gem_bo_busy(fd, obj[0].handle));
>> +
>> +    gem_execbuf_wr(fd, &execbuf);
>> +    igt_spin_batch_free(fd, spin);
>> +
>> +    fence[fence_index] = execbuf.rsvd2 >> 32;
>> +
>> +    gem_close(fd, obj[1].handle);
>> +    gem_quiescent_gpu(fd);
>> +}
>> +
>> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
>> +{
>> +    uint32_t ctx = gem_context_create(fd);
>> +
>> +    switch (ctx_prio) {
>> +    case HIGH:
>> +        gem_context_set_priority(fd, ctx, MAX_PRIO);
>> +        igt_info("Setting MAX priority %d\n", ctx_prio);
>> +        break;
>> +    case LOW:
>> +        gem_context_set_priority(fd, ctx, MIN_PRIO);
>> +        igt_info("Setting MIN priority %d\n", ctx_prio);
>> +        break;
>> +    default:
>> +        igt_info("Ignoring context priority %d\n", ctx_prio);
>> +        break;
>> +    }
>> +
>> +    return ctx;
> 
> Why do you need this helper which translates an integer to an integer? 
> Couldn't you just pass in the real priority to start with?
> 
>> +}
>> +
>> +static int gem_reset_stats(int fd, int ctx_id,
>> +               struct local_drm_i915_reset_stats *rs)
>> +{
>> +    memset(rs, 0, sizeof(*rs));
>> +    rs->ctx_id = ctx_id;
>> +    rs->reset_count = -1;
>> +
>> +    if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
>> +        printf("error:%d\n",-errno);
>> +        return -errno;
> 
> Make it assert success here since all callers do that anyway and it 
> matches the convention for gem prefixed ioctl helpers?
> 
>> +    }
>> +    igt_assert(rs->reset_count != -1);
>> +    return 0;
>> +}
>> +
>> +static unsigned index_to_engine_id(int fd, int index)
>> +{
>> +    unsigned engine_id = I915_EXEC_RENDER;
>> +
>> +    switch (index) {
>> +    case 1:
>> +        engine_id = I915_EXEC_RENDER;
>> +        break;
>> +    case 2:
>> +        if (gem_has_bsd2(fd))
>> +            engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
>> +        else
>> +            engine_id = I915_EXEC_BSD;
>> +        break;
>> +    case 3:
>> +        if (gem_has_bsd2(fd))
>> +            engine_id = I915_EXEC_BSD | I915_EXEC_BSD_RING2;
>> +        else
>> +            engine_id = I915_EXEC_BSD;
>> +        break;
>> +    case 4:
>> +        engine_id = I915_EXEC_VEBOX;
>> +        break;
>> +    default:
>> +        break;
>> +    }
> 
> I hope we can avoid this maintenance burden, will see later in patch how.
> 
>> +
>> +    return engine_id;
>> +}
>> +
>> +static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t 
>> ctx_id,  unsigned flags)
>> +{
>> +    igt_hang_t hang;
>> +    hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
>> +    gem_sync(fd, hang.spin->handle);
>> +}
>> +
>> +static void gpu_watchdog_long_batch_2_contexts(int fd, int nengine, 
>> int prio_ctx1, int reset_ctx1, int prio_ctx2, int reset_ctx2)
>> +{
>> +    uint32_t ctx[48];
>> +    uint32_t scratch[48];
>> +    unsigned flags = HANG_ALLOW_CAPTURE;
>> +    const uint64_t batch_timeout_ms = timeout_100ms * 3;
> 
> ms suffix usually means the unit stored is ms. While here it seems to be 
> something else.
> 
> const unsigned long batch_timeout_ms = 300;
> const unsigned long batch_timeout_us = 300e3;
> const unsigned long batch_timeout_ns = 300e6;
> 
> Not sure which one you want. Probably use ms and scale up at the use site.
> 
>> +    const struct intel_execution_engine2 *e;
>> +    const struct intel_execution_engine2 *e2;
>> +    int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
>> +    int *fence = 0, active_count = 0;
>> +    struct local_drm_i915_reset_stats rs;
>> +
>> +    igt_require(nengine);
>> +
>> +    fence = (int *)malloc(sizeof(int) * 48);
>> +
>> +    if (!fence) {
>> +        igt_info("Out of memory\n");
>> +        exit(1);
>> +    }
> 
> Replace with igt_assert(fence).
> 
>> +
>> +    /* Intializes random number generator */
>> +    srand((unsigned) time(NULL));
>> +
>> +    /* The outer looper is ctx1 and the inner looper is ctx2
>> +     * Both contexts can set their priority, create work, and cancel 
>> themselves
>> +     */
>> +    for_each_engine_class_instance(fd, e) {
>> +        if ( strcmp(e->name, "bcs0") == 0 )
>> +            continue;
> 
> Use -ENODEV or -EINVAL from context_set_watchdog to skip.
> 
>> +
>> +        scratch[ctx_cnt] = gem_create(fd, 4096);
>> +        index = rand() % 4 + 1;
>> +        engine_id = index_to_engine_id(fd, index);
>> +
>> +        /* get current batch_active count */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
>> +        batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, 
>> scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
>> +        context_set_watchdog(fd, engine_id, ctx[ctx_cnt], 
>> WATCHDOG_THRESHOLD);
>> +        sleep(1);
> 
> What's the sleep for? If unavoidable at least needs a comment.
> 
>> +
>> +        if (reset_ctx1) {
>> +            clear_error_state(fd);
>> +            inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
>> +            active_count++;
>> +        }
>> +
>> +        /* check batch_active count once again after the hang */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        ctx_cnt++;
>> +        i = 1;
>> +
>> +        for_each_engine_class_instance(fd, e2) {
>> +            if ( strcmp(e->name, "bcs0") == 0 )
> 
> e2, but remove it altogether as said above.
> 
> I don't get what is the purpose of two for_each_engine_class_instance 
> loops when neither e or e2 appear to be used?
> 
> Since either I am missing something critical or the test really is very 
> confused, until clarified, I am skipping a bit lower down.
> 
>> +                continue;
>> +
>> +            scratch[ctx_cnt] = gem_create(fd, 4096);
>> +            engine_id = index_to_engine_id(fd, i);
>> +
>> +            /* get current batch_active count */
>> +            igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +            active_count = rs.batch_active;
>> +            igt_assert_eq(rs.batch_active, active_count);
>> +
>> +            ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx2);
>> +            batch_buffer_factory(fd, ctx[ctx_cnt], engine_id, 
>> scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
>> +            context_set_watchdog(fd, engine_id, ctx[ctx_cnt], 
>> WATCHDOG_THRESHOLD);
>> +
>> +            if (reset_ctx2) {
>> +                clear_error_state(fd);
>> +                inject_hang(fd, engine_id, ctx[ctx_cnt], flags);
>> +                active_count++;
>> +            }
>> +
>> +            /* check batch_active count once again after the hang */
>> +            igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +            igt_assert_eq(rs.batch_active, active_count);
>> +
>> +            ctx_cnt++;
>> +            i++;
>> +        }
>> +    }
>> +
>> +    for (i = 0; i < ctx_cnt; i++) {
>> +        close(fence[i]);
>> +        gem_context_destroy(fd, ctx[i]);
>> +        gem_close(fd, scratch[i]);
>> +    }
>> +}
>> +
>> +static void gpu_watchodg_hang_long_batch_single_engine(int fd, const 
>> char *name)
>> +{
>> +    uint32_t ctx[16];
>> +    uint32_t scratch[16];
>> +    int *fence;
>> +    const struct intel_execution_engine2 *e;
>> +    unsigned nengine = 0;
>> +    unsigned engine_id = 0;
>> +    int active_count = 0;
>> +    struct local_drm_i915_reset_stats rs;
>> +    int i;
>> +    unsigned flags = HANG_ALLOW_CAPTURE;
>> +    const uint64_t batch_timeout_ms = timeout_100ms * 4;
>> +
>> +    fence = (int *)malloc(sizeof(int)*16);
>> +
>> +    if (!fence) {
>> +        igt_info("Out of memory\n");
>> +        exit(1);
>> +    }
>> +
>> +    for_each_engine_class_instance(fd, e) {
>> +        if ( strcmp(e->name, "bcs0") == 0 )
>> +            continue;
>> +
>> +        engine_id = e2ring(fd, e);
>> +
>> +        scratch[nengine] = gem_create(fd, 4096);
>> +        ctx[nengine] = create_ctx_with_priority(fd, -1);
>> +
>> +        /* get current batch_active count */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        /* create some work on this engine using the above ctx*/
>> +        batch_buffer_factory(fd, ctx[nengine], engine_id, 
>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>> +
>> +        /* set the gpu watchdog timeout */
>> +        context_set_watchdog(fd, engine_id, ctx[nengine], 
>> WATCHDOG_THRESHOLD);
>> +        clear_error_state(fd);
>> +
>> +        /*
>> +         * give enough time for the batch buffer to run
>> +         */
>> +        nanosleep(&(struct timespec){ .tv_sec = 0,
>> +                          .tv_nsec = batch_timeout_ms * 2 }, NULL);
>> +
>> +        /* create some work on this engine using the above ctx*/
>> +        batch_buffer_factory(fd, ctx[nengine], engine_id, 
>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>> +        context_set_watchdog(fd, engine_id, ctx[nengine], 
>> WATCHDOG_THRESHOLD);
>> +        clear_error_state(fd);
>> +
>> +        /* cancel only the batch requested */
>> +        if ( strcmp(e->name, name) == 0 ) {
>> +            inject_hang(fd, engine_id, ctx[nengine], flags);
>> +            active_count++;
>> +        }
>> +
>> +        /* check batch_active count once again after the hang */
>> +        igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>> +        active_count = rs.batch_active;
>> +        igt_assert_eq(rs.batch_active, active_count);
>> +
>> +        nengine++;
>> +    }
>> +
>> +    for (i = 0; i < nengine; i++) {
>> +        close(fence[i]);
>> +        gem_context_destroy(fd, ctx[i]);
>> +        gem_close(fd, scratch[i]);
>> +    }
>> +}
>> +
>> +igt_main
>> +{
>> +    int fd;
>> +    unsigned int nengine = 0;
>> +    unsigned int engine;
>> +    const struct intel_execution_engine2 *e;
>> +
>> +    igt_skip_on_simulation();
>> +
>> +    igt_fixture {
>> +        fd = drm_open_driver(DRIVER_INTEL);
>> +        igt_require_gem(fd);
>> +
>> +        for_each_physical_engine(fd, engine)
>> +            nengine++;
>> +        igt_require(nengine);
> 
> Do we have GPUs with no engines?
> 
>> +    }
>> +
>> +    igt_subtest_group {
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-reset-ctx2-high-prio-execute") 
>> {
>> +            int prio_ctx1 = LOW;
>> +            int prio_ctx2 = HIGH;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
> 
> This is kind of verbose. I recommend out usual pattern along the lines of:
> 
> struct {
>      char *name;
>      int prio[2];
>      bool reset[2];
> } tests[] = {
>      { "change me", {0, 0}, {false, false} },
>      { "change me", {-1, 0}, {true, false} },
>      ...
> };
> 
> for (i = 0; i < ARRAY_SIZE(tests); i++) {
>      igt_subtest_f("long-batch-%s", tests[i]->name)
>          long_batch(fd, tests[i].prio[0], tests[i].ctx[0], ...);
> }
> 
> So you run all subtests from a single loop.

Closed the email too early; there were more comments below:

>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-reset-ctx2-low-prio-execute") 
>> {
>> +            int prio_ctx1 = HIGH;
>> +            int prio_ctx2 = LOW;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-execute") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = TRUE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-reset") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = FALSE;
>> +            int reset_ctx2 = TRUE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +        
>> igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-ctx2-execute") {
>> +            int prio_ctx1 = -1;
>> +            int prio_ctx2 = -1;
>> +            int reset_ctx1 = FALSE;
>> +            int reset_ctx2 = FALSE;
>> +            gpu_watchdog_long_batch_2_contexts(fd, nengine, 
>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>> +        }
>> +
>> +           __for_each_engine_class_instance(e) {
>> +            igt_subtest_group {
>> +                igt_fixture {
>> +                gem_require_engine(fd,
>> +                    e->class,
>> +                    e->instance);
>> +                }
>> +            /* no support for gpu watchdog on bcs0 */
>> +            if (strcmp(e->name, "bcs0") == 0)
>> +                continue;
> 
> Software implementation can support it. So don't skip here but skip in 
> the subtest if you get -ENODEV when you try to configure it.

Here ^

>> +
>> +                    
>> igt_subtest_f("gpu-watchdog-long-batch-single-engine-reset-%s", e->name)
> 
> Don't prefix all tests with gpu-watchdog, it is very verbose to read and 
> the test itself is called gem_watchdog already.

And here ^

I did not really get to dig into the test case coverage due to all the 
noise and confusion with engine iteration.

Regards,

Tvrtko

>> +                        
>> gpu_watchodg_hang_long_batch_single_engine(fd, e->name);
>> +            }
>> +        }
>> +    }
>> +
>> +    igt_fixture {
>> +    close(fd);
>> +    }
>> +}
>> diff --git a/tests/meson.build b/tests/meson.build
>> index 5167a6c..b281b75 100644
>> --- a/tests/meson.build
>> +++ b/tests/meson.build
>> @@ -210,6 +210,7 @@ i915_progs = [
>>       'gem_unref_active_buffers',
>>       'gem_userptr_blits',
>>       'gem_wait',
>> +        'gem_watchdog',
>>       'gem_workarounds',
>>       'gem_write_read_ring_switch',
>>       'i915_fb_tiling',
>>
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-23 16:25 ` Tvrtko Ursulin
  2019-04-23 16:31   ` Tvrtko Ursulin
@ 2019-04-24  2:31   ` Carlos Santa
  2019-04-24  8:21     ` Tvrtko Ursulin
  1 sibling, 1 reply; 7+ messages in thread
From: Carlos Santa @ 2019-04-24  2:31 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Ursulin Tvrtko

On Tue, 2019-04-23 at 17:25 +0100, Tvrtko Ursulin wrote:
> Hi,
> 
> How is the test doing against the code in progress? Any issues 
> discovered etc?

Test					Result

single-engine-reset-rcs0		PASS
single-engine-reset-vcs0		PASS
single-engine-reset-vcs1		PASS
single-engine-reset-vecs0		PASS
ctx1-execute-ctx2-execute		PASS
ctx1-execute-ctx2-reset			PASS
ctx1-reset-ctx2-reset			PASS
ctx2-high-prio-reset-ctx2-low-execute	PASS
ctx2-low-prio-reset-ctx2-high-execute   FAIL

There seems to be a problem with preemption, as expected: the test
simply gets stuck, and the default reset then kicks in. I also removed
the dummy sleep(1) as you suggested; the results still match the output
above.

About the tests

1. The "single-engine" tests basically reset one engine each; I use
the "for_each_engine_class_instance" looper to simply iterate over the
different engines. Given that the option to specify which engine to
reset is not a hard requirement, I could simply loop through all of
them at once.

Also, on this test I took Antonio's suggestion where:

        - Submit work from context A
        - Set a non 0 threshold on context A
        - Submit more work on context A

And expect that the work submitted before the threshold was set is not 
reset, while the work sent afterwards is.

Then I assert if the "batch_active" doesn't match my own counter after
each reset:

	igt_assert_eq(rs.batch_active, active_count);


2. About the "long_batch_2_contexts" test case. The idea here is that
the outer looper is "ctx1" and the inner looper is "ctx2". The outer
loop randomly generates an engine id and the inner loop simply increments
engines 1 through 4, then based on the input parameters we can let it
execute, cancel, and set the priority.

For example if the test says "ctx1-execute-ctx2-cancel" then this means
the outer loop set the watchdog and executes while the inner loop sets
the watchdog and cancels the execution, but since I am looping through
all of the engines, then it will do that for all of them:

engine1 - executes
	engine1 - cancel
	engine2 - cancel
	engine3 - cancel
	engine4 - cancel
engine4 - executes
	engine1 - cancel
	engine2 - cancel
	...

v2 of the series was hardcoding the engines to say vecs0 and rcs0 but
Antonio suggested that we could iterate through the different engines.

Regards,
Carlos


> 
> In general it looks a bit untidy. Coding style is off, alignment is 
> random etc.. so a bit difficult to read. But I'll make a pass and see
> if 
> I can make sense of it. I won't leave comments on coding style and 
> formatting since there are too many. But please do fix that before 
> re-posting.
> 
> On 23/04/2019 01:05, Carlos Santa wrote:
> > This test adds basic set of tests to reset the different
> > GPU engines through the gpu watchdog timer.
> > 
> > Credits to Antonio for the original codebase this is based on.
> > 
> > v2: remove gem_context_get_param() during set (Antonio)
> >      remove clearing of the engines_threshold[] in the default case
> >      inside context_set_watchdog(). (Antonio)
> >      fix indexing when creating low/high priority contexts
> >      get rid of 2 threads idea (Antonio)
> >      fix context prio bug due to wrong indexing (Antonio)
> > v3: no need to force clear drm_i915_gem_watchdog_timeout struct
> >      when setting the watchdog timeouts (Antonio)
> >      -use gem_context_set_priority() instead to use error checking
> >      -always use a looper to test all engines (Antonio)
> >      -modify test gpu_watchodg_hang_long_batch_single_engine()
> >      to use a couple of batches and set the timeout in between them
> > (Antonio)
> >      -make use of for_each_engine_class_instance() (Antonio)
> > 
> > Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
> > Cc: Antonio Argenziano <antonio.argenziano@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Carlos Santa <carlos.santa@intel.com>
> > ---
> >   tests/Makefile.sources    |   3 +
> >   tests/i915/gem_watchdog.c | 517
> > ++++++++++++++++++++++++++++++++++++++++++++++
> >   tests/meson.build         |   1 +
> >   3 files changed, 521 insertions(+)
> >   create mode 100644 tests/i915/gem_watchdog.c
> > 
> > diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> > index 214698d..7f17f20 100644
> > --- a/tests/Makefile.sources
> > +++ b/tests/Makefile.sources
> > @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES =
> > i915/gem_userptr_blits.c
> >   TESTS_progs += gem_wait
> >   gem_wait_SOURCES = i915/gem_wait.c
> >   
> > +TESTS_progs += gem_watchdog
> > +gem_watchdog_SOURCES = i915/gem_watchdog.c
> > +
> >   TESTS_progs += gem_workarounds
> >   gem_workarounds_SOURCES = i915/gem_workarounds.c
> >   
> > diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> > new file mode 100644
> > index 0000000..8f789ab
> > --- /dev/null
> > +++ b/tests/i915/gem_watchdog.c
> > @@ -0,0 +1,517 @@
> > +/*
> > + * Copyright © 2016 Intel Corporation
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> > obtaining a
> > + * copy of this software and associated documentation files (the
> > "Software"),
> > + * to deal in the Software without restriction, including without
> > limitation
> > + * the rights to use, copy, modify, merge, publish, distribute,
> > sublicense,
> > + * and/or sell copies of the Software, and to permit persons to
> > whom the
> > + * Software is furnished to do so, subject to the following
> > conditions:
> > + *
> > + * The above copyright notice and this permission notice
> > (including the next
> > + * paragraph) shall be included in all copies or substantial
> > portions of the
> > + * Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> > EVENT SHALL
> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
> > DAMAGES OR OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > ARISING
> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > OTHER DEALINGS
> > + * IN THE SOFTWARE.
> > + */
> > +#include "igt.h"
> > +#include "igt_sysfs.h"
> > +#include "sw_sync.h"
> > +
> > +#include <pthread.h>
> > +#include <fcntl.h>
> > +
> > +#include <sys/ioctl.h>
> > +#include <sys/poll.h>
> > +#include <sys/signal.h>
> > +#include "i915/gem_ring.h"
> > +
> > +#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
> > +#define LOCAL_I915_EXEC_BSD_RING1 	(1 <<
> > LOCAL_I915_EXEC_BSD_SHIFT)
> > +#define LOCAL_I915_EXEC_BSD_RING2 	(2 <<
> > LOCAL_I915_EXEC_BSD_SHIFT)
> 
> These are in old so no need for local definitions.
> 
> > +
> > +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> > +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> > +#define HIGH 1
> > +#define LOW 0
> > +#define TRUE 1
> > +#define FALSE 0
> 
> true and false are fair game in igt.
> 
> > +#define WATCHDOG_THRESHOLD (100)
> > +#define MAX_ENGINES 5
> > +#define RENDER_CLASS 0
> > +#define VIDEO_DECODE_CLASS 1
> > +#define VIDEO_ENHANCEMENT_CLASS 2
> > +#define COPY_ENGINE_CLASS 3
> 
> Classes are in i915_drm.h as well.
> 
> > +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> > +
> > +#define RS_NO_ERROR      0
> > +#define RS_BATCH_ACTIVE  (1 << 0)
> > +#define RS_BATCH_PENDING (1 << 1)
> > +#define RS_UNKNOWN       (1 << 2)
> 
> What are these? Look unused in code.
> 
> > +
> > +#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32,
> > struct local_drm_i915_reset_stats)
> > +struct local_drm_i915_reset_stats {
> > +	__u32 ctx_id;
> > +	__u32 flags;
> > +	__u32 reset_count;
> > +	__u32 batch_active;
> > +	__u32 batch_pending;
> > +	__u32 pad;
> > +};
> > +
> > +const uint64_t timeout_100ms = 100000000LL;
> > +int num;
> > +
> > +struct drm_i915_gem_watchdog_timeout {
> > +	union {
> > +		struct {
> > +			/*
> > +			 * Engine class & instance to be configured or
> > queried.
> > +			 */
> > +			__u16 engine_class;
> > +			__u16 engine_instance;
> > +		};
> > +		/* Index based addressing mode */
> > +		__u32 index;
> > +	};
> > +	/* GPU Engine watchdog resets timeout in us */
> > +	__u32 timeout_us;
> > +};
> > +
> > +static void clear_error_state(int fd)
> > +{
> > +	int dir;
> > +
> > +	dir = igt_sysfs_open(fd);
> > +
> > +	if (dir < 0)
> > +		return;
> 
> igt_require(dir >= 0) if test relies on this being present?
> 
> > +
> > +	/* Any write to the error state clears it */
> > +	igt_sysfs_set(dir, "error", "");
> > +	close(dir);
> > +}
> > +
> > +static unsigned int e2ring(int fd, const struct
> > intel_execution_engine2 *e)
> > +{
> > +    return gem_class_instance_to_eb_flags(fd, e->class, e-
> > >instance);
> > +}
> > +
> > +static void context_set_watchdog(int fd, unsigned engine_id,
> > +                                 unsigned ctx_id, unsigned
> > threshold)
> > +{
> > +	struct drm_i915_gem_watchdog_timeout
> > engines_threshold[MAX_ENGINES];
> > +	struct drm_i915_gem_context_param arg = {
> > +		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> > +		.ctx_id = ctx_id,
> > +		.size = sizeof(engines_threshold),
> > +		.value = (uint64_t)&engines_threshold
> > +	};
> > +
> > +	memset(&engines_threshold, 0, sizeof(engines_threshold));
> > +
> > +	switch (engine_id & I915_EXEC_RING_MASK) {
> > +	case I915_EXEC_RENDER:
> > +		engines_threshold[RENDER_CLASS].timeout_us = threshold;
> 
> Looks like uAPI is in some interim state? I was expecting to see
> here 
> something like:
> 
> 	engines_threshold.engine_class = RENDER_CLASS;
> 	engines_threshold.engine_instance = 0;
> 	engines_threshold.timeout_us = threshold;
> 
> But you seem to be passing an array of structs which carry 
> class/instance data, don't use it but instead infer the class from
> the 
> position in the array.
> 
> > +		break;
> > +	case I915_EXEC_BSD:
> > +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us =
> > threshold;
> > +		break;
> > +	case I915_EXEC_VEBOX:
> > +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us =
> > threshold;
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	gem_context_set_param(fd, &arg);
> > +}
> > +
> > +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id,
> > unsigned exec_id, uint32_t target, uint32_t offset, uint32_t
> > *handle, uint64_t timeout, int *fence, int fence_index)
> > +{
> > +    struct drm_i915_gem_exec_object2 obj[2];
> > +    struct drm_i915_gem_relocation_entry reloc;
> > +    struct drm_i915_gem_execbuffer2 execbuf;
> > +    igt_spin_t *spin = NULL;
> > +    const uint32_t bbe = MI_BATCH_BUFFER_END;
> > +    int i = 0;
> > +
> > +    gem_quiescent_gpu(fd);
> > +
> > +    memset(&execbuf, 0, sizeof(execbuf));
> > +    memset(&obj, 0, sizeof(obj));
> > +    memset(&reloc, 0, sizeof(reloc));
> > +
> > +    execbuf.buffers_ptr = to_user_pointer(obj);
> > +
> > +    execbuf.buffer_count = 2;
> > +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
> > +
> > +    obj[0].handle = target;
> > +    obj[1].handle = gem_create(fd, 4096);
> > +
> > +    obj[1].relocation_count = 1;
> > +    obj[1].relocs_ptr = to_user_pointer(&reloc);
> > +
> > +    reloc.target_handle = obj[0].handle;
> > +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> > +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> > +    reloc.delta = offset * sizeof(uint32_t);
> > +
> > +    reloc.offset = i * sizeof(uint32_t);
> > +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> > +
> > +    __sync_synchronize();
> > +
> > +    if (handle) {
> > +        *handle = obj[1].handle;
> > +        return;
> 
> Is this path used by the callers?
> 
> > +    }
> > +
> > +    gem_sync(fd, obj[1].handle);
> > +    execbuf.rsvd1 = ctx_id;
> > +    execbuf.rsvd2 = -1;
> > +
> > +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> > +    igt_spin_batch_set_timeout(spin, timeout);
> > +    igt_assert(gem_bo_busy(fd, obj[0].handle));
> > +
> > +    gem_execbuf_wr(fd, &execbuf);
> > +    igt_spin_batch_free(fd, spin);
> > +
> > +    fence[fence_index] = execbuf.rsvd2 >> 32;
> > +
> > +    gem_close(fd, obj[1].handle);
> > +    gem_quiescent_gpu(fd);
> > +}
> > +
> > +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> > +{
> > +	uint32_t ctx = gem_context_create(fd);
> > +
> > +	switch (ctx_prio) {
> > +	case HIGH:
> > +		gem_context_set_priority(fd, ctx, MAX_PRIO);
> > +		igt_info("Setting MAX priority %d\n", ctx_prio);
> > +		break;
> > +	case LOW:
> > +		gem_context_set_priority(fd, ctx, MIN_PRIO);
> > +		igt_info("Setting MIN priority %d\n", ctx_prio);
> > +		break;
> > +	default:
> > +		igt_info("Ignoring context priority %d\n", ctx_prio);
> > +		break;
> > +	}
> > +
> > +	return ctx;
> 
> Why do you need this helper which translates an integer to an
> integer? 
> Couldn't you just pass in the real priority to start with?
> 
> > +}
> > +
> > +static int gem_reset_stats(int fd, int ctx_id,
> > +			   struct local_drm_i915_reset_stats *rs)
> > +{
> > +	memset(rs, 0, sizeof(*rs));
> > +	rs->ctx_id = ctx_id;
> > +	rs->reset_count = -1;
> > +
> > +	if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
> > +		printf("error:%d\n",-errno);
> > +		return -errno;
> 
> Make it assert success here since all callers do that anyway and it 
> matches the convention for gem prefixed ioctl helpers?
> 
> > +	}
> > +	igt_assert(rs->reset_count != -1);
> > +	return 0;
> > +}
> > +
> > +static unsigned index_to_engine_id(int fd, int index)
> > +{
> > +	unsigned engine_id = I915_EXEC_RENDER;
> > +
> > +	switch (index) {
> > +	case 1:
> > +		engine_id = I915_EXEC_RENDER;
> > +		break;
> > +	case 2:
> > +		if (gem_has_bsd2(fd))
> > +			engine_id = I915_EXEC_BSD |
> > I915_EXEC_BSD_RING1;
> > +		else
> > +			engine_id = I915_EXEC_BSD;
> > +		break;
> > +	case 3:
> > +		if (gem_has_bsd2(fd))
> > +			engine_id = I915_EXEC_BSD |
> > I915_EXEC_BSD_RING2;
> > +		else
> > +			engine_id = I915_EXEC_BSD;
> > +		break;
> > +	case 4:
> > +		engine_id = I915_EXEC_VEBOX;
> > +		break;
> > +	default:
> > +		break;
> > +	}
> 
> I hope we can avoid this maintenance burden, will see later in patch
> how.
> 
> > +
> > +	return engine_id;
> > +}
> > +
> > +static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t
> > ctx_id,  unsigned flags)
> > +{
> > +	igt_hang_t hang;
> > +	hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
> > +	gem_sync(fd, hang.spin->handle);
> > +}
> > +
> > +static void gpu_watchdog_long_batch_2_contexts(int fd, int
> > nengine, int prio_ctx1, int reset_ctx1, int prio_ctx2, int
> > reset_ctx2)
> > +{
> > +	uint32_t ctx[48];
> > +	uint32_t scratch[48];
> > +	unsigned flags = HANG_ALLOW_CAPTURE;
> > +	const uint64_t batch_timeout_ms = timeout_100ms * 3;
> 
> ms suffix usually means the unit stored is ms. While here it seems to
> be 
> something else.
> 
> const unsigned long batch_timeout_ms = 300;
> const unsigned long batch_timeout_us = 300e3;
> const unsigned long batch_timeout_ns = 300e6;
> 
> Not sure which one you want. Probably use ms and scale up at the use
> site.
> 
> > +	const struct intel_execution_engine2 *e;
> > +	const struct intel_execution_engine2 *e2;
> > +	int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
> > +	int *fence = 0, active_count = 0;
> > +	struct local_drm_i915_reset_stats rs;
> > +
> > +	igt_require(nengine);
> > +
> > +	fence = (int *)malloc(sizeof(int) * 48);
> > +
> > +	if (!fence) {
> > +		igt_info("Out of memory\n");
> > +		exit(1);
> > +	}
> 
> Replace with igt_assert(fence).
> 
> > +
> > +	/* Intializes random number generator */
> > +	srand((unsigned) time(NULL));
> > +
> > +	/* The outer looper is ctx1 and the inner looper is ctx2
> > +	 * Both contexts can set their priority, create work, and
> > cancel themselves
> > +	 */
> > +	for_each_engine_class_instance(fd, e) {
> > +		if ( strcmp(e->name, "bcs0") == 0 )
> > +			continue;
> 
> Use -ENODEV or -EINVAL from context_set_watchdog to skip.
> 
> > +
> > +		scratch[ctx_cnt] = gem_create(fd, 4096);
> > +		index = rand() % 4 + 1;
> > +		engine_id = index_to_engine_id(fd, index);
> > +
> > +		/* get current batch_active count */
> > +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +		active_count = rs.batch_active;
> > +		igt_assert_eq(rs.batch_active, active_count);
> > +
> > +		ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
> > +		batch_buffer_factory(fd, ctx[ctx_cnt], engine_id,
> > scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
> > +		context_set_watchdog(fd, engine_id, ctx[ctx_cnt],
> > WATCHDOG_THRESHOLD);
> > +		sleep(1);
> 
> What's the sleep for? If unavoidable at least needs a comment.
> 
> > +
> > +		if (reset_ctx1) {
> > +			clear_error_state(fd);
> > +			inject_hang(fd, engine_id, ctx[ctx_cnt],
> > flags);
> > +			active_count++;
> > +		}
> > +
> > +		/* check batch_active count once again after the hang
> > */
> > +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +		igt_assert_eq(rs.batch_active, active_count);
> > +
> > +		ctx_cnt++;
> > +		i = 1;
> > +
> > +		for_each_engine_class_instance(fd, e2) {
> > +			if ( strcmp(e->name, "bcs0") == 0 )
> 
> e2, but remove it altogether as said above.
> 
> I don't get what is the purpose of two
> for_each_engine_class_instance 
> loops when neither e or e2 appear to be used?
> 
> Since either I am missing something critical or the test really is
> very 
> confused, until clarified, I am skipping a bit lower down.
> 
> > +				continue;
> > +
> > +			scratch[ctx_cnt] = gem_create(fd, 4096);
> > +			engine_id = index_to_engine_id(fd, i);
> > +
> > +			/* get current batch_active count */
> > +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +			active_count = rs.batch_active;
> > +			igt_assert_eq(rs.batch_active, active_count);
> > +
> > +			ctx[ctx_cnt] = create_ctx_with_priority(fd,
> > prio_ctx2);
> > +			batch_buffer_factory(fd, ctx[ctx_cnt],
> > engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence,
> > ctx_cnt);
> > +			context_set_watchdog(fd, engine_id,
> > ctx[ctx_cnt], WATCHDOG_THRESHOLD);
> > +
> > +			if (reset_ctx2) {
> > +				clear_error_state(fd);
> > +				inject_hang(fd, engine_id,
> > ctx[ctx_cnt], flags);
> > +				active_count++;
> > +			}
> > +
> > +			/* check batch_active count once again after
> > the hang */
> > +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +			igt_assert_eq(rs.batch_active, active_count);
> > +
> > +			ctx_cnt++;
> > +			i++;
> > +		}
> > +	}
> > +
> > +	for (i = 0; i < ctx_cnt; i++) {
> > +		close(fence[i]);
> > +		gem_context_destroy(fd, ctx[i]);
> > +		gem_close(fd, scratch[i]);
> > +	}
> > +}
> > +
> > +static void gpu_watchodg_hang_long_batch_single_engine(int fd,
> > const char *name)
> > +{
> > +	uint32_t ctx[16];
> > +	uint32_t scratch[16];
> > +	int *fence;
> > +	const struct intel_execution_engine2 *e;
> > +	unsigned nengine = 0;
> > +	unsigned engine_id = 0;
> > +	int active_count = 0;
> > +	struct local_drm_i915_reset_stats rs;
> > +	int i;
> > +	unsigned flags = HANG_ALLOW_CAPTURE;
> > +	const uint64_t batch_timeout_ms = timeout_100ms * 4;
> > +
> > +	fence = (int *)malloc(sizeof(int)*16);
> > +
> > +	if (!fence) {
> > +		igt_info("Out of memory\n");
> > +		exit(1);
> > +	}
> > +
> > +	for_each_engine_class_instance(fd, e) {
> > +		if ( strcmp(e->name, "bcs0") == 0 )
> > +			continue;
> > +
> > +		engine_id = e2ring(fd, e);
> > +
> > +		scratch[nengine] = gem_create(fd, 4096);
> > +		ctx[nengine] = create_ctx_with_priority(fd, -1);
> > +
> > +		/* get current batch_active count */
> > +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +		active_count = rs.batch_active;
> > +		igt_assert_eq(rs.batch_active, active_count);
> > +
> > +		/* create some work on this engine using the above
> > ctx*/
> > +		batch_buffer_factory(fd, ctx[nengine], engine_id,
> > scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
> > +
> > +		/* set the gpu watchdog timeout */
> > +		context_set_watchdog(fd, engine_id, ctx[nengine],
> > WATCHDOG_THRESHOLD);
> > +		clear_error_state(fd);
> > +
> > +		/*
> > +		 * give enough time for the batch buffer to run
> > +		 */
> > +		nanosleep(&(struct timespec){ .tv_sec = 0,
> > +					      .tv_nsec =
> > batch_timeout_ms * 2 }, NULL);
> > +
> > +		/* create some work on this engine using the above
> > ctx*/
> > +		batch_buffer_factory(fd, ctx[nengine], engine_id,
> > scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
> > +		context_set_watchdog(fd, engine_id, ctx[nengine],
> > WATCHDOG_THRESHOLD);
> > +		clear_error_state(fd);
> > +
> > +		/* cancel only the batch requested */
> > +		if ( strcmp(e->name, name) == 0 ) {
> > +			inject_hang(fd, engine_id, ctx[nengine],
> > flags);
> > +			active_count++;
> > +		}
> > +
> > +		/* check batch_active count once again after the hang
> > */
> > +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
> > +		active_count = rs.batch_active;
> > +		igt_assert_eq(rs.batch_active, active_count);
> > +
> > +		nengine++;
> > +	}
> > +
> > +	for (i = 0; i < nengine; i++) {
> > +		close(fence[i]);
> > +		gem_context_destroy(fd, ctx[i]);
> > +		gem_close(fd, scratch[i]);
> > +	}
> > +}
> > +
> > +igt_main
> > +{
> > +	int fd;
> > +	unsigned int nengine = 0;
> > +	unsigned int engine;
> > +	const struct intel_execution_engine2 *e;
> > +
> > +	igt_skip_on_simulation();
> > +
> > +	igt_fixture {
> > +		fd = drm_open_driver(DRIVER_INTEL);
> > +		igt_require_gem(fd);
> > +
> > +		for_each_physical_engine(fd, engine)
> > +			nengine++;
> > +		igt_require(nengine);
> 
> Do we have GPUs with no engines?
> 
> > +	}
> > +
> > +	igt_subtest_group {
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-
> > reset-ctx2-high-prio-execute") {
> > +			int prio_ctx1 = LOW;
> > +			int prio_ctx2 = HIGH;
> > +			int reset_ctx1 = TRUE;
> > +			int reset_ctx2 = FALSE;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> > +		}
> 
> This is kind of verbose. I recommend our usual pattern along the
> lines of:
> 
> struct {
> 	char *name;
> 	int prio[2];
> 	bool reset[2];
> } tests[] = {
> 	{ "change me", {0, 0}, {false, false} },
> 	{ "change me", {-1, 0}, {true, false} },
> 	...
> };
> 
> for (i = 0; i < ARRAY_SIZE(tests); i++) {
> 	igt_subtest_f("long-batch-%s", tests[i]->name)
> 		long_batch(fd, tests[i].prio[0], tests[i].ctx[0], ...);
> }
> 
> So you run all subtests from a single loop.
> 
> Regards,
> 
> Tvrtko
> 
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-
> > reset-ctx2-low-prio-execute") {
> > +			int prio_ctx1 = HIGH;
> > +			int prio_ctx2 = LOW;
> > +			int reset_ctx1 = TRUE;
> > +			int reset_ctx2 = FALSE;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> > +		}
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-
> > execute") {
> > +			int prio_ctx1 = -1;
> > +			int prio_ctx2 = -1;
> > +			int reset_ctx1 = TRUE;
> > +			int reset_ctx2 = FALSE;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> > +		}
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-
> > ctx2-reset") {
> > +			int prio_ctx1 = -1;
> > +			int prio_ctx2 = -1;
> > +			int reset_ctx1 = FALSE;
> > +			int reset_ctx2 = TRUE;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> > +		}
> > +
> > +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-
> > ctx2-execute") {
> > +			int prio_ctx1 = -1;
> > +			int prio_ctx2 = -1;
> > +			int reset_ctx1 = FALSE;
> > +			int reset_ctx2 = FALSE;
> > +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
> > prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
> > +		}
> > +
> > +	       __for_each_engine_class_instance(e) {
> > +			igt_subtest_group {
> > +				igt_fixture {
> > +				gem_require_engine(fd,
> > +					e->class,
> > +					e->instance);
> > +				}
> > +			/* no support for gpu watchdog on bcs0 */
> > +			if (strcmp(e->name, "bcs0") == 0)
> > +				continue;
> 
> Software implementation can support it. So don't skip here but skip
> in 
> the subtest if you get -ENODEV when you try to configure it.
> 
> > +
> > +		            igt_subtest_f("gpu-watchdog-long-batch-
> > single-engine-reset-%s", e->name)
> 
> Don't prefix all tests with gpu-watchdog, it is very verbose to read
> and 
> the test itself is called gem_watchdog already.
> 
> > +		                gpu_watchodg_hang_long_batch_single_eng
> > ine(fd, e->name);
> > +			}
> > +		}
> > +	}
> > +
> > +    igt_fixture {
> > +	close(fd);
> > +    }
> > +}
> > diff --git a/tests/meson.build b/tests/meson.build
> > index 5167a6c..b281b75 100644
> > --- a/tests/meson.build
> > +++ b/tests/meson.build
> > @@ -210,6 +210,7 @@ i915_progs = [
> >   	'gem_unref_active_buffers',
> >   	'gem_userptr_blits',
> >   	'gem_wait',
> > +        'gem_watchdog',
> >   	'gem_workarounds',
> >   	'gem_write_read_ring_switch',
> >   	'i915_fb_tiling',
> > 

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog
  2019-04-24  2:31   ` Carlos Santa
@ 2019-04-24  8:21     ` Tvrtko Ursulin
  0 siblings, 0 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2019-04-24  8:21 UTC (permalink / raw)
  To: Carlos Santa, igt-dev; +Cc: Ursulin Tvrtko


On 24/04/2019 03:31, Carlos Santa wrote:
> On Tue, 2019-04-23 at 17:25 +0100, Tvrtko Ursulin wrote:
>> Hi,
>>
>> How is the test doing against the code in progress? Any issues
>> discovered etc?
> 
> Test					Result
> 
> single-engine-reset-rcs0		PASS
> single-engine-reset-vcs0		PASS
> single-engine-reset-vcs1		PASS
> single-engine-reset-vecs0		PASS
> ctx1-execute-ctx2-execute		PASS
> ctx1-execute-ctx2-reset			PASS
> ctx1-reset-ctx2-reset			PASS
> ctx2-high-prio-reset-ctx2-low-execute	PASS
> ctx2-low-prio-reset-ctx2-high-execute   FAIL
> 
> There seems to be a problem with preemption as expected, the test
> simply gets stuck, and the default reset then kicks in. I also removed
> the dummy sleep(1) as you suggested, this still works as the output

I only asked why the sleep is there and to document it. If it wasn't 
needed even better.

> above.

Is this with the hardware watchdog implementation?

> About the tests
> 
> 1. The "single-engine" tests basically resets one engine, I use
> the "for_each_engine_class_instance" looper to simply iterate over the
> different engines. Given the option to specify which engine to reset is
> not a hard requirement, I could simply loop through all of them at
> once.
> 
> Also, on this test I took Antonio's suggestion where:
> 
>          - Submit work from context A
>          - Set a non 0 threshold on context A
>          - Submit more work on context A
> 
> And expect the work before you set the threshold is not reset and what
> was sent after is.
> 
> Then I assert if the "batch_active" doesn't match my own counter after
> each reset:
> 
> 	igt_assert_eq(rs.batch_active, active_count);

What happened to testing the output fence status?

> 
> 2. About the "long_batch_2_contexts" test case. The idea here is that
> the outer looper is "ctx1" and the inner looper is "ctx2". The outer
> loop randomnly generates engine id and the inner loop simply increments
> engines 1 through 4, then based on the input parameters we can let it
> execute, cancel, and set the priority.
> 
> For example if the test says "ctx1-execute-ctx2-cancel" then this means
> the outer loop set the watchdog and executes while the inner loop sets
> the watchdog and cancels the execution, but since I am looping through
> all of the engines, then it will do that for all of them:

How are you looping through engines when neither of the iterators was 
used?

Anyways, do you have that reply where I outlined some ideas for things 
to test? Where do you think the current version stands against that?

Will you work on tidying the code a bit so it is easier to review the 
high level?

Regards,

Tvrtko

> 
> engine1 - executes
> 	engine1 - cancel
> 	engine2 - cancel
> 	engine3 - cancel
> 	engine4 - cancel
> engine4 - executes
> 	engine1 - cancel
> 	engine2 - cancel
> 	...
> 
> v2 of the series was hardcoding the engines to say vecs0 and rcs0 but
> Antonio suggested that we could iterate through the different engines.
> 
> Regards,
> Carlos
> 
> 
>>
>> In general it looks a bit untidy. Coding style is off, alignment is
>> random etc.. so a bit difficult to read. But I'll make a pass and see
>> if
>> I can make sense of it. I won't leave comments on coding style and
>> formatting since there are too many. But please do fix that before
>> re-posting.
>>
>> On 23/04/2019 01:05, Carlos Santa wrote:
>>> This test adds basic set of tests to reset the different
>>> GPU engines through the gpu watchdog timer.
>>>
>>> Credits to Antonio for the original codebase this is based on.
>>>
>>> v2: remove gem_context_get_param() during set (Antonio)
>>>       remove clearing of the engines_threshold[] in the default case
>>>       inside context_set_watchdog(). (Antonio)
>>>       fix indexing when creating low/high priority contexts
>>>       get rid of 2 threads idea (Antonio)
>>>       fix context prio bug due to wrong indexing (Antonio)
>>> v3: no need to force clear drm_i915_gem_watchdog_timeout struct
>>>       when setting the watchdog timeouts (Antonio)
>>>       -use gem_context_set_priority() instead to use error checking
>>>       -always use a looper to test all engines (Antonio)
>>>       -modify test gpu_watchodg_hang_long_batch_single_engine()
>>>       to use a couple of batches and set the timeout in between them
>>> (Antonio)
>>>       -make use of for_each_engine_class_instance() (Antonio)
>>>
>>> Cc: Ursulin Tvrtko <tvrtko.ursulin@intel.com>
>>> Cc: Antonio Argenziano <antonio.argenziano@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Carlos Santa <carlos.santa@intel.com>
>>> ---
>>>    tests/Makefile.sources    |   3 +
>>>    tests/i915/gem_watchdog.c | 517
>>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>    tests/meson.build         |   1 +
>>>    3 files changed, 521 insertions(+)
>>>    create mode 100644 tests/i915/gem_watchdog.c
>>>
>>> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
>>> index 214698d..7f17f20 100644
>>> --- a/tests/Makefile.sources
>>> +++ b/tests/Makefile.sources
>>> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES =
>>> i915/gem_userptr_blits.c
>>>    TESTS_progs += gem_wait
>>>    gem_wait_SOURCES = i915/gem_wait.c
>>>    
>>> +TESTS_progs += gem_watchdog
>>> +gem_watchdog_SOURCES = i915/gem_watchdog.c
>>> +
>>>    TESTS_progs += gem_workarounds
>>>    gem_workarounds_SOURCES = i915/gem_workarounds.c
>>>    
>>> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
>>> new file mode 100644
>>> index 0000000..8f789ab
>>> --- /dev/null
>>> +++ b/tests/i915/gem_watchdog.c
>>> @@ -0,0 +1,517 @@
>>> +/*
>>> + * Copyright © 2016 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to
>>> whom the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice
>>> (including the next
>>> + * paragraph) shall be included in all copies or substantial
>>> portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>>> EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
>>> DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + */
>>> +#include "igt.h"
>>> +#include "igt_sysfs.h"
>>> +#include "sw_sync.h"
>>> +
>>> +#include <pthread.h>
>>> +#include <fcntl.h>
>>> +
>>> +#include <sys/ioctl.h>
>>> +#include <sys/poll.h>
>>> +#include <sys/signal.h>
>>> +#include "i915/gem_ring.h"
>>> +
>>> +#define LOCAL_I915_EXEC_BSD_SHIFT	(13)
>>> +#define LOCAL_I915_EXEC_BSD_RING1 	(1 <<
>>> LOCAL_I915_EXEC_BSD_SHIFT)
>>> +#define LOCAL_I915_EXEC_BSD_RING2 	(2 <<
>>> LOCAL_I915_EXEC_BSD_SHIFT)
>>
>> These are in old so no need for local definitions.
>>
>>> +
>>> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
>>> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
>>> +#define HIGH 1
>>> +#define LOW 0
>>> +#define TRUE 1
>>> +#define FALSE 0
>>
>> true and false are fair game in igt.
>>
>>> +#define WATCHDOG_THRESHOLD (100)
>>> +#define MAX_ENGINES 5
>>> +#define RENDER_CLASS 0
>>> +#define VIDEO_DECODE_CLASS 1
>>> +#define VIDEO_ENHANCEMENT_CLASS 2
>>> +#define COPY_ENGINE_CLASS 3
>>
>> Classes are in i915_drm.h as well.
>>
>>> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
>>> +
>>> +#define RS_NO_ERROR      0
>>> +#define RS_BATCH_ACTIVE  (1 << 0)
>>> +#define RS_BATCH_PENDING (1 << 1)
>>> +#define RS_UNKNOWN       (1 << 2)
>>
>> What are these? Look unused in code.
>>
>>> +
>>> +#define GET_RESET_STATS_IOCTL DRM_IOWR(DRM_COMMAND_BASE + 0x32,
>>> struct local_drm_i915_reset_stats)
>>> +struct local_drm_i915_reset_stats {
>>> +	__u32 ctx_id;
>>> +	__u32 flags;
>>> +	__u32 reset_count;
>>> +	__u32 batch_active;
>>> +	__u32 batch_pending;
>>> +	__u32 pad;
>>> +};
>>> +
>>> +const uint64_t timeout_100ms = 100000000LL;
>>> +int num;
>>> +
>>> +struct drm_i915_gem_watchdog_timeout {
>>> +	union {
>>> +		struct {
>>> +			/*
>>> +			 * Engine class & instance to be configured or
>>> queried.
>>> +			 */
>>> +			__u16 engine_class;
>>> +			__u16 engine_instance;
>>> +		};
>>> +		/* Index based addressing mode */
>>> +		__u32 index;
>>> +	};
>>> +	/* GPU Engine watchdog resets timeout in us */
>>> +	__u32 timeout_us;
>>> +};
>>> +
>>> +static void clear_error_state(int fd)
>>> +{
>>> +	int dir;
>>> +
>>> +	dir = igt_sysfs_open(fd);
>>> +
>>> +	if (dir < 0)
>>> +		return;
>>
>> igt_require(dir >= 0) if test relies on this being present?
>>
>>> +
>>> +	/* Any write to the error state clears it */
>>> +	igt_sysfs_set(dir, "error", "");
>>> +	close(dir);
>>> +}
>>> +
>>> +static unsigned int e2ring(int fd, const struct
>>> intel_execution_engine2 *e)
>>> +{
>>> +    return gem_class_instance_to_eb_flags(fd, e->class, e-
>>>> instance);
>>> +}
>>> +
>>> +static void context_set_watchdog(int fd, unsigned engine_id,
>>> +                                 unsigned ctx_id, unsigned
>>> threshold)
>>> +{
>>> +	struct drm_i915_gem_watchdog_timeout
>>> engines_threshold[MAX_ENGINES];
>>> +	struct drm_i915_gem_context_param arg = {
>>> +		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
>>> +		.ctx_id = ctx_id,
>>> +		.size = sizeof(engines_threshold),
>>> +		.value = (uint64_t)&engines_threshold
>>> +	};
>>> +
>>> +	memset(&engines_threshold, 0, sizeof(engines_threshold));
>>> +
>>> +	switch (engine_id & I915_EXEC_RING_MASK) {
>>> +	case I915_EXEC_RENDER:
>>> +		engines_threshold[RENDER_CLASS].timeout_us = threshold;
>>
>> Looks like uAPI is in some interim state? I was expecting to see
>> here
>> something like:
>>
>> 	engines_threshold.engine_class = RENDER_CLASS;
>> 	engines_threshold.engine_instance = 0;
>> 	engines_threshold.timeout_us = threshold;
>>
>> But you seem to be passing an array of structs which carry
>> class/instance data, don't use it but instead infer the class from
>> the
>> position in the array.
>>
>>> +		break;
>>> +	case I915_EXEC_BSD:
>>> +		engines_threshold[VIDEO_DECODE_CLASS].timeout_us =
>>> threshold;
>>> +		break;
>>> +	case I915_EXEC_VEBOX:
>>> +		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us =
>>> threshold;
>>> +		break;
>>> +	default:
>>> +		break;
>>> +	}
>>> +
>>> +	gem_context_set_param(fd, &arg);
>>> +}
>>> +
>>> +static void batch_buffer_factory(uint32_t fd, uint32_t ctx_id,
>>> unsigned exec_id, uint32_t target, uint32_t offset, uint32_t
>>> *handle, uint64_t timeout, int *fence, int fence_index)
>>> +{
>>> +    struct drm_i915_gem_exec_object2 obj[2];
>>> +    struct drm_i915_gem_relocation_entry reloc;
>>> +    struct drm_i915_gem_execbuffer2 execbuf;
>>> +    igt_spin_t *spin = NULL;
>>> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
>>> +    int i = 0;
>>> +
>>> +    gem_quiescent_gpu(fd);
>>> +
>>> +    memset(&execbuf, 0, sizeof(execbuf));
>>> +    memset(&obj, 0, sizeof(obj));
>>> +    memset(&reloc, 0, sizeof(reloc));
>>> +
>>> +    execbuf.buffers_ptr = to_user_pointer(obj);
>>> +
>>> +    execbuf.buffer_count = 2;
>>> +    execbuf.flags = exec_id | I915_EXEC_FENCE_OUT ;
>>> +
>>> +    obj[0].handle = target;
>>> +    obj[1].handle = gem_create(fd, 4096);
>>> +
>>> +    obj[1].relocation_count = 1;
>>> +    obj[1].relocs_ptr = to_user_pointer(&reloc);
>>> +
>>> +    reloc.target_handle = obj[0].handle;
>>> +    reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
>>> +    reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
>>> +    reloc.delta = offset * sizeof(uint32_t);
>>> +
>>> +    reloc.offset = i * sizeof(uint32_t);
>>> +    gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
>>> +
>>> +    __sync_synchronize();
>>> +
>>> +    if (handle) {
>>> +        *handle = obj[1].handle;
>>> +        return;
>>
>> Is this path used by the callers?
>>
>>> +    }
>>> +
>>> +    gem_sync(fd, obj[1].handle);
>>> +    execbuf.rsvd1 = ctx_id;
>>> +    execbuf.rsvd2 = -1;
>>> +
>>> +    spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
>>> +    igt_spin_batch_set_timeout(spin, timeout);
>>> +    igt_assert(gem_bo_busy(fd, obj[0].handle));
>>> +
>>> +    gem_execbuf_wr(fd, &execbuf);
>>> +    igt_spin_batch_free(fd, spin);
>>> +
>>> +    fence[fence_index] = execbuf.rsvd2 >> 32;
>>> +
>>> +    gem_close(fd, obj[1].handle);
>>> +    gem_quiescent_gpu(fd);
>>> +}
>>> +
>>> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
>>> +{
>>> +	uint32_t ctx = gem_context_create(fd);
>>> +
>>> +	switch (ctx_prio) {
>>> +	case HIGH:
>>> +		gem_context_set_priority(fd, ctx, MAX_PRIO);
>>> +		igt_info("Setting MAX priority %d\n", ctx_prio);
>>> +		break;
>>> +	case LOW:
>>> +		gem_context_set_priority(fd, ctx, MIN_PRIO);
>>> +		igt_info("Setting MIN priority %d\n", ctx_prio);
>>> +		break;
>>> +	default:
>>> +		igt_info("Ignoring context priority %d\n", ctx_prio);
>>> +		break;
>>> +	}
>>> +
>>> +	return ctx;
>>
>> Why do you need this helper which translates an integer to an
>> integer?
>> Couldn't you just pass in the real priority to start with?
>>
>>> +}
>>> +
>>> +static int gem_reset_stats(int fd, int ctx_id,
>>> +			   struct local_drm_i915_reset_stats *rs)
>>> +{
>>> +	memset(rs, 0, sizeof(*rs));
>>> +	rs->ctx_id = ctx_id;
>>> +	rs->reset_count = -1;
>>> +
>>> +	if (drmIoctl(fd, GET_RESET_STATS_IOCTL, rs)) {
>>> +		printf("error:%d\n",-errno);
>>> +		return -errno;
>>
>> Make it assert success here since all callers do that anyway and it
>> matches the convention for gem prefixed ioctl helpers?
>>
>>> +	}
>>> +	igt_assert(rs->reset_count != -1);
>>> +	return 0;
>>> +}
>>> +
>>> +static unsigned index_to_engine_id(int fd, int index)
>>> +{
>>> +	unsigned engine_id = I915_EXEC_RENDER;
>>> +
>>> +	switch (index) {
>>> +	case 1:
>>> +		engine_id = I915_EXEC_RENDER;
>>> +		break;
>>> +	case 2:
>>> +		if (gem_has_bsd2(fd))
>>> +			engine_id = I915_EXEC_BSD |
>>> I915_EXEC_BSD_RING1;
>>> +		else
>>> +			engine_id = I915_EXEC_BSD;
>>> +		break;
>>> +	case 3:
>>> +		if (gem_has_bsd2(fd))
>>> +			engine_id = I915_EXEC_BSD |
>>> I915_EXEC_BSD_RING2;
>>> +		else
>>> +			engine_id = I915_EXEC_BSD;
>>> +		break;
>>> +	case 4:
>>> +		engine_id = I915_EXEC_VEBOX;
>>> +		break;
>>> +	default:
>>> +		break;
>>> +	}
>>
>> I hope we can avoid this maintenance burden, will see later in patch
>> how.
>>
>>> +
>>> +	return engine_id;
>>> +}
>>> +
>>> +static void inject_hang(uint32_t fd, unsigned engine_id, uint32_t
>>> ctx_id,  unsigned flags)
>>> +{
>>> +	igt_hang_t hang;
>>> +	hang = igt_hang_ctx(fd, ctx_id, engine_id, flags);
>>> +	gem_sync(fd, hang.spin->handle);
>>> +}
>>> +
>>> +static void gpu_watchdog_long_batch_2_contexts(int fd, int
>>> nengine, int prio_ctx1, int reset_ctx1, int prio_ctx2, int
>>> reset_ctx2)
>>> +{
>>> +	uint32_t ctx[48];
>>> +	uint32_t scratch[48];
>>> +	unsigned flags = HANG_ALLOW_CAPTURE;
>>> +	const uint64_t batch_timeout_ms = timeout_100ms * 3;
>>
>> ms suffix usually means the unit stored is ms. While here it seems to
>> be
>> something else.
>>
>> const unsigned long batch_timeout_ms = 300;
>> const unsigned long batch_timeout_us = 300e3;
>> const unsigned long batch_timeout_ns = 300e6;
>>
>> Not sure which one you want. Probably use ms and scale up at the use
>> site.
>>
>>> +	const struct intel_execution_engine2 *e;
>>> +	const struct intel_execution_engine2 *e2;
>>> +	int i = 0, index = 0, engine_id = 0, ctx_cnt = 0;
>>> +	int *fence = 0, active_count = 0;
>>> +	struct local_drm_i915_reset_stats rs;
>>> +
>>> +	igt_require(nengine);
>>> +
>>> +	fence = (int *)malloc(sizeof(int) * 48);
>>> +
>>> +	if (!fence) {
>>> +		igt_info("Out of memory\n");
>>> +		exit(1);
>>> +	}
>>
>> Replace with igt_assert(fence).
>>
>>> +
>>> +	/* Intializes random number generator */
>>> +	srand((unsigned) time(NULL));
>>> +
>>> +	/* The outer looper is ctx1 and the inner looper is ctx2
>>> +	 * Both contexts can set their priority, create work, and
>>> cancel themselves
>>> +	 */
>>> +	for_each_engine_class_instance(fd, e) {
>>> +		if ( strcmp(e->name, "bcs0") == 0 )
>>> +			continue;
>>
>> Use -ENODEV or -EINVAL from context_set_watchdog to skip.
>>
>>> +
>>> +		scratch[ctx_cnt] = gem_create(fd, 4096);
>>> +		index = rand() % 4 + 1;
>>> +		engine_id = index_to_engine_id(fd, index);
>>> +
>>> +		/* get current batch_active count */
>>> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +		active_count = rs.batch_active;
>>> +		igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +		ctx[ctx_cnt] = create_ctx_with_priority(fd, prio_ctx1);
>>> +		batch_buffer_factory(fd, ctx[ctx_cnt], engine_id,
>>> scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence, ctx_cnt);
>>> +		context_set_watchdog(fd, engine_id, ctx[ctx_cnt],
>>> WATCHDOG_THRESHOLD);
>>> +		sleep(1);
>>
>> What's the sleep for? If unavoidable at least needs a comment.
>>
>>> +
>>> +		if (reset_ctx1) {
>>> +			clear_error_state(fd);
>>> +			inject_hang(fd, engine_id, ctx[ctx_cnt],
>>> flags);
>>> +			active_count++;
>>> +		}
>>> +
>>> +		/* check batch_active count once again after the hang
>>> */
>>> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +		igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +		ctx_cnt++;
>>> +		i = 1;
>>> +
>>> +		for_each_engine_class_instance(fd, e2) {
>>> +			if ( strcmp(e->name, "bcs0") == 0 )
>>
>> e2, but remove it altogether as said above.
>>
>> I don't get what is the purpose of two
>> for_each_engine_class_instance
>> loops when neither e or e2 appear to be used?
>>
>> Since either I am missing something critical or the test really is
>> very
>> confused, until clarified, I am skipping a bit lower down.
>>
>>> +				continue;
>>> +
>>> +			scratch[ctx_cnt] = gem_create(fd, 4096);
>>> +			engine_id = index_to_engine_id(fd, i);
>>> +
>>> +			/* get current batch_active count */
>>> +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +			active_count = rs.batch_active;
>>> +			igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +			ctx[ctx_cnt] = create_ctx_with_priority(fd,
>>> prio_ctx2);
>>> +			batch_buffer_factory(fd, ctx[ctx_cnt],
>>> engine_id, scratch[ctx_cnt], 0, NULL, batch_timeout_ms, fence,
>>> ctx_cnt);
>>> +			context_set_watchdog(fd, engine_id,
>>> ctx[ctx_cnt], WATCHDOG_THRESHOLD);
>>> +
>>> +			if (reset_ctx2) {
>>> +				clear_error_state(fd);
>>> +				inject_hang(fd, engine_id,
>>> ctx[ctx_cnt], flags);
>>> +				active_count++;
>>> +			}
>>> +
>>> +			/* check batch_active count once again after
>>> the hang */
>>> +			igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +			igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +			ctx_cnt++;
>>> +			i++;
>>> +		}
>>> +	}
>>> +
>>> +	for (i = 0; i < ctx_cnt; i++) {
>>> +		close(fence[i]);
>>> +		gem_context_destroy(fd, ctx[i]);
>>> +		gem_close(fd, scratch[i]);
>>> +	}
>>> +}
>>> +
>>> +static void gpu_watchodg_hang_long_batch_single_engine(int fd,
>>> const char *name)
>>> +{
>>> +	uint32_t ctx[16];
>>> +	uint32_t scratch[16];
>>> +	int *fence;
>>> +	const struct intel_execution_engine2 *e;
>>> +	unsigned nengine = 0;
>>> +	unsigned engine_id = 0;
>>> +	int active_count = 0;
>>> +	struct local_drm_i915_reset_stats rs;
>>> +	int i;
>>> +	unsigned flags = HANG_ALLOW_CAPTURE;
>>> +	const uint64_t batch_timeout_ms = timeout_100ms * 4;
>>> +
>>> +	fence = (int *)malloc(sizeof(int)*16);
>>> +
>>> +	if (!fence) {
>>> +		igt_info("Out of memory\n");
>>> +		exit(1);
>>> +	}
>>> +
>>> +	for_each_engine_class_instance(fd, e) {
>>> +		if ( strcmp(e->name, "bcs0") == 0 )
>>> +			continue;
>>> +
>>> +		engine_id = e2ring(fd, e);
>>> +
>>> +		scratch[nengine] = gem_create(fd, 4096);
>>> +		ctx[nengine] = create_ctx_with_priority(fd, -1);
>>> +
>>> +		/* get current batch_active count */
>>> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +		active_count = rs.batch_active;
>>> +		igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +		/* create some work on this engine using the above
>>> ctx*/
>>> +		batch_buffer_factory(fd, ctx[nengine], engine_id,
>>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>>> +
>>> +		/* set the gpu watchdog timeout */
>>> +		context_set_watchdog(fd, engine_id, ctx[nengine],
>>> WATCHDOG_THRESHOLD);
>>> +		clear_error_state(fd);
>>> +
>>> +		/*
>>> +		 * give enough time for the batch buffer to run
>>> +		 */
>>> +		nanosleep(&(struct timespec){ .tv_sec = 0,
>>> +					      .tv_nsec =
>>> batch_timeout_ms * 2 }, NULL);
>>> +
>>> +		/* create some work on this engine using the above
>>> ctx*/
>>> +		batch_buffer_factory(fd, ctx[nengine], engine_id,
>>> scratch[nengine], 0, NULL, batch_timeout_ms, fence, nengine);
>>> +		context_set_watchdog(fd, engine_id, ctx[nengine],
>>> WATCHDOG_THRESHOLD);
>>> +		clear_error_state(fd);
>>> +
>>> +		/* cancel only the batch requested */
>>> +		if ( strcmp(e->name, name) == 0 ) {
>>> +			inject_hang(fd, engine_id, ctx[nengine],
>>> flags);
>>> +			active_count++;
>>> +		}
>>> +
>>> +		/* check batch_active count once again after the hang
>>> */
>>> +		igt_assert_eq(gem_reset_stats(fd, 0, &rs), 0);
>>> +		active_count = rs.batch_active;
>>> +		igt_assert_eq(rs.batch_active, active_count);
>>> +
>>> +		nengine++;
>>> +	}
>>> +
>>> +	for (i = 0; i < nengine; i++) {
>>> +		close(fence[i]);
>>> +		gem_context_destroy(fd, ctx[i]);
>>> +		gem_close(fd, scratch[i]);
>>> +	}
>>> +}
>>> +
>>> +igt_main
>>> +{
>>> +	int fd;
>>> +	unsigned int nengine = 0;
>>> +	unsigned int engine;
>>> +	const struct intel_execution_engine2 *e;
>>> +
>>> +	igt_skip_on_simulation();
>>> +
>>> +	igt_fixture {
>>> +		fd = drm_open_driver(DRIVER_INTEL);
>>> +		igt_require_gem(fd);
>>> +
>>> +		for_each_physical_engine(fd, engine)
>>> +			nengine++;
>>> +		igt_require(nengine);
>>
>> Do we have GPUs with no engines?
>>
>>> +	}
>>> +
>>> +	igt_subtest_group {
>>> +
>>> +		igt_subtest_f("gpu-watchdog-long-batch-low-prio-ctx1-
>>> reset-ctx2-high-prio-execute") {
>>> +			int prio_ctx1 = LOW;
>>> +			int prio_ctx2 = HIGH;
>>> +			int reset_ctx1 = TRUE;
>>> +			int reset_ctx2 = FALSE;
>>> +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
>>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>>> +		}
>>
>> This is kind of verbose. I recommend our usual pattern along the
>> lines of:
>>
>> struct {
>> 	char *name;
>> 	int prio[2];
>> 	bool reset[2];
>> } tests[] = {
>> 	{ "change me", {0, 0}, {false, false} },
>> 	{ "change me", {-1, 0}, {true, false} },
>> 	...
>> };
>>
>> for (i = 0; i < ARRAY_SIZE(tests); i++) {
>> 	igt_subtest_f("long-batch-%s", tests[i]->name)
>> 		long_batch(fd, tests[i].prio[0], tests[i].ctx[0], ...);
>> }
>>
>> So you run all subtests from a single loop.
>>
>> Regards,
>>
>> Tvrtko
>>
>>> +
>>> +		igt_subtest_f("gpu-watchdog-long-batch-high-prio-ctx1-
>>> reset-ctx2-low-prio-execute") {
>>> +			int prio_ctx1 = HIGH;
>>> +			int prio_ctx2 = LOW;
>>> +			int reset_ctx1 = TRUE;
>>> +			int reset_ctx2 = FALSE;
>>> +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
>>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>>> +		}
>>> +
>>> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-reset-ctx2-
>>> execute") {
>>> +			int prio_ctx1 = -1;
>>> +			int prio_ctx2 = -1;
>>> +			int reset_ctx1 = TRUE;
>>> +			int reset_ctx2 = FALSE;
>>> +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
>>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>>> +		}
>>> +
>>> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-
>>> ctx2-reset") {
>>> +			int prio_ctx1 = -1;
>>> +			int prio_ctx2 = -1;
>>> +			int reset_ctx1 = FALSE;
>>> +			int reset_ctx2 = TRUE;
>>> +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
>>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>>> +		}
>>> +
>>> +		igt_subtest_f("gpu-watchdog-long-batch-ctx1-execute-
>>> ctx2-execute") {
>>> +			int prio_ctx1 = -1;
>>> +			int prio_ctx2 = -1;
>>> +			int reset_ctx1 = FALSE;
>>> +			int reset_ctx2 = FALSE;
>>> +			gpu_watchdog_long_batch_2_contexts(fd, nengine,
>>> prio_ctx1, reset_ctx1, prio_ctx2, reset_ctx2);
>>> +		}
>>> +
>>> +	       __for_each_engine_class_instance(e) {
>>> +			igt_subtest_group {
>>> +				igt_fixture {
>>> +				gem_require_engine(fd,
>>> +					e->class,
>>> +					e->instance);
>>> +				}
>>> +			/* no support for gpu watchdog on bcs0 */
>>> +			if (strcmp(e->name, "bcs0") == 0)
>>> +				continue;
>>
>> Software implementation can support it. So don't skip here but skip
>> in
>> the subtest if you get -ENODEV when you try to configure it.
>>
>>> +
>>> +		            igt_subtest_f("gpu-watchdog-long-batch-
>>> single-engine-reset-%s", e->name)
>>
>> Don't prefix all tests with gpu-watchdog, it is very verbose to read
>> and
>> the test itself is called gem_watchdog already.
>>
>>> +		                gpu_watchodg_hang_long_batch_single_eng
>>> ine(fd, e->name);
>>> +			}
>>> +		}
>>> +	}
>>> +
>>> +    igt_fixture {
>>> +	close(fd);
>>> +    }
>>> +}
>>> diff --git a/tests/meson.build b/tests/meson.build
>>> index 5167a6c..b281b75 100644
>>> --- a/tests/meson.build
>>> +++ b/tests/meson.build
>>> @@ -210,6 +210,7 @@ i915_progs = [
>>>    	'gem_unref_active_buffers',
>>>    	'gem_userptr_blits',
>>>    	'gem_wait',
>>> +        'gem_watchdog',
>>>    	'gem_workarounds',
>>>    	'gem_write_read_ring_switch',
>>>    	'i915_fb_tiling',
>>>
> 
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-04-24  8:21 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-23  0:05 [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Carlos Santa
2019-04-23  0:24 ` [igt-dev] ✗ Fi.CI.BAT: failure for tests/gem_watchdog: Initial set of tests for GPU watchdog (rev5) Patchwork
2019-04-23  9:54 ` [igt-dev] [RFC v3] tests/gem_watchdog: Initial set of tests for GPU watchdog Petri Latvala
2019-04-23 16:25 ` Tvrtko Ursulin
2019-04-23 16:31   ` Tvrtko Ursulin
2019-04-24  2:31   ` Carlos Santa
2019-04-24  8:21     ` Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.