All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Intel-gfx@lists.freedesktop.org
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>,
	dri-devel@lists.freedesktop.org,
	Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Subject: [RFC 4/6] drm/i915: Allow userspace to configure the watchdog
Date: Fri, 12 Mar 2021 15:46:20 +0000	[thread overview]
Message-ID: <20210312154622.1767865-5-tvrtko.ursulin@linux.intel.com> (raw)
In-Reply-To: <20210312154622.1767865-1-tvrtko.ursulin@linux.intel.com>

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Idea here is to make the watchdog mechanism more useful than for just
default request/fence expiry.

To this effect a new context param I915_CONTEXT_PARAM_WATCHDOG is added
where the value fields allows passing in a timeout in micro-seconds.

This allows userspace to set a limit to how long they expect their batches
to take, or otherwise they will be cancelled, and userspace notified via
one of the available mechanisms.

Main attractiveness of adding uapi here is perhaps to extend the proposal
by passing in a structure instead of a single value, like for illustration
only:

struct drm_i915_gem_context_watchdog {
	__u64 flags;
 #define I915_CONTEXT_WATCHDOG_WALL_TIME	BIT(0)
 #define I915_CONTEXT_WATCHDOG_GPU_TIME		BIT(1)
 #define I915_CONTEXT_WATCHDOG_FROM_SUBMIT	BIT(2)
 #define I915_CONTEXT_WATCHDOG_FROM_RUNNABLE	BIT(3)
	__64 timeout_us;
};

Point being to prepare the uapi for different semantics from the start.
Given how not a single one makes complete sense for all use cases. And
also perhaps satisfy the long wanted media watchdog feature request.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 57 +++++++++++++++++++
 .../gpu/drm/i915/gem/i915_gem_context_types.h |  4 ++
 drivers/gpu/drm/i915/gt/intel_context_param.h | 11 +++-
 include/uapi/drm/i915_drm.h                   |  5 +-
 4 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index ca37d93ef5e7..32b05af4fc8f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -233,6 +233,8 @@ static void intel_context_set_gem(struct intel_context *ce,
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
 	    intel_engine_has_timeslices(ce->engine))
 		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
+
+	intel_context_set_watchdog_us(ce, ctx->watchdog.timeout_us);
 }
 
 static void __free_engines(struct i915_gem_engines *e, unsigned int count)
@@ -1397,6 +1399,28 @@ static int set_ringsize(struct i915_gem_context *ctx,
 				 __intel_context_ring_size(args->value));
 }
 
+static int __apply_watchdog(struct intel_context *ce, void *timeout_us)
+{
+	return intel_context_set_watchdog_us(ce, (uintptr_t)timeout_us);
+}
+
+static int set_watchdog(struct i915_gem_context *ctx,
+			struct drm_i915_gem_context_param *args)
+{
+	int ret;
+
+	if (args->size)
+		return -EINVAL;
+
+	ret = context_apply_all(ctx, __apply_watchdog,
+				(void *)(uintptr_t)args->value);
+
+	if (!ret)
+		ctx->watchdog.timeout_us = args->value;
+
+	return ret;
+}
+
 static int __get_ringsize(struct intel_context *ce, void *arg)
 {
 	long sz;
@@ -1426,6 +1450,17 @@ static int get_ringsize(struct i915_gem_context *ctx,
 	return 0;
 }
 
+static int get_watchdog(struct i915_gem_context *ctx,
+			struct drm_i915_gem_context_param *args)
+{
+	if (args->size)
+		return -EINVAL;
+
+	args->value = ctx->watchdog.timeout_us;
+
+	return 0;
+}
+
 int
 i915_gem_user_to_context_sseu(struct intel_gt *gt,
 			      const struct drm_i915_gem_context_param_sseu *user,
@@ -2075,6 +2110,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_ringsize(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = set_watchdog(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
@@ -2196,6 +2235,19 @@ static int clone_schedattr(struct i915_gem_context *dst,
 	return 0;
 }
 
+static int clone_watchdog(struct i915_gem_context *dst,
+			  struct i915_gem_context *src)
+{
+	int ret;
+
+	ret = context_apply_all(dst, __apply_watchdog,
+				(void *)(uintptr_t)src->watchdog.timeout_us);
+	if (!ret)
+		dst->watchdog = src->watchdog;
+
+	return ret;
+}
+
 static int clone_sseu(struct i915_gem_context *dst,
 		      struct i915_gem_context *src)
 {
@@ -2279,6 +2331,7 @@ static int create_clone(struct i915_user_extension __user *ext, void *data)
 		MAP(SSEU, clone_sseu),
 		MAP(TIMELINE, clone_timeline),
 		MAP(VM, clone_vm),
+		MAP(WATCHDOG, clone_watchdog),
 #undef MAP
 	};
 	struct drm_i915_gem_context_create_ext_clone local;
@@ -2532,6 +2585,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		ret = get_ringsize(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = get_watchdog(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index d5bc75508048..f17da7e26c43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -150,6 +150,10 @@ struct i915_gem_context {
 	 */
 	atomic_t active_count;
 
+	struct {
+		u64 timeout_us;
+	} watchdog;
+
 	/**
 	 * @hang_timestamp: The last time(s) this context caused a GPU hang
 	 */
diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h
index f053d8633fe2..3ecacc675f41 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_param.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_param.h
@@ -6,9 +6,18 @@
 #ifndef INTEL_CONTEXT_PARAM_H
 #define INTEL_CONTEXT_PARAM_H
 
-struct intel_context;
+#include <linux/types.h>
+
+#include "intel_context.h"
 
 int intel_context_set_ring_size(struct intel_context *ce, long sz);
 long intel_context_get_ring_size(struct intel_context *ce);
 
+static inline int
+intel_context_set_watchdog_us(struct intel_context *ce, u64 timeout_us)
+{
+	ce->watchdog.timeout_us = timeout_us;
+	return 0;
+}
+
 #endif /* INTEL_CONTEXT_PARAM_H */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1987e2ea79a3..a4c65780850c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1694,6 +1694,8 @@ struct drm_i915_gem_context_param {
  * Default is 16 KiB.
  */
 #define I915_CONTEXT_PARAM_RINGSIZE	0xc
+
+#define I915_CONTEXT_PARAM_WATCHDOG	0xd
 /* Must be kept compact -- no holes and well documented */
 
 	__u64 value;
@@ -1863,7 +1865,8 @@ struct drm_i915_gem_context_create_ext_clone {
 #define I915_CONTEXT_CLONE_SSEU		(1u << 3)
 #define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
 #define I915_CONTEXT_CLONE_VM		(1u << 5)
-#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
+#define I915_CONTEXT_CLONE_WATCHDOG	(1u << 6)
+#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_WATCHDOG << 1)
 	__u64 rsvd;
 };
 
-- 
2.27.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

WARNING: multiple messages have this Message-ID (diff)
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Intel-gfx@lists.freedesktop.org
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>, dri-devel@lists.freedesktop.org
Subject: [Intel-gfx] [RFC 4/6] drm/i915: Allow userspace to configure the watchdog
Date: Fri, 12 Mar 2021 15:46:20 +0000	[thread overview]
Message-ID: <20210312154622.1767865-5-tvrtko.ursulin@linux.intel.com> (raw)
In-Reply-To: <20210312154622.1767865-1-tvrtko.ursulin@linux.intel.com>

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Idea here is to make the watchdog mechanism more useful than for just
default request/fence expiry.

To this effect a new context param I915_CONTEXT_PARAM_WATCHDOG is added
where the value fields allows passing in a timeout in micro-seconds.

This allows userspace to set a limit to how long they expect their batches
to take, or otherwise they will be cancelled, and userspace notified via
one of the available mechanisms.

Main attractiveness of adding uapi here is perhaps to extend the proposal
by passing in a structure instead of a single value, like for illustration
only:

struct drm_i915_gem_context_watchdog {
	__u64 flags;
 #define I915_CONTEXT_WATCHDOG_WALL_TIME	BIT(0)
 #define I915_CONTEXT_WATCHDOG_GPU_TIME		BIT(1)
 #define I915_CONTEXT_WATCHDOG_FROM_SUBMIT	BIT(2)
 #define I915_CONTEXT_WATCHDOG_FROM_RUNNABLE	BIT(3)
	__64 timeout_us;
};

Point being to prepare the uapi for different semantics from the start.
Given how not a single one makes complete sense for all use cases. And
also perhaps satisfy the long wanted media watchdog feature request.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 57 +++++++++++++++++++
 .../gpu/drm/i915/gem/i915_gem_context_types.h |  4 ++
 drivers/gpu/drm/i915/gt/intel_context_param.h | 11 +++-
 include/uapi/drm/i915_drm.h                   |  5 +-
 4 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index ca37d93ef5e7..32b05af4fc8f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -233,6 +233,8 @@ static void intel_context_set_gem(struct intel_context *ce,
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
 	    intel_engine_has_timeslices(ce->engine))
 		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
+
+	intel_context_set_watchdog_us(ce, ctx->watchdog.timeout_us);
 }
 
 static void __free_engines(struct i915_gem_engines *e, unsigned int count)
@@ -1397,6 +1399,28 @@ static int set_ringsize(struct i915_gem_context *ctx,
 				 __intel_context_ring_size(args->value));
 }
 
+static int __apply_watchdog(struct intel_context *ce, void *timeout_us)
+{
+	return intel_context_set_watchdog_us(ce, (uintptr_t)timeout_us);
+}
+
+static int set_watchdog(struct i915_gem_context *ctx,
+			struct drm_i915_gem_context_param *args)
+{
+	int ret;
+
+	if (args->size)
+		return -EINVAL;
+
+	ret = context_apply_all(ctx, __apply_watchdog,
+				(void *)(uintptr_t)args->value);
+
+	if (!ret)
+		ctx->watchdog.timeout_us = args->value;
+
+	return ret;
+}
+
 static int __get_ringsize(struct intel_context *ce, void *arg)
 {
 	long sz;
@@ -1426,6 +1450,17 @@ static int get_ringsize(struct i915_gem_context *ctx,
 	return 0;
 }
 
+static int get_watchdog(struct i915_gem_context *ctx,
+			struct drm_i915_gem_context_param *args)
+{
+	if (args->size)
+		return -EINVAL;
+
+	args->value = ctx->watchdog.timeout_us;
+
+	return 0;
+}
+
 int
 i915_gem_user_to_context_sseu(struct intel_gt *gt,
 			      const struct drm_i915_gem_context_param_sseu *user,
@@ -2075,6 +2110,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_ringsize(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = set_watchdog(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
@@ -2196,6 +2235,19 @@ static int clone_schedattr(struct i915_gem_context *dst,
 	return 0;
 }
 
+static int clone_watchdog(struct i915_gem_context *dst,
+			  struct i915_gem_context *src)
+{
+	int ret;
+
+	ret = context_apply_all(dst, __apply_watchdog,
+				(void *)(uintptr_t)src->watchdog.timeout_us);
+	if (!ret)
+		dst->watchdog = src->watchdog;
+
+	return ret;
+}
+
 static int clone_sseu(struct i915_gem_context *dst,
 		      struct i915_gem_context *src)
 {
@@ -2279,6 +2331,7 @@ static int create_clone(struct i915_user_extension __user *ext, void *data)
 		MAP(SSEU, clone_sseu),
 		MAP(TIMELINE, clone_timeline),
 		MAP(VM, clone_vm),
+		MAP(WATCHDOG, clone_watchdog),
 #undef MAP
 	};
 	struct drm_i915_gem_context_create_ext_clone local;
@@ -2532,6 +2585,10 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		ret = get_ringsize(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = get_watchdog(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index d5bc75508048..f17da7e26c43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -150,6 +150,10 @@ struct i915_gem_context {
 	 */
 	atomic_t active_count;
 
+	struct {
+		u64 timeout_us;
+	} watchdog;
+
 	/**
 	 * @hang_timestamp: The last time(s) this context caused a GPU hang
 	 */
diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h
index f053d8633fe2..3ecacc675f41 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_param.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_param.h
@@ -6,9 +6,18 @@
 #ifndef INTEL_CONTEXT_PARAM_H
 #define INTEL_CONTEXT_PARAM_H
 
-struct intel_context;
+#include <linux/types.h>
+
+#include "intel_context.h"
 
 int intel_context_set_ring_size(struct intel_context *ce, long sz);
 long intel_context_get_ring_size(struct intel_context *ce);
 
+static inline int
+intel_context_set_watchdog_us(struct intel_context *ce, u64 timeout_us)
+{
+	ce->watchdog.timeout_us = timeout_us;
+	return 0;
+}
+
 #endif /* INTEL_CONTEXT_PARAM_H */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1987e2ea79a3..a4c65780850c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1694,6 +1694,8 @@ struct drm_i915_gem_context_param {
  * Default is 16 KiB.
  */
 #define I915_CONTEXT_PARAM_RINGSIZE	0xc
+
+#define I915_CONTEXT_PARAM_WATCHDOG	0xd
 /* Must be kept compact -- no holes and well documented */
 
 	__u64 value;
@@ -1863,7 +1865,8 @@ struct drm_i915_gem_context_create_ext_clone {
 #define I915_CONTEXT_CLONE_SSEU		(1u << 3)
 #define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
 #define I915_CONTEXT_CLONE_VM		(1u << 5)
-#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
+#define I915_CONTEXT_CLONE_WATCHDOG	(1u << 6)
+#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_WATCHDOG << 1)
 	__u64 rsvd;
 };
 
-- 
2.27.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  parent reply	other threads:[~2021-03-12 15:49 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-12 15:46 [RFC 0/6] Default request/fence expiry + watchdog Tvrtko Ursulin
2021-03-12 15:46 ` [Intel-gfx] " Tvrtko Ursulin
2021-03-12 15:46 ` [RFC 1/6] drm/i915: Individual request cancellation Tvrtko Ursulin
2021-03-12 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2021-03-15 17:37   ` Tvrtko Ursulin
2021-03-15 17:37     ` Tvrtko Ursulin
2021-03-16 10:02     ` Daniel Vetter
2021-03-16 10:02       ` Daniel Vetter
2021-03-12 15:46 ` [RFC 2/6] drm/i915: Restrict sentinel requests further Tvrtko Ursulin
2021-03-12 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2021-03-12 15:46 ` [RFC 3/6] drm/i915: Request watchdog infrastructure Tvrtko Ursulin
2021-03-12 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2021-03-12 15:46 ` Tvrtko Ursulin [this message]
2021-03-12 15:46   ` [Intel-gfx] [RFC 4/6] drm/i915: Allow userspace to configure the watchdog Tvrtko Ursulin
2021-03-16 10:09   ` Daniel Vetter
2021-03-16 10:09     ` [Intel-gfx] " Daniel Vetter
2021-03-12 15:46 ` [RFC 5/6] drm/i915: Fail too long user submissions by default Tvrtko Ursulin
2021-03-12 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2021-03-16 10:10   ` Daniel Vetter
2021-03-16 10:10     ` [Intel-gfx] " Daniel Vetter
2021-03-12 15:46 ` [RFC 6/6] drm/i915: Allow configuring default request expiry via modparam Tvrtko Ursulin
2021-03-12 15:46   ` [Intel-gfx] " Tvrtko Ursulin
2021-03-16 10:03   ` Daniel Vetter
2021-03-16 10:03     ` [Intel-gfx] " Daniel Vetter
2021-03-12 16:22 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Default request/fence expiry + watchdog Patchwork
2021-03-12 16:48 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2021-03-12 18:25 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210312154622.1767865-5-tvrtko.ursulin@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=Intel-gfx@lists.freedesktop.org \
    --cc=daniel.vetter@ffwll.ch \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=tvrtko.ursulin@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.