* [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20  9:32 ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

i915_request_add() consumes the passed in reference to the i915_request,
so if the selftest caller wishes to wait upon it afterwards, it needs to
take a reference for itself.
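
For reference, the resulting pattern looks roughly like this (an
illustrative sketch rather than the patch itself; ctx, engine and err
stand in for the selftest's locals):

    rq = igt_request_alloc(ctx, engine);
    if (IS_ERR(rq))
        return PTR_ERR(rq);

    i915_request_get(rq);       /* keep a reference for the waiter */
    i915_request_add(rq);       /* consumes the allocation reference */

    if (i915_request_wait(rq, 0, HZ / 5) < 0)   /* rq still safe to use */
        err = -EIO;

    i915_request_put(rq);       /* drop the waiter's reference */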

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../drm/i915/gem/selftests/i915_gem_context.c | 38 ++++++++++++++-----
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 9a509c18b7c7..16df814f3efd 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
 	}
 
 	for_each_uabi_engine(engine, i915) {
-		struct i915_request *rq;
+		struct i915_request *rq = NULL;
 		unsigned long end_time, prime;
 		ktime_t times[2] = {};
 
 		times[0] = ktime_get_raw();
 		for (n = 0; n < nctx; n++) {
-			rq = igt_request_alloc(ctx[n], engine);
-			if (IS_ERR(rq)) {
-				err = PTR_ERR(rq);
+			struct i915_request *this;
+
+			this = igt_request_alloc(ctx[n], engine);
+			if (IS_ERR(this)) {
+				err = PTR_ERR(this);
 				goto out_file;
 			}
-			i915_request_add(rq);
+			if (rq) {
+				i915_request_await_dma_fence(this, &rq->fence);
+				i915_request_put(rq);
+			}
+			rq = i915_request_get(this);
+			i915_request_add(this);
 		}
 		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 			pr_err("Failed to populated %d contexts\n", nctx);
 			intel_gt_set_wedged(&i915->gt);
+			i915_request_put(rq);
 			err = -EIO;
 			goto out_file;
 		}
+		i915_request_put(rq);
 
 		times[1] = ktime_get_raw();
 
@@ -106,13 +115,21 @@ static int live_nop_switch(void *arg)
 		for_each_prime_number_from(prime, 2, 8192) {
 			times[1] = ktime_get_raw();
 
+			rq = NULL;
 			for (n = 0; n < prime; n++) {
-				rq = igt_request_alloc(ctx[n % nctx], engine);
-				if (IS_ERR(rq)) {
-					err = PTR_ERR(rq);
+				struct i915_request *this;
+
+				this = igt_request_alloc(ctx[n % nctx], engine);
+				if (IS_ERR(this)) {
+					err = PTR_ERR(this);
 					goto out_file;
 				}
 
+				if (rq) { /* Force submission order */
+					i915_request_await_dma_fence(this, &rq->fence);
+					i915_request_put(rq);
+				}
+
 				/*
 				 * This space is left intentionally blank.
 				 *
@@ -127,14 +144,17 @@ static int live_nop_switch(void *arg)
 				 * for latency.
 				 */
 
-				i915_request_add(rq);
+				rq = i915_request_get(this);
+				i915_request_add(this);
 			}
+			GEM_BUG_ON(!rq);
 			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 				pr_err("Switching between %ld contexts timed out\n",
 				       prime);
 				intel_gt_set_wedged(&i915->gt);
+				i915_request_put(rq);
 				break;
 			}
+			i915_request_put(rq);
 
 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 			if (prime == 2)
-- 
2.24.0


* [PATCH 2/9] drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx; +Cc: Matthew Auld

The general concept was that intel_timeline.active_count was locked by
the intel_timeline.mutex. The exception was for power management, where
the engine->kernel_context->timeline could be manipulated under the
global wakeref.mutex.

This was quite solid, as we always manipulated the timeline only while
we held an engine wakeref.

And then we started retiring requests outside of struct_mutex, only
using the timelines.active_list and the timeline->mutex. There we
started manipulating intel_timeline.active_count outside of an engine
wakeref, and so introduced a race between __engine_park() and
intel_gt_retire_requests(), a race that could result in the
engine->kernel_context not being added to the active timelines and so
losing requests, which caused us to keep the system permanently powered
up [and unloadable].

The race would be easy to close if we could take the engine wakeref for
the timeline before we retire -- except timelines are not bound to any
engine and so we would need to keep all active engines awake. The
alternative is to guard intel_timeline_enter/intel_timeline_exit for use
outside of the timeline->mutex.
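
In outline, enter/exit keep a lock-free fast path for the already-active
cases and only take the global timelines->lock around the 0 <-> 1 list
transitions (a simplified sketch of the scheme in the diff below; the
irq-save details are elided):

    /* intel_timeline_enter() */
    if (atomic_add_unless(&tl->active_count, 1, 0))
        return;                         /* already on the active_list */
    spin_lock(&timelines->lock);
    if (!atomic_fetch_inc(&tl->active_count))
        list_add_tail(&tl->link, &timelines->active_list);
    spin_unlock(&timelines->lock);

    /* intel_timeline_exit() */
    if (atomic_add_unless(&tl->active_count, -1, 1))
        return;                         /* other users keep it active */
    spin_lock(&timelines->lock);
    if (atomic_dec_and_test(&tl->active_count))
        list_del(&tl->link);
    spin_unlock(&timelines->lock);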

Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   |  8 ++---
 drivers/gpu/drm/i915/gt/intel_timeline.c      | 34 +++++++++++++++----
 .../gpu/drm/i915/gt/intel_timeline_types.h    |  2 +-
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 25291e2af21e..1a005da8c588 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -49,8 +49,8 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 			continue;
 
 		intel_timeline_get(tl);
-		GEM_BUG_ON(!tl->active_count);
-		tl->active_count++; /* pin the list element */
+		GEM_BUG_ON(!atomic_read(&tl->active_count));
+		atomic_inc(&tl->active_count); /* pin the list element */
 		spin_unlock_irqrestore(&timelines->lock, flags);
 
 		if (timeout > 0) {
@@ -71,14 +71,14 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 		/* Resume iteration after dropping lock */
 		list_safe_reset_next(tl, tn, link);
-		if (!--tl->active_count)
+		if (atomic_dec_and_test(&tl->active_count))
 			list_del(&tl->link);
 
 		mutex_unlock(&tl->mutex);
 
 		/* Defer the final release to after the spinlock */
 		if (refcount_dec_and_test(&tl->kref.refcount)) {
-			GEM_BUG_ON(tl->active_count);
+			GEM_BUG_ON(atomic_read(&tl->active_count));
 			list_add(&tl->link, &free);
 		}
 	}
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index 0e277835aad0..b35f12729983 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -334,15 +334,33 @@ void intel_timeline_enter(struct intel_timeline *tl)
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
 	unsigned long flags;
 
+	/*
+	 * Pretend we are serialised by the timeline->mutex.
+	 *
+	 * While generally true, there are a few exceptions to the rule
+	 * for the engine->kernel_context being used to manage power
+	 * transitions. As the engine_park may be called from under any
+	 * timeline, it uses the power mutex as a global serialisation
+	 * lock to prevent any other request entering its timeline.
+	 *
+	 * The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
+	 *
+	 * However, intel_gt_retire_requests() does not know which engine
+	 * it is retiring along and so cannot partake in the engine-pm
+	 * barrier, and there we use the tl->active_count as a means to
+	 * pin the timeline in the active_list while the locks are dropped.
+	 * Ergo, as that is outside of the engine-pm barrier, we need to
+	 * use atomics to manipulate tl->active_count.
+	 */
 	lockdep_assert_held(&tl->mutex);
-
 	GEM_BUG_ON(!atomic_read(&tl->pin_count));
-	if (tl->active_count++)
+
+	if (atomic_add_unless(&tl->active_count, 1, 0))
 		return;
-	GEM_BUG_ON(!tl->active_count); /* overflow? */
 
 	spin_lock_irqsave(&timelines->lock, flags);
-	list_add_tail(&tl->link, &timelines->active_list);
+	if (!atomic_fetch_inc(&tl->active_count))
+		list_add_tail(&tl->link, &timelines->active_list);
 	spin_unlock_irqrestore(&timelines->lock, flags);
 }
 
@@ -351,14 +369,16 @@ void intel_timeline_exit(struct intel_timeline *tl)
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
 	unsigned long flags;
 
+	/* See intel_timeline_enter() */
 	lockdep_assert_held(&tl->mutex);
 
-	GEM_BUG_ON(!tl->active_count);
-	if (--tl->active_count)
+	GEM_BUG_ON(!atomic_read(&tl->active_count));
+	if (atomic_add_unless(&tl->active_count, -1, 1))
 		return;
 
 	spin_lock_irqsave(&timelines->lock, flags);
-	list_del(&tl->link);
+	if (atomic_dec_and_test(&tl->active_count))
+		list_del(&tl->link);
 	spin_unlock_irqrestore(&timelines->lock, flags);
 
 	/*
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 98d9ee166379..5244615ed1cb 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
@@ -42,7 +42,7 @@ struct intel_timeline {
 	 * from the intel_context caller plus internal atomicity.
 	 */
 	atomic_t pin_count;
-	unsigned int active_count;
+	atomic_t active_count;
 
 	const u32 *hwsp_seqno;
 	struct i915_vma *hwsp_ggtt;
-- 
2.24.0


* [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
the backend"), I erroneously concluded that we last modify the engine
inside __i915_request_commit() meaning that we could enable concurrent
submission for userspace as we enqueued this request. However, this
falls into a trap with other users of the engine->kernel_context waking
up and submitting their request before the idle-switch is queued, with
the result that the kernel_context is executed out-of-sequence most
likely upsetting the GPU and certainly ourselves when we try to retire
the out-of-sequence requests.

As such we need to hold onto the effective engine->kernel_context mutex
lock (via the engine pm mutex proxy) until we have finished queuing the
request to the engine.
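
Condensed, the ordering in switch_to_kernel_context() changes from
exposing the engine before queuing to queuing first (a sketch of the
hunks below):

    /* before: engine exposed to new submitters too early */
    __i915_request_commit(rq);
    __intel_wakeref_defer_park(&engine->wakeref);
    __i915_request_queue(rq, NULL);

    /* after: queue the idle-switch, then enter the timeline and release
     * the engine-pm "mutex" together under timelines->lock */
    __i915_request_commit(rq);
    __i915_request_queue(rq, NULL);
    __intel_timeline_enter_and_pm_release(ce->timeline, engine);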

v2: Serialise against concurrent intel_gt_retire_requests()
v3: Describe the hairy locking scheme with intel_gt_retire_requests()
for future reference.
v4: Combine timeline->lock and engine pm release; it's hairy.

Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 3c0f490ff2c7..1f517357a268 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
 
 #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
 
+static void
+__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
+				      struct intel_engine_cs *engine)
+{
+	struct intel_gt_timelines *timelines = &engine->gt->timelines;
+
+	spin_lock(&timelines->lock);
+
+	if (!atomic_fetch_inc(&tl->active_count))
+		list_add_tail(&tl->link, &timelines->active_list);
+
+	__intel_wakeref_defer_park(&engine->wakeref);
+
+	spin_unlock(&timelines->lock);
+}
+
 static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 {
+	struct intel_context *ce = engine->kernel_context;
 	struct i915_request *rq;
 	unsigned long flags;
 	bool result = true;
@@ -98,16 +115,31 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	 * This should hold true as we can only park the engine after
 	 * retiring the last request, thus all rings should be empty and
 	 * all timelines idle.
+	 *
+	 * For unlocking, there are 2 other parties and the GPU who have a
+	 * stake here.
+	 *
+	 * A new gpu user will be waiting on the engine-pm to start their
+	 * engine_unpark. New waiters are predicated on engine->wakeref.count
+	 * and so intel_wakeref_defer_park() acts like a mutex_unlock of the
+	 * engine->wakeref.
+	 *
+	 * The other party is intel_gt_retire_requests(), which is walking the
+	 * list of active timelines looking for completions. Meanwhile as soon
+	 * as we call __i915_request_queue(), the GPU may complete our request.
+	 * Ergo, if we put ourselves on the timelines.active_list
+	 * (see intel_timeline_enter()) before we increment the
+	 * engine->wakeref.count, we may see the request completion and retire
+	 * it causing an underflow of the engine->wakeref.
 	 */
-	flags = __timeline_mark_lock(engine->kernel_context);
+	flags = __timeline_mark_lock(ce);
+	GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0);
 
-	rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
+	rq = __i915_request_create(ce, GFP_NOWAIT);
 	if (IS_ERR(rq))
 		/* Context switch failed, hope for the best! Maybe reset? */
 		goto out_unlock;
 
-	intel_timeline_enter(i915_request_timeline(rq));
-
 	/* Check again on the next retirement. */
 	engine->wakeref_serial = engine->serial + 1;
 	i915_request_add_active_barriers(rq);
@@ -116,13 +148,14 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
 	__i915_request_commit(rq);
 
-	/* Release our exclusive hold on the engine */
-	__intel_wakeref_defer_park(&engine->wakeref);
 	__i915_request_queue(rq, NULL);
 
+	/* Expose ourselves to intel_gt_retire_requests() and new submission */
+	__intel_timeline_enter_and_pm_release(ce->timeline, engine);
+
 	result = false;
 out_unlock:
-	__timeline_mark_unlock(engine->kernel_context, flags);
+	__timeline_mark_unlock(ce, flags);
 	return result;
 }
 
-- 
2.24.0


* [PATCH 4/9] drm/i915: Mark up the calling context for intel_wakeref_put()
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

Previously, we assumed we could use mutex_trylock() within an atomic
context, falling back to a worker if contended. However, such trickery
is illegal inside interrupt context, and so we need to always use a
worker under such circumstances. As we normally are in process context,
we can typically use a plain mutex, and only defer to a worker when we
know we are called from an interrupt path.
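
Callers therefore pick the variant that matches their context (a sketch;
both funnel into the new __intel_wakeref_put() below):

    /* process context: may take wf->mutex and sleep on the final put */
    intel_engine_pm_put(engine);

    /* (soft)irq or other atomic context: never sleeps, the final put is
     * punted to the wakeref worker */
    intel_engine_pm_put_async(engine);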

Fixes: 51fbd8de87dc ("drm/i915/pmu: Atomically acquire the gt_pm wakeref")
References: a0855d24fc22d ("locking/mutex: Complain upon mutex API misuse in IRQ contexts")
References: https://bugs.freedesktop.org/show_bug.cgi?id=111626
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_pm.c    |  3 +-
 drivers/gpu/drm/i915/gt/intel_engine_pm.h    | 10 +++++++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c        |  1 -
 drivers/gpu/drm/i915/gt/intel_gt_pm.h        |  5 ++++
 drivers/gpu/drm/i915/gt/intel_lrc.c          |  2 +-
 drivers/gpu/drm/i915/gt/intel_reset.c        |  2 +-
 drivers/gpu/drm/i915/gt/selftest_engine_pm.c |  7 +++--
 drivers/gpu/drm/i915/i915_active.c           |  5 ++--
 drivers/gpu/drm/i915/i915_pmu.c              |  6 ++--
 drivers/gpu/drm/i915/intel_wakeref.c         |  8 +++---
 drivers/gpu/drm/i915/intel_wakeref.h         | 30 ++++++++++++++------
 11 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 1f517357a268..f3035b3ab9fa 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -210,7 +210,8 @@ static int __engine_park(struct intel_wakeref *wf)
 
 	engine->execlists.no_priolist = false;
 
-	intel_gt_pm_put(engine->gt);
+	/* While we call i915_vma_parked, we have to break the lock cycle */
+	intel_gt_pm_put_async(engine->gt);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
index 739c50fefcef..24e20344dc22 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
@@ -31,6 +31,16 @@ static inline void intel_engine_pm_put(struct intel_engine_cs *engine)
 	intel_wakeref_put(&engine->wakeref);
 }
 
+static inline void intel_engine_pm_put_async(struct intel_engine_cs *engine)
+{
+	intel_wakeref_put_async(&engine->wakeref);
+}
+
+static inline void intel_engine_pm_flush(struct intel_engine_cs *engine)
+{
+	intel_wakeref_unlock_wait(&engine->wakeref);
+}
+
 void intel_engine_init__pm(struct intel_engine_cs *engine);
 
 #endif /* INTEL_ENGINE_PM_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 470fbdc30e5a..f6b5169d623f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -105,7 +105,6 @@ static int __gt_park(struct intel_wakeref *wf)
 static const struct intel_wakeref_ops wf_ops = {
 	.get = __gt_unpark,
 	.put = __gt_park,
-	.flags = INTEL_WAKEREF_PUT_ASYNC,
 };
 
 void intel_gt_pm_init_early(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index b3e17399be9b..990efc27a4e4 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -32,6 +32,11 @@ static inline void intel_gt_pm_put(struct intel_gt *gt)
 	intel_wakeref_put(&gt->wakeref);
 }
 
+static inline void intel_gt_pm_put_async(struct intel_gt *gt)
+{
+	intel_wakeref_put_async(&gt->wakeref);
+}
+
 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
 {
 	return intel_wakeref_wait_for_idle(&gt->wakeref);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 33ce258d484f..b65bc06855b0 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1172,7 +1172,7 @@ __execlists_schedule_out(struct i915_request *rq,
 
 	intel_engine_context_out(engine);
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
-	intel_gt_pm_put(engine->gt);
+	intel_gt_pm_put_async(engine->gt);
 
 	/*
 	 * If this is part of a virtual engine, its next request may
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index b7007cd78c6f..0388f9375366 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1125,7 +1125,7 @@ int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
 out:
 	intel_engine_cancel_stop_cs(engine);
 	reset_finish_engine(engine);
-	intel_engine_pm_put(engine);
+	intel_engine_pm_put_async(engine);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 20b9c83f43ad..cbf6b0735272 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -51,11 +51,12 @@ static int live_engine_pm(void *arg)
 				pr_err("intel_engine_pm_get_if_awake(%s) failed under %s\n",
 				       engine->name, p->name);
 			else
-				intel_engine_pm_put(engine);
-			intel_engine_pm_put(engine);
+				intel_engine_pm_put_async(engine);
+			intel_engine_pm_put_async(engine);
 			p->critical_section_end();
 
-			/* engine wakeref is sync (instant) */
+			intel_engine_pm_flush(engine);
+
 			if (intel_engine_pm_is_awake(engine)) {
 				pr_err("%s is still awake after flushing pm\n",
 				       engine->name);
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index 5448f37c8102..dca15ace88f6 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -672,12 +672,13 @@ void i915_active_acquire_barrier(struct i915_active *ref)
 	 * populated by i915_request_add_active_barriers() to point to the
 	 * request that will eventually release them.
 	 */
-	spin_lock_irqsave_nested(&ref->tree_lock, flags, SINGLE_DEPTH_NESTING);
 	llist_for_each_safe(pos, next, take_preallocated_barriers(ref)) {
 		struct active_node *node = barrier_from_ll(pos);
 		struct intel_engine_cs *engine = barrier_to_engine(node);
 		struct rb_node **p, *parent;
 
+		spin_lock_irqsave_nested(&ref->tree_lock, flags,
+					 SINGLE_DEPTH_NESTING);
 		parent = NULL;
 		p = &ref->tree.rb_node;
 		while (*p) {
@@ -693,12 +694,12 @@ void i915_active_acquire_barrier(struct i915_active *ref)
 		}
 		rb_link_node(&node->node, parent, p);
 		rb_insert_color(&node->node, &ref->tree);
+		spin_unlock_irqrestore(&ref->tree_lock, flags);
 
 		GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
 		llist_add(barrier_to_ll(node), &engine->barrier_tasks);
 		intel_engine_pm_put(engine);
 	}
-	spin_unlock_irqrestore(&ref->tree_lock, flags);
 }
 
 void i915_request_add_active_barriers(struct i915_request *rq)
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 9b02be0ad4e6..95e824a78d4d 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -190,7 +190,7 @@ static u64 get_rc6(struct intel_gt *gt)
 	val = 0;
 	if (intel_gt_pm_get_if_awake(gt)) {
 		val = __get_rc6(gt);
-		intel_gt_pm_put(gt);
+		intel_gt_pm_put_async(gt);
 	}
 
 	spin_lock_irqsave(&pmu->lock, flags);
@@ -360,7 +360,7 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
 skip:
 		if (unlikely(mmio_lock))
 			spin_unlock_irqrestore(mmio_lock, flags);
-		intel_engine_pm_put(engine);
+		intel_engine_pm_put_async(engine);
 	}
 }
 
@@ -398,7 +398,7 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
 			if (stat)
 				val = intel_get_cagf(rps, stat);
 
-			intel_gt_pm_put(gt);
+			intel_gt_pm_put_async(gt);
 		}
 
 		add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index ad26d7f4ca3d..59aa1b6f1827 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -54,7 +54,8 @@ int __intel_wakeref_get_first(struct intel_wakeref *wf)
 
 static void ____intel_wakeref_put_last(struct intel_wakeref *wf)
 {
-	if (!atomic_dec_and_test(&wf->count))
+	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
+	if (unlikely(!atomic_dec_and_test(&wf->count)))
 		goto unlock;
 
 	/* ops->put() must reschedule its own release on error/deferral */
@@ -67,13 +68,12 @@ static void ____intel_wakeref_put_last(struct intel_wakeref *wf)
 	mutex_unlock(&wf->mutex);
 }
 
-void __intel_wakeref_put_last(struct intel_wakeref *wf)
+void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags)
 {
 	INTEL_WAKEREF_BUG_ON(work_pending(&wf->work));
 
 	/* Assume we are not in process context and so cannot sleep. */
-	if (wf->ops->flags & INTEL_WAKEREF_PUT_ASYNC ||
-	    !mutex_trylock(&wf->mutex)) {
+	if (flags & INTEL_WAKEREF_PUT_ASYNC || !mutex_trylock(&wf->mutex)) {
 		schedule_work(&wf->work);
 		return;
 	}
diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
index affe4de3746b..da6e8fd506e6 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.h
+++ b/drivers/gpu/drm/i915/intel_wakeref.h
@@ -9,6 +9,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bits.h>
+#include <linux/lockdep.h>
 #include <linux/mutex.h>
 #include <linux/refcount.h>
 #include <linux/stackdepot.h>
@@ -29,9 +30,6 @@ typedef depot_stack_handle_t intel_wakeref_t;
 struct intel_wakeref_ops {
 	int (*get)(struct intel_wakeref *wf);
 	int (*put)(struct intel_wakeref *wf);
-
-	unsigned long flags;
-#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
 };
 
 struct intel_wakeref {
@@ -57,7 +55,7 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 } while (0)
 
 int __intel_wakeref_get_first(struct intel_wakeref *wf);
-void __intel_wakeref_put_last(struct intel_wakeref *wf);
+void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags);
 
 /**
  * intel_wakeref_get: Acquire the wakeref
@@ -100,10 +98,9 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
 }
 
 /**
- * intel_wakeref_put: Release the wakeref
- * @i915: the drm_i915_private device
+ * intel_wakeref_put_flags: Release the wakeref
  * @wf: the wakeref
- * @fn: callback for releasing the wakeref, called only on final release.
+ * @flags: control flags
  *
  * Release our hold on the wakeref. When there are no more users,
  * the runtime pm wakeref will be released after the @fn callback is called
@@ -116,11 +113,25 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
  * code otherwise.
  */
 static inline void
-intel_wakeref_put(struct intel_wakeref *wf)
+__intel_wakeref_put(struct intel_wakeref *wf, unsigned long flags)
+#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
 {
 	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
 	if (unlikely(!atomic_add_unless(&wf->count, -1, 1)))
-		__intel_wakeref_put_last(wf);
+		__intel_wakeref_put_last(wf, flags);
+}
+
+static inline void
+intel_wakeref_put(struct intel_wakeref *wf)
+{
+	might_sleep();
+	__intel_wakeref_put(wf, 0);
+}
+
+static inline void
+intel_wakeref_put_async(struct intel_wakeref *wf)
+{
+	__intel_wakeref_put(wf, INTEL_WAKEREF_PUT_ASYNC);
 }
 
 /**
@@ -185,6 +196,7 @@ intel_wakeref_is_active(const struct intel_wakeref *wf)
 static inline void
 __intel_wakeref_defer_park(struct intel_wakeref *wf)
 {
+	lockdep_assert_held(&wf->mutex);
 	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count));
 	atomic_set_release(&wf->count, 1);
 }
-- 
2.24.0


* [PATCH 5/9] drm/i915/gt: Declare timeline.lock to be irq-free
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

Now that we never allow the intel_wakeref callbacks to be invoked from
interrupt context, we do not need the irqsafe spinlock for the timeline.
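
That is, every taker of timelines->lock now runs in process context, so
the irq-save variants can be dropped (a sketch of the conversion in the
diff):

    /* before */
    spin_lock_irqsave(&timelines->lock, flags);
    list_add_tail(&tl->link, &timelines->active_list);
    spin_unlock_irqrestore(&timelines->lock, flags);

    /* after */
    spin_lock(&timelines->lock);
    list_add_tail(&tl->link, &timelines->active_list);
    spin_unlock(&timelines->lock);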

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_requests.c |  9 ++++-----
 drivers/gpu/drm/i915/gt/intel_reset.c       |  9 ++++-----
 drivers/gpu/drm/i915/gt/intel_timeline.c    | 10 ++++------
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 1a005da8c588..4dc3cbeb1b36 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -33,7 +33,6 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
 	struct intel_timeline *tl, *tn;
-	unsigned long flags;
 	bool interruptible;
 	LIST_HEAD(free);
 
@@ -43,7 +42,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 	flush_submission(gt); /* kick the ksoftirqd tasklets */
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	list_for_each_entry_safe(tl, tn, &timelines->active_list, link) {
 		if (!mutex_trylock(&tl->mutex))
 			continue;
@@ -51,7 +50,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 		intel_timeline_get(tl);
 		GEM_BUG_ON(!atomic_read(&tl->active_count));
 		atomic_inc(&tl->active_count); /* pin the list element */
-		spin_unlock_irqrestore(&timelines->lock, flags);
+		spin_unlock(&timelines->lock);
 
 		if (timeout > 0) {
 			struct dma_fence *fence;
@@ -67,7 +66,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 		retire_requests(tl);
 
-		spin_lock_irqsave(&timelines->lock, flags);
+		spin_lock(&timelines->lock);
 
 		/* Resume iteration after dropping lock */
 		list_safe_reset_next(tl, tn, link);
@@ -82,7 +81,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 			list_add(&tl->link, &free);
 		}
 	}
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	list_for_each_entry_safe(tl, tn, &free, link)
 		__intel_timeline_free(&tl->kref);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 0388f9375366..36189238e13c 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -831,7 +831,6 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
 	struct intel_timeline *tl;
-	unsigned long flags;
 	bool ok;
 
 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
@@ -853,7 +852,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 	 *
 	 * No more can be submitted until we reset the wedged bit.
 	 */
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	list_for_each_entry(tl, &timelines->active_list, link) {
 		struct dma_fence *fence;
 
@@ -861,7 +860,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 		if (!fence)
 			continue;
 
-		spin_unlock_irqrestore(&timelines->lock, flags);
+		spin_unlock(&timelines->lock);
 
 		/*
 		 * All internal dependencies (i915_requests) will have
@@ -874,10 +873,10 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 		dma_fence_put(fence);
 
 		/* Restart iteration after droping lock */
-		spin_lock_irqsave(&timelines->lock, flags);
+		spin_lock(&timelines->lock);
 		tl = list_entry(&timelines->active_list, typeof(*tl), link);
 	}
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	/* We must reset pending GPU events before restoring our submission */
 	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index b35f12729983..b190a5d9ab02 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -332,7 +332,6 @@ int intel_timeline_pin(struct intel_timeline *tl)
 void intel_timeline_enter(struct intel_timeline *tl)
 {
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
-	unsigned long flags;
 
 	/*
 	 * Pretend we are serialised by the timeline->mutex.
@@ -358,16 +357,15 @@ void intel_timeline_enter(struct intel_timeline *tl)
 	if (atomic_add_unless(&tl->active_count, 1, 0))
 		return;
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	if (!atomic_fetch_inc(&tl->active_count))
 		list_add_tail(&tl->link, &timelines->active_list);
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 }
 
 void intel_timeline_exit(struct intel_timeline *tl)
 {
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
-	unsigned long flags;
 
 	/* See intel_timeline_enter() */
 	lockdep_assert_held(&tl->mutex);
@@ -376,10 +374,10 @@ void intel_timeline_exit(struct intel_timeline *tl)
 	if (atomic_add_unless(&tl->active_count, -1, 1))
 		return;
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	if (atomic_dec_and_test(&tl->active_count))
 		list_del(&tl->link);
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	/*
 	 * Since this timeline is idle, all bariers upon which we were waiting
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [Intel-gfx] [PATCH 5/9] drm/i915/gt: Declare timeline.lock to be irq-free
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

Now that we never allow the intel_wakeref callbacks to be invoked from
interrupt context, we do not need the irqsafe spinlock for the timeline.
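
As an aside, a minimal sketch of the pattern this relies upon (illustrative
only, not part of the patch; the helper name is made up): every user of
timelines->lock now runs in process context, so the plain lock form suffices
and lockdep will catch any future irq-context acquisition.

static void walk_active_timelines(struct intel_gt_timelines *timelines)
{
	struct intel_timeline *tl;

	spin_lock(&timelines->lock); /* process context only; no irqsave */
	list_for_each_entry(tl, &timelines->active_list, link)
		; /* inspect tl while holding the lock */
	spin_unlock(&timelines->lock);
}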

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_requests.c |  9 ++++-----
 drivers/gpu/drm/i915/gt/intel_reset.c       |  9 ++++-----
 drivers/gpu/drm/i915/gt/intel_timeline.c    | 10 ++++------
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 1a005da8c588..4dc3cbeb1b36 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -33,7 +33,6 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
 	struct intel_timeline *tl, *tn;
-	unsigned long flags;
 	bool interruptible;
 	LIST_HEAD(free);
 
@@ -43,7 +42,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 	flush_submission(gt); /* kick the ksoftirqd tasklets */
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	list_for_each_entry_safe(tl, tn, &timelines->active_list, link) {
 		if (!mutex_trylock(&tl->mutex))
 			continue;
@@ -51,7 +50,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 		intel_timeline_get(tl);
 		GEM_BUG_ON(!atomic_read(&tl->active_count));
 		atomic_inc(&tl->active_count); /* pin the list element */
-		spin_unlock_irqrestore(&timelines->lock, flags);
+		spin_unlock(&timelines->lock);
 
 		if (timeout > 0) {
 			struct dma_fence *fence;
@@ -67,7 +66,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 		retire_requests(tl);
 
-		spin_lock_irqsave(&timelines->lock, flags);
+		spin_lock(&timelines->lock);
 
 		/* Resume iteration after dropping lock */
 		list_safe_reset_next(tl, tn, link);
@@ -82,7 +81,7 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 			list_add(&tl->link, &free);
 		}
 	}
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	list_for_each_entry_safe(tl, tn, &free, link)
 		__intel_timeline_free(&tl->kref);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index 0388f9375366..36189238e13c 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -831,7 +831,6 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
 	struct intel_timeline *tl;
-	unsigned long flags;
 	bool ok;
 
 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
@@ -853,7 +852,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 	 *
 	 * No more can be submitted until we reset the wedged bit.
 	 */
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	list_for_each_entry(tl, &timelines->active_list, link) {
 		struct dma_fence *fence;
 
@@ -861,7 +860,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 		if (!fence)
 			continue;
 
-		spin_unlock_irqrestore(&timelines->lock, flags);
+		spin_unlock(&timelines->lock);
 
 		/*
 		 * All internal dependencies (i915_requests) will have
@@ -874,10 +873,10 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 		dma_fence_put(fence);
 
 		/* Restart iteration after droping lock */
-		spin_lock_irqsave(&timelines->lock, flags);
+		spin_lock(&timelines->lock);
 		tl = list_entry(&timelines->active_list, typeof(*tl), link);
 	}
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	/* We must reset pending GPU events before restoring our submission */
 	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index b35f12729983..b190a5d9ab02 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -332,7 +332,6 @@ int intel_timeline_pin(struct intel_timeline *tl)
 void intel_timeline_enter(struct intel_timeline *tl)
 {
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
-	unsigned long flags;
 
 	/*
 	 * Pretend we are serialised by the timeline->mutex.
@@ -358,16 +357,15 @@ void intel_timeline_enter(struct intel_timeline *tl)
 	if (atomic_add_unless(&tl->active_count, 1, 0))
 		return;
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	if (!atomic_fetch_inc(&tl->active_count))
 		list_add_tail(&tl->link, &timelines->active_list);
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 }
 
 void intel_timeline_exit(struct intel_timeline *tl)
 {
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
-	unsigned long flags;
 
 	/* See intel_timeline_enter() */
 	lockdep_assert_held(&tl->mutex);
@@ -376,10 +374,10 @@ void intel_timeline_exit(struct intel_timeline *tl)
 	if (atomic_add_unless(&tl->active_count, -1, 1))
 		return;
 
-	spin_lock_irqsave(&timelines->lock, flags);
+	spin_lock(&timelines->lock);
 	if (atomic_dec_and_test(&tl->active_count))
 		list_del(&tl->link);
-	spin_unlock_irqrestore(&timelines->lock, flags);
+	spin_unlock(&timelines->lock);
 
 	/*
 	 * Since this timeline is idle, all bariers upon which we were waiting
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

Bonded request submission is designed to allow requests to execute in
parallel as laid out by the user. If the master request is already
finished before its bonded pair is submitted, the pair was not destined
to run in parallel and we lose the master-engine information needed to
dictate selection of the secondary. If the second request was required
to run on a particular engine in a virtual set, that should have been
specified explicitly, rather than left to the whims of a random,
unconnected request!

In the selftest, I made the mistake of not ensuring the master would
overlap with its bonded pairs, meaning that it could indeed complete
before we submitted the bonds. Those bonds were then free to select any
available engine in their virtual set, and not the one expected by the
test.
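
Roughly, the fixed flow keeps the master spinning on the GPU until the bonds
have been constructed (sketch only: error handling and the BOND_SCHEDULE case
are elided, helper names as used in the patch below):

	rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
	i915_request_add(rq[0]);
	if (!igt_wait_for_spinner(&spin, rq[0])) /* master must be on the GPU */
		return -EIO;
	/* ... create and submit the bonded siblings while the master spins ... */
	igt_spinner_end(&spin); /* release the master and let everything complete */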

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 16ebe4d2308e..f3b0610d1f10 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -3036,15 +3036,21 @@ static int bond_virtual_engine(struct intel_gt *gt,
 	struct i915_gem_context *ctx;
 	struct i915_request *rq[16];
 	enum intel_engine_id id;
+	struct igt_spinner spin;
 	unsigned long n;
 	int err;
 
 	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
 
-	ctx = kernel_context(gt->i915);
-	if (!ctx)
+	if (igt_spinner_init(&spin, gt))
 		return -ENOMEM;
 
+	ctx = kernel_context(gt->i915);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto err_spin;
+	}
+
 	err = 0;
 	rq[0] = ERR_PTR(-ENOMEM);
 	for_each_engine(master, gt, id) {
@@ -3055,7 +3061,7 @@ static int bond_virtual_engine(struct intel_gt *gt,
 
 		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
 
-		rq[0] = igt_request_alloc(ctx, master);
+		rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
 		if (IS_ERR(rq[0])) {
 			err = PTR_ERR(rq[0]);
 			goto out;
@@ -3068,10 +3074,17 @@ static int bond_virtual_engine(struct intel_gt *gt,
 							       &fence,
 							       GFP_KERNEL);
 		}
+
 		i915_request_add(rq[0]);
 		if (err < 0)
 			goto out;
 
+		if (!(flags & BOND_SCHEDULE) &&
+		    !igt_wait_for_spinner(&spin, rq[0])) {
+			err = -EIO;
+			goto out;
+		}
+
 		for (n = 0; n < nsibling; n++) {
 			struct intel_context *ve;
 
@@ -3119,6 +3132,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 			}
 		}
 		onstack_fence_fini(&fence);
+		intel_engine_flush_submission(master);
+		igt_spinner_end(&spin);
 
 		if (i915_request_wait(rq[0], 0, HZ / 10) < 0) {
 			pr_err("Master request did not execute (on %s)!\n",
@@ -3156,6 +3171,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 		err = -EIO;
 
 	kernel_context_close(ctx);
+err_spin:
+	igt_spinner_fini(&spin);
 	return err;
 }
 
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [Intel-gfx] [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20  9:32   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:32 UTC (permalink / raw)
  To: intel-gfx

Bonded request submission is designed to allow requests to execute in
parallel as laid out by the user. If the master request is already
finished before its bonded pair is submitted, the pair was not destined
to run in parallel and we lose the master-engine information needed to
dictate selection of the secondary. If the second request was required
to run on a particular engine in a virtual set, that should have been
specified explicitly, rather than left to the whims of a random,
unconnected request!

In the selftest, I made the mistake of not ensuring the master would
overlap with its bonded pairs, meaning that it could indeed complete
before we submitted the bonds. Those bonds were then free to select any
available engine in their virtual set, and not the one expected by the
test.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 16ebe4d2308e..f3b0610d1f10 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -3036,15 +3036,21 @@ static int bond_virtual_engine(struct intel_gt *gt,
 	struct i915_gem_context *ctx;
 	struct i915_request *rq[16];
 	enum intel_engine_id id;
+	struct igt_spinner spin;
 	unsigned long n;
 	int err;
 
 	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
 
-	ctx = kernel_context(gt->i915);
-	if (!ctx)
+	if (igt_spinner_init(&spin, gt))
 		return -ENOMEM;
 
+	ctx = kernel_context(gt->i915);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto err_spin;
+	}
+
 	err = 0;
 	rq[0] = ERR_PTR(-ENOMEM);
 	for_each_engine(master, gt, id) {
@@ -3055,7 +3061,7 @@ static int bond_virtual_engine(struct intel_gt *gt,
 
 		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
 
-		rq[0] = igt_request_alloc(ctx, master);
+		rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
 		if (IS_ERR(rq[0])) {
 			err = PTR_ERR(rq[0]);
 			goto out;
@@ -3068,10 +3074,17 @@ static int bond_virtual_engine(struct intel_gt *gt,
 							       &fence,
 							       GFP_KERNEL);
 		}
+
 		i915_request_add(rq[0]);
 		if (err < 0)
 			goto out;
 
+		if (!(flags & BOND_SCHEDULE) &&
+		    !igt_wait_for_spinner(&spin, rq[0])) {
+			err = -EIO;
+			goto out;
+		}
+
 		for (n = 0; n < nsibling; n++) {
 			struct intel_context *ve;
 
@@ -3119,6 +3132,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 			}
 		}
 		onstack_fence_fini(&fence);
+		intel_engine_flush_submission(master);
+		igt_spinner_end(&spin);
 
 		if (i915_request_wait(rq[0], 0, HZ / 10) < 0) {
 			pr_err("Master request did not execute (on %s)!\n",
@@ -3156,6 +3171,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 		err = -EIO;
 
 	kernel_context_close(ctx);
+err_spin:
+	igt_spinner_fini(&spin);
 	return err;
 }
 
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [PATCH 7/9] drm/i915/selftests: Flush the active callbacks
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx

Before checking the current i915_active state for the asynchronous work
we submitted, flush any ongoing callback. This ensures that our sampling
is robust and does not sporadically fail due to bad timing as the work
is running on another cpu.
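
The flush itself is just a lock/unlock barrier against the lock the callback
runs under, plus draining the engine pm worker before sampling (these lines
are taken from the patch below, shown here only to spell out the idiom):

	/* any retirement callback running under rq->lock on another cpu is done */
	spin_lock_irq(&rq->lock);
	spin_unlock_irq(&rq->lock);

	/* likewise flush the engine pm callbacks before checking the wakeref */
	intel_engine_pm_flush(engine);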

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/selftest_context.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
index 3586af636304..939798338242 100644
--- a/drivers/gpu/drm/i915/gt/selftest_context.c
+++ b/drivers/gpu/drm/i915/gt/selftest_context.c
@@ -48,20 +48,25 @@ static int context_sync(struct intel_context *ce)
 
 	mutex_lock(&tl->mutex);
 	do {
-		struct dma_fence *fence;
+		struct i915_request *rq;
 		long timeout;
 
-		fence = i915_active_fence_get(&tl->last_request);
-		if (!fence)
+		if (list_empty(&tl->requests))
 			break;
 
-		timeout = dma_fence_wait_timeout(fence, false, HZ / 10);
+		rq = list_last_entry(&tl->requests, typeof(*rq), link);
+		i915_request_get(rq);
+
+		timeout = i915_request_wait(rq, 0, HZ / 10);
 		if (timeout < 0)
 			err = timeout;
 		else
-			i915_request_retire_upto(to_request(fence));
+			i915_request_retire_upto(rq);
 
-		dma_fence_put(fence);
+		spin_lock_irq(&rq->lock);
+		spin_unlock_irq(&rq->lock);
+
+		i915_request_put(rq);
 	} while (!err);
 	mutex_unlock(&tl->mutex);
 
@@ -282,6 +287,8 @@ static int __live_active_context(struct intel_engine_cs *engine,
 		err = -EINVAL;
 	}
 
+	intel_engine_pm_flush(engine);
+
 	if (intel_engine_pm_is_awake(engine)) {
 		struct drm_printer p = drm_debug_printer(__func__);
 
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [Intel-gfx] [PATCH 7/9] drm/i915/selftests: Flush the active callbacks
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx

Before checking the current i915_active state for the asynchronous work
we submitted, flush any ongoing callback. This ensures that our sampling
is robust and does not sporadically fail due to bad timing as the work
is running on another cpu.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/selftest_context.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
index 3586af636304..939798338242 100644
--- a/drivers/gpu/drm/i915/gt/selftest_context.c
+++ b/drivers/gpu/drm/i915/gt/selftest_context.c
@@ -48,20 +48,25 @@ static int context_sync(struct intel_context *ce)
 
 	mutex_lock(&tl->mutex);
 	do {
-		struct dma_fence *fence;
+		struct i915_request *rq;
 		long timeout;
 
-		fence = i915_active_fence_get(&tl->last_request);
-		if (!fence)
+		if (list_empty(&tl->requests))
 			break;
 
-		timeout = dma_fence_wait_timeout(fence, false, HZ / 10);
+		rq = list_last_entry(&tl->requests, typeof(*rq), link);
+		i915_request_get(rq);
+
+		timeout = i915_request_wait(rq, 0, HZ / 10);
 		if (timeout < 0)
 			err = timeout;
 		else
-			i915_request_retire_upto(to_request(fence));
+			i915_request_retire_upto(rq);
 
-		dma_fence_put(fence);
+		spin_lock_irq(&rq->lock);
+		spin_unlock_irq(&rq->lock);
+
+		i915_request_put(rq);
 	} while (!err);
 	mutex_unlock(&tl->mutex);
 
@@ -282,6 +287,8 @@ static int __live_active_context(struct intel_engine_cs *engine,
 		err = -EINVAL;
 	}
 
+	intel_engine_pm_flush(engine);
+
 	if (intel_engine_pm_is_awake(engine)) {
 		struct drm_printer p = drm_debug_printer(__func__);
 
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [PATCH 8/9] drm/i915/selftests: Be explicit in ERR_PTR handling
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx; +Cc: Dan Carpenter

When setting up a full GGTT, we expect the next insert to fail with
-ENOSPC. Simplify the use of ERR_PTR to not confuse either the reader or
smatch.
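
That is, instead of testing IS_ERR() and PTR_ERR() separately, compare against
the one expected error pointer (as in the hunk below):

	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
	if (vma != ERR_PTR(-ENOSPC)) /* success or any other error is a failure */
		err = -EINVAL;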

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
References: f40a7b7558ef ("drm/i915: Initial selftests for exercising eviction")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 5f133d177212..06ef88510209 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -198,8 +198,8 @@ static int igt_overcommit(void *arg)
 	quirk_add(obj, &objects);
 
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
-	if (!IS_ERR(vma) || PTR_ERR(vma) != -ENOSPC) {
-		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR(vma));
+	if (vma != ERR_PTR(-ENOSPC)) {
+		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR_OR_ZERO(vma));
 		err = -EINVAL;
 		goto cleanup;
 	}
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [Intel-gfx] [PATCH 8/9] drm/i915/selftests: Be explicit in ERR_PTR handling
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx; +Cc: Dan Carpenter

When setting up a full GGTT, we expect the next insert to fail with
-ENOSPC. Simplify the use of ERR_PTR to not confuse either the reader or
smatch.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
References: f40a7b7558ef ("drm/i915: Initial selftests for exercising eviction")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 5f133d177212..06ef88510209 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -198,8 +198,8 @@ static int igt_overcommit(void *arg)
 	quirk_add(obj, &objects);
 
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
-	if (!IS_ERR(vma) || PTR_ERR(vma) != -ENOSPC) {
-		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR(vma));
+	if (vma != ERR_PTR(-ENOSPC)) {
+		pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR_OR_ZERO(vma));
 		err = -EINVAL;
 		goto cleanup;
 	}
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx

The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
corruption WA") is that it disables RC6 while Skylake (and friends) is
active, and we do not consider the GPU idle until all outstanding
requests have been retired and the engine switched over to the kernel
context. If userspace is idle, this task falls onto our background idle
worker, which only runs roughly once a second, meaning that userspace has
to have been idle for a couple of seconds before we enable RC6 again.
Naturally, this causes us to consume considerably more energy than
before as powersaving is effectively disabled while a display server
(here's looking at you Xorg) is running.

As execlists will get a completion event as each context is completed,
we can use this interrupt to queue a retire worker bound to this engine
to clean up idle timelines. We will then notice the idle engine immediately
(without userspace intervention or the aid of the background retire worker)
and start parking the GPU. Thus during light workloads, we will do much
more work per completion in order to idle the GPU faster... Hopefully with
a commensurate power saving!
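
The hook in the execlists completion path is tiny: when the just-completed
request is the last on its timeline, the timeline is handed to the engine's
retire worker (as in the patch below):

	if (list_is_last(&rq->link, &ce->timeline->requests) &&
	    i915_request_completed(rq))
		intel_engine_add_retire(engine, ce->timeline);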

v2: Watch context completions and only look at those local to the engine
when retiring to reduce the amount of excess work we perform.

References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
 drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
 .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
 7 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index b9613d044393..8f6e353caa66 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -28,13 +28,13 @@
 
 #include "i915_drv.h"
 
-#include "gt/intel_gt.h"
-
+#include "intel_context.h"
 #include "intel_engine.h"
 #include "intel_engine_pm.h"
 #include "intel_engine_pool.h"
 #include "intel_engine_user.h"
-#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc.h"
 #include "intel_reset.h"
 #include "intel_ring.h"
@@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
 	intel_engine_init_execlists(engine);
 	intel_engine_init_cmd_parser(engine);
 	intel_engine_init__pm(engine);
+	intel_engine_init_retire(engine);
 
 	intel_engine_pool_init(&engine->pool);
 
@@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 
 	cleanup_status_page(engine);
 
+	intel_engine_fini_retire(engine);
 	intel_engine_pool_fini(&engine->pool);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 758f0e8ec672..17f1f1441efc 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -451,6 +451,14 @@ struct intel_engine_cs {
 
 	struct intel_engine_execlists execlists;
 
+	/*
+	 * Keep track of completed timelines on this engine for early
+	 * retirement with the goal of quickly enabling powersaving as
+	 * soon as the engine is idle.
+	 */
+	struct intel_timeline *retire;
+	struct work_struct retire_work;
+
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 4dc3cbeb1b36..4a98fefdf915 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
 		intel_engine_flush_submission(engine);
 }
 
+static void engine_retire(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), retire_work);
+	struct intel_timeline *tl = xchg(&engine->retire, NULL);
+
+	do {
+		struct intel_timeline *next = xchg(&tl->retire, NULL);
+
+		/*
+		 * Our goal here is to retire _idle_ timelines as soon as
+		 * possible (as they are idle, we do not expect userspace
+		 * to be cleaning up anytime soon).
+		 *
+		 * If the tl->active_count is already zero, someone else
+		 * should have retired the timeline. Equally if the timeline
+		 * is currently locked, either it is being retired elsewhere
+		 * or about to be!
+		 */
+		if (atomic_read(&tl->active_count) &&
+		    mutex_trylock(&tl->mutex)) {
+			retire_requests(tl);
+			mutex_unlock(&tl->mutex);
+		}
+		intel_timeline_put(tl);
+
+		GEM_BUG_ON(!next);
+		tl = ptr_mask_bits(next, 1);
+	} while (tl);
+}
+
+static bool add_retire(struct intel_engine_cs *engine,
+		       struct intel_timeline *tl)
+{
+	struct intel_timeline *first = READ_ONCE(engine->retire);
+
+	/*
+	 * We open-code a llist here to include the additional tag [BIT(0)]
+	 * so that we know when the timeline is already on a
+	 * retirement queue: either this engine or another.
+	 *
+	 * However, we rely on that a timeline can only be active on a single
+	 * engine at any one time and that add_retire() is called before the
+	 * engine releases the timeline and transferred to another to retire.
+	 */
+
+	if (READ_ONCE(tl->retire)) /* already queued */
+		return false;
+
+	intel_timeline_get(tl);
+	do
+		tl->retire = ptr_pack_bits(first, 1, 1);
+	while (!try_cmpxchg(&engine->retire, &first, tl));
+
+	return !first;
+}
+
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl)
+{
+	if (add_retire(engine, tl))
+		schedule_work(&engine->retire_work);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine)
+{
+	INIT_WORK(&engine->retire_work, engine_retire);
+}
+
+void intel_engine_fini_retire(struct intel_engine_cs *engine)
+{
+	flush_work(&engine->retire_work);
+}
+
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
index fde546424c63..8de559b5a033 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
@@ -7,7 +7,12 @@
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
-struct intel_gt;
+#include <linux/workqueue.h>
+
+#include "intel_gt_types.h"
+
+struct intel_engine_cs;
+struct intel_timeline;
 
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
 static inline void intel_gt_retire_requests(struct intel_gt *gt)
@@ -15,6 +20,16 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
 	intel_gt_retire_requests_timeout(gt, 0);
 }
 
+static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
+{
+	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine);
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl);
+void intel_engine_fini_retire(struct intel_engine_cs *engine);
+
 int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
 
 void intel_gt_init_requests(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index b65bc06855b0..2ceaa2f22996 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -142,6 +142,7 @@
 #include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_reset.h"
@@ -1170,6 +1171,14 @@ __execlists_schedule_out(struct i915_request *rq,
 	 * refrain from doing non-trivial work here.
 	 */
 
+	/*
+	 * If we have just completed this context, the engine may now be
+	 * idle and we want to re-enter powersaving.
+	 */
+	if (list_is_last(&rq->link, &ce->timeline->requests) &&
+	    i915_request_completed(rq))
+		intel_engine_add_retire(engine, ce->timeline);
+
 	intel_engine_context_out(engine);
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
 	intel_gt_pm_put_async(engine->gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index b190a5d9ab02..c1d2419444f8 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -277,6 +277,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
 {
 	GEM_BUG_ON(atomic_read(&timeline->pin_count));
 	GEM_BUG_ON(!list_empty(&timeline->requests));
+	GEM_BUG_ON(timeline->retire);
 
 	if (timeline->hwsp_cacheline)
 		cacheline_free(timeline->hwsp_cacheline);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 5244615ed1cb..aaf15cbe1ce1 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
@@ -66,6 +66,9 @@ struct intel_timeline {
 	 */
 	struct i915_active_fence last_request;
 
+	/** A chain of completed timelines ready for early retirement. */
+	struct intel_timeline *retire;
+
 	/**
 	 * We track the most recent seqno that we wait on in every context so
 	 * that we only have to emit a new await and dependency on a more
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* [Intel-gfx] [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20  9:33   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  9:33 UTC (permalink / raw)
  To: intel-gfx

The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
corruption WA") is that it disables RC6 while Skylake (and friends) is
active, and we do not consider the GPU idle until all outstanding
requests have been retired and the engine switched over to the kernel
context. If userspace is idle, this task falls onto our background idle
worker, which only runs roughly once a second, meaning that userspace has
to have been idle for a couple of seconds before we enable RC6 again.
Naturally, this causes us to consume considerably more energy than
before as powersaving is effectively disabled while a display server
(here's looking at you Xorg) is running.

As execlists will get a completion event as each context is completed,
we can use this interrupt to queue a retire worker bound to this engine
to clean up idle timelines. We will then notice the idle engine immediately
(without userspace intervention or the aid of the background retire worker)
and start parking the GPU. Thus during light workloads, we will do much
more work per completion in order to idle the GPU faster... Hopefully with
a commensurate power saving!

v2: Watch context completions and only look at those local to the engine
when retiring to reduce the amount of excess work we perform.

References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
 drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
 drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
 .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
 7 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index b9613d044393..8f6e353caa66 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -28,13 +28,13 @@
 
 #include "i915_drv.h"
 
-#include "gt/intel_gt.h"
-
+#include "intel_context.h"
 #include "intel_engine.h"
 #include "intel_engine_pm.h"
 #include "intel_engine_pool.h"
 #include "intel_engine_user.h"
-#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc.h"
 #include "intel_reset.h"
 #include "intel_ring.h"
@@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
 	intel_engine_init_execlists(engine);
 	intel_engine_init_cmd_parser(engine);
 	intel_engine_init__pm(engine);
+	intel_engine_init_retire(engine);
 
 	intel_engine_pool_init(&engine->pool);
 
@@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 
 	cleanup_status_page(engine);
 
+	intel_engine_fini_retire(engine);
 	intel_engine_pool_fini(&engine->pool);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 758f0e8ec672..17f1f1441efc 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -451,6 +451,14 @@ struct intel_engine_cs {
 
 	struct intel_engine_execlists execlists;
 
+	/*
+	 * Keep track of completed timelines on this engine for early
+	 * retirement with the goal of quickly enabling powersaving as
+	 * soon as the engine is idle.
+	 */
+	struct intel_timeline *retire;
+	struct work_struct retire_work;
+
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 4dc3cbeb1b36..4a98fefdf915 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
 		intel_engine_flush_submission(engine);
 }
 
+static void engine_retire(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), retire_work);
+	struct intel_timeline *tl = xchg(&engine->retire, NULL);
+
+	do {
+		struct intel_timeline *next = xchg(&tl->retire, NULL);
+
+		/*
+		 * Our goal here is to retire _idle_ timelines as soon as
+		 * possible (as they are idle, we do not expect userspace
+		 * to be cleaning up anytime soon).
+		 *
+		 * If the tl->active_count is already zero, someone else
+		 * should have retired the timeline. Equally if the timeline
+		 * is currently locked, either it is being retired elsewhere
+		 * or about to be!
+		 */
+		if (atomic_read(&tl->active_count) &&
+		    mutex_trylock(&tl->mutex)) {
+			retire_requests(tl);
+			mutex_unlock(&tl->mutex);
+		}
+		intel_timeline_put(tl);
+
+		GEM_BUG_ON(!next);
+		tl = ptr_mask_bits(next, 1);
+	} while (tl);
+}
+
+static bool add_retire(struct intel_engine_cs *engine,
+		       struct intel_timeline *tl)
+{
+	struct intel_timeline *first = READ_ONCE(engine->retire);
+
+	/*
+	 * We open-code a llist here to include the additional tag [BIT(0)]
+	 * so that we know when the timeline is already on a
+	 * retirement queue: either this engine or another.
+	 *
+	 * However, we rely on that a timeline can only be active on a single
+	 * engine at any one time and that add_retire() is called before the
+	 * engine releases the timeline and transferred to another to retire.
+	 */
+
+	if (READ_ONCE(tl->retire)) /* already queued */
+		return false;
+
+	intel_timeline_get(tl);
+	do
+		tl->retire = ptr_pack_bits(first, 1, 1);
+	while (!try_cmpxchg(&engine->retire, &first, tl));
+
+	return !first;
+}
+
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl)
+{
+	if (add_retire(engine, tl))
+		schedule_work(&engine->retire_work);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine)
+{
+	INIT_WORK(&engine->retire_work, engine_retire);
+}
+
+void intel_engine_fini_retire(struct intel_engine_cs *engine)
+{
+	flush_work(&engine->retire_work);
+}
+
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
index fde546424c63..8de559b5a033 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
@@ -7,7 +7,12 @@
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
-struct intel_gt;
+#include <linux/workqueue.h>
+
+#include "intel_gt_types.h"
+
+struct intel_engine_cs;
+struct intel_timeline;
 
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
 static inline void intel_gt_retire_requests(struct intel_gt *gt)
@@ -15,6 +20,16 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
 	intel_gt_retire_requests_timeout(gt, 0);
 }
 
+static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
+{
+	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine);
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl);
+void intel_engine_fini_retire(struct intel_engine_cs *engine);
+
 int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
 
 void intel_gt_init_requests(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index b65bc06855b0..2ceaa2f22996 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -142,6 +142,7 @@
 #include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_reset.h"
@@ -1170,6 +1171,14 @@ __execlists_schedule_out(struct i915_request *rq,
 	 * refrain from doing non-trivial work here.
 	 */
 
+	/*
+	 * If we have just completed this context, the engine may now be
+	 * idle and we want to re-enter powersaving.
+	 */
+	if (list_is_last(&rq->link, &ce->timeline->requests) &&
+	    i915_request_completed(rq))
+		intel_engine_add_retire(engine, ce->timeline);
+
 	intel_engine_context_out(engine);
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
 	intel_gt_pm_put_async(engine->gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index b190a5d9ab02..c1d2419444f8 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -277,6 +277,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
 {
 	GEM_BUG_ON(atomic_read(&timeline->pin_count));
 	GEM_BUG_ON(!list_empty(&timeline->requests));
+	GEM_BUG_ON(timeline->retire);
 
 	if (timeline->hwsp_cacheline)
 		cacheline_free(timeline->hwsp_cacheline);
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 5244615ed1cb..aaf15cbe1ce1 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
@@ -66,6 +66,9 @@ struct intel_timeline {
 	 */
 	struct i915_active_fence last_request;
 
+	/** A chain of completed timelines ready for early retirement. */
+	struct intel_timeline *retire;
+
 	/**
 	 * We track the most recent seqno that we wait on in every context so
 	 * that we only have to emit a new await and dependency on a more
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* Re: [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20 10:19   ` Matthew Auld
  0 siblings, 0 replies; 73+ messages in thread
From: Matthew Auld @ 2019-11-20 10:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development

On Wed, 20 Nov 2019 at 09:33, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> i915_request_add() consumes the passed in reference to the i915_request,
> so if the selftest caller wishes to wait upon it afterwards, it needs to
> take a reference for itself.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  .../drm/i915/gem/selftests/i915_gem_context.c | 38 ++++++++++++++-----
>  1 file changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> index 9a509c18b7c7..16df814f3efd 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> @@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
>         }
>
>         for_each_uabi_engine(engine, i915) {
> -               struct i915_request *rq;
> +               struct i915_request *rq = NULL;
>                 unsigned long end_time, prime;
>                 ktime_t times[2] = {};
>
>                 times[0] = ktime_get_raw();
>                 for (n = 0; n < nctx; n++) {
> -                       rq = igt_request_alloc(ctx[n], engine);
> -                       if (IS_ERR(rq)) {
> -                               err = PTR_ERR(rq);
> +                       struct i915_request *this;
> +
> +                       this = igt_request_alloc(ctx[n], engine);
> +                       if (IS_ERR(this)) {
> +                               err = PTR_ERR(this);
>                                 goto out_file;
>                         }
> -                       i915_request_add(rq);
> +                       if (rq) {
> +                               i915_request_await_dma_fence(this, &rq->fence);
> +                               i915_request_put(rq);
> +                       }
> +                       rq = i915_request_get(this);
> +                       i915_request_add(this);
>                 }
>                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
>                         pr_err("Failed to populated %d contexts\n", nctx);
>                         intel_gt_set_wedged(&i915->gt);
> +                       i915_request_put(rq);
>                         err = -EIO;
>                         goto out_file;
>                 }
> +               i915_request_put(rq);
>
>                 times[1] = ktime_get_raw();
>
> @@ -106,13 +115,21 @@ static int live_nop_switch(void *arg)
>                 for_each_prime_number_from(prime, 2, 8192) {
>                         times[1] = ktime_get_raw();
>
> +                       rq = NULL;
>                         for (n = 0; n < prime; n++) {
> -                               rq = igt_request_alloc(ctx[n % nctx], engine);
> -                               if (IS_ERR(rq)) {
> -                                       err = PTR_ERR(rq);
> +                               struct i915_request *this;
> +
> +                               this = igt_request_alloc(ctx[n % nctx], engine);
> +                               if (IS_ERR(this)) {
> +                                       err = PTR_ERR(this);
>                                         goto out_file;
>                                 }
>
> +                               if (this) { /* Force submission order */

if (rq) ?

> +                                       i915_request_await_dma_fence(this, &rq->fence);
> +                                       i915_request_put(rq);
> +                               }
> +
>                                 /*
>                                  * This space is left intentionally blank.
>                                  *
> @@ -127,14 +144,17 @@ static int live_nop_switch(void *arg)
>                                  * for latency.
>                                  */
>
> -                               i915_request_add(rq);

 rq = i915_request_get(this) ?
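
i.e. presumably the intended inner loop, with both fixes applied, looks
something like (untested sketch):

	if (rq) { /* Force submission order */
		i915_request_await_dma_fence(this, &rq->fence);
		i915_request_put(rq);
	}
	rq = i915_request_get(this);
	i915_request_add(this);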
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20 10:19   ` Matthew Auld
  0 siblings, 0 replies; 73+ messages in thread
From: Matthew Auld @ 2019-11-20 10:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development

On Wed, 20 Nov 2019 at 09:33, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> i915_request_add() consumes the passed in reference to the i915_request,
> so if the selftest caller wishes to wait upon it afterwards, it needs to
> take a reference for itself.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  .../drm/i915/gem/selftests/i915_gem_context.c | 38 ++++++++++++++-----
>  1 file changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> index 9a509c18b7c7..16df814f3efd 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> @@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
>         }
>
>         for_each_uabi_engine(engine, i915) {
> -               struct i915_request *rq;
> +               struct i915_request *rq = NULL;
>                 unsigned long end_time, prime;
>                 ktime_t times[2] = {};
>
>                 times[0] = ktime_get_raw();
>                 for (n = 0; n < nctx; n++) {
> -                       rq = igt_request_alloc(ctx[n], engine);
> -                       if (IS_ERR(rq)) {
> -                               err = PTR_ERR(rq);
> +                       struct i915_request *this;
> +
> +                       this = igt_request_alloc(ctx[n], engine);
> +                       if (IS_ERR(this)) {
> +                               err = PTR_ERR(this);
>                                 goto out_file;
>                         }
> -                       i915_request_add(rq);
> +                       if (rq) {
> +                               i915_request_await_dma_fence(this, &rq->fence);
> +                               i915_request_put(rq);
> +                       }
> +                       rq = i915_request_get(this);
> +                       i915_request_add(this);
>                 }
>                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
>                         pr_err("Failed to populated %d contexts\n", nctx);
>                         intel_gt_set_wedged(&i915->gt);
> +                       i915_request_put(rq);
>                         err = -EIO;
>                         goto out_file;
>                 }
> +               i915_request_put(rq);
>
>                 times[1] = ktime_get_raw();
>
> @@ -106,13 +115,21 @@ static int live_nop_switch(void *arg)
>                 for_each_prime_number_from(prime, 2, 8192) {
>                         times[1] = ktime_get_raw();
>
> +                       rq = NULL;
>                         for (n = 0; n < prime; n++) {
> -                               rq = igt_request_alloc(ctx[n % nctx], engine);
> -                               if (IS_ERR(rq)) {
> -                                       err = PTR_ERR(rq);
> +                               struct i915_request *this;
> +
> +                               this = igt_request_alloc(ctx[n % nctx], engine);
> +                               if (IS_ERR(this)) {
> +                                       err = PTR_ERR(this);
>                                         goto out_file;
>                                 }
>
> +                               if (this) { /* Force submission order */

if (rq) ?

> +                                       i915_request_await_dma_fence(this, &rq->fence);
> +                                       i915_request_put(rq);
> +                               }
> +
>                                 /*
>                                  * This space is left intentionally blank.
>                                  *
> @@ -127,14 +144,17 @@ static int live_nop_switch(void *arg)
>                                  * for latency.
>                                  */
>
> -                               i915_request_add(rq);

 rq = i915_request_get(this) ?
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 8/9] drm/i915/selftests: Be explicit in ERR_PTR handling
@ 2019-11-20 10:23     ` Matthew Auld
  0 siblings, 0 replies; 73+ messages in thread
From: Matthew Auld @ 2019-11-20 10:23 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development, Dan Carpenter

On Wed, 20 Nov 2019 at 09:33, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> When setting up a full GGTT, we expect the next insert to fail with
> -ENOSPC. Simplify the use of ERR_PTR to not confuse either the reader or
> smatch.
>
> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> References: f40a7b7558ef ("drm/i915: Initial selftests for exercising eviction")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 8/9] drm/i915/selftests: Be explicit in ERR_PTR handling
@ 2019-11-20 10:23     ` Matthew Auld
  0 siblings, 0 replies; 73+ messages in thread
From: Matthew Auld @ 2019-11-20 10:23 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development, Dan Carpenter

On Wed, 20 Nov 2019 at 09:33, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> When setting up a full GGTT, we expect the next insert to fail with
> -ENOSPC. Simplify the use of ERR_PTR to not confuse either the reader or
> smatch.
>
> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> References: f40a7b7558ef ("drm/i915: Initial selftests for exercising eviction")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20 10:25     ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 10:25 UTC (permalink / raw)
  To: Matthew Auld; +Cc: Intel Graphics Development

Quoting Matthew Auld (2019-11-20 10:19:56)
> On Wed, 20 Nov 2019 at 09:33, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >
> > i915_request_add() consumes the passed in reference to the i915_request,
> > so if the selftest caller wishes to wait upon it afterwards, it needs to
> > take a reference for itself.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  .../drm/i915/gem/selftests/i915_gem_context.c | 38 ++++++++++++++-----
> >  1 file changed, 29 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> > index 9a509c18b7c7..16df814f3efd 100644
> > --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> > @@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
> >         }
> >
> >         for_each_uabi_engine(engine, i915) {
> > -               struct i915_request *rq;
> > +               struct i915_request *rq = NULL;
> >                 unsigned long end_time, prime;
> >                 ktime_t times[2] = {};
> >
> >                 times[0] = ktime_get_raw();
> >                 for (n = 0; n < nctx; n++) {
> > -                       rq = igt_request_alloc(ctx[n], engine);
> > -                       if (IS_ERR(rq)) {
> > -                               err = PTR_ERR(rq);
> > +                       struct i915_request *this;
> > +
> > +                       this = igt_request_alloc(ctx[n], engine);
> > +                       if (IS_ERR(this)) {
> > +                               err = PTR_ERR(this);
> >                                 goto out_file;
> >                         }
> > -                       i915_request_add(rq);
> > +                       if (rq) {
> > +                               i915_request_await_dma_fence(this, &rq->fence);
> > +                               i915_request_put(rq);
> > +                       }
> > +                       rq = i915_request_get(this);
> > +                       i915_request_add(this);
> >                 }
> >                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> >                         pr_err("Failed to populated %d contexts\n", nctx);
> >                         intel_gt_set_wedged(&i915->gt);
> > +                       i915_request_put(rq);
> >                         err = -EIO;
> >                         goto out_file;
> >                 }
> > +               i915_request_put(rq);
> >
> >                 times[1] = ktime_get_raw();
> >
> > @@ -106,13 +115,21 @@ static int live_nop_switch(void *arg)
> >                 for_each_prime_number_from(prime, 2, 8192) {
> >                         times[1] = ktime_get_raw();
> >
> > +                       rq = NULL;
> >                         for (n = 0; n < prime; n++) {
> > -                               rq = igt_request_alloc(ctx[n % nctx], engine);
> > -                               if (IS_ERR(rq)) {
> > -                                       err = PTR_ERR(rq);
> > +                               struct i915_request *this;
> > +
> > +                               this = igt_request_alloc(ctx[n % nctx], engine);
> > +                               if (IS_ERR(this)) {
> > +                                       err = PTR_ERR(this);
> >                                         goto out_file;
> >                                 }
> >
> > +                               if (this) { /* Force submission order */
> 
> if (rq) ?

Yes. Still a distinct lack of coffee.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread
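
To restate Matthew's catch: the IS_ERR() check above has already filtered out allocation failures, so testing 'this' is not the intended guard; the test needs to be on 'rq', which is NULL only on the first iteration, when there is no previous request to order against. A condensed sketch of the corrected step, reusing the helpers from the diff above (not the literal selftest code):

	if (rq) {	/* NULL on the first pass: nothing to order against yet */
		i915_request_await_dma_fence(this, &rq->fence);
		i915_request_put(rq);		/* drop our ref to the previous request */
	}
	rq = i915_request_get(this);		/* keep a reference to wait upon later */
	i915_request_add(this);			/* consumes the allocation reference */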

* [PATCH] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20 10:27   ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 10:27 UTC (permalink / raw)
  To: intel-gfx; +Cc: Matthew Auld

i915_request_add() consumes the passed in reference to the i915_request,
so if the selftest caller wishes to wait upon it afterwards, it needs to
take a reference for itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
---
 .../drm/i915/gem/selftests/i915_gem_context.c | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 9a509c18b7c7..f1ce5f64b221 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
 	}
 
 	for_each_uabi_engine(engine, i915) {
-		struct i915_request *rq;
+		struct i915_request *rq = NULL;
 		unsigned long end_time, prime;
 		ktime_t times[2] = {};
 
 		times[0] = ktime_get_raw();
 		for (n = 0; n < nctx; n++) {
-			rq = igt_request_alloc(ctx[n], engine);
-			if (IS_ERR(rq)) {
-				err = PTR_ERR(rq);
+			struct i915_request *this;
+
+			this = igt_request_alloc(ctx[n], engine);
+			if (IS_ERR(this)) {
+				err = PTR_ERR(this);
 				goto out_file;
 			}
-			i915_request_add(rq);
+			if (rq) {
+				i915_request_await_dma_fence(this, &rq->fence);
+				i915_request_put(rq);
+			}
+			rq = i915_request_get(this);
+			i915_request_add(this);
 		}
 		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 			pr_err("Failed to populated %d contexts\n", nctx);
 			intel_gt_set_wedged(&i915->gt);
+			i915_request_put(rq);
 			err = -EIO;
 			goto out_file;
 		}
+		i915_request_put(rq);
 
 		times[1] = ktime_get_raw();
 
@@ -106,13 +115,21 @@ static int live_nop_switch(void *arg)
 		for_each_prime_number_from(prime, 2, 8192) {
 			times[1] = ktime_get_raw();
 
+			rq = NULL;
 			for (n = 0; n < prime; n++) {
-				rq = igt_request_alloc(ctx[n % nctx], engine);
-				if (IS_ERR(rq)) {
-					err = PTR_ERR(rq);
+				struct i915_request *this;
+
+				this = igt_request_alloc(ctx[n % nctx], engine);
+				if (IS_ERR(this)) {
+					err = PTR_ERR(this);
 					goto out_file;
 				}
 
+				if (rq) { /* Force submission order */
+					i915_request_await_dma_fence(this, &rq->fence);
+					i915_request_put(rq);
+				}
+
 				/*
 				 * This space is left intentionally blank.
 				 *
@@ -127,14 +144,18 @@ static int live_nop_switch(void *arg)
 				 * for latency.
 				 */
 
-				i915_request_add(rq);
+				rq = i915_request_get(this);
+				i915_request_add(this);
 			}
+			GEM_BUG_ON(!rq);
 			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 				pr_err("Switching between %ld contexts timed out\n",
 				       prime);
 				intel_gt_set_wedged(&i915->gt);
+				i915_request_put(rq);
 				break;
 			}
+			i915_request_put(rq);
 
 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 			if (prime == 2)
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread
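
Stripped of the loops, the ownership rule the patch enforces is small enough to restate as a sketch (ctx, engine, err and the HZ / 5 timeout as in the selftest above; illustrative, not the literal code):

	rq = igt_request_alloc(ctx, engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	i915_request_get(rq);		/* our own reference, for the wait below */
	i915_request_add(rq);		/* consumes the allocation reference */

	if (i915_request_wait(rq, 0, HZ / 5) < 0)
		err = -EIO;		/* rq is still safe to touch: we hold a reference */

	i915_request_put(rq);		/* drop the reference we took */

Without the extra reference, the wait would race against retirement dropping the last reference to the request.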

* Re: [PATCH] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20 10:34     ` Matthew Auld
  0 siblings, 0 replies; 73+ messages in thread
From: Matthew Auld @ 2019-11-20 10:34 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Intel Graphics Development, Matthew Auld

On Wed, 20 Nov 2019 at 10:28, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> i915_request_add() consumes the passed in reference to the i915_request,
> so if the selftest caller wishes to wait upon it afterwards, it needs to
> take a reference for itself.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 11:58     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 11:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 09:32, Chris Wilson wrote:
> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
> the backend"), I erroneously concluded that we last modify the engine
> inside __i915_request_commit() meaning that we could enable concurrent
> submission for userspace as we enqueued this request. However, this
> falls into a trap with other users of the engine->kernel_context waking
> up and submitting their request before the idle-switch is queued, with
> the result that the kernel_context is executed out-of-sequence most
> likely upsetting the GPU and certainly ourselves when we try to retire
> the out-of-sequence requests.
> 
> As such we need to hold onto the effective engine->kernel_context mutex
> lock (via the engine pm mutex proxy) until we have finished queuing the
> request to the engine.
> 
> v2: Serialise against concurrent intel_gt_retire_requests()
> v3: Describe the hairy locking scheme with intel_gt_retire_requests()
> for future reference.
> v4: Combine timeline->lock and engine pm release; it's hairy.
> 
> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
>   1 file changed, 40 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 3c0f490ff2c7..1f517357a268 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
>   
>   #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
>   
> +static void
> +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
> +				      struct intel_engine_cs *engine)
> +{
> +	struct intel_gt_timelines *timelines = &engine->gt->timelines;
> +
> +	spin_lock(&timelines->lock);
> +
> +	if (!atomic_fetch_inc(&tl->active_count))
> +		list_add_tail(&tl->link, &timelines->active_list);

Hmm, with this new part it maybe matches/answers my question from 
"drm/i915/gt: Close race between engine_park and 
intel_gt_retire_requests". I think so, at least, since it now adds a 
second place a timeline can enter the active_list.

But no, where does the intel_timeline_enter race come from? It can't be 
userspace submission, since that is blocked on wf->lock.

Regards,

Tvrtko

> +
> +	__intel_wakeref_defer_park(&engine->wakeref);
> +
> +	spin_unlock(&timelines->lock);
> +}
> +
>   static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   {
> +	struct intel_context *ce = engine->kernel_context;
>   	struct i915_request *rq;
>   	unsigned long flags;
>   	bool result = true;
> @@ -98,16 +115,31 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   	 * This should hold true as we can only park the engine after
>   	 * retiring the last request, thus all rings should be empty and
>   	 * all timelines idle.
> +	 *
> +	 * For unlocking, there are 2 other parties and the GPU who have a
> +	 * stake here.
> +	 *
> +	 * A new gpu user will be waiting on the engine-pm to start their
> +	 * engine_unpark. New waiters are predicated on engine->wakeref.count
> +	 * and so intel_wakeref_defer_park() acts like a mutex_unlock of the
> +	 * engine->wakeref.
> +	 *
> +	 * The other party is intel_gt_retire_requests(), which is walking the
> +	 * list of active timelines looking for completions. Meanwhile as soon
> +	 * as we call __i915_request_queue(), the GPU may complete our request.
> +	 * Ergo, if we put ourselves on the timelines.active_list
> +	 * (se intel_timeline_enter()) before we increment the
> +	 * engine->wakeref.count, we may see the request completion and retire
> +	 * it causing an undeflow of the engine->wakeref.
>   	 */
> -	flags = __timeline_mark_lock(engine->kernel_context);
> +	flags = __timeline_mark_lock(ce);
> +	GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0);
>   
> -	rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
> +	rq = __i915_request_create(ce, GFP_NOWAIT);
>   	if (IS_ERR(rq))
>   		/* Context switch failed, hope for the best! Maybe reset? */
>   		goto out_unlock;
>   
> -	intel_timeline_enter(i915_request_timeline(rq));
> -
>   	/* Check again on the next retirement. */
>   	engine->wakeref_serial = engine->serial + 1;
>   	i915_request_add_active_barriers(rq);
> @@ -116,13 +148,14 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
>   	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
>   	__i915_request_commit(rq);
>   
> -	/* Release our exclusive hold on the engine */
> -	__intel_wakeref_defer_park(&engine->wakeref);
>   	__i915_request_queue(rq, NULL);
>   
> +	/* Expose ourselves to intel_gt_retire_requests() and new submission */
> +	__intel_timeline_enter_and_pm_release(ce->timeline, engine);
> +
>   	result = false;
>   out_unlock:
> -	__timeline_mark_unlock(engine->kernel_context, flags);
> +	__timeline_mark_unlock(ce, flags);
>   	return result;
>   }
>   
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 12:07       ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 12:07 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 11:58:27)
> 
> On 20/11/2019 09:32, Chris Wilson wrote:
> > In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
> > the backend"), I erroneously concluded that we last modify the engine
> > inside __i915_request_commit() meaning that we could enable concurrent
> > submission for userspace as we enqueued this request. However, this
> > falls into a trap with other users of the engine->kernel_context waking
> > up and submitting their request before the idle-switch is queued, with
> > the result that the kernel_context is executed out-of-sequence most
> > likely upsetting the GPU and certainly ourselves when we try to retire
> > the out-of-sequence requests.
> > 
> > As such we need to hold onto the effective engine->kernel_context mutex
> > lock (via the engine pm mutex proxy) until we have finish queuing the
> > request to the engine.
> > 
> > v2: Serialise against concurrent intel_gt_retire_requests()
> > v3: Describe the hairy locking scheme with intel_gt_retire_requests()
> > for future reference.
> > v4: Combine timeline->lock and engine pm release; it's hairy.
> > 
> > Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
> >   1 file changed, 40 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > index 3c0f490ff2c7..1f517357a268 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
> >   
> >   #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
> >   
> > +static void
> > +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
> > +                                   struct intel_engine_cs *engine)
> > +{
> > +     struct intel_gt_timelines *timelines = &engine->gt->timelines;
> > +
> > +     spin_lock(&timelines->lock);
> > +
> > +     if (!atomic_fetch_inc(&tl->active_count))
> > +             list_add_tail(&tl->link, &timelines->active_list);
> 
> Hmm, with this new part it maybe matches/answers my question from 
> "drm/i915/gt: Close race between engine_park and 
> intel_gt_retire_requests". I think so, at least, since it now adds a 
> second place a timeline can enter the active_list.
> 
> But no, where does the intel_timeline_enter race come from? It can't be 
> userspace submission, since that is blocked on wf->lock.

The race is not just with intel_timeline_enter, but with
intel_gt_retire_requests.

As soon as we are on that list, we may be retired. If we are retired
before adjusting the engine->wakeref.count, we are b0rked.

As soon as we adjust the engine->wakeref.count, another submission may
call intel_timeline_enter, and again may even retire this request. The
enter itself is perfectly fine, but we need to serialise against the
retires.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread
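
The hazard described above can be written out as an interleaving (illustrative only, paraphrasing the comment added by the patch; tl, engine and timelines are the locals used there):

	/*
	 * If the timeline is exposed before the wakeref count is adjusted:
	 *
	 *   CPU0 (parking)                    CPU1 (intel_gt_retire_requests)
	 *   add tl to timelines->active_list
	 *                                      walks active_list, sees the
	 *                                      request already completed and
	 *                                      retires it -> engine->wakeref
	 *                                      underflows, as CPU0 has not yet
	 *                                      adjusted the count
	 *   __intel_wakeref_defer_park()
	 *
	 * Hence the patch performs both steps under timelines->lock:
	 */
	spin_lock(&timelines->lock);
	if (!atomic_fetch_inc(&tl->active_count))
		list_add_tail(&tl->link, &timelines->active_list);
	__intel_wakeref_defer_park(&engine->wakeref);
	spin_unlock(&timelines->lock);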

* Re: [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 12:40         ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 12:40 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 12:07, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 11:58:27)
>>
>> On 20/11/2019 09:32, Chris Wilson wrote:
>>> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
>>> the backend"), I erroneously concluded that we last modify the engine
>>> inside __i915_request_commit() meaning that we could enable concurrent
>>> submission for userspace as we enqueued this request. However, this
>>> falls into a trap with other users of the engine->kernel_context waking
>>> up and submitting their request before the idle-switch is queued, with
>>> the result that the kernel_context is executed out-of-sequence most
>>> likely upsetting the GPU and certainly ourselves when we try to retire
>>> the out-of-sequence requests.
>>>
>>> As such we need to hold onto the effective engine->kernel_context mutex
>>> lock (via the engine pm mutex proxy) until we have finished queuing the
>>> request to the engine.
>>>
>>> v2: Serialise against concurrent intel_gt_retire_requests()
>>> v3: Describe the hairy locking scheme with intel_gt_retire_requests()
>>> for future reference.
>>> v4: Combine timeline->lock and engine pm release; it's hairy.
>>>
>>> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
>>>    1 file changed, 40 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>> index 3c0f490ff2c7..1f517357a268 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>> @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
>>>    
>>>    #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
>>>    
>>> +static void
>>> +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
>>> +                                   struct intel_engine_cs *engine)
>>> +{
>>> +     struct intel_gt_timelines *timelines = &engine->gt->timelines;
>>> +
>>> +     spin_lock(&timelines->lock);
>>> +
>>> +     if (!atomic_fetch_inc(&tl->active_count))
>>> +             list_add_tail(&tl->link, &timelines->active_list);
>>
>> Hmm, with this new part it maybe matches/answers my question from
>> "drm/i915/gt: Close race between engine_park and
>> intel_gt_retire_requests". I think so, at least, since it now adds a
>> second place a timeline can enter the active_list.
>>
>> But no, where does the intel_timeline_enter race come from? It can't be
>> userspace submission, since that is blocked on wf->lock.
> 
> The race is not just with intel_timeline_enter, but with
> intel_gt_retire_requests.
> 
> As soon as we are on that list, we may be retired. If we are retired
> before adjusting the engine->wakeref.count, we are b0rked.
> 
> As soon as we adjust the engine->wakeref.count, another submission may
> call intel_timeline_enter, and again may even retire this request. The
> enter itself is perfectly fine, but we need to serialise against the
> retires.

I think the two patches combined work, I am just not sure the two 
atomic_fetch_inc()->list_add() sites are needed now that you re-ordered 
__i915_request_queue and __intel_wakeref_defer_park - that's the part 
which is confusing me. But it also doesn't do any harm...

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 12:44           ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 12:44 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 12:40:13)
> 
> On 20/11/2019 12:07, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 11:58:27)
> >>
> >> On 20/11/2019 09:32, Chris Wilson wrote:
> >>> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
> >>> the backend"), I erroneously concluded that we last modify the engine
> >>> inside __i915_request_commit() meaning that we could enable concurrent
> >>> submission for userspace as we enqueued this request. However, this
> >>> falls into a trap with other users of the engine->kernel_context waking
> >>> up and submitting their request before the idle-switch is queued, with
> >>> the result that the kernel_context is executed out-of-sequence most
> >>> likely upsetting the GPU and certainly ourselves when we try to retire
> >>> the out-of-sequence requests.
> >>>
> >>> As such we need to hold onto the effective engine->kernel_context mutex
> >>> lock (via the engine pm mutex proxy) until we have finished queuing the
> >>> request to the engine.
> >>>
> >>> v2: Serialise against concurrent intel_gt_retire_requests()
> >>> v3: Describe the hairy locking scheme with intel_gt_retire_requests()
> >>> for future reference.
> >>> v4: Combine timeline->lock and engine pm release; it's hairy.
> >>>
> >>> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> >>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>> ---
> >>>    drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
> >>>    1 file changed, 40 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> >>> index 3c0f490ff2c7..1f517357a268 100644
> >>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> >>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> >>> @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
> >>>    
> >>>    #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
> >>>    
> >>> +static void
> >>> +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
> >>> +                                   struct intel_engine_cs *engine)
> >>> +{
> >>> +     struct intel_gt_timelines *timelines = &engine->gt->timelines;
> >>> +
> >>> +     spin_lock(&timelines->lock);
> >>> +
> >>> +     if (!atomic_fetch_inc(&tl->active_count))
> >>> +             list_add_tail(&tl->link, &timelines->active_list);
> >>
> >> Hmm, with this new part it maybe matches/answers my question from
> >> "drm/i915/gt: Close race between engine_park and
> >> intel_gt_retire_requests". I think so, at least, since it now adds a
> >> second place a timeline can enter the active_list.
> >>
> >> But no, where does the intel_timeline_enter race come from? It can't be
> >> userspace submission, since that is blocked on wf->lock.
> > 
> > The race is not just with intel_timeline_enter, but with
> > intel_gt_retire_requests.
> > 
> > As soon as we are on that list, we may be retired. If we are retired
> > before adjusting the engine->wakeref.count, we are b0rked.
> > 
> > As soon as we adjust the engine->wakeref.count, another submission may
> > call intel_timeline_enter, and again may even retire this request. The
> > enter itself is perfectly fine, but we need to serialise against the
> > retires.
> 
> I think the two patches combined work, I am just not sure the two 
> atomic_fetch_inc()->list_add() sites are needed now that you re-ordered 
> __i915_request_queue and __intel_wakeref_defer_park - that's the part 
> which is confusing me. But it also doesn't do any harm...

I tried to get away without it, but the selftests hammer very heavily on
the engine->kernel_context, so we do encounter the scenarios where we are
using the kernel_context to park on one cpu while submitting a new
request on another.

I would have got away with it but for these pesky selftests.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 4/9] drm/i915: Mark up the calling context for intel_wakeref_put()
@ 2019-11-20 12:46     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 12:46 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 09:32, Chris Wilson wrote:
> Previously, we assumed we could use mutex_trylock() within an atomic
> context, falling back to a worker if contended. However, such trickery
> is illegal inside interrupt context, and so we need to always use a
> worker under such circumstances. As we normally are in process context,
> we can typically use a plain mutex, and only defer to a worker when we
> know we are called from an interrupt path.
> 
> Fixes: 51fbd8de87dc ("drm/i915/pmu: Atomically acquire the gt_pm wakeref")
> References: a0855d24fc22d ("locking/mutex: Complain upon mutex API misuse in IRQ contexts")
> References: https://bugs.freedesktop.org/show_bug.cgi?id=111626
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_pm.c    |  3 +-
>   drivers/gpu/drm/i915/gt/intel_engine_pm.h    | 10 +++++++
>   drivers/gpu/drm/i915/gt/intel_gt_pm.c        |  1 -
>   drivers/gpu/drm/i915/gt/intel_gt_pm.h        |  5 ++++
>   drivers/gpu/drm/i915/gt/intel_lrc.c          |  2 +-
>   drivers/gpu/drm/i915/gt/intel_reset.c        |  2 +-
>   drivers/gpu/drm/i915/gt/selftest_engine_pm.c |  7 +++--
>   drivers/gpu/drm/i915/i915_active.c           |  5 ++--
>   drivers/gpu/drm/i915/i915_pmu.c              |  6 ++--
>   drivers/gpu/drm/i915/intel_wakeref.c         |  8 +++---
>   drivers/gpu/drm/i915/intel_wakeref.h         | 30 ++++++++++++++------
>   11 files changed, 54 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 1f517357a268..f3035b3ab9fa 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -210,7 +210,8 @@ static int __engine_park(struct intel_wakeref *wf)
>   
>   	engine->execlists.no_priolist = false;
>   
> -	intel_gt_pm_put(engine->gt);
> +	/* While we call i915_vma_parked, we have to break the lock cycle */
> +	intel_gt_pm_put_async(engine->gt);
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
> index 739c50fefcef..24e20344dc22 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
> @@ -31,6 +31,16 @@ static inline void intel_engine_pm_put(struct intel_engine_cs *engine)
>   	intel_wakeref_put(&engine->wakeref);
>   }
>   
> +static inline void intel_engine_pm_put_async(struct intel_engine_cs *engine)
> +{
> +	intel_wakeref_put_async(&engine->wakeref);
> +}
> +
> +static inline void intel_engine_pm_flush(struct intel_engine_cs *engine)
> +{
> +	intel_wakeref_unlock_wait(&engine->wakeref);
> +}
> +
>   void intel_engine_init__pm(struct intel_engine_cs *engine);
>   
>   #endif /* INTEL_ENGINE_PM_H */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 470fbdc30e5a..f6b5169d623f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -105,7 +105,6 @@ static int __gt_park(struct intel_wakeref *wf)
>   static const struct intel_wakeref_ops wf_ops = {
>   	.get = __gt_unpark,
>   	.put = __gt_park,
> -	.flags = INTEL_WAKEREF_PUT_ASYNC,
>   };
>   
>   void intel_gt_pm_init_early(struct intel_gt *gt)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> index b3e17399be9b..990efc27a4e4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> @@ -32,6 +32,11 @@ static inline void intel_gt_pm_put(struct intel_gt *gt)
>   	intel_wakeref_put(&gt->wakeref);
>   }
>   
> +static inline void intel_gt_pm_put_async(struct intel_gt *gt)
> +{
> +	intel_wakeref_put_async(&gt->wakeref);
> +}
> +
>   static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>   {
>   	return intel_wakeref_wait_for_idle(&gt->wakeref);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 33ce258d484f..b65bc06855b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1172,7 +1172,7 @@ __execlists_schedule_out(struct i915_request *rq,
>   
>   	intel_engine_context_out(engine);
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> -	intel_gt_pm_put(engine->gt);
> +	intel_gt_pm_put_async(engine->gt);
>   
>   	/*
>   	 * If this is part of a virtual engine, its next request may
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index b7007cd78c6f..0388f9375366 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1125,7 +1125,7 @@ int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
>   out:
>   	intel_engine_cancel_stop_cs(engine);
>   	reset_finish_engine(engine);
> -	intel_engine_pm_put(engine);
> +	intel_engine_pm_put_async(engine);
>   	return ret;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> index 20b9c83f43ad..cbf6b0735272 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
> @@ -51,11 +51,12 @@ static int live_engine_pm(void *arg)
>   				pr_err("intel_engine_pm_get_if_awake(%s) failed under %s\n",
>   				       engine->name, p->name);
>   			else
> -				intel_engine_pm_put(engine);
> -			intel_engine_pm_put(engine);
> +				intel_engine_pm_put_async(engine);
> +			intel_engine_pm_put_async(engine);
>   			p->critical_section_end();
>   
> -			/* engine wakeref is sync (instant) */
> +			intel_engine_pm_flush(engine);
> +
>   			if (intel_engine_pm_is_awake(engine)) {
>   				pr_err("%s is still awake after flushing pm\n",
>   				       engine->name);
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index 5448f37c8102..dca15ace88f6 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -672,12 +672,13 @@ void i915_active_acquire_barrier(struct i915_active *ref)
>   	 * populated by i915_request_add_active_barriers() to point to the
>   	 * request that will eventually release them.
>   	 */
> -	spin_lock_irqsave_nested(&ref->tree_lock, flags, SINGLE_DEPTH_NESTING);
>   	llist_for_each_safe(pos, next, take_preallocated_barriers(ref)) {
>   		struct active_node *node = barrier_from_ll(pos);
>   		struct intel_engine_cs *engine = barrier_to_engine(node);
>   		struct rb_node **p, *parent;
>   
> +		spin_lock_irqsave_nested(&ref->tree_lock, flags,
> +					 SINGLE_DEPTH_NESTING);
>   		parent = NULL;
>   		p = &ref->tree.rb_node;
>   		while (*p) {
> @@ -693,12 +694,12 @@ void i915_active_acquire_barrier(struct i915_active *ref)
>   		}
>   		rb_link_node(&node->node, parent, p);
>   		rb_insert_color(&node->node, &ref->tree);
> +		spin_unlock_irqrestore(&ref->tree_lock, flags);
>   
>   		GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
>   		llist_add(barrier_to_ll(node), &engine->barrier_tasks);
>   		intel_engine_pm_put(engine);
>   	}
> -	spin_unlock_irqrestore(&ref->tree_lock, flags);
>   }
>   
>   void i915_request_add_active_barriers(struct i915_request *rq)
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 9b02be0ad4e6..95e824a78d4d 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -190,7 +190,7 @@ static u64 get_rc6(struct intel_gt *gt)
>   	val = 0;
>   	if (intel_gt_pm_get_if_awake(gt)) {
>   		val = __get_rc6(gt);
> -		intel_gt_pm_put(gt);
> +		intel_gt_pm_put_async(gt);
>   	}
>   
>   	spin_lock_irqsave(&pmu->lock, flags);
> @@ -360,7 +360,7 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
>   skip:
>   		if (unlikely(mmio_lock))
>   			spin_unlock_irqrestore(mmio_lock, flags);
> -		intel_engine_pm_put(engine);
> +		intel_engine_pm_put_async(engine);
>   	}
>   }
>   
> @@ -398,7 +398,7 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>   			if (stat)
>   				val = intel_get_cagf(rps, stat);
>   
> -			intel_gt_pm_put(gt);
> +			intel_gt_pm_put_async(gt);
>   		}
>   
>   		add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
> index ad26d7f4ca3d..59aa1b6f1827 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.c
> +++ b/drivers/gpu/drm/i915/intel_wakeref.c
> @@ -54,7 +54,8 @@ int __intel_wakeref_get_first(struct intel_wakeref *wf)
>   
>   static void ____intel_wakeref_put_last(struct intel_wakeref *wf)
>   {
> -	if (!atomic_dec_and_test(&wf->count))
> +	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
> +	if (unlikely(!atomic_dec_and_test(&wf->count)))
>   		goto unlock;
>   
>   	/* ops->put() must reschedule its own release on error/deferral */
> @@ -67,13 +68,12 @@ static void ____intel_wakeref_put_last(struct intel_wakeref *wf)
>   	mutex_unlock(&wf->mutex);
>   }
>   
> -void __intel_wakeref_put_last(struct intel_wakeref *wf)
> +void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags)
>   {
>   	INTEL_WAKEREF_BUG_ON(work_pending(&wf->work));
>   
>   	/* Assume we are not in process context and so cannot sleep. */
> -	if (wf->ops->flags & INTEL_WAKEREF_PUT_ASYNC ||
> -	    !mutex_trylock(&wf->mutex)) {
> +	if (flags & INTEL_WAKEREF_PUT_ASYNC || !mutex_trylock(&wf->mutex)) {
>   		schedule_work(&wf->work);
>   		return;
>   	}
> diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
> index affe4de3746b..da6e8fd506e6 100644
> --- a/drivers/gpu/drm/i915/intel_wakeref.h
> +++ b/drivers/gpu/drm/i915/intel_wakeref.h
> @@ -9,6 +9,7 @@
>   
>   #include <linux/atomic.h>
>   #include <linux/bits.h>
> +#include <linux/lockdep.h>
>   #include <linux/mutex.h>
>   #include <linux/refcount.h>
>   #include <linux/stackdepot.h>
> @@ -29,9 +30,6 @@ typedef depot_stack_handle_t intel_wakeref_t;
>   struct intel_wakeref_ops {
>   	int (*get)(struct intel_wakeref *wf);
>   	int (*put)(struct intel_wakeref *wf);
> -
> -	unsigned long flags;
> -#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
>   };
>   
>   struct intel_wakeref {
> @@ -57,7 +55,7 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
>   } while (0)
>   
>   int __intel_wakeref_get_first(struct intel_wakeref *wf);
> -void __intel_wakeref_put_last(struct intel_wakeref *wf);
> +void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags);
>   
>   /**
>    * intel_wakeref_get: Acquire the wakeref
> @@ -100,10 +98,9 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
>   }
>   
>   /**
> - * intel_wakeref_put: Release the wakeref
> - * @i915: the drm_i915_private device
> + * intel_wakeref_put_flags: Release the wakeref
>    * @wf: the wakeref
> - * @fn: callback for releasing the wakeref, called only on final release.
> + * @flags: control flags
>    *
>    * Release our hold on the wakeref. When there are no more users,
>    * the runtime pm wakeref will be released after the @fn callback is called
> @@ -116,11 +113,25 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
>    * code otherwise.
>    */
>   static inline void
> -intel_wakeref_put(struct intel_wakeref *wf)
> +__intel_wakeref_put(struct intel_wakeref *wf, unsigned long flags)
> +#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
>   {
>   	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
>   	if (unlikely(!atomic_add_unless(&wf->count, -1, 1)))
> -		__intel_wakeref_put_last(wf);
> +		__intel_wakeref_put_last(wf, flags);
> +}
> +
> +static inline void
> +intel_wakeref_put(struct intel_wakeref *wf)
> +{
> +	might_sleep();
> +	__intel_wakeref_put(wf, 0);
> +}
> +
> +static inline void
> +intel_wakeref_put_async(struct intel_wakeref *wf)
> +{
> +	__intel_wakeref_put(wf, INTEL_WAKEREF_PUT_ASYNC);
>   }
>   
>   /**
> @@ -185,6 +196,7 @@ intel_wakeref_is_active(const struct intel_wakeref *wf)
>   static inline void
>   __intel_wakeref_defer_park(struct intel_wakeref *wf)
>   {
> +	lockdep_assert_held(&wf->mutex);
>   	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count));
>   	atomic_set_release(&wf->count, 1);
>   }
> 
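
Roughly, the intended usage of the new entry points quoted above (my
summary, not part of the patch):

	/* process context: the final put may take wf->mutex and park inline */
	intel_engine_pm_put(engine);

	/* atomic/irq context: the final put defers parking to a worker */
	intel_engine_pm_put_async(engine);

	/* selftests: wait out a deferred park before checking idleness */
	intel_engine_pm_flush(engine);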

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20 12:55     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 12:55 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 09:32, Chris Wilson wrote:
> Bonded request submission is designed to allow requests to execute in
> parallel as laid out by the user. If the master request is already
> finished before its bonded pair is submitted, the pair were not destined
> to run in parallel and we lose the information about the master engine
> to dictate selection of the secondary. If the second request was
> required to be run on a particular engine in a virtual set, that should
> have been specified, rather than left to the whims of a random
> unconnected requests!
> 
> In the selftest, I made the mistake of not ensuring the master would
> overlap with its bonded pairs, meaning that it could indeed complete
> before we submitted the bonds. Those bonds were then free to select any
> available engine in their virtual set, and not the one expected by the
> test.

There is a submit await which ensures the master is not runnable before 
the bonded pairs are submitted. Why was that not enough? Are there 
sporadic test failures?

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/selftest_lrc.c | 23 ++++++++++++++++++++---
>   1 file changed, 20 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 16ebe4d2308e..f3b0610d1f10 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -3036,15 +3036,21 @@ static int bond_virtual_engine(struct intel_gt *gt,
>   	struct i915_gem_context *ctx;
>   	struct i915_request *rq[16];
>   	enum intel_engine_id id;
> +	struct igt_spinner spin;
>   	unsigned long n;
>   	int err;
>   
>   	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
>   
> -	ctx = kernel_context(gt->i915);
> -	if (!ctx)
> +	if (igt_spinner_init(&spin, gt))
>   		return -ENOMEM;
>   
> +	ctx = kernel_context(gt->i915);
> +	if (!ctx) {
> +		err = -ENOMEM;
> +		goto err_spin;
> +	}
> +
>   	err = 0;
>   	rq[0] = ERR_PTR(-ENOMEM);
>   	for_each_engine(master, gt, id) {
> @@ -3055,7 +3061,7 @@ static int bond_virtual_engine(struct intel_gt *gt,
>   
>   		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
>   
> -		rq[0] = igt_request_alloc(ctx, master);
> +		rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
>   		if (IS_ERR(rq[0])) {
>   			err = PTR_ERR(rq[0]);
>   			goto out;
> @@ -3068,10 +3074,17 @@ static int bond_virtual_engine(struct intel_gt *gt,
>   							       &fence,
>   							       GFP_KERNEL);
>   		}
> +
>   		i915_request_add(rq[0]);
>   		if (err < 0)
>   			goto out;
>   
> +		if (!(flags & BOND_SCHEDULE) &&
> +		    !igt_wait_for_spinner(&spin, rq[0])) {
> +			err = -EIO;
> +			goto out;
> +		}
> +
>   		for (n = 0; n < nsibling; n++) {
>   			struct intel_context *ve;
>   
> @@ -3119,6 +3132,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
>   			}
>   		}
>   		onstack_fence_fini(&fence);
> +		intel_engine_flush_submission(master);
> +		igt_spinner_end(&spin);
>   
>   		if (i915_request_wait(rq[0], 0, HZ / 10) < 0) {
>   			pr_err("Master request did not execute (on %s)!\n",
> @@ -3156,6 +3171,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
>   		err = -EIO;
>   
>   	kernel_context_close(ctx);
> +err_spin:
> +	igt_spinner_fini(&spin);
>   	return err;
>   }
>   
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20 12:59       ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 12:59 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
> 
> On 20/11/2019 09:32, Chris Wilson wrote:
> > Bonded request submission is designed to allow requests to execute in
> > parallel as laid out by the user. If the master request is already
> > finished before its bonded pair is submitted, the pair were not destined
> > to run in parallel and we lose the information about the master engine
> > to dictate selection of the secondary. If the second request was
> > required to be run on a particular engine in a virtual set, that should
> > have been specified, rather than left to the whims of a random
> > unconnected requests!
> > 
> > In the selftest, I made the mistake of not ensuring the master would
> > overlap with its bonded pairs, meaning that it could indeed complete
> > before we submitted the bonds. Those bonds were then free to select any
> > available engine in their virtual set, and not the one expected by the
> > test.
> 
> There is a submit await which ensures master is not runnable before 
> bonded pairs are submitted. Why was that not enough? Are the sporadic 
> test failures?

One test is using the submit_await, the other does not. It takes the
background retire worker running just as we are submitting the
secondaries for the failure to show... But I have not noticed this
failure before hooking up retirement to process_csb. However, the issue
is definitely present in the current test.
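
Paraphrasing the test rather than quoting it, the submit_await case
holds back the master until the bonds are in place:

	onstack_fence_init(&fence);
	i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit, &fence, GFP_KERNEL);
	i915_request_add(rq[0]);
	/* ... create and add the bonded requests ... */
	onstack_fence_fini(&fence); /* only now may the master be submitted */

whereas the other case adds the master immediately and merely assumes
it is still in flight while the bonds are created -- the assumption the
spinner now enforces.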
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20 13:16     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:16 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 09:33, Chris Wilson wrote:
> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> corruption WA") is that it disables RC6 while Skylake (and friends) is
> active, and we do not consider the GPU idle until all outstanding
> requests have been retired and the engine switched over to the kernel
> context. If userspace is idle, this task falls onto our background idle
> worker, which only runs roughly once a second, meaning that userspace has
> to have been idle for a couple of seconds before we enable RC6 again.
> Naturally, this causes us to consume considerably more energy than
> before as powersaving is effectively disabled while a display server
> (here's looking at you Xorg) is running.
> 
> As execlists will get a completion event as each context is completed,
> we can use this interrupt to queue a retire worker bound to this engine
> to cleanup idle timelines. We will then immediately notice the idle
> engine (without userspace intervention or the aid of the background
> retire worker) and start parking the GPU. Thus during light workloads,
> we will do much more work to idle the GPU faster...  Hopefully with
> commensurate power saving!
> 
> v2: Watch context completions and only look at those local to the engine
> when retiring to reduce the amount of excess work we perform.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
>   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
>   drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
>   drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
>   drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
>   .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
>   7 files changed, 116 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index b9613d044393..8f6e353caa66 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -28,13 +28,13 @@
>   
>   #include "i915_drv.h"
>   
> -#include "gt/intel_gt.h"
> -
> +#include "intel_context.h"
>   #include "intel_engine.h"
>   #include "intel_engine_pm.h"
>   #include "intel_engine_pool.h"
>   #include "intel_engine_user.h"
> -#include "intel_context.h"
> +#include "intel_gt.h"
> +#include "intel_gt_requests.h"
>   #include "intel_lrc.h"
>   #include "intel_reset.h"
>   #include "intel_ring.h"
> @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
>   	intel_engine_init_execlists(engine);
>   	intel_engine_init_cmd_parser(engine);
>   	intel_engine_init__pm(engine);
> +	intel_engine_init_retire(engine);
>   
>   	intel_engine_pool_init(&engine->pool);
>   
> @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>   
>   	cleanup_status_page(engine);
>   
> +	intel_engine_fini_retire(engine);
>   	intel_engine_pool_fini(&engine->pool);
>   	intel_engine_fini_breadcrumbs(engine);
>   	intel_engine_cleanup_cmd_parser(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 758f0e8ec672..17f1f1441efc 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -451,6 +451,14 @@ struct intel_engine_cs {
>   
>   	struct intel_engine_execlists execlists;
>   
> +	/*
> +	 * Keep track of completed timelines on this engine for early
> +	 * retirement with the goal of quickly enabling powersaving as
> +	 * soon as the engine is idle.
> +	 */
> +	struct intel_timeline *retire;
> +	struct work_struct retire_work;
> +
>   	/* status_notifier: list of callbacks for context-switch changes */
>   	struct atomic_notifier_head context_status_notifier;
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 4dc3cbeb1b36..4a98fefdf915 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
>   		intel_engine_flush_submission(engine);
>   }
>   
> +static void engine_retire(struct work_struct *work)
> +{
> +	struct intel_engine_cs *engine =
> +		container_of(work, typeof(*engine), retire_work);
> +	struct intel_timeline *tl = xchg(&engine->retire, NULL);

Shouldn't this be atomic_xchg to avoid racing with add_retire?

> +
> +	do {
> +		struct intel_timeline *next = xchg(&tl->retire, NULL);

Here as well?

> +
> +		/*
> +		 * Our goal here is to retire _idle_ timelines as soon as
> +		 * possible (as they are idle, we do not expect userspace
> +		 * to be cleaning up anytime soon).
> +		 *
> +		 * If the tl->active_count is already zero, someone else
> +		 * should have retired the timeline. Equally if the timeline
> +		 * is currently locked, either it is being retired elsewhere
> +		 * or about to be!
> +		 */
> +		if (atomic_read(&tl->active_count) &&
> +		    mutex_trylock(&tl->mutex)) {
> +			retire_requests(tl);
> +			mutex_unlock(&tl->mutex);
> +		}
> +		intel_timeline_put(tl);
> +
> +		GEM_BUG_ON(!next);
> +		tl = ptr_mask_bits(next, 1);

You sometimes expect engine->retire to contain 0x1?

> +	} while (tl);
> +}
> +
> +static bool add_retire(struct intel_engine_cs *engine,
> +		       struct intel_timeline *tl)
> +{
> +	struct intel_timeline *first = READ_ONCE(engine->retire);
> +
> +	/*
> +	 * We open-code a llist here to include the additional tag [BIT(0)]
> +	 * so that we know when the timeline is already on a
> +	 * retirement queue: either this engine or another.
> +	 *
> +	 * However, we rely on that a timeline can only be active on a single
> +	 * engine at any one time and that add_retire() is called before the
> +	 * engine releases the timeline and transferred to another to retire.
> +	 */
> +
> +	if (READ_ONCE(tl->retire)) /* already queued */
> +		return false;

Can't this go first in the function?

> +
> +	intel_timeline_get(tl);
> +	do
> +		tl->retire = ptr_pack_bits(first, 1, 1);

Here you rely on the assignment being atomic, right?

> +	while (!try_cmpxchg(&engine->retire, &first, tl));

So the loop is effectively creating a chain of timelines to retire on 
this engine.
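
Concretely, if I read ptr_pack_bits() right, after add_retire(engine, A),
add_retire(engine, B) and add_retire(engine, C) the state would be
(illustration, not the patch):

	engine->retire = C;                    /* untagged head */
	C->retire = ptr_pack_bits(B, 1, 1);
	B->retire = ptr_pack_bits(A, 1, 1);
	A->retire = ptr_pack_bits(NULL, 1, 1); /* tagged end of chain */

and engine_retire() then unwinds it with ptr_mask_bits(next, 1) until it
unpacks the terminating NULL.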

What happens with virtual engines (well, or any single-timeline
context) when a timeline goes to a different engine before the retire
worker runs? Ah okay, it gets re-assigned to the most recent engine.

I am not sure about the BIT(0) business. It's always set on write, so I
am not getting why it is useful.

Regards,

Tvrtko

> +
> +	return !first;
> +}
> +
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> +			     struct intel_timeline *tl)
> +{
> +	if (add_retire(engine, tl))
> +		schedule_work(&engine->retire_work);
> +}
> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine)
> +{
> +	INIT_WORK(&engine->retire_work, engine_retire);
> +}
> +
> +void intel_engine_fini_retire(struct intel_engine_cs *engine)
> +{
> +	flush_work(&engine->retire_work);
> +}
> +
>   long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   {
>   	struct intel_gt_timelines *timelines = &gt->timelines;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> index fde546424c63..8de559b5a033 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> @@ -7,7 +7,12 @@
>   #ifndef INTEL_GT_REQUESTS_H
>   #define INTEL_GT_REQUESTS_H
>   
> -struct intel_gt;
> +#include <linux/workqueue.h>
> +
> +#include "intel_gt_types.h"
> +
> +struct intel_engine_cs;
> +struct intel_timeline;
>   
>   long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
>   static inline void intel_gt_retire_requests(struct intel_gt *gt)
> @@ -15,6 +20,16 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
>   	intel_gt_retire_requests_timeout(gt, 0);
>   }
>   
> +static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
> +{
> +	mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
> +}

This is unused in v2.

> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine);
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> +			     struct intel_timeline *tl);
> +void intel_engine_fini_retire(struct intel_engine_cs *engine);
> +
>   int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
>   
>   void intel_gt_init_requests(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index b65bc06855b0..2ceaa2f22996 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -142,6 +142,7 @@
>   #include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   #include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
>   #include "intel_lrc_reg.h"
>   #include "intel_mocs.h"
>   #include "intel_reset.h"
> @@ -1170,6 +1171,14 @@ __execlists_schedule_out(struct i915_request *rq,
>   	 * refrain from doing non-trivial work here.
>   	 */
>   
> +	/*
> +	 * If we have just completed this context, the engine may now be
> +	 * idle and we want to re-enter powersaving.
> +	 */
> +	if (list_is_last(&rq->link, &ce->timeline->requests) &&
> +	    i915_request_completed(rq))
> +		intel_engine_add_retire(engine, ce->timeline);
> +
>   	intel_engine_context_out(engine);
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
>   	intel_gt_pm_put_async(engine->gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index b190a5d9ab02..c1d2419444f8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -277,6 +277,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
>   {
>   	GEM_BUG_ON(atomic_read(&timeline->pin_count));
>   	GEM_BUG_ON(!list_empty(&timeline->requests));
> +	GEM_BUG_ON(timeline->retire);
>   
>   	if (timeline->hwsp_cacheline)
>   		cacheline_free(timeline->hwsp_cacheline);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> index 5244615ed1cb..aaf15cbe1ce1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> @@ -66,6 +66,9 @@ struct intel_timeline {
>   	 */
>   	struct i915_active_fence last_request;
>   
> +	/** A chain of completed timelines ready for early retirement. */
> +	struct intel_timeline *retire;
> +
>   	/**
>   	 * We track the most recent seqno that we wait on in every context so
>   	 * that we only have to emit a new await and dependency on a more
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20 13:18         ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:18 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 12:59, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
>>
>> On 20/11/2019 09:32, Chris Wilson wrote:
>>> Bonded request submission is designed to allow requests to execute in
>>> parallel as laid out by the user. If the master request is already
>>> finished before its bonded pair is submitted, the pair were not destined
>>> to run in parallel and we lose the information about the master engine
>>> to dictate selection of the secondary. If the second request was
>>> required to be run on a particular engine in a virtual set, that should
>>> have been specified, rather than left to the whims of a random
>>> unconnected requests!
>>>
>>> In the selftest, I made the mistake of not ensuring the master would
>>> overlap with its bonded pairs, meaning that it could indeed complete
>>> before we submitted the bonds. Those bonds were then free to select any
>>> available engine in their virtual set, and not the one expected by the
>>> test.
>>
>> There is a submit await which ensures master is not runnable before
>> bonded pairs are submitted. Why was that not enough? Are the sporadic
>> test failures?
> 
> One test is using the submit_await, the other does not. It takes the
> background retire worker to run as we are submitting the secondaries...
> But I have not noticed this failure before hooking up retirement to
> process_csb. However, the issue is definitely present in the current
> test.

So what happens? Is this another issue limited to selftests? Because I 
don't see why the uAPI itself can't be used in this way.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 13:19             ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 20/11/2019 12:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 12:40:13)
>>
>> On 20/11/2019 12:07, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-11-20 11:58:27)
>>>>
>>>> On 20/11/2019 09:32, Chris Wilson wrote:
>>>>> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
>>>>> the backend"), I erroneously concluded that we last modify the engine
>>>>> inside __i915_request_commit() meaning that we could enable concurrent
>>>>> submission for userspace as we enqueued this request. However, this
>>>>> falls into a trap with other users of the engine->kernel_context waking
>>>>> up and submitting their request before the idle-switch is queued, with
>>>>> the result that the kernel_context is executed out-of-sequence most
>>>>> likely upsetting the GPU and certainly ourselves when we try to retire
>>>>> the out-of-sequence requests.
>>>>>
>>>>> As such we need to hold onto the effective engine->kernel_context mutex
>>>>> lock (via the engine pm mutex proxy) until we have finish queuing the
>>>>> request to the engine.
>>>>>
>>>>> v2: Serialise against concurrent intel_gt_retire_requests()
>>>>> v3: Describe the hairy locking scheme with intel_gt_retire_requests()
>>>>> for future reference.
>>>>> v4: Combine timeline->lock and engine pm release; it's hairy.
>>>>>
>>>>> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
>>>>>     1 file changed, 40 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> index 3c0f490ff2c7..1f517357a268 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
>>>>>     
>>>>>     #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
>>>>>     
>>>>> +static void
>>>>> +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
>>>>> +                                   struct intel_engine_cs *engine)
>>>>> +{
>>>>> +     struct intel_gt_timelines *timelines = &engine->gt->timelines;
>>>>> +
>>>>> +     spin_lock(&timelines->lock);
>>>>> +
>>>>> +     if (!atomic_fetch_inc(&tl->active_count))
>>>>> +             list_add_tail(&tl->link, &timelines->active_list);
>>>>
>>>> Hmm, with this new part it maybe matches/answers my question from
>>>> "drm/i915/gt: Close race between engine_park and
>>>> intel_gt_retire_requests". I think at least. Since it now adds a second
>>>> place timeline can enter the active_list.
>>>>
>>>> But no, where does the intel_timeline_enter race come from? Can't be
>>>> userspace submission since they are blocked on wf->lock.
>>>
>>> The race is not just with intel_timeline_enter, but with
>>> intel_gt_retire_requests.
>>>
>>> As soon as we are on that list, we may be retired. If we are retired
>>> before adjusting the engine->wakeref.count, we are b0rked.
>>>
>>> As soon as we adjust the engine->wakeref.count, another submission may
>>> call intel_timeline_enter, and again may even retire this request. The
>>> enter itself is perfectly fine, but we need to serialise against the
>>> retires.
>>
>> I think the two patches combined work; I am just not sure two
>> atomic_fetch_inc()->list_add() are needed now that you re-ordered
>> __i915_requests_queue and __intel_wakeref_defer_park - that's the part
>> which is confusing me. But it also doesn't harm...
> 
> I tried to get away with not, but the selftests hammer very heavily on
> the engine->kernel_context so we do encounter the scenarios where we are
> using the kernel_context to park on one cpu while submitting a new
> request on another.
> 
> I would have got away with it but for these pesky selftests.

Okay, I'll trust you on that:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
@ 2019-11-20 13:19             ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On 20/11/2019 12:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 12:40:13)
>>
>> On 20/11/2019 12:07, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-11-20 11:58:27)
>>>>
>>>> On 20/11/2019 09:32, Chris Wilson wrote:
>>>>> In commit a79ca656b648 ("drm/i915: Push the wakeref->count deferral to
>>>>> the backend"), I erroneously concluded that we last modify the engine
>>>>> inside __i915_request_commit() meaning that we could enable concurrent
>>>>> submission for userspace as we enqueued this request. However, this
>>>>> falls into a trap with other users of the engine->kernel_context waking
>>>>> up and submitting their request before the idle-switch is queued, with
>>>>> the result that the kernel_context is executed out-of-sequence most
>>>>> likely upsetting the GPU and certainly ourselves when we try to retire
>>>>> the out-of-sequence requests.
>>>>>
>>>>> As such we need to hold onto the effective engine->kernel_context mutex
>>>>> lock (via the engine pm mutex proxy) until we have finish queuing the
>>>>> request to the engine.
>>>>>
>>>>> v2: Serialise against concurrent intel_gt_retire_requests()
>>>>> v3: Describe the hairy locking scheme with intel_gt_retire_requests()
>>>>> for future reference.
>>>>> v4: Combine timeline->lock and engine pm release; it's hairy.
>>>>>
>>>>> Fixes: a79ca656b648 ("drm/i915: Push the wakeref->count deferral to the backend")
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/gt/intel_engine_pm.c | 47 +++++++++++++++++++----
>>>>>     1 file changed, 40 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> index 3c0f490ff2c7..1f517357a268 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>>> @@ -73,8 +73,25 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
>>>>>     
>>>>>     #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
>>>>>     
>>>>> +static void
>>>>> +__intel_timeline_enter_and_pm_release(struct intel_timeline *tl,
>>>>> +                                   struct intel_engine_cs *engine)
>>>>> +{
>>>>> +     struct intel_gt_timelines *timelines = &engine->gt->timelines;
>>>>> +
>>>>> +     spin_lock(&timelines->lock);
>>>>> +
>>>>> +     if (!atomic_fetch_inc(&tl->active_count))
>>>>> +             list_add_tail(&tl->link, &timelines->active_list);
>>>>
>>>> Hmm, with this new part it maybe matches/answers my question from
>>>> "drm/i915/gt: Close race between engine_park and
>>>> intel_gt_retire_requests". I think at least. Since it now adds a second
>>>> place timeline can enter the active_list.
>>>>
>>>> But no, where does the intel_timeline_enter race come from? Can't be
>>>> userspace submission since they are blocked on wf->lock.
>>>
>>> The race is not just with intel_timeline_enter, but with
>>> intel_gt_retire_requests.
>>>
>>> As soon as we are on that list, we may be retired. If we are retired
>>> before adjusting the engine->wakeref.count, we are b0rked.
>>>
>>> As soon as we adjust the engine->wakeref.count, another submission may
>>> call intel_timeline_enter, and again may even retire this request. The
>>> enter itself is perfectly fine, but we need to serialise against the
>>> retires.
>>
>> I think the two patches combined work; I am just not sure two
>> atomic_fetch_inc()->list_add() are needed now that you re-ordered
>> __i915_requests_queue and __intel_wakeref_defer_park - that's the part
>> which is confusing me. But it also doesn't harm...
> 
> I tried to get away with not, but the selftests hammer very heavily on
> the engine->kernel_context so we do encounter the scenarios where we are
> using the kernel_context to park on one cpu while submitting a new
> request on another.
> 
> I would have got away with it but for these pesky selftests.

Okay, I'll trust you on that:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 2/9] drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
@ 2019-11-20 13:19     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Matthew Auld


On 20/11/2019 09:32, Chris Wilson wrote:
> The general concept was that intel_timeline.active_count was locked by
> the intel_timeline.mutex. The exception was for power management, where
> the engine->kernel_context->timeline could be manipulated under the
> global wakeref.mutex.
> 
> This was quite solid, as we always manipulated the timeline only while
> we held an engine wakeref.
> 
> And then we started retiring requests outside of struct_mutex, only
> using the timelines.active_list and the timeline->mutex. There we
> started manipulating intel_timeline.active_count outside of an engine
> wakeref, and so introduced a race between __engine_park() and
> intel_gt_retire_requests(), a race that could result in the
> engine->kernel_context not being added to the active timelines and so
> losing requests, which caused us to keep the system permanently powered
> up [and unloadable].
> 
> The race would be easy to close if we could take the engine wakeref for
> the timeline before we retire -- except timelines are not bound to any
> engine and so we would need to keep all active engines awake. The
> alternative is to guard intel_timeline_enter/intel_timeline_exit for use
> outside of the timeline->mutex.
> 
> Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   |  8 ++---
>   drivers/gpu/drm/i915/gt/intel_timeline.c      | 34 +++++++++++++++----
>   .../gpu/drm/i915/gt/intel_timeline_types.h    |  2 +-
>   3 files changed, 32 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 25291e2af21e..1a005da8c588 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -49,8 +49,8 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   			continue;
>   
>   		intel_timeline_get(tl);
> -		GEM_BUG_ON(!tl->active_count);
> -		tl->active_count++; /* pin the list element */
> +		GEM_BUG_ON(!atomic_read(&tl->active_count));
> +		atomic_inc(&tl->active_count); /* pin the list element */
>   		spin_unlock_irqrestore(&timelines->lock, flags);
>   
>   		if (timeout > 0) {
> @@ -71,14 +71,14 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   
>   		/* Resume iteration after dropping lock */
>   		list_safe_reset_next(tl, tn, link);
> -		if (!--tl->active_count)
> +		if (atomic_dec_and_test(&tl->active_count))
>   			list_del(&tl->link);
>   
>   		mutex_unlock(&tl->mutex);
>   
>   		/* Defer the final release to after the spinlock */
>   		if (refcount_dec_and_test(&tl->kref.refcount)) {
> -			GEM_BUG_ON(tl->active_count);
> +			GEM_BUG_ON(atomic_read(&tl->active_count));
>   			list_add(&tl->link, &free);
>   		}
>   	}
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index 0e277835aad0..b35f12729983 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -334,15 +334,33 @@ void intel_timeline_enter(struct intel_timeline *tl)
>   	struct intel_gt_timelines *timelines = &tl->gt->timelines;
>   	unsigned long flags;
>   
> +	/*
> +	 * Pretend we are serialised by the timeline->mutex.
> +	 *
> +	 * While generally true, there are a few exceptions to the rule
> +	 * for the engine->kernel_context being used to manage power
> +	 * transitions. As the engine_park may be called from under any
> +	 * timeline, it uses the power mutex as a global serialisation
> +	 * lock to prevent any other request entering its timeline.
> +	 *
> +	 * The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
> +	 *
> +	 * However, intel_gt_retire_request() does not know which engine
> +	 * it is retiring along and so cannot partake in the engine-pm
> +	 * barrier, and there we use the tl->active_count as a means to
> +	 * pin the timeline in the active_list while the locks are dropped.
> +	 * Ergo, as that is outside of the engine-pm barrier, we need to
> +	 * use atomic to manipulate tl->active_count.
> +	 */
>   	lockdep_assert_held(&tl->mutex);
> -
>   	GEM_BUG_ON(!atomic_read(&tl->pin_count));
> -	if (tl->active_count++)
> +
> +	if (atomic_add_unless(&tl->active_count, 1, 0))
>   		return;
> -	GEM_BUG_ON(!tl->active_count); /* overflow? */
>   
>   	spin_lock_irqsave(&timelines->lock, flags);
> -	list_add_tail(&tl->link, &timelines->active_list);
> +	if (!atomic_fetch_inc(&tl->active_count))
> +		list_add_tail(&tl->link, &timelines->active_list);
>   	spin_unlock_irqrestore(&timelines->lock, flags);
>   }
>   
> @@ -351,14 +369,16 @@ void intel_timeline_exit(struct intel_timeline *tl)
>   	struct intel_gt_timelines *timelines = &tl->gt->timelines;
>   	unsigned long flags;
>   
> +	/* See intel_timeline_enter() */
>   	lockdep_assert_held(&tl->mutex);
>   
> -	GEM_BUG_ON(!tl->active_count);
> -	if (--tl->active_count)
> +	GEM_BUG_ON(!atomic_read(&tl->active_count));
> +	if (atomic_add_unless(&tl->active_count, -1, 1))
>   		return;
>   
>   	spin_lock_irqsave(&timelines->lock, flags);
> -	list_del(&tl->link);
> +	if (atomic_dec_and_test(&tl->active_count))
> +		list_del(&tl->link);
>   	spin_unlock_irqrestore(&timelines->lock, flags);
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> index 98d9ee166379..5244615ed1cb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> @@ -42,7 +42,7 @@ struct intel_timeline {
>   	 * from the intel_context caller plus internal atomicity.
>   	 */
>   	atomic_t pin_count;
> -	unsigned int active_count;
> +	atomic_t active_count;
>   
>   	const u32 *hwsp_seqno;
>   	struct i915_vma *hwsp_ggtt;
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 2/9] drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
@ 2019-11-20 13:19     ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 13:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Matthew Auld


On 20/11/2019 09:32, Chris Wilson wrote:
> The general concept was that intel_timeline.active_count was locked by
> the intel_timeline.mutex. The exception was for power management, where
> the engine->kernel_context->timeline could be manipulated under the
> global wakeref.mutex.
> 
> This was quite solid, as we always manipulated the timeline only while
> we held an engine wakeref.
> 
> And then we started retiring requests outside of struct_mutex, only
> using the timelines.active_list and the timeline->mutex. There we
> started manipulating intel_timeline.active_count outside of an engine
> wakeref, and so introduced a race between __engine_park() and
> intel_gt_retire_requests(), a race that could result in the
> engine->kernel_context not being added to the active timelines and so
> losing requests, which caused us to keep the system permanently powered
> up [and unloadable].
> 
> The race would be easy to close if we could take the engine wakeref for
> the timeline before we retire -- except timelines are not bound to any
> engine and so we would need to keep all active engines awake. The
> alternative is to guard intel_timeline_enter/intel_timeline_exit for use
> outside of the timeline->mutex.
> 
> Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_gt_requests.c   |  8 ++---
>   drivers/gpu/drm/i915/gt/intel_timeline.c      | 34 +++++++++++++++----
>   .../gpu/drm/i915/gt/intel_timeline_types.h    |  2 +-
>   3 files changed, 32 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 25291e2af21e..1a005da8c588 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -49,8 +49,8 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   			continue;
>   
>   		intel_timeline_get(tl);
> -		GEM_BUG_ON(!tl->active_count);
> -		tl->active_count++; /* pin the list element */
> +		GEM_BUG_ON(!atomic_read(&tl->active_count));
> +		atomic_inc(&tl->active_count); /* pin the list element */
>   		spin_unlock_irqrestore(&timelines->lock, flags);
>   
>   		if (timeout > 0) {
> @@ -71,14 +71,14 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
>   
>   		/* Resume iteration after dropping lock */
>   		list_safe_reset_next(tl, tn, link);
> -		if (!--tl->active_count)
> +		if (atomic_dec_and_test(&tl->active_count))
>   			list_del(&tl->link);
>   
>   		mutex_unlock(&tl->mutex);
>   
>   		/* Defer the final release to after the spinlock */
>   		if (refcount_dec_and_test(&tl->kref.refcount)) {
> -			GEM_BUG_ON(tl->active_count);
> +			GEM_BUG_ON(atomic_read(&tl->active_count));
>   			list_add(&tl->link, &free);
>   		}
>   	}
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index 0e277835aad0..b35f12729983 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -334,15 +334,33 @@ void intel_timeline_enter(struct intel_timeline *tl)
>   	struct intel_gt_timelines *timelines = &tl->gt->timelines;
>   	unsigned long flags;
>   
> +	/*
> +	 * Pretend we are serialised by the timeline->mutex.
> +	 *
> +	 * While generally true, there are a few exceptions to the rule
> +	 * for the engine->kernel_context being used to manage power
> +	 * transitions. As the engine_park may be called from under any
> +	 * timeline, it uses the power mutex as a global serialisation
> +	 * lock to prevent any other request entering its timeline.
> +	 *
> +	 * The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
> +	 *
> +	 * However, intel_gt_retire_request() does not know which engine
> +	 * it is retiring along and so cannot partake in the engine-pm
> +	 * barrier, and there we use the tl->active_count as a means to
> +	 * pin the timeline in the active_list while the locks are dropped.
> +	 * Ergo, as that is outside of the engine-pm barrier, we need to
> +	 * use atomic to manipulate tl->active_count.
> +	 */
>   	lockdep_assert_held(&tl->mutex);
> -
>   	GEM_BUG_ON(!atomic_read(&tl->pin_count));
> -	if (tl->active_count++)
> +
> +	if (atomic_add_unless(&tl->active_count, 1, 0))
>   		return;
> -	GEM_BUG_ON(!tl->active_count); /* overflow? */
>   
>   	spin_lock_irqsave(&timelines->lock, flags);
> -	list_add_tail(&tl->link, &timelines->active_list);
> +	if (!atomic_fetch_inc(&tl->active_count))
> +		list_add_tail(&tl->link, &timelines->active_list);
>   	spin_unlock_irqrestore(&timelines->lock, flags);
>   }
>   
> @@ -351,14 +369,16 @@ void intel_timeline_exit(struct intel_timeline *tl)
>   	struct intel_gt_timelines *timelines = &tl->gt->timelines;
>   	unsigned long flags;
>   
> +	/* See intel_timeline_enter() */
>   	lockdep_assert_held(&tl->mutex);
>   
> -	GEM_BUG_ON(!tl->active_count);
> -	if (--tl->active_count)
> +	GEM_BUG_ON(!atomic_read(&tl->active_count));
> +	if (atomic_add_unless(&tl->active_count, -1, 1))
>   		return;
>   
>   	spin_lock_irqsave(&timelines->lock, flags);
> -	list_del(&tl->link);
> +	if (atomic_dec_and_test(&tl->active_count))
> +		list_del(&tl->link);
>   	spin_unlock_irqrestore(&timelines->lock, flags);
>   
>   	/*
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> index 98d9ee166379..5244615ed1cb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> @@ -42,7 +42,7 @@ struct intel_timeline {
>   	 * from the intel_context caller plus internal atomicity.
>   	 */
>   	atomic_t pin_count;
> -	unsigned int active_count;
> +	atomic_t active_count;
>   
>   	const u32 *hwsp_seqno;
>   	struct i915_vma *hwsp_ggtt;
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20 13:29           ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 13:29 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 13:18:27)
> 
> On 20/11/2019 12:59, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
> >>
> >> On 20/11/2019 09:32, Chris Wilson wrote:
> >>> Bonded request submission is designed to allow requests to execute in
> >>> parallel as laid out by the user. If the master request is already
> >>> finished before its bonded pair is submitted, the pair were not destined
> >>> to run in parallel and we lose the information about the master engine
> >>> to dictate selection of the secondary. If the second request was
> >>> required to be run on a particular engine in a virtual set, that should
> >>> have been specified, rather than left to the whims of a random
> >>> unconnected request!
> >>>
> >>> In the selftest, I made the mistake of not ensuring the master would
> >>> overlap with its bonded pairs, meaning that it could indeed complete
> >>> before we submitted the bonds. Those bonds were then free to select any
> >>> available engine in their virtual set, and not the one expected by the
> >>> test.
> >>
> >> There is a submit await which ensures master is not runnable before
> >> bonded pairs are submitted. Why was that not enough? Are there sporadic
> >> test failures?
> > 
> > One test is using the submit_await, the other does not. It takes the
> > background retire worker to run as we are submitting the secondaries...
> > But I have not noticed this failure before hooking up retirement to
> > process_csb. However, the issue is definitely present in the current
> > test.
> 
> So what happens? Is this another issue limited to selftests? Because I
> don't see why the uAPI itself couldn't be used in this way.

Since the master batch is already completed & signaled by the time we
submit the secondaries, the submit-fence is a dud and the secondaries
are not constrained in their engine selection.

i915_request_await_execution:
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		continue;
	else
		__i915_request_await_execution()

Now, our choice is either to drop the check on the signaled bit (and so
we will apply the bonding constraint from the already finished batch)
or not. Given that the master is already complete, I feel justified in
the current decision to ignore the constraint (since equally the fence
could already have been retired and so completely inaccessible), so I chose
to fix the test instead.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-20 13:29           ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 13:29 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 13:18:27)
> 
> On 20/11/2019 12:59, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
> >>
> >> On 20/11/2019 09:32, Chris Wilson wrote:
> >>> Bonded request submission is designed to allow requests to execute in
> >>> parallel as laid out by the user. If the master request is already
> >>> finished before its bonded pair is submitted, the pair were not destined
> >>> to run in parallel and we lose the information about the master engine
> >>> to dictate selection of the secondary. If the second request was
> >>> required to be run on a particular engine in a virtual set, that should
> >>> have been specified, rather than left to the whims of a random
> >>> unconnected request!
> >>>
> >>> In the selftest, I made the mistake of not ensuring the master would
> >>> overlap with its bonded pairs, meaning that it could indeed complete
> >>> before we submitted the bonds. Those bonds were then free to select any
> >>> available engine in their virtual set, and not the one expected by the
> >>> test.
> >>
> >> There is a submit await which ensures master is not runnable before
> >> bonded pairs are submitted. Why was that not enough? Are there sporadic
> >> test failures?
> > 
> > One test is using the submit_await, the other does not. It takes the
> > background retire worker to run as we are submitting the secondaries...
> > But I have not noticed this failure before hooking up retirement to
> > process_csb. However, the issue is definitely present in the current
> > test.
> 
> So what happens? Is this another issue limited to selftests? Because I
> don't see why the uAPI itself couldn't be used in this way.

Since the master batch is already completed & signaled by the time we
submit the secondaries, the submit-fence is a dud and the secondaries
are not constrained in their engine selection.

i915_request_await_execution:
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		continue;
	else
		__i915_request_await_execution()

Now, our choice is either to drop the check on the signaled bit (and so
we will apply the bonding constraint from the already finished batch)
or not. Given that the master is already complete, I feel justified in
the current decision to ignore the constraint (since equally the fence
could already have been retired and so completely inaccessible), so I chose
to fix the test instead.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20 13:39       ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
> 
> On 20/11/2019 09:33, Chris Wilson wrote:
> > The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> > corruption WA") is that it disables RC6 while Skylake (and friends) is
> > active, and we do not consider the GPU idle until all outstanding
> > requests have been retired and the engine switched over to the kernel
> > context. If userspace is idle, this task falls onto our background idle
> > worker, which only runs roughly once a second, meaning that userspace has
> > to have been idle for a couple of seconds before we enable RC6 again.
> > Naturally, this causes us to consume considerably more energy than
> > before as powersaving is effectively disabled while a display server
> > (here's looking at you Xorg) is running.
> > 
> > As execlists will get a completion event as each context is completed,
> > we can use this interrupt to queue a retire worker bound to this engine
> > to cleanup idle timelines. We will then immediately notice the idle
> > engine (without userspace intervention or the aid of the background
> > retire worker) and start parking the GPU. Thus during light workloads,
> > we will do much more work to idle the GPU faster...  Hopefully with
> > commensurate power saving!
> > 
> > v2: Watch context completions and only look at those local to the engine
> > when retiring to reduce the amount of excess work we perform.
> > 
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> > References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
> >   drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
> >   drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
> >   .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
> >   7 files changed, 116 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > index b9613d044393..8f6e353caa66 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > @@ -28,13 +28,13 @@
> >   
> >   #include "i915_drv.h"
> >   
> > -#include "gt/intel_gt.h"
> > -
> > +#include "intel_context.h"
> >   #include "intel_engine.h"
> >   #include "intel_engine_pm.h"
> >   #include "intel_engine_pool.h"
> >   #include "intel_engine_user.h"
> > -#include "intel_context.h"
> > +#include "intel_gt.h"
> > +#include "intel_gt_requests.h"
> >   #include "intel_lrc.h"
> >   #include "intel_reset.h"
> >   #include "intel_ring.h"
> > @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
> >       intel_engine_init_execlists(engine);
> >       intel_engine_init_cmd_parser(engine);
> >       intel_engine_init__pm(engine);
> > +     intel_engine_init_retire(engine);
> >   
> >       intel_engine_pool_init(&engine->pool);
> >   
> > @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
> >   
> >       cleanup_status_page(engine);
> >   
> > +     intel_engine_fini_retire(engine);
> >       intel_engine_pool_fini(&engine->pool);
> >       intel_engine_fini_breadcrumbs(engine);
> >       intel_engine_cleanup_cmd_parser(engine);
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 758f0e8ec672..17f1f1441efc 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -451,6 +451,14 @@ struct intel_engine_cs {
> >   
> >       struct intel_engine_execlists execlists;
> >   
> > +     /*
> > +      * Keep track of completed timelines on this engine for early
> > +      * retirement with the goal of quickly enabling powersaving as
> > +      * soon as the engine is idle.
> > +      */
> > +     struct intel_timeline *retire;
> > +     struct work_struct retire_work;
> > +
> >       /* status_notifier: list of callbacks for context-switch changes */
> >       struct atomic_notifier_head context_status_notifier;
> >   
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > index 4dc3cbeb1b36..4a98fefdf915 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
> >               intel_engine_flush_submission(engine);
> >   }
> >   
> > +static void engine_retire(struct work_struct *work)
> > +{
> > +     struct intel_engine_cs *engine =
> > +             container_of(work, typeof(*engine), retire_work);
> > +     struct intel_timeline *tl = xchg(&engine->retire, NULL);
> 
> Shouldn't this be atomic_xchg to avoid racing with add_retire?
> 
> > +
> > +     do {
> > +             struct intel_timeline *next = xchg(&tl->retire, NULL);
> 
> Here as well?

xchg() are always locked.

atomic_xchg() operates on atomic_t; xchg() works on any variable, like
cmpxchg().
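
For illustration (a minimal sketch of the distinction, not code from the
series):

	atomic_t refs = ATOMIC_INIT(1);
	int old_refs = atomic_xchg(&refs, 0);		/* atomic_t only */

	struct intel_timeline *old_tl =
		xchg(&engine->retire, NULL);		/* any pointer/scalar */

Both boil down to the same locked exchange on x86; the only difference is
the type of variable they accept.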

> > +
> > +             /*
> > +              * Our goal here is to retire _idle_ timelines as soon as
> > +              * possible (as they are idle, we do not expect userspace
> > +              * to be cleaning up anytime soon).
> > +              *
> > +              * If the tl->active_count is already zero, someone else
> > +              * should have retired the timeline. Equally if the timeline
> > +              * is currently locked, either it is being retired elsewhere
> > +              * or about to be!
> > +              */
> > +             if (atomic_read(&tl->active_count) &&
> > +                 mutex_trylock(&tl->mutex)) {
> > +                     retire_requests(tl);
> > +                     mutex_unlock(&tl->mutex);
> > +             }
> > +             intel_timeline_put(tl);
> > +
> > +             GEM_BUG_ON(!next);
> > +             tl = ptr_mask_bits(next, 1);
> 
> You sometimes expect engine->retire to contain 0x1?

Yes, imagine that we are submitting very fast such that we schedule_out
the same context before the worker ran, we would then try to
add_retire() the same timeline again. So I was using BIT(0) to tag an
active element in the retirement list.
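
Restating the quoted hunks with the tag called out (illustration only, same
mechanism as in the patch):

	/* add_retire(): a non-zero tl->retire means "already queued", even
	 * for the list tail whose link would otherwise be NULL */
	tl->retire = ptr_pack_bits(first, 1, 1);	/* first | BIT(0) */

	/* engine_retire(): consume the link, clearing tl->retire back to 0,
	 * and strip BIT(0) to recover the next timeline to walk */
	next = xchg(&tl->retire, NULL);
	tl = ptr_mask_bits(next, 1);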

> > +     } while (tl);
> > +}
> > +
> > +static bool add_retire(struct intel_engine_cs *engine,
> > +                    struct intel_timeline *tl)
> > +{
> > +     struct intel_timeline *first = READ_ONCE(engine->retire);
> > +
> > +     /*
> > +      * We open-code a llist here to include the additional tag [BIT(0)]
> > +      * so that we know when the timeline is already on a
> > +      * retirement queue: either this engine or another.
> > +      *
> > +      * However, we rely on that a timeline can only be active on a single
> > +      * engine at any one time and that add_retire() is called before the
> > +      * engine releases the timeline and transferred to another to retire.
> > +      */
> > +
> > +     if (READ_ONCE(tl->retire)) /* already queued */
> > +             return false;
> 
> Can't this go first in the function?

Conceptually it is. And I made it so because I also decided against
having the READ_ONCE() at the top.

> > +
> > +     intel_timeline_get(tl);
> > +     do
> > +             tl->retire = ptr_pack_bits(first, 1, 1);
> 
> Here you rely on assignment being atomic right?

Ish. Here we rely on the timeline being owned by the engine so it cannot
be submitted by another (and so schedule_out called) until this engine
has released it.

It is a weak point for generality, but the ordering is strong in
execlists.

> > +     while (!try_cmpxchg(&engine->retire, &first, tl));
> 
> So the loop is effectively creating a chain of timelines to retire on 
> this engine.
> 
> What happens with virtual engines when a timeline goes to different 
> engine before (well or any single timeline context) the retire worker 
> runs? Ah okay, it gets re-assigned to the most recent engine.

Right. The engine_retire() doesn't care which engine the timelines were
run on, it's just a list of suspected idle timelines.

> I am not sure about the BIT(0) business. It's always set on write so I 
> am not getting why it is useful.

It's also set to 0 on consumption :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [Intel-gfx] [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20 13:39       ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
> 
> On 20/11/2019 09:33, Chris Wilson wrote:
> > The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> > corruption WA") is that it disables RC6 while Skylake (and friends) is
> > active, and we do not consider the GPU idle until all outstanding
> > requests have been retired and the engine switched over to the kernel
> > context. If userspace is idle, this task falls onto our background idle
> > worker, which only runs roughly once a second, meaning that userspace has
> > to have been idle for a couple of seconds before we enable RC6 again.
> > Naturally, this causes us to consume considerably more energy than
> > before as powersaving is effectively disabled while a display server
> > (here's looking at you Xorg) is running.
> > 
> > As execlists will get a completion event as each context is completed,
> > we can use this interrupt to queue a retire worker bound to this engine
> > to cleanup idle timelines. We will then immediately notice the idle
> > engine (without userspace intervention or the aid of the background
> > retire worker) and start parking the GPU. Thus during light workloads,
> > we will do much more work to idle the GPU faster...  Hopefully with
> > commensurate power saving!
> > 
> > v2: Watch context completions and only look at those local to the engine
> > when retiring to reduce the amount of excess work we perform.
> > 
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> > References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
> >   drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
> >   drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
> >   drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
> >   drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
> >   .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
> >   7 files changed, 116 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > index b9613d044393..8f6e353caa66 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > @@ -28,13 +28,13 @@
> >   
> >   #include "i915_drv.h"
> >   
> > -#include "gt/intel_gt.h"
> > -
> > +#include "intel_context.h"
> >   #include "intel_engine.h"
> >   #include "intel_engine_pm.h"
> >   #include "intel_engine_pool.h"
> >   #include "intel_engine_user.h"
> > -#include "intel_context.h"
> > +#include "intel_gt.h"
> > +#include "intel_gt_requests.h"
> >   #include "intel_lrc.h"
> >   #include "intel_reset.h"
> >   #include "intel_ring.h"
> > @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
> >       intel_engine_init_execlists(engine);
> >       intel_engine_init_cmd_parser(engine);
> >       intel_engine_init__pm(engine);
> > +     intel_engine_init_retire(engine);
> >   
> >       intel_engine_pool_init(&engine->pool);
> >   
> > @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
> >   
> >       cleanup_status_page(engine);
> >   
> > +     intel_engine_fini_retire(engine);
> >       intel_engine_pool_fini(&engine->pool);
> >       intel_engine_fini_breadcrumbs(engine);
> >       intel_engine_cleanup_cmd_parser(engine);
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 758f0e8ec672..17f1f1441efc 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -451,6 +451,14 @@ struct intel_engine_cs {
> >   
> >       struct intel_engine_execlists execlists;
> >   
> > +     /*
> > +      * Keep track of completed timelines on this engine for early
> > +      * retirement with the goal of quickly enabling powersaving as
> > +      * soon as the engine is idle.
> > +      */
> > +     struct intel_timeline *retire;
> > +     struct work_struct retire_work;
> > +
> >       /* status_notifier: list of callbacks for context-switch changes */
> >       struct atomic_notifier_head context_status_notifier;
> >   
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > index 4dc3cbeb1b36..4a98fefdf915 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> > @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
> >               intel_engine_flush_submission(engine);
> >   }
> >   
> > +static void engine_retire(struct work_struct *work)
> > +{
> > +     struct intel_engine_cs *engine =
> > +             container_of(work, typeof(*engine), retire_work);
> > +     struct intel_timeline *tl = xchg(&engine->retire, NULL);
> 
> Shouldn't this be atomic_xchg to avoid racing with add_retire?
> 
> > +
> > +     do {
> > +             struct intel_timeline *next = xchg(&tl->retire, NULL);
> 
> Here as well?

xchg() are always locked.

atomic_xchg() operates on atomic_t; xchg() works on any variable, like
cmpxchg().

> > +
> > +             /*
> > +              * Our goal here is to retire _idle_ timelines as soon as
> > +              * possible (as they are idle, we do not expect userspace
> > +              * to be cleaning up anytime soon).
> > +              *
> > +              * If the tl->active_count is already zero, someone else
> > +              * should have retired the timeline. Equally if the timeline
> > +              * is currently locked, either it is being retired elsewhere
> > +              * or about to be!
> > +              */
> > +             if (atomic_read(&tl->active_count) &&
> > +                 mutex_trylock(&tl->mutex)) {
> > +                     retire_requests(tl);
> > +                     mutex_unlock(&tl->mutex);
> > +             }
> > +             intel_timeline_put(tl);
> > +
> > +             GEM_BUG_ON(!next);
> > +             tl = ptr_mask_bits(next, 1);
> 
> You sometimes expect engine->retire to contain 0x1?

Yes, imagine that we are submitting very fast such that we schedule_out
the same context before the worker ran, we would then try to
add_retire() the same timeline again. So I was using BIT(0) to tag an
active element in the retirement list.

> > +     } while (tl);
> > +}
> > +
> > +static bool add_retire(struct intel_engine_cs *engine,
> > +                    struct intel_timeline *tl)
> > +{
> > +     struct intel_timeline *first = READ_ONCE(engine->retire);
> > +
> > +     /*
> > +      * We open-code a llist here to include the additional tag [BIT(0)]
> > +      * so that we know when the timeline is already on a
> > +      * retirement queue: either this engine or another.
> > +      *
> > +      * However, we rely on that a timeline can only be active on a single
> > +      * engine at any one time and that add_retire() is called before the
> > +      * engine releases the timeline and transferred to another to retire.
> > +      */
> > +
> > +     if (READ_ONCE(tl->retire)) /* already queued */
> > +             return false;
> 
> Can't this go first in the function?

Conceptually it is. And I made it so because I also decided against
having the READ_ONCE() at the top.

> > +
> > +     intel_timeline_get(tl);
> > +     do
> > +             tl->retire = ptr_pack_bits(first, 1, 1);
> 
> Here you rely on assignment being atomic right?

Ish. Here we rely on the timeline being owned by the engine so it cannot
be submitted by another (and so schedule_out called) until this engine
has released it.

It is a weak point for generality, but the ordering is strong in
execlists.

> > +     while (!try_cmpxchg(&engine->retire, &first, tl));
> 
> So the loop is effectively creating a chain of timelines to retire on 
> this engine.
> 
> What happens with virtual engines when a timeline goes to different 
> engine before (well or any single timeline context) the retire worker 
> runs? Ah okay, it gets re-assigned to the most recent engine.

Right. The engine_retire() doesn't care which engine the timelines were
run on, it's just a list of suspected idle timelines.

> I am not sure about the BIT(0) business. It's always set on write so I 
> am not getting why it is useful.

It's also set to 0 on consumption :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
@ 2019-11-20 13:51   ` Patchwork
  0 siblings, 0 replies; 73+ messages in thread
From: Patchwork @ 2019-11-20 13:51 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
URL   : https://patchwork.freedesktop.org/series/69724/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
66d127012d3d drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
12939c87c717 drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
ab3ac0419a9d drm/i915: Mark up the calling context for intel_wakeref_put()
-:14: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#14: 
References: a0855d24fc22d ("locking/mutex: Complain upon mutex API misuse in IRQ contexts")

total: 0 errors, 1 warnings, 0 checks, 219 lines checked
720e78f98141 drm/i915/gt: Declare timeline.lock to be irq-free
ab329f78df97 drm/i915/selftests: Force bonded submission to overlap
c80055e3a42a drm/i915/selftests: Flush the active callbacks
3afae1117a83 drm/i915/gt: Schedule request retirement when timeline idles
-:29: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")'
#29: 
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")

-:30: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")'
#30: 
References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")

total: 2 errors, 0 warnings, 0 checks, 190 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
@ 2019-11-20 13:51   ` Patchwork
  0 siblings, 0 replies; 73+ messages in thread
From: Patchwork @ 2019-11-20 13:51 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
URL   : https://patchwork.freedesktop.org/series/69724/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
66d127012d3d drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
12939c87c717 drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
ab3ac0419a9d drm/i915: Mark up the calling context for intel_wakeref_put()
-:14: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#14: 
References: a0855d24fc22d ("locking/mutex: Complain upon mutex API misuse in IRQ contexts")

total: 0 errors, 1 warnings, 0 checks, 219 lines checked
720e78f98141 drm/i915/gt: Declare timeline.lock to be irq-free
ab329f78df97 drm/i915/selftests: Force bonded submission to overlap
c80055e3a42a drm/i915/selftests: Flush the active callbacks
3afae1117a83 drm/i915/gt: Schedule request retirement when timeline idles
-:29: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")'
#29: 
References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")

-:30: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")'
#30: 
References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")

total: 2 errors, 0 warnings, 0 checks, 190 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20 14:16         ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-20 14:16 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 13:39, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
>>
>> On 20/11/2019 09:33, Chris Wilson wrote:
>>> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
>>> corruption WA") is that it disables RC6 while Skylake (and friends) is
>>> active, and we do not consider the GPU idle until all outstanding
>>> requests have been retired and the engine switched over to the kernel
>>> context. If userspace is idle, this task falls onto our background idle
>>> worker, which only runs roughly once a second, meaning that userspace has
>>> to have been idle for a couple of seconds before we enable RC6 again.
>>> Naturally, this causes us to consume considerably more energy than
>>> before as powersaving is effectively disabled while a display server
>>> (here's looking at you Xorg) is running.
>>>
>>> As execlists will get a completion event as each context is completed,
>>> we can use this interrupt to queue a retire worker bound to this engine
>>> to cleanup idle timelines. We will then immediately notice the idle
>>> engine (without userspace intervention or the aid of the background
>>> retire worker) and start parking the GPU. Thus during light workloads,
>>> we will do much more work to idle the GPU faster...  Hopefully with
>>> commensurate power saving!
>>>
>>> v2: Watch context completions and only look at those local to the engine
>>> when retiring to reduce the amount of excess work we perform.
>>>
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
>>> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
>>> References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  8 +-
>>>    drivers/gpu/drm/i915/gt/intel_engine_types.h  |  8 ++
>>>    drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 74 +++++++++++++++++++
>>>    drivers/gpu/drm/i915/gt/intel_gt_requests.h   | 17 ++++-
>>>    drivers/gpu/drm/i915/gt/intel_lrc.c           |  9 +++
>>>    drivers/gpu/drm/i915/gt/intel_timeline.c      |  1 +
>>>    .../gpu/drm/i915/gt/intel_timeline_types.h    |  3 +
>>>    7 files changed, 116 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> index b9613d044393..8f6e353caa66 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>>> @@ -28,13 +28,13 @@
>>>    
>>>    #include "i915_drv.h"
>>>    
>>> -#include "gt/intel_gt.h"
>>> -
>>> +#include "intel_context.h"
>>>    #include "intel_engine.h"
>>>    #include "intel_engine_pm.h"
>>>    #include "intel_engine_pool.h"
>>>    #include "intel_engine_user.h"
>>> -#include "intel_context.h"
>>> +#include "intel_gt.h"
>>> +#include "intel_gt_requests.h"
>>>    #include "intel_lrc.h"
>>>    #include "intel_reset.h"
>>>    #include "intel_ring.h"
>>> @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
>>>        intel_engine_init_execlists(engine);
>>>        intel_engine_init_cmd_parser(engine);
>>>        intel_engine_init__pm(engine);
>>> +     intel_engine_init_retire(engine);
>>>    
>>>        intel_engine_pool_init(&engine->pool);
>>>    
>>> @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>>>    
>>>        cleanup_status_page(engine);
>>>    
>>> +     intel_engine_fini_retire(engine);
>>>        intel_engine_pool_fini(&engine->pool);
>>>        intel_engine_fini_breadcrumbs(engine);
>>>        intel_engine_cleanup_cmd_parser(engine);
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> index 758f0e8ec672..17f1f1441efc 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>>> @@ -451,6 +451,14 @@ struct intel_engine_cs {
>>>    
>>>        struct intel_engine_execlists execlists;
>>>    
>>> +     /*
>>> +      * Keep track of completed timelines on this engine for early
>>> +      * retirement with the goal of quickly enabling powersaving as
>>> +      * soon as the engine is idle.
>>> +      */
>>> +     struct intel_timeline *retire;
>>> +     struct work_struct retire_work;
>>> +
>>>        /* status_notifier: list of callbacks for context-switch changes */
>>>        struct atomic_notifier_head context_status_notifier;
>>>    
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> index 4dc3cbeb1b36..4a98fefdf915 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
>>> @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
>>>                intel_engine_flush_submission(engine);
>>>    }
>>>    
>>> +static void engine_retire(struct work_struct *work)
>>> +{
>>> +     struct intel_engine_cs *engine =
>>> +             container_of(work, typeof(*engine), retire_work);
>>> +     struct intel_timeline *tl = xchg(&engine->retire, NULL);
>>
>> Shouldn't this be atomic_xchg to avoid racing with add_retire?
>>
>>> +
>>> +     do {
>>> +             struct intel_timeline *next = xchg(&tl->retire, NULL);
>>
>> Here as well?
> 
> xchg() are always locked.
> 
> atomic_xchg() operates on atomic_t; xchg() works on any variable, like
> cmpxchg().
> 
>>> +
>>> +             /*
>>> +              * Our goal here is to retire _idle_ timelines as soon as
>>> +              * possible (as they are idle, we do not expect userspace
>>> +              * to be cleaning up anytime soon).
>>> +              *
>>> +              * If the tl->active_count is already zero, someone else
>>> +              * should have retired the timeline. Equally if the timeline
>>> +              * is currently locked, either it is being retired elsewhere
>>> +              * or about to be!
>>> +              */
>>> +             if (atomic_read(&tl->active_count) &&
>>> +                 mutex_trylock(&tl->mutex)) {
>>> +                     retire_requests(tl);
>>> +                     mutex_unlock(&tl->mutex);
>>> +             }
>>> +             intel_timeline_put(tl);
>>> +
>>> +             GEM_BUG_ON(!next);
>>> +             tl = ptr_mask_bits(next, 1);
>>
>> You sometimes expect engine->retire to contain 0x1?
> 
> Yes, imagine that we are submitting very fast such that we schedule_out
> the same context before the worker ran, we would then try to
> add_retire() the same timeline again. So I was using BIT(0) to tag an
> active element in the retirement list.

If we have a schedule_out on the same context before the retire worker ran
then add_retire is a no-op due to tl->retire being set.

It could be a race between engine_retire and add_retire where
engine->retire is set to NULL, so the latter would set tl->retire to
NULL | BIT(0).

But BIT(0) is only filtered out and not acted upon anywhere so I am 
still lost.

So maybe from the opposite angle, what goes wrong if you drop all BIT(0) 
business?

> 
>>> +     } while (tl);
>>> +}
>>> +
>>> +static bool add_retire(struct intel_engine_cs *engine,
>>> +                    struct intel_timeline *tl)
>>> +{
>>> +     struct intel_timeline *first = READ_ONCE(engine->retire);
>>> +
>>> +     /*
>>> +      * We open-code a llist here to include the additional tag [BIT(0)]
>>> +      * so that we know when the timeline is already on a
>>> +      * retirement queue: either this engine or another.
>>> +      *
>>> +      * However, we rely on that a timeline can only be active on a single
>>> +      * engine at any one time and that add_retire() is called before the
>>> +      * engine releases the timeline and transferred to another to retire.
>>> +      */
>>> +
>>> +     if (READ_ONCE(tl->retire)) /* already queued */
>>> +             return false;
>>
>> Can't this go first in the function?
> 
> Conceptually it is. And I made it so because I also decided against
> having the READ_ONCE() at the top.

But READ_ONCE is at the top, just a different one which is discarded if 
this one is NULL.

> 
>>> +
>>> +     intel_timeline_get(tl);
>>> +     do
>>> +             tl->retire = ptr_pack_bits(first, 1, 1);
>>
>> Here you rely on assignment being atomic right?
> 
> Ish. Here we rely on the timeline being owned by the engine so it cannot
> be submitted by another (and so schedule_out called) until this engine
> has released it.
> 
> It is a weak point for generality, but the ordering is strong in
> execlists.
> 
>>> +     while (!try_cmpxchg(&engine->retire, &first, tl));
>>
>> So the loop is effectively creating a chain of timelines to retire on
>> this engine.
>>
>> What happens with virtual engines when a timeline goes to different
>> engine before (well or any single timeline context) the retire worker
>> runs? Ah okay, it gets re-assigned to the most recent engine.
> 
> Right. The engine_retire() doesn't care which engine the timelines were
> run on, it's just a list of suspected idle timelines.
> 
>> I am not sure about the BIT(0) business. It's always set on write so I
>> am not getting why it is useful.
> 
> It's also set to 0 on consumption :)

Another thing - asserting that engine->retire is NULL after flush_work in 
intel_engine_fini_retire could be useful.
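
Something like this minimal sketch, assuming the fini helper only needs to
flush the worker (illustrative only, not lifted from the patch):

void intel_engine_fini_retire(struct intel_engine_cs *engine)
{
	flush_work(&engine->retire_work);

	/* the retire list must have been fully consumed by the worker */
	GEM_BUG_ON(engine->retire);
}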

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
@ 2019-11-20 14:19   ` Patchwork
  0 siblings, 0 replies; 73+ messages in thread
From: Patchwork @ 2019-11-20 14:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
URL   : https://patchwork.freedesktop.org/series/69724/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_7384 -> Patchwork_15342
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_15342 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_15342, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_15342:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live_gt_heartbeat:
    - fi-kbl-soraka:      [PASS][1] -> [INCOMPLETE][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7384/fi-kbl-soraka/igt@i915_selftest@live_gt_heartbeat.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-kbl-soraka/igt@i915_selftest@live_gt_heartbeat.html

  * igt@runner@aborted:
    - fi-kbl-soraka:      NOTRUN -> [FAIL][3]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-kbl-soraka/igt@runner@aborted.html

  
Known issues
------------

  Here are the changes found in Patchwork_15342 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live_execlists:
    - fi-icl-u3:          [PASS][4] -> [INCOMPLETE][5] ([fdo#107713])
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7384/fi-icl-u3/igt@i915_selftest@live_execlists.html
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-icl-u3/igt@i915_selftest@live_execlists.html

  * igt@i915_selftest@live_gem_contexts:
    - fi-skl-lmem:        [PASS][6] -> [INCOMPLETE][7] ([fdo#111700])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7384/fi-skl-lmem/igt@i915_selftest@live_gem_contexts.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-skl-lmem/igt@i915_selftest@live_gem_contexts.html

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-kbl-7500u:       [PASS][8] -> [FAIL][9] ([fdo#111045] / [fdo#111096])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7384/fi-kbl-7500u/igt@kms_chamelium@hdmi-hpd-fast.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-kbl-7500u/igt@kms_chamelium@hdmi-hpd-fast.html

  
  [fdo#107713]: https://bugs.freedesktop.org/show_bug.cgi?id=107713
  [fdo#111045]: https://bugs.freedesktop.org/show_bug.cgi?id=111045
  [fdo#111096]: https://bugs.freedesktop.org/show_bug.cgi?id=111096
  [fdo#111700]: https://bugs.freedesktop.org/show_bug.cgi?id=111700


Participating hosts (49 -> 45)
------------------------------

  Additional (1): fi-whl-u 
  Missing    (5): fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * CI: CI-20190529 -> None
  * Linux: CI_DRM_7384 -> Patchwork_15342

  CI-20190529: 20190529
  CI_DRM_7384: b148b4279057be545e5a3510b118be13a0446db5 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5298: a19df6f52812517a4a84e6e630506512575b10da @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_15342: 3afae1117a83c4103e0078f13feaf2c50ec722f5 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

3afae1117a83 drm/i915/gt: Schedule request retirement when timeline idles
c80055e3a42a drm/i915/selftests: Flush the active callbacks
ab329f78df97 drm/i915/selftests: Force bonded submission to overlap
720e78f98141 drm/i915/gt: Declare timeline.lock to be irq-free
ab3ac0419a9d drm/i915: Mark up the calling context for intel_wakeref_put()
12939c87c717 drm/i915/gt: Unlock engine-pm after queuing the kernel context switch
66d127012d3d drm/i915/gt: Close race between engine_park and intel_gt_retire_requests

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: ✗ Fi.CI.BAT: failure for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
@ 2019-11-20 14:20     ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 14:20 UTC (permalink / raw)
  To: Patchwork; +Cc: intel-gfx

Quoting Patchwork (2019-11-20 14:19:15)
> == Series Details ==
> 
> Series: series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2)
> URL   : https://patchwork.freedesktop.org/series/69724/
> State : failure
> 
> == Summary ==
> 
> CI Bug Log - changes from CI_DRM_7384 -> Patchwork_15342
> ====================================================
> 
> Summary
> -------
> 
>   **FAILURE**
> 
>   Serious unknown changes coming with Patchwork_15342 absolutely need to be
>   verified manually.
>   
>   If you think the reported changes have nothing to do with the changes
>   introduced in Patchwork_15342, please notify your bug team to allow them
>   to document this new failure mode, which will reduce false positives in CI.
> 
>   External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/index.html
> 
> Possible new issues
> -------------------
> 
>   Here are the unknown changes that may have been introduced in Patchwork_15342:
> 
> ### IGT changes ###
> 
> #### Possible regressions ####
> 
>   * igt@i915_selftest@live_gt_heartbeat:
>     - fi-kbl-soraka:      [PASS][1] -> [INCOMPLETE][2]
>    [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7384/fi-kbl-soraka/igt@i915_selftest@live_gt_heartbeat.html
>    [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-kbl-soraka/igt@i915_selftest@live_gt_heartbeat.html
> 
>   * igt@runner@aborted:
>     - fi-kbl-soraka:      NOTRUN -> [FAIL][3]
>    [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_15342/fi-kbl-soraka/igt@runner@aborted.html

Here's where gitlab's /tableflip would be so handy.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
@ 2019-11-20 14:25           ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20 14:25 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-20 14:16:17)
> 
> On 20/11/2019 13:39, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 13:16:51)
> >>
> >> On 20/11/2019 09:33, Chris Wilson wrote:
> >>> +
> >>> +             /*
> >>> +              * Our goal here is to retire _idle_ timelines as soon as
> >>> +              * possible (as they are idle, we do not expect userspace
> >>> +              * to be cleaning up anytime soon).
> >>> +              *
> >>> +              * If the tl->active_count is already zero, someone else
> >>> +              * should have retired the timeline. Equally if the timeline
> >>> +              * is currently locked, either it is being retired elsewhere
> >>> +              * or about to be!
> >>> +              */
> >>> +             if (atomic_read(&tl->active_count) &&
> >>> +                 mutex_trylock(&tl->mutex)) {
> >>> +                     retire_requests(tl);
> >>> +                     mutex_unlock(&tl->mutex);
> >>> +             }
> >>> +             intel_timeline_put(tl);
> >>> +
> >>> +             GEM_BUG_ON(!next);
> >>> +             tl = ptr_mask_bits(next, 1);
> >>
> >> You sometimes expect engine->retire to contain 0x1?
> > 
> > Yes, imagine that we are submitting very fast such that we schedule_out
> > the same context before the worker ran, we would then try to
> > add_retire() the same timeline again. So I was using BIT(0) to tag an
> > active element in the retirement list.
> 
> If we have schedule_out on the same context before the retire ran then 
> add_retire is a no-op due tl->retire being set.

The first element in the list has timeline->retire = NULL. We only know
it is set because we make it timeline->retire = 1 instead.
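
As an illustration of the encoding (a sketch using the same
ptr_pack_bits()/ptr_mask_bits() helpers as the patch):

	/*
	 * tl->retire == NULL          : tl is not on any retire list
	 * tl->retire == BIT(0)        : tl is queued, tail of the list
	 * tl->retire == next | BIT(0) : tl is queued, next element follows
	 */
	tl->retire = ptr_pack_bits(first, 1, 1);  /* queue tl, even when first == NULL */
	...
	next = xchg(&tl->retire, NULL);           /* consume, clearing the queued tag */
	tl = ptr_mask_bits(next, 1);              /* strip BIT(0) to recover the pointer */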

> It could be a race between engine_retire and add_retire where 
> engine->retire is set to NULL so latter would set tl->retire to NULL | 
> BIT(0).
> 
> But BIT(0) is only filtered out and not acted upon anywhere so I am 
> still lost.
> 
> So maybe from the opposite angle, what goes wrong if you drop all BIT(0) 
> business?

You insert the element into the list twice causing a loop. Now that loop
is short lived as we clear timeline->retire on processing, but we may
still end up dropping a reference twice.
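
A sketch of the interleaving with BIT(0) dropped (illustrative only):

	/*
	 * 1. add_retire(tl): engine->retire = tl, tl->retire = NULL (tail)
	 * 2. schedule_out again before the worker has run
	 * 3. add_retire(tl): tl->retire == NULL reads as "not queued",
	 *    first == tl, so tl->retire = tl -- a self-loop
	 * 4. engine_retire() then visits tl twice and puts its reference twice
	 */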

> 
> > 
> >>> +     } while (tl);
> >>> +}
> >>> +
> >>> +static bool add_retire(struct intel_engine_cs *engine,
> >>> +                    struct intel_timeline *tl)
> >>> +{
> >>> +     struct intel_timeline *first = READ_ONCE(engine->retire);
> >>> +
> >>> +     /*
> >>> +      * We open-code a llist here to include the additional tag [BIT(0)]
> >>> +      * so that we know when the timeline is already on a
> >>> +      * retirement queue: either this engine or another.
> >>> +      *
> >>> +      * However, we rely on that a timeline can only be active on a single
> >>> +      * engine at any one time and that add_retire() is called before the
> >>> +      * engine releases the timeline and transferred to another to retire.
> >>> +      */
> >>> +
> >>> +     if (READ_ONCE(tl->retire)) /* already queued */
> >>> +             return false;
> >>
> >> Can't this go first in the function?
> > 
> > Conceptually it is. And I made it so because I also decided against
> > having the READ_ONCE() at the top.
> 
> But READ_ONCE is at the top, just a different one which is discarded if 
> this one is NULL.

static bool add_retire(struct intel_engine_cs *engine,
                       struct intel_timeline *tl)
{
        struct intel_timeline *first;

        /*
         * We open-code a llist here to include the additional tag [BIT(0)]
         * so that we know when the timeline is already on a
         * retirement queue: either this engine or another.
         *
         * However, we rely on that a timeline can only be active on a single
         * engine at any one time and that add_retire() is called before the
         * engine releases the timeline and transferred to another to retire.
         */

        if (READ_ONCE(tl->retire)) /* already queued */
                return false;

        if (!atomic_read(&tl->active_count)) /* already retired */
                return false;

        intel_timeline_get(tl);
        first = READ_ONCE(engine->retire);
        do
                tl->retire = ptr_pack_bits(first, 1, 1);
        while (!try_cmpxchg(&engine->retire, &first, tl));

        return !first;
}

is the current incarnation.

> >>> +
> >>> +     intel_timeline_get(tl);
> >>> +     do
> >>> +             tl->retire = ptr_pack_bits(first, 1, 1);
> >>
> >> Here you rely on assignment being atomic right?
> > 
> > Ish. Here we rely on the timeline being owned by the engine so it cannot
> > be submitted by another (and so schedule_out called) until this engine
> > has released it.
> > 
> > It is a weak point for generality, but the ordering is strong in
> > execlists.
> > 
> >>> +     while (!try_cmpxchg(&engine->retire, &first, tl));
> >>
> >> So the loop is effectively creating a chain of timelines to retire on
> >> this engine.
> >>
> >> What happens with virtual engines when a timeline goes to different
> >> engine before (well or any single timeline context) the retire worker
> >> runs? Ah okay, it gets re-assigned to the most recent engine.
> > 
> > Right. The engine_retire() doesn't care which engine the timelines were
> > run on, it's just a list of suspected idle timelines.
> > 
> >> I am not sure about the BIT(0) business. It's always set on write so I
> >> am not getting why it is useful.
> > 
> > It's also set to 0 on consumption :)
> 
> Another thing - assert that engine->retire is NULL after flush_work in 
> intel_engine_fini_retire could be useful.

Yup, wilco.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-22  9:34             ` Tvrtko Ursulin
  0 siblings, 0 replies; 73+ messages in thread
From: Tvrtko Ursulin @ 2019-11-22  9:34 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 20/11/2019 13:29, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-11-20 13:18:27)
>>
>> On 20/11/2019 12:59, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
>>>>
>>>> On 20/11/2019 09:32, Chris Wilson wrote:
>>>>> Bonded request submission is designed to allow requests to execute in
>>>>> parallel as laid out by the user. If the master request is already
>>>>> finished before its bonded pair is submitted, the pair were not destined
>>>>> to run in parallel and we lose the information about the master engine
>>>>> to dictate selection of the secondary. If the second request was
>>>>> required to be run on a particular engine in a virtual set, that should
>>>>> have been specified, rather than left to the whims of a random
>>>>> unconnected requests!
>>>>>
>>>>> In the selftest, I made the mistake of not ensuring the master would
>>>>> overlap with its bonded pairs, meaning that it could indeed complete
>>>>> before we submitted the bonds. Those bonds were then free to select any
>>>>> available engine in their virtual set, and not the one expected by the
>>>>> test.
>>>>
>>>> There is a submit await which ensures master is not runnable before
>>>> bonded pairs are submitted. Why was that not enough? Are the sporadic
>>>> test failures?
>>>
>>> One test is using the submit_await, the other does not. It takes the
>>> background retire worker to run as we are submitting the secondaries...
>>> But I have not noticed this failure before hooking up retirement to
>>> process_csb. However, the issue is definitely present in the current
>>> test.
>>
>> So what happens? Is this another issue limited to selftests? Because I
>> don't see that uAPI itself can't be used in this way.
> 
> Since the master batch is already completed & signaled by the time we
> submit the secondaries, the submit-fence is a dud and the secondaries
> are not constrained in their engine selection.
> 
> i915_request_await_execution:
> 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
> 		continue;
> 	else
> 		__i915_request_await_execution()
> 
> Now, our choice is either to drop the check on the signaled bit (and so
> we will apply the bonding constrained from the already finished batch)
> or not. Given that the master is already complete, I feel justified in
> the current decision to ignore the constraint (since equally the fence
> could already have been retired and so completely inaccessible), so chose
> to fix the test instead.

Yes, I agree it sounds okay to skip/ignore the constraint. But what this 
test was doing before also seems like a valid test, since it exercises a 
slightly different code path, or at least a different set of conditions.

What do you think? Would it be hard to add this as a 3rd flavour? Maybe 
just a new flag, and then allow the spinner to finish as soon as it is 
created to keep the existing flow?
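
Something along these lines, purely illustrative (BOND_COMPLETE is a name I
am making up here):

	#define BOND_COMPLETE BIT(1) /* hypothetical third flavour */

	rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
	...
	if (flags & BOND_COMPLETE)
		igt_spinner_end(&spin); /* master completes before the bonds are submitted */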

Regards,

Tvrtko


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* Re: [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-22 10:03               ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-22 10:03 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-11-22 09:34:34)
> 
> On 20/11/2019 13:29, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-11-20 13:18:27)
> >>
> >> On 20/11/2019 12:59, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2019-11-20 12:55:42)
> >>>>
> >>>> On 20/11/2019 09:32, Chris Wilson wrote:
> >>>>> Bonded request submission is designed to allow requests to execute in
> >>>>> parallel as laid out by the user. If the master request is already
> >>>>> finished before its bonded pair is submitted, the pair were not destined
> >>>>> to run in parallel and we lose the information about the master engine
> >>>>> to dictate selection of the secondary. If the second request was
> >>>>> required to be run on a particular engine in a virtual set, that should
> >>>>> have been specified, rather than left to the whims of a random
> >>>>> unconnected requests!
> >>>>>
> >>>>> In the selftest, I made the mistake of not ensuring the master would
> >>>>> overlap with its bonded pairs, meaning that it could indeed complete
> >>>>> before we submitted the bonds. Those bonds were then free to select any
> >>>>> available engine in their virtual set, and not the one expected by the
> >>>>> test.
> >>>>
> >>>> There is a submit await which ensures master is not runnable before
> >>>> bonded pairs are submitted. Why was that not enough? Are the sporadic
> >>>> test failures?
> >>>
> >>> One test is using the submit_await, the other does not. It takes the
> >>> background retire worker to run as we are submitting the secondaries...
> >>> But I have not noticed this failure before hooking up retirement to
> >>> process_csb. However, the issue is definitely present in the current
> >>> test.
> >>
> >> So what happens? Is this another issue limited to selftests? Because I
> >> don't see that uAPI itself can't be used in this way.
> > 
> > Since the master batch is already completed & signaled by the time we
> > submit the secondaries, the submit-fence is a dud and the secondaries
> > are not constrained in their engine selection.
> > 
> > i915_request_await_execution:
> >       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
> >               continue;
> >       else
> >               __i915_request_await_execution()
> > 
> > Now, our choice is either to drop the check on the signaled bit (and so
> > we will apply the bonding constrained from the already finished batch)
> > or not. Given that the master is already complete, I feel justified in
> > the current decision to ignore the constraint (since equally the fence
> > could already have been retired and so completely inaccessible), so chose
> > to fix the test instead.
> 
> Yes I agree it sounds okay to skip/ignore the constraint. But also seems 
> a valid test what this test was doing before since it exercises a 
> slightly different code path, or at least set of conditions.

What's the verification step? If we submit a bonded pair without a
submit-fence or a completed submit-fence, it's free to run on either?

We're just testing that it degenerates into a normal submit.

	create stub (completed) fence [on bonded engines]
	rq = i915_request_create();
	i915_request_await_execution(rq, stub);
	if (fence_exists(rq, stub)) {
		FAIL;
	}
?

It doesn't even have to be a bonded setup.

> What do you think? Would it be hard to add this as 3rd flavour? Maybe 
> just a new flag and then allow spinner to finish as soon as is created 
> to keep the existing flow?

I'm a little worried that maybe it's enshrining an implementation detail
without a thorough userspace use case. I'm half waiting for somebody to
ask and then being able to determine what is the best approach here with
somebody that has an actual example and anticipated behaviour.

E.g. maybe we should make it a uAPI error? Although we can not
completely detect all signaled fences prior to submission -- as in some
cases the secondary engine may have to be delayed until it has a slot
(back to the nightmare of pipeline bubbles and whether we should stall
the bonded engines until the entire set is ready).

Maybe we should be keeping the bonding details around for as long as the
submit-fence exists and apply them even if the master is already
completed. I'm torn as to whether or not that is the better idea -- the
argument that one cannot tell between a completed fence and a
non-existent fence is compelling to me (given the implementation of
fences), but I can see how that would cause confusion to the user.


I'll try and summarise the discussion here and add that to the test.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* [PATCH] drm/i915/selftests: Force bonded submission to overlap
@ 2019-11-22 10:43     ` Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-22 10:43 UTC (permalink / raw)
  To: intel-gfx

Bonded request submission is designed to allow requests to execute in
parallel as laid out by the user. If the master request is already
finished before its bonded pair is submitted, the pair were not destined
to run in parallel and we lose the information about the master engine
to dictate selection of the secondary. If the second request was
required to be run on a particular engine in a virtual set, that should
have been specified, rather than left to the whims of random
unconnected requests!

In the selftest, I made the mistake of not ensuring the master would
overlap with its bonded pairs, meaning that it could indeed complete
before we submitted the bonds. Those bonds were then free to select any
available engine in their virtual set, and not the one expected by the
test.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 62 ++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 2baeedd5953f..e0ea930bee19 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -3081,15 +3081,60 @@ static int bond_virtual_engine(struct intel_gt *gt,
 	struct i915_gem_context *ctx;
 	struct i915_request *rq[16];
 	enum intel_engine_id id;
+	struct igt_spinner spin;
 	unsigned long n;
 	int err;
 
+	/*
+	 * A set of bonded requests is intended to be run concurrently
+	 * across a number of engines. We use one request per-engine
+	 * and a magic fence to schedule each of the bonded requests
+	 * at the same time. A consequence of our current scheduler
+	 * is that we only move requests to the HW ready queue when
+	 * the request becomes ready, that is when all of its prerequisite
+	 * fences have been signaled. As one of those fences is the master
+	 * submit fence, there is a delay on all secondary fences as the
+	 * HW may be currently busy. Equally, as all the requests are
+	 * independent, they may have other fences that delay individual
+	 * request submission to HW. Ergo, we do not guarantee that
+	 * all requests are immediately submitted to HW at the same time,
+	 * just that if the rules are abided by, they are ready at the
+	 * same time as the first is submitted. Userspace can embed semaphores
+	 * in its batch to ensure parallel execution of phases as it requires.
+	 * Though naturally it gets requested that perhaps the scheduler should
+	 * take care of parallel execution, even across preemption events
+	 * on different HW. (The proper answer is of course "lalalala".)
+	 *
+	 * With the submit-fence, we have identified three possible phases
+	 * of synchronisation depending on the master fence: queued (not
+	 * ready), ready or executing, signaled. The first two are quite
+	 * simple and checked below. However, the signaled master fence
+	 * handling is contentious. Currently we do not distinguish between
+	 * a signaled fence and an expired fence, as once signaled it does
+	 * not convey any information about the previous execution; it may
+	 * be freed, so when checked later it may not exist at all. Ergo
+	 * we currently do not apply the bonding constraint for an already
+	 * signaled fence, as our expectation is that it should not constrain
+	 * the secondaries and is outside of the scope of the bonded request
+	 * API (i.e. all requests are meant to be running in parallel). As
+	 * it imposes no constraint, and is effectively a no-op, we do not
+	 * check below as normal execution flows are checked extensively above.
+	 *
+	 * XXX Is the degenerate handling of signaled submit fences the
+	 * expected behaviour for userspace?
+	 */
+
 	GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1);
 
-	ctx = kernel_context(gt->i915);
-	if (!ctx)
+	if (igt_spinner_init(&spin, gt))
 		return -ENOMEM;
 
+	ctx = kernel_context(gt->i915);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto err_spin;
+	}
+
 	err = 0;
 	rq[0] = ERR_PTR(-ENOMEM);
 	for_each_engine(master, gt, id) {
@@ -3100,7 +3145,7 @@ static int bond_virtual_engine(struct intel_gt *gt,
 
 		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
 
-		rq[0] = igt_request_alloc(ctx, master);
+		rq[0] = spinner_create_request(&spin, ctx, master, MI_NOOP);
 		if (IS_ERR(rq[0])) {
 			err = PTR_ERR(rq[0]);
 			goto out;
@@ -3113,10 +3158,17 @@ static int bond_virtual_engine(struct intel_gt *gt,
 							       &fence,
 							       GFP_KERNEL);
 		}
+
 		i915_request_add(rq[0]);
 		if (err < 0)
 			goto out;
 
+		if (!(flags & BOND_SCHEDULE) &&
+		    !igt_wait_for_spinner(&spin, rq[0])) {
+			err = -EIO;
+			goto out;
+		}
+
 		for (n = 0; n < nsibling; n++) {
 			struct intel_context *ve;
 
@@ -3164,6 +3216,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 			}
 		}
 		onstack_fence_fini(&fence);
+		intel_engine_flush_submission(master);
+		igt_spinner_end(&spin);
 
 		if (i915_request_wait(rq[0], 0, HZ / 10) < 0) {
 			pr_err("Master request did not execute (on %s)!\n",
@@ -3201,6 +3255,8 @@ static int bond_virtual_engine(struct intel_gt *gt,
 		err = -EIO;
 
 	kernel_context_close(ctx);
+err_spin:
+	igt_spinner_fini(&spin);
 	return err;
 }
 
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

* ✗ Fi.CI.BUILD: failure for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev3)
@ 2019-11-22 12:33   ` Patchwork
  0 siblings, 0 replies; 73+ messages in thread
From: Patchwork @ 2019-11-22 12:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev3)
URL   : https://patchwork.freedesktop.org/series/69724/
State : failure

== Summary ==

Applying: drm/i915/selftests: Take a ref to the request we wait upon
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
No changes -- Patch already applied.
Applying: drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/gt/intel_gt_requests.c
M	drivers/gpu/drm/i915/gt/intel_timeline.c
M	drivers/gpu/drm/i915/gt/intel_timeline_types.h
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/gt/intel_timeline.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/gt/intel_timeline.c
Auto-merging drivers/gpu/drm/i915/gt/intel_gt_requests.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/gt/intel_gt_requests.c
error: Failed to merge in the changes.
hint: Use 'git am --show-current-patch' to see the failed patch
Patch failed at 0002 drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 73+ messages in thread

* [PATCH] drm/i915/selftests: Take a ref to the request we wait upon
@ 2019-11-20  8:57 Chris Wilson
  0 siblings, 0 replies; 73+ messages in thread
From: Chris Wilson @ 2019-11-20  8:57 UTC (permalink / raw)
  To: intel-gfx

i915_request_add() consumes the passed in reference to the i915_request,
so if the selftest caller wishes to wait upon it afterwards, it needs to
take a reference for itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../drm/i915/gem/selftests/i915_gem_context.c | 39 +++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 9a509c18b7c7..16df814f3efd 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -73,25 +73,34 @@ static int live_nop_switch(void *arg)
 	}
 
 	for_each_uabi_engine(engine, i915) {
-		struct i915_request *rq;
+		struct i915_request *rq = NULL;
 		unsigned long end_time, prime;
 		ktime_t times[2] = {};
 
 		times[0] = ktime_get_raw();
 		for (n = 0; n < nctx; n++) {
-			rq = igt_request_alloc(ctx[n], engine);
-			if (IS_ERR(rq)) {
-				err = PTR_ERR(rq);
+			struct i915_request *this;
+
+			this = igt_request_alloc(ctx[n], engine);
+			if (IS_ERR(this)) {
+				err = PTR_ERR(this);
 				goto out_file;
 			}
-			i915_request_add(rq);
+			if (rq) {
+				i915_request_await_dma_fence(this, &rq->fence);
+				i915_request_put(rq);
+			}
+			rq = i915_request_get(this);
+			i915_request_add(this);
 		}
 		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 			pr_err("Failed to populated %d contexts\n", nctx);
 			intel_gt_set_wedged(&i915->gt);
+			i915_request_put(rq);
 			err = -EIO;
 			goto out_file;
 		}
+		i915_request_put(rq);
 
 		times[1] = ktime_get_raw();
 
@@ -106,13 +115,22 @@ static int live_nop_switch(void *arg)
 		for_each_prime_number_from(prime, 2, 8192) {
 			times[1] = ktime_get_raw();
 
+			rq = NULL;
 			for (n = 0; n < prime; n++) {
-				rq = igt_request_alloc(ctx[n % nctx], engine);
-				if (IS_ERR(rq)) {
-					err = PTR_ERR(rq);
+				struct i915_request *this;
+
+				this = igt_request_alloc(ctx[n % nctx], engine);
+				if (IS_ERR(this)) {
+					err = PTR_ERR(this);
 					goto out_file;
 				}
 
+				if (rq) { /* Force submission order */
+					i915_request_await_dma_fence(this, &rq->fence);
+					i915_request_put(rq);
+				}
+				rq = i915_request_get(this);
+
 				/*
 				 * This space is left intentionally blank.
 				 *
@@ -127,14 +144,17 @@ static int live_nop_switch(void *arg)
 				 * for latency.
 				 */
 
-				i915_request_add(rq);
+				i915_request_add(this);
 			}
+			GEM_BUG_ON(!rq);
 			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 				pr_err("Switching between %ld contexts timed out\n",
 				       prime);
 				intel_gt_set_wedged(&i915->gt);
+				i915_request_put(rq);
 				break;
 			}
+			i915_request_put(rq);
 
 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 			if (prime == 2)
-- 
2.24.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 73+ messages in thread

end of thread, other threads:[~2019-11-22 12:33 UTC | newest]

Thread overview: 73+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-20  9:32 [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon Chris Wilson
2019-11-20  9:32 ` [Intel-gfx] " Chris Wilson
2019-11-20  9:32 ` [PATCH 2/9] drm/i915/gt: Close race between engine_park and intel_gt_retire_requests Chris Wilson
2019-11-20  9:32   ` [Intel-gfx] " Chris Wilson
2019-11-20 13:19   ` Tvrtko Ursulin
2019-11-20 13:19     ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20  9:32 ` [PATCH 3/9] drm/i915/gt: Unlock engine-pm after queuing the kernel context switch Chris Wilson
2019-11-20  9:32   ` [Intel-gfx] " Chris Wilson
2019-11-20 11:58   ` Tvrtko Ursulin
2019-11-20 11:58     ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 12:07     ` Chris Wilson
2019-11-20 12:07       ` [Intel-gfx] " Chris Wilson
2019-11-20 12:40       ` Tvrtko Ursulin
2019-11-20 12:40         ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 12:44         ` Chris Wilson
2019-11-20 12:44           ` [Intel-gfx] " Chris Wilson
2019-11-20 13:19           ` Tvrtko Ursulin
2019-11-20 13:19             ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20  9:32 ` [PATCH 4/9] drm/i915: Mark up the calling context for intel_wakeref_put() Chris Wilson
2019-11-20  9:32   ` [Intel-gfx] " Chris Wilson
2019-11-20 12:46   ` Tvrtko Ursulin
2019-11-20 12:46     ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20  9:32 ` [PATCH 5/9] drm/i915/gt: Declare timeline.lock to be irq-free Chris Wilson
2019-11-20  9:32   ` [Intel-gfx] " Chris Wilson
2019-11-20  9:32 ` [PATCH 6/9] drm/i915/selftests: Force bonded submission to overlap Chris Wilson
2019-11-20  9:32   ` [Intel-gfx] " Chris Wilson
2019-11-20 12:55   ` Tvrtko Ursulin
2019-11-20 12:55     ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 12:59     ` Chris Wilson
2019-11-20 12:59       ` [Intel-gfx] " Chris Wilson
2019-11-20 13:18       ` Tvrtko Ursulin
2019-11-20 13:18         ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 13:29         ` Chris Wilson
2019-11-20 13:29           ` [Intel-gfx] " Chris Wilson
2019-11-22  9:34           ` Tvrtko Ursulin
2019-11-22  9:34             ` [Intel-gfx] " Tvrtko Ursulin
2019-11-22 10:03             ` Chris Wilson
2019-11-22 10:03               ` [Intel-gfx] " Chris Wilson
2019-11-22 10:43   ` [PATCH] " Chris Wilson
2019-11-22 10:43     ` [Intel-gfx] " Chris Wilson
2019-11-20  9:33 ` [PATCH 7/9] drm/i915/selftests: Flush the active callbacks Chris Wilson
2019-11-20  9:33   ` [Intel-gfx] " Chris Wilson
2019-11-20  9:33 ` [PATCH 8/9] drm/i915/selftests: Be explicit in ERR_PTR handling Chris Wilson
2019-11-20  9:33   ` [Intel-gfx] " Chris Wilson
2019-11-20 10:23   ` Matthew Auld
2019-11-20 10:23     ` [Intel-gfx] " Matthew Auld
2019-11-20  9:33 ` [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles Chris Wilson
2019-11-20  9:33   ` [Intel-gfx] " Chris Wilson
2019-11-20 13:16   ` Tvrtko Ursulin
2019-11-20 13:16     ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 13:39     ` Chris Wilson
2019-11-20 13:39       ` [Intel-gfx] " Chris Wilson
2019-11-20 14:16       ` Tvrtko Ursulin
2019-11-20 14:16         ` [Intel-gfx] " Tvrtko Ursulin
2019-11-20 14:25         ` Chris Wilson
2019-11-20 14:25           ` [Intel-gfx] " Chris Wilson
2019-11-20 10:19 ` [PATCH 1/9] drm/i915/selftests: Take a ref to the request we wait upon Matthew Auld
2019-11-20 10:19   ` [Intel-gfx] " Matthew Auld
2019-11-20 10:25   ` Chris Wilson
2019-11-20 10:25     ` [Intel-gfx] " Chris Wilson
2019-11-20 10:27 ` [PATCH] " Chris Wilson
2019-11-20 10:27   ` [Intel-gfx] " Chris Wilson
2019-11-20 10:34   ` Matthew Auld
2019-11-20 10:34     ` [Intel-gfx] " Matthew Auld
2019-11-20 13:51 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev2) Patchwork
2019-11-20 13:51   ` [Intel-gfx] " Patchwork
2019-11-20 14:19 ` ✗ Fi.CI.BAT: failure " Patchwork
2019-11-20 14:19   ` [Intel-gfx] " Patchwork
2019-11-20 14:20   ` Chris Wilson
2019-11-20 14:20     ` [Intel-gfx] " Chris Wilson
2019-11-22 12:33 ` ✗ Fi.CI.BUILD: failure for series starting with drm/i915/selftests: Take a ref to the request we wait upon (rev3) Patchwork
2019-11-22 12:33   ` [Intel-gfx] " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2019-11-20  8:57 [PATCH] drm/i915/selftests: Take a ref to the request we wait upon Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.