From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Cc: "# v4 . 10+" <stable@vger.kernel.org>
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Keep all engine locks across scheduling
Date: Mon, 27 Mar 2017 11:11:47 +0100	[thread overview]
Message-ID: <05a2f5b7-f004-573d-6f22-50ce59d2e62c@linux.intel.com> (raw)
In-Reply-To: <20170326084637.13394-1-chris@chris-wilson.co.uk>


On 26/03/2017 09:46, Chris Wilson wrote:
> Unlocking is dangerous. In this case we combine an early update to the
> out-of-queue request, because we know that it will be inserted into the
> correct FIFO priority-ordered slot when it becomes ready in the future.
> However, given sufficient enthusiasm, it may become ready as we are
> continuing to reschedule, and so may gazump the FIFO if we have since
> dropped its spinlock. The result is that it may be executed too early,
> before its dependees.
>
> Fixes: 20311bd35060 ("drm/i915/scheduler: Execute requests in order of priorities")
> Testcase: igt/gem_exec_whisper
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: <stable@vger.kernel.org> # v4.10+
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 54 +++++++++++++++++++++++++++-------------
>  1 file changed, 37 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dd0e9d587852..3fdabba0a32d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -658,30 +658,47 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  }
>
> -static struct intel_engine_cs *
> -pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
> +static inline struct intel_engine_cs *
> +pt_lock_engine(struct i915_priotree *pt, unsigned long *locked)
>  {
> -	struct intel_engine_cs *engine;
> -
> -	engine = container_of(pt,
> -			      struct drm_i915_gem_request,
> -			      priotree)->engine;
> -	if (engine != locked) {
> -		if (locked)
> -			spin_unlock_irq(&locked->timeline->lock);
> -		spin_lock_irq(&engine->timeline->lock);
> -	}
> +	struct intel_engine_cs *engine =
> +		container_of(pt, struct drm_i915_gem_request, priotree)->engine;
> +
> +	/* Locking the engines in a random order will rightfully trigger a
> +	 * spasm in lockdep. However, we can ignore lockdep (by marking each
> +	 * as a separate nesting) so long as we never nest the
> +	 * engine->timeline->lock elsewhere. Also the number of nesting
> +	 * subclasses is severely limited (7) which is going to cause an
> +	 * issue at some point.
> +	 * BUILD_BUG_ON(I915_NUM_ENGINES >= MAX_LOCKDEP_SUBCLASSES);

Let's bite the bullet and not hide this BUILD_BUG_ON in a comment. :I
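
Something like this at the top of pt_lock_engine() perhaps (untested
sketch, and assuming MAX_LOCKDEP_SUBCLASSES from linux/lockdep.h is
visible here):

	/* spin_lock_nested() can only express a handful of subclasses
	 * (MAX_LOCKDEP_SUBCLASSES), so fail the build rather than
	 * silently confuse lockdep once we grow too many engines.
	 */
	BUILD_BUG_ON(I915_NUM_ENGINES >= MAX_LOCKDEP_SUBCLASSES);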

> +	 */
> +	if (!__test_and_set_bit(engine->id, locked))
> +		spin_lock_nested(&engine->timeline->lock,
> +				 hweight_long(*locked));
>
>  	return engine;
>  }
>
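
The hweight_long(*locked) subclass took me a second - since
__test_and_set_bit() sets the bit before the hweight, the subclass is
the count of engine locks held including this one, not the engine id.
A worked trace, assuming RCS=0, BCS=1, VCS=2 and that we happen to
lock in the order VCS, RCS, BCS:

	/* lock VCS (bit 2): locked 0b000 -> 0b100, subclass hweight(0b100) = 1 */
	/* lock RCS (bit 0): locked 0b100 -> 0b101, subclass hweight(0b101) = 2 */
	/* lock BCS (bit 1): locked 0b101 -> 0b111, subclass hweight(0b111) = 3 */
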
> +static void
> +unlock_engines(struct drm_i915_private *i915, unsigned long locked)
> +{
> +	struct intel_engine_cs *engine;
> +	unsigned long tmp;
> +
> +	for_each_engine_masked(engine, i915, locked, tmp)
> +		spin_unlock(&engine->timeline->lock);
> +}
> +
>  static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>  {
> -	struct intel_engine_cs *engine = NULL;
> +	struct intel_engine_cs *engine;
>  	struct i915_dependency *dep, *p;
>  	struct i915_dependency stack;
> +	unsigned long locked = 0;
>  	LIST_HEAD(dfs);
>
> +	BUILD_BUG_ON(I915_NUM_ENGINES > BITS_PER_LONG);
> +
>  	if (prio <= READ_ONCE(request->priotree.priority))
>  		return;
>
> @@ -691,6 +708,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>  	stack.signaler = &request->priotree;
>  	list_add(&stack.dfs_link, &dfs);
>
> +	GEM_BUG_ON(irqs_disabled());
> +	local_irq_disable();
> +

Why not just irqsave/restore? This seems too low-level for this 
position in the flow. If it is just an optimisation it needs a comment, 
I think.
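
I.e. something like (sketch only):

	unsigned long flags;

	local_irq_save(flags);
	/* ... the DFS walk, collecting engine locks into 'locked' ... */
	unlock_engines(request->i915, locked);
	local_irq_restore(flags);

That is correct regardless of the caller's irq state, for the cost of
one extra word on the stack.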

>  	/* Recursively bump all dependent priorities to match the new request.
>  	 *
>  	 * A naive approach would be to use recursion:
> @@ -719,7 +739,7 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>  		if (!RB_EMPTY_NODE(&pt->node))
>  			continue;
>
> -		engine = pt_lock_engine(pt, engine);
> +		engine = pt_lock_engine(pt, &locked);
>
>  		/* If it is not already in the rbtree, we can update the
>  		 * priority inplace and skip over it (and its dependencies)
> @@ -737,7 +757,7 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>
>  		INIT_LIST_HEAD(&dep->dfs_link);
>
> -		engine = pt_lock_engine(pt, engine);
> +		engine = pt_lock_engine(pt, &locked);
>
>  		if (prio <= pt->priority)
>  			continue;
> @@ -750,8 +770,8 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>  			engine->execlist_first = &pt->node;
>  	}
>
> -	if (engine)
> -		spin_unlock_irq(&engine->timeline->lock);
> +	unlock_engines(request->i915, locked);
> +	local_irq_enable();
>
>  	/* XXX Do we need to preempt to make room for us and our deps? */
>  }
>

I am trying to think whether removing the skip on requests not in the 
execution tree would work and help any, whether the above scheme is 
completely safe, or whether we would need to atomically lock all 
engines whose requests will be touched (see the sketch below). 
Especially since the code is only dealing with adjusting priorities, I 
don't immediately see how it can cause out-of-order execution.
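
If we did have to take them all atomically, the simple-minded version
would grab every engine lock up front in a fixed order before the walk
(untested sketch; _nested is still needed since all the timeline locks
share a lockdep class):

	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, request->i915, id)
		spin_lock_nested(&engine->timeline->lock, id);

More expensive for the common case, but the lock order would at least
be stable.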

Regards,

Tvrtko

