Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()

From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
Date: Tue, 15 Jan 2019 13:56:11 +0200	[thread overview]
Message-ID: <878szmyw10.fsf@gaia.fi.intel.com> (raw)
In-Reply-To: <20190114210408.4561-2-chris@chris-wilson.co.uk>

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Make i915_gem_set_wedged() and i915_gem_unset_wedged() behaviour more
> consistently if called concurrently.

More is needed in here. Is the purpose to make them wait in turns
on top of the mutex, instead of racing on the bit? Where is
the inconsistency, though?

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c               | 32 ++++++++++++++-----
>  drivers/gpu/drm/i915/i915_gpu_error.h         |  4 ++-
>  .../gpu/drm/i915/selftests/mock_gem_device.c  |  1 +
>  3 files changed, 28 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 0bfed33178e1..910c49befc50 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3173,10 +3173,15 @@ static void nop_submit_request(struct i915_request *request)
>  
>  void i915_gem_set_wedged(struct drm_i915_private *i915)
>  {
> +	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> -	GEM_TRACE("start\n");
> +	mutex_lock(&error->wedge_mutex);
> +	if (test_bit(I915_WEDGED, &error->flags)) {
> +		mutex_unlock(&error->wedge_mutex);
> +		return;
> +	}
>  
>  	if (GEM_SHOW_DEBUG()) {
>  		struct drm_printer p = drm_debug_printer(__func__);
> @@ -3185,8 +3190,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  			intel_engine_dump(engine, &p, "%s\n", engine->name);
>  	}
>  
> -	if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
> -		goto out;
> +	GEM_TRACE("start\n");
>  
>  	/*
>  	 * First, stop submission to hw, but do not yet complete requests by
> @@ -3222,23 +3226,31 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  		intel_engine_wakeup(engine);
>  	}
>  
> -out:
> +	smp_mb__before_atomic();

I was thinking of what state you want to guard against as you
now hold the mutex for wedging. But the answer must be: any other
external state. Make everything visible before flipping the bit.

-Mika

> +	set_bit(I915_WEDGED, &error->flags);
> +
>  	GEM_TRACE("end\n");
> +	mutex_unlock(&error->wedge_mutex);
>  
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	wake_up_all(&error->reset_queue);
>  }
>  
>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  {
> +	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct i915_timeline *tl;
> +	bool ret = false;
>  
>  	lockdep_assert_held(&i915->drm.struct_mutex);
> -	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
> +
> +	if (!test_bit(I915_WEDGED, &error->flags))
>  		return true;
>  
>  	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
>  		return false;
>  
> +	mutex_lock(&error->wedge_mutex);
> +
>  	GEM_TRACE("start\n");
>  
>  	/*
> @@ -3272,7 +3284,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  		 */
>  		if (dma_fence_default_wait(&rq->fence, true,
>  					   MAX_SCHEDULE_TIMEOUT) < 0)
> -			return false;
> +			goto unlock;
>  	}
>  	i915_retire_requests(i915);
>  	GEM_BUG_ON(i915->gt.active_requests);
> @@ -3295,8 +3307,11 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  
>  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> +	ret = true;
> +unlock:
> +	mutex_unlock(&i915->gpu_error.wedge_mutex);
>  
> -	return true;
> +	return ret;
>  }
>  
>  static void
> @@ -5692,6 +5707,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>  			  i915_gem_idle_work_handler);
>  	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>  	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
> +	mutex_init(&dev_priv->gpu_error.wedge_mutex);
>  
>  	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 6d9f45468ac1..604291f7762d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -271,8 +271,8 @@ struct i915_gpu_error {
>  #define I915_RESET_BACKOFF	0
>  #define I915_RESET_HANDOFF	1
>  #define I915_RESET_MODESET	2
> +#define I915_RESET_ENGINE	3
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
> -#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
>  
>  	/** Number of times an engine has been reset */
>  	u32 reset_engine_count[I915_NUM_ENGINES];
> @@ -283,6 +283,8 @@ struct i915_gpu_error {
>  	/** Reason for the current *global* reset */
>  	const char *reason;
>  
> +	struct mutex wedge_mutex; /* serialises wedging/unwedging */
> +
>  	/**
>  	 * Waitqueue to signal when a hang is detected. Used to for waiters
>  	 * to release the struct_mutex for the reset to procede.
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 082809569681..3cda66292e76 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -188,6 +188,7 @@ struct drm_i915_private *mock_gem_device(void)
>  
>  	init_waitqueue_head(&i915->gpu_error.wait_queue);
>  	init_waitqueue_head(&i915->gpu_error.reset_queue);
> +	mutex_init(&i915->gpu_error.wedge_mutex);
>  
>  	i915->wq = alloc_ordered_workqueue("mock", 0);
>  	if (!i915->wq)
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx