Re: [PATCH 2/2] drm/i915: Use rcu instead of stop_machine in set_wedged

From: Chris Wilson <chris@chris-wilson.co.uk>
To: Intel Graphics Development <intel-gfx@lists.freedesktop.org>
Cc: Daniel Vetter <daniel.vetter@intel.com>,
	Daniel Vetter <daniel.vetter@ffwll.ch>,
	Thomas Gleixner <tglx@linutronix.de>,
	Mika Kuoppala <mika.kuoppala@intel.com>
Subject: Re: [PATCH 2/2] drm/i915: Use rcu instead of stop_machine in set_wedged
Date: Tue, 10 Oct 2017 14:39:37 +0100	[thread overview]
Message-ID: <150764277728.797.17846869579061740704@mail.alporthouse.com> (raw)
In-Reply-To: <20171009164401.16035-2-daniel.vetter@ffwll.ch>

Style nits:

Quoting Daniel Vetter (2017-10-09 17:44:01)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 82a10036fb38..8d7d8d1f78db 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3057,49 +3057,71 @@ static void nop_submit_request(struct drm_i915_gem_request *request)
>  
>         spin_lock_irqsave(&request->engine->timeline->lock, flags);
>         __i915_gem_request_submit(request);
> -       intel_engine_init_global_seqno(request->engine, request->global_seqno);
>         spin_unlock_irqrestore(&request->engine->timeline->lock, flags);

This reduced to i915_gem_request_submit().

Hah, you can just engine->submit_request = i915_gem_request_submit, and
keep the nop_submit_request for the second phase! That may make the
diff neater?

> -static int __i915_gem_set_wedged_BKL(void *data)
> +void i915_gem_set_wedged(struct drm_i915_private *i915)
>  {
> -       struct drm_i915_private *i915 = data;
>         struct intel_engine_cs *engine;
>         enum intel_engine_id id;
>  
> +       /* First, stop submission to hw, but do not yet complete requests by
> +        * rolling the global seqno forward (since this would complete requests
> +        * for which we haven't set the fence error to EIO yet).
> +        */

I'm doing a quiet fixup of all my comments to follow
/*
 * The core convention, which you normally use anyway.
 */

>         for_each_engine(engine, i915, id)
> -               engine_set_wedged(engine);
> +               engine->submit_request = nop_submit_request;
>  
> -       set_bit(I915_WEDGED, &i915->gpu_error.flags);
> -       wake_up_all(&i915->gpu_error.reset_queue);
> +       /* Make sure no one is running the old callback before we proceed with
> +        * cancelling requests and resetting the completion tracking. Otherwise
> +        * we might submit a request to the hardware which never completes.
> +        */
> +       synchronize_rcu();
>  
> -       return 0;
> -}
> +       for_each_engine(engine, i915, id) {
> +               /* Mark all executing requests as skipped */
> +               engine->cancel_requests(engine);
>  
> -void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
> -{
> -       stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> +               /* Only once we've force-cancelled all in-flight requests can we
> +                * start to complete all requests.
> +                */
> +               engine->submit_request = nop_complete_submit_request;
> +       }
> +
> +       /* Make sure no request can slip through without getting completed by
> +        * either this call here to intel_engine_init_global_seqno, or the one
> +        * in nop_complete_submit_request.
> +        */
> +       synchronize_rcu();
> +
> +       for_each_engine(engine, i915, id) {
> +               unsigned long flags;
> +
> +               /* Mark all pending requests as complete so that any concurrent
> +                * (lockless) lookup doesn't try and wait upon the request as we
> +                * reset it.
> +                */
> +               spin_lock_irqsave(&engine->timeline->lock, flags);
> +               intel_engine_init_global_seqno(engine,
> +                                              intel_engine_last_submit(engine));
> +               spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +       }
> +
> +       set_bit(I915_WEDGED, &i915->gpu_error.flags);
> +       wake_up_all(&i915->gpu_error.reset_queue);
>  }
>  
>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index b100b38f1dd2..ef78a85cb845 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -556,7 +556,9 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>         switch (state) {
>         case FENCE_COMPLETE:
>                 trace_i915_gem_request_submit(request);
> +               rcu_read_lock();

This trick needs a comment.

/*
 * We need to serialize use of the submit_request() callback with its
 * hotplugging performed during an emergency i915_gem_set_wedged().
 * We use the RCU mechanism to mark the critical section in order to
 * force i915_gem_set_wedged() to wait until the submit_request() is
 * completed before proceeding.
 */
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx