All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
@ 2014-09-04 12:04 Chris Wilson
  2014-09-04 12:17 ` Jani Nikula
  2014-09-04 15:09 ` Chris Wilson
  0 siblings, 2 replies; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 12:04 UTC (permalink / raw)
  To: intel-gfx

When run as a timer, i915_hangcheck_elapsed() must adhere to all the
rules of running in a softirq context. This is advantageous to us as we
want to minimise the risk that a driver bug will prevent us from
detecting a hung GPU. However, that is irrelevant if the driver bug
prevents us from resetting and recovering. Still it is prudent not to
rely on mutexes inside the checker, but given the coarseness of
dev->struct_mutex doing so is extremely hard.

Give in and run from a work queue, i.e. outside of softirq.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_dma.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.h |  2 +-
 drivers/gpu/drm/i915/i915_gem.c |  2 +-
 drivers/gpu/drm/i915/i915_irq.c | 15 ++++++++-------
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6502ae2d7b7d..6a8e71cb2be8 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
 	}
 
 	/* Free error state after interrupts are fully disabled. */
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f7cc6a9c14fd..ea9224a977c1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
 		return ret;
 	}
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	intel_uncore_forcewake_reset(dev, false);
 	dev_priv->pm.suspended = true;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a920ca3789d8..e1f8ffcb2cf3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1209,7 +1209,7 @@ struct i915_gpu_error {
 	/* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
-	struct timer_list hangcheck_timer;
+	struct delayed_work hangcheck_work;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b37177afc3c0..3e80c777bf12 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
 							     DRIVER_MODESET);
 	mutex_unlock(&dev->struct_mutex);
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	flush_delayed_work(&dev_priv->mm.idle_work);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2ade9efe078c..5fe9cdb323b3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -3189,9 +3189,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
  * we kick the ring. If we see no progress on three subsequent calls
  * we assume chip is wedged and try to fix it by resetting the chip.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
+static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv),
+			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	int i;
 	int busy_count = 0, rings_hung = 0;
@@ -3312,8 +3314,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
 	if (!i915_module.enable_hangcheck)
 		return;
 
-	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
+			      round_jiffies_up(DRM_I915_HANGCHECK_JIFFIES));
 }
 
 static void ibx_irq_reset(struct drm_device *dev)
@@ -4615,9 +4617,8 @@ void intel_irq_init(struct drm_device *dev)
 	else
 		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
 
-	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-		    i915_hangcheck_elapsed,
-		    (unsigned long) dev_priv);
+	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
+			  i915_hangcheck_elapsed);
 	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
 			  intel_hpd_irq_reenable);
 
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 12:04 [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item Chris Wilson
@ 2014-09-04 12:17 ` Jani Nikula
  2014-09-04 12:26   ` Chris Wilson
  2014-09-04 15:09 ` Chris Wilson
  1 sibling, 1 reply; 12+ messages in thread
From: Jani Nikula @ 2014-09-04 12:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On Thu, 04 Sep 2014, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> rules of running in a softirq context. This is advantageous to us as we
> want to minimise the risk that a driver bug will prevent us from
> detecting a hung GPU. However, that is irrelevant if the driver bug
> prevents us from resetting and recovering. Still it is prudent not to
> rely on mutexes inside the checker, but given the coarseness of
> dev->struct_mutex doing so is extremely hard.
>
> Give in and run from a work queue, i.e. outside of softirq.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c | 15 ++++++++-------
>  5 files changed, 12 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 6502ae2d7b7d..6a8e71cb2be8 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
>  	}
>  
>  	/* Free error state after interrupts are fully disabled. */
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_work_sync(&dev_priv->gpu_error.work);
>  	i915_destroy_error_state(dev);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f7cc6a9c14fd..ea9224a977c1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
>  		return ret;
>  	}
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	intel_uncore_forcewake_reset(dev, false);
>  	dev_priv->pm.suspended = true;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a920ca3789d8..e1f8ffcb2cf3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
>  	/* Hang gpu twice in this window and your context gets banned */
>  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
>  
> -	struct timer_list hangcheck_timer;
> +	struct delayed_work hangcheck_work;
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b37177afc3c0..3e80c777bf12 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
>  							     DRIVER_MODESET);
>  	mutex_unlock(&dev->struct_mutex);
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
>  	flush_delayed_work(&dev_priv->mm.idle_work);
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2ade9efe078c..5fe9cdb323b3 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -3189,9 +3189,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
>   * we kick the ring. If we see no progress on three subsequent calls
>   * we assume chip is wedged and try to fix it by resetting the chip.
>   */
> -static void i915_hangcheck_elapsed(unsigned long data)
> +static void i915_hangcheck_elapsed(struct work_struct *work)
>  {
> -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> +	struct drm_i915_private *dev_priv =
> +		container_of(work, typeof(*dev_priv),
> +			     gpu_error.hangcheck_work.work);
>  	struct intel_engine_cs *engine;
>  	int i;
>  	int busy_count = 0, rings_hung = 0;
> @@ -3312,8 +3314,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
>  	if (!i915_module.enable_hangcheck)
>  		return;
>  
> -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> +			      round_jiffies_up(DRM_I915_HANGCHECK_JIFFIES));

schedule_delayed_work does not push the work timer forward if the work
is already queued, and it also expects a delay, not absolute jiffies.

BR,
Jani.


>  }
>  
>  static void ibx_irq_reset(struct drm_device *dev)
> @@ -4615,9 +4617,8 @@ void intel_irq_init(struct drm_device *dev)
>  	else
>  		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
>  
> -	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		    i915_hangcheck_elapsed,
> -		    (unsigned long) dev_priv);
> +	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
> +			  i915_hangcheck_elapsed);
>  	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
>  			  intel_hpd_irq_reenable);
>  
> -- 
> 2.1.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Jani Nikula, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 12:17 ` Jani Nikula
@ 2014-09-04 12:26   ` Chris Wilson
  2014-09-04 13:31     ` Daniel Vetter
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 12:26 UTC (permalink / raw)
  To: Jani Nikula; +Cc: intel-gfx

On Thu, Sep 04, 2014 at 03:17:57PM +0300, Jani Nikula wrote:
> On Thu, 04 Sep 2014, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> > rules of running in a softirq context. This is advantageous to us as we
> > want to minimise the risk that a driver bug will prevent us from
> > detecting a hung GPU. However, that is irrelevant if the driver bug
> > prevents us from resetting and recovering. Still it is prudent not to
> > rely on mutexes inside the checker, but given the coarseness of
> > dev->struct_mutex doing so is extremely hard.
> >
> > Give in and run from a work queue, i.e. outside of softirq.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c |  2 +-
> >  drivers/gpu/drm/i915/i915_drv.c |  2 +-
> >  drivers/gpu/drm/i915/i915_drv.h |  2 +-
> >  drivers/gpu/drm/i915/i915_gem.c |  2 +-
> >  drivers/gpu/drm/i915/i915_irq.c | 15 ++++++++-------
> >  5 files changed, 12 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 6502ae2d7b7d..6a8e71cb2be8 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
> >  	}
> >  
> >  	/* Free error state after interrupts are fully disabled. */
> > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> >  	cancel_work_sync(&dev_priv->gpu_error.work);
> >  	i915_destroy_error_state(dev);
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index f7cc6a9c14fd..ea9224a977c1 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
> >  		return ret;
> >  	}
> >  
> > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> >  	intel_uncore_forcewake_reset(dev, false);
> >  	dev_priv->pm.suspended = true;
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index a920ca3789d8..e1f8ffcb2cf3 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
> >  	/* Hang gpu twice in this window and your context gets banned */
> >  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
> >  
> > -	struct timer_list hangcheck_timer;
> > +	struct delayed_work hangcheck_work;
> >  
> >  	/* For reset and error_state handling. */
> >  	spinlock_t lock;
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index b37177afc3c0..3e80c777bf12 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
> >  							     DRIVER_MODESET);
> >  	mutex_unlock(&dev->struct_mutex);
> >  
> > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> >  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
> >  	flush_delayed_work(&dev_priv->mm.idle_work);
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index 2ade9efe078c..5fe9cdb323b3 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -3189,9 +3189,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> >   * we kick the ring. If we see no progress on three subsequent calls
> >   * we assume chip is wedged and try to fix it by resetting the chip.
> >   */
> > -static void i915_hangcheck_elapsed(unsigned long data)
> > +static void i915_hangcheck_elapsed(struct work_struct *work)
> >  {
> > -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> > +	struct drm_i915_private *dev_priv =
> > +		container_of(work, typeof(*dev_priv),
> > +			     gpu_error.hangcheck_work.work);
> >  	struct intel_engine_cs *engine;
> >  	int i;
> >  	int busy_count = 0, rings_hung = 0;
> > @@ -3312,8 +3314,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
> >  	if (!i915_module.enable_hangcheck)
> >  		return;
> >  
> > -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> > -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> > +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> > +			      round_jiffies_up(DRM_I915_HANGCHECK_JIFFIES));
> 
> schedule_delayed_work does not push the work timer forward if the work
> is already queued, and it also expects a delay, not absolute jiffies.

Indeed. We don't need to push the work forward, but we do need the
ability to requeue work. I do pass in a relative offset, but I guess we
want to replace the round_jiffies into its relative variant.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 12:26   ` Chris Wilson
@ 2014-09-04 13:31     ` Daniel Vetter
  2014-09-04 13:33       ` Chris Wilson
  0 siblings, 1 reply; 12+ messages in thread
From: Daniel Vetter @ 2014-09-04 13:31 UTC (permalink / raw)
  To: Chris Wilson, Jani Nikula, intel-gfx

On Thu, Sep 04, 2014 at 01:26:36PM +0100, Chris Wilson wrote:
> On Thu, Sep 04, 2014 at 03:17:57PM +0300, Jani Nikula wrote:
> > On Thu, 04 Sep 2014, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> > > rules of running in a softirq context. This is advantageous to us as we
> > > want to minimise the risk that a driver bug will prevent us from
> > > detecting a hung GPU. However, that is irrelevant if the driver bug
> > > prevents us from resetting and recovering. Still it is prudent not to
> > > rely on mutexes inside the checker, but given the coarseness of
> > > dev->struct_mutex doing so is extremely hard.
> > >
> > > Give in and run from a work queue, i.e. outside of softirq.
> > >
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > ---
> > >  drivers/gpu/drm/i915/i915_dma.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_drv.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_drv.h |  2 +-
> > >  drivers/gpu/drm/i915/i915_gem.c |  2 +-
> > >  drivers/gpu/drm/i915/i915_irq.c | 15 ++++++++-------
> > >  5 files changed, 12 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > > index 6502ae2d7b7d..6a8e71cb2be8 100644
> > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
> > >  	}
> > >  
> > >  	/* Free error state after interrupts are fully disabled. */
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	cancel_work_sync(&dev_priv->gpu_error.work);
> > >  	i915_destroy_error_state(dev);
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > > index f7cc6a9c14fd..ea9224a977c1 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.c
> > > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > > @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
> > >  		return ret;
> > >  	}
> > >  
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	intel_uncore_forcewake_reset(dev, false);
> > >  	dev_priv->pm.suspended = true;
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > > index a920ca3789d8..e1f8ffcb2cf3 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.h
> > > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > > @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
> > >  	/* Hang gpu twice in this window and your context gets banned */
> > >  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
> > >  
> > > -	struct timer_list hangcheck_timer;
> > > +	struct delayed_work hangcheck_work;
> > >  
> > >  	/* For reset and error_state handling. */
> > >  	spinlock_t lock;
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > > index b37177afc3c0..3e80c777bf12 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
> > >  							     DRIVER_MODESET);
> > >  	mutex_unlock(&dev->struct_mutex);
> > >  
> > > -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> > > +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> > >  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
> > >  	flush_delayed_work(&dev_priv->mm.idle_work);
> > >  
> > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > index 2ade9efe078c..5fe9cdb323b3 100644
> > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > @@ -3189,9 +3189,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> > >   * we kick the ring. If we see no progress on three subsequent calls
> > >   * we assume chip is wedged and try to fix it by resetting the chip.
> > >   */
> > > -static void i915_hangcheck_elapsed(unsigned long data)
> > > +static void i915_hangcheck_elapsed(struct work_struct *work)
> > >  {
> > > -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> > > +	struct drm_i915_private *dev_priv =
> > > +		container_of(work, typeof(*dev_priv),
> > > +			     gpu_error.hangcheck_work.work);
> > >  	struct intel_engine_cs *engine;
> > >  	int i;
> > >  	int busy_count = 0, rings_hung = 0;
> > > @@ -3312,8 +3314,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
> > >  	if (!i915_module.enable_hangcheck)
> > >  		return;
> > >  
> > > -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> > > -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> > > +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> > > +			      round_jiffies_up(DRM_I915_HANGCHECK_JIFFIES));
> > 
> > schedule_delayed_work does not push the work timer forward if the work
> > is already queued, and it also expects a delay, not absolute jiffies.
> 
> Indeed. We don't need to push the work forward, but we do need the
> ability to requeue work. I do pass in a relative offset, but I guess we
> want to replace the round_jiffies into its relative variant.

Didn't 3.17 just gain the mod_delayed_work interface?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 13:31     ` Daniel Vetter
@ 2014-09-04 13:33       ` Chris Wilson
  2014-09-04 14:12         ` Daniel Vetter
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 13:33 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, Sep 04, 2014 at 03:31:14PM +0200, Daniel Vetter wrote:
> Didn't 3.17 just gain the mod_delayed_work interface?

Yes. It was so exciting and doesn't appear to requeue.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 13:33       ` Chris Wilson
@ 2014-09-04 14:12         ` Daniel Vetter
  2014-09-04 14:18           ` Chris Wilson
  0 siblings, 1 reply; 12+ messages in thread
From: Daniel Vetter @ 2014-09-04 14:12 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, Jani Nikula, intel-gfx

On Thu, Sep 04, 2014 at 02:33:44PM +0100, Chris Wilson wrote:
> On Thu, Sep 04, 2014 at 03:31:14PM +0200, Daniel Vetter wrote:
> > Didn't 3.17 just gain the mod_delayed_work interface?
> 
> Yes. It was so exciting and doesn't appear to requeue.

I guess we could do the mod_delayed_work in add_request and use the
schedule_delayed_work everywhere in i915_irq.c then?

But when exactly does mod_delayed_work not requeue? Documentation at
least claims it does ...
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 14:12         ` Daniel Vetter
@ 2014-09-04 14:18           ` Chris Wilson
  0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 14:18 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Thu, Sep 04, 2014 at 04:12:43PM +0200, Daniel Vetter wrote:
> On Thu, Sep 04, 2014 at 02:33:44PM +0100, Chris Wilson wrote:
> > On Thu, Sep 04, 2014 at 03:31:14PM +0200, Daniel Vetter wrote:
> > > Didn't 3.17 just gain the mod_delayed_work interface?
> > 
> > Yes. It was so exciting and doesn't appear to requeue.
> 
> I guess we could do the mod_delayed_work in add_request and use the
> schedule_delayed_work everywhere in i915_irq.c then?
> 
> But when exactly does mod_delayed_work not requeue? Documentation at
> least claims it does ...

I think the first version I tested used relative jiffies in
mod_delayed_work, but I'm not buying that we actually ever want to push
the first hangcheck back simply because we are queueing more work... We
can queue work right up until the we fill the ring behind a hung task in
that case.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 12:04 [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item Chris Wilson
  2014-09-04 12:17 ` Jani Nikula
@ 2014-09-04 15:09 ` Chris Wilson
  2014-09-04 15:25   ` Ville Syrjälä
  2014-10-07 15:34   ` Mika Kuoppala
  1 sibling, 2 replies; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 15:09 UTC (permalink / raw)
  To: intel-gfx; +Cc: Jani Nikula, Daniel Vetter

When run as a timer, i915_hangcheck_elapsed() must adhere to all the
rules of running in a softirq context. This is advantageous to us as we
want to minimise the risk that a driver bug will prevent us from
detecting a hung GPU. However, that is irrelevant if the driver bug
prevents us from resetting and recovering. Still it is prudent not to
rely on mutexes inside the checker, but given the coarseness of
dev->struct_mutex doing so is extremely hard.

Give in and run from a work queue, i.e. outside of softirq.

v2:

The conversion does have one significant change, from the use of
mod_timer to schedule_delayed_work, means that the time that we execute
the first hangcheck is fixed and not continually deferred by later work.
This has the advantage of not allowing userspace to fill the ring before
hangcheck can finally run. At the same time, it removes the ability for
the interrupt to defer the hangcheck as well. This is sensible for that
an interrupt is only for a single engine, whereas we perform hangcheck
globally, so whilst one ring may have hung, the other could be running
normally and preventing the hangcheck from firing.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Daniel Vetter <dnaiel.vetter@ffwll.chm>
---
 drivers/gpu/drm/i915/i915_dma.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.c |  2 +-
 drivers/gpu/drm/i915/i915_drv.h |  2 +-
 drivers/gpu/drm/i915/i915_gem.c |  2 +-
 drivers/gpu/drm/i915/i915_irq.c | 16 ++++++++--------
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6502ae2d7b7d..6a8e71cb2be8 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
 	}
 
 	/* Free error state after interrupts are fully disabled. */
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f7cc6a9c14fd..ea9224a977c1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
 		return ret;
 	}
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	intel_uncore_forcewake_reset(dev, false);
 	dev_priv->pm.suspended = true;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a920ca3789d8..e1f8ffcb2cf3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1209,7 +1209,7 @@ struct i915_gpu_error {
 	/* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
-	struct timer_list hangcheck_timer;
+	struct delayed_work hangcheck_work;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b37177afc3c0..3e80c777bf12 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
 							     DRIVER_MODESET);
 	mutex_unlock(&dev->struct_mutex);
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	flush_delayed_work(&dev_priv->mm.idle_work);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2ade9efe078c..d295f546b58d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1274,7 +1274,6 @@ static void notify_ring(struct drm_device *dev,
 	atomic_inc(&engine->interrupts);
 
 	wake_up_all(&engine->irq_queue);
-	i915_queue_hangcheck(dev);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -3189,9 +3188,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
  * we kick the ring. If we see no progress on three subsequent calls
  * we assume chip is wedged and try to fix it by resetting the chip.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
+static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv),
+			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	int i;
 	int busy_count = 0, rings_hung = 0;
@@ -3312,8 +3313,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
 	if (!i915_module.enable_hangcheck)
 		return;
 
-	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
+			      round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
 }
 
 static void ibx_irq_reset(struct drm_device *dev)
@@ -4615,9 +4616,8 @@ void intel_irq_init(struct drm_device *dev)
 	else
 		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
 
-	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-		    i915_hangcheck_elapsed,
-		    (unsigned long) dev_priv);
+	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
+			  i915_hangcheck_elapsed);
 	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
 			  intel_hpd_irq_reenable);
 
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 15:09 ` Chris Wilson
@ 2014-09-04 15:25   ` Ville Syrjälä
  2014-09-04 15:38     ` Chris Wilson
  2014-10-07 15:34   ` Mika Kuoppala
  1 sibling, 1 reply; 12+ messages in thread
From: Ville Syrjälä @ 2014-09-04 15:25 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Jani Nikula, Daniel Vetter, intel-gfx

On Thu, Sep 04, 2014 at 04:09:02PM +0100, Chris Wilson wrote:
> When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> rules of running in a softirq context. This is advantageous to us as we
> want to minimise the risk that a driver bug will prevent us from
> detecting a hung GPU. However, that is irrelevant if the driver bug
> prevents us from resetting and recovering. Still it is prudent not to
> rely on mutexes inside the checker, but given the coarseness of
> dev->struct_mutex doing so is extremely hard.
> 
> Give in and run from a work queue, i.e. outside of softirq.
> 
> v2:
> 
> The conversion does have one significant change, from the use of
> mod_timer to schedule_delayed_work, means that the time that we execute
> the first hangcheck is fixed and not continually deferred by later work.
> This has the advantage of not allowing userspace to fill the ring before
> hangcheck can finally run. At the same time, it removes the ability for
> the interrupt to defer the hangcheck as well. This is sensible for that
> an interrupt is only for a single engine, whereas we perform hangcheck
> globally, so whilst one ring may have hung, the other could be running
> normally and preventing the hangcheck from firing.

But doesn't this make it so that we may not detect a hang unless more
work gets submitted constantly? Eg.

1. execbuffer batch 1 -> queue hangcheck schedules work
2. execbuffer batch 2 -> queue hangcheck does nothing
3. execbuffer batch 3 -> queue hangcheck does nothing
4. hangcheck expires and sees progress up to batch 2 -> everything is fine
5. batch 3 hangs

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jani Nikula <jani.nikula@intel.com>
> Cc: Daniel Vetter <dnaiel.vetter@ffwll.chm>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c | 16 ++++++++--------
>  5 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 6502ae2d7b7d..6a8e71cb2be8 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
>  	}
>  
>  	/* Free error state after interrupts are fully disabled. */
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_work_sync(&dev_priv->gpu_error.work);
>  	i915_destroy_error_state(dev);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f7cc6a9c14fd..ea9224a977c1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
>  		return ret;
>  	}
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	intel_uncore_forcewake_reset(dev, false);
>  	dev_priv->pm.suspended = true;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a920ca3789d8..e1f8ffcb2cf3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
>  	/* Hang gpu twice in this window and your context gets banned */
>  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
>  
> -	struct timer_list hangcheck_timer;
> +	struct delayed_work hangcheck_work;
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b37177afc3c0..3e80c777bf12 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
>  							     DRIVER_MODESET);
>  	mutex_unlock(&dev->struct_mutex);
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
>  	flush_delayed_work(&dev_priv->mm.idle_work);
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2ade9efe078c..d295f546b58d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1274,7 +1274,6 @@ static void notify_ring(struct drm_device *dev,
>  	atomic_inc(&engine->interrupts);
>  
>  	wake_up_all(&engine->irq_queue);
> -	i915_queue_hangcheck(dev);
>  }
>  
>  static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -3189,9 +3188,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
>   * we kick the ring. If we see no progress on three subsequent calls
>   * we assume chip is wedged and try to fix it by resetting the chip.
>   */
> -static void i915_hangcheck_elapsed(unsigned long data)
> +static void i915_hangcheck_elapsed(struct work_struct *work)
>  {
> -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> +	struct drm_i915_private *dev_priv =
> +		container_of(work, typeof(*dev_priv),
> +			     gpu_error.hangcheck_work.work);
>  	struct intel_engine_cs *engine;
>  	int i;
>  	int busy_count = 0, rings_hung = 0;
> @@ -3312,8 +3313,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
>  	if (!i915_module.enable_hangcheck)
>  		return;
>  
> -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> +			      round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  static void ibx_irq_reset(struct drm_device *dev)
> @@ -4615,9 +4616,8 @@ void intel_irq_init(struct drm_device *dev)
>  	else
>  		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
>  
> -	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		    i915_hangcheck_elapsed,
> -		    (unsigned long) dev_priv);
> +	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
> +			  i915_hangcheck_elapsed);
>  	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
>  			  intel_hpd_irq_reenable);
>  
> -- 
> 2.1.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 15:25   ` Ville Syrjälä
@ 2014-09-04 15:38     ` Chris Wilson
  2014-09-04 16:40       ` Ville Syrjälä
  0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2014-09-04 15:38 UTC (permalink / raw)
  To: Ville Syrjälä; +Cc: Jani Nikula, Daniel Vetter, intel-gfx

On Thu, Sep 04, 2014 at 06:25:03PM +0300, Ville Syrjälä wrote:
> On Thu, Sep 04, 2014 at 04:09:02PM +0100, Chris Wilson wrote:
> > When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> > rules of running in a softirq context. This is advantageous to us as we
> > want to minimise the risk that a driver bug will prevent us from
> > detecting a hung GPU. However, that is irrelevant if the driver bug
> > prevents us from resetting and recovering. Still it is prudent not to
> > rely on mutexes inside the checker, but given the coarseness of
> > dev->struct_mutex doing so is extremely hard.
> > 
> > Give in and run from a work queue, i.e. outside of softirq.
> > 
> > v2:
> > 
> > The conversion does have one significant change, from the use of
> > mod_timer to schedule_delayed_work, means that the time that we execute
> > the first hangcheck is fixed and not continually deferred by later work.
> > This has the advantage of not allowing userspace to fill the ring before
> > hangcheck can finally run. At the same time, it removes the ability for
> > the interrupt to defer the hangcheck as well. This is sensible for that
> > an interrupt is only for a single engine, whereas we perform hangcheck
> > globally, so whilst one ring may have hung, the other could be running
> > normally and preventing the hangcheck from firing.
> 
> But doesn't this make it so that we may not detect a hang unless more
> work gets submitted constantly? Eg.
> 
> 1. execbuffer batch 1 -> queue hangcheck schedules work
> 2. execbuffer batch 2 -> queue hangcheck does nothing
> 3. execbuffer batch 3 -> queue hangcheck does nothing
> 4. hangcheck expires and sees progress up to batch 2 -> everything is fine
4.b hangcheck rearms itself as there is outstanding wrok
> 5. batch 3 hangs
6. hangcheck fires, sees progress, rearms
7. hangcheck fires, sees no progress, shoots the user.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 15:38     ` Chris Wilson
@ 2014-09-04 16:40       ` Ville Syrjälä
  0 siblings, 0 replies; 12+ messages in thread
From: Ville Syrjälä @ 2014-09-04 16:40 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Jani Nikula, Daniel Vetter

On Thu, Sep 04, 2014 at 04:38:39PM +0100, Chris Wilson wrote:
> On Thu, Sep 04, 2014 at 06:25:03PM +0300, Ville Syrjälä wrote:
> > On Thu, Sep 04, 2014 at 04:09:02PM +0100, Chris Wilson wrote:
> > > When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> > > rules of running in a softirq context. This is advantageous to us as we
> > > want to minimise the risk that a driver bug will prevent us from
> > > detecting a hung GPU. However, that is irrelevant if the driver bug
> > > prevents us from resetting and recovering. Still it is prudent not to
> > > rely on mutexes inside the checker, but given the coarseness of
> > > dev->struct_mutex doing so is extremely hard.
> > > 
> > > Give in and run from a work queue, i.e. outside of softirq.
> > > 
> > > v2:
> > > 
> > > The conversion does have one significant change, from the use of
> > > mod_timer to schedule_delayed_work, means that the time that we execute
> > > the first hangcheck is fixed and not continually deferred by later work.
> > > This has the advantage of not allowing userspace to fill the ring before
> > > hangcheck can finally run. At the same time, it removes the ability for
> > > the interrupt to defer the hangcheck as well. This is sensible for that
> > > an interrupt is only for a single engine, whereas we perform hangcheck
> > > globally, so whilst one ring may have hung, the other could be running
> > > normally and preventing the hangcheck from firing.
> > 
> > But doesn't this make it so that we may not detect a hang unless more
> > work gets submitted constantly? Eg.
> > 
> > 1. execbuffer batch 1 -> queue hangcheck schedules work
> > 2. execbuffer batch 2 -> queue hangcheck does nothing
> > 3. execbuffer batch 3 -> queue hangcheck does nothing
> > 4. hangcheck expires and sees progress up to batch 2 -> everything is fine
> 4.b hangcheck rearms itself as there is outstanding wrok

Indeed. I should have actually read the code and it would have been
obvious.

> > 5. batch 3 hangs
> 6. hangcheck fires, sees progress, rearms
> 7. hangcheck fires, sees no progress, shoots the user.

Sounds like we need a disclaimer about the dangers of causing a GPU
hang :)

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item
  2014-09-04 15:09 ` Chris Wilson
  2014-09-04 15:25   ` Ville Syrjälä
@ 2014-10-07 15:34   ` Mika Kuoppala
  1 sibling, 0 replies; 12+ messages in thread
From: Mika Kuoppala @ 2014-10-07 15:34 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Jani Nikula, Daniel Vetter

Chris Wilson <chris@chris-wilson.co.uk> writes:

> When run as a timer, i915_hangcheck_elapsed() must adhere to all the
> rules of running in a softirq context. This is advantageous to us as we
> want to minimise the risk that a driver bug will prevent us from
> detecting a hung GPU. However, that is irrelevant if the driver bug
> prevents us from resetting and recovering. Still it is prudent not to
> rely on mutexes inside the checker, but given the coarseness of
> dev->struct_mutex doing so is extremely hard.
>
> Give in and run from a work queue, i.e. outside of softirq.
>
> v2:
>
> The conversion does have one significant change, from the use of
> mod_timer to schedule_delayed_work, means that the time that we execute
> the first hangcheck is fixed and not continually deferred by later work.
> This has the advantage of not allowing userspace to fill the ring before
> hangcheck can finally run. At the same time, it removes the ability for
> the interrupt to defer the hangcheck as well. This is sensible for that
> an interrupt is only for a single engine, whereas we perform hangcheck
> globally, so whilst one ring may have hung, the other could be running
> normally and preventing the hangcheck from firing.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jani Nikula <jani.nikula@intel.com>
> Cc: Daniel Vetter <dnaiel.vetter@ffwll.chm>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c | 16 ++++++++--------
>  5 files changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 6502ae2d7b7d..6a8e71cb2be8 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1002,7 +1002,7 @@ int i915_driver_unload(struct drm_device *dev)
>  	}
>  
>  	/* Free error state after interrupts are fully disabled. */
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_work_sync(&dev_priv->gpu_error.work);
>  	i915_destroy_error_state(dev);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f7cc6a9c14fd..ea9224a977c1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1431,7 +1431,7 @@ static int intel_runtime_suspend(struct device *device)
>  		return ret;
>  	}
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	intel_uncore_forcewake_reset(dev, false);
>  	dev_priv->pm.suspended = true;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a920ca3789d8..e1f8ffcb2cf3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1209,7 +1209,7 @@ struct i915_gpu_error {
>  	/* Hang gpu twice in this window and your context gets banned */
>  #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
>  
> -	struct timer_list hangcheck_timer;
> +	struct delayed_work hangcheck_work;
>  
>  	/* For reset and error_state handling. */
>  	spinlock_t lock;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b37177afc3c0..3e80c777bf12 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4530,7 +4530,7 @@ i915_gem_suspend(struct drm_device *dev)
>  							     DRIVER_MODESET);
>  	mutex_unlock(&dev->struct_mutex);
>  
> -	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
> +	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>  	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
>  	flush_delayed_work(&dev_priv->mm.idle_work);
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2ade9efe078c..d295f546b58d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1274,7 +1274,6 @@ static void notify_ring(struct drm_device *dev,
>  	atomic_inc(&engine->interrupts);
>  
>  	wake_up_all(&engine->irq_queue);
> -	i915_queue_hangcheck(dev);
>  }
>  
>  static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -3189,9 +3188,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
>   * we kick the ring. If we see no progress on three subsequent calls
>   * we assume chip is wedged and try to fix it by resetting the chip.
>   */

As queuing was removed from notify_ring and thus we no longer
postpone  i915_hangcheck_elapsed but rather call it periodically,
the comment in here needs fixing. In the part where we claim that this
get only called if nothing has been reported back in a long time.

> -static void i915_hangcheck_elapsed(unsigned long data)
> +static void i915_hangcheck_elapsed(struct work_struct *work)
>  {
> -	struct drm_i915_private *dev_priv = (struct drm_i915_private *)data;
> +	struct drm_i915_private *dev_priv =
> +		container_of(work, typeof(*dev_priv),
> +			     gpu_error.hangcheck_work.work);

This wont apply to dinq as in dinq we get struct drm_device * as data.
Previous commit missing?

- Mika

>  	struct intel_engine_cs *engine;
>  	int i;
>  	int busy_count = 0, rings_hung = 0;
> @@ -3312,8 +3313,8 @@ void i915_queue_hangcheck(struct drm_device *dev)
>  	if (!i915_module.enable_hangcheck)
>  		return;
>  
> -	mod_timer(&to_i915(dev)->gpu_error.hangcheck_timer,
> -		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +	schedule_delayed_work(&to_i915(dev)->gpu_error.hangcheck_work,
> +			      round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  static void ibx_irq_reset(struct drm_device *dev)
> @@ -4615,9 +4616,8 @@ void intel_irq_init(struct drm_device *dev)
>  	else
>  		dev_priv->rps.pm_events = GEN6_PM_RPS_EVENTS;
>  
> -	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
> -		    i915_hangcheck_elapsed,
> -		    (unsigned long) dev_priv);
> +	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
> +			  i915_hangcheck_elapsed);
>  	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
>  			  intel_hpd_irq_reenable);
>  
> -- 
> 2.1.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-10-07 15:35 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-04 12:04 [PATCH] drm/i915: Convert hangcheck from a timer into a delayed work item Chris Wilson
2014-09-04 12:17 ` Jani Nikula
2014-09-04 12:26   ` Chris Wilson
2014-09-04 13:31     ` Daniel Vetter
2014-09-04 13:33       ` Chris Wilson
2014-09-04 14:12         ` Daniel Vetter
2014-09-04 14:18           ` Chris Wilson
2014-09-04 15:09 ` Chris Wilson
2014-09-04 15:25   ` Ville Syrjälä
2014-09-04 15:38     ` Chris Wilson
2014-09-04 16:40       ` Ville Syrjälä
2014-10-07 15:34   ` Mika Kuoppala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.