All of lore.kernel.org
 help / color / mirror / Atom feed
From: John Harrison <john.c.harrison@intel.com>
To: Matthew Brost <matthew.brost@intel.com>,
	intel-gfx@lists.freedesktop.org,
	 dri-devel@lists.freedesktop.org
Cc: Michal.Wajdeczko@intel.com
Subject: Re: [PATCH 5/7] drm/i915/guc: Add stall timer to non blocking CTB send function
Date: Tue, 6 Jul 2021 11:27:55 -0700	[thread overview]
Message-ID: <6d1007b2-1f84-80e5-a598-81c7e0f4727d@intel.com> (raw)
In-Reply-To: <20210701171550.49353-6-matthew.brost@intel.com>

On 7/1/2021 10:15, Matthew Brost wrote:
> Implement a stall timer which fails H2G CTBs once a period of time
> with no forward progress is reached to prevent deadlock.
>
> v2:
>   (Michal)
>    - Improve error message in ct_deadlock()
>    - Set broken when ct_deadlock() returns true
>    - Return -EPIPE on ct_deadlock()
> v3:
>   (Michal)
>    - Add ms to stall timer comment
>   (Matthew)
>    - Move broken check to intel_guc_ct_send()
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Looks plausible to me.

Reviewed-by: John Harrison <John.C.Harrison@Intel.com>

> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 62 ++++++++++++++++++++---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  4 ++
>   2 files changed, 59 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index fb825cc1d090..a9cb7b608520 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -4,6 +4,9 @@
>    */
>   
>   #include <linux/circ_buf.h>
> +#include <linux/ktime.h>
> +#include <linux/time64.h>
> +#include <linux/timekeeping.h>
>   
>   #include "i915_drv.h"
>   #include "intel_guc_ct.h"
> @@ -316,6 +319,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>   		goto err_deregister;
>   
>   	ct->enabled = true;
> +	ct->stall_time = KTIME_MAX;
>   
>   	return 0;
>   
> @@ -388,9 +392,6 @@ static int ct_write(struct intel_guc_ct *ct,
>   	u32 *cmds = ctb->cmds;
>   	unsigned int i;
>   
> -	if (unlikely(ctb->broken))
> -		return -EPIPE;
> -
>   	if (unlikely(desc->status))
>   		goto corrupted;
>   
> @@ -506,6 +507,25 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>   	return err;
>   }
>   
> +#define GUC_CTB_TIMEOUT_MS	1500
> +static inline bool ct_deadlocked(struct intel_guc_ct *ct)
> +{
> +	long timeout = GUC_CTB_TIMEOUT_MS;
> +	bool ret = ktime_ms_delta(ktime_get(), ct->stall_time) > timeout;
> +
> +	if (unlikely(ret)) {
> +		struct guc_ct_buffer_desc *send = ct->ctbs.send.desc;
> +		struct guc_ct_buffer_desc *recv = ct->ctbs.send.desc;
> +
> +		CT_ERROR(ct, "Communication stalled for %lld ms, desc status=%#x,%#x\n",
> +			 ktime_ms_delta(ktime_get(), ct->stall_time),
> +			 send->status, recv->status);
> +		ct->ctbs.send.broken = true;
> +	}
> +
> +	return ret;
> +}
> +
>   static inline bool h2g_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>   {
>   	struct guc_ct_buffer_desc *desc = ctb->desc;
> @@ -517,6 +537,26 @@ static inline bool h2g_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>   	return space >= len_dw;
>   }
>   
> +static int has_room_nb(struct intel_guc_ct *ct, u32 len_dw)
> +{
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> +
> +	lockdep_assert_held(&ct->ctbs.send.lock);
> +
> +	if (unlikely(!h2g_has_room(ctb, len_dw))) {
> +		if (ct->stall_time == KTIME_MAX)
> +			ct->stall_time = ktime_get();
> +
> +		if (unlikely(ct_deadlocked(ct)))
> +			return -EPIPE;
> +		else
> +			return -EBUSY;
> +	}
> +
> +	ct->stall_time = KTIME_MAX;
> +	return 0;
> +}
> +
>   static int ct_send_nb(struct intel_guc_ct *ct,
>   		      const u32 *action,
>   		      u32 len,
> @@ -529,11 +569,9 @@ static int ct_send_nb(struct intel_guc_ct *ct,
>   
>   	spin_lock_irqsave(&ctb->lock, spin_flags);
>   
> -	ret = h2g_has_room(ctb, len + GUC_CTB_HDR_LEN);
> -	if (unlikely(!ret)) {
> -		ret = -EBUSY;
> +	ret = has_room_nb(ct, len + GUC_CTB_HDR_LEN);
> +	if (unlikely(ret))
>   		goto out;
> -	}
>   
>   	fence = ct_get_next_fence(ct);
>   	ret = ct_write(ct, action, len, fence, flags);
> @@ -576,8 +614,13 @@ static int ct_send(struct intel_guc_ct *ct,
>   retry:
>   	spin_lock_irqsave(&ctb->lock, flags);
>   	if (unlikely(!h2g_has_room(ctb, len + GUC_CTB_HDR_LEN))) {
> +		if (ct->stall_time == KTIME_MAX)
> +			ct->stall_time = ktime_get();
>   		spin_unlock_irqrestore(&ctb->lock, flags);
>   
> +		if (unlikely(ct_deadlocked(ct)))
> +			return -EPIPE;
> +
>   		if (msleep_interruptible(sleep_period_ms))
>   			return -EINTR;
>   		sleep_period_ms = sleep_period_ms << 1;
> @@ -585,6 +628,8 @@ static int ct_send(struct intel_guc_ct *ct,
>   		goto retry;
>   	}
>   
> +	ct->stall_time = KTIME_MAX;
> +
>   	fence = ct_get_next_fence(ct);
>   	request.fence = fence;
>   	request.status = 0;
> @@ -647,6 +692,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>   		return -ENODEV;
>   	}
>   
> +	if (unlikely(ct->ctbs.send.broken))
> +		return -EPIPE;
> +
>   	if (flags & INTEL_GUC_CT_SEND_NB)
>   		return ct_send_nb(ct, action, len, flags);
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index 5bb8bef024c8..bee03794c1eb 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -9,6 +9,7 @@
>   #include <linux/interrupt.h>
>   #include <linux/spinlock.h>
>   #include <linux/workqueue.h>
> +#include <linux/ktime.h>
>   
>   #include "intel_guc_fwif.h"
>   
> @@ -68,6 +69,9 @@ struct intel_guc_ct {
>   		struct list_head incoming; /* incoming requests */
>   		struct work_struct worker; /* handler for incoming requests */
>   	} requests;
> +
> +	/** @stall_time: time of first time a CTB submission is stalled */
> +	ktime_t stall_time;
>   };
>   
>   void intel_guc_ct_init_early(struct intel_guc_ct *ct);


WARNING: multiple messages have this Message-ID (diff)
From: John Harrison <john.c.harrison@intel.com>
To: Matthew Brost <matthew.brost@intel.com>,
	intel-gfx@lists.freedesktop.org,
	 dri-devel@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH 5/7] drm/i915/guc: Add stall timer to non blocking CTB send function
Date: Tue, 6 Jul 2021 11:27:55 -0700	[thread overview]
Message-ID: <6d1007b2-1f84-80e5-a598-81c7e0f4727d@intel.com> (raw)
In-Reply-To: <20210701171550.49353-6-matthew.brost@intel.com>

On 7/1/2021 10:15, Matthew Brost wrote:
> Implement a stall timer which fails H2G CTBs once a period of time
> with no forward progress is reached to prevent deadlock.
>
> v2:
>   (Michal)
>    - Improve error message in ct_deadlock()
>    - Set broken when ct_deadlock() returns true
>    - Return -EPIPE on ct_deadlock()
> v3:
>   (Michal)
>    - Add ms to stall timer comment
>   (Matthew)
>    - Move broken check to intel_guc_ct_send()
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Looks plausible to me.

Reviewed-by: John Harrison <John.C.Harrison@Intel.com>

> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 62 ++++++++++++++++++++---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  4 ++
>   2 files changed, 59 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index fb825cc1d090..a9cb7b608520 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -4,6 +4,9 @@
>    */
>   
>   #include <linux/circ_buf.h>
> +#include <linux/ktime.h>
> +#include <linux/time64.h>
> +#include <linux/timekeeping.h>
>   
>   #include "i915_drv.h"
>   #include "intel_guc_ct.h"
> @@ -316,6 +319,7 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
>   		goto err_deregister;
>   
>   	ct->enabled = true;
> +	ct->stall_time = KTIME_MAX;
>   
>   	return 0;
>   
> @@ -388,9 +392,6 @@ static int ct_write(struct intel_guc_ct *ct,
>   	u32 *cmds = ctb->cmds;
>   	unsigned int i;
>   
> -	if (unlikely(ctb->broken))
> -		return -EPIPE;
> -
>   	if (unlikely(desc->status))
>   		goto corrupted;
>   
> @@ -506,6 +507,25 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
>   	return err;
>   }
>   
> +#define GUC_CTB_TIMEOUT_MS	1500
> +static inline bool ct_deadlocked(struct intel_guc_ct *ct)
> +{
> +	long timeout = GUC_CTB_TIMEOUT_MS;
> +	bool ret = ktime_ms_delta(ktime_get(), ct->stall_time) > timeout;
> +
> +	if (unlikely(ret)) {
> +		struct guc_ct_buffer_desc *send = ct->ctbs.send.desc;
> +		struct guc_ct_buffer_desc *recv = ct->ctbs.send.desc;
> +
> +		CT_ERROR(ct, "Communication stalled for %lld ms, desc status=%#x,%#x\n",
> +			 ktime_ms_delta(ktime_get(), ct->stall_time),
> +			 send->status, recv->status);
> +		ct->ctbs.send.broken = true;
> +	}
> +
> +	return ret;
> +}
> +
>   static inline bool h2g_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>   {
>   	struct guc_ct_buffer_desc *desc = ctb->desc;
> @@ -517,6 +537,26 @@ static inline bool h2g_has_room(struct intel_guc_ct_buffer *ctb, u32 len_dw)
>   	return space >= len_dw;
>   }
>   
> +static int has_room_nb(struct intel_guc_ct *ct, u32 len_dw)
> +{
> +	struct intel_guc_ct_buffer *ctb = &ct->ctbs.send;
> +
> +	lockdep_assert_held(&ct->ctbs.send.lock);
> +
> +	if (unlikely(!h2g_has_room(ctb, len_dw))) {
> +		if (ct->stall_time == KTIME_MAX)
> +			ct->stall_time = ktime_get();
> +
> +		if (unlikely(ct_deadlocked(ct)))
> +			return -EPIPE;
> +		else
> +			return -EBUSY;
> +	}
> +
> +	ct->stall_time = KTIME_MAX;
> +	return 0;
> +}
> +
>   static int ct_send_nb(struct intel_guc_ct *ct,
>   		      const u32 *action,
>   		      u32 len,
> @@ -529,11 +569,9 @@ static int ct_send_nb(struct intel_guc_ct *ct,
>   
>   	spin_lock_irqsave(&ctb->lock, spin_flags);
>   
> -	ret = h2g_has_room(ctb, len + GUC_CTB_HDR_LEN);
> -	if (unlikely(!ret)) {
> -		ret = -EBUSY;
> +	ret = has_room_nb(ct, len + GUC_CTB_HDR_LEN);
> +	if (unlikely(ret))
>   		goto out;
> -	}
>   
>   	fence = ct_get_next_fence(ct);
>   	ret = ct_write(ct, action, len, fence, flags);
> @@ -576,8 +614,13 @@ static int ct_send(struct intel_guc_ct *ct,
>   retry:
>   	spin_lock_irqsave(&ctb->lock, flags);
>   	if (unlikely(!h2g_has_room(ctb, len + GUC_CTB_HDR_LEN))) {
> +		if (ct->stall_time == KTIME_MAX)
> +			ct->stall_time = ktime_get();
>   		spin_unlock_irqrestore(&ctb->lock, flags);
>   
> +		if (unlikely(ct_deadlocked(ct)))
> +			return -EPIPE;
> +
>   		if (msleep_interruptible(sleep_period_ms))
>   			return -EINTR;
>   		sleep_period_ms = sleep_period_ms << 1;
> @@ -585,6 +628,8 @@ static int ct_send(struct intel_guc_ct *ct,
>   		goto retry;
>   	}
>   
> +	ct->stall_time = KTIME_MAX;
> +
>   	fence = ct_get_next_fence(ct);
>   	request.fence = fence;
>   	request.status = 0;
> @@ -647,6 +692,9 @@ int intel_guc_ct_send(struct intel_guc_ct *ct, const u32 *action, u32 len,
>   		return -ENODEV;
>   	}
>   
> +	if (unlikely(ct->ctbs.send.broken))
> +		return -EPIPE;
> +
>   	if (flags & INTEL_GUC_CT_SEND_NB)
>   		return ct_send_nb(ct, action, len, flags);
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index 5bb8bef024c8..bee03794c1eb 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -9,6 +9,7 @@
>   #include <linux/interrupt.h>
>   #include <linux/spinlock.h>
>   #include <linux/workqueue.h>
> +#include <linux/ktime.h>
>   
>   #include "intel_guc_fwif.h"
>   
> @@ -68,6 +69,9 @@ struct intel_guc_ct {
>   		struct list_head incoming; /* incoming requests */
>   		struct work_struct worker; /* handler for incoming requests */
>   	} requests;
> +
> +	/** @stall_time: time of first time a CTB submission is stalled */
> +	ktime_t stall_time;
>   };
>   
>   void intel_guc_ct_init_early(struct intel_guc_ct *ct);

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2021-07-06 18:28 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-01 17:15 [PATCH 0/7] CT changes required for GuC submission Matthew Brost
2021-07-01 17:15 ` [Intel-gfx] " Matthew Brost
2021-07-01 17:15 ` [PATCH 1/7] drm/i915/guc: Relax CTB response timeout Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-01 17:15 ` [PATCH 2/7] drm/i915/guc: Improve error message for unsolicited CT response Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-01 17:15 ` [PATCH 3/7] drm/i915/guc: Increase size of CTB buffers Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-01 17:15 ` [PATCH 4/7] drm/i915/guc: Add non blocking CTB send function Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-06 18:12   ` John Harrison
2021-07-06 18:12     ` [Intel-gfx] " John Harrison
2021-07-06 18:17     ` Matthew Brost
2021-07-06 18:17       ` [Intel-gfx] " Matthew Brost
2021-07-01 17:15 ` [PATCH 5/7] drm/i915/guc: Add stall timer to " Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-06 18:27   ` John Harrison [this message]
2021-07-06 18:27     ` John Harrison
2021-07-01 17:15 ` [PATCH 6/7] drm/i915/guc: Optimize CTB writes and reads Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-06 19:00   ` John Harrison
2021-07-06 19:00     ` [Intel-gfx] " John Harrison
2021-07-06 19:12     ` Michal Wajdeczko
2021-07-06 19:12       ` [Intel-gfx] " Michal Wajdeczko
2021-07-06 19:19       ` John Harrison
2021-07-06 19:19         ` [Intel-gfx] " John Harrison
2021-07-06 19:33         ` Michal Wajdeczko
2021-07-06 19:33           ` [Intel-gfx] " Michal Wajdeczko
2021-07-06 21:22           ` Matthew Brost
2021-07-06 21:22             ` [Intel-gfx] " Matthew Brost
2021-07-06 22:41           ` John Harrison
2021-07-06 22:41             ` [Intel-gfx] " John Harrison
2021-07-01 17:15 ` [PATCH 7/7] drm/i915/guc: Module load failure test for CT buffer creation Matthew Brost
2021-07-01 17:15   ` [Intel-gfx] " Matthew Brost
2021-07-01 23:20 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for CT changes required for GuC submission (rev2) Patchwork
2021-07-01 23:21 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2021-07-01 23:49 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2021-07-02  6:55 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2021-07-08 16:20 [PATCH 0/7] CT changes required for GuC submission Matthew Brost
2021-07-08 16:20 ` [PATCH 5/7] drm/i915/guc: Add stall timer to non blocking CTB send function Matthew Brost
2021-07-06 22:20 [PATCH 0/7] CT changes required for GuC submission Matthew Brost
2021-07-06 22:20 ` [PATCH 5/7] drm/i915/guc: Add stall timer to non blocking CTB send function Matthew Brost
2021-06-27 23:14 [PATCH 0/7] CT changes required for GuC submission Matthew Brost
2021-06-27 23:14 ` [PATCH 5/7] drm/i915/guc: Add stall timer to non blocking CTB send function Matthew Brost
2021-06-30 23:23   ` Michal Wajdeczko
2021-07-01  0:36     ` Matthew Brost

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6d1007b2-1f84-80e5-a598-81c7e0f4727d@intel.com \
    --to=john.c.harrison@intel.com \
    --cc=Michal.Wajdeczko@intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=matthew.brost@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.