dri-devel.lists.freedesktop.org archive mirror
* [PATCH] drm/i915/huc: Allow for very slow HuC loading
@ 2024-01-02 22:22 John.C.Harrison
  2024-01-04 20:34 ` Daniele Ceraolo Spurio
  0 siblings, 1 reply; 3+ messages in thread
From: John.C.Harrison @ 2024-01-02 22:22 UTC (permalink / raw)
  To: Intel-GFX; +Cc: John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

A failure to load the HuC is occasionally observed where the cause is
believed to be a low GT frequency leading to very long load times.

So a) increase the timeout so that the user still gets a working
system even in the case of a slow load, and b) report the frequency
during the load to confirm whether that is the cause of the slowdown.

Also update the similar code in the GuC load path to not use uncore->gt
when a local gt is available. The two should match, but there is no need
for the extra dereference.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 ++--
 drivers/gpu/drm/i915/gt/uc/intel_huc.c    | 64 ++++++++++++++++++++---
 2 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
index 0f79cb6585182..52332bb143395 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
@@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
 	 * in the seconds range. However, there is a limit on how long an
 	 * individual wait_for() can wait. So wrap it in a loop.
 	 */
-	before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
+	before_freq = intel_rps_read_actual_frequency(&gt->rps);
 	before = ktime_get();
 	for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
 		ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
@@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
 			break;
 
 		guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz, status = 0x%08X [0x%02X/%02X]\n",
-			count, intel_rps_read_actual_frequency(&uncore->gt->rps), status,
+			count, intel_rps_read_actual_frequency(&gt->rps), status,
 			REG_FIELD_GET(GS_BOOTROM_MASK, status),
 			REG_FIELD_GET(GS_UKERNEL_MASK, status));
 	}
@@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
 		u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
 
 		guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
-			 status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
+			 status, delta_ms, intel_rps_read_actual_frequency(&gt->rps), ret);
 		guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
 			 REG_FIELD_GET(GS_MIA_IN_RESET, status),
 			 bootrom, ukernel,
@@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
 		guc_warn(guc, "excessive init time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
 			 delta_ms, status, count, ret);
 		guc_warn(guc, "excessive init time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
-			 intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
+			 intel_rps_read_actual_frequency(&gt->rps), before_freq,
 			 intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
 	} else {
 		guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
-			delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
+			delta_ms, intel_rps_read_actual_frequency(&gt->rps),
 			before_freq, status, count, ret);
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
index ba9e07fc2b577..9ccec7de9628a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 
 #include "gt/intel_gt.h"
+#include "gt/intel_rps.h"
 #include "intel_guc_reg.h"
 #include "intel_huc.h"
 #include "intel_huc_print.h"
@@ -447,17 +448,68 @@ static const char *auth_mode_string(struct intel_huc *huc,
 	return partial ? "clear media" : "all workloads";
 }
 
+/*
+ * Use a longer timeout for debug builds so that problems can be detected
+ * and analysed. But a shorter timeout for releases so that users don't
+ * wait forever to find out there is a problem. Note that the only reason
+ * an end user should hit the timeout is in case of extreme thermal throttling.
+ * And a system that is that hot during boot is probably dead anyway!
+ */
+#if defined(CONFIG_DRM_I915_DEBUG_GEM)
+#define HUC_LOAD_RETRY_LIMIT   20
+#else
+#define HUC_LOAD_RETRY_LIMIT   3
+#endif
+
 int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
 				     enum intel_huc_authentication_type type)
 {
 	struct intel_gt *gt = huc_to_gt(huc);
-	int ret;
+	struct intel_uncore *uncore = gt->uncore;
+	ktime_t before, after, delta;
+	int ret, count;
+	u64 delta_ms;
+	u32 before_freq;
 
-	ret = __intel_wait_for_register(gt->uncore,
-					huc->status[type].reg,
-					huc->status[type].mask,
-					huc->status[type].value,
-					2, 50, NULL);
+	/*
+	 * The KMD requests maximum frequency during driver load, however thermal
+	 * throttling can force the frequency down to minimum (although the board
+	 * really should never get that hot in real life!). IFWI  issues have been
+	 * seen to cause sporadic failures to grant the higher frequency. And at
+	 * minimum frequency, the load time can be in the seconds range. Note that
+	 * there is a limit on how long an individual wait_for() can wait. So wrap
+	 * it in a loop.
+	 */
+	before_freq = intel_rps_read_actual_frequency(&gt->rps);
+	before = ktime_get();
+	for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
+		ret = __intel_wait_for_register(gt->uncore,
+						huc->status[type].reg,
+						huc->status[type].mask,
+						huc->status[type].value,
+						2, 1000, NULL);
+		if (!ret)
+			break;
+
+		huc_dbg(huc, "auth still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
+			count, intel_rps_read_actual_frequency(&gt->rps),
+			huc->status[type].reg.reg);
+	}
+	after = ktime_get();
+	delta = ktime_sub(after, before);
+	delta_ms = ktime_to_ms(delta);
+
+	if (delta_ms > 50) {
+		huc_warn(huc, "excessive auth time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
+			 delta_ms, huc->status[type].reg.reg, count, ret);
+		huc_warn(huc, "excessive auth time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
+			 intel_rps_read_actual_frequency(&gt->rps), before_freq,
+			 intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
+	} else {
+		huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
+			delta_ms, intel_rps_read_actual_frequency(&gt->rps),
+			before_freq, huc->status[type].reg.reg, count, ret);
+	}
 
 	/* mark the load process as complete even if the wait failed */
 	delayed_huc_load_complete(huc);
-- 
2.41.0



* Re: [PATCH] drm/i915/huc: Allow for very slow HuC loading
  2024-01-02 22:22 [PATCH] drm/i915/huc: Allow for very slow HuC loading John.C.Harrison
@ 2024-01-04 20:34 ` Daniele Ceraolo Spurio
  2024-01-05 19:57   ` John Harrison
  0 siblings, 1 reply; 3+ messages in thread
From: Daniele Ceraolo Spurio @ 2024-01-04 20:34 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX; +Cc: DRI-Devel



On 1/2/2024 2:22 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> A failure to load the HuC is occasionally observed where the cause is
> believed to be a low GT frequency leading to very long load times.
>
> So a) increase the timeout so that the user still gets a working
> system even in the case of a slow load, and b) report the frequency
> during the load to confirm whether that is the cause of the slowdown.
>
> Also update the similar code in the GuC load path to not use uncore->gt
> when a local gt is available. The two should match, but there is no need
> for the extra dereference.

Since the code is almost identical in both places, I'm wondering if it 
is worth using a common waiter function and passing in a function 
pointer with the waiting logic. The con of that is that we'd have to 
move to gt-level logging and pass in a tag, so I'm not sure if it is 
worth it overall given that it isn't a lot of code. Maybe we should 
consider it when we implement this on the Xe driver side?
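
Something along these lines, perhaps (just a rough sketch to show the 
shape of it; uc_wait_fn / uc_wait_with_retries are made-up names, not 
existing API, and I haven't compiled it):

/* Hypothetical: one wait step (~1s) supplied by the caller */
typedef int (*uc_wait_fn)(struct intel_gt *gt, void *data, u32 *status);

static int uc_wait_with_retries(struct intel_gt *gt, const char *tag,
				int retry_limit, uc_wait_fn wait_once,
				void *data)
{
	u32 before_freq = intel_rps_read_actual_frequency(&gt->rps);
	ktime_t before = ktime_get();
	u32 status = 0;
	u64 delta_ms;
	int ret, count;

	for (count = 0; count < retry_limit; count++) {
		ret = wait_once(gt, data, &status);
		if (!ret)
			break;

		/* gt-level logging with a caller-supplied tag */
		gt_dbg(gt, "%s still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
		       tag, count, intel_rps_read_actual_frequency(&gt->rps), status);
	}

	delta_ms = ktime_to_ms(ktime_sub(ktime_get(), before));
	if (delta_ms > 50)
		gt_warn(gt, "excessive %s time: %lldms! [freq = %dMHz, before = %dMHz, count = %d, ret = %d]\n",
			tag, delta_ms, intel_rps_read_actual_frequency(&gt->rps),
			before_freq, count, ret);

	return ret;
}

The firmware-specific status decode and the extra failure dumps would 
still have to stay in the callers, of course.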

>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 ++--
>   drivers/gpu/drm/i915/gt/uc/intel_huc.c    | 64 ++++++++++++++++++++---
>   2 files changed, 63 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> index 0f79cb6585182..52332bb143395 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
> @@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   	 * in the seconds range. However, there is a limit on how long an
>   	 * individual wait_for() can wait. So wrap it in a loop.
>   	 */
> -	before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
> +	before_freq = intel_rps_read_actual_frequency(&gt->rps);
>   	before = ktime_get();
>   	for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
>   		ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
> @@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   			break;
>   
>   		guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz, status = 0x%08X [0x%02X/%02X]\n",
> -			count, intel_rps_read_actual_frequency(&uncore->gt->rps), status,
> +			count, intel_rps_read_actual_frequency(&gt->rps), status,
>   			REG_FIELD_GET(GS_BOOTROM_MASK, status),
>   			REG_FIELD_GET(GS_UKERNEL_MASK, status));
>   	}
> @@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   		u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>   
>   		guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
> -			 status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
> +			 status, delta_ms, intel_rps_read_actual_frequency(&gt->rps), ret);
>   		guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>   			 REG_FIELD_GET(GS_MIA_IN_RESET, status),
>   			 bootrom, ukernel,
> @@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
>   		guc_warn(guc, "excessive init time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
>   			 delta_ms, status, count, ret);
>   		guc_warn(guc, "excessive init time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
> -			 intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
> +			 intel_rps_read_actual_frequency(&gt->rps), before_freq,
>   			 intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
>   	} else {
>   		guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
> -			delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
> +			delta_ms, intel_rps_read_actual_frequency(&gt->rps),
>   			before_freq, status, count, ret);
>   	}
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> index ba9e07fc2b577..9ccec7de9628a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
> @@ -6,6 +6,7 @@
>   #include <linux/types.h>
>   
>   #include "gt/intel_gt.h"
> +#include "gt/intel_rps.h"
>   #include "intel_guc_reg.h"
>   #include "intel_huc.h"
>   #include "intel_huc_print.h"
> @@ -447,17 +448,68 @@ static const char *auth_mode_string(struct intel_huc *huc,
>   	return partial ? "clear media" : "all workloads";
>   }
>   
> +/*
> + * Use a longer timeout for debug builds so that problems can be detected
> + * and analysed. But a shorter timeout for releases so that users don't
> + * wait forever to find out there is a problem. Note that the only reason
> + * an end user should hit the timeout is in case of extreme thermal throttling.
> + * And a system that is that hot during boot is probably dead anyway!
> + */
> +#if defined(CONFIG_DRM_I915_DEBUG_GEM)
> +#define HUC_LOAD_RETRY_LIMIT   20
> +#else
> +#define HUC_LOAD_RETRY_LIMIT   3
> +#endif
> +
>   int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
>   				     enum intel_huc_authentication_type type)
>   {
>   	struct intel_gt *gt = huc_to_gt(huc);
> -	int ret;
> +	struct intel_uncore *uncore = gt->uncore;
> +	ktime_t before, after, delta;
> +	int ret, count;
> +	u64 delta_ms;
> +	u32 before_freq;
>   
> -	ret = __intel_wait_for_register(gt->uncore,
> -					huc->status[type].reg,
> -					huc->status[type].mask,
> -					huc->status[type].value,
> -					2, 50, NULL);
> +	/*
> +	 * The KMD requests maximum frequency during driver load, however thermal
> +	 * throttling can force the frequency down to minimum (although the board
> +	 * really should never get that hot in real life!). IFWI  issues have been
> +	 * seen to cause sporadic failures to grant the higher frequency. And at
> +	 * minimum frequency, the load time can be in the seconds range. Note that

this is not load time but auth time. With this fixed (fixing it when 
applying is fine):

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> +	 * there is a limit on how long an individual wait_for() can wait. So wrap
> +	 * it in a loop.
> +	 */
> +	before_freq = intel_rps_read_actual_frequency(&gt->rps);
> +	before = ktime_get();
> +	for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
> +		ret = __intel_wait_for_register(gt->uncore,
> +						huc->status[type].reg,
> +						huc->status[type].mask,
> +						huc->status[type].value,
> +						2, 1000, NULL);
> +		if (!ret)
> +			break;
> +
> +		huc_dbg(huc, "auth still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
> +			count, intel_rps_read_actual_frequency(&gt->rps),
> +			huc->status[type].reg.reg);
> +	}
> +	after = ktime_get();
> +	delta = ktime_sub(after, before);
> +	delta_ms = ktime_to_ms(delta);
> +
> +	if (delta_ms > 50) {
> +		huc_warn(huc, "excessive auth time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
> +			 delta_ms, huc->status[type].reg.reg, count, ret);
> +		huc_warn(huc, "excessive auth time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
> +			 intel_rps_read_actual_frequency(&gt->rps), before_freq,
> +			 intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
> +	} else {
> +		huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
> +			delta_ms, intel_rps_read_actual_frequency(&gt->rps),
> +			before_freq, huc->status[type].reg.reg, count, ret);
> +	}
>   
>   	/* mark the load process as complete even if the wait failed */
>   	delayed_huc_load_complete(huc);



* Re: [PATCH] drm/i915/huc: Allow for very slow HuC loading
  2024-01-04 20:34 ` Daniele Ceraolo Spurio
@ 2024-01-05 19:57   ` John Harrison
  0 siblings, 0 replies; 3+ messages in thread
From: John Harrison @ 2024-01-05 19:57 UTC (permalink / raw)
  To: Daniele Ceraolo Spurio, Intel-GFX; +Cc: DRI-Devel

On 1/4/2024 12:34, Daniele Ceraolo Spurio wrote:
> On 1/2/2024 2:22 PM, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> A failure to load the HuC is occasionally observed where the cause is
>> believed to be a low GT frequency leading to very long load times.
>>
>> So a) increase the timeout so that the user still gets a working
>> system even in the case of a slow load, and b) report the frequency
>> during the load to confirm whether that is the cause of the slowdown.
>>
>> Also update the similar code in the GuC load path to not use uncore->gt
>> when a local gt is available. The two should match, but there is no need
>> for the extra dereference.
>
> Since the code is almost identical in both places, I'm wondering if it 
> is worth using a common waiter function and passing in a function 
> pointer with the waiting logic. The con of that is that we'd have to 
> move to gt-level logging and pass in a tag, so I'm not sure if it 
> is worth it overall given that it isn't a lot of code. Maybe we should 
> consider it when we implement this on the Xe driver side?
Yeah, I did think about trying to have something more common but it gets 
messy with all the error reports being specific to the firmware in 
question. But yeah, maybe think some more on it for the Xe version.


>
>>
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 ++--
>>   drivers/gpu/drm/i915/gt/uc/intel_huc.c    | 64 ++++++++++++++++++++---
>>   2 files changed, 63 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
>> index 0f79cb6585182..52332bb143395 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
>> @@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>>        * in the seconds range. However, there is a limit on how long an
>>        * individual wait_for() can wait. So wrap it in a loop.
>>        */
>> -    before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
>> +    before_freq = intel_rps_read_actual_frequency(&gt->rps);
>>       before = ktime_get();
>>       for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
>>           ret = wait_for(guc_load_done(uncore, &status, &success), 
>> 1000);
>> @@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>>               break;
>>             guc_dbg(guc, "load still in progress, count = %d, freq = 
>> %dMHz, status = 0x%08X [0x%02X/%02X]\n",
>> -            count, 
>> intel_rps_read_actual_frequency(&uncore->gt->rps), status,
>> +            count, intel_rps_read_actual_frequency(&gt->rps), status,
>>               REG_FIELD_GET(GS_BOOTROM_MASK, status),
>>               REG_FIELD_GET(GS_UKERNEL_MASK, status));
>>       }
>> @@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
>>           u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
>>             guc_info(guc, "load failed: status = 0x%08X, time = 
>> %lldms, freq = %dMHz, ret = %d\n",
>> -             status, delta_ms, 
>> intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
>> +             status, delta_ms, 
>> intel_rps_read_actual_frequency(&gt->rps), ret);
>>           guc_info(guc, "load failed: status: Reset = %d, BootROM = 
>> 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
>>                REG_FIELD_GET(GS_MIA_IN_RESET, status),
>>                bootrom, ukernel,
>> @@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
>>           guc_warn(guc, "excessive init time: %lldms! [status = 
>> 0x%08X, count = %d, ret = %d]\n",
>>                delta_ms, status, count, ret);
>>           guc_warn(guc, "excessive init time: [freq = %dMHz, before = 
>> %dMHz, perf_limit_reasons = 0x%08X]\n",
>> - intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
>> +             intel_rps_read_actual_frequency(&gt->rps), before_freq,
>>                intel_uncore_read(uncore, 
>> intel_gt_perf_limit_reasons_reg(gt)));
>>       } else {
>>           guc_dbg(guc, "init took %lldms, freq = %dMHz, before = 
>> %dMHz, status = 0x%08X, count = %d, ret = %d\n",
>> -            delta_ms, 
>> intel_rps_read_actual_frequency(&uncore->gt->rps),
>> +            delta_ms, intel_rps_read_actual_frequency(&gt->rps),
>>               before_freq, status, count, ret);
>>       }
>>   diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
>> index ba9e07fc2b577..9ccec7de9628a 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
>> @@ -6,6 +6,7 @@
>>   #include <linux/types.h>
>>     #include "gt/intel_gt.h"
>> +#include "gt/intel_rps.h"
>>   #include "intel_guc_reg.h"
>>   #include "intel_huc.h"
>>   #include "intel_huc_print.h"
>> @@ -447,17 +448,68 @@ static const char *auth_mode_string(struct 
>> intel_huc *huc,
>>       return partial ? "clear media" : "all workloads";
>>   }
>>   +/*
>> + * Use a longer timeout for debug builds so that problems can be 
>> detected
>> + * and analysed. But a shorter timeout for releases so that users 
>> don't
>> + * wait forever to find out there is a problem. Note that the only 
>> reason
>> + * an end user should hit the timeout is in case of extreme thermal 
>> throttling.
>> + * And a system that is that hot during boot is probably dead anyway!
>> + */
>> +#if defined(CONFIG_DRM_I915_DEBUG_GEM)
>> +#define HUC_LOAD_RETRY_LIMIT   20
>> +#else
>> +#define HUC_LOAD_RETRY_LIMIT   3
>> +#endif
>> +
>>   int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
>>                        enum intel_huc_authentication_type type)
>>   {
>>       struct intel_gt *gt = huc_to_gt(huc);
>> -    int ret;
>> +    struct intel_uncore *uncore = gt->uncore;
>> +    ktime_t before, after, delta;
>> +    int ret, count;
>> +    u64 delta_ms;
>> +    u32 before_freq;
>>   -    ret = __intel_wait_for_register(gt->uncore,
>> -                    huc->status[type].reg,
>> -                    huc->status[type].mask,
>> -                    huc->status[type].value,
>> -                    2, 50, NULL);
>> +    /*
>> +     * The KMD requests maximum frequency during driver load, 
>> however thermal
>> +     * throttling can force the frequency down to minimum (although 
>> the board
>> +     * really should never get that hot in real life!). IFWI issues 
>> have been
>> +     * seen to cause sporadic failures to grant the higher 
>> frequency. And at
>> +     * minimum frequency, the load time can be in the seconds range. 
>> Note that
>
> this is not load time but auth time. With this fixed (fixing it when 
> applying is fine):
Yeah, I guess so. Will fix it.

>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Thanks.

>
> Daniele
>
>> +     * there is a limit on how long an individual wait_for() can 
>> wait. So wrap
>> +     * it in a loop.
>> +     */
>> +    before_freq = intel_rps_read_actual_frequency(&gt->rps);
>> +    before = ktime_get();
>> +    for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
>> +        ret = __intel_wait_for_register(gt->uncore,
>> +                        huc->status[type].reg,
>> +                        huc->status[type].mask,
>> +                        huc->status[type].value,
>> +                        2, 1000, NULL);
>> +        if (!ret)
>> +            break;
>> +
>> +        huc_dbg(huc, "auth still in progress, count = %d, freq = 
>> %dMHz, status = 0x%08X\n",
>> +            count, intel_rps_read_actual_frequency(&gt->rps),
>> +            huc->status[type].reg.reg);
>> +    }
>> +    after = ktime_get();
>> +    delta = ktime_sub(after, before);
>> +    delta_ms = ktime_to_ms(delta);
>> +
>> +    if (delta_ms > 50) {
>> +        huc_warn(huc, "excessive auth time: %lldms! [status = 
>> 0x%08X, count = %d, ret = %d]\n",
>> +             delta_ms, huc->status[type].reg.reg, count, ret);
>> +        huc_warn(huc, "excessive auth time: [freq = %dMHz, before = 
>> %dMHz, perf_limit_reasons = 0x%08X]\n",
>> +             intel_rps_read_actual_frequency(&gt->rps), before_freq,
>> +             intel_uncore_read(uncore, 
>> intel_gt_perf_limit_reasons_reg(gt)));
>> +    } else {
>> +        huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = 
>> %dMHz, status = 0x%08X, count = %d, ret = %d\n",
>> +            delta_ms, intel_rps_read_actual_frequency(&gt->rps),
>> +            before_freq, huc->status[type].reg.reg, count, ret);
>> +    }
>>         /* mark the load process as complete even if the wait failed */
>>       delayed_huc_load_complete(huc);
>


